Model save
Browse files- README.md +2 -4
- all_results.json +4 -4
- config.json +1 -1
- model-00001-of-00002.safetensors +1 -1
- model-00002-of-00002.safetensors +1 -1
- train_results.json +4 -4
- trainer_state.json +515 -515
- training_args.bin +1 -1
README.md
CHANGED
|
@@ -1,11 +1,9 @@
|
|
| 1 |
---
|
| 2 |
base_model: Qwen/Qwen2.5-3B-Instruct
|
| 3 |
-
datasets: open-r1/OpenR1-Math-220k
|
| 4 |
library_name: transformers
|
| 5 |
model_name: Qwen2.5-3B-Open-R1-GRPO
|
| 6 |
tags:
|
| 7 |
- generated_from_trainer
|
| 8 |
-
- open-r1
|
| 9 |
- trl
|
| 10 |
- grpo
|
| 11 |
licence: license
|
|
@@ -13,7 +11,7 @@ licence: license
|
|
| 13 |
|
| 14 |
# Model Card for Qwen2.5-3B-Open-R1-GRPO
|
| 15 |
|
| 16 |
-
This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)
|
| 17 |
It has been trained using [TRL](https://github.com/huggingface/trl).
|
| 18 |
|
| 19 |
## Quick start
|
|
@@ -29,7 +27,7 @@ print(output["generated_text"])
|
|
| 29 |
|
| 30 |
## Training procedure
|
| 31 |
|
| 32 |
-
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenyukang2020-nvidia/huggingface/runs/
|
| 33 |
|
| 34 |
|
| 35 |
This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
|
|
|
|
| 1 |
---
|
| 2 |
base_model: Qwen/Qwen2.5-3B-Instruct
|
|
|
|
| 3 |
library_name: transformers
|
| 4 |
model_name: Qwen2.5-3B-Open-R1-GRPO
|
| 5 |
tags:
|
| 6 |
- generated_from_trainer
|
|
|
|
| 7 |
- trl
|
| 8 |
- grpo
|
| 9 |
licence: license
|
|
|
|
| 11 |
|
| 12 |
# Model Card for Qwen2.5-3B-Open-R1-GRPO
|
| 13 |
|
| 14 |
+
This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct).
|
| 15 |
It has been trained using [TRL](https://github.com/huggingface/trl).
|
| 16 |
|
| 17 |
## Quick start
|
|
|
|
| 27 |
|
| 28 |
## Training procedure
|
| 29 |
|
| 30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenyukang2020-nvidia/huggingface/runs/9wwsfr8r)
|
| 31 |
|
| 32 |
|
| 33 |
This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
|
all_results.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
-
"train_loss": 0.
|
| 4 |
-
"train_runtime":
|
| 5 |
"train_samples": 93733,
|
| 6 |
-
"train_samples_per_second":
|
| 7 |
-
"train_steps_per_second": 2.
|
| 8 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
+
"train_loss": 0.00015187089497744325,
|
| 4 |
+
"train_runtime": 1302.9313,
|
| 5 |
"train_samples": 93733,
|
| 6 |
+
"train_samples_per_second": 71.94,
|
| 7 |
+
"train_steps_per_second": 2.249
|
| 8 |
}
|
config.json
CHANGED
|
@@ -22,7 +22,7 @@
|
|
| 22 |
"tie_word_embeddings": true,
|
| 23 |
"torch_dtype": "bfloat16",
|
| 24 |
"transformers_version": "4.52.3",
|
| 25 |
-
"use_cache":
|
| 26 |
"use_sliding_window": false,
|
| 27 |
"vocab_size": 151936
|
| 28 |
}
|
|
|
|
| 22 |
"tie_word_embeddings": true,
|
| 23 |
"torch_dtype": "bfloat16",
|
| 24 |
"transformers_version": "4.52.3",
|
| 25 |
+
"use_cache": false,
|
| 26 |
"use_sliding_window": false,
|
| 27 |
"vocab_size": 151936
|
| 28 |
}
|
model-00001-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4957560304
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40274e3ec9e8dc4d8fc7fa0b582f6f3b3ce9d41fdcacd2960d3506f7a944ed4b
|
| 3 |
size 4957560304
|
model-00002-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1214366696
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a25ec01907d0215b10d3faf0841cf152610f26e87c5b5cccfd966513984b58f0
|
| 3 |
size 1214366696
|
train_results.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
-
"train_loss": 0.
|
| 4 |
-
"train_runtime":
|
| 5 |
"train_samples": 93733,
|
| 6 |
-
"train_samples_per_second":
|
| 7 |
-
"train_steps_per_second": 2.
|
| 8 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
+
"train_loss": 0.00015187089497744325,
|
| 4 |
+
"train_runtime": 1302.9313,
|
| 5 |
"train_samples": 93733,
|
| 6 |
+
"train_samples_per_second": 71.94,
|
| 7 |
+
"train_steps_per_second": 2.249
|
| 8 |
}
|
trainer_state.json
CHANGED
|
@@ -87024,7 +87024,7 @@
|
|
| 87024 |
"completions/min_terminated_length": 371.0,
|
| 87025 |
"epoch": 0.9903558931467099,
|
| 87026 |
"frac_reward_zero_std": 0.6875,
|
| 87027 |
-
"grad_norm": 0.
|
| 87028 |
"kl": 0.07421875,
|
| 87029 |
"learning_rate": 6.386247842353755e-09,
|
| 87030 |
"loss": 0.0128,
|
|
@@ -87046,23 +87046,23 @@
|
|
| 87046 |
"clip_ratio/low_min": 0.0,
|
| 87047 |
"clip_ratio/region_mean": 0.0,
|
| 87048 |
"completions/clipped_ratio": -7.0,
|
| 87049 |
-
"completions/max_length":
|
| 87050 |
-
"completions/max_terminated_length":
|
| 87051 |
-
"completions/mean_length":
|
| 87052 |
-
"completions/mean_terminated_length":
|
| 87053 |
-
"completions/min_length":
|
| 87054 |
-
"completions/min_terminated_length":
|
| 87055 |
"epoch": 0.9906972774601007,
|
| 87056 |
"frac_reward_zero_std": 0.6875,
|
| 87057 |
-
"grad_norm": 0.
|
| 87058 |
-
"kl": 0.
|
| 87059 |
"learning_rate": 5.967635461854304e-09,
|
| 87060 |
-
"loss": 0.
|
| 87061 |
-
"num_tokens":
|
| 87062 |
-
"reward": 2.
|
| 87063 |
-
"reward_std": 0.
|
| 87064 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87065 |
-
"rewards/accuracy_reward/std": 0.
|
| 87066 |
"rewards/format_reward/mean": 0.9921875,
|
| 87067 |
"rewards/format_reward/std": 0.08812850713729858,
|
| 87068 |
"rewards/tag_count_reward/mean": 0.99951171875,
|
|
@@ -87075,28 +87075,28 @@
|
|
| 87075 |
"clip_ratio/low_mean": 0.0,
|
| 87076 |
"clip_ratio/low_min": 0.0,
|
| 87077 |
"clip_ratio/region_mean": 0.0,
|
| 87078 |
-
"completions/clipped_ratio": -
|
| 87079 |
-
"completions/max_length":
|
| 87080 |
-
"completions/max_terminated_length":
|
| 87081 |
-
"completions/mean_length":
|
| 87082 |
-
"completions/mean_terminated_length":
|
| 87083 |
-
"completions/min_length":
|
| 87084 |
-
"completions/min_terminated_length":
|
| 87085 |
"epoch": 0.9910386617734915,
|
| 87086 |
"frac_reward_zero_std": 0.59375,
|
| 87087 |
-
"grad_norm": 0.
|
| 87088 |
-
"kl": 0.
|
| 87089 |
"learning_rate": 5.563207782363078e-09,
|
| 87090 |
-
"loss": 0.
|
| 87091 |
-
"num_tokens":
|
| 87092 |
-
"reward": 2.
|
| 87093 |
-
"reward_std": 0.
|
| 87094 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87095 |
-
"rewards/accuracy_reward/std": 0.
|
| 87096 |
-
"rewards/format_reward/mean": 0.
|
| 87097 |
-
"rewards/format_reward/std": 0.
|
| 87098 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87099 |
-
"rewards/tag_count_reward/std": 0.
|
| 87100 |
"step": 2903
|
| 87101 |
},
|
| 87102 |
{
|
|
@@ -87106,27 +87106,27 @@
|
|
| 87106 |
"clip_ratio/low_min": 0.0,
|
| 87107 |
"clip_ratio/region_mean": 0.0,
|
| 87108 |
"completions/clipped_ratio": -7.0,
|
| 87109 |
-
"completions/max_length":
|
| 87110 |
-
"completions/max_terminated_length":
|
| 87111 |
-
"completions/mean_length":
|
| 87112 |
-
"completions/mean_terminated_length":
|
| 87113 |
-
"completions/min_length":
|
| 87114 |
-
"completions/min_terminated_length":
|
| 87115 |
"epoch": 0.9913800460868823,
|
| 87116 |
-
"frac_reward_zero_std": 0.
|
| 87117 |
-
"grad_norm": 0.
|
| 87118 |
-
"kl": 0.
|
| 87119 |
"learning_rate": 5.172965377890915e-09,
|
| 87120 |
-
"loss": 0.
|
| 87121 |
-
"num_tokens":
|
| 87122 |
-
"reward": 2.
|
| 87123 |
-
"reward_std": 0.
|
| 87124 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87125 |
-
"rewards/accuracy_reward/std": 0.
|
| 87126 |
-
"rewards/format_reward/mean": 0.
|
| 87127 |
-
"rewards/format_reward/std": 0.
|
| 87128 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87129 |
-
"rewards/tag_count_reward/std": 0.
|
| 87130 |
"step": 2904
|
| 87131 |
},
|
| 87132 |
{
|
|
@@ -87135,28 +87135,28 @@
|
|
| 87135 |
"clip_ratio/low_mean": 0.0,
|
| 87136 |
"clip_ratio/low_min": 0.0,
|
| 87137 |
"clip_ratio/region_mean": 0.0,
|
| 87138 |
-
"completions/clipped_ratio": -
|
| 87139 |
-
"completions/max_length":
|
| 87140 |
-
"completions/max_terminated_length":
|
| 87141 |
-
"completions/mean_length":
|
| 87142 |
-
"completions/mean_terminated_length":
|
| 87143 |
-
"completions/min_length":
|
| 87144 |
-
"completions/min_terminated_length":
|
| 87145 |
"epoch": 0.9917214304002732,
|
| 87146 |
-
"frac_reward_zero_std": 0.
|
| 87147 |
-
"grad_norm": 0.
|
| 87148 |
-
"kl": 0.
|
| 87149 |
"learning_rate": 4.79690880231587e-09,
|
| 87150 |
-
"loss": 0.
|
| 87151 |
-
"num_tokens":
|
| 87152 |
-
"reward": 2.
|
| 87153 |
-
"reward_std": 0.
|
| 87154 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87155 |
-
"rewards/accuracy_reward/std": 0.
|
| 87156 |
-
"rewards/format_reward/mean": 0.
|
| 87157 |
-
"rewards/format_reward/std": 0.
|
| 87158 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87159 |
-
"rewards/tag_count_reward/std": 0.
|
| 87160 |
"step": 2905
|
| 87161 |
},
|
| 87162 |
{
|
|
@@ -87166,27 +87166,27 @@
|
|
| 87166 |
"clip_ratio/low_min": 0.0,
|
| 87167 |
"clip_ratio/region_mean": 0.0,
|
| 87168 |
"completions/clipped_ratio": -7.0,
|
| 87169 |
-
"completions/max_length":
|
| 87170 |
-
"completions/max_terminated_length":
|
| 87171 |
-
"completions/mean_length":
|
| 87172 |
-
"completions/mean_terminated_length":
|
| 87173 |
-
"completions/min_length":
|
| 87174 |
-
"completions/min_terminated_length":
|
| 87175 |
"epoch": 0.9920628147136639,
|
| 87176 |
-
"frac_reward_zero_std": 0.
|
| 87177 |
-
"grad_norm": 0.
|
| 87178 |
-
"kl": 0.
|
| 87179 |
"learning_rate": 4.435038589380991e-09,
|
| 87180 |
-
"loss": 0.
|
| 87181 |
-
"num_tokens":
|
| 87182 |
-
"reward": 2.
|
| 87183 |
-
"reward_std": 0.
|
| 87184 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87185 |
-
"rewards/accuracy_reward/std": 0.
|
| 87186 |
-
"rewards/format_reward/mean": 0.
|
| 87187 |
-
"rewards/format_reward/std": 0.
|
| 87188 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87189 |
-
"rewards/tag_count_reward/std": 0.
|
| 87190 |
"step": 2906
|
| 87191 |
},
|
| 87192 |
{
|
|
@@ -87196,27 +87196,27 @@
|
|
| 87196 |
"clip_ratio/low_min": 0.0,
|
| 87197 |
"clip_ratio/region_mean": 0.0,
|
| 87198 |
"completions/clipped_ratio": -7.0,
|
| 87199 |
-
"completions/max_length":
|
| 87200 |
-
"completions/max_terminated_length":
|
| 87201 |
-
"completions/mean_length":
|
| 87202 |
-
"completions/mean_terminated_length":
|
| 87203 |
-
"completions/min_length":
|
| 87204 |
-
"completions/min_terminated_length":
|
| 87205 |
"epoch": 0.9924041990270547,
|
| 87206 |
-
"frac_reward_zero_std": 0.
|
| 87207 |
-
"grad_norm": 0.
|
| 87208 |
-
"kl": 0.
|
| 87209 |
"learning_rate": 4.087355252694325e-09,
|
| 87210 |
-
"loss": 0.
|
| 87211 |
-
"num_tokens":
|
| 87212 |
-
"reward": 2.
|
| 87213 |
-
"reward_std": 0.
|
| 87214 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87215 |
-
"rewards/accuracy_reward/std": 0.
|
| 87216 |
-
"rewards/format_reward/mean": 0.
|
| 87217 |
-
"rewards/format_reward/std": 0.
|
| 87218 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87219 |
-
"rewards/tag_count_reward/std": 0.
|
| 87220 |
"step": 2907
|
| 87221 |
},
|
| 87222 |
{
|
|
@@ -87225,28 +87225,28 @@
|
|
| 87225 |
"clip_ratio/low_mean": 0.0,
|
| 87226 |
"clip_ratio/low_min": 0.0,
|
| 87227 |
"clip_ratio/region_mean": 0.0,
|
| 87228 |
-
"completions/clipped_ratio": -6.
|
| 87229 |
"completions/max_length": 2048.0,
|
| 87230 |
-
"completions/max_terminated_length":
|
| 87231 |
-
"completions/mean_length":
|
| 87232 |
-
"completions/mean_terminated_length":
|
| 87233 |
-
"completions/min_length":
|
| 87234 |
-
"completions/min_terminated_length":
|
| 87235 |
"epoch": 0.9927455833404455,
|
| 87236 |
-
"frac_reward_zero_std": 0.
|
| 87237 |
-
"grad_norm": 0.
|
| 87238 |
-
"kl": 0.
|
| 87239 |
"learning_rate": 3.753859285730022e-09,
|
| 87240 |
-
"loss": 0.
|
| 87241 |
-
"num_tokens":
|
| 87242 |
-
"reward": 2.
|
| 87243 |
-
"reward_std": 0.
|
| 87244 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87245 |
-
"rewards/accuracy_reward/std": 0.
|
| 87246 |
-
"rewards/format_reward/mean": 0.
|
| 87247 |
-
"rewards/format_reward/std": 0.
|
| 87248 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87249 |
-
"rewards/tag_count_reward/std": 0.
|
| 87250 |
"step": 2908
|
| 87251 |
},
|
| 87252 |
{
|
|
@@ -87255,28 +87255,28 @@
|
|
| 87255 |
"clip_ratio/low_mean": 0.0,
|
| 87256 |
"clip_ratio/low_min": 0.0,
|
| 87257 |
"clip_ratio/region_mean": 0.0,
|
| 87258 |
-
"completions/clipped_ratio": -
|
| 87259 |
-
"completions/max_length":
|
| 87260 |
-
"completions/max_terminated_length":
|
| 87261 |
-
"completions/mean_length":
|
| 87262 |
-
"completions/mean_terminated_length":
|
| 87263 |
-
"completions/min_length":
|
| 87264 |
-
"completions/min_terminated_length":
|
| 87265 |
"epoch": 0.9930869676538363,
|
| 87266 |
-
"frac_reward_zero_std": 0.
|
| 87267 |
-
"grad_norm": 0.
|
| 87268 |
-
"kl": 0.
|
| 87269 |
"learning_rate": 3.4345511618238957e-09,
|
| 87270 |
-
"loss": 0.
|
| 87271 |
-
"num_tokens":
|
| 87272 |
-
"reward": 2.
|
| 87273 |
-
"reward_std": 0.
|
| 87274 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87275 |
-
"rewards/accuracy_reward/std": 0.
|
| 87276 |
-
"rewards/format_reward/mean": 0.
|
| 87277 |
-
"rewards/format_reward/std": 0.
|
| 87278 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87279 |
-
"rewards/tag_count_reward/std": 0.
|
| 87280 |
"step": 2909
|
| 87281 |
},
|
| 87282 |
{
|
|
@@ -87285,28 +87285,28 @@
|
|
| 87285 |
"clip_ratio/low_mean": 0.0,
|
| 87286 |
"clip_ratio/low_min": 0.0,
|
| 87287 |
"clip_ratio/region_mean": 0.0,
|
| 87288 |
-
"completions/clipped_ratio": -
|
| 87289 |
-
"completions/max_length":
|
| 87290 |
-
"completions/max_terminated_length":
|
| 87291 |
-
"completions/mean_length":
|
| 87292 |
-
"completions/mean_terminated_length": 776.
|
| 87293 |
-
"completions/min_length":
|
| 87294 |
-
"completions/min_terminated_length":
|
| 87295 |
"epoch": 0.9934283519672271,
|
| 87296 |
-
"frac_reward_zero_std": 0.
|
| 87297 |
-
"grad_norm": 0.
|
| 87298 |
-
"kl": 0.
|
| 87299 |
"learning_rate": 3.129431334175648e-09,
|
| 87300 |
-
"loss": 0.
|
| 87301 |
-
"num_tokens":
|
| 87302 |
-
"reward": 2.
|
| 87303 |
-
"reward_std": 0.
|
| 87304 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87305 |
-
"rewards/accuracy_reward/std": 0.
|
| 87306 |
-
"rewards/format_reward/mean": 0.
|
| 87307 |
-
"rewards/format_reward/std": 0.
|
| 87308 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87309 |
-
"rewards/tag_count_reward/std": 0.
|
| 87310 |
"step": 2910
|
| 87311 |
},
|
| 87312 |
{
|
|
@@ -87315,26 +87315,26 @@
|
|
| 87315 |
"clip_ratio/low_mean": 0.0,
|
| 87316 |
"clip_ratio/low_min": 0.0,
|
| 87317 |
"clip_ratio/region_mean": 0.0,
|
| 87318 |
-
"completions/clipped_ratio": -
|
| 87319 |
-
"completions/max_length":
|
| 87320 |
-
"completions/max_terminated_length":
|
| 87321 |
-
"completions/mean_length":
|
| 87322 |
-
"completions/mean_terminated_length":
|
| 87323 |
-
"completions/min_length":
|
| 87324 |
-
"completions/min_terminated_length":
|
| 87325 |
"epoch": 0.9937697362806179,
|
| 87326 |
-
"frac_reward_zero_std": 0.
|
| 87327 |
-
"grad_norm": 0.
|
| 87328 |
-
"kl": 0.
|
| 87329 |
"learning_rate": 2.8385002358466417e-09,
|
| 87330 |
-
"loss": 0.
|
| 87331 |
-
"num_tokens":
|
| 87332 |
-
"reward": 2.
|
| 87333 |
-
"reward_std": 0.
|
| 87334 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87335 |
-
"rewards/accuracy_reward/std": 0.
|
| 87336 |
-
"rewards/format_reward/mean": 0.
|
| 87337 |
-
"rewards/format_reward/std": 0.
|
| 87338 |
"rewards/tag_count_reward/mean": 0.998046875,
|
| 87339 |
"rewards/tag_count_reward/std": 0.022032126784324646,
|
| 87340 |
"step": 2911
|
|
@@ -87346,27 +87346,27 @@
|
|
| 87346 |
"clip_ratio/low_min": 0.0,
|
| 87347 |
"clip_ratio/region_mean": 0.0,
|
| 87348 |
"completions/clipped_ratio": -7.0,
|
| 87349 |
-
"completions/max_length":
|
| 87350 |
-
"completions/max_terminated_length":
|
| 87351 |
-
"completions/mean_length":
|
| 87352 |
-
"completions/mean_terminated_length":
|
| 87353 |
-
"completions/min_length":
|
| 87354 |
-
"completions/min_terminated_length":
|
| 87355 |
"epoch": 0.9941111205940087,
|
| 87356 |
-
"frac_reward_zero_std": 0.
|
| 87357 |
-
"grad_norm": 0.
|
| 87358 |
-
"kl": 0.
|
| 87359 |
"learning_rate": 2.5617582797610174e-09,
|
| 87360 |
-
"loss": 0.
|
| 87361 |
-
"num_tokens":
|
| 87362 |
-
"reward": 2.
|
| 87363 |
-
"reward_std": 0.
|
| 87364 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87365 |
-
"rewards/accuracy_reward/std": 0.
|
| 87366 |
-
"rewards/format_reward/mean": 0.
|
| 87367 |
-
"rewards/format_reward/std": 0.
|
| 87368 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87369 |
-
"rewards/tag_count_reward/std": 0.
|
| 87370 |
"step": 2912
|
| 87371 |
},
|
| 87372 |
{
|
|
@@ -87376,27 +87376,27 @@
|
|
| 87376 |
"clip_ratio/low_min": 0.0,
|
| 87377 |
"clip_ratio/region_mean": 0.0,
|
| 87378 |
"completions/clipped_ratio": -7.0,
|
| 87379 |
-
"completions/max_length":
|
| 87380 |
-
"completions/max_terminated_length":
|
| 87381 |
-
"completions/mean_length":
|
| 87382 |
-
"completions/mean_terminated_length":
|
| 87383 |
-
"completions/min_length":
|
| 87384 |
-
"completions/min_terminated_length":
|
| 87385 |
"epoch": 0.9944525049073996,
|
| 87386 |
-
"frac_reward_zero_std": 0.
|
| 87387 |
-
"grad_norm": 0.
|
| 87388 |
-
"kl": 0.
|
| 87389 |
"learning_rate": 2.299205858702358e-09,
|
| 87390 |
-
"loss": 0.
|
| 87391 |
-
"num_tokens":
|
| 87392 |
-
"reward": 2.
|
| 87393 |
-
"reward_std": 0.
|
| 87394 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87395 |
-
"rewards/accuracy_reward/std": 0.
|
| 87396 |
-
"rewards/format_reward/mean": 0.
|
| 87397 |
-
"rewards/format_reward/std": 0.
|
| 87398 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87399 |
-
"rewards/tag_count_reward/std": 0.
|
| 87400 |
"step": 2913
|
| 87401 |
},
|
| 87402 |
{
|
|
@@ -87405,28 +87405,28 @@
|
|
| 87405 |
"clip_ratio/low_mean": 0.0,
|
| 87406 |
"clip_ratio/low_min": 0.0,
|
| 87407 |
"clip_ratio/region_mean": 0.0,
|
| 87408 |
-
"completions/clipped_ratio": -
|
| 87409 |
-
"completions/max_length":
|
| 87410 |
-
"completions/max_terminated_length":
|
| 87411 |
-
"completions/mean_length":
|
| 87412 |
-
"completions/mean_terminated_length":
|
| 87413 |
-
"completions/min_length":
|
| 87414 |
-
"completions/min_terminated_length":
|
| 87415 |
"epoch": 0.9947938892207903,
|
| 87416 |
-
"frac_reward_zero_std": 0.
|
| 87417 |
-
"grad_norm": 0.
|
| 87418 |
-
"kl": 0.
|
| 87419 |
"learning_rate": 2.0508433453170218e-09,
|
| 87420 |
-
"loss": 0.
|
| 87421 |
-
"num_tokens":
|
| 87422 |
-
"reward": 2.
|
| 87423 |
-
"reward_std": 0.
|
| 87424 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87425 |
-
"rewards/accuracy_reward/std": 0.
|
| 87426 |
-
"rewards/format_reward/mean": 0.
|
| 87427 |
-
"rewards/format_reward/std": 0.
|
| 87428 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87429 |
-
"rewards/tag_count_reward/std": 0.
|
| 87430 |
"step": 2914
|
| 87431 |
},
|
| 87432 |
{
|
|
@@ -87436,25 +87436,25 @@
|
|
| 87436 |
"clip_ratio/low_min": 0.0,
|
| 87437 |
"clip_ratio/region_mean": 0.0,
|
| 87438 |
"completions/clipped_ratio": -7.0,
|
| 87439 |
-
"completions/max_length":
|
| 87440 |
-
"completions/max_terminated_length":
|
| 87441 |
-
"completions/mean_length":
|
| 87442 |
-
"completions/mean_terminated_length":
|
| 87443 |
-
"completions/min_length":
|
| 87444 |
-
"completions/min_terminated_length":
|
| 87445 |
"epoch": 0.9951352735341811,
|
| 87446 |
-
"frac_reward_zero_std": 0.
|
| 87447 |
-
"grad_norm": 0.
|
| 87448 |
-
"kl": 0.
|
| 87449 |
"learning_rate": 1.8166710921097008e-09,
|
| 87450 |
-
"loss": 0.
|
| 87451 |
-
"num_tokens":
|
| 87452 |
-
"reward": 2.
|
| 87453 |
-
"reward_std": 0.
|
| 87454 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87455 |
-
"rewards/accuracy_reward/std": 0.
|
| 87456 |
-
"rewards/format_reward/mean": 0.
|
| 87457 |
-
"rewards/format_reward/std": 0.
|
| 87458 |
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87459 |
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87460 |
"step": 2915
|
|
@@ -87467,26 +87467,26 @@
|
|
| 87467 |
"clip_ratio/region_mean": 0.0,
|
| 87468 |
"completions/clipped_ratio": -6.984375,
|
| 87469 |
"completions/max_length": 2048.0,
|
| 87470 |
-
"completions/max_terminated_length":
|
| 87471 |
-
"completions/mean_length":
|
| 87472 |
-
"completions/mean_terminated_length":
|
| 87473 |
-
"completions/min_length":
|
| 87474 |
-
"completions/min_terminated_length":
|
| 87475 |
"epoch": 0.9954766578475719,
|
| 87476 |
"frac_reward_zero_std": 0.59375,
|
| 87477 |
-
"grad_norm": 0.
|
| 87478 |
-
"kl": 0.
|
| 87479 |
"learning_rate": 1.5966894314456416e-09,
|
| 87480 |
-
"loss": 0.
|
| 87481 |
-
"num_tokens":
|
| 87482 |
-
"reward": 2.
|
| 87483 |
-
"reward_std": 0.
|
| 87484 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87485 |
-
"rewards/accuracy_reward/std": 0.
|
| 87486 |
-
"rewards/format_reward/mean": 0.
|
| 87487 |
-
"rewards/format_reward/std": 0.
|
| 87488 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87489 |
-
"rewards/tag_count_reward/std": 0.
|
| 87490 |
"step": 2916
|
| 87491 |
},
|
| 87492 |
{
|
|
@@ -87496,25 +87496,25 @@
|
|
| 87496 |
"clip_ratio/low_min": 0.0,
|
| 87497 |
"clip_ratio/region_mean": 0.0,
|
| 87498 |
"completions/clipped_ratio": -7.0,
|
| 87499 |
-
"completions/max_length":
|
| 87500 |
-
"completions/max_terminated_length":
|
| 87501 |
-
"completions/mean_length":
|
| 87502 |
-
"completions/mean_terminated_length":
|
| 87503 |
-
"completions/min_length":
|
| 87504 |
-
"completions/min_terminated_length":
|
| 87505 |
"epoch": 0.9958180421609627,
|
| 87506 |
-
"frac_reward_zero_std": 0.
|
| 87507 |
-
"grad_norm": 0.
|
| 87508 |
-
"kl": 0.
|
| 87509 |
"learning_rate": 1.3908986755473142e-09,
|
| 87510 |
-
"loss": 0.
|
| 87511 |
-
"num_tokens":
|
| 87512 |
-
"reward": 2.
|
| 87513 |
-
"reward_std": 0.
|
| 87514 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87515 |
-
"rewards/accuracy_reward/std": 0.
|
| 87516 |
-
"rewards/format_reward/mean": 0.
|
| 87517 |
-
"rewards/format_reward/std": 0.
|
| 87518 |
"rewards/tag_count_reward/mean": 0.9990234375,
|
| 87519 |
"rewards/tag_count_reward/std": 0.015609703958034515,
|
| 87520 |
"step": 2917
|
|
@@ -87526,27 +87526,27 @@
|
|
| 87526 |
"clip_ratio/low_min": 0.0,
|
| 87527 |
"clip_ratio/region_mean": 0.0,
|
| 87528 |
"completions/clipped_ratio": -7.0,
|
| 87529 |
-
"completions/max_length":
|
| 87530 |
-
"completions/max_terminated_length":
|
| 87531 |
-
"completions/mean_length":
|
| 87532 |
-
"completions/mean_terminated_length":
|
| 87533 |
-
"completions/min_length":
|
| 87534 |
-
"completions/min_terminated_length":
|
| 87535 |
"epoch": 0.9961594264743535,
|
| 87536 |
-
"frac_reward_zero_std": 0.
|
| 87537 |
-
"grad_norm": 0.
|
| 87538 |
-
"kl": 0.
|
| 87539 |
"learning_rate": 1.199299116497743e-09,
|
| 87540 |
-
"loss": 0.
|
| 87541 |
-
"num_tokens":
|
| 87542 |
-
"reward": 2.
|
| 87543 |
-
"reward_std": 0.
|
| 87544 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87545 |
-
"rewards/accuracy_reward/std": 0.
|
| 87546 |
-
"rewards/format_reward/mean": 0.
|
| 87547 |
-
"rewards/format_reward/std": 0.
|
| 87548 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87549 |
-
"rewards/tag_count_reward/std": 0.
|
| 87550 |
"step": 2918
|
| 87551 |
},
|
| 87552 |
{
|
|
@@ -87556,27 +87556,27 @@
|
|
| 87556 |
"clip_ratio/low_min": 0.0,
|
| 87557 |
"clip_ratio/region_mean": 0.0,
|
| 87558 |
"completions/clipped_ratio": -7.0,
|
| 87559 |
-
"completions/max_length":
|
| 87560 |
-
"completions/max_terminated_length":
|
| 87561 |
-
"completions/mean_length": 794.
|
| 87562 |
-
"completions/mean_terminated_length": 794.
|
| 87563 |
-
"completions/min_length":
|
| 87564 |
-
"completions/min_terminated_length":
|
| 87565 |
"epoch": 0.9965008107877443,
|
| 87566 |
-
"frac_reward_zero_std": 0.
|
| 87567 |
-
"grad_norm": 0.
|
| 87568 |
-
"kl": 0.
|
| 87569 |
"learning_rate": 1.0218910262371762e-09,
|
| 87570 |
-
"loss": 0.
|
| 87571 |
-
"num_tokens":
|
| 87572 |
-
"reward": 2.
|
| 87573 |
-
"reward_std": 0.
|
| 87574 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87575 |
-
"rewards/accuracy_reward/std": 0.
|
| 87576 |
-
"rewards/format_reward/mean": 0.
|
| 87577 |
-
"rewards/format_reward/std": 0.
|
| 87578 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87579 |
-
"rewards/tag_count_reward/std": 0.
|
| 87580 |
"step": 2919
|
| 87581 |
},
|
| 87582 |
{
|
|
@@ -87586,25 +87586,25 @@
|
|
| 87586 |
"clip_ratio/low_min": 0.0,
|
| 87587 |
"clip_ratio/region_mean": 0.0,
|
| 87588 |
"completions/clipped_ratio": -7.0,
|
| 87589 |
-
"completions/max_length":
|
| 87590 |
-
"completions/max_terminated_length":
|
| 87591 |
-
"completions/mean_length":
|
| 87592 |
-
"completions/mean_terminated_length":
|
| 87593 |
-
"completions/min_length":
|
| 87594 |
-
"completions/min_terminated_length":
|
| 87595 |
"epoch": 0.9968421951011351,
|
| 87596 |
"frac_reward_zero_std": 0.53125,
|
| 87597 |
-
"grad_norm": 0.
|
| 87598 |
-
"kl": 0.
|
| 87599 |
"learning_rate": 8.586746565641957e-10,
|
| 87600 |
-
"loss": 0.
|
| 87601 |
-
"num_tokens":
|
| 87602 |
-
"reward": 2.
|
| 87603 |
-
"reward_std": 0.
|
| 87604 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87605 |
-
"rewards/accuracy_reward/std": 0.
|
| 87606 |
-
"rewards/format_reward/mean": 0.
|
| 87607 |
-
"rewards/format_reward/std": 0.
|
| 87608 |
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87609 |
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87610 |
"step": 2920
|
|
@@ -87616,27 +87616,27 @@
|
|
| 87616 |
"clip_ratio/low_min": 0.0,
|
| 87617 |
"clip_ratio/region_mean": 0.0,
|
| 87618 |
"completions/clipped_ratio": -7.0,
|
| 87619 |
-
"completions/max_length":
|
| 87620 |
-
"completions/max_terminated_length":
|
| 87621 |
-
"completions/mean_length":
|
| 87622 |
-
"completions/mean_terminated_length":
|
| 87623 |
-
"completions/min_length":
|
| 87624 |
-
"completions/min_terminated_length":
|
| 87625 |
"epoch": 0.997183579414526,
|
| 87626 |
"frac_reward_zero_std": 0.59375,
|
| 87627 |
-
"grad_norm": 0.
|
| 87628 |
-
"kl": 0.
|
| 87629 |
"learning_rate": 7.096502391346072e-10,
|
| 87630 |
-
"loss": 0.
|
| 87631 |
-
"num_tokens":
|
| 87632 |
-
"reward": 2.
|
| 87633 |
-
"reward_std": 0.
|
| 87634 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87635 |
-
"rewards/accuracy_reward/std": 0.
|
| 87636 |
"rewards/format_reward/mean": 0.9921875,
|
| 87637 |
"rewards/format_reward/std": 0.08812850713729858,
|
| 87638 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87639 |
-
"rewards/tag_count_reward/std": 0.
|
| 87640 |
"step": 2921
|
| 87641 |
},
|
| 87642 |
{
|
|
@@ -87646,27 +87646,27 @@
|
|
| 87646 |
"clip_ratio/low_min": 0.0,
|
| 87647 |
"clip_ratio/region_mean": 0.0,
|
| 87648 |
"completions/clipped_ratio": -7.0,
|
| 87649 |
-
"completions/max_length":
|
| 87650 |
-
"completions/max_terminated_length":
|
| 87651 |
-
"completions/mean_length":
|
| 87652 |
-
"completions/mean_terminated_length":
|
| 87653 |
-
"completions/min_length":
|
| 87654 |
-
"completions/min_terminated_length":
|
| 87655 |
"epoch": 0.9975249637279167,
|
| 87656 |
-
"frac_reward_zero_std": 0.
|
| 87657 |
-
"grad_norm": 0.
|
| 87658 |
-
"kl": 0.
|
| 87659 |
"learning_rate": 5.748179854614399e-10,
|
| 87660 |
-
"loss": 0.
|
| 87661 |
-
"num_tokens":
|
| 87662 |
-
"reward": 2.
|
| 87663 |
-
"reward_std": 0.
|
| 87664 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87665 |
-
"rewards/accuracy_reward/std": 0.
|
| 87666 |
-
"rewards/format_reward/mean": 0.
|
| 87667 |
-
"rewards/format_reward/std": 0.
|
| 87668 |
"rewards/tag_count_reward/mean": 0.9970703125,
|
| 87669 |
-
"rewards/tag_count_reward/std": 0.
|
| 87670 |
"step": 2922
|
| 87671 |
},
|
| 87672 |
{
|
|
@@ -87676,27 +87676,27 @@
|
|
| 87676 |
"clip_ratio/low_min": 0.0,
|
| 87677 |
"clip_ratio/region_mean": 0.0,
|
| 87678 |
"completions/clipped_ratio": -7.0,
|
| 87679 |
-
"completions/max_length":
|
| 87680 |
-
"completions/max_terminated_length":
|
| 87681 |
-
"completions/mean_length":
|
| 87682 |
-
"completions/mean_terminated_length":
|
| 87683 |
-
"completions/min_length":
|
| 87684 |
-
"completions/min_terminated_length":
|
| 87685 |
"epoch": 0.9978663480413075,
|
| 87686 |
-
"frac_reward_zero_std": 0.
|
| 87687 |
-
"grad_norm": 0.
|
| 87688 |
-
"kl": 0.
|
| 87689 |
"learning_rate": 4.541780869138368e-10,
|
| 87690 |
-
"loss": 0.
|
| 87691 |
-
"num_tokens":
|
| 87692 |
-
"reward": 2.
|
| 87693 |
-
"reward_std": 0.
|
| 87694 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87695 |
-
"rewards/accuracy_reward/std": 0.
|
| 87696 |
"rewards/format_reward/mean": 0.990234375,
|
| 87697 |
"rewards/format_reward/std": 0.09843364357948303,
|
| 87698 |
-
"rewards/tag_count_reward/mean":
|
| 87699 |
-
"rewards/tag_count_reward/std": 0.
|
| 87700 |
"step": 2923
|
| 87701 |
},
|
| 87702 |
{
|
|
@@ -87706,27 +87706,27 @@
|
|
| 87706 |
"clip_ratio/low_min": 0.0,
|
| 87707 |
"clip_ratio/region_mean": 0.0,
|
| 87708 |
"completions/clipped_ratio": -7.0,
|
| 87709 |
-
"completions/max_length":
|
| 87710 |
-
"completions/max_terminated_length":
|
| 87711 |
-
"completions/mean_length": 797.
|
| 87712 |
-
"completions/mean_terminated_length": 797.
|
| 87713 |
-
"completions/min_length":
|
| 87714 |
-
"completions/min_terminated_length":
|
| 87715 |
"epoch": 0.9982077323546983,
|
| 87716 |
-
"frac_reward_zero_std": 0.
|
| 87717 |
-
"grad_norm": 0.
|
| 87718 |
-
"kl": 0.
|
| 87719 |
"learning_rate": 3.477307147192743e-10,
|
| 87720 |
-
"loss": 0.
|
| 87721 |
-
"num_tokens":
|
| 87722 |
-
"reward": 2.
|
| 87723 |
-
"reward_std": 0.
|
| 87724 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87725 |
-
"rewards/accuracy_reward/std": 0.
|
| 87726 |
"rewards/format_reward/mean": 0.98828125,
|
| 87727 |
"rewards/format_reward/std": 0.10772226005792618,
|
| 87728 |
"rewards/tag_count_reward/mean": 0.998046875,
|
| 87729 |
-
"rewards/tag_count_reward/std": 0.
|
| 87730 |
"step": 2924
|
| 87731 |
},
|
| 87732 |
{
|
|
@@ -87736,27 +87736,27 @@
|
|
| 87736 |
"clip_ratio/low_min": 0.0,
|
| 87737 |
"clip_ratio/region_mean": 0.0,
|
| 87738 |
"completions/clipped_ratio": -7.0,
|
| 87739 |
-
"completions/max_length":
|
| 87740 |
-
"completions/max_terminated_length":
|
| 87741 |
-
"completions/mean_length":
|
| 87742 |
-
"completions/mean_terminated_length":
|
| 87743 |
-
"completions/min_length":
|
| 87744 |
-
"completions/min_terminated_length":
|
| 87745 |
"epoch": 0.9985491166680891,
|
| 87746 |
-
"frac_reward_zero_std": 0.
|
| 87747 |
-
"grad_norm": 0.
|
| 87748 |
-
"kl": 0.
|
| 87749 |
"learning_rate": 2.5547601995912216e-10,
|
| 87750 |
-
"loss": 0.
|
| 87751 |
-
"num_tokens":
|
| 87752 |
-
"reward": 2.
|
| 87753 |
-
"reward_std": 0.
|
| 87754 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87755 |
-
"rewards/accuracy_reward/std": 0.
|
| 87756 |
-
"rewards/format_reward/mean": 0.
|
| 87757 |
-
"rewards/format_reward/std": 0.
|
| 87758 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87759 |
-
"rewards/tag_count_reward/std": 0.
|
| 87760 |
"step": 2925
|
| 87761 |
},
|
| 87762 |
{
|
|
@@ -87765,28 +87765,28 @@
|
|
| 87765 |
"clip_ratio/low_mean": 0.0,
|
| 87766 |
"clip_ratio/low_min": 0.0,
|
| 87767 |
"clip_ratio/region_mean": 0.0,
|
| 87768 |
-
"completions/clipped_ratio": -
|
| 87769 |
-
"completions/max_length":
|
| 87770 |
-
"completions/max_terminated_length":
|
| 87771 |
-
"completions/mean_length":
|
| 87772 |
-
"completions/mean_terminated_length":
|
| 87773 |
-
"completions/min_length":
|
| 87774 |
-
"completions/min_terminated_length":
|
| 87775 |
"epoch": 0.9988905009814799,
|
| 87776 |
"frac_reward_zero_std": 0.5,
|
| 87777 |
-
"grad_norm": 0.
|
| 87778 |
-
"kl": 0.
|
| 87779 |
"learning_rate": 1.7741413357197367e-10,
|
| 87780 |
-
"loss": 0.
|
| 87781 |
-
"num_tokens":
|
| 87782 |
-
"reward": 2.
|
| 87783 |
-
"reward_std": 0.
|
| 87784 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87785 |
-
"rewards/accuracy_reward/std": 0.
|
| 87786 |
-
"rewards/format_reward/mean": 0.
|
| 87787 |
-
"rewards/format_reward/std": 0.
|
| 87788 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87789 |
-
"rewards/tag_count_reward/std": 0.
|
| 87790 |
"step": 2926
|
| 87791 |
},
|
| 87792 |
{
|
|
@@ -87796,27 +87796,27 @@
|
|
| 87796 |
"clip_ratio/low_min": 0.0,
|
| 87797 |
"clip_ratio/region_mean": 0.0,
|
| 87798 |
"completions/clipped_ratio": -7.0,
|
| 87799 |
-
"completions/max_length":
|
| 87800 |
-
"completions/max_terminated_length":
|
| 87801 |
-
"completions/mean_length":
|
| 87802 |
-
"completions/mean_terminated_length":
|
| 87803 |
-
"completions/min_length":
|
| 87804 |
-
"completions/min_terminated_length":
|
| 87805 |
"epoch": 0.9992318852948707,
|
| 87806 |
-
"frac_reward_zero_std": 0.
|
| 87807 |
-
"grad_norm": 0.
|
| 87808 |
"kl": 0.0772705078125,
|
| 87809 |
"learning_rate": 1.1354516635364577e-10,
|
| 87810 |
-
"loss": 0.
|
| 87811 |
-
"num_tokens":
|
| 87812 |
-
"reward": 2.
|
| 87813 |
-
"reward_std": 0.
|
| 87814 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87815 |
-
"rewards/accuracy_reward/std": 0.
|
| 87816 |
-
"rewards/format_reward/mean": 0.
|
| 87817 |
-
"rewards/format_reward/std": 0.
|
| 87818 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87819 |
-
"rewards/tag_count_reward/std": 0.
|
| 87820 |
"step": 2927
|
| 87821 |
},
|
| 87822 |
{
|
|
@@ -87825,28 +87825,28 @@
|
|
| 87825 |
"clip_ratio/low_mean": 0.0,
|
| 87826 |
"clip_ratio/low_min": 0.0,
|
| 87827 |
"clip_ratio/region_mean": 0.0,
|
| 87828 |
-
"completions/clipped_ratio": -6.
|
| 87829 |
"completions/max_length": 2048.0,
|
| 87830 |
-
"completions/max_terminated_length":
|
| 87831 |
-
"completions/mean_length":
|
| 87832 |
-
"completions/mean_terminated_length":
|
| 87833 |
-
"completions/min_length":
|
| 87834 |
-
"completions/min_terminated_length":
|
| 87835 |
"epoch": 0.9995732696082615,
|
| 87836 |
-
"frac_reward_zero_std": 0.
|
| 87837 |
-
"grad_norm": 0.
|
| 87838 |
-
"kl": 0.
|
| 87839 |
"learning_rate": 6.386920895384841e-11,
|
| 87840 |
-
"loss": 0.
|
| 87841 |
-
"num_tokens":
|
| 87842 |
-
"reward": 2.
|
| 87843 |
-
"reward_std": 0.
|
| 87844 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87845 |
-
"rewards/accuracy_reward/std": 0.
|
| 87846 |
"rewards/format_reward/mean": 0.98046875,
|
| 87847 |
"rewards/format_reward/std": 0.1385180652141571,
|
| 87848 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87849 |
-
"rewards/tag_count_reward/std": 0.
|
| 87850 |
"step": 2928
|
| 87851 |
},
|
| 87852 |
{
|
|
@@ -87856,42 +87856,42 @@
|
|
| 87856 |
"clip_ratio/low_min": 0.0,
|
| 87857 |
"clip_ratio/region_mean": 0.0,
|
| 87858 |
"completions/clipped_ratio": -7.0,
|
| 87859 |
-
"completions/max_length":
|
| 87860 |
-
"completions/max_terminated_length":
|
| 87861 |
-
"completions/mean_length":
|
| 87862 |
-
"completions/mean_terminated_length":
|
| 87863 |
-
"completions/min_length":
|
| 87864 |
-
"completions/min_terminated_length":
|
| 87865 |
"epoch": 0.9999146539216524,
|
| 87866 |
-
"frac_reward_zero_std": 0.
|
| 87867 |
-
"grad_norm": 0.
|
| 87868 |
-
"kl": 0.
|
| 87869 |
"learning_rate": 2.838633187729478e-11,
|
| 87870 |
-
"loss": 0.
|
| 87871 |
-
"num_tokens":
|
| 87872 |
-
"reward": 2.
|
| 87873 |
-
"reward_std": 0.
|
| 87874 |
-
"rewards/accuracy_reward/mean": 0.
|
| 87875 |
-
"rewards/accuracy_reward/std": 0.
|
| 87876 |
-
"rewards/format_reward/mean": 0.
|
| 87877 |
-
"rewards/format_reward/std": 0.
|
| 87878 |
-
"rewards/tag_count_reward/mean": 0.
|
| 87879 |
-
"rewards/tag_count_reward/std": 0.
|
| 87880 |
"step": 2929
|
| 87881 |
},
|
| 87882 |
{
|
| 87883 |
"epoch": 0.9999146539216524,
|
| 87884 |
"step": 2929,
|
| 87885 |
"total_flos": 0.0,
|
| 87886 |
-
"train_loss": 0.
|
| 87887 |
-
"train_runtime":
|
| 87888 |
-
"train_samples_per_second":
|
| 87889 |
-
"train_steps_per_second": 2.
|
| 87890 |
}
|
| 87891 |
],
|
| 87892 |
"logging_steps": 1,
|
| 87893 |
"max_steps": 2930,
|
| 87894 |
-
"num_input_tokens_seen":
|
| 87895 |
"num_train_epochs": 1,
|
| 87896 |
"save_steps": 100,
|
| 87897 |
"stateful_callbacks": {
|
|
|
|
| 87024 |
"completions/min_terminated_length": 371.0,
|
| 87025 |
"epoch": 0.9903558931467099,
|
| 87026 |
"frac_reward_zero_std": 0.6875,
|
| 87027 |
+
"grad_norm": 0.09051816360814072,
|
| 87028 |
"kl": 0.07421875,
|
| 87029 |
"learning_rate": 6.386247842353755e-09,
|
| 87030 |
"loss": 0.0128,
|
|
|
|
| 87046 |
"clip_ratio/low_min": 0.0,
|
| 87047 |
"clip_ratio/region_mean": 0.0,
|
| 87048 |
"completions/clipped_ratio": -7.0,
|
| 87049 |
+
"completions/max_length": 1555.0,
|
| 87050 |
+
"completions/max_terminated_length": 1555.0,
|
| 87051 |
+
"completions/mean_length": 802.6328125,
|
| 87052 |
+
"completions/mean_terminated_length": 802.6328125,
|
| 87053 |
+
"completions/min_length": 182.0,
|
| 87054 |
+
"completions/min_terminated_length": 182.0,
|
| 87055 |
"epoch": 0.9906972774601007,
|
| 87056 |
"frac_reward_zero_std": 0.6875,
|
| 87057 |
+
"grad_norm": 0.12488894096087517,
|
| 87058 |
+
"kl": 0.0802001953125,
|
| 87059 |
"learning_rate": 5.967635461854304e-09,
|
| 87060 |
+
"loss": 0.0146,
|
| 87061 |
+
"num_tokens": 1391193434.0,
|
| 87062 |
+
"reward": 2.07763671875,
|
| 87063 |
+
"reward_std": 0.09719854593276978,
|
| 87064 |
+
"rewards/accuracy_reward/mean": 0.0859375,
|
| 87065 |
+
"rewards/accuracy_reward/std": 0.28054583072662354,
|
| 87066 |
"rewards/format_reward/mean": 0.9921875,
|
| 87067 |
"rewards/format_reward/std": 0.08812850713729858,
|
| 87068 |
"rewards/tag_count_reward/mean": 0.99951171875,
|
|
|
|
| 87075 |
"clip_ratio/low_mean": 0.0,
|
| 87076 |
"clip_ratio/low_min": 0.0,
|
| 87077 |
"clip_ratio/region_mean": 0.0,
|
| 87078 |
+
"completions/clipped_ratio": -6.984375,
|
| 87079 |
+
"completions/max_length": 2048.0,
|
| 87080 |
+
"completions/max_terminated_length": 2002.0,
|
| 87081 |
+
"completions/mean_length": 817.966796875,
|
| 87082 |
+
"completions/mean_terminated_length": 815.5596923828125,
|
| 87083 |
+
"completions/min_length": 143.0,
|
| 87084 |
+
"completions/min_terminated_length": 143.0,
|
| 87085 |
"epoch": 0.9910386617734915,
|
| 87086 |
"frac_reward_zero_std": 0.59375,
|
| 87087 |
+
"grad_norm": 0.10402330505958786,
|
| 87088 |
+
"kl": 0.0775146484375,
|
| 87089 |
"learning_rate": 5.563207782363078e-09,
|
| 87090 |
+
"loss": 0.0097,
|
| 87091 |
+
"num_tokens": 1391706153.0,
|
| 87092 |
+
"reward": 2.048828125,
|
| 87093 |
+
"reward_std": 0.13361230492591858,
|
| 87094 |
+
"rewards/accuracy_reward/mean": 0.0625,
|
| 87095 |
+
"rewards/accuracy_reward/std": 0.2422981858253479,
|
| 87096 |
+
"rewards/format_reward/mean": 0.990234375,
|
| 87097 |
+
"rewards/format_reward/std": 0.09843364357948303,
|
| 87098 |
+
"rewards/tag_count_reward/mean": 0.99609375,
|
| 87099 |
+
"rewards/tag_count_reward/std": 0.04119514673948288,
|
| 87100 |
"step": 2903
|
| 87101 |
},
|
| 87102 |
{
|
|
|
|
| 87106 |
"clip_ratio/low_min": 0.0,
|
| 87107 |
"clip_ratio/region_mean": 0.0,
|
| 87108 |
"completions/clipped_ratio": -7.0,
|
| 87109 |
+
"completions/max_length": 1835.0,
|
| 87110 |
+
"completions/max_terminated_length": 1835.0,
|
| 87111 |
+
"completions/mean_length": 742.251953125,
|
| 87112 |
+
"completions/mean_terminated_length": 742.251953125,
|
| 87113 |
+
"completions/min_length": 220.0,
|
| 87114 |
+
"completions/min_terminated_length": 220.0,
|
| 87115 |
"epoch": 0.9913800460868823,
|
| 87116 |
+
"frac_reward_zero_std": 0.53125,
|
| 87117 |
+
"grad_norm": 0.12250308762985097,
|
| 87118 |
+
"kl": 0.08251953125,
|
| 87119 |
"learning_rate": 5.172965377890915e-09,
|
| 87120 |
+
"loss": 0.0111,
|
| 87121 |
+
"num_tokens": 1392171098.0,
|
| 87122 |
+
"reward": 2.06396484375,
|
| 87123 |
+
"reward_std": 0.16945436596870422,
|
| 87124 |
+
"rewards/accuracy_reward/mean": 0.0859375,
|
| 87125 |
+
"rewards/accuracy_reward/std": 0.28054583072662354,
|
| 87126 |
+
"rewards/format_reward/mean": 0.98046875,
|
| 87127 |
+
"rewards/format_reward/std": 0.1385180652141571,
|
| 87128 |
+
"rewards/tag_count_reward/mean": 0.99755859375,
|
| 87129 |
+
"rewards/tag_count_reward/std": 0.03659820929169655,
|
| 87130 |
"step": 2904
|
| 87131 |
},
|
| 87132 |
{
|
|
|
|
| 87135 |
"clip_ratio/low_mean": 0.0,
|
| 87136 |
"clip_ratio/low_min": 0.0,
|
| 87137 |
"clip_ratio/region_mean": 0.0,
|
| 87138 |
+
"completions/clipped_ratio": -7.0,
|
| 87139 |
+
"completions/max_length": 1859.0,
|
| 87140 |
+
"completions/max_terminated_length": 1859.0,
|
| 87141 |
+
"completions/mean_length": 780.9609375,
|
| 87142 |
+
"completions/mean_terminated_length": 780.9609375,
|
| 87143 |
+
"completions/min_length": 309.0,
|
| 87144 |
+
"completions/min_terminated_length": 309.0,
|
| 87145 |
"epoch": 0.9917214304002732,
|
| 87146 |
+
"frac_reward_zero_std": 0.5625,
|
| 87147 |
+
"grad_norm": 0.09705909340956163,
|
| 87148 |
+
"kl": 0.072509765625,
|
| 87149 |
"learning_rate": 4.79690880231587e-09,
|
| 87150 |
+
"loss": 0.0189,
|
| 87151 |
+
"num_tokens": 1392645526.0,
|
| 87152 |
+
"reward": 2.05029296875,
|
| 87153 |
+
"reward_std": 0.15216603875160217,
|
| 87154 |
+
"rewards/accuracy_reward/mean": 0.064453125,
|
| 87155 |
+
"rewards/accuracy_reward/std": 0.24579854309558868,
|
| 87156 |
+
"rewards/format_reward/mean": 0.986328125,
|
| 87157 |
+
"rewards/format_reward/std": 0.1162383034825325,
|
| 87158 |
+
"rewards/tag_count_reward/mean": 0.99951171875,
|
| 87159 |
+
"rewards/tag_count_reward/std": 0.011048543266952038,
|
| 87160 |
"step": 2905
|
| 87161 |
},
|
| 87162 |
{
|
|
|
|
| 87166 |
"clip_ratio/low_min": 0.0,
|
| 87167 |
"clip_ratio/region_mean": 0.0,
|
| 87168 |
"completions/clipped_ratio": -7.0,
|
| 87169 |
+
"completions/max_length": 1650.0,
|
| 87170 |
+
"completions/max_terminated_length": 1650.0,
|
| 87171 |
+
"completions/mean_length": 745.669921875,
|
| 87172 |
+
"completions/mean_terminated_length": 745.669921875,
|
| 87173 |
+
"completions/min_length": 221.0,
|
| 87174 |
+
"completions/min_terminated_length": 221.0,
|
| 87175 |
"epoch": 0.9920628147136639,
|
| 87176 |
+
"frac_reward_zero_std": 0.53125,
|
| 87177 |
+
"grad_norm": 0.12680546398444922,
|
| 87178 |
+
"kl": 0.08154296875,
|
| 87179 |
"learning_rate": 4.435038589380991e-09,
|
| 87180 |
+
"loss": 0.0099,
|
| 87181 |
+
"num_tokens": 1393112653.0,
|
| 87182 |
+
"reward": 2.18994140625,
|
| 87183 |
+
"reward_std": 0.16484864056110382,
|
| 87184 |
+
"rewards/accuracy_reward/mean": 0.203125,
|
| 87185 |
+
"rewards/accuracy_reward/std": 0.4027182459831238,
|
| 87186 |
+
"rewards/format_reward/mean": 0.98828125,
|
| 87187 |
+
"rewards/format_reward/std": 0.10772226005792618,
|
| 87188 |
+
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87189 |
+
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87190 |
"step": 2906
|
| 87191 |
},
|
| 87192 |
{
|
|
|
|
| 87196 |
"clip_ratio/low_min": 0.0,
|
| 87197 |
"clip_ratio/region_mean": 0.0,
|
| 87198 |
"completions/clipped_ratio": -7.0,
|
| 87199 |
+
"completions/max_length": 1381.0,
|
| 87200 |
+
"completions/max_terminated_length": 1381.0,
|
| 87201 |
+
"completions/mean_length": 694.4453125,
|
| 87202 |
+
"completions/mean_terminated_length": 694.4453125,
|
| 87203 |
+
"completions/min_length": 176.0,
|
| 87204 |
+
"completions/min_terminated_length": 176.0,
|
| 87205 |
"epoch": 0.9924041990270547,
|
| 87206 |
+
"frac_reward_zero_std": 0.53125,
|
| 87207 |
+
"grad_norm": 0.11603922474180073,
|
| 87208 |
+
"kl": 0.0872802734375,
|
| 87209 |
"learning_rate": 4.087355252694325e-09,
|
| 87210 |
+
"loss": 0.0072,
|
| 87211 |
+
"num_tokens": 1393552081.0,
|
| 87212 |
+
"reward": 2.12353515625,
|
| 87213 |
+
"reward_std": 0.18750609457492828,
|
| 87214 |
+
"rewards/accuracy_reward/mean": 0.138671875,
|
| 87215 |
+
"rewards/accuracy_reward/std": 0.34594178199768066,
|
| 87216 |
+
"rewards/format_reward/mean": 0.986328125,
|
| 87217 |
+
"rewards/format_reward/std": 0.1162383034825325,
|
| 87218 |
+
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87219 |
+
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87220 |
"step": 2907
|
| 87221 |
},
|
| 87222 |
{
|
|
|
|
| 87225 |
"clip_ratio/low_mean": 0.0,
|
| 87226 |
"clip_ratio/low_min": 0.0,
|
| 87227 |
"clip_ratio/region_mean": 0.0,
|
| 87228 |
+
"completions/clipped_ratio": -6.953125,
|
| 87229 |
"completions/max_length": 2048.0,
|
| 87230 |
+
"completions/max_terminated_length": 1959.0,
|
| 87231 |
+
"completions/mean_length": 825.453125,
|
| 87232 |
+
"completions/mean_terminated_length": 818.24755859375,
|
| 87233 |
+
"completions/min_length": 169.0,
|
| 87234 |
+
"completions/min_terminated_length": 169.0,
|
| 87235 |
"epoch": 0.9927455833404455,
|
| 87236 |
+
"frac_reward_zero_std": 0.375,
|
| 87237 |
+
"grad_norm": 0.12487904460610164,
|
| 87238 |
+
"kl": 0.077392578125,
|
| 87239 |
"learning_rate": 3.753859285730022e-09,
|
| 87240 |
+
"loss": 0.0293,
|
| 87241 |
+
"num_tokens": 1394054505.0,
|
| 87242 |
+
"reward": 2.09375,
|
| 87243 |
+
"reward_std": 0.20864024758338928,
|
| 87244 |
+
"rewards/accuracy_reward/mean": 0.119140625,
|
| 87245 |
+
"rewards/accuracy_reward/std": 0.32427072525024414,
|
| 87246 |
+
"rewards/format_reward/mean": 0.978515625,
|
| 87247 |
+
"rewards/format_reward/std": 0.14513419568538666,
|
| 87248 |
+
"rewards/tag_count_reward/mean": 0.99609375,
|
| 87249 |
+
"rewards/tag_count_reward/std": 0.031035220250487328,
|
| 87250 |
"step": 2908
|
| 87251 |
},
|
| 87252 |
{
|
|
|
|
| 87255 |
"clip_ratio/low_mean": 0.0,
|
| 87256 |
"clip_ratio/low_min": 0.0,
|
| 87257 |
"clip_ratio/region_mean": 0.0,
|
| 87258 |
+
"completions/clipped_ratio": -7.0,
|
| 87259 |
+
"completions/max_length": 1732.0,
|
| 87260 |
+
"completions/max_terminated_length": 1732.0,
|
| 87261 |
+
"completions/mean_length": 787.2265625,
|
| 87262 |
+
"completions/mean_terminated_length": 787.2265625,
|
| 87263 |
+
"completions/min_length": 254.0,
|
| 87264 |
+
"completions/min_terminated_length": 254.0,
|
| 87265 |
"epoch": 0.9930869676538363,
|
| 87266 |
+
"frac_reward_zero_std": 0.5,
|
| 87267 |
+
"grad_norm": 0.11516269657952301,
|
| 87268 |
+
"kl": 0.0797119140625,
|
| 87269 |
"learning_rate": 3.4345511618238957e-09,
|
| 87270 |
+
"loss": 0.0174,
|
| 87271 |
+
"num_tokens": 1394543709.0,
|
| 87272 |
+
"reward": 2.09228515625,
|
| 87273 |
+
"reward_std": 0.17656370997428894,
|
| 87274 |
+
"rewards/accuracy_reward/mean": 0.115234375,
|
| 87275 |
+
"rewards/accuracy_reward/std": 0.3196168541908264,
|
| 87276 |
+
"rewards/format_reward/mean": 0.98046875,
|
| 87277 |
+
"rewards/format_reward/std": 0.1385180652141571,
|
| 87278 |
+
"rewards/tag_count_reward/mean": 0.99658203125,
|
| 87279 |
+
"rewards/tag_count_reward/std": 0.03972800448536873,
|
| 87280 |
"step": 2909
|
| 87281 |
},
|
| 87282 |
{
|
|
|
|
| 87285 |
"clip_ratio/low_mean": 0.0,
|
| 87286 |
"clip_ratio/low_min": 0.0,
|
| 87287 |
"clip_ratio/region_mean": 0.0,
|
| 87288 |
+
"completions/clipped_ratio": -6.984375,
|
| 87289 |
+
"completions/max_length": 1820.0,
|
| 87290 |
+
"completions/max_terminated_length": 1820.0,
|
| 87291 |
+
"completions/mean_length": 778.107421875,
|
| 87292 |
+
"completions/mean_terminated_length": 776.7182006835938,
|
| 87293 |
+
"completions/min_length": 219.0,
|
| 87294 |
+
"completions/min_terminated_length": 219.0,
|
| 87295 |
"epoch": 0.9934283519672271,
|
| 87296 |
+
"frac_reward_zero_std": 0.4375,
|
| 87297 |
+
"grad_norm": 0.3443649766769348,
|
| 87298 |
+
"kl": 0.0982666015625,
|
| 87299 |
"learning_rate": 3.129431334175648e-09,
|
| 87300 |
+
"loss": 0.0329,
|
| 87301 |
+
"num_tokens": 1395033684.0,
|
| 87302 |
+
"reward": 2.11474609375,
|
| 87303 |
+
"reward_std": 0.20991787314414978,
|
| 87304 |
+
"rewards/accuracy_reward/mean": 0.146484375,
|
| 87305 |
+
"rewards/accuracy_reward/std": 0.35393697023391724,
|
| 87306 |
+
"rewards/format_reward/mean": 0.97265625,
|
| 87307 |
+
"rewards/format_reward/std": 0.16324250400066376,
|
| 87308 |
+
"rewards/tag_count_reward/mean": 0.99560546875,
|
| 87309 |
+
"rewards/tag_count_reward/std": 0.04260620102286339,
|
| 87310 |
"step": 2910
|
| 87311 |
},
|
| 87312 |
{
|
|
|
|
| 87315 |
"clip_ratio/low_mean": 0.0,
|
| 87316 |
"clip_ratio/low_min": 0.0,
|
| 87317 |
"clip_ratio/region_mean": 0.0,
|
| 87318 |
+
"completions/clipped_ratio": -7.0,
|
| 87319 |
+
"completions/max_length": 1908.0,
|
| 87320 |
+
"completions/max_terminated_length": 1908.0,
|
| 87321 |
+
"completions/mean_length": 782.6015625,
|
| 87322 |
+
"completions/mean_terminated_length": 782.6015625,
|
| 87323 |
+
"completions/min_length": 188.0,
|
| 87324 |
+
"completions/min_terminated_length": 188.0,
|
| 87325 |
"epoch": 0.9937697362806179,
|
| 87326 |
+
"frac_reward_zero_std": 0.53125,
|
| 87327 |
+
"grad_norm": 0.12301114676381447,
|
| 87328 |
+
"kl": 0.0780029296875,
|
| 87329 |
"learning_rate": 2.8385002358466417e-09,
|
| 87330 |
+
"loss": 0.0053,
|
| 87331 |
+
"num_tokens": 1395524152.0,
|
| 87332 |
+
"reward": 2.103515625,
|
| 87333 |
+
"reward_std": 0.171902135014534,
|
| 87334 |
+
"rewards/accuracy_reward/mean": 0.12096773833036423,
|
| 87335 |
+
"rewards/accuracy_reward/std": 0.32641899585723877,
|
| 87336 |
+
"rewards/format_reward/mean": 0.98828125,
|
| 87337 |
+
"rewards/format_reward/std": 0.10772226005792618,
|
| 87338 |
"rewards/tag_count_reward/mean": 0.998046875,
|
| 87339 |
"rewards/tag_count_reward/std": 0.022032126784324646,
|
| 87340 |
"step": 2911
|
|
|
|
| 87346 |
"clip_ratio/low_min": 0.0,
|
| 87347 |
"clip_ratio/region_mean": 0.0,
|
| 87348 |
"completions/clipped_ratio": -7.0,
|
| 87349 |
+
"completions/max_length": 1561.0,
|
| 87350 |
+
"completions/max_terminated_length": 1561.0,
|
| 87351 |
+
"completions/mean_length": 714.876953125,
|
| 87352 |
+
"completions/mean_terminated_length": 714.876953125,
|
| 87353 |
+
"completions/min_length": 184.0,
|
| 87354 |
+
"completions/min_terminated_length": 184.0,
|
| 87355 |
"epoch": 0.9941111205940087,
|
| 87356 |
+
"frac_reward_zero_std": 0.53125,
|
| 87357 |
+
"grad_norm": 0.11373851773465948,
|
| 87358 |
+
"kl": 0.08544921875,
|
| 87359 |
"learning_rate": 2.5617582797610174e-09,
|
| 87360 |
+
"loss": 0.026,
|
| 87361 |
+
"num_tokens": 1395971001.0,
|
| 87362 |
+
"reward": 2.08349609375,
|
| 87363 |
+
"reward_std": 0.17116190493106842,
|
| 87364 |
+
"rewards/accuracy_reward/mean": 0.1015625,
|
| 87365 |
+
"rewards/accuracy_reward/std": 0.30236753821372986,
|
| 87366 |
+
"rewards/format_reward/mean": 0.984375,
|
| 87367 |
+
"rewards/format_reward/std": 0.12414088100194931,
|
| 87368 |
+
"rewards/tag_count_reward/mean": 0.99755859375,
|
| 87369 |
+
"rewards/tag_count_reward/std": 0.024608410894870758,
|
| 87370 |
"step": 2912
|
| 87371 |
},
|
| 87372 |
{
|
|
|
|
| 87376 |
"clip_ratio/low_min": 0.0,
|
| 87377 |
"clip_ratio/region_mean": 0.0,
|
| 87378 |
"completions/clipped_ratio": -7.0,
|
| 87379 |
+
"completions/max_length": 1902.0,
|
| 87380 |
+
"completions/max_terminated_length": 1902.0,
|
| 87381 |
+
"completions/mean_length": 794.6015625,
|
| 87382 |
+
"completions/mean_terminated_length": 794.6015625,
|
| 87383 |
+
"completions/min_length": 281.0,
|
| 87384 |
+
"completions/min_terminated_length": 281.0,
|
| 87385 |
"epoch": 0.9944525049073996,
|
| 87386 |
+
"frac_reward_zero_std": 0.5625,
|
| 87387 |
+
"grad_norm": 0.11380928184475643,
|
| 87388 |
+
"kl": 0.0804443359375,
|
| 87389 |
"learning_rate": 2.299205858702358e-09,
|
| 87390 |
+
"loss": 0.0148,
|
| 87391 |
+
"num_tokens": 1396454045.0,
|
| 87392 |
+
"reward": 2.11669921875,
|
| 87393 |
+
"reward_std": 0.15615960955619812,
|
| 87394 |
+
"rewards/accuracy_reward/mean": 0.13709677755832672,
|
| 87395 |
+
"rewards/accuracy_reward/std": 0.34429675340652466,
|
| 87396 |
+
"rewards/format_reward/mean": 0.984375,
|
| 87397 |
+
"rewards/format_reward/std": 0.12414088100194931,
|
| 87398 |
+
"rewards/tag_count_reward/mean": 0.99951171875,
|
| 87399 |
+
"rewards/tag_count_reward/std": 0.011048543266952038,
|
| 87400 |
"step": 2913
|
| 87401 |
},
|
| 87402 |
{
|
|
|
|
| 87405 |
"clip_ratio/low_mean": 0.0,
|
| 87406 |
"clip_ratio/low_min": 0.0,
|
| 87407 |
"clip_ratio/region_mean": 0.0,
|
| 87408 |
+
"completions/clipped_ratio": -6.984375,
|
| 87409 |
+
"completions/max_length": 2048.0,
|
| 87410 |
+
"completions/max_terminated_length": 2010.0,
|
| 87411 |
+
"completions/mean_length": 835.59765625,
|
| 87412 |
+
"completions/mean_terminated_length": 833.2250366210938,
|
| 87413 |
+
"completions/min_length": 245.0,
|
| 87414 |
+
"completions/min_terminated_length": 245.0,
|
| 87415 |
"epoch": 0.9947938892207903,
|
| 87416 |
+
"frac_reward_zero_std": 0.5,
|
| 87417 |
+
"grad_norm": 0.10149774742284243,
|
| 87418 |
+
"kl": 0.074462890625,
|
| 87419 |
"learning_rate": 2.0508433453170218e-09,
|
| 87420 |
+
"loss": 0.0245,
|
| 87421 |
+
"num_tokens": 1396964895.0,
|
| 87422 |
+
"reward": 2.08154296875,
|
| 87423 |
+
"reward_std": 0.1786787509918213,
|
| 87424 |
+
"rewards/accuracy_reward/mean": 0.10546875,
|
| 87425 |
+
"rewards/accuracy_reward/std": 0.3074568510055542,
|
| 87426 |
+
"rewards/format_reward/mean": 0.98046875,
|
| 87427 |
+
"rewards/format_reward/std": 0.1385180652141571,
|
| 87428 |
+
"rewards/tag_count_reward/mean": 0.99560546875,
|
| 87429 |
+
"rewards/tag_count_reward/std": 0.050489041954278946,
|
| 87430 |
"step": 2914
|
| 87431 |
},
|
| 87432 |
{
|
|
|
|
| 87436 |
"clip_ratio/low_min": 0.0,
|
| 87437 |
"clip_ratio/region_mean": 0.0,
|
| 87438 |
"completions/clipped_ratio": -7.0,
|
| 87439 |
+
"completions/max_length": 1628.0,
|
| 87440 |
+
"completions/max_terminated_length": 1628.0,
|
| 87441 |
+
"completions/mean_length": 808.962890625,
|
| 87442 |
+
"completions/mean_terminated_length": 808.962890625,
|
| 87443 |
+
"completions/min_length": 257.0,
|
| 87444 |
+
"completions/min_terminated_length": 257.0,
|
| 87445 |
"epoch": 0.9951352735341811,
|
| 87446 |
+
"frac_reward_zero_std": 0.625,
|
| 87447 |
+
"grad_norm": 0.09026605662138586,
|
| 87448 |
+
"kl": 0.0748291015625,
|
| 87449 |
"learning_rate": 1.8166710921097008e-09,
|
| 87450 |
+
"loss": 0.0103,
|
| 87451 |
+
"num_tokens": 1397456716.0,
|
| 87452 |
+
"reward": 2.08642578125,
|
| 87453 |
+
"reward_std": 0.14233165979385376,
|
| 87454 |
+
"rewards/accuracy_reward/mean": 0.099609375,
|
| 87455 |
+
"rewards/accuracy_reward/std": 0.29977133870124817,
|
| 87456 |
+
"rewards/format_reward/mean": 0.98828125,
|
| 87457 |
+
"rewards/format_reward/std": 0.10772226005792618,
|
| 87458 |
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87459 |
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87460 |
"step": 2915
|
|
|
|
| 87467 |
"clip_ratio/region_mean": 0.0,
|
| 87468 |
"completions/clipped_ratio": -6.984375,
|
| 87469 |
"completions/max_length": 2048.0,
|
| 87470 |
+
"completions/max_terminated_length": 1839.0,
|
| 87471 |
+
"completions/mean_length": 838.197265625,
|
| 87472 |
+
"completions/mean_terminated_length": 835.8297119140625,
|
| 87473 |
+
"completions/min_length": 341.0,
|
| 87474 |
+
"completions/min_terminated_length": 341.0,
|
| 87475 |
"epoch": 0.9954766578475719,
|
| 87476 |
"frac_reward_zero_std": 0.59375,
|
| 87477 |
+
"grad_norm": 0.0990546564513383,
|
| 87478 |
+
"kl": 0.0794677734375,
|
| 87479 |
"learning_rate": 1.5966894314456416e-09,
|
| 87480 |
+
"loss": 0.0194,
|
| 87481 |
+
"num_tokens": 1397977025.0,
|
| 87482 |
+
"reward": 2.06494140625,
|
| 87483 |
+
"reward_std": 0.1546308547258377,
|
| 87484 |
+
"rewards/accuracy_reward/mean": 0.080078125,
|
| 87485 |
+
"rewards/accuracy_reward/std": 0.271679550409317,
|
| 87486 |
+
"rewards/format_reward/mean": 0.986328125,
|
| 87487 |
+
"rewards/format_reward/std": 0.1162383034825325,
|
| 87488 |
+
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87489 |
+
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87490 |
"step": 2916
|
| 87491 |
},
|
| 87492 |
{
|
|
|
|
| 87496 |
"clip_ratio/low_min": 0.0,
|
| 87497 |
"clip_ratio/region_mean": 0.0,
|
| 87498 |
"completions/clipped_ratio": -7.0,
|
| 87499 |
+
"completions/max_length": 1592.0,
|
| 87500 |
+
"completions/max_terminated_length": 1592.0,
|
| 87501 |
+
"completions/mean_length": 735.02734375,
|
| 87502 |
+
"completions/mean_terminated_length": 735.02734375,
|
| 87503 |
+
"completions/min_length": 160.0,
|
| 87504 |
+
"completions/min_terminated_length": 160.0,
|
| 87505 |
"epoch": 0.9958180421609627,
|
| 87506 |
+
"frac_reward_zero_std": 0.6875,
|
| 87507 |
+
"grad_norm": 0.09306644198204554,
|
| 87508 |
+
"kl": 0.08642578125,
|
| 87509 |
"learning_rate": 1.3908986755473142e-09,
|
| 87510 |
+
"loss": 0.0144,
|
| 87511 |
+
"num_tokens": 1398435407.0,
|
| 87512 |
+
"reward": 2.0849609375,
|
| 87513 |
+
"reward_std": 0.1204758882522583,
|
| 87514 |
+
"rewards/accuracy_reward/mean": 0.099609375,
|
| 87515 |
+
"rewards/accuracy_reward/std": 0.29977133870124817,
|
| 87516 |
+
"rewards/format_reward/mean": 0.986328125,
|
| 87517 |
+
"rewards/format_reward/std": 0.1162383034825325,
|
| 87518 |
"rewards/tag_count_reward/mean": 0.9990234375,
|
| 87519 |
"rewards/tag_count_reward/std": 0.015609703958034515,
|
| 87520 |
"step": 2917
|
|
|
|
| 87526 |
"clip_ratio/low_min": 0.0,
|
| 87527 |
"clip_ratio/region_mean": 0.0,
|
| 87528 |
"completions/clipped_ratio": -7.0,
|
| 87529 |
+
"completions/max_length": 1498.0,
|
| 87530 |
+
"completions/max_terminated_length": 1498.0,
|
| 87531 |
+
"completions/mean_length": 774.666015625,
|
| 87532 |
+
"completions/mean_terminated_length": 774.666015625,
|
| 87533 |
+
"completions/min_length": 360.0,
|
| 87534 |
+
"completions/min_terminated_length": 360.0,
|
| 87535 |
"epoch": 0.9961594264743535,
|
| 87536 |
+
"frac_reward_zero_std": 0.5625,
|
| 87537 |
+
"grad_norm": 0.11717990565862879,
|
| 87538 |
+
"kl": 0.08349609375,
|
| 87539 |
"learning_rate": 1.199299116497743e-09,
|
| 87540 |
+
"loss": 0.0232,
|
| 87541 |
+
"num_tokens": 1398909236.0,
|
| 87542 |
+
"reward": 2.056640625,
|
| 87543 |
+
"reward_std": 0.13060790300369263,
|
| 87544 |
+
"rewards/accuracy_reward/mean": 0.0703125,
|
| 87545 |
+
"rewards/accuracy_reward/std": 0.25592297315597534,
|
| 87546 |
+
"rewards/format_reward/mean": 0.98828125,
|
| 87547 |
+
"rewards/format_reward/std": 0.10772226005792618,
|
| 87548 |
+
"rewards/tag_count_reward/mean": 0.998046875,
|
| 87549 |
+
"rewards/tag_count_reward/std": 0.022032126784324646,
|
| 87550 |
"step": 2918
|
| 87551 |
},
|
| 87552 |
{
|
|
|
|
| 87556 |
"clip_ratio/low_min": 0.0,
|
| 87557 |
"clip_ratio/region_mean": 0.0,
|
| 87558 |
"completions/clipped_ratio": -7.0,
|
| 87559 |
+
"completions/max_length": 1840.0,
|
| 87560 |
+
"completions/max_terminated_length": 1840.0,
|
| 87561 |
+
"completions/mean_length": 794.30078125,
|
| 87562 |
+
"completions/mean_terminated_length": 794.30078125,
|
| 87563 |
+
"completions/min_length": 239.0,
|
| 87564 |
+
"completions/min_terminated_length": 239.0,
|
| 87565 |
"epoch": 0.9965008107877443,
|
| 87566 |
+
"frac_reward_zero_std": 0.375,
|
| 87567 |
+
"grad_norm": 0.11873912143398956,
|
| 87568 |
+
"kl": 0.0755615234375,
|
| 87569 |
"learning_rate": 1.0218910262371762e-09,
|
| 87570 |
+
"loss": 0.0179,
|
| 87571 |
+
"num_tokens": 1399398206.0,
|
| 87572 |
+
"reward": 2.14208984375,
|
| 87573 |
+
"reward_std": 0.23764190077781677,
|
| 87574 |
+
"rewards/accuracy_reward/mean": 0.162109375,
|
| 87575 |
+
"rewards/accuracy_reward/std": 0.3689115643501282,
|
| 87576 |
+
"rewards/format_reward/mean": 0.982421875,
|
| 87577 |
+
"rewards/format_reward/std": 0.13154059648513794,
|
| 87578 |
+
"rewards/tag_count_reward/mean": 0.99755859375,
|
| 87579 |
+
"rewards/tag_count_reward/std": 0.03659820929169655,
|
| 87580 |
"step": 2919
|
| 87581 |
},
|
| 87582 |
{
|
|
|
|
| 87586 |
"clip_ratio/low_min": 0.0,
|
| 87587 |
"clip_ratio/region_mean": 0.0,
|
| 87588 |
"completions/clipped_ratio": -7.0,
|
| 87589 |
+
"completions/max_length": 2016.0,
|
| 87590 |
+
"completions/max_terminated_length": 2016.0,
|
| 87591 |
+
"completions/mean_length": 833.984375,
|
| 87592 |
+
"completions/mean_terminated_length": 833.984375,
|
| 87593 |
+
"completions/min_length": 292.0,
|
| 87594 |
+
"completions/min_terminated_length": 292.0,
|
| 87595 |
"epoch": 0.9968421951011351,
|
| 87596 |
"frac_reward_zero_std": 0.53125,
|
| 87597 |
+
"grad_norm": 0.10545064088363902,
|
| 87598 |
+
"kl": 0.0784912109375,
|
| 87599 |
"learning_rate": 8.586746565641957e-10,
|
| 87600 |
+
"loss": 0.0003,
|
| 87601 |
+
"num_tokens": 1399921494.0,
|
| 87602 |
+
"reward": 2.12548828125,
|
| 87603 |
+
"reward_std": 0.1864640712738037,
|
| 87604 |
+
"rewards/accuracy_reward/mean": 0.13671875,
|
| 87605 |
+
"rewards/accuracy_reward/std": 0.3438861668109894,
|
| 87606 |
+
"rewards/format_reward/mean": 0.990234375,
|
| 87607 |
+
"rewards/format_reward/std": 0.09843364357948303,
|
| 87608 |
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87609 |
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87610 |
"step": 2920
|
|
|
|
| 87616 |
"clip_ratio/low_min": 0.0,
|
| 87617 |
"clip_ratio/region_mean": 0.0,
|
| 87618 |
"completions/clipped_ratio": -7.0,
|
| 87619 |
+
"completions/max_length": 1390.0,
|
| 87620 |
+
"completions/max_terminated_length": 1390.0,
|
| 87621 |
+
"completions/mean_length": 740.46484375,
|
| 87622 |
+
"completions/mean_terminated_length": 740.46484375,
|
| 87623 |
+
"completions/min_length": 262.0,
|
| 87624 |
+
"completions/min_terminated_length": 262.0,
|
| 87625 |
"epoch": 0.997183579414526,
|
| 87626 |
"frac_reward_zero_std": 0.59375,
|
| 87627 |
+
"grad_norm": 0.10125262138426178,
|
| 87628 |
+
"kl": 0.0797119140625,
|
| 87629 |
"learning_rate": 7.096502391346072e-10,
|
| 87630 |
+
"loss": 0.0078,
|
| 87631 |
+
"num_tokens": 1400381748.0,
|
| 87632 |
+
"reward": 2.06103515625,
|
| 87633 |
+
"reward_std": 0.15187877416610718,
|
| 87634 |
+
"rewards/accuracy_reward/mean": 0.0703125,
|
| 87635 |
+
"rewards/accuracy_reward/std": 0.25592297315597534,
|
| 87636 |
"rewards/format_reward/mean": 0.9921875,
|
| 87637 |
"rewards/format_reward/std": 0.08812850713729858,
|
| 87638 |
+
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87639 |
+
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87640 |
"step": 2921
|
| 87641 |
},
|
| 87642 |
{
|
|
|
|
| 87646 |
"clip_ratio/low_min": 0.0,
|
| 87647 |
"clip_ratio/region_mean": 0.0,
|
| 87648 |
"completions/clipped_ratio": -7.0,
|
| 87649 |
+
"completions/max_length": 1748.0,
|
| 87650 |
+
"completions/max_terminated_length": 1748.0,
|
| 87651 |
+
"completions/mean_length": 813.0234375,
|
| 87652 |
+
"completions/mean_terminated_length": 813.0234375,
|
| 87653 |
+
"completions/min_length": 291.0,
|
| 87654 |
+
"completions/min_terminated_length": 291.0,
|
| 87655 |
"epoch": 0.9975249637279167,
|
| 87656 |
+
"frac_reward_zero_std": 0.5625,
|
| 87657 |
+
"grad_norm": 0.09890654551815842,
|
| 87658 |
+
"kl": 0.0748291015625,
|
| 87659 |
"learning_rate": 5.748179854614399e-10,
|
| 87660 |
+
"loss": 0.0172,
|
| 87661 |
+
"num_tokens": 1400881472.0,
|
| 87662 |
+
"reward": 2.0908203125,
|
| 87663 |
+
"reward_std": 0.1693429946899414,
|
| 87664 |
+
"rewards/accuracy_reward/mean": 0.107421875,
|
| 87665 |
+
"rewards/accuracy_reward/std": 0.30995169281959534,
|
| 87666 |
+
"rewards/format_reward/mean": 0.986328125,
|
| 87667 |
+
"rewards/format_reward/std": 0.1162383034825325,
|
| 87668 |
"rewards/tag_count_reward/mean": 0.9970703125,
|
| 87669 |
+
"rewards/tag_count_reward/std": 0.038198307156562805,
|
| 87670 |
"step": 2922
|
| 87671 |
},
|
| 87672 |
{
|
|
|
|
| 87676 |
"clip_ratio/low_min": 0.0,
|
| 87677 |
"clip_ratio/region_mean": 0.0,
|
| 87678 |
"completions/clipped_ratio": -7.0,
|
| 87679 |
+
"completions/max_length": 1460.0,
|
| 87680 |
+
"completions/max_terminated_length": 1460.0,
|
| 87681 |
+
"completions/mean_length": 737.50390625,
|
| 87682 |
+
"completions/mean_terminated_length": 737.50390625,
|
| 87683 |
+
"completions/min_length": 209.0,
|
| 87684 |
+
"completions/min_terminated_length": 209.0,
|
| 87685 |
"epoch": 0.9978663480413075,
|
| 87686 |
+
"frac_reward_zero_std": 0.625,
|
| 87687 |
+
"grad_norm": 0.10499893723997014,
|
| 87688 |
+
"kl": 0.081787109375,
|
| 87689 |
"learning_rate": 4.541780869138368e-10,
|
| 87690 |
+
"loss": 0.0114,
|
| 87691 |
+
"num_tokens": 1401334930.0,
|
| 87692 |
+
"reward": 2.06103515625,
|
| 87693 |
+
"reward_std": 0.1196913868188858,
|
| 87694 |
+
"rewards/accuracy_reward/mean": 0.072265625,
|
| 87695 |
+
"rewards/accuracy_reward/std": 0.2591804563999176,
|
| 87696 |
"rewards/format_reward/mean": 0.990234375,
|
| 87697 |
"rewards/format_reward/std": 0.09843364357948303,
|
| 87698 |
+
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87699 |
+
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87700 |
"step": 2923
|
| 87701 |
},
|
| 87702 |
{
|
|
|
|
| 87706 |
"clip_ratio/low_min": 0.0,
|
| 87707 |
"clip_ratio/region_mean": 0.0,
|
| 87708 |
"completions/clipped_ratio": -7.0,
|
| 87709 |
+
"completions/max_length": 1754.0,
|
| 87710 |
+
"completions/max_terminated_length": 1754.0,
|
| 87711 |
+
"completions/mean_length": 797.484375,
|
| 87712 |
+
"completions/mean_terminated_length": 797.484375,
|
| 87713 |
+
"completions/min_length": 212.0,
|
| 87714 |
+
"completions/min_terminated_length": 212.0,
|
| 87715 |
"epoch": 0.9982077323546983,
|
| 87716 |
+
"frac_reward_zero_std": 0.5,
|
| 87717 |
+
"grad_norm": 0.10729524502444683,
|
| 87718 |
+
"kl": 0.080078125,
|
| 87719 |
"learning_rate": 3.477307147192743e-10,
|
| 87720 |
+
"loss": 0.016,
|
| 87721 |
+
"num_tokens": 1401819642.0,
|
| 87722 |
+
"reward": 2.13671875,
|
| 87723 |
+
"reward_std": 0.1730148196220398,
|
| 87724 |
+
"rewards/accuracy_reward/mean": 0.150390625,
|
| 87725 |
+
"rewards/accuracy_reward/std": 0.35780346393585205,
|
| 87726 |
"rewards/format_reward/mean": 0.98828125,
|
| 87727 |
"rewards/format_reward/std": 0.10772226005792618,
|
| 87728 |
"rewards/tag_count_reward/mean": 0.998046875,
|
| 87729 |
+
"rewards/tag_count_reward/std": 0.022032126784324646,
|
| 87730 |
"step": 2924
|
| 87731 |
},
|
| 87732 |
{
|
|
|
|
| 87736 |
"clip_ratio/low_min": 0.0,
|
| 87737 |
"clip_ratio/region_mean": 0.0,
|
| 87738 |
"completions/clipped_ratio": -7.0,
|
| 87739 |
+
"completions/max_length": 1937.0,
|
| 87740 |
+
"completions/max_terminated_length": 1937.0,
|
| 87741 |
+
"completions/mean_length": 811.166015625,
|
| 87742 |
+
"completions/mean_terminated_length": 811.166015625,
|
| 87743 |
+
"completions/min_length": 327.0,
|
| 87744 |
+
"completions/min_terminated_length": 327.0,
|
| 87745 |
"epoch": 0.9985491166680891,
|
| 87746 |
+
"frac_reward_zero_std": 0.5,
|
| 87747 |
+
"grad_norm": 0.10537359711068275,
|
| 87748 |
+
"kl": 0.081298828125,
|
| 87749 |
"learning_rate": 2.5547601995912216e-10,
|
| 87750 |
+
"loss": 0.0178,
|
| 87751 |
+
"num_tokens": 1402320623.0,
|
| 87752 |
+
"reward": 2.14892578125,
|
| 87753 |
+
"reward_std": 0.21511411666870117,
|
| 87754 |
+
"rewards/accuracy_reward/mean": 0.162109375,
|
| 87755 |
+
"rewards/accuracy_reward/std": 0.3689115643501282,
|
| 87756 |
+
"rewards/format_reward/mean": 0.98828125,
|
| 87757 |
+
"rewards/format_reward/std": 0.10772226005792618,
|
| 87758 |
+
"rewards/tag_count_reward/mean": 0.99853515625,
|
| 87759 |
+
"rewards/tag_count_reward/std": 0.019099153578281403,
|
| 87760 |
"step": 2925
|
| 87761 |
},
|
| 87762 |
{
|
|
|
|
| 87765 |
"clip_ratio/low_mean": 0.0,
|
| 87766 |
"clip_ratio/low_min": 0.0,
|
| 87767 |
"clip_ratio/region_mean": 0.0,
|
| 87768 |
+
"completions/clipped_ratio": -7.0,
|
| 87769 |
+
"completions/max_length": 1430.0,
|
| 87770 |
+
"completions/max_terminated_length": 1430.0,
|
| 87771 |
+
"completions/mean_length": 703.416015625,
|
| 87772 |
+
"completions/mean_terminated_length": 703.416015625,
|
| 87773 |
+
"completions/min_length": 203.0,
|
| 87774 |
+
"completions/min_terminated_length": 203.0,
|
| 87775 |
"epoch": 0.9988905009814799,
|
| 87776 |
"frac_reward_zero_std": 0.5,
|
| 87777 |
+
"grad_norm": 0.1300151179262908,
|
| 87778 |
+
"kl": 0.0833740234375,
|
| 87779 |
"learning_rate": 1.7741413357197367e-10,
|
| 87780 |
+
"loss": 0.0125,
|
| 87781 |
+
"num_tokens": 1402760996.0,
|
| 87782 |
+
"reward": 2.080078125,
|
| 87783 |
+
"reward_std": 0.17316466569900513,
|
| 87784 |
+
"rewards/accuracy_reward/mean": 0.099609375,
|
| 87785 |
+
"rewards/accuracy_reward/std": 0.29977133870124817,
|
| 87786 |
+
"rewards/format_reward/mean": 0.982421875,
|
| 87787 |
+
"rewards/format_reward/std": 0.13154059648513794,
|
| 87788 |
+
"rewards/tag_count_reward/mean": 0.998046875,
|
| 87789 |
+
"rewards/tag_count_reward/std": 0.022032126784324646,
|
| 87790 |
"step": 2926
|
| 87791 |
},
|
| 87792 |
{
|
|
|
|
| 87796 |
"clip_ratio/low_min": 0.0,
|
| 87797 |
"clip_ratio/region_mean": 0.0,
|
| 87798 |
"completions/clipped_ratio": -7.0,
|
| 87799 |
+
"completions/max_length": 1723.0,
|
| 87800 |
+
"completions/max_terminated_length": 1723.0,
|
| 87801 |
+
"completions/mean_length": 840.955078125,
|
| 87802 |
+
"completions/mean_terminated_length": 840.955078125,
|
| 87803 |
+
"completions/min_length": 352.0,
|
| 87804 |
+
"completions/min_terminated_length": 352.0,
|
| 87805 |
"epoch": 0.9992318852948707,
|
| 87806 |
+
"frac_reward_zero_std": 0.5625,
|
| 87807 |
+
"grad_norm": 0.10865170665818756,
|
| 87808 |
"kl": 0.0772705078125,
|
| 87809 |
"learning_rate": 1.1354516635364577e-10,
|
| 87810 |
+
"loss": 0.0192,
|
| 87811 |
+
"num_tokens": 1403271117.0,
|
| 87812 |
+
"reward": 2.0263671875,
|
| 87813 |
+
"reward_std": 0.13820995390415192,
|
| 87814 |
+
"rewards/accuracy_reward/mean": 0.048828125,
|
| 87815 |
+
"rewards/accuracy_reward/std": 0.2157193273305893,
|
| 87816 |
+
"rewards/format_reward/mean": 0.98046875,
|
| 87817 |
+
"rewards/format_reward/std": 0.1385180652141571,
|
| 87818 |
+
"rewards/tag_count_reward/mean": 0.9970703125,
|
| 87819 |
+
"rewards/tag_count_reward/std": 0.026930565014481544,
|
| 87820 |
"step": 2927
|
| 87821 |
},
|
| 87822 |
{
|
|
|
|
| 87825 |
"clip_ratio/low_mean": 0.0,
|
| 87826 |
"clip_ratio/low_min": 0.0,
|
| 87827 |
"clip_ratio/region_mean": 0.0,
|
| 87828 |
+
"completions/clipped_ratio": -6.953125,
|
| 87829 |
"completions/max_length": 2048.0,
|
| 87830 |
+
"completions/max_terminated_length": 1749.0,
|
| 87831 |
+
"completions/mean_length": 767.625,
|
| 87832 |
+
"completions/mean_terminated_length": 760.07861328125,
|
| 87833 |
+
"completions/min_length": 279.0,
|
| 87834 |
+
"completions/min_terminated_length": 279.0,
|
| 87835 |
"epoch": 0.9995732696082615,
|
| 87836 |
+
"frac_reward_zero_std": 0.53125,
|
| 87837 |
+
"grad_norm": 0.1098002562236282,
|
| 87838 |
+
"kl": 0.0780029296875,
|
| 87839 |
"learning_rate": 6.386920895384841e-11,
|
| 87840 |
+
"loss": 0.0139,
|
| 87841 |
+
"num_tokens": 1403739997.0,
|
| 87842 |
+
"reward": 2.08935546875,
|
| 87843 |
+
"reward_std": 0.16179436445236206,
|
| 87844 |
+
"rewards/accuracy_reward/mean": 0.11895161122083664,
|
| 87845 |
+
"rewards/accuracy_reward/std": 0.3240584135055542,
|
| 87846 |
"rewards/format_reward/mean": 0.98046875,
|
| 87847 |
"rewards/format_reward/std": 0.1385180652141571,
|
| 87848 |
+
"rewards/tag_count_reward/mean": 0.99365234375,
|
| 87849 |
+
"rewards/tag_count_reward/std": 0.05493048578500748,
|
| 87850 |
"step": 2928
|
| 87851 |
},
|
| 87852 |
{
|
|
|
|
| 87856 |
"clip_ratio/low_min": 0.0,
|
| 87857 |
"clip_ratio/region_mean": 0.0,
|
| 87858 |
"completions/clipped_ratio": -7.0,
|
| 87859 |
+
"completions/max_length": 1440.0,
|
| 87860 |
+
"completions/max_terminated_length": 1440.0,
|
| 87861 |
+
"completions/mean_length": 793.716796875,
|
| 87862 |
+
"completions/mean_terminated_length": 793.716796875,
|
| 87863 |
+
"completions/min_length": 171.0,
|
| 87864 |
+
"completions/min_terminated_length": 171.0,
|
| 87865 |
"epoch": 0.9999146539216524,
|
| 87866 |
+
"frac_reward_zero_std": 0.5,
|
| 87867 |
+
"grad_norm": 0.11492651391052233,
|
| 87868 |
+
"kl": 0.0770263671875,
|
| 87869 |
"learning_rate": 2.838633187729478e-11,
|
| 87870 |
+
"loss": 0.0089,
|
| 87871 |
+
"num_tokens": 1404234140.0,
|
| 87872 |
+
"reward": 2.09912109375,
|
| 87873 |
+
"reward_std": 0.16686061024665833,
|
| 87874 |
+
"rewards/accuracy_reward/mean": 0.115234375,
|
| 87875 |
+
"rewards/accuracy_reward/std": 0.3196168541908264,
|
| 87876 |
+
"rewards/format_reward/mean": 0.986328125,
|
| 87877 |
+
"rewards/format_reward/std": 0.1162383034825325,
|
| 87878 |
+
"rewards/tag_count_reward/mean": 0.99755859375,
|
| 87879 |
+
"rewards/tag_count_reward/std": 0.024608410894870758,
|
| 87880 |
"step": 2929
|
| 87881 |
},
|
| 87882 |
{
|
| 87883 |
"epoch": 0.9999146539216524,
|
| 87884 |
"step": 2929,
|
| 87885 |
"total_flos": 0.0,
|
| 87886 |
+
"train_loss": 0.00015187089497744325,
|
| 87887 |
+
"train_runtime": 1302.9313,
|
| 87888 |
+
"train_samples_per_second": 71.94,
|
| 87889 |
+
"train_steps_per_second": 2.249
|
| 87890 |
}
|
| 87891 |
],
|
| 87892 |
"logging_steps": 1,
|
| 87893 |
"max_steps": 2930,
|
| 87894 |
+
"num_input_tokens_seen": 1404234140,
|
| 87895 |
"num_train_epochs": 1,
|
| 87896 |
"save_steps": 100,
|
| 87897 |
"stateful_callbacks": {
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 8504
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df64e4c9b10422927491885e2146cbeed575ec0206ed018c170dc3bb7d57cf3c
|
| 3 |
size 8504
|