| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.3431232091690544, |
| "eval_steps": 2500, |
| "global_step": 60000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01119269340974212, |
| "grad_norm": 2.299727201461792, |
| "learning_rate": 4.981345510983763e-05, |
| "loss": 1.8848, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.02238538681948424, |
| "grad_norm": 1.9952893257141113, |
| "learning_rate": 4.962691021967526e-05, |
| "loss": 1.7595, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.03357808022922636, |
| "grad_norm": 2.1056811809539795, |
| "learning_rate": 4.944036532951289e-05, |
| "loss": 1.6994, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.04477077363896848, |
| "grad_norm": 2.0474352836608887, |
| "learning_rate": 4.925382043935053e-05, |
| "loss": 1.6629, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.0559634670487106, |
| "grad_norm": 1.9989269971847534, |
| "learning_rate": 4.906727554918816e-05, |
| "loss": 1.6236, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.0559634670487106, |
| "eval_accuracy": 0.5569847646608951, |
| "eval_loss": 2.425182342529297, |
| "eval_runtime": 707.8445, |
| "eval_samples_per_second": 91.796, |
| "eval_steps_per_second": 3.826, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.06715616045845273, |
| "grad_norm": 1.879557490348816, |
| "learning_rate": 4.888073065902579e-05, |
| "loss": 1.5991, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.07834885386819485, |
| "grad_norm": 1.9889895915985107, |
| "learning_rate": 4.869418576886342e-05, |
| "loss": 1.5751, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.08954154727793696, |
| "grad_norm": 1.910925269126892, |
| "learning_rate": 4.8507640878701055e-05, |
| "loss": 1.5587, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.10073424068767908, |
| "grad_norm": 1.9268312454223633, |
| "learning_rate": 4.8321095988538685e-05, |
| "loss": 1.546, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.1119269340974212, |
| "grad_norm": 1.8074718713760376, |
| "learning_rate": 4.8134551098376315e-05, |
| "loss": 1.5301, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.1119269340974212, |
| "eval_accuracy": 0.566450867740456, |
| "eval_loss": 2.3531110286712646, |
| "eval_runtime": 716.3757, |
| "eval_samples_per_second": 90.702, |
| "eval_steps_per_second": 3.78, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.12311962750716332, |
| "grad_norm": 1.8723756074905396, |
| "learning_rate": 4.7948006208213945e-05, |
| "loss": 1.5153, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.13431232091690545, |
| "grad_norm": 1.8938133716583252, |
| "learning_rate": 4.7761461318051575e-05, |
| "loss": 1.5051, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.14550501432664756, |
| "grad_norm": 1.8093421459197998, |
| "learning_rate": 4.757491642788921e-05, |
| "loss": 1.4922, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.1566977077363897, |
| "grad_norm": 1.8811379671096802, |
| "learning_rate": 4.738837153772684e-05, |
| "loss": 1.4841, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.1678904011461318, |
| "grad_norm": 1.8162873983383179, |
| "learning_rate": 4.720182664756447e-05, |
| "loss": 1.4664, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.1678904011461318, |
| "eval_accuracy": 0.5726688422748262, |
| "eval_loss": 2.2988929748535156, |
| "eval_runtime": 706.3059, |
| "eval_samples_per_second": 91.996, |
| "eval_steps_per_second": 3.834, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.17908309455587393, |
| "grad_norm": 1.861790418624878, |
| "learning_rate": 4.70152817574021e-05, |
| "loss": 1.4613, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.19027578796561603, |
| "grad_norm": 1.7351659536361694, |
| "learning_rate": 4.682873686723974e-05, |
| "loss": 1.4554, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.20146848137535817, |
| "grad_norm": 1.796727180480957, |
| "learning_rate": 4.664219197707737e-05, |
| "loss": 1.4469, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.2126611747851003, |
| "grad_norm": 1.751111388206482, |
| "learning_rate": 4.6455647086915e-05, |
| "loss": 1.4405, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.2238538681948424, |
| "grad_norm": 1.793644905090332, |
| "learning_rate": 4.626910219675263e-05, |
| "loss": 1.4314, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.2238538681948424, |
| "eval_accuracy": 0.5781162212828304, |
| "eval_loss": 2.257195472717285, |
| "eval_runtime": 709.7465, |
| "eval_samples_per_second": 91.55, |
| "eval_steps_per_second": 3.815, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.23504656160458454, |
| "grad_norm": 1.7030937671661377, |
| "learning_rate": 4.6082557306590264e-05, |
| "loss": 1.425, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.24623925501432664, |
| "grad_norm": 1.7245328426361084, |
| "learning_rate": 4.5896012416427894e-05, |
| "loss": 1.4206, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.25743194842406875, |
| "grad_norm": 1.7355397939682007, |
| "learning_rate": 4.570946752626552e-05, |
| "loss": 1.409, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.2686246418338109, |
| "grad_norm": 1.7283306121826172, |
| "learning_rate": 4.5522922636103154e-05, |
| "loss": 1.4086, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.279817335243553, |
| "grad_norm": 1.7133527994155884, |
| "learning_rate": 4.5336377745940784e-05, |
| "loss": 1.4042, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.279817335243553, |
| "eval_accuracy": 0.5822483255357088, |
| "eval_loss": 2.2244207859039307, |
| "eval_runtime": 708.1859, |
| "eval_samples_per_second": 91.751, |
| "eval_steps_per_second": 3.824, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.2910100286532951, |
| "grad_norm": 1.688602328300476, |
| "learning_rate": 4.514983285577842e-05, |
| "loss": 1.3952, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.3022027220630373, |
| "grad_norm": 1.6839321851730347, |
| "learning_rate": 4.4963287965616043e-05, |
| "loss": 1.3932, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.3133954154727794, |
| "grad_norm": 1.7225844860076904, |
| "learning_rate": 4.477674307545368e-05, |
| "loss": 1.3839, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.3245881088825215, |
| "grad_norm": 1.6329905986785889, |
| "learning_rate": 4.459019818529131e-05, |
| "loss": 1.3856, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.3357808022922636, |
| "grad_norm": 1.7012953758239746, |
| "learning_rate": 4.440365329512895e-05, |
| "loss": 1.3771, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.3357808022922636, |
| "eval_accuracy": 0.586269614225024, |
| "eval_loss": 2.1954798698425293, |
| "eval_runtime": 718.0126, |
| "eval_samples_per_second": 90.496, |
| "eval_steps_per_second": 3.772, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.34697349570200575, |
| "grad_norm": 1.6593496799468994, |
| "learning_rate": 4.421710840496657e-05, |
| "loss": 1.376, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.35816618911174786, |
| "grad_norm": 1.6412550210952759, |
| "learning_rate": 4.4030563514804206e-05, |
| "loss": 1.3712, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.36935888252148996, |
| "grad_norm": 1.6455302238464355, |
| "learning_rate": 4.3844018624641836e-05, |
| "loss": 1.3699, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.38055157593123207, |
| "grad_norm": 1.6210881471633911, |
| "learning_rate": 4.3657473734479466e-05, |
| "loss": 1.3618, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.3917442693409742, |
| "grad_norm": 1.6821410655975342, |
| "learning_rate": 4.3470928844317096e-05, |
| "loss": 1.3563, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.3917442693409742, |
| "eval_accuracy": 0.589598096204646, |
| "eval_loss": 2.168947219848633, |
| "eval_runtime": 707.8628, |
| "eval_samples_per_second": 91.793, |
| "eval_steps_per_second": 3.826, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.40293696275071633, |
| "grad_norm": 1.719738245010376, |
| "learning_rate": 4.3284383954154726e-05, |
| "loss": 1.3585, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.41412965616045844, |
| "grad_norm": 1.660507321357727, |
| "learning_rate": 4.309783906399236e-05, |
| "loss": 1.3502, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.4253223495702006, |
| "grad_norm": 1.7758148908615112, |
| "learning_rate": 4.291129417382999e-05, |
| "loss": 1.3459, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.4365150429799427, |
| "grad_norm": 1.6665699481964111, |
| "learning_rate": 4.272474928366762e-05, |
| "loss": 1.3435, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.4477077363896848, |
| "grad_norm": 1.6364027261734009, |
| "learning_rate": 4.253820439350525e-05, |
| "loss": 1.3401, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.4477077363896848, |
| "eval_accuracy": 0.5922608205511055, |
| "eval_loss": 2.1485562324523926, |
| "eval_runtime": 704.3891, |
| "eval_samples_per_second": 92.246, |
| "eval_steps_per_second": 3.844, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.4589004297994269, |
| "grad_norm": 9.470758438110352, |
| "learning_rate": 4.426374462750716e-05, |
| "loss": 8.0235, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.4700931232091691, |
| "grad_norm": 9.91232967376709, |
| "learning_rate": 4.412383595988539e-05, |
| "loss": 7.9603, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.4812858166189112, |
| "grad_norm": 9.734143257141113, |
| "learning_rate": 4.398392729226361e-05, |
| "loss": 7.9793, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.4924785100286533, |
| "grad_norm": 9.574400901794434, |
| "learning_rate": 4.3844018624641836e-05, |
| "loss": 7.9731, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.5036712034383954, |
| "grad_norm": 10.017444610595703, |
| "learning_rate": 4.370410995702006e-05, |
| "loss": 7.9335, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.5036712034383954, |
| "eval_accuracy": 0.5952892349509148, |
| "eval_loss": 2.1270551681518555, |
| "eval_runtime": 525.2336, |
| "eval_samples_per_second": 123.711, |
| "eval_steps_per_second": 2.578, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.5148638968481375, |
| "grad_norm": 9.66054916381836, |
| "learning_rate": 4.356420128939828e-05, |
| "loss": 7.9224, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.5260565902578797, |
| "grad_norm": 9.483991622924805, |
| "learning_rate": 4.342429262177651e-05, |
| "loss": 7.9197, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.5372492836676218, |
| "grad_norm": 9.803547859191895, |
| "learning_rate": 4.3284383954154726e-05, |
| "loss": 7.8932, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.5484419770773639, |
| "grad_norm": 9.597293853759766, |
| "learning_rate": 4.3144475286532955e-05, |
| "loss": 7.8622, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.559634670487106, |
| "grad_norm": 9.679096221923828, |
| "learning_rate": 4.300456661891118e-05, |
| "loss": 7.8644, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.559634670487106, |
| "eval_accuracy": 0.5979848217689566, |
| "eval_loss": 2.106226682662964, |
| "eval_runtime": 528.4389, |
| "eval_samples_per_second": 122.96, |
| "eval_steps_per_second": 2.562, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.5708273638968482, |
| "grad_norm": 9.878997802734375, |
| "learning_rate": 4.28646579512894e-05, |
| "loss": 7.8388, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.5820200573065902, |
| "grad_norm": 9.320840835571289, |
| "learning_rate": 4.272474928366762e-05, |
| "loss": 7.8199, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.5932127507163324, |
| "grad_norm": 9.581457138061523, |
| "learning_rate": 4.2584840616045845e-05, |
| "loss": 7.8194, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.6044054441260746, |
| "grad_norm": 9.690735816955566, |
| "learning_rate": 4.2444931948424074e-05, |
| "loss": 7.8147, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.6155981375358166, |
| "grad_norm": 9.55455207824707, |
| "learning_rate": 4.230502328080229e-05, |
| "loss": 7.7927, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.6155981375358166, |
| "eval_accuracy": 0.599545230267029, |
| "eval_loss": 2.09478497505188, |
| "eval_runtime": 531.729, |
| "eval_samples_per_second": 122.199, |
| "eval_steps_per_second": 2.546, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.6267908309455588, |
| "grad_norm": 9.352036476135254, |
| "learning_rate": 4.216511461318052e-05, |
| "loss": 7.7711, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.6379835243553008, |
| "grad_norm": 9.413168907165527, |
| "learning_rate": 4.202520594555874e-05, |
| "loss": 7.7733, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.649176217765043, |
| "grad_norm": 9.420402526855469, |
| "learning_rate": 4.1885297277936964e-05, |
| "loss": 7.74, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.6603689111747851, |
| "grad_norm": 9.579030990600586, |
| "learning_rate": 4.1745388610315186e-05, |
| "loss": 7.7237, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.6715616045845272, |
| "grad_norm": 12.816407203674316, |
| "learning_rate": 4.160547994269341e-05, |
| "loss": 7.7401, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.6715616045845272, |
| "eval_accuracy": 0.6018761556694041, |
| "eval_loss": 2.079362630844116, |
| "eval_runtime": 531.1941, |
| "eval_samples_per_second": 122.323, |
| "eval_steps_per_second": 2.549, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.6827542979942693, |
| "grad_norm": 9.477621078491211, |
| "learning_rate": 4.146557127507164e-05, |
| "loss": 7.717, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.6939469914040115, |
| "grad_norm": 9.8326416015625, |
| "learning_rate": 4.132566260744986e-05, |
| "loss": 7.7148, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.7051396848137536, |
| "grad_norm": 9.668205261230469, |
| "learning_rate": 4.118575393982808e-05, |
| "loss": 7.6845, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.7163323782234957, |
| "grad_norm": 9.344961166381836, |
| "learning_rate": 4.1045845272206305e-05, |
| "loss": 7.673, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.7275250716332379, |
| "grad_norm": 12.754666328430176, |
| "learning_rate": 4.090593660458453e-05, |
| "loss": 7.646, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.7275250716332379, |
| "eval_accuracy": 0.6036969981017439, |
| "eval_loss": 2.0638949871063232, |
| "eval_runtime": 534.6839, |
| "eval_samples_per_second": 121.524, |
| "eval_steps_per_second": 2.532, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.7387177650429799, |
| "grad_norm": 9.269234657287598, |
| "learning_rate": 4.076602793696275e-05, |
| "loss": 7.6452, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.7499104584527221, |
| "grad_norm": 9.59334659576416, |
| "learning_rate": 4.062611926934098e-05, |
| "loss": 7.6369, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.7611031518624641, |
| "grad_norm": 9.979016304016113, |
| "learning_rate": 4.04862106017192e-05, |
| "loss": 7.6306, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.7722958452722063, |
| "grad_norm": 9.395634651184082, |
| "learning_rate": 4.0346301934097424e-05, |
| "loss": 7.6083, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.7834885386819485, |
| "grad_norm": 9.377208709716797, |
| "learning_rate": 4.0206393266475646e-05, |
| "loss": 7.6113, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.7834885386819485, |
| "eval_accuracy": 0.6060486530458662, |
| "eval_loss": 2.046678066253662, |
| "eval_runtime": 527.2553, |
| "eval_samples_per_second": 123.236, |
| "eval_steps_per_second": 2.568, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.7946812320916905, |
| "grad_norm": 9.33324146270752, |
| "learning_rate": 4.006648459885387e-05, |
| "loss": 7.596, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.8058739255014327, |
| "grad_norm": 10.012749671936035, |
| "learning_rate": 3.992657593123209e-05, |
| "loss": 7.5944, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.8170666189111748, |
| "grad_norm": 9.17791748046875, |
| "learning_rate": 3.9786667263610314e-05, |
| "loss": 7.5724, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.8282593123209169, |
| "grad_norm": 9.714068412780762, |
| "learning_rate": 3.964675859598854e-05, |
| "loss": 7.5716, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.839452005730659, |
| "grad_norm": 9.122146606445312, |
| "learning_rate": 3.9506849928366765e-05, |
| "loss": 7.5428, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.839452005730659, |
| "eval_accuracy": 0.6080310471813272, |
| "eval_loss": 2.0341005325317383, |
| "eval_runtime": 534.9624, |
| "eval_samples_per_second": 121.461, |
| "eval_steps_per_second": 2.531, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.8506446991404012, |
| "grad_norm": 8.890284538269043, |
| "learning_rate": 3.936694126074499e-05, |
| "loss": 7.5108, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.8618373925501432, |
| "grad_norm": 9.258638381958008, |
| "learning_rate": 3.922703259312321e-05, |
| "loss": 7.5283, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.8730300859598854, |
| "grad_norm": 9.524474143981934, |
| "learning_rate": 3.908712392550143e-05, |
| "loss": 7.5168, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.8842227793696275, |
| "grad_norm": 9.608149528503418, |
| "learning_rate": 3.894721525787966e-05, |
| "loss": 7.5206, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.8954154727793696, |
| "grad_norm": 9.405288696289062, |
| "learning_rate": 3.880730659025788e-05, |
| "loss": 7.5039, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.8954154727793696, |
| "eval_accuracy": 0.6094788673634718, |
| "eval_loss": 2.0253567695617676, |
| "eval_runtime": 535.0948, |
| "eval_samples_per_second": 121.431, |
| "eval_steps_per_second": 2.53, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.9066081661891118, |
| "grad_norm": 8.706295013427734, |
| "learning_rate": 3.8667397922636107e-05, |
| "loss": 7.4819, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.9178008595988538, |
| "grad_norm": 9.542219161987305, |
| "learning_rate": 3.852748925501433e-05, |
| "loss": 7.4888, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.928993553008596, |
| "grad_norm": 9.111319541931152, |
| "learning_rate": 3.838758058739255e-05, |
| "loss": 7.4749, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.9401862464183381, |
| "grad_norm": 9.335123062133789, |
| "learning_rate": 3.824767191977078e-05, |
| "loss": 7.4591, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.9513789398280802, |
| "grad_norm": 9.537328720092773, |
| "learning_rate": 3.8107763252148996e-05, |
| "loss": 7.4533, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.9513789398280802, |
| "eval_accuracy": 0.6107361281876699, |
| "eval_loss": 2.0133583545684814, |
| "eval_runtime": 530.2968, |
| "eval_samples_per_second": 122.529, |
| "eval_steps_per_second": 2.553, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.9625716332378224, |
| "grad_norm": 9.227987289428711, |
| "learning_rate": 3.7967854584527225e-05, |
| "loss": 7.4506, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.9737643266475645, |
| "grad_norm": 9.076460838317871, |
| "learning_rate": 3.782794591690544e-05, |
| "loss": 7.4557, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.9849570200573066, |
| "grad_norm": 9.841446876525879, |
| "learning_rate": 3.768803724928367e-05, |
| "loss": 7.4319, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.9961497134670487, |
| "grad_norm": 9.169388771057129, |
| "learning_rate": 3.754812858166189e-05, |
| "loss": 7.4453, |
| "step": 44500 |
| }, |
| { |
| "epoch": 1.0073424068767909, |
| "grad_norm": 9.200368881225586, |
| "learning_rate": 3.7408219914040115e-05, |
| "loss": 7.4149, |
| "step": 45000 |
| }, |
| { |
| "epoch": 1.0073424068767909, |
| "eval_accuracy": 0.6120696404224526, |
| "eval_loss": 2.0035457611083984, |
| "eval_runtime": 532.3092, |
| "eval_samples_per_second": 122.066, |
| "eval_steps_per_second": 2.544, |
| "step": 45000 |
| }, |
| { |
| "epoch": 1.018535100286533, |
| "grad_norm": 9.189336776733398, |
| "learning_rate": 3.7268311246418344e-05, |
| "loss": 7.3981, |
| "step": 45500 |
| }, |
| { |
| "epoch": 1.029727793696275, |
| "grad_norm": 9.504659652709961, |
| "learning_rate": 3.712840257879656e-05, |
| "loss": 7.4031, |
| "step": 46000 |
| }, |
| { |
| "epoch": 1.0409204871060171, |
| "grad_norm": 9.516868591308594, |
| "learning_rate": 3.698849391117479e-05, |
| "loss": 7.3822, |
| "step": 46500 |
| }, |
| { |
| "epoch": 1.0521131805157593, |
| "grad_norm": 9.417741775512695, |
| "learning_rate": 3.6848585243553005e-05, |
| "loss": 7.3887, |
| "step": 47000 |
| }, |
| { |
| "epoch": 1.0633058739255015, |
| "grad_norm": 9.202630043029785, |
| "learning_rate": 3.6708676575931234e-05, |
| "loss": 7.379, |
| "step": 47500 |
| }, |
| { |
| "epoch": 1.0633058739255015, |
| "eval_accuracy": 0.6134326919026999, |
| "eval_loss": 1.9946683645248413, |
| "eval_runtime": 529.4253, |
| "eval_samples_per_second": 122.731, |
| "eval_steps_per_second": 2.557, |
| "step": 47500 |
| }, |
| { |
| "epoch": 1.0744985673352436, |
| "grad_norm": 9.18812084197998, |
| "learning_rate": 3.6568767908309456e-05, |
| "loss": 7.3648, |
| "step": 48000 |
| }, |
| { |
| "epoch": 1.0856912607449858, |
| "grad_norm": 9.317421913146973, |
| "learning_rate": 3.642885924068768e-05, |
| "loss": 7.3581, |
| "step": 48500 |
| }, |
| { |
| "epoch": 1.0968839541547277, |
| "grad_norm": 9.30117130279541, |
| "learning_rate": 3.628895057306591e-05, |
| "loss": 7.3242, |
| "step": 49000 |
| }, |
| { |
| "epoch": 1.1080766475644699, |
| "grad_norm": 9.295071601867676, |
| "learning_rate": 3.6149041905444124e-05, |
| "loss": 7.3343, |
| "step": 49500 |
| }, |
| { |
| "epoch": 1.119269340974212, |
| "grad_norm": 9.372967720031738, |
| "learning_rate": 3.600913323782235e-05, |
| "loss": 7.324, |
| "step": 50000 |
| }, |
| { |
| "epoch": 1.119269340974212, |
| "eval_accuracy": 0.6151487162989436, |
| "eval_loss": 1.9853588342666626, |
| "eval_runtime": 528.6238, |
| "eval_samples_per_second": 122.917, |
| "eval_steps_per_second": 2.561, |
| "step": 50000 |
| }, |
| { |
| "epoch": 1.1304620343839542, |
| "grad_norm": 10.693807601928711, |
| "learning_rate": 3.5869224570200575e-05, |
| "loss": 7.3164, |
| "step": 50500 |
| }, |
| { |
| "epoch": 1.1416547277936964, |
| "grad_norm": 9.047393798828125, |
| "learning_rate": 3.57293159025788e-05, |
| "loss": 7.3028, |
| "step": 51000 |
| }, |
| { |
| "epoch": 1.1528474212034383, |
| "grad_norm": 9.055428504943848, |
| "learning_rate": 3.558940723495702e-05, |
| "loss": 7.316, |
| "step": 51500 |
| }, |
| { |
| "epoch": 1.1640401146131805, |
| "grad_norm": 8.821599960327148, |
| "learning_rate": 3.544949856733524e-05, |
| "loss": 7.2759, |
| "step": 52000 |
| }, |
| { |
| "epoch": 1.1752328080229226, |
| "grad_norm": 8.971498489379883, |
| "learning_rate": 3.530958989971347e-05, |
| "loss": 7.3041, |
| "step": 52500 |
| }, |
| { |
| "epoch": 1.1752328080229226, |
| "eval_accuracy": 0.6162336395081003, |
| "eval_loss": 1.9736484289169312, |
| "eval_runtime": 526.2473, |
| "eval_samples_per_second": 123.472, |
| "eval_steps_per_second": 2.573, |
| "step": 52500 |
| }, |
| { |
| "epoch": 1.1864255014326648, |
| "grad_norm": 9.30490779876709, |
| "learning_rate": 3.5169681232091694e-05, |
| "loss": 7.2966, |
| "step": 53000 |
| }, |
| { |
| "epoch": 1.197618194842407, |
| "grad_norm": 9.367337226867676, |
| "learning_rate": 3.5029772564469917e-05, |
| "loss": 7.2862, |
| "step": 53500 |
| }, |
| { |
| "epoch": 1.2088108882521489, |
| "grad_norm": 9.002731323242188, |
| "learning_rate": 3.488986389684814e-05, |
| "loss": 7.2858, |
| "step": 54000 |
| }, |
| { |
| "epoch": 1.220003581661891, |
| "grad_norm": 9.070691108703613, |
| "learning_rate": 3.474995522922636e-05, |
| "loss": 7.2692, |
| "step": 54500 |
| }, |
| { |
| "epoch": 1.2311962750716332, |
| "grad_norm": 9.154426574707031, |
| "learning_rate": 3.4610046561604584e-05, |
| "loss": 7.262, |
| "step": 55000 |
| }, |
| { |
| "epoch": 1.2311962750716332, |
| "eval_accuracy": 0.6174860367511424, |
| "eval_loss": 1.9672149419784546, |
| "eval_runtime": 529.8804, |
| "eval_samples_per_second": 122.626, |
| "eval_steps_per_second": 2.555, |
| "step": 55000 |
| }, |
| { |
| "epoch": 1.2423889684813754, |
| "grad_norm": 9.364106178283691, |
| "learning_rate": 3.447013789398281e-05, |
| "loss": 7.2489, |
| "step": 55500 |
| }, |
| { |
| "epoch": 1.2535816618911175, |
| "grad_norm": 9.267243385314941, |
| "learning_rate": 3.4330229226361035e-05, |
| "loss": 7.2664, |
| "step": 56000 |
| }, |
| { |
| "epoch": 1.2647743553008595, |
| "grad_norm": 9.162137031555176, |
| "learning_rate": 3.419032055873926e-05, |
| "loss": 7.2475, |
| "step": 56500 |
| }, |
| { |
| "epoch": 1.2759670487106018, |
| "grad_norm": 9.292202949523926, |
| "learning_rate": 3.405041189111748e-05, |
| "loss": 7.2357, |
| "step": 57000 |
| }, |
| { |
| "epoch": 1.2871597421203438, |
| "grad_norm": 9.280839920043945, |
| "learning_rate": 3.39105032234957e-05, |
| "loss": 7.2169, |
| "step": 57500 |
| }, |
| { |
| "epoch": 1.2871597421203438, |
| "eval_accuracy": 0.6184868881174969, |
| "eval_loss": 1.9653985500335693, |
| "eval_runtime": 531.1309, |
| "eval_samples_per_second": 122.337, |
| "eval_steps_per_second": 2.549, |
| "step": 57500 |
| }, |
| { |
| "epoch": 1.298352435530086, |
| "grad_norm": 8.75936222076416, |
| "learning_rate": 3.3770594555873925e-05, |
| "loss": 7.2052, |
| "step": 58000 |
| }, |
| { |
| "epoch": 1.309545128939828, |
| "grad_norm": 8.891804695129395, |
| "learning_rate": 3.363068588825215e-05, |
| "loss": 7.2258, |
| "step": 58500 |
| }, |
| { |
| "epoch": 1.3207378223495703, |
| "grad_norm": 8.931051254272461, |
| "learning_rate": 3.349077722063038e-05, |
| "loss": 7.1899, |
| "step": 59000 |
| }, |
| { |
| "epoch": 1.3319305157593124, |
| "grad_norm": 9.616579055786133, |
| "learning_rate": 3.33508685530086e-05, |
| "loss": 7.2068, |
| "step": 59500 |
| }, |
| { |
| "epoch": 1.3431232091690544, |
| "grad_norm": 8.981892585754395, |
| "learning_rate": 3.321095988538682e-05, |
| "loss": 7.2084, |
| "step": 60000 |
| }, |
| { |
| "epoch": 1.3431232091690544, |
| "eval_accuracy": 0.6190439375486008, |
| "eval_loss": 1.9544332027435303, |
| "eval_runtime": 528.8193, |
| "eval_samples_per_second": 122.872, |
| "eval_steps_per_second": 2.56, |
| "step": 60000 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 178688, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 2500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.2799139827941786e+18, |
| "train_batch_size": 24, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|