diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,252028 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.911154371867623, + "eval_steps": 200000, + "global_step": 360000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.69921875, + "learning_rate": 5e-05, + "loss": 0.979, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2.125, + "learning_rate": 0.0001, + "loss": 0.8256, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 2.59375, + "learning_rate": 0.00015, + "loss": 0.6018, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.875, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 3.375, + "learning_rate": 0.00025, + "loss": 0.5708, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 2.484375, + "learning_rate": 0.0003, + "loss": 0.4826, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 0.00035, + "loss": 0.4008, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 1.5390625, + "learning_rate": 0.0004, + "loss": 0.47, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 2.375, + "learning_rate": 0.00045000000000000004, + "loss": 0.4558, + "step": 90 + }, + { + "epoch": 0.0, + "grad_norm": 3.78125, + "learning_rate": 0.0005, + "loss": 0.4735, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 4.5, + "learning_rate": 0.0004999999997647635, + "loss": 0.4286, + "step": 110 + }, + { + "epoch": 0.0, + "grad_norm": 2.5625, + "learning_rate": 0.000499999999059054, + "loss": 0.4376, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 3.609375, + "learning_rate": 0.0004999999978828715, + "loss": 0.4174, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 1.734375, + "learning_rate": 0.000499999996236216, + "loss": 0.4391, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 2.96875, + "learning_rate": 0.0004999999941190875, + "loss": 0.4171, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 4.9375, + "learning_rate": 0.000499999991531486, + "loss": 0.3757, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.265625, + "learning_rate": 0.0004999999884734115, + "loss": 0.3802, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 1.640625, + "learning_rate": 0.000499999984944864, + "loss": 0.4809, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.6171875, + "learning_rate": 0.0004999999809458436, + "loss": 0.3962, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 2.140625, + "learning_rate": 0.0004999999764763503, + "loss": 0.399, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 1.9453125, + "learning_rate": 0.0004999999715363839, + "loss": 0.3718, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 1.7421875, + "learning_rate": 0.0004999999661259445, + "loss": 0.3845, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 2.21875, + "learning_rate": 0.0004999999602450324, + "loss": 0.3646, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 1.640625, + "learning_rate": 0.0004999999538936471, + "loss": 0.3809, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 0.984375, + "learning_rate": 0.000499999947071789, + "loss": 0.3322, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 2.34375, + "learning_rate": 0.0004999999397794581, + "loss": 0.319, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 1.078125, + "learning_rate": 0.0004999999320166543, + "loss": 0.4018, + "step": 270 + }, + { + "epoch": 0.01, + "grad_norm": 1.1328125, + "learning_rate": 0.0004999999237833775, + "loss": 0.3773, + "step": 280 + }, + { + "epoch": 0.01, + "grad_norm": 1.640625, + "learning_rate": 0.0004999999150796278, + "loss": 0.3919, + "step": 290 + }, + { + "epoch": 0.01, + "grad_norm": 1.953125, + "learning_rate": 0.0004999999059054055, + "loss": 0.377, + "step": 300 + }, + { + "epoch": 0.01, + "grad_norm": 0.68359375, + "learning_rate": 0.0004999998962607102, + "loss": 0.3864, + "step": 310 + }, + { + "epoch": 0.01, + "grad_norm": 1.5546875, + "learning_rate": 0.000499999886145542, + "loss": 0.4597, + "step": 320 + }, + { + "epoch": 0.01, + "grad_norm": 1.515625, + "learning_rate": 0.0004999998755599013, + "loss": 0.4042, + "step": 330 + }, + { + "epoch": 0.01, + "grad_norm": 4.53125, + "learning_rate": 0.0004999998645037876, + "loss": 0.4348, + "step": 340 + }, + { + "epoch": 0.01, + "grad_norm": 4.15625, + "learning_rate": 0.0004999998529772012, + "loss": 0.4317, + "step": 350 + }, + { + "epoch": 0.01, + "grad_norm": 1.890625, + "learning_rate": 0.0004999998409801421, + "loss": 0.5081, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 0.94921875, + "learning_rate": 0.0004999998285126103, + "loss": 0.3064, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 2.90625, + "learning_rate": 0.0004999998155746057, + "loss": 0.3847, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 2.5625, + "learning_rate": 0.0004999998021661287, + "loss": 0.4232, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 1.28125, + "learning_rate": 0.0004999997882871788, + "loss": 0.3319, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 1.5859375, + "learning_rate": 0.0004999997739377564, + "loss": 0.4151, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 1.171875, + "learning_rate": 0.0004999997591178614, + "loss": 0.3973, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 1.1640625, + "learning_rate": 0.000499999743827494, + "loss": 0.3519, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 0.93359375, + "learning_rate": 0.0004999997280666539, + "loss": 0.3441, + "step": 440 + }, + { + "epoch": 0.02, + "grad_norm": 2.90625, + "learning_rate": 0.0004999997118353414, + "loss": 0.3376, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 1.734375, + "learning_rate": 0.0004999996951335564, + "loss": 0.318, + "step": 460 + }, + { + "epoch": 0.02, + "grad_norm": 1.875, + "learning_rate": 0.000499999677961299, + "loss": 0.3783, + "step": 470 + }, + { + "epoch": 0.02, + "grad_norm": 0.94921875, + "learning_rate": 0.0004999996603185692, + "loss": 0.2997, + "step": 480 + }, + { + "epoch": 0.02, + "grad_norm": 1.21875, + "learning_rate": 0.000499999642205367, + "loss": 0.3152, + "step": 490 + }, + { + "epoch": 0.02, + "grad_norm": 1.8359375, + "learning_rate": 0.0004999996236216925, + "loss": 0.4532, + "step": 500 + }, + { + "epoch": 0.02, + "grad_norm": 1.078125, + "learning_rate": 0.0004999996045675458, + "loss": 0.3603, + "step": 510 + }, + { + "epoch": 0.02, + "grad_norm": 0.85546875, + "learning_rate": 0.0004999995850429266, + "loss": 0.3464, + "step": 520 + }, + { + "epoch": 0.02, + "grad_norm": 1.171875, + "learning_rate": 0.0004999995650478353, + "loss": 0.4385, + "step": 530 + }, + { + "epoch": 0.02, + "grad_norm": 1.5078125, + "learning_rate": 0.000499999544582272, + "loss": 0.3639, + "step": 540 + }, + { + "epoch": 0.02, + "grad_norm": 2.875, + "learning_rate": 0.0004999995236462364, + "loss": 0.3404, + "step": 550 + }, + { + "epoch": 0.02, + "grad_norm": 0.6796875, + "learning_rate": 0.0004999995022397286, + "loss": 0.4101, + "step": 560 + }, + { + "epoch": 0.02, + "grad_norm": 1.21875, + "learning_rate": 0.0004999994803627489, + "loss": 0.37, + "step": 570 + }, + { + "epoch": 0.02, + "grad_norm": 0.86328125, + "learning_rate": 0.000499999458015297, + "loss": 0.4176, + "step": 580 + }, + { + "epoch": 0.02, + "grad_norm": 1.203125, + "learning_rate": 0.0004999994351973733, + "loss": 0.4137, + "step": 590 + }, + { + "epoch": 0.02, + "grad_norm": 1.671875, + "learning_rate": 0.0004999994119089776, + "loss": 0.332, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 0.96875, + "learning_rate": 0.0004999993881501099, + "loss": 0.3805, + "step": 610 + }, + { + "epoch": 0.03, + "grad_norm": 1.5625, + "learning_rate": 0.0004999993639207705, + "loss": 0.4088, + "step": 620 + }, + { + "epoch": 0.03, + "grad_norm": 1.2109375, + "learning_rate": 0.0004999993392209592, + "loss": 0.3283, + "step": 630 + }, + { + "epoch": 0.03, + "grad_norm": 1.421875, + "learning_rate": 0.0004999993140506762, + "loss": 0.3476, + "step": 640 + }, + { + "epoch": 0.03, + "grad_norm": 1.7109375, + "learning_rate": 0.0004999992884099215, + "loss": 0.3169, + "step": 650 + }, + { + "epoch": 0.03, + "grad_norm": 1.046875, + "learning_rate": 0.000499999262298695, + "loss": 0.359, + "step": 660 + }, + { + "epoch": 0.03, + "grad_norm": 0.78515625, + "learning_rate": 0.000499999235716997, + "loss": 0.363, + "step": 670 + }, + { + "epoch": 0.03, + "grad_norm": 0.58203125, + "learning_rate": 0.0004999992086648274, + "loss": 0.302, + "step": 680 + }, + { + "epoch": 0.03, + "grad_norm": 0.890625, + "learning_rate": 0.0004999991811421863, + "loss": 0.3503, + "step": 690 + }, + { + "epoch": 0.03, + "grad_norm": 1.21875, + "learning_rate": 0.0004999991531490737, + "loss": 0.3585, + "step": 700 + }, + { + "epoch": 0.03, + "grad_norm": 2.8125, + "learning_rate": 0.0004999991246854898, + "loss": 0.3214, + "step": 710 + }, + { + "epoch": 0.03, + "grad_norm": 2.1875, + "learning_rate": 0.0004999990957514344, + "loss": 0.3568, + "step": 720 + }, + { + "epoch": 0.03, + "grad_norm": 0.6640625, + "learning_rate": 0.0004999990663469079, + "loss": 0.2573, + "step": 730 + }, + { + "epoch": 0.03, + "grad_norm": 1.2109375, + "learning_rate": 0.00049999903647191, + "loss": 0.3604, + "step": 740 + }, + { + "epoch": 0.03, + "grad_norm": 0.8125, + "learning_rate": 0.0004999990061264409, + "loss": 0.3285, + "step": 750 + }, + { + "epoch": 0.03, + "grad_norm": 1.203125, + "learning_rate": 0.0004999989753105007, + "loss": 0.2736, + "step": 760 + }, + { + "epoch": 0.03, + "grad_norm": 0.70703125, + "learning_rate": 0.0004999989440240896, + "loss": 0.294, + "step": 770 + }, + { + "epoch": 0.03, + "grad_norm": 1.1484375, + "learning_rate": 0.0004999989122672072, + "loss": 0.3478, + "step": 780 + }, + { + "epoch": 0.03, + "grad_norm": 1.0078125, + "learning_rate": 0.0004999988800398539, + "loss": 0.3922, + "step": 790 + }, + { + "epoch": 0.03, + "grad_norm": 0.72265625, + "learning_rate": 0.0004999988473420299, + "loss": 0.3029, + "step": 800 + }, + { + "epoch": 0.03, + "grad_norm": 1.3046875, + "learning_rate": 0.0004999988141737349, + "loss": 0.3642, + "step": 810 + }, + { + "epoch": 0.03, + "grad_norm": 1.6015625, + "learning_rate": 0.0004999987805349692, + "loss": 0.315, + "step": 820 + }, + { + "epoch": 0.03, + "grad_norm": 0.44140625, + "learning_rate": 0.0004999987464257327, + "loss": 0.3264, + "step": 830 + }, + { + "epoch": 0.03, + "grad_norm": 2.359375, + "learning_rate": 0.0004999987118460257, + "loss": 0.3657, + "step": 840 + }, + { + "epoch": 0.04, + "grad_norm": 1.1875, + "learning_rate": 0.000499998676795848, + "loss": 0.3896, + "step": 850 + }, + { + "epoch": 0.04, + "grad_norm": 0.8359375, + "learning_rate": 0.0004999986412751998, + "loss": 0.3145, + "step": 860 + }, + { + "epoch": 0.04, + "grad_norm": 0.8203125, + "learning_rate": 0.0004999986052840812, + "loss": 0.4001, + "step": 870 + }, + { + "epoch": 0.04, + "grad_norm": 1.1796875, + "learning_rate": 0.0004999985688224921, + "loss": 0.3462, + "step": 880 + }, + { + "epoch": 0.04, + "grad_norm": 0.828125, + "learning_rate": 0.0004999985318904328, + "loss": 0.3135, + "step": 890 + }, + { + "epoch": 0.04, + "grad_norm": 0.8671875, + "learning_rate": 0.0004999984944879034, + "loss": 0.3184, + "step": 900 + }, + { + "epoch": 0.04, + "grad_norm": 0.322265625, + "learning_rate": 0.0004999984566149036, + "loss": 0.3851, + "step": 910 + }, + { + "epoch": 0.04, + "grad_norm": 6.53125, + "learning_rate": 0.0004999984182714339, + "loss": 0.3676, + "step": 920 + }, + { + "epoch": 0.04, + "grad_norm": 2.390625, + "learning_rate": 0.0004999983794574939, + "loss": 0.3173, + "step": 930 + }, + { + "epoch": 0.04, + "grad_norm": 1.7421875, + "learning_rate": 0.0004999983401730842, + "loss": 0.3962, + "step": 940 + }, + { + "epoch": 0.04, + "grad_norm": 0.79296875, + "learning_rate": 0.0004999983004182045, + "loss": 0.3593, + "step": 950 + }, + { + "epoch": 0.04, + "grad_norm": 0.474609375, + "learning_rate": 0.0004999982601928551, + "loss": 0.3684, + "step": 960 + }, + { + "epoch": 0.04, + "grad_norm": 1.375, + "learning_rate": 0.0004999982194970359, + "loss": 0.3056, + "step": 970 + }, + { + "epoch": 0.04, + "grad_norm": 1.75, + "learning_rate": 0.000499998178330747, + "loss": 0.3502, + "step": 980 + }, + { + "epoch": 0.04, + "grad_norm": 2.75, + "learning_rate": 0.0004999981366939886, + "loss": 0.2836, + "step": 990 + }, + { + "epoch": 0.04, + "grad_norm": 0.76953125, + "learning_rate": 0.0004999980945867606, + "loss": 0.3602, + "step": 1000 + }, + { + "epoch": 0.04, + "grad_norm": 1.671875, + "learning_rate": 0.0004999980520090634, + "loss": 0.2913, + "step": 1010 + }, + { + "epoch": 0.04, + "grad_norm": 1.1953125, + "learning_rate": 0.0004999980089608967, + "loss": 0.2992, + "step": 1020 + }, + { + "epoch": 0.04, + "grad_norm": 1.34375, + "learning_rate": 0.0004999979654422607, + "loss": 0.2966, + "step": 1030 + }, + { + "epoch": 0.04, + "grad_norm": 0.91796875, + "learning_rate": 0.0004999979214531556, + "loss": 0.3145, + "step": 1040 + }, + { + "epoch": 0.04, + "grad_norm": 0.72265625, + "learning_rate": 0.0004999978769935815, + "loss": 0.3271, + "step": 1050 + }, + { + "epoch": 0.04, + "grad_norm": 2.171875, + "learning_rate": 0.0004999978320635383, + "loss": 0.3331, + "step": 1060 + }, + { + "epoch": 0.04, + "grad_norm": 0.75, + "learning_rate": 0.0004999977866630261, + "loss": 0.2827, + "step": 1070 + }, + { + "epoch": 0.04, + "grad_norm": 1.4921875, + "learning_rate": 0.0004999977407920452, + "loss": 0.3029, + "step": 1080 + }, + { + "epoch": 0.05, + "grad_norm": 1.0234375, + "learning_rate": 0.0004999976944505954, + "loss": 0.371, + "step": 1090 + }, + { + "epoch": 0.05, + "grad_norm": 0.89453125, + "learning_rate": 0.000499997647638677, + "loss": 0.3152, + "step": 1100 + }, + { + "epoch": 0.05, + "grad_norm": 1.0546875, + "learning_rate": 0.0004999976003562901, + "loss": 0.3901, + "step": 1110 + }, + { + "epoch": 0.05, + "grad_norm": 1.6015625, + "learning_rate": 0.0004999975526034347, + "loss": 0.4085, + "step": 1120 + }, + { + "epoch": 0.05, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999975043801108, + "loss": 0.2502, + "step": 1130 + }, + { + "epoch": 0.05, + "grad_norm": 0.68359375, + "learning_rate": 0.0004999974556863187, + "loss": 0.3639, + "step": 1140 + }, + { + "epoch": 0.05, + "grad_norm": 0.6875, + "learning_rate": 0.0004999974065220583, + "loss": 0.312, + "step": 1150 + }, + { + "epoch": 0.05, + "grad_norm": 3.75, + "learning_rate": 0.0004999973568873299, + "loss": 0.331, + "step": 1160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7265625, + "learning_rate": 0.0004999973067821333, + "loss": 0.3724, + "step": 1170 + }, + { + "epoch": 0.05, + "grad_norm": 0.578125, + "learning_rate": 0.0004999972562064689, + "loss": 0.3137, + "step": 1180 + }, + { + "epoch": 0.05, + "grad_norm": 1.2109375, + "learning_rate": 0.0004999972051603366, + "loss": 0.3284, + "step": 1190 + }, + { + "epoch": 0.05, + "grad_norm": 1.515625, + "learning_rate": 0.0004999971536437366, + "loss": 0.3006, + "step": 1200 + }, + { + "epoch": 0.05, + "grad_norm": 0.50390625, + "learning_rate": 0.0004999971016566689, + "loss": 0.3346, + "step": 1210 + }, + { + "epoch": 0.05, + "grad_norm": 0.82421875, + "learning_rate": 0.0004999970491991338, + "loss": 0.3051, + "step": 1220 + }, + { + "epoch": 0.05, + "grad_norm": 1.265625, + "learning_rate": 0.0004999969962711311, + "loss": 0.3407, + "step": 1230 + }, + { + "epoch": 0.05, + "grad_norm": 1.203125, + "learning_rate": 0.0004999969428726611, + "loss": 0.3765, + "step": 1240 + }, + { + "epoch": 0.05, + "grad_norm": 0.578125, + "learning_rate": 0.0004999968890037238, + "loss": 0.2441, + "step": 1250 + }, + { + "epoch": 0.05, + "grad_norm": 1.421875, + "learning_rate": 0.0004999968346643194, + "loss": 0.3162, + "step": 1260 + }, + { + "epoch": 0.05, + "grad_norm": 1.1953125, + "learning_rate": 0.0004999967798544479, + "loss": 0.3095, + "step": 1270 + }, + { + "epoch": 0.05, + "grad_norm": 0.4609375, + "learning_rate": 0.0004999967245741095, + "loss": 0.3518, + "step": 1280 + }, + { + "epoch": 0.05, + "grad_norm": 0.921875, + "learning_rate": 0.0004999966688233043, + "loss": 0.3693, + "step": 1290 + }, + { + "epoch": 0.05, + "grad_norm": 1.4765625, + "learning_rate": 0.0004999966126020323, + "loss": 0.3685, + "step": 1300 + }, + { + "epoch": 0.05, + "grad_norm": 1.7578125, + "learning_rate": 0.0004999965559102938, + "loss": 0.3648, + "step": 1310 + }, + { + "epoch": 0.05, + "grad_norm": 0.486328125, + "learning_rate": 0.0004999964987480886, + "loss": 0.3722, + "step": 1320 + }, + { + "epoch": 0.06, + "grad_norm": 0.8359375, + "learning_rate": 0.000499996441115417, + "loss": 0.2889, + "step": 1330 + }, + { + "epoch": 0.06, + "grad_norm": 0.828125, + "learning_rate": 0.0004999963830122793, + "loss": 0.3683, + "step": 1340 + }, + { + "epoch": 0.06, + "grad_norm": 1.203125, + "learning_rate": 0.0004999963244386753, + "loss": 0.3026, + "step": 1350 + }, + { + "epoch": 0.06, + "grad_norm": 0.6328125, + "learning_rate": 0.0004999962653946051, + "loss": 0.3666, + "step": 1360 + }, + { + "epoch": 0.06, + "grad_norm": 1.2265625, + "learning_rate": 0.000499996205880069, + "loss": 0.295, + "step": 1370 + }, + { + "epoch": 0.06, + "grad_norm": 0.8203125, + "learning_rate": 0.000499996145895067, + "loss": 0.3602, + "step": 1380 + }, + { + "epoch": 0.06, + "grad_norm": 1.765625, + "learning_rate": 0.0004999960854395994, + "loss": 0.2869, + "step": 1390 + }, + { + "epoch": 0.06, + "grad_norm": 1.2578125, + "learning_rate": 0.000499996024513666, + "loss": 0.2896, + "step": 1400 + }, + { + "epoch": 0.06, + "grad_norm": 0.86328125, + "learning_rate": 0.0004999959631172672, + "loss": 0.32, + "step": 1410 + }, + { + "epoch": 0.06, + "grad_norm": 0.7265625, + "learning_rate": 0.0004999959012504029, + "loss": 0.3323, + "step": 1420 + }, + { + "epoch": 0.06, + "grad_norm": 0.30078125, + "learning_rate": 0.0004999958389130733, + "loss": 0.2821, + "step": 1430 + }, + { + "epoch": 0.06, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999957761052786, + "loss": 0.3225, + "step": 1440 + }, + { + "epoch": 0.06, + "grad_norm": 0.86328125, + "learning_rate": 0.0004999957128270188, + "loss": 0.3507, + "step": 1450 + }, + { + "epoch": 0.06, + "grad_norm": 0.98046875, + "learning_rate": 0.0004999956490782942, + "loss": 0.3555, + "step": 1460 + }, + { + "epoch": 0.06, + "grad_norm": 1.7578125, + "learning_rate": 0.0004999955848591047, + "loss": 0.325, + "step": 1470 + }, + { + "epoch": 0.06, + "grad_norm": 2.34375, + "learning_rate": 0.0004999955201694504, + "loss": 0.3127, + "step": 1480 + }, + { + "epoch": 0.06, + "grad_norm": 1.0546875, + "learning_rate": 0.0004999954550093317, + "loss": 0.2929, + "step": 1490 + }, + { + "epoch": 0.06, + "grad_norm": 0.78515625, + "learning_rate": 0.0004999953893787484, + "loss": 0.3094, + "step": 1500 + }, + { + "epoch": 0.06, + "grad_norm": 1.234375, + "learning_rate": 0.0004999953232777008, + "loss": 0.334, + "step": 1510 + }, + { + "epoch": 0.06, + "grad_norm": 2.15625, + "learning_rate": 0.0004999952567061891, + "loss": 0.3412, + "step": 1520 + }, + { + "epoch": 0.06, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999951896642132, + "loss": 0.3145, + "step": 1530 + }, + { + "epoch": 0.06, + "grad_norm": 0.640625, + "learning_rate": 0.0004999951221517734, + "loss": 0.2527, + "step": 1540 + }, + { + "epoch": 0.06, + "grad_norm": 0.5, + "learning_rate": 0.0004999950541688697, + "loss": 0.2872, + "step": 1550 + }, + { + "epoch": 0.06, + "grad_norm": 1.1171875, + "learning_rate": 0.0004999949857155024, + "loss": 0.2888, + "step": 1560 + }, + { + "epoch": 0.07, + "grad_norm": 0.96484375, + "learning_rate": 0.0004999949167916716, + "loss": 0.3165, + "step": 1570 + }, + { + "epoch": 0.07, + "grad_norm": 0.875, + "learning_rate": 0.0004999948473973772, + "loss": 0.2907, + "step": 1580 + }, + { + "epoch": 0.07, + "grad_norm": 2.59375, + "learning_rate": 0.0004999947775326197, + "loss": 0.3347, + "step": 1590 + }, + { + "epoch": 0.07, + "grad_norm": 1.1796875, + "learning_rate": 0.0004999947071973989, + "loss": 0.35, + "step": 1600 + }, + { + "epoch": 0.07, + "grad_norm": 0.84375, + "learning_rate": 0.0004999946363917151, + "loss": 0.3087, + "step": 1610 + }, + { + "epoch": 0.07, + "grad_norm": 0.859375, + "learning_rate": 0.0004999945651155683, + "loss": 0.3198, + "step": 1620 + }, + { + "epoch": 0.07, + "grad_norm": 2.328125, + "learning_rate": 0.0004999944933689588, + "loss": 0.3703, + "step": 1630 + }, + { + "epoch": 0.07, + "grad_norm": 0.494140625, + "learning_rate": 0.0004999944211518866, + "loss": 0.348, + "step": 1640 + }, + { + "epoch": 0.07, + "grad_norm": 1.125, + "learning_rate": 0.0004999943484643519, + "loss": 0.3179, + "step": 1650 + }, + { + "epoch": 0.07, + "grad_norm": 1.4375, + "learning_rate": 0.0004999942753063549, + "loss": 0.3416, + "step": 1660 + }, + { + "epoch": 0.07, + "grad_norm": 1.265625, + "learning_rate": 0.0004999942016778957, + "loss": 0.3047, + "step": 1670 + }, + { + "epoch": 0.07, + "grad_norm": 1.09375, + "learning_rate": 0.0004999941275789743, + "loss": 0.2894, + "step": 1680 + }, + { + "epoch": 0.07, + "grad_norm": 1.03125, + "learning_rate": 0.000499994053009591, + "loss": 0.2702, + "step": 1690 + }, + { + "epoch": 0.07, + "grad_norm": 1.0234375, + "learning_rate": 0.0004999939779697459, + "loss": 0.2902, + "step": 1700 + }, + { + "epoch": 0.07, + "grad_norm": 0.46484375, + "learning_rate": 0.0004999939024594391, + "loss": 0.3448, + "step": 1710 + }, + { + "epoch": 0.07, + "grad_norm": 1.28125, + "learning_rate": 0.0004999938264786708, + "loss": 0.33, + "step": 1720 + }, + { + "epoch": 0.07, + "grad_norm": 0.0, + "learning_rate": 0.0004999937500274411, + "loss": 0.3007, + "step": 1730 + }, + { + "epoch": 0.07, + "grad_norm": 1.0078125, + "learning_rate": 0.0004999936731057502, + "loss": 0.3127, + "step": 1740 + }, + { + "epoch": 0.07, + "grad_norm": 0.76953125, + "learning_rate": 0.0004999935957135981, + "loss": 0.3853, + "step": 1750 + }, + { + "epoch": 0.07, + "grad_norm": 0.72265625, + "learning_rate": 0.0004999935178509852, + "loss": 0.3469, + "step": 1760 + }, + { + "epoch": 0.07, + "grad_norm": 1.2421875, + "learning_rate": 0.0004999934395179114, + "loss": 0.2776, + "step": 1770 + }, + { + "epoch": 0.07, + "grad_norm": 1.2578125, + "learning_rate": 0.000499993360714377, + "loss": 0.3026, + "step": 1780 + }, + { + "epoch": 0.07, + "grad_norm": 1.09375, + "learning_rate": 0.0004999932814403821, + "loss": 0.2991, + "step": 1790 + }, + { + "epoch": 0.07, + "grad_norm": 1.296875, + "learning_rate": 0.0004999932016959267, + "loss": 0.3332, + "step": 1800 + }, + { + "epoch": 0.07, + "grad_norm": 1.328125, + "learning_rate": 0.0004999931214810111, + "loss": 0.2901, + "step": 1810 + }, + { + "epoch": 0.08, + "grad_norm": 0.8046875, + "learning_rate": 0.0004999930407956356, + "loss": 0.3706, + "step": 1820 + }, + { + "epoch": 0.08, + "grad_norm": 0.87109375, + "learning_rate": 0.0004999929596398002, + "loss": 0.2969, + "step": 1830 + }, + { + "epoch": 0.08, + "grad_norm": 0.671875, + "learning_rate": 0.0004999928780135049, + "loss": 0.3116, + "step": 1840 + }, + { + "epoch": 0.08, + "grad_norm": 0.4609375, + "learning_rate": 0.00049999279591675, + "loss": 0.3009, + "step": 1850 + }, + { + "epoch": 0.08, + "grad_norm": 0.84765625, + "learning_rate": 0.0004999927133495358, + "loss": 0.3357, + "step": 1860 + }, + { + "epoch": 0.08, + "grad_norm": 0.330078125, + "learning_rate": 0.0004999926303118623, + "loss": 0.353, + "step": 1870 + }, + { + "epoch": 0.08, + "grad_norm": 1.390625, + "learning_rate": 0.0004999925468037296, + "loss": 0.2716, + "step": 1880 + }, + { + "epoch": 0.08, + "grad_norm": 0.58203125, + "learning_rate": 0.0004999924628251379, + "loss": 0.2685, + "step": 1890 + }, + { + "epoch": 0.08, + "grad_norm": 0.86328125, + "learning_rate": 0.0004999923783760874, + "loss": 0.3665, + "step": 1900 + }, + { + "epoch": 0.08, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999922934565783, + "loss": 0.2999, + "step": 1910 + }, + { + "epoch": 0.08, + "grad_norm": 1.375, + "learning_rate": 0.0004999922080666106, + "loss": 0.3282, + "step": 1920 + }, + { + "epoch": 0.08, + "grad_norm": 1.265625, + "learning_rate": 0.0004999921222061846, + "loss": 0.3201, + "step": 1930 + }, + { + "epoch": 0.08, + "grad_norm": 1.84375, + "learning_rate": 0.0004999920358753004, + "loss": 0.2926, + "step": 1940 + }, + { + "epoch": 0.08, + "grad_norm": 0.50390625, + "learning_rate": 0.0004999919490739583, + "loss": 0.2654, + "step": 1950 + }, + { + "epoch": 0.08, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999918618021583, + "loss": 0.3121, + "step": 1960 + }, + { + "epoch": 0.08, + "grad_norm": 1.0703125, + "learning_rate": 0.0004999917740599005, + "loss": 0.3108, + "step": 1970 + }, + { + "epoch": 0.08, + "grad_norm": 1.8828125, + "learning_rate": 0.0004999916858471852, + "loss": 0.3156, + "step": 1980 + }, + { + "epoch": 0.08, + "grad_norm": 3.25, + "learning_rate": 0.0004999915971640127, + "loss": 0.3008, + "step": 1990 + }, + { + "epoch": 0.08, + "grad_norm": 0.384765625, + "learning_rate": 0.0004999915080103829, + "loss": 0.3155, + "step": 2000 + }, + { + "epoch": 0.08, + "grad_norm": 0.59765625, + "learning_rate": 0.0004999914183862961, + "loss": 0.334, + "step": 2010 + }, + { + "epoch": 0.08, + "grad_norm": 0.83984375, + "learning_rate": 0.0004999913282917524, + "loss": 0.3051, + "step": 2020 + }, + { + "epoch": 0.08, + "grad_norm": 0.31640625, + "learning_rate": 0.0004999912377267521, + "loss": 0.3024, + "step": 2030 + }, + { + "epoch": 0.08, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999911466912952, + "loss": 0.3597, + "step": 2040 + }, + { + "epoch": 0.08, + "grad_norm": 0.9140625, + "learning_rate": 0.0004999910551853821, + "loss": 0.2553, + "step": 2050 + }, + { + "epoch": 0.09, + "grad_norm": 1.8125, + "learning_rate": 0.0004999909632090126, + "loss": 0.3038, + "step": 2060 + }, + { + "epoch": 0.09, + "grad_norm": 0.68359375, + "learning_rate": 0.0004999908707621873, + "loss": 0.3228, + "step": 2070 + }, + { + "epoch": 0.09, + "grad_norm": 1.125, + "learning_rate": 0.0004999907778449061, + "loss": 0.2831, + "step": 2080 + }, + { + "epoch": 0.09, + "grad_norm": 0.8671875, + "learning_rate": 0.0004999906844571693, + "loss": 0.3181, + "step": 2090 + }, + { + "epoch": 0.09, + "grad_norm": 0.71484375, + "learning_rate": 0.000499990590598977, + "loss": 0.2869, + "step": 2100 + }, + { + "epoch": 0.09, + "grad_norm": 0.75390625, + "learning_rate": 0.0004999904962703294, + "loss": 0.3167, + "step": 2110 + }, + { + "epoch": 0.09, + "grad_norm": 0.765625, + "learning_rate": 0.0004999904014712267, + "loss": 0.2201, + "step": 2120 + }, + { + "epoch": 0.09, + "grad_norm": 0.97265625, + "learning_rate": 0.000499990306201669, + "loss": 0.3236, + "step": 2130 + }, + { + "epoch": 0.09, + "grad_norm": 1.09375, + "learning_rate": 0.0004999902104616566, + "loss": 0.2701, + "step": 2140 + }, + { + "epoch": 0.09, + "grad_norm": 0.8828125, + "learning_rate": 0.0004999901142511895, + "loss": 0.3678, + "step": 2150 + }, + { + "epoch": 0.09, + "grad_norm": 1.1171875, + "learning_rate": 0.0004999900175702682, + "loss": 0.3429, + "step": 2160 + }, + { + "epoch": 0.09, + "grad_norm": 0.76953125, + "learning_rate": 0.0004999899204188925, + "loss": 0.2931, + "step": 2170 + }, + { + "epoch": 0.09, + "grad_norm": 0.890625, + "learning_rate": 0.000499989822797063, + "loss": 0.302, + "step": 2180 + }, + { + "epoch": 0.09, + "grad_norm": 0.59765625, + "learning_rate": 0.0004999897247047794, + "loss": 0.3104, + "step": 2190 + }, + { + "epoch": 0.09, + "grad_norm": 1.4375, + "learning_rate": 0.0004999896261420423, + "loss": 0.3297, + "step": 2200 + }, + { + "epoch": 0.09, + "grad_norm": 0.8359375, + "learning_rate": 0.0004999895271088517, + "loss": 0.3463, + "step": 2210 + }, + { + "epoch": 0.09, + "grad_norm": 2.203125, + "learning_rate": 0.0004999894276052077, + "loss": 0.3334, + "step": 2220 + }, + { + "epoch": 0.09, + "grad_norm": 1.3125, + "learning_rate": 0.0004999893276311107, + "loss": 0.3042, + "step": 2230 + }, + { + "epoch": 0.09, + "grad_norm": 0.7265625, + "learning_rate": 0.0004999892271865607, + "loss": 0.2923, + "step": 2240 + }, + { + "epoch": 0.09, + "grad_norm": 0.78515625, + "learning_rate": 0.000499989126271558, + "loss": 0.3354, + "step": 2250 + }, + { + "epoch": 0.09, + "grad_norm": 0.83984375, + "learning_rate": 0.0004999890248861029, + "loss": 0.285, + "step": 2260 + }, + { + "epoch": 0.09, + "grad_norm": 0.73046875, + "learning_rate": 0.0004999889230301953, + "loss": 0.3075, + "step": 2270 + }, + { + "epoch": 0.09, + "grad_norm": 0.80859375, + "learning_rate": 0.0004999888207038356, + "loss": 0.2968, + "step": 2280 + }, + { + "epoch": 0.09, + "grad_norm": 2.328125, + "learning_rate": 0.0004999887179070238, + "loss": 0.3542, + "step": 2290 + }, + { + "epoch": 0.1, + "grad_norm": 0.259765625, + "learning_rate": 0.0004999886146397605, + "loss": 0.258, + "step": 2300 + }, + { + "epoch": 0.1, + "grad_norm": 1.78125, + "learning_rate": 0.0004999885109020453, + "loss": 0.291, + "step": 2310 + }, + { + "epoch": 0.1, + "grad_norm": 1.09375, + "learning_rate": 0.0004999884066938789, + "loss": 0.2664, + "step": 2320 + }, + { + "epoch": 0.1, + "grad_norm": 0.7109375, + "learning_rate": 0.0004999883020152614, + "loss": 0.2862, + "step": 2330 + }, + { + "epoch": 0.1, + "grad_norm": 2.75, + "learning_rate": 0.0004999881968661928, + "loss": 0.3029, + "step": 2340 + }, + { + "epoch": 0.1, + "grad_norm": 1.6015625, + "learning_rate": 0.0004999880912466733, + "loss": 0.3229, + "step": 2350 + }, + { + "epoch": 0.1, + "grad_norm": 0.90234375, + "learning_rate": 0.0004999879851567033, + "loss": 0.3332, + "step": 2360 + }, + { + "epoch": 0.1, + "grad_norm": 1.625, + "learning_rate": 0.000499987878596283, + "loss": 0.2553, + "step": 2370 + }, + { + "epoch": 0.1, + "grad_norm": 2.859375, + "learning_rate": 0.0004999877715654124, + "loss": 0.2685, + "step": 2380 + }, + { + "epoch": 0.1, + "grad_norm": 0.6171875, + "learning_rate": 0.0004999876640640919, + "loss": 0.2665, + "step": 2390 + }, + { + "epoch": 0.1, + "grad_norm": 0.7265625, + "learning_rate": 0.0004999875560923215, + "loss": 0.3255, + "step": 2400 + }, + { + "epoch": 0.1, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999874476501016, + "loss": 0.2994, + "step": 2410 + }, + { + "epoch": 0.1, + "grad_norm": 0.75390625, + "learning_rate": 0.0004999873387374323, + "loss": 0.2796, + "step": 2420 + }, + { + "epoch": 0.1, + "grad_norm": 0.6953125, + "learning_rate": 0.0004999872293543139, + "loss": 0.3015, + "step": 2430 + }, + { + "epoch": 0.1, + "grad_norm": 5.5625, + "learning_rate": 0.0004999871195007463, + "loss": 0.3009, + "step": 2440 + }, + { + "epoch": 0.1, + "grad_norm": 0.3046875, + "learning_rate": 0.0004999870091767303, + "loss": 0.3356, + "step": 2450 + }, + { + "epoch": 0.1, + "grad_norm": 1.4296875, + "learning_rate": 0.0004999868983822654, + "loss": 0.2623, + "step": 2460 + }, + { + "epoch": 0.1, + "grad_norm": 1.25, + "learning_rate": 0.0004999867871173523, + "loss": 0.3621, + "step": 2470 + }, + { + "epoch": 0.1, + "grad_norm": 1.2109375, + "learning_rate": 0.0004999866753819911, + "loss": 0.3105, + "step": 2480 + }, + { + "epoch": 0.1, + "grad_norm": 1.3046875, + "learning_rate": 0.0004999865631761819, + "loss": 0.2932, + "step": 2490 + }, + { + "epoch": 0.1, + "grad_norm": 1.7734375, + "learning_rate": 0.000499986450499925, + "loss": 0.2864, + "step": 2500 + }, + { + "epoch": 0.1, + "grad_norm": 0.92578125, + "learning_rate": 0.0004999863373532207, + "loss": 0.3091, + "step": 2510 + }, + { + "epoch": 0.1, + "grad_norm": 1.5703125, + "learning_rate": 0.000499986223736069, + "loss": 0.3078, + "step": 2520 + }, + { + "epoch": 0.1, + "grad_norm": 0.78125, + "learning_rate": 0.0004999861096484702, + "loss": 0.3218, + "step": 2530 + }, + { + "epoch": 0.11, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999859950904245, + "loss": 0.2197, + "step": 2540 + }, + { + "epoch": 0.11, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999858800619324, + "loss": 0.288, + "step": 2550 + }, + { + "epoch": 0.11, + "grad_norm": 1.0234375, + "learning_rate": 0.0004999857645629936, + "loss": 0.302, + "step": 2560 + }, + { + "epoch": 0.11, + "grad_norm": 0.83984375, + "learning_rate": 0.0004999856485936087, + "loss": 0.3331, + "step": 2570 + }, + { + "epoch": 0.11, + "grad_norm": 1.4765625, + "learning_rate": 0.0004999855321537777, + "loss": 0.3044, + "step": 2580 + }, + { + "epoch": 0.11, + "grad_norm": 0.546875, + "learning_rate": 0.0004999854152435011, + "loss": 0.2696, + "step": 2590 + }, + { + "epoch": 0.11, + "grad_norm": 1.234375, + "learning_rate": 0.0004999852978627789, + "loss": 0.2431, + "step": 2600 + }, + { + "epoch": 0.11, + "grad_norm": 0.59765625, + "learning_rate": 0.0004999851800116113, + "loss": 0.2824, + "step": 2610 + }, + { + "epoch": 0.11, + "grad_norm": 0.63671875, + "learning_rate": 0.0004999850616899986, + "loss": 0.3289, + "step": 2620 + }, + { + "epoch": 0.11, + "grad_norm": 0.92578125, + "learning_rate": 0.000499984942897941, + "loss": 0.3242, + "step": 2630 + }, + { + "epoch": 0.11, + "grad_norm": 1.34375, + "learning_rate": 0.0004999848236354388, + "loss": 0.2995, + "step": 2640 + }, + { + "epoch": 0.11, + "grad_norm": 0.64453125, + "learning_rate": 0.0004999847039024922, + "loss": 0.3169, + "step": 2650 + }, + { + "epoch": 0.11, + "grad_norm": 0.703125, + "learning_rate": 0.0004999845836991013, + "loss": 0.3488, + "step": 2660 + }, + { + "epoch": 0.11, + "grad_norm": 0.40234375, + "learning_rate": 0.0004999844630252663, + "loss": 0.2909, + "step": 2670 + }, + { + "epoch": 0.11, + "grad_norm": 0.7109375, + "learning_rate": 0.0004999843418809877, + "loss": 0.268, + "step": 2680 + }, + { + "epoch": 0.11, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999842202662655, + "loss": 0.3085, + "step": 2690 + }, + { + "epoch": 0.11, + "grad_norm": 0.25, + "learning_rate": 0.0004999840981811, + "loss": 0.2817, + "step": 2700 + }, + { + "epoch": 0.11, + "grad_norm": 0.70703125, + "learning_rate": 0.0004999839756254915, + "loss": 0.2842, + "step": 2710 + }, + { + "epoch": 0.11, + "grad_norm": 0.89453125, + "learning_rate": 0.00049998385259944, + "loss": 0.3295, + "step": 2720 + }, + { + "epoch": 0.11, + "grad_norm": 0.421875, + "learning_rate": 0.000499983729102946, + "loss": 0.2718, + "step": 2730 + }, + { + "epoch": 0.11, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999836051360096, + "loss": 0.2964, + "step": 2740 + }, + { + "epoch": 0.11, + "grad_norm": 0.2138671875, + "learning_rate": 0.0004999834806986309, + "loss": 0.2851, + "step": 2750 + }, + { + "epoch": 0.11, + "grad_norm": 1.0859375, + "learning_rate": 0.0004999833557908105, + "loss": 0.2808, + "step": 2760 + }, + { + "epoch": 0.11, + "grad_norm": 0.66015625, + "learning_rate": 0.0004999832304125484, + "loss": 0.3468, + "step": 2770 + }, + { + "epoch": 0.12, + "grad_norm": 0.515625, + "learning_rate": 0.0004999831045638447, + "loss": 0.3336, + "step": 2780 + }, + { + "epoch": 0.12, + "grad_norm": 0.9921875, + "learning_rate": 0.0004999829782446999, + "loss": 0.2583, + "step": 2790 + }, + { + "epoch": 0.12, + "grad_norm": 1.4765625, + "learning_rate": 0.0004999828514551141, + "loss": 0.2889, + "step": 2800 + }, + { + "epoch": 0.12, + "grad_norm": 2.03125, + "learning_rate": 0.0004999827241950876, + "loss": 0.2915, + "step": 2810 + }, + { + "epoch": 0.12, + "grad_norm": 1.6953125, + "learning_rate": 0.0004999825964646207, + "loss": 0.3186, + "step": 2820 + }, + { + "epoch": 0.12, + "grad_norm": 0.78515625, + "learning_rate": 0.0004999824682637134, + "loss": 0.3093, + "step": 2830 + }, + { + "epoch": 0.12, + "grad_norm": 2.4375, + "learning_rate": 0.0004999823395923662, + "loss": 0.3435, + "step": 2840 + }, + { + "epoch": 0.12, + "grad_norm": 0.59375, + "learning_rate": 0.0004999822104505791, + "loss": 0.261, + "step": 2850 + }, + { + "epoch": 0.12, + "grad_norm": 0.60546875, + "learning_rate": 0.0004999820808383527, + "loss": 0.2935, + "step": 2860 + }, + { + "epoch": 0.12, + "grad_norm": 0.494140625, + "learning_rate": 0.0004999819507556868, + "loss": 0.3102, + "step": 2870 + }, + { + "epoch": 0.12, + "grad_norm": 0.326171875, + "learning_rate": 0.0004999818202025819, + "loss": 0.2597, + "step": 2880 + }, + { + "epoch": 0.12, + "grad_norm": 0.1796875, + "learning_rate": 0.0004999816891790382, + "loss": 0.328, + "step": 2890 + }, + { + "epoch": 0.12, + "grad_norm": 1.1015625, + "learning_rate": 0.000499981557685056, + "loss": 0.2938, + "step": 2900 + }, + { + "epoch": 0.12, + "grad_norm": 0.984375, + "learning_rate": 0.0004999814257206355, + "loss": 0.3352, + "step": 2910 + }, + { + "epoch": 0.12, + "grad_norm": 3.3125, + "learning_rate": 0.000499981293285777, + "loss": 0.2745, + "step": 2920 + }, + { + "epoch": 0.12, + "grad_norm": 1.015625, + "learning_rate": 0.0004999811603804806, + "loss": 0.2161, + "step": 2930 + }, + { + "epoch": 0.12, + "grad_norm": 0.37109375, + "learning_rate": 0.0004999810270047468, + "loss": 0.2805, + "step": 2940 + }, + { + "epoch": 0.12, + "grad_norm": 1.0234375, + "learning_rate": 0.0004999808931585755, + "loss": 0.2409, + "step": 2950 + }, + { + "epoch": 0.12, + "grad_norm": 0.65234375, + "learning_rate": 0.0004999807588419674, + "loss": 0.3236, + "step": 2960 + }, + { + "epoch": 0.12, + "grad_norm": 0.625, + "learning_rate": 0.0004999806240549222, + "loss": 0.2807, + "step": 2970 + }, + { + "epoch": 0.12, + "grad_norm": 0.373046875, + "learning_rate": 0.0004999804887974407, + "loss": 0.2781, + "step": 2980 + }, + { + "epoch": 0.12, + "grad_norm": 0.84375, + "learning_rate": 0.0004999803530695229, + "loss": 0.3015, + "step": 2990 + }, + { + "epoch": 0.12, + "grad_norm": 1.375, + "learning_rate": 0.000499980216871169, + "loss": 0.3282, + "step": 3000 + }, + { + "epoch": 0.12, + "grad_norm": 5.09375, + "learning_rate": 0.0004999800802023794, + "loss": 0.3094, + "step": 3010 + }, + { + "epoch": 0.13, + "grad_norm": 0.67578125, + "learning_rate": 0.0004999799430631542, + "loss": 0.281, + "step": 3020 + }, + { + "epoch": 0.13, + "grad_norm": 1.359375, + "learning_rate": 0.0004999798054534937, + "loss": 0.2694, + "step": 3030 + }, + { + "epoch": 0.13, + "grad_norm": 1.1171875, + "learning_rate": 0.0004999796673733983, + "loss": 0.2808, + "step": 3040 + }, + { + "epoch": 0.13, + "grad_norm": 0.703125, + "learning_rate": 0.0004999795288228682, + "loss": 0.2501, + "step": 3050 + }, + { + "epoch": 0.13, + "grad_norm": 1.3046875, + "learning_rate": 0.0004999793898019035, + "loss": 0.3492, + "step": 3060 + }, + { + "epoch": 0.13, + "grad_norm": 0.421875, + "learning_rate": 0.0004999792503105048, + "loss": 0.219, + "step": 3070 + }, + { + "epoch": 0.13, + "grad_norm": 0.90625, + "learning_rate": 0.0004999791103486719, + "loss": 0.2859, + "step": 3080 + }, + { + "epoch": 0.13, + "grad_norm": 1.0625, + "learning_rate": 0.0004999789699164053, + "loss": 0.3115, + "step": 3090 + }, + { + "epoch": 0.13, + "grad_norm": 0.80078125, + "learning_rate": 0.0004999788290137054, + "loss": 0.3104, + "step": 3100 + }, + { + "epoch": 0.13, + "grad_norm": 8.4375, + "learning_rate": 0.0004999786876405724, + "loss": 0.2714, + "step": 3110 + }, + { + "epoch": 0.13, + "grad_norm": 1.34375, + "learning_rate": 0.0004999785457970064, + "loss": 0.366, + "step": 3120 + }, + { + "epoch": 0.13, + "grad_norm": 0.8046875, + "learning_rate": 0.0004999784034830078, + "loss": 0.3262, + "step": 3130 + }, + { + "epoch": 0.13, + "grad_norm": 1.2421875, + "learning_rate": 0.0004999782606985769, + "loss": 0.3268, + "step": 3140 + }, + { + "epoch": 0.13, + "grad_norm": 0.921875, + "learning_rate": 0.0004999781174437138, + "loss": 0.2109, + "step": 3150 + }, + { + "epoch": 0.13, + "grad_norm": 1.125, + "learning_rate": 0.0004999779737184189, + "loss": 0.2906, + "step": 3160 + }, + { + "epoch": 0.13, + "grad_norm": 0.455078125, + "learning_rate": 0.0004999778295226925, + "loss": 0.2829, + "step": 3170 + }, + { + "epoch": 0.13, + "grad_norm": 0.76953125, + "learning_rate": 0.0004999776848565347, + "loss": 0.3023, + "step": 3180 + }, + { + "epoch": 0.13, + "grad_norm": 0.73828125, + "learning_rate": 0.000499977539719946, + "loss": 0.2116, + "step": 3190 + }, + { + "epoch": 0.13, + "grad_norm": 0.248046875, + "learning_rate": 0.0004999773941129265, + "loss": 0.3067, + "step": 3200 + }, + { + "epoch": 0.13, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999772480354766, + "loss": 0.3927, + "step": 3210 + }, + { + "epoch": 0.13, + "grad_norm": 1.203125, + "learning_rate": 0.0004999771014875965, + "loss": 0.2088, + "step": 3220 + }, + { + "epoch": 0.13, + "grad_norm": 0.8359375, + "learning_rate": 0.0004999769544692866, + "loss": 0.2711, + "step": 3230 + }, + { + "epoch": 0.13, + "grad_norm": 1.171875, + "learning_rate": 0.0004999768069805469, + "loss": 0.3164, + "step": 3240 + }, + { + "epoch": 0.13, + "grad_norm": 0.66015625, + "learning_rate": 0.0004999766590213779, + "loss": 0.2701, + "step": 3250 + }, + { + "epoch": 0.14, + "grad_norm": 0.67578125, + "learning_rate": 0.0004999765105917799, + "loss": 0.3645, + "step": 3260 + }, + { + "epoch": 0.14, + "grad_norm": 1.328125, + "learning_rate": 0.000499976361691753, + "loss": 0.266, + "step": 3270 + }, + { + "epoch": 0.14, + "grad_norm": 0.2236328125, + "learning_rate": 0.0004999762123212975, + "loss": 0.2915, + "step": 3280 + }, + { + "epoch": 0.14, + "grad_norm": 1.359375, + "learning_rate": 0.0004999760624804139, + "loss": 0.3049, + "step": 3290 + }, + { + "epoch": 0.14, + "grad_norm": 0.97265625, + "learning_rate": 0.0004999759121691023, + "loss": 0.2528, + "step": 3300 + }, + { + "epoch": 0.14, + "grad_norm": 1.0703125, + "learning_rate": 0.0004999757613873631, + "loss": 0.2787, + "step": 3310 + }, + { + "epoch": 0.14, + "grad_norm": 1.609375, + "learning_rate": 0.0004999756101351964, + "loss": 0.2743, + "step": 3320 + }, + { + "epoch": 0.14, + "grad_norm": 1.4453125, + "learning_rate": 0.0004999754584126027, + "loss": 0.3245, + "step": 3330 + }, + { + "epoch": 0.14, + "grad_norm": 0.9765625, + "learning_rate": 0.0004999753062195822, + "loss": 0.3649, + "step": 3340 + }, + { + "epoch": 0.14, + "grad_norm": 1.34375, + "learning_rate": 0.0004999751535561351, + "loss": 0.3301, + "step": 3350 + }, + { + "epoch": 0.14, + "grad_norm": 1.078125, + "learning_rate": 0.0004999750004222618, + "loss": 0.2969, + "step": 3360 + }, + { + "epoch": 0.14, + "grad_norm": 7.6875, + "learning_rate": 0.0004999748468179624, + "loss": 0.2987, + "step": 3370 + }, + { + "epoch": 0.14, + "grad_norm": 0.3359375, + "learning_rate": 0.0004999746927432375, + "loss": 0.3099, + "step": 3380 + }, + { + "epoch": 0.14, + "grad_norm": 0.40234375, + "learning_rate": 0.0004999745381980872, + "loss": 0.3225, + "step": 3390 + }, + { + "epoch": 0.14, + "grad_norm": 0.765625, + "learning_rate": 0.0004999743831825117, + "loss": 0.2727, + "step": 3400 + }, + { + "epoch": 0.14, + "grad_norm": 0.9609375, + "learning_rate": 0.0004999742276965114, + "loss": 0.3039, + "step": 3410 + }, + { + "epoch": 0.14, + "grad_norm": 0.703125, + "learning_rate": 0.0004999740717400868, + "loss": 0.337, + "step": 3420 + }, + { + "epoch": 0.14, + "grad_norm": 0.380859375, + "learning_rate": 0.0004999739153132379, + "loss": 0.2486, + "step": 3430 + }, + { + "epoch": 0.14, + "grad_norm": 0.75390625, + "learning_rate": 0.000499973758415965, + "loss": 0.3389, + "step": 3440 + }, + { + "epoch": 0.14, + "grad_norm": 0.478515625, + "learning_rate": 0.0004999736010482685, + "loss": 0.3683, + "step": 3450 + }, + { + "epoch": 0.14, + "grad_norm": 1.21875, + "learning_rate": 0.0004999734432101487, + "loss": 0.3068, + "step": 3460 + }, + { + "epoch": 0.14, + "grad_norm": 0.365234375, + "learning_rate": 0.0004999732849016059, + "loss": 0.325, + "step": 3470 + }, + { + "epoch": 0.14, + "grad_norm": 1.046875, + "learning_rate": 0.0004999731261226403, + "loss": 0.2552, + "step": 3480 + }, + { + "epoch": 0.14, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999729668732525, + "loss": 0.2935, + "step": 3490 + }, + { + "epoch": 0.14, + "grad_norm": 0.65625, + "learning_rate": 0.0004999728071534424, + "loss": 0.2868, + "step": 3500 + }, + { + "epoch": 0.15, + "grad_norm": 1.5078125, + "learning_rate": 0.0004999726469632104, + "loss": 0.2938, + "step": 3510 + }, + { + "epoch": 0.15, + "grad_norm": 0.431640625, + "learning_rate": 0.000499972486302557, + "loss": 0.2528, + "step": 3520 + }, + { + "epoch": 0.15, + "grad_norm": 1.3359375, + "learning_rate": 0.0004999723251714823, + "loss": 0.272, + "step": 3530 + }, + { + "epoch": 0.15, + "grad_norm": 1.0390625, + "learning_rate": 0.0004999721635699867, + "loss": 0.2603, + "step": 3540 + }, + { + "epoch": 0.15, + "grad_norm": 0.404296875, + "learning_rate": 0.0004999720014980704, + "loss": 0.2908, + "step": 3550 + }, + { + "epoch": 0.15, + "grad_norm": 1.125, + "learning_rate": 0.000499971838955734, + "loss": 0.3134, + "step": 3560 + }, + { + "epoch": 0.15, + "grad_norm": 2.515625, + "learning_rate": 0.0004999716759429775, + "loss": 0.2135, + "step": 3570 + }, + { + "epoch": 0.15, + "grad_norm": 1.1953125, + "learning_rate": 0.0004999715124598013, + "loss": 0.2759, + "step": 3580 + }, + { + "epoch": 0.15, + "grad_norm": 0.435546875, + "learning_rate": 0.0004999713485062057, + "loss": 0.2508, + "step": 3590 + }, + { + "epoch": 0.15, + "grad_norm": 0.484375, + "learning_rate": 0.0004999711840821909, + "loss": 0.3017, + "step": 3600 + }, + { + "epoch": 0.15, + "grad_norm": 0.81640625, + "learning_rate": 0.0004999710191877575, + "loss": 0.2752, + "step": 3610 + }, + { + "epoch": 0.15, + "grad_norm": 1.328125, + "learning_rate": 0.0004999708538229055, + "loss": 0.2971, + "step": 3620 + }, + { + "epoch": 0.15, + "grad_norm": 0.8046875, + "learning_rate": 0.0004999706879876354, + "loss": 0.2524, + "step": 3630 + }, + { + "epoch": 0.15, + "grad_norm": 2.859375, + "learning_rate": 0.0004999705216819475, + "loss": 0.2615, + "step": 3640 + }, + { + "epoch": 0.15, + "grad_norm": 1.859375, + "learning_rate": 0.000499970354905842, + "loss": 0.252, + "step": 3650 + }, + { + "epoch": 0.15, + "grad_norm": 1.7578125, + "learning_rate": 0.0004999701876593194, + "loss": 0.2969, + "step": 3660 + }, + { + "epoch": 0.15, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999700199423798, + "loss": 0.3443, + "step": 3670 + }, + { + "epoch": 0.15, + "grad_norm": 0.4453125, + "learning_rate": 0.0004999698517550236, + "loss": 0.2812, + "step": 3680 + }, + { + "epoch": 0.15, + "grad_norm": 0.640625, + "learning_rate": 0.0004999696830972511, + "loss": 0.2697, + "step": 3690 + }, + { + "epoch": 0.15, + "grad_norm": 1.0625, + "learning_rate": 0.0004999695139690628, + "loss": 0.2609, + "step": 3700 + }, + { + "epoch": 0.15, + "grad_norm": 0.453125, + "learning_rate": 0.0004999693443704588, + "loss": 0.323, + "step": 3710 + }, + { + "epoch": 0.15, + "grad_norm": 2.703125, + "learning_rate": 0.0004999691743014394, + "loss": 0.2663, + "step": 3720 + }, + { + "epoch": 0.15, + "grad_norm": 0.453125, + "learning_rate": 0.0004999690037620052, + "loss": 0.3385, + "step": 3730 + }, + { + "epoch": 0.15, + "grad_norm": 0.875, + "learning_rate": 0.0004999688327521562, + "loss": 0.2737, + "step": 3740 + }, + { + "epoch": 0.16, + "grad_norm": 0.484375, + "learning_rate": 0.0004999686612718929, + "loss": 0.2881, + "step": 3750 + }, + { + "epoch": 0.16, + "grad_norm": 0.4765625, + "learning_rate": 0.0004999684893212155, + "loss": 0.2836, + "step": 3760 + }, + { + "epoch": 0.16, + "grad_norm": 1.140625, + "learning_rate": 0.0004999683169001245, + "loss": 0.3308, + "step": 3770 + }, + { + "epoch": 0.16, + "grad_norm": 0.6484375, + "learning_rate": 0.00049996814400862, + "loss": 0.2909, + "step": 3780 + }, + { + "epoch": 0.16, + "grad_norm": 0.78515625, + "learning_rate": 0.0004999679706467025, + "loss": 0.3195, + "step": 3790 + }, + { + "epoch": 0.16, + "grad_norm": 0.201171875, + "learning_rate": 0.0004999677968143723, + "loss": 0.2826, + "step": 3800 + }, + { + "epoch": 0.16, + "grad_norm": 0.7578125, + "learning_rate": 0.0004999676225116297, + "loss": 0.3067, + "step": 3810 + }, + { + "epoch": 0.16, + "grad_norm": 0.96875, + "learning_rate": 0.000499967447738475, + "loss": 0.375, + "step": 3820 + }, + { + "epoch": 0.16, + "grad_norm": 1.03125, + "learning_rate": 0.0004999672724949086, + "loss": 0.3255, + "step": 3830 + }, + { + "epoch": 0.16, + "grad_norm": 1.6484375, + "learning_rate": 0.0004999670967809307, + "loss": 0.2972, + "step": 3840 + }, + { + "epoch": 0.16, + "grad_norm": 0.0, + "learning_rate": 0.0004999669205965418, + "loss": 0.2676, + "step": 3850 + }, + { + "epoch": 0.16, + "grad_norm": 0.703125, + "learning_rate": 0.0004999667439417421, + "loss": 0.3539, + "step": 3860 + }, + { + "epoch": 0.16, + "grad_norm": 1.296875, + "learning_rate": 0.0004999665668165321, + "loss": 0.3361, + "step": 3870 + }, + { + "epoch": 0.16, + "grad_norm": 1.0, + "learning_rate": 0.0004999663892209119, + "loss": 0.3094, + "step": 3880 + }, + { + "epoch": 0.16, + "grad_norm": 0.9296875, + "learning_rate": 0.0004999662111548819, + "loss": 0.2927, + "step": 3890 + }, + { + "epoch": 0.16, + "grad_norm": 0.447265625, + "learning_rate": 0.0004999660326184427, + "loss": 0.2805, + "step": 3900 + }, + { + "epoch": 0.16, + "grad_norm": 1.640625, + "learning_rate": 0.0004999658536115942, + "loss": 0.2376, + "step": 3910 + }, + { + "epoch": 0.16, + "grad_norm": 1.328125, + "learning_rate": 0.000499965674134337, + "loss": 0.2676, + "step": 3920 + }, + { + "epoch": 0.16, + "grad_norm": 1.1171875, + "learning_rate": 0.0004999654941866715, + "loss": 0.311, + "step": 3930 + }, + { + "epoch": 0.16, + "grad_norm": 0.77734375, + "learning_rate": 0.0004999653137685979, + "loss": 0.2543, + "step": 3940 + }, + { + "epoch": 0.16, + "grad_norm": 0.84375, + "learning_rate": 0.0004999651328801164, + "loss": 0.2481, + "step": 3950 + }, + { + "epoch": 0.16, + "grad_norm": 0.72265625, + "learning_rate": 0.0004999649515212277, + "loss": 0.2252, + "step": 3960 + }, + { + "epoch": 0.16, + "grad_norm": 0.56640625, + "learning_rate": 0.0004999647696919319, + "loss": 0.343, + "step": 3970 + }, + { + "epoch": 0.16, + "grad_norm": 0.578125, + "learning_rate": 0.0004999645873922295, + "loss": 0.2701, + "step": 3980 + }, + { + "epoch": 0.17, + "grad_norm": 0.5078125, + "learning_rate": 0.0004999644046221205, + "loss": 0.35, + "step": 3990 + }, + { + "epoch": 0.17, + "grad_norm": 1.3046875, + "learning_rate": 0.0004999642213816057, + "loss": 0.2573, + "step": 4000 + }, + { + "epoch": 0.17, + "grad_norm": 0.3984375, + "learning_rate": 0.0004999640376706852, + "loss": 0.3026, + "step": 4010 + }, + { + "epoch": 0.17, + "grad_norm": 1.5, + "learning_rate": 0.0004999638534893593, + "loss": 0.2408, + "step": 4020 + }, + { + "epoch": 0.17, + "grad_norm": 1.53125, + "learning_rate": 0.0004999636688376285, + "loss": 0.278, + "step": 4030 + }, + { + "epoch": 0.17, + "grad_norm": 0.29296875, + "learning_rate": 0.0004999634837154931, + "loss": 0.2663, + "step": 4040 + }, + { + "epoch": 0.17, + "grad_norm": 1.3984375, + "learning_rate": 0.0004999632981229533, + "loss": 0.234, + "step": 4050 + }, + { + "epoch": 0.17, + "grad_norm": 0.83203125, + "learning_rate": 0.0004999631120600096, + "loss": 0.2501, + "step": 4060 + }, + { + "epoch": 0.17, + "grad_norm": 1.828125, + "learning_rate": 0.0004999629255266623, + "loss": 0.2926, + "step": 4070 + }, + { + "epoch": 0.17, + "grad_norm": 0.96875, + "learning_rate": 0.0004999627385229118, + "loss": 0.3074, + "step": 4080 + }, + { + "epoch": 0.17, + "grad_norm": 0.291015625, + "learning_rate": 0.0004999625510487584, + "loss": 0.2032, + "step": 4090 + }, + { + "epoch": 0.17, + "grad_norm": 0.58984375, + "learning_rate": 0.0004999623631042025, + "loss": 0.2784, + "step": 4100 + }, + { + "epoch": 0.17, + "grad_norm": 1.1953125, + "learning_rate": 0.0004999621746892445, + "loss": 0.3041, + "step": 4110 + }, + { + "epoch": 0.17, + "grad_norm": 0.625, + "learning_rate": 0.0004999619858038845, + "loss": 0.2728, + "step": 4120 + }, + { + "epoch": 0.17, + "grad_norm": 0.8046875, + "learning_rate": 0.0004999617964481231, + "loss": 0.3083, + "step": 4130 + }, + { + "epoch": 0.17, + "grad_norm": 0.376953125, + "learning_rate": 0.0004999616066219606, + "loss": 0.2734, + "step": 4140 + }, + { + "epoch": 0.17, + "grad_norm": 0.5, + "learning_rate": 0.0004999614163253974, + "loss": 0.2871, + "step": 4150 + }, + { + "epoch": 0.17, + "grad_norm": 1.4140625, + "learning_rate": 0.0004999612255584338, + "loss": 0.2708, + "step": 4160 + }, + { + "epoch": 0.17, + "grad_norm": 0.79296875, + "learning_rate": 0.0004999610343210701, + "loss": 0.2589, + "step": 4170 + }, + { + "epoch": 0.17, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999608426133069, + "loss": 0.2684, + "step": 4180 + }, + { + "epoch": 0.17, + "grad_norm": 1.984375, + "learning_rate": 0.0004999606504351441, + "loss": 0.2734, + "step": 4190 + }, + { + "epoch": 0.17, + "grad_norm": 0.8125, + "learning_rate": 0.0004999604577865827, + "loss": 0.3128, + "step": 4200 + }, + { + "epoch": 0.17, + "grad_norm": 0.345703125, + "learning_rate": 0.0004999602646676225, + "loss": 0.2464, + "step": 4210 + }, + { + "epoch": 0.17, + "grad_norm": 0.78125, + "learning_rate": 0.0004999600710782641, + "loss": 0.3485, + "step": 4220 + }, + { + "epoch": 0.18, + "grad_norm": 0.875, + "learning_rate": 0.0004999598770185077, + "loss": 0.2814, + "step": 4230 + }, + { + "epoch": 0.18, + "grad_norm": 1.8046875, + "learning_rate": 0.000499959682488354, + "loss": 0.3007, + "step": 4240 + }, + { + "epoch": 0.18, + "grad_norm": 0.81640625, + "learning_rate": 0.0004999594874878031, + "loss": 0.2577, + "step": 4250 + }, + { + "epoch": 0.18, + "grad_norm": 0.67578125, + "learning_rate": 0.0004999592920168555, + "loss": 0.2729, + "step": 4260 + }, + { + "epoch": 0.18, + "grad_norm": 1.984375, + "learning_rate": 0.0004999590960755114, + "loss": 0.2438, + "step": 4270 + }, + { + "epoch": 0.18, + "grad_norm": 0.2578125, + "learning_rate": 0.0004999588996637714, + "loss": 0.2645, + "step": 4280 + }, + { + "epoch": 0.18, + "grad_norm": 1.8125, + "learning_rate": 0.0004999587027816356, + "loss": 0.2711, + "step": 4290 + }, + { + "epoch": 0.18, + "grad_norm": 1.2109375, + "learning_rate": 0.0004999585054291046, + "loss": 0.3607, + "step": 4300 + }, + { + "epoch": 0.18, + "grad_norm": 0.455078125, + "learning_rate": 0.0004999583076061787, + "loss": 0.3275, + "step": 4310 + }, + { + "epoch": 0.18, + "grad_norm": 0.84375, + "learning_rate": 0.0004999581093128582, + "loss": 0.2892, + "step": 4320 + }, + { + "epoch": 0.18, + "grad_norm": 0.328125, + "learning_rate": 0.0004999579105491437, + "loss": 0.2947, + "step": 4330 + }, + { + "epoch": 0.18, + "grad_norm": 0.7578125, + "learning_rate": 0.0004999577113150352, + "loss": 0.3011, + "step": 4340 + }, + { + "epoch": 0.18, + "grad_norm": 0.640625, + "learning_rate": 0.0004999575116105333, + "loss": 0.2475, + "step": 4350 + }, + { + "epoch": 0.18, + "grad_norm": 0.97265625, + "learning_rate": 0.0004999573114356384, + "loss": 0.2808, + "step": 4360 + }, + { + "epoch": 0.18, + "grad_norm": 0.70703125, + "learning_rate": 0.0004999571107903508, + "loss": 0.3107, + "step": 4370 + }, + { + "epoch": 0.18, + "grad_norm": 0.48046875, + "learning_rate": 0.000499956909674671, + "loss": 0.2195, + "step": 4380 + }, + { + "epoch": 0.18, + "grad_norm": 0.84375, + "learning_rate": 0.0004999567080885992, + "loss": 0.2721, + "step": 4390 + }, + { + "epoch": 0.18, + "grad_norm": 1.125, + "learning_rate": 0.000499956506032136, + "loss": 0.3345, + "step": 4400 + }, + { + "epoch": 0.18, + "grad_norm": 0.490234375, + "learning_rate": 0.0004999563035052815, + "loss": 0.2697, + "step": 4410 + }, + { + "epoch": 0.18, + "grad_norm": 0.7109375, + "learning_rate": 0.0004999561005080363, + "loss": 0.2423, + "step": 4420 + }, + { + "epoch": 0.18, + "grad_norm": 0.7578125, + "learning_rate": 0.0004999558970404007, + "loss": 0.2606, + "step": 4430 + }, + { + "epoch": 0.18, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004999556931023751, + "loss": 0.2731, + "step": 4440 + }, + { + "epoch": 0.18, + "grad_norm": 1.1796875, + "learning_rate": 0.0004999554886939599, + "loss": 0.2086, + "step": 4450 + }, + { + "epoch": 0.18, + "grad_norm": 0.8515625, + "learning_rate": 0.0004999552838151555, + "loss": 0.2924, + "step": 4460 + }, + { + "epoch": 0.19, + "grad_norm": 0.78125, + "learning_rate": 0.0004999550784659621, + "loss": 0.2017, + "step": 4470 + }, + { + "epoch": 0.19, + "grad_norm": 1.25, + "learning_rate": 0.0004999548726463803, + "loss": 0.2898, + "step": 4480 + }, + { + "epoch": 0.19, + "grad_norm": 1.0234375, + "learning_rate": 0.0004999546663564104, + "loss": 0.2527, + "step": 4490 + }, + { + "epoch": 0.19, + "grad_norm": 0.72265625, + "learning_rate": 0.0004999544595960529, + "loss": 0.2789, + "step": 4500 + }, + { + "epoch": 0.19, + "grad_norm": 1.078125, + "learning_rate": 0.0004999542523653081, + "loss": 0.2697, + "step": 4510 + }, + { + "epoch": 0.19, + "grad_norm": 0.57421875, + "learning_rate": 0.0004999540446641764, + "loss": 0.2611, + "step": 4520 + }, + { + "epoch": 0.19, + "grad_norm": 0.52734375, + "learning_rate": 0.000499953836492658, + "loss": 0.2665, + "step": 4530 + }, + { + "epoch": 0.19, + "grad_norm": 0.51953125, + "learning_rate": 0.0004999536278507536, + "loss": 0.2592, + "step": 4540 + }, + { + "epoch": 0.19, + "grad_norm": 0.69140625, + "learning_rate": 0.0004999534187384634, + "loss": 0.2374, + "step": 4550 + }, + { + "epoch": 0.19, + "grad_norm": 1.390625, + "learning_rate": 0.000499953209155788, + "loss": 0.2703, + "step": 4560 + }, + { + "epoch": 0.19, + "grad_norm": 0.86328125, + "learning_rate": 0.0004999529991027275, + "loss": 0.2952, + "step": 4570 + }, + { + "epoch": 0.19, + "grad_norm": 1.9609375, + "learning_rate": 0.0004999527885792826, + "loss": 0.2998, + "step": 4580 + }, + { + "epoch": 0.19, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999525775854534, + "loss": 0.2716, + "step": 4590 + }, + { + "epoch": 0.19, + "grad_norm": 0.765625, + "learning_rate": 0.0004999523661212405, + "loss": 0.2741, + "step": 4600 + }, + { + "epoch": 0.19, + "grad_norm": 0.625, + "learning_rate": 0.0004999521541866443, + "loss": 0.3269, + "step": 4610 + }, + { + "epoch": 0.19, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999519417816651, + "loss": 0.325, + "step": 4620 + }, + { + "epoch": 0.19, + "grad_norm": 0.76171875, + "learning_rate": 0.0004999517289063033, + "loss": 0.2495, + "step": 4630 + }, + { + "epoch": 0.19, + "grad_norm": 0.83984375, + "learning_rate": 0.0004999515155605594, + "loss": 0.2341, + "step": 4640 + }, + { + "epoch": 0.19, + "grad_norm": 0.41015625, + "learning_rate": 0.0004999513017444337, + "loss": 0.2627, + "step": 4650 + }, + { + "epoch": 0.19, + "grad_norm": 0.51171875, + "learning_rate": 0.0004999510874579266, + "loss": 0.2857, + "step": 4660 + }, + { + "epoch": 0.19, + "grad_norm": 2.703125, + "learning_rate": 0.0004999508727010386, + "loss": 0.3004, + "step": 4670 + }, + { + "epoch": 0.19, + "grad_norm": 1.375, + "learning_rate": 0.0004999506574737701, + "loss": 0.2956, + "step": 4680 + }, + { + "epoch": 0.19, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999504417761214, + "loss": 0.2723, + "step": 4690 + }, + { + "epoch": 0.19, + "grad_norm": 0.51953125, + "learning_rate": 0.0004999502256080928, + "loss": 0.2646, + "step": 4700 + }, + { + "epoch": 0.2, + "grad_norm": 0.3046875, + "learning_rate": 0.0004999500089696851, + "loss": 0.2751, + "step": 4710 + }, + { + "epoch": 0.2, + "grad_norm": 0.83203125, + "learning_rate": 0.0004999497918608984, + "loss": 0.3105, + "step": 4720 + }, + { + "epoch": 0.2, + "grad_norm": 0.70703125, + "learning_rate": 0.0004999495742817332, + "loss": 0.2986, + "step": 4730 + }, + { + "epoch": 0.2, + "grad_norm": 0.55078125, + "learning_rate": 0.0004999493562321899, + "loss": 0.2921, + "step": 4740 + }, + { + "epoch": 0.2, + "grad_norm": 0.7734375, + "learning_rate": 0.0004999491377122689, + "loss": 0.2576, + "step": 4750 + }, + { + "epoch": 0.2, + "grad_norm": 0.296875, + "learning_rate": 0.0004999489187219705, + "loss": 0.2669, + "step": 4760 + }, + { + "epoch": 0.2, + "grad_norm": 1.1328125, + "learning_rate": 0.0004999486992612954, + "loss": 0.3043, + "step": 4770 + }, + { + "epoch": 0.2, + "grad_norm": 1.1640625, + "learning_rate": 0.0004999484793302437, + "loss": 0.2603, + "step": 4780 + }, + { + "epoch": 0.2, + "grad_norm": 0.2080078125, + "learning_rate": 0.0004999482589288161, + "loss": 0.2197, + "step": 4790 + }, + { + "epoch": 0.2, + "grad_norm": 0.40234375, + "learning_rate": 0.0004999480380570127, + "loss": 0.2672, + "step": 4800 + }, + { + "epoch": 0.2, + "grad_norm": 0.96875, + "learning_rate": 0.0004999478167148342, + "loss": 0.2163, + "step": 4810 + }, + { + "epoch": 0.2, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999475949022809, + "loss": 0.2583, + "step": 4820 + }, + { + "epoch": 0.2, + "grad_norm": 1.03125, + "learning_rate": 0.0004999473726193532, + "loss": 0.2959, + "step": 4830 + }, + { + "epoch": 0.2, + "grad_norm": 0.40625, + "learning_rate": 0.0004999471498660515, + "loss": 0.2997, + "step": 4840 + }, + { + "epoch": 0.2, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999469266423762, + "loss": 0.2495, + "step": 4850 + }, + { + "epoch": 0.2, + "grad_norm": 0.498046875, + "learning_rate": 0.0004999467029483279, + "loss": 0.2332, + "step": 4860 + }, + { + "epoch": 0.2, + "grad_norm": 1.4921875, + "learning_rate": 0.0004999464787839069, + "loss": 0.3074, + "step": 4870 + }, + { + "epoch": 0.2, + "grad_norm": 1.0078125, + "learning_rate": 0.0004999462541491136, + "loss": 0.2415, + "step": 4880 + }, + { + "epoch": 0.2, + "grad_norm": 1.21875, + "learning_rate": 0.0004999460290439484, + "loss": 0.3081, + "step": 4890 + }, + { + "epoch": 0.2, + "grad_norm": 0.83203125, + "learning_rate": 0.0004999458034684117, + "loss": 0.2545, + "step": 4900 + }, + { + "epoch": 0.2, + "grad_norm": 1.109375, + "learning_rate": 0.0004999455774225041, + "loss": 0.3227, + "step": 4910 + }, + { + "epoch": 0.2, + "grad_norm": 0.2490234375, + "learning_rate": 0.0004999453509062259, + "loss": 0.2479, + "step": 4920 + }, + { + "epoch": 0.2, + "grad_norm": 0.37109375, + "learning_rate": 0.0004999451239195775, + "loss": 0.2326, + "step": 4930 + }, + { + "epoch": 0.2, + "grad_norm": 0.4453125, + "learning_rate": 0.0004999448964625593, + "loss": 0.2613, + "step": 4940 + }, + { + "epoch": 0.21, + "grad_norm": 0.63671875, + "learning_rate": 0.000499944668535172, + "loss": 0.2278, + "step": 4950 + }, + { + "epoch": 0.21, + "grad_norm": 0.56640625, + "learning_rate": 0.0004999444401374157, + "loss": 0.2715, + "step": 4960 + }, + { + "epoch": 0.21, + "grad_norm": 1.3515625, + "learning_rate": 0.000499944211269291, + "loss": 0.2595, + "step": 4970 + }, + { + "epoch": 0.21, + "grad_norm": 0.64453125, + "learning_rate": 0.0004999439819307983, + "loss": 0.2903, + "step": 4980 + }, + { + "epoch": 0.21, + "grad_norm": 0.6328125, + "learning_rate": 0.0004999437521219378, + "loss": 0.2561, + "step": 4990 + }, + { + "epoch": 0.21, + "grad_norm": 1.015625, + "learning_rate": 0.0004999435218427104, + "loss": 0.2737, + "step": 5000 + }, + { + "epoch": 0.21, + "grad_norm": 2.15625, + "learning_rate": 0.0004999432910931162, + "loss": 0.2805, + "step": 5010 + }, + { + "epoch": 0.21, + "grad_norm": 0.79296875, + "learning_rate": 0.0004999430598731558, + "loss": 0.2532, + "step": 5020 + }, + { + "epoch": 0.21, + "grad_norm": 0.77734375, + "learning_rate": 0.0004999428281828295, + "loss": 0.2889, + "step": 5030 + }, + { + "epoch": 0.21, + "grad_norm": 0.921875, + "learning_rate": 0.0004999425960221378, + "loss": 0.2324, + "step": 5040 + }, + { + "epoch": 0.21, + "grad_norm": 0.6171875, + "learning_rate": 0.000499942363391081, + "loss": 0.2554, + "step": 5050 + }, + { + "epoch": 0.21, + "grad_norm": 0.62109375, + "learning_rate": 0.0004999421302896598, + "loss": 0.2961, + "step": 5060 + }, + { + "epoch": 0.21, + "grad_norm": 0.58984375, + "learning_rate": 0.0004999418967178744, + "loss": 0.2982, + "step": 5070 + }, + { + "epoch": 0.21, + "grad_norm": 1.078125, + "learning_rate": 0.0004999416626757253, + "loss": 0.2943, + "step": 5080 + }, + { + "epoch": 0.21, + "grad_norm": 0.79296875, + "learning_rate": 0.0004999414281632132, + "loss": 0.2682, + "step": 5090 + }, + { + "epoch": 0.21, + "grad_norm": 1.09375, + "learning_rate": 0.0004999411931803382, + "loss": 0.2447, + "step": 5100 + }, + { + "epoch": 0.21, + "grad_norm": 0.62109375, + "learning_rate": 0.0004999409577271009, + "loss": 0.2873, + "step": 5110 + }, + { + "epoch": 0.21, + "grad_norm": 1.1875, + "learning_rate": 0.0004999407218035017, + "loss": 0.3185, + "step": 5120 + }, + { + "epoch": 0.21, + "grad_norm": 0.6796875, + "learning_rate": 0.000499940485409541, + "loss": 0.2467, + "step": 5130 + }, + { + "epoch": 0.21, + "grad_norm": 0.48046875, + "learning_rate": 0.0004999402485452194, + "loss": 0.2611, + "step": 5140 + }, + { + "epoch": 0.21, + "grad_norm": 0.80859375, + "learning_rate": 0.0004999400112105371, + "loss": 0.203, + "step": 5150 + }, + { + "epoch": 0.21, + "grad_norm": 0.52734375, + "learning_rate": 0.0004999397734054948, + "loss": 0.3108, + "step": 5160 + }, + { + "epoch": 0.21, + "grad_norm": 1.8671875, + "learning_rate": 0.0004999395351300928, + "loss": 0.2525, + "step": 5170 + }, + { + "epoch": 0.21, + "grad_norm": 0.93359375, + "learning_rate": 0.0004999392963843316, + "loss": 0.233, + "step": 5180 + }, + { + "epoch": 0.21, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999390571682116, + "loss": 0.282, + "step": 5190 + }, + { + "epoch": 0.22, + "grad_norm": 1.1171875, + "learning_rate": 0.0004999388174817334, + "loss": 0.3435, + "step": 5200 + }, + { + "epoch": 0.22, + "grad_norm": 0.33984375, + "learning_rate": 0.0004999385773248971, + "loss": 0.3237, + "step": 5210 + }, + { + "epoch": 0.22, + "grad_norm": 2.4375, + "learning_rate": 0.0004999383366977036, + "loss": 0.2247, + "step": 5220 + }, + { + "epoch": 0.22, + "grad_norm": 1.09375, + "learning_rate": 0.000499938095600153, + "loss": 0.2347, + "step": 5230 + }, + { + "epoch": 0.22, + "grad_norm": 0.96484375, + "learning_rate": 0.000499937854032246, + "loss": 0.2529, + "step": 5240 + }, + { + "epoch": 0.22, + "grad_norm": 0.78515625, + "learning_rate": 0.0004999376119939829, + "loss": 0.2809, + "step": 5250 + }, + { + "epoch": 0.22, + "grad_norm": 0.9140625, + "learning_rate": 0.0004999373694853643, + "loss": 0.349, + "step": 5260 + }, + { + "epoch": 0.22, + "grad_norm": 0.55859375, + "learning_rate": 0.0004999371265063904, + "loss": 0.2144, + "step": 5270 + }, + { + "epoch": 0.22, + "grad_norm": 0.0, + "learning_rate": 0.000499936883057062, + "loss": 0.2463, + "step": 5280 + }, + { + "epoch": 0.22, + "grad_norm": 0.3984375, + "learning_rate": 0.0004999366391373793, + "loss": 0.213, + "step": 5290 + }, + { + "epoch": 0.22, + "grad_norm": 0.51171875, + "learning_rate": 0.0004999363947473428, + "loss": 0.2825, + "step": 5300 + }, + { + "epoch": 0.22, + "grad_norm": 0.9140625, + "learning_rate": 0.000499936149886953, + "loss": 0.2826, + "step": 5310 + }, + { + "epoch": 0.22, + "grad_norm": 0.61328125, + "learning_rate": 0.0004999359045562104, + "loss": 0.2517, + "step": 5320 + }, + { + "epoch": 0.22, + "grad_norm": 0.56640625, + "learning_rate": 0.0004999356587551154, + "loss": 0.2937, + "step": 5330 + }, + { + "epoch": 0.22, + "grad_norm": 0.287109375, + "learning_rate": 0.0004999354124836684, + "loss": 0.2733, + "step": 5340 + }, + { + "epoch": 0.22, + "grad_norm": 1.2109375, + "learning_rate": 0.0004999351657418701, + "loss": 0.3671, + "step": 5350 + }, + { + "epoch": 0.22, + "grad_norm": 0.7265625, + "learning_rate": 0.0004999349185297207, + "loss": 0.3006, + "step": 5360 + }, + { + "epoch": 0.22, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999346708472208, + "loss": 0.2936, + "step": 5370 + }, + { + "epoch": 0.22, + "grad_norm": 0.58203125, + "learning_rate": 0.0004999344226943708, + "loss": 0.3282, + "step": 5380 + }, + { + "epoch": 0.22, + "grad_norm": 0.6328125, + "learning_rate": 0.0004999341740711713, + "loss": 0.233, + "step": 5390 + }, + { + "epoch": 0.22, + "grad_norm": 1.1796875, + "learning_rate": 0.0004999339249776225, + "loss": 0.2972, + "step": 5400 + }, + { + "epoch": 0.22, + "grad_norm": 1.484375, + "learning_rate": 0.0004999336754137252, + "loss": 0.2508, + "step": 5410 + }, + { + "epoch": 0.22, + "grad_norm": 0.478515625, + "learning_rate": 0.0004999334253794797, + "loss": 0.3503, + "step": 5420 + }, + { + "epoch": 0.22, + "grad_norm": 0.275390625, + "learning_rate": 0.0004999331748748864, + "loss": 0.2447, + "step": 5430 + }, + { + "epoch": 0.23, + "grad_norm": 1.25, + "learning_rate": 0.0004999329238999459, + "loss": 0.2452, + "step": 5440 + }, + { + "epoch": 0.23, + "grad_norm": 0.59765625, + "learning_rate": 0.0004999326724546587, + "loss": 0.2644, + "step": 5450 + }, + { + "epoch": 0.23, + "grad_norm": 0.8125, + "learning_rate": 0.0004999324205390252, + "loss": 0.3061, + "step": 5460 + }, + { + "epoch": 0.23, + "grad_norm": 0.75, + "learning_rate": 0.0004999321681530458, + "loss": 0.257, + "step": 5470 + }, + { + "epoch": 0.23, + "grad_norm": 0.56640625, + "learning_rate": 0.000499931915296721, + "loss": 0.267, + "step": 5480 + }, + { + "epoch": 0.23, + "grad_norm": 1.3984375, + "learning_rate": 0.0004999316619700515, + "loss": 0.252, + "step": 5490 + }, + { + "epoch": 0.23, + "grad_norm": 1.4453125, + "learning_rate": 0.0004999314081730374, + "loss": 0.2536, + "step": 5500 + }, + { + "epoch": 0.23, + "grad_norm": 1.1328125, + "learning_rate": 0.0004999311539056796, + "loss": 0.2684, + "step": 5510 + }, + { + "epoch": 0.23, + "grad_norm": 0.1884765625, + "learning_rate": 0.0004999308991679781, + "loss": 0.2814, + "step": 5520 + }, + { + "epoch": 0.23, + "grad_norm": 0.65625, + "learning_rate": 0.0004999306439599338, + "loss": 0.2445, + "step": 5530 + }, + { + "epoch": 0.23, + "grad_norm": 0.69140625, + "learning_rate": 0.000499930388281547, + "loss": 0.3406, + "step": 5540 + }, + { + "epoch": 0.23, + "grad_norm": 2.84375, + "learning_rate": 0.0004999301321328182, + "loss": 0.2602, + "step": 5550 + }, + { + "epoch": 0.23, + "grad_norm": 1.75, + "learning_rate": 0.000499929875513748, + "loss": 0.2742, + "step": 5560 + }, + { + "epoch": 0.23, + "grad_norm": 0.6875, + "learning_rate": 0.0004999296184243365, + "loss": 0.2872, + "step": 5570 + }, + { + "epoch": 0.23, + "grad_norm": 0.76953125, + "learning_rate": 0.0004999293608645846, + "loss": 0.2783, + "step": 5580 + }, + { + "epoch": 0.23, + "grad_norm": 0.6875, + "learning_rate": 0.0004999291028344926, + "loss": 0.2612, + "step": 5590 + }, + { + "epoch": 0.23, + "grad_norm": 3.296875, + "learning_rate": 0.0004999288443340611, + "loss": 0.2853, + "step": 5600 + }, + { + "epoch": 0.23, + "grad_norm": 1.46875, + "learning_rate": 0.0004999285853632905, + "loss": 0.2347, + "step": 5610 + }, + { + "epoch": 0.23, + "grad_norm": 0.88671875, + "learning_rate": 0.0004999283259221811, + "loss": 0.2836, + "step": 5620 + }, + { + "epoch": 0.23, + "grad_norm": 1.0703125, + "learning_rate": 0.0004999280660107337, + "loss": 0.2657, + "step": 5630 + }, + { + "epoch": 0.23, + "grad_norm": 1.5234375, + "learning_rate": 0.0004999278056289487, + "loss": 0.2512, + "step": 5640 + }, + { + "epoch": 0.23, + "grad_norm": 4.0625, + "learning_rate": 0.0004999275447768266, + "loss": 0.2362, + "step": 5650 + }, + { + "epoch": 0.23, + "grad_norm": 0.4453125, + "learning_rate": 0.0004999272834543678, + "loss": 0.2655, + "step": 5660 + }, + { + "epoch": 0.23, + "grad_norm": 1.140625, + "learning_rate": 0.0004999270216615728, + "loss": 0.2737, + "step": 5670 + }, + { + "epoch": 0.24, + "grad_norm": 0.45703125, + "learning_rate": 0.0004999267593984422, + "loss": 0.2887, + "step": 5680 + }, + { + "epoch": 0.24, + "grad_norm": 0.38671875, + "learning_rate": 0.0004999264966649763, + "loss": 0.266, + "step": 5690 + }, + { + "epoch": 0.24, + "grad_norm": 1.8046875, + "learning_rate": 0.0004999262334611759, + "loss": 0.2254, + "step": 5700 + }, + { + "epoch": 0.24, + "grad_norm": 1.0234375, + "learning_rate": 0.0004999259697870413, + "loss": 0.3001, + "step": 5710 + }, + { + "epoch": 0.24, + "grad_norm": 0.345703125, + "learning_rate": 0.0004999257056425729, + "loss": 0.2493, + "step": 5720 + }, + { + "epoch": 0.24, + "grad_norm": 0.828125, + "learning_rate": 0.0004999254410277714, + "loss": 0.3177, + "step": 5730 + }, + { + "epoch": 0.24, + "grad_norm": 1.3125, + "learning_rate": 0.0004999251759426372, + "loss": 0.2835, + "step": 5740 + }, + { + "epoch": 0.24, + "grad_norm": 1.1328125, + "learning_rate": 0.0004999249103871707, + "loss": 0.199, + "step": 5750 + }, + { + "epoch": 0.24, + "grad_norm": 0.67578125, + "learning_rate": 0.0004999246443613726, + "loss": 0.3123, + "step": 5760 + }, + { + "epoch": 0.24, + "grad_norm": 0.5625, + "learning_rate": 0.0004999243778652433, + "loss": 0.2364, + "step": 5770 + }, + { + "epoch": 0.24, + "grad_norm": 0.5859375, + "learning_rate": 0.0004999241108987833, + "loss": 0.3185, + "step": 5780 + }, + { + "epoch": 0.24, + "grad_norm": 1.015625, + "learning_rate": 0.0004999238434619932, + "loss": 0.2743, + "step": 5790 + }, + { + "epoch": 0.24, + "grad_norm": 0.84375, + "learning_rate": 0.0004999235755548733, + "loss": 0.2819, + "step": 5800 + }, + { + "epoch": 0.24, + "grad_norm": 0.828125, + "learning_rate": 0.0004999233071774243, + "loss": 0.2655, + "step": 5810 + }, + { + "epoch": 0.24, + "grad_norm": 0.69921875, + "learning_rate": 0.0004999230383296466, + "loss": 0.2494, + "step": 5820 + }, + { + "epoch": 0.24, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999227690115407, + "loss": 0.2733, + "step": 5830 + }, + { + "epoch": 0.24, + "grad_norm": 1.359375, + "learning_rate": 0.0004999224992231072, + "loss": 0.275, + "step": 5840 + }, + { + "epoch": 0.24, + "grad_norm": 0.65234375, + "learning_rate": 0.0004999222289643465, + "loss": 0.2669, + "step": 5850 + }, + { + "epoch": 0.24, + "grad_norm": 0.81640625, + "learning_rate": 0.0004999219582352591, + "loss": 0.2788, + "step": 5860 + }, + { + "epoch": 0.24, + "grad_norm": 0.29296875, + "learning_rate": 0.0004999216870358456, + "loss": 0.2962, + "step": 5870 + }, + { + "epoch": 0.24, + "grad_norm": 0.96484375, + "learning_rate": 0.0004999214153661065, + "loss": 0.3491, + "step": 5880 + }, + { + "epoch": 0.24, + "grad_norm": 2.328125, + "learning_rate": 0.0004999211432260423, + "loss": 0.2553, + "step": 5890 + }, + { + "epoch": 0.24, + "grad_norm": 1.3359375, + "learning_rate": 0.0004999208706156535, + "loss": 0.2887, + "step": 5900 + }, + { + "epoch": 0.24, + "grad_norm": 0.85546875, + "learning_rate": 0.0004999205975349405, + "loss": 0.2612, + "step": 5910 + }, + { + "epoch": 0.25, + "grad_norm": 0.9375, + "learning_rate": 0.0004999203239839041, + "loss": 0.2811, + "step": 5920 + }, + { + "epoch": 0.25, + "grad_norm": 0.796875, + "learning_rate": 0.0004999200499625446, + "loss": 0.2453, + "step": 5930 + }, + { + "epoch": 0.25, + "grad_norm": 0.416015625, + "learning_rate": 0.0004999197754708625, + "loss": 0.2228, + "step": 5940 + }, + { + "epoch": 0.25, + "grad_norm": 0.46875, + "learning_rate": 0.0004999195005088584, + "loss": 0.2179, + "step": 5950 + }, + { + "epoch": 0.25, + "grad_norm": 0.83203125, + "learning_rate": 0.0004999192250765328, + "loss": 0.2533, + "step": 5960 + }, + { + "epoch": 0.25, + "grad_norm": 0.2490234375, + "learning_rate": 0.0004999189491738861, + "loss": 0.2556, + "step": 5970 + }, + { + "epoch": 0.25, + "grad_norm": 1.1328125, + "learning_rate": 0.000499918672800919, + "loss": 0.2838, + "step": 5980 + }, + { + "epoch": 0.25, + "grad_norm": 1.4140625, + "learning_rate": 0.0004999183959576319, + "loss": 0.3194, + "step": 5990 + }, + { + "epoch": 0.25, + "grad_norm": 0.8125, + "learning_rate": 0.0004999181186440255, + "loss": 0.2582, + "step": 6000 + }, + { + "epoch": 0.25, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999178408601001, + "loss": 0.2744, + "step": 6010 + }, + { + "epoch": 0.25, + "grad_norm": 0.0, + "learning_rate": 0.0004999175626058563, + "loss": 0.2508, + "step": 6020 + }, + { + "epoch": 0.25, + "grad_norm": 0.859375, + "learning_rate": 0.0004999172838812947, + "loss": 0.2944, + "step": 6030 + }, + { + "epoch": 0.25, + "grad_norm": 0.66015625, + "learning_rate": 0.0004999170046864156, + "loss": 0.2641, + "step": 6040 + }, + { + "epoch": 0.25, + "grad_norm": 0.71484375, + "learning_rate": 0.0004999167250212199, + "loss": 0.2694, + "step": 6050 + }, + { + "epoch": 0.25, + "grad_norm": 0.37890625, + "learning_rate": 0.0004999164448857078, + "loss": 0.2722, + "step": 6060 + }, + { + "epoch": 0.25, + "grad_norm": 0.97265625, + "learning_rate": 0.0004999161642798799, + "loss": 0.328, + "step": 6070 + }, + { + "epoch": 0.25, + "grad_norm": 0.3046875, + "learning_rate": 0.0004999158832037368, + "loss": 0.2708, + "step": 6080 + }, + { + "epoch": 0.25, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999156016572791, + "loss": 0.2589, + "step": 6090 + }, + { + "epoch": 0.25, + "grad_norm": 0.5625, + "learning_rate": 0.0004999153196405071, + "loss": 0.2697, + "step": 6100 + }, + { + "epoch": 0.25, + "grad_norm": 0.62109375, + "learning_rate": 0.0004999150371534215, + "loss": 0.2656, + "step": 6110 + }, + { + "epoch": 0.25, + "grad_norm": 0.234375, + "learning_rate": 0.0004999147541960228, + "loss": 0.2574, + "step": 6120 + }, + { + "epoch": 0.25, + "grad_norm": 0.47265625, + "learning_rate": 0.0004999144707683116, + "loss": 0.2504, + "step": 6130 + }, + { + "epoch": 0.25, + "grad_norm": 0.94140625, + "learning_rate": 0.0004999141868702882, + "loss": 0.283, + "step": 6140 + }, + { + "epoch": 0.25, + "grad_norm": 0.330078125, + "learning_rate": 0.0004999139025019533, + "loss": 0.2157, + "step": 6150 + }, + { + "epoch": 0.26, + "grad_norm": 2.453125, + "learning_rate": 0.0004999136176633075, + "loss": 0.1966, + "step": 6160 + }, + { + "epoch": 0.26, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999133323543512, + "loss": 0.2488, + "step": 6170 + }, + { + "epoch": 0.26, + "grad_norm": 0.55078125, + "learning_rate": 0.0004999130465750851, + "loss": 0.2782, + "step": 6180 + }, + { + "epoch": 0.26, + "grad_norm": 1.3046875, + "learning_rate": 0.0004999127603255095, + "loss": 0.249, + "step": 6190 + }, + { + "epoch": 0.26, + "grad_norm": 0.9453125, + "learning_rate": 0.0004999124736056252, + "loss": 0.2818, + "step": 6200 + }, + { + "epoch": 0.26, + "grad_norm": 0.9921875, + "learning_rate": 0.0004999121864154325, + "loss": 0.2484, + "step": 6210 + }, + { + "epoch": 0.26, + "grad_norm": 1.0546875, + "learning_rate": 0.0004999118987549321, + "loss": 0.3662, + "step": 6220 + }, + { + "epoch": 0.26, + "grad_norm": 0.94921875, + "learning_rate": 0.0004999116106241245, + "loss": 0.2629, + "step": 6230 + }, + { + "epoch": 0.26, + "grad_norm": 0.90234375, + "learning_rate": 0.0004999113220230103, + "loss": 0.296, + "step": 6240 + }, + { + "epoch": 0.26, + "grad_norm": 0.61328125, + "learning_rate": 0.0004999110329515899, + "loss": 0.2875, + "step": 6250 + }, + { + "epoch": 0.26, + "grad_norm": 0.69921875, + "learning_rate": 0.0004999107434098639, + "loss": 0.3191, + "step": 6260 + }, + { + "epoch": 0.26, + "grad_norm": 0.51953125, + "learning_rate": 0.000499910453397833, + "loss": 0.2137, + "step": 6270 + }, + { + "epoch": 0.26, + "grad_norm": 0.49609375, + "learning_rate": 0.0004999101629154975, + "loss": 0.3463, + "step": 6280 + }, + { + "epoch": 0.26, + "grad_norm": 0.96484375, + "learning_rate": 0.0004999098719628581, + "loss": 0.2234, + "step": 6290 + }, + { + "epoch": 0.26, + "grad_norm": 0.98046875, + "learning_rate": 0.0004999095805399153, + "loss": 0.3228, + "step": 6300 + }, + { + "epoch": 0.26, + "grad_norm": 0.341796875, + "learning_rate": 0.0004999092886466696, + "loss": 0.3047, + "step": 6310 + }, + { + "epoch": 0.26, + "grad_norm": 1.0859375, + "learning_rate": 0.0004999089962831217, + "loss": 0.256, + "step": 6320 + }, + { + "epoch": 0.26, + "grad_norm": 0.294921875, + "learning_rate": 0.000499908703449272, + "loss": 0.2991, + "step": 6330 + }, + { + "epoch": 0.26, + "grad_norm": 0.228515625, + "learning_rate": 0.0004999084101451211, + "loss": 0.256, + "step": 6340 + }, + { + "epoch": 0.26, + "grad_norm": 0.484375, + "learning_rate": 0.0004999081163706696, + "loss": 0.229, + "step": 6350 + }, + { + "epoch": 0.26, + "grad_norm": 0.88671875, + "learning_rate": 0.000499907822125918, + "loss": 0.2649, + "step": 6360 + }, + { + "epoch": 0.26, + "grad_norm": 0.578125, + "learning_rate": 0.0004999075274108669, + "loss": 0.2622, + "step": 6370 + }, + { + "epoch": 0.26, + "grad_norm": 0.67578125, + "learning_rate": 0.0004999072322255167, + "loss": 0.2987, + "step": 6380 + }, + { + "epoch": 0.26, + "grad_norm": 0.56640625, + "learning_rate": 0.0004999069365698681, + "loss": 0.2557, + "step": 6390 + }, + { + "epoch": 0.27, + "grad_norm": 0.48046875, + "learning_rate": 0.0004999066404439218, + "loss": 0.299, + "step": 6400 + }, + { + "epoch": 0.27, + "grad_norm": 0.58984375, + "learning_rate": 0.000499906343847678, + "loss": 0.2982, + "step": 6410 + }, + { + "epoch": 0.27, + "grad_norm": 1.84375, + "learning_rate": 0.0004999060467811376, + "loss": 0.2717, + "step": 6420 + }, + { + "epoch": 0.27, + "grad_norm": 0.72265625, + "learning_rate": 0.000499905749244301, + "loss": 0.2326, + "step": 6430 + }, + { + "epoch": 0.27, + "grad_norm": 0.56640625, + "learning_rate": 0.0004999054512371686, + "loss": 0.2836, + "step": 6440 + }, + { + "epoch": 0.27, + "grad_norm": 0.734375, + "learning_rate": 0.0004999051527597413, + "loss": 0.2804, + "step": 6450 + }, + { + "epoch": 0.27, + "grad_norm": 0.62109375, + "learning_rate": 0.0004999048538120195, + "loss": 0.2319, + "step": 6460 + }, + { + "epoch": 0.27, + "grad_norm": 1.2109375, + "learning_rate": 0.0004999045543940036, + "loss": 0.2958, + "step": 6470 + }, + { + "epoch": 0.27, + "grad_norm": 1.125, + "learning_rate": 0.0004999042545056944, + "loss": 0.3421, + "step": 6480 + }, + { + "epoch": 0.27, + "grad_norm": 0.76171875, + "learning_rate": 0.0004999039541470924, + "loss": 0.2333, + "step": 6490 + }, + { + "epoch": 0.27, + "grad_norm": 2.765625, + "learning_rate": 0.000499903653318198, + "loss": 0.2685, + "step": 6500 + }, + { + "epoch": 0.27, + "grad_norm": 0.2001953125, + "learning_rate": 0.0004999033520190121, + "loss": 0.2689, + "step": 6510 + }, + { + "epoch": 0.27, + "grad_norm": 0.6875, + "learning_rate": 0.0004999030502495351, + "loss": 0.3213, + "step": 6520 + }, + { + "epoch": 0.27, + "grad_norm": 0.4375, + "learning_rate": 0.0004999027480097674, + "loss": 0.2467, + "step": 6530 + }, + { + "epoch": 0.27, + "grad_norm": 1.90625, + "learning_rate": 0.0004999024452997097, + "loss": 0.327, + "step": 6540 + }, + { + "epoch": 0.27, + "grad_norm": 0.90625, + "learning_rate": 0.0004999021421193627, + "loss": 0.254, + "step": 6550 + }, + { + "epoch": 0.27, + "grad_norm": 0.58984375, + "learning_rate": 0.0004999018384687268, + "loss": 0.2201, + "step": 6560 + }, + { + "epoch": 0.27, + "grad_norm": 0.83984375, + "learning_rate": 0.0004999015343478027, + "loss": 0.3098, + "step": 6570 + }, + { + "epoch": 0.27, + "grad_norm": 0.95703125, + "learning_rate": 0.0004999012297565908, + "loss": 0.2631, + "step": 6580 + }, + { + "epoch": 0.27, + "grad_norm": 0.349609375, + "learning_rate": 0.0004999009246950918, + "loss": 0.2423, + "step": 6590 + }, + { + "epoch": 0.27, + "grad_norm": 0.6953125, + "learning_rate": 0.0004999006191633063, + "loss": 0.2571, + "step": 6600 + }, + { + "epoch": 0.27, + "grad_norm": 0.66796875, + "learning_rate": 0.0004999003131612347, + "loss": 0.2164, + "step": 6610 + }, + { + "epoch": 0.27, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999000066888779, + "loss": 0.2425, + "step": 6620 + }, + { + "epoch": 0.27, + "grad_norm": 0.318359375, + "learning_rate": 0.0004998996997462362, + "loss": 0.2572, + "step": 6630 + }, + { + "epoch": 0.28, + "grad_norm": 0.796875, + "learning_rate": 0.0004998993923333102, + "loss": 0.2865, + "step": 6640 + }, + { + "epoch": 0.28, + "grad_norm": 0.498046875, + "learning_rate": 0.0004998990844501005, + "loss": 0.2969, + "step": 6650 + }, + { + "epoch": 0.28, + "grad_norm": 0.443359375, + "learning_rate": 0.0004998987760966077, + "loss": 0.269, + "step": 6660 + }, + { + "epoch": 0.28, + "grad_norm": 0.640625, + "learning_rate": 0.0004998984672728324, + "loss": 0.2682, + "step": 6670 + }, + { + "epoch": 0.28, + "grad_norm": 0.671875, + "learning_rate": 0.0004998981579787753, + "loss": 0.2851, + "step": 6680 + }, + { + "epoch": 0.28, + "grad_norm": 0.62890625, + "learning_rate": 0.0004998978482144367, + "loss": 0.2411, + "step": 6690 + }, + { + "epoch": 0.28, + "grad_norm": 0.77734375, + "learning_rate": 0.0004998975379798174, + "loss": 0.2918, + "step": 6700 + }, + { + "epoch": 0.28, + "grad_norm": 0.431640625, + "learning_rate": 0.000499897227274918, + "loss": 0.2642, + "step": 6710 + }, + { + "epoch": 0.28, + "grad_norm": 0.310546875, + "learning_rate": 0.0004998969160997388, + "loss": 0.2663, + "step": 6720 + }, + { + "epoch": 0.28, + "grad_norm": 0.80078125, + "learning_rate": 0.0004998966044542808, + "loss": 0.2772, + "step": 6730 + }, + { + "epoch": 0.28, + "grad_norm": 0.6171875, + "learning_rate": 0.0004998962923385443, + "loss": 0.257, + "step": 6740 + }, + { + "epoch": 0.28, + "grad_norm": 1.75, + "learning_rate": 0.00049989597975253, + "loss": 0.328, + "step": 6750 + }, + { + "epoch": 0.28, + "grad_norm": 0.55859375, + "learning_rate": 0.0004998956666962383, + "loss": 0.2458, + "step": 6760 + }, + { + "epoch": 0.28, + "grad_norm": 0.74609375, + "learning_rate": 0.00049989535316967, + "loss": 0.2825, + "step": 6770 + }, + { + "epoch": 0.28, + "grad_norm": 0.60546875, + "learning_rate": 0.0004998950391728258, + "loss": 0.2644, + "step": 6780 + }, + { + "epoch": 0.28, + "grad_norm": 0.9765625, + "learning_rate": 0.000499894724705706, + "loss": 0.2367, + "step": 6790 + }, + { + "epoch": 0.28, + "grad_norm": 0.59375, + "learning_rate": 0.0004998944097683113, + "loss": 0.246, + "step": 6800 + }, + { + "epoch": 0.28, + "grad_norm": 0.65625, + "learning_rate": 0.0004998940943606422, + "loss": 0.3284, + "step": 6810 + }, + { + "epoch": 0.28, + "grad_norm": 0.4375, + "learning_rate": 0.0004998937784826996, + "loss": 0.2721, + "step": 6820 + }, + { + "epoch": 0.28, + "grad_norm": 0.48046875, + "learning_rate": 0.0004998934621344838, + "loss": 0.2482, + "step": 6830 + }, + { + "epoch": 0.28, + "grad_norm": 0.80078125, + "learning_rate": 0.0004998931453159955, + "loss": 0.2069, + "step": 6840 + }, + { + "epoch": 0.28, + "grad_norm": 1.296875, + "learning_rate": 0.0004998928280272354, + "loss": 0.2775, + "step": 6850 + }, + { + "epoch": 0.28, + "grad_norm": 0.75, + "learning_rate": 0.0004998925102682038, + "loss": 0.2783, + "step": 6860 + }, + { + "epoch": 0.28, + "grad_norm": 0.6640625, + "learning_rate": 0.0004998921920389015, + "loss": 0.3178, + "step": 6870 + }, + { + "epoch": 0.28, + "grad_norm": 1.28125, + "learning_rate": 0.0004998918733393293, + "loss": 0.2495, + "step": 6880 + }, + { + "epoch": 0.29, + "grad_norm": 0.6484375, + "learning_rate": 0.0004998915541694873, + "loss": 0.2152, + "step": 6890 + }, + { + "epoch": 0.29, + "grad_norm": 0.50390625, + "learning_rate": 0.0004998912345293765, + "loss": 0.2574, + "step": 6900 + }, + { + "epoch": 0.29, + "grad_norm": 0.68359375, + "learning_rate": 0.0004998909144189975, + "loss": 0.2611, + "step": 6910 + }, + { + "epoch": 0.29, + "grad_norm": 1.03125, + "learning_rate": 0.0004998905938383506, + "loss": 0.3197, + "step": 6920 + }, + { + "epoch": 0.29, + "grad_norm": 0.30859375, + "learning_rate": 0.0004998902727874367, + "loss": 0.2412, + "step": 6930 + }, + { + "epoch": 0.29, + "grad_norm": 0.6328125, + "learning_rate": 0.0004998899512662563, + "loss": 0.2402, + "step": 6940 + }, + { + "epoch": 0.29, + "grad_norm": 0.58203125, + "learning_rate": 0.0004998896292748099, + "loss": 0.2811, + "step": 6950 + }, + { + "epoch": 0.29, + "grad_norm": 1.1640625, + "learning_rate": 0.0004998893068130983, + "loss": 0.2634, + "step": 6960 + }, + { + "epoch": 0.29, + "grad_norm": 0.7421875, + "learning_rate": 0.000499888983881122, + "loss": 0.3311, + "step": 6970 + }, + { + "epoch": 0.29, + "grad_norm": 0.53125, + "learning_rate": 0.0004998886604788815, + "loss": 0.3394, + "step": 6980 + }, + { + "epoch": 0.29, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998883366063775, + "loss": 0.2298, + "step": 6990 + }, + { + "epoch": 0.29, + "grad_norm": 0.462890625, + "learning_rate": 0.0004998880122636108, + "loss": 0.2717, + "step": 7000 + }, + { + "epoch": 0.29, + "grad_norm": 0.63671875, + "learning_rate": 0.0004998876874505818, + "loss": 0.2882, + "step": 7010 + }, + { + "epoch": 0.29, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004998873621672911, + "loss": 0.2549, + "step": 7020 + }, + { + "epoch": 0.29, + "grad_norm": 0.5703125, + "learning_rate": 0.0004998870364137395, + "loss": 0.3172, + "step": 7030 + }, + { + "epoch": 0.29, + "grad_norm": 0.515625, + "learning_rate": 0.0004998867101899273, + "loss": 0.3427, + "step": 7040 + }, + { + "epoch": 0.29, + "grad_norm": 0.77734375, + "learning_rate": 0.0004998863834958555, + "loss": 0.2506, + "step": 7050 + }, + { + "epoch": 0.29, + "grad_norm": 0.6796875, + "learning_rate": 0.0004998860563315244, + "loss": 0.2623, + "step": 7060 + }, + { + "epoch": 0.29, + "grad_norm": 0.53125, + "learning_rate": 0.0004998857286969348, + "loss": 0.3157, + "step": 7070 + }, + { + "epoch": 0.29, + "grad_norm": 0.466796875, + "learning_rate": 0.0004998854005920871, + "loss": 0.2776, + "step": 7080 + }, + { + "epoch": 0.29, + "grad_norm": 0.279296875, + "learning_rate": 0.0004998850720169822, + "loss": 0.2527, + "step": 7090 + }, + { + "epoch": 0.29, + "grad_norm": 0.5234375, + "learning_rate": 0.0004998847429716205, + "loss": 0.2899, + "step": 7100 + }, + { + "epoch": 0.29, + "grad_norm": 0.447265625, + "learning_rate": 0.0004998844134560026, + "loss": 0.2246, + "step": 7110 + }, + { + "epoch": 0.29, + "grad_norm": 0.26171875, + "learning_rate": 0.0004998840834701294, + "loss": 0.2994, + "step": 7120 + }, + { + "epoch": 0.3, + "grad_norm": 1.2265625, + "learning_rate": 0.0004998837530140013, + "loss": 0.2738, + "step": 7130 + }, + { + "epoch": 0.3, + "grad_norm": 0.609375, + "learning_rate": 0.000499883422087619, + "loss": 0.2933, + "step": 7140 + }, + { + "epoch": 0.3, + "grad_norm": 1.1328125, + "learning_rate": 0.000499883090690983, + "loss": 0.2967, + "step": 7150 + }, + { + "epoch": 0.3, + "grad_norm": 0.6484375, + "learning_rate": 0.000499882758824094, + "loss": 0.2614, + "step": 7160 + }, + { + "epoch": 0.3, + "grad_norm": 0.6796875, + "learning_rate": 0.0004998824264869527, + "loss": 0.2934, + "step": 7170 + }, + { + "epoch": 0.3, + "grad_norm": 0.546875, + "learning_rate": 0.0004998820936795597, + "loss": 0.299, + "step": 7180 + }, + { + "epoch": 0.3, + "grad_norm": 1.1484375, + "learning_rate": 0.0004998817604019155, + "loss": 0.3003, + "step": 7190 + }, + { + "epoch": 0.3, + "grad_norm": 1.859375, + "learning_rate": 0.0004998814266540208, + "loss": 0.2196, + "step": 7200 + }, + { + "epoch": 0.3, + "grad_norm": 0.7421875, + "learning_rate": 0.0004998810924358762, + "loss": 0.3219, + "step": 7210 + }, + { + "epoch": 0.3, + "grad_norm": 0.77734375, + "learning_rate": 0.0004998807577474825, + "loss": 0.2483, + "step": 7220 + }, + { + "epoch": 0.3, + "grad_norm": 0.7109375, + "learning_rate": 0.0004998804225888401, + "loss": 0.2674, + "step": 7230 + }, + { + "epoch": 0.3, + "grad_norm": 0.6328125, + "learning_rate": 0.0004998800869599497, + "loss": 0.2998, + "step": 7240 + }, + { + "epoch": 0.3, + "grad_norm": 0.53515625, + "learning_rate": 0.0004998797508608121, + "loss": 0.2499, + "step": 7250 + }, + { + "epoch": 0.3, + "grad_norm": 0.34765625, + "learning_rate": 0.0004998794142914277, + "loss": 0.2512, + "step": 7260 + }, + { + "epoch": 0.3, + "grad_norm": 1.3046875, + "learning_rate": 0.0004998790772517972, + "loss": 0.2656, + "step": 7270 + }, + { + "epoch": 0.3, + "grad_norm": 0.36328125, + "learning_rate": 0.0004998787397419213, + "loss": 0.2781, + "step": 7280 + }, + { + "epoch": 0.3, + "grad_norm": 1.0390625, + "learning_rate": 0.0004998784017618006, + "loss": 0.2953, + "step": 7290 + }, + { + "epoch": 0.3, + "grad_norm": 0.36328125, + "learning_rate": 0.0004998780633114357, + "loss": 0.2848, + "step": 7300 + }, + { + "epoch": 0.3, + "grad_norm": 0.28515625, + "learning_rate": 0.0004998777243908273, + "loss": 0.209, + "step": 7310 + }, + { + "epoch": 0.3, + "grad_norm": 0.69140625, + "learning_rate": 0.000499877384999976, + "loss": 0.2633, + "step": 7320 + }, + { + "epoch": 0.3, + "grad_norm": 0.5546875, + "learning_rate": 0.0004998770451388825, + "loss": 0.2502, + "step": 7330 + }, + { + "epoch": 0.3, + "grad_norm": 2.15625, + "learning_rate": 0.0004998767048075473, + "loss": 0.2453, + "step": 7340 + }, + { + "epoch": 0.3, + "grad_norm": 1.2578125, + "learning_rate": 0.0004998763640059712, + "loss": 0.2631, + "step": 7350 + }, + { + "epoch": 0.3, + "grad_norm": 0.9609375, + "learning_rate": 0.0004998760227341547, + "loss": 0.1911, + "step": 7360 + }, + { + "epoch": 0.31, + "grad_norm": 2.375, + "learning_rate": 0.0004998756809920985, + "loss": 0.284, + "step": 7370 + }, + { + "epoch": 0.31, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998753387798034, + "loss": 0.2477, + "step": 7380 + }, + { + "epoch": 0.31, + "grad_norm": 0.5, + "learning_rate": 0.0004998749960972697, + "loss": 0.2838, + "step": 7390 + }, + { + "epoch": 0.31, + "grad_norm": 0.76953125, + "learning_rate": 0.0004998746529444984, + "loss": 0.2795, + "step": 7400 + }, + { + "epoch": 0.31, + "grad_norm": 0.48046875, + "learning_rate": 0.00049987430932149, + "loss": 0.256, + "step": 7410 + }, + { + "epoch": 0.31, + "grad_norm": 0.2421875, + "learning_rate": 0.000499873965228245, + "loss": 0.2568, + "step": 7420 + }, + { + "epoch": 0.31, + "grad_norm": 0.76953125, + "learning_rate": 0.0004998736206647642, + "loss": 0.2957, + "step": 7430 + }, + { + "epoch": 0.31, + "grad_norm": 0.90234375, + "learning_rate": 0.0004998732756310483, + "loss": 0.3161, + "step": 7440 + }, + { + "epoch": 0.31, + "grad_norm": 0.96875, + "learning_rate": 0.0004998729301270978, + "loss": 0.3257, + "step": 7450 + }, + { + "epoch": 0.31, + "grad_norm": 0.6171875, + "learning_rate": 0.0004998725841529135, + "loss": 0.2278, + "step": 7460 + }, + { + "epoch": 0.31, + "grad_norm": 0.80859375, + "learning_rate": 0.000499872237708496, + "loss": 0.3528, + "step": 7470 + }, + { + "epoch": 0.31, + "grad_norm": 1.1640625, + "learning_rate": 0.0004998718907938458, + "loss": 0.2537, + "step": 7480 + }, + { + "epoch": 0.31, + "grad_norm": 1.1796875, + "learning_rate": 0.0004998715434089638, + "loss": 0.2937, + "step": 7490 + }, + { + "epoch": 0.31, + "grad_norm": 0.484375, + "learning_rate": 0.0004998711955538505, + "loss": 0.2759, + "step": 7500 + }, + { + "epoch": 0.31, + "grad_norm": 0.86328125, + "learning_rate": 0.0004998708472285067, + "loss": 0.2315, + "step": 7510 + }, + { + "epoch": 0.31, + "grad_norm": 2.109375, + "learning_rate": 0.0004998704984329328, + "loss": 0.276, + "step": 7520 + }, + { + "epoch": 0.31, + "grad_norm": 0.6953125, + "learning_rate": 0.0004998701491671296, + "loss": 0.2863, + "step": 7530 + }, + { + "epoch": 0.31, + "grad_norm": 0.58203125, + "learning_rate": 0.0004998697994310979, + "loss": 0.3342, + "step": 7540 + }, + { + "epoch": 0.31, + "grad_norm": 0.69140625, + "learning_rate": 0.0004998694492248381, + "loss": 0.3033, + "step": 7550 + }, + { + "epoch": 0.31, + "grad_norm": 0.70703125, + "learning_rate": 0.000499869098548351, + "loss": 0.2557, + "step": 7560 + }, + { + "epoch": 0.31, + "grad_norm": 1.5078125, + "learning_rate": 0.0004998687474016373, + "loss": 0.2467, + "step": 7570 + }, + { + "epoch": 0.31, + "grad_norm": 0.859375, + "learning_rate": 0.0004998683957846975, + "loss": 0.2534, + "step": 7580 + }, + { + "epoch": 0.31, + "grad_norm": 0.9921875, + "learning_rate": 0.0004998680436975325, + "loss": 0.2629, + "step": 7590 + }, + { + "epoch": 0.31, + "grad_norm": 1.4765625, + "learning_rate": 0.0004998676911401427, + "loss": 0.2304, + "step": 7600 + }, + { + "epoch": 0.32, + "grad_norm": 0.78515625, + "learning_rate": 0.0004998673381125289, + "loss": 0.2324, + "step": 7610 + }, + { + "epoch": 0.32, + "grad_norm": 0.78125, + "learning_rate": 0.0004998669846146919, + "loss": 0.2593, + "step": 7620 + }, + { + "epoch": 0.32, + "grad_norm": 1.0625, + "learning_rate": 0.0004998666306466321, + "loss": 0.2533, + "step": 7630 + }, + { + "epoch": 0.32, + "grad_norm": 1.28125, + "learning_rate": 0.0004998662762083503, + "loss": 0.2852, + "step": 7640 + }, + { + "epoch": 0.32, + "grad_norm": 0.58203125, + "learning_rate": 0.0004998659212998471, + "loss": 0.2666, + "step": 7650 + }, + { + "epoch": 0.32, + "grad_norm": 1.328125, + "learning_rate": 0.0004998655659211233, + "loss": 0.2299, + "step": 7660 + }, + { + "epoch": 0.32, + "grad_norm": 0.458984375, + "learning_rate": 0.0004998652100721794, + "loss": 0.2339, + "step": 7670 + }, + { + "epoch": 0.32, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998648537530162, + "loss": 0.2222, + "step": 7680 + }, + { + "epoch": 0.32, + "grad_norm": 0.6171875, + "learning_rate": 0.0004998644969636343, + "loss": 0.28, + "step": 7690 + }, + { + "epoch": 0.32, + "grad_norm": 0.6015625, + "learning_rate": 0.0004998641397040345, + "loss": 0.2721, + "step": 7700 + }, + { + "epoch": 0.32, + "grad_norm": 0.1484375, + "learning_rate": 0.0004998637819742174, + "loss": 0.289, + "step": 7710 + }, + { + "epoch": 0.32, + "grad_norm": 0.48828125, + "learning_rate": 0.0004998634237741835, + "loss": 0.2741, + "step": 7720 + }, + { + "epoch": 0.32, + "grad_norm": 0.50390625, + "learning_rate": 0.0004998630651039337, + "loss": 0.271, + "step": 7730 + }, + { + "epoch": 0.32, + "grad_norm": 0.279296875, + "learning_rate": 0.0004998627059634686, + "loss": 0.2092, + "step": 7740 + }, + { + "epoch": 0.32, + "grad_norm": 3.140625, + "learning_rate": 0.0004998623463527888, + "loss": 0.2307, + "step": 7750 + }, + { + "epoch": 0.32, + "grad_norm": 0.828125, + "learning_rate": 0.0004998619862718951, + "loss": 0.2485, + "step": 7760 + }, + { + "epoch": 0.32, + "grad_norm": 0.453125, + "learning_rate": 0.0004998616257207881, + "loss": 0.3343, + "step": 7770 + }, + { + "epoch": 0.32, + "grad_norm": 0.6015625, + "learning_rate": 0.0004998612646994686, + "loss": 0.2756, + "step": 7780 + }, + { + "epoch": 0.32, + "grad_norm": 0.671875, + "learning_rate": 0.000499860903207937, + "loss": 0.179, + "step": 7790 + }, + { + "epoch": 0.32, + "grad_norm": 0.26171875, + "learning_rate": 0.0004998605412461943, + "loss": 0.2581, + "step": 7800 + }, + { + "epoch": 0.32, + "grad_norm": 0.6171875, + "learning_rate": 0.0004998601788142411, + "loss": 0.284, + "step": 7810 + }, + { + "epoch": 0.32, + "grad_norm": 0.94921875, + "learning_rate": 0.0004998598159120779, + "loss": 0.2596, + "step": 7820 + }, + { + "epoch": 0.32, + "grad_norm": 1.3359375, + "learning_rate": 0.0004998594525397054, + "loss": 0.2653, + "step": 7830 + }, + { + "epoch": 0.32, + "grad_norm": 1.0703125, + "learning_rate": 0.0004998590886971246, + "loss": 0.2733, + "step": 7840 + }, + { + "epoch": 0.33, + "grad_norm": 0.7578125, + "learning_rate": 0.000499858724384336, + "loss": 0.2572, + "step": 7850 + }, + { + "epoch": 0.33, + "grad_norm": 0.61328125, + "learning_rate": 0.0004998583596013401, + "loss": 0.2722, + "step": 7860 + }, + { + "epoch": 0.33, + "grad_norm": 0.96484375, + "learning_rate": 0.0004998579943481378, + "loss": 0.3304, + "step": 7870 + }, + { + "epoch": 0.33, + "grad_norm": 1.7265625, + "learning_rate": 0.0004998576286247298, + "loss": 0.253, + "step": 7880 + }, + { + "epoch": 0.33, + "grad_norm": 1.7109375, + "learning_rate": 0.0004998572624311167, + "loss": 0.2158, + "step": 7890 + }, + { + "epoch": 0.33, + "grad_norm": 0.58203125, + "learning_rate": 0.0004998568957672992, + "loss": 0.2663, + "step": 7900 + }, + { + "epoch": 0.33, + "grad_norm": 0.46484375, + "learning_rate": 0.000499856528633278, + "loss": 0.2257, + "step": 7910 + }, + { + "epoch": 0.33, + "grad_norm": 0.859375, + "learning_rate": 0.0004998561610290538, + "loss": 0.2291, + "step": 7920 + }, + { + "epoch": 0.33, + "grad_norm": 1.078125, + "learning_rate": 0.0004998557929546272, + "loss": 0.2809, + "step": 7930 + }, + { + "epoch": 0.33, + "grad_norm": 0.50390625, + "learning_rate": 0.0004998554244099991, + "loss": 0.2368, + "step": 7940 + }, + { + "epoch": 0.33, + "grad_norm": 0.470703125, + "learning_rate": 0.0004998550553951701, + "loss": 0.2638, + "step": 7950 + }, + { + "epoch": 0.33, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998546859101408, + "loss": 0.2655, + "step": 7960 + }, + { + "epoch": 0.33, + "grad_norm": 0.9296875, + "learning_rate": 0.000499854315954912, + "loss": 0.2455, + "step": 7970 + }, + { + "epoch": 0.33, + "grad_norm": 0.7109375, + "learning_rate": 0.0004998539455294842, + "loss": 0.2501, + "step": 7980 + }, + { + "epoch": 0.33, + "grad_norm": 2.359375, + "learning_rate": 0.0004998535746338585, + "loss": 0.2749, + "step": 7990 + }, + { + "epoch": 0.33, + "grad_norm": 0.734375, + "learning_rate": 0.0004998532032680352, + "loss": 0.2603, + "step": 8000 + }, + { + "epoch": 0.33, + "grad_norm": 0.330078125, + "learning_rate": 0.0004998528314320152, + "loss": 0.2983, + "step": 8010 + }, + { + "epoch": 0.33, + "grad_norm": 0.57421875, + "learning_rate": 0.0004998524591257991, + "loss": 0.2774, + "step": 8020 + }, + { + "epoch": 0.33, + "grad_norm": 0.6796875, + "learning_rate": 0.0004998520863493878, + "loss": 0.2965, + "step": 8030 + }, + { + "epoch": 0.33, + "grad_norm": 0.6796875, + "learning_rate": 0.0004998517131027817, + "loss": 0.2433, + "step": 8040 + }, + { + "epoch": 0.33, + "grad_norm": 0.5, + "learning_rate": 0.0004998513393859817, + "loss": 0.2153, + "step": 8050 + }, + { + "epoch": 0.33, + "grad_norm": 0.83203125, + "learning_rate": 0.0004998509651989886, + "loss": 0.3167, + "step": 8060 + }, + { + "epoch": 0.33, + "grad_norm": 0.8984375, + "learning_rate": 0.0004998505905418028, + "loss": 0.2944, + "step": 8070 + }, + { + "epoch": 0.33, + "grad_norm": 1.625, + "learning_rate": 0.0004998502154144252, + "loss": 0.2473, + "step": 8080 + }, + { + "epoch": 0.34, + "grad_norm": 0.984375, + "learning_rate": 0.0004998498398168565, + "loss": 0.2825, + "step": 8090 + }, + { + "epoch": 0.34, + "grad_norm": 1.078125, + "learning_rate": 0.0004998494637490973, + "loss": 0.3617, + "step": 8100 + }, + { + "epoch": 0.34, + "grad_norm": 0.189453125, + "learning_rate": 0.0004998490872111485, + "loss": 0.2453, + "step": 8110 + }, + { + "epoch": 0.34, + "grad_norm": 0.91015625, + "learning_rate": 0.0004998487102030107, + "loss": 0.272, + "step": 8120 + }, + { + "epoch": 0.34, + "grad_norm": 0.71875, + "learning_rate": 0.0004998483327246845, + "loss": 0.2994, + "step": 8130 + }, + { + "epoch": 0.34, + "grad_norm": 0.267578125, + "learning_rate": 0.0004998479547761709, + "loss": 0.2359, + "step": 8140 + }, + { + "epoch": 0.34, + "grad_norm": 0.43359375, + "learning_rate": 0.0004998475763574702, + "loss": 0.2469, + "step": 8150 + }, + { + "epoch": 0.34, + "grad_norm": 0.72265625, + "learning_rate": 0.0004998471974685835, + "loss": 0.2994, + "step": 8160 + }, + { + "epoch": 0.34, + "grad_norm": 0.68359375, + "learning_rate": 0.0004998468181095113, + "loss": 0.3199, + "step": 8170 + }, + { + "epoch": 0.34, + "grad_norm": 3.28125, + "learning_rate": 0.0004998464382802544, + "loss": 0.2537, + "step": 8180 + }, + { + "epoch": 0.34, + "grad_norm": 0.671875, + "learning_rate": 0.0004998460579808135, + "loss": 0.2669, + "step": 8190 + }, + { + "epoch": 0.34, + "grad_norm": 1.2265625, + "learning_rate": 0.0004998456772111892, + "loss": 0.2626, + "step": 8200 + }, + { + "epoch": 0.34, + "grad_norm": 1.3515625, + "learning_rate": 0.0004998452959713824, + "loss": 0.2541, + "step": 8210 + }, + { + "epoch": 0.34, + "grad_norm": 0.53125, + "learning_rate": 0.0004998449142613937, + "loss": 0.3303, + "step": 8220 + }, + { + "epoch": 0.34, + "grad_norm": 0.6328125, + "learning_rate": 0.000499844532081224, + "loss": 0.25, + "step": 8230 + }, + { + "epoch": 0.34, + "grad_norm": 0.58984375, + "learning_rate": 0.0004998441494308736, + "loss": 0.2069, + "step": 8240 + }, + { + "epoch": 0.34, + "grad_norm": 0.6328125, + "learning_rate": 0.0004998437663103437, + "loss": 0.2604, + "step": 8250 + }, + { + "epoch": 0.34, + "grad_norm": 0.57421875, + "learning_rate": 0.0004998433827196347, + "loss": 0.3084, + "step": 8260 + }, + { + "epoch": 0.34, + "grad_norm": 0.99609375, + "learning_rate": 0.0004998429986587475, + "loss": 0.2483, + "step": 8270 + }, + { + "epoch": 0.34, + "grad_norm": 0.287109375, + "learning_rate": 0.0004998426141276828, + "loss": 0.2874, + "step": 8280 + }, + { + "epoch": 0.34, + "grad_norm": 0.8515625, + "learning_rate": 0.0004998422291264411, + "loss": 0.2739, + "step": 8290 + }, + { + "epoch": 0.34, + "grad_norm": 0.46484375, + "learning_rate": 0.0004998418436550234, + "loss": 0.2946, + "step": 8300 + }, + { + "epoch": 0.34, + "grad_norm": 0.75, + "learning_rate": 0.0004998414577134305, + "loss": 0.2911, + "step": 8310 + }, + { + "epoch": 0.34, + "grad_norm": 0.546875, + "learning_rate": 0.0004998410713016628, + "loss": 0.2316, + "step": 8320 + }, + { + "epoch": 0.35, + "grad_norm": 1.5078125, + "learning_rate": 0.0004998406844197212, + "loss": 0.2325, + "step": 8330 + }, + { + "epoch": 0.35, + "grad_norm": 0.5859375, + "learning_rate": 0.0004998402970676064, + "loss": 0.2424, + "step": 8340 + }, + { + "epoch": 0.35, + "grad_norm": 1.03125, + "learning_rate": 0.0004998399092453191, + "loss": 0.2746, + "step": 8350 + }, + { + "epoch": 0.35, + "grad_norm": 1.8046875, + "learning_rate": 0.0004998395209528601, + "loss": 0.2648, + "step": 8360 + }, + { + "epoch": 0.35, + "grad_norm": 0.5546875, + "learning_rate": 0.0004998391321902301, + "loss": 0.2891, + "step": 8370 + }, + { + "epoch": 0.35, + "grad_norm": 0.7109375, + "learning_rate": 0.0004998387429574299, + "loss": 0.295, + "step": 8380 + }, + { + "epoch": 0.35, + "grad_norm": 0.287109375, + "learning_rate": 0.0004998383532544601, + "loss": 0.3096, + "step": 8390 + }, + { + "epoch": 0.35, + "grad_norm": 0.828125, + "learning_rate": 0.0004998379630813216, + "loss": 0.2383, + "step": 8400 + }, + { + "epoch": 0.35, + "grad_norm": 1.2265625, + "learning_rate": 0.000499837572438015, + "loss": 0.2351, + "step": 8410 + }, + { + "epoch": 0.35, + "grad_norm": 0.58984375, + "learning_rate": 0.0004998371813245409, + "loss": 0.239, + "step": 8420 + }, + { + "epoch": 0.35, + "grad_norm": 0.498046875, + "learning_rate": 0.0004998367897409004, + "loss": 0.2674, + "step": 8430 + }, + { + "epoch": 0.35, + "grad_norm": 1.859375, + "learning_rate": 0.000499836397687094, + "loss": 0.3296, + "step": 8440 + }, + { + "epoch": 0.35, + "grad_norm": 0.96484375, + "learning_rate": 0.0004998360051631225, + "loss": 0.2563, + "step": 8450 + }, + { + "epoch": 0.35, + "grad_norm": 1.1953125, + "learning_rate": 0.0004998356121689865, + "loss": 0.2205, + "step": 8460 + }, + { + "epoch": 0.35, + "grad_norm": 2.15625, + "learning_rate": 0.000499835218704687, + "loss": 0.2527, + "step": 8470 + }, + { + "epoch": 0.35, + "grad_norm": 1.875, + "learning_rate": 0.0004998348247702244, + "loss": 0.2221, + "step": 8480 + }, + { + "epoch": 0.35, + "grad_norm": 0.97265625, + "learning_rate": 0.0004998344303655998, + "loss": 0.2245, + "step": 8490 + }, + { + "epoch": 0.35, + "grad_norm": 0.8828125, + "learning_rate": 0.0004998340354908137, + "loss": 0.2696, + "step": 8500 + }, + { + "epoch": 0.35, + "grad_norm": 0.52734375, + "learning_rate": 0.0004998336401458671, + "loss": 0.2723, + "step": 8510 + }, + { + "epoch": 0.35, + "grad_norm": 0.94921875, + "learning_rate": 0.0004998332443307604, + "loss": 0.2877, + "step": 8520 + }, + { + "epoch": 0.35, + "grad_norm": 0.58203125, + "learning_rate": 0.0004998328480454946, + "loss": 0.2524, + "step": 8530 + }, + { + "epoch": 0.35, + "grad_norm": 0.2890625, + "learning_rate": 0.0004998324512900703, + "loss": 0.2379, + "step": 8540 + }, + { + "epoch": 0.35, + "grad_norm": 0.89453125, + "learning_rate": 0.0004998320540644883, + "loss": 0.28, + "step": 8550 + }, + { + "epoch": 0.35, + "grad_norm": 0.8984375, + "learning_rate": 0.0004998316563687493, + "loss": 0.275, + "step": 8560 + }, + { + "epoch": 0.35, + "grad_norm": 0.26171875, + "learning_rate": 0.0004998312582028542, + "loss": 0.2379, + "step": 8570 + }, + { + "epoch": 0.36, + "grad_norm": 1.4453125, + "learning_rate": 0.0004998308595668036, + "loss": 0.1995, + "step": 8580 + }, + { + "epoch": 0.36, + "grad_norm": 0.68359375, + "learning_rate": 0.0004998304604605984, + "loss": 0.2182, + "step": 8590 + }, + { + "epoch": 0.36, + "grad_norm": 0.5625, + "learning_rate": 0.0004998300608842392, + "loss": 0.2395, + "step": 8600 + }, + { + "epoch": 0.36, + "grad_norm": 0.494140625, + "learning_rate": 0.0004998296608377267, + "loss": 0.2551, + "step": 8610 + }, + { + "epoch": 0.36, + "grad_norm": 0.65234375, + "learning_rate": 0.0004998292603210619, + "loss": 0.2501, + "step": 8620 + }, + { + "epoch": 0.36, + "grad_norm": 0.95703125, + "learning_rate": 0.0004998288593342454, + "loss": 0.1997, + "step": 8630 + }, + { + "epoch": 0.36, + "grad_norm": 0.5625, + "learning_rate": 0.0004998284578772779, + "loss": 0.2105, + "step": 8640 + }, + { + "epoch": 0.36, + "grad_norm": 0.208984375, + "learning_rate": 0.0004998280559501602, + "loss": 0.2675, + "step": 8650 + }, + { + "epoch": 0.36, + "grad_norm": 0.71484375, + "learning_rate": 0.0004998276535528931, + "loss": 0.2465, + "step": 8660 + }, + { + "epoch": 0.36, + "grad_norm": 0.58984375, + "learning_rate": 0.0004998272506854774, + "loss": 0.3133, + "step": 8670 + }, + { + "epoch": 0.36, + "grad_norm": 0.314453125, + "learning_rate": 0.0004998268473479137, + "loss": 0.2646, + "step": 8680 + }, + { + "epoch": 0.36, + "grad_norm": 1.78125, + "learning_rate": 0.0004998264435402029, + "loss": 0.2282, + "step": 8690 + }, + { + "epoch": 0.36, + "grad_norm": 1.4609375, + "learning_rate": 0.0004998260392623459, + "loss": 0.3514, + "step": 8700 + }, + { + "epoch": 0.36, + "grad_norm": 0.34375, + "learning_rate": 0.0004998256345143429, + "loss": 0.2323, + "step": 8710 + }, + { + "epoch": 0.36, + "grad_norm": 0.375, + "learning_rate": 0.0004998252292961953, + "loss": 0.2963, + "step": 8720 + }, + { + "epoch": 0.36, + "grad_norm": 0.6171875, + "learning_rate": 0.0004998248236079035, + "loss": 0.2901, + "step": 8730 + }, + { + "epoch": 0.36, + "grad_norm": 0.318359375, + "learning_rate": 0.0004998244174494684, + "loss": 0.243, + "step": 8740 + }, + { + "epoch": 0.36, + "grad_norm": 1.0390625, + "learning_rate": 0.0004998240108208907, + "loss": 0.1882, + "step": 8750 + }, + { + "epoch": 0.36, + "grad_norm": 0.78125, + "learning_rate": 0.0004998236037221711, + "loss": 0.2243, + "step": 8760 + }, + { + "epoch": 0.36, + "grad_norm": 1.8828125, + "learning_rate": 0.0004998231961533107, + "loss": 0.289, + "step": 8770 + }, + { + "epoch": 0.36, + "grad_norm": 0.6875, + "learning_rate": 0.0004998227881143098, + "loss": 0.1817, + "step": 8780 + }, + { + "epoch": 0.36, + "grad_norm": 0.71875, + "learning_rate": 0.0004998223796051695, + "loss": 0.2264, + "step": 8790 + }, + { + "epoch": 0.36, + "grad_norm": 1.4765625, + "learning_rate": 0.0004998219706258904, + "loss": 0.2128, + "step": 8800 + }, + { + "epoch": 0.36, + "grad_norm": 0.68359375, + "learning_rate": 0.0004998215611764734, + "loss": 0.3047, + "step": 8810 + }, + { + "epoch": 0.37, + "grad_norm": 0.8046875, + "learning_rate": 0.0004998211512569191, + "loss": 0.2505, + "step": 8820 + }, + { + "epoch": 0.37, + "grad_norm": 0.74609375, + "learning_rate": 0.0004998207408672285, + "loss": 0.2399, + "step": 8830 + }, + { + "epoch": 0.37, + "grad_norm": 1.78125, + "learning_rate": 0.0004998203300074022, + "loss": 0.3077, + "step": 8840 + }, + { + "epoch": 0.37, + "grad_norm": 0.310546875, + "learning_rate": 0.000499819918677441, + "loss": 0.2372, + "step": 8850 + }, + { + "epoch": 0.37, + "grad_norm": 0.458984375, + "learning_rate": 0.0004998195068773456, + "loss": 0.2643, + "step": 8860 + }, + { + "epoch": 0.37, + "grad_norm": 0.69921875, + "learning_rate": 0.0004998190946071169, + "loss": 0.2693, + "step": 8870 + }, + { + "epoch": 0.37, + "grad_norm": 0.427734375, + "learning_rate": 0.0004998186818667557, + "loss": 0.2794, + "step": 8880 + }, + { + "epoch": 0.37, + "grad_norm": 0.484375, + "learning_rate": 0.0004998182686562628, + "loss": 0.2509, + "step": 8890 + }, + { + "epoch": 0.37, + "grad_norm": 0.486328125, + "learning_rate": 0.0004998178549756387, + "loss": 0.3016, + "step": 8900 + }, + { + "epoch": 0.37, + "grad_norm": 0.328125, + "learning_rate": 0.0004998174408248846, + "loss": 0.2302, + "step": 8910 + }, + { + "epoch": 0.37, + "grad_norm": 0.5, + "learning_rate": 0.0004998170262040008, + "loss": 0.212, + "step": 8920 + }, + { + "epoch": 0.37, + "grad_norm": 0.765625, + "learning_rate": 0.0004998166111129885, + "loss": 0.2341, + "step": 8930 + }, + { + "epoch": 0.37, + "grad_norm": 0.88671875, + "learning_rate": 0.0004998161955518483, + "loss": 0.2323, + "step": 8940 + }, + { + "epoch": 0.37, + "grad_norm": 0.984375, + "learning_rate": 0.000499815779520581, + "loss": 0.2473, + "step": 8950 + }, + { + "epoch": 0.37, + "grad_norm": 0.298828125, + "learning_rate": 0.0004998153630191874, + "loss": 0.1603, + "step": 8960 + }, + { + "epoch": 0.37, + "grad_norm": 0.330078125, + "learning_rate": 0.0004998149460476682, + "loss": 0.1883, + "step": 8970 + }, + { + "epoch": 0.37, + "grad_norm": 0.828125, + "learning_rate": 0.0004998145286060243, + "loss": 0.223, + "step": 8980 + }, + { + "epoch": 0.37, + "grad_norm": 0.703125, + "learning_rate": 0.0004998141106942564, + "loss": 0.2721, + "step": 8990 + }, + { + "epoch": 0.37, + "grad_norm": 2.15625, + "learning_rate": 0.0004998136923123653, + "loss": 0.2824, + "step": 9000 + }, + { + "epoch": 0.37, + "grad_norm": 1.59375, + "learning_rate": 0.0004998132734603519, + "loss": 0.2442, + "step": 9010 + }, + { + "epoch": 0.37, + "grad_norm": 0.55859375, + "learning_rate": 0.0004998128541382168, + "loss": 0.2447, + "step": 9020 + }, + { + "epoch": 0.37, + "grad_norm": 0.490234375, + "learning_rate": 0.0004998124343459609, + "loss": 0.2571, + "step": 9030 + }, + { + "epoch": 0.37, + "grad_norm": 0.3046875, + "learning_rate": 0.000499812014083585, + "loss": 0.3239, + "step": 9040 + }, + { + "epoch": 0.37, + "grad_norm": 1.3984375, + "learning_rate": 0.0004998115933510899, + "loss": 0.2907, + "step": 9050 + }, + { + "epoch": 0.38, + "grad_norm": 0.0, + "learning_rate": 0.0004998111721484763, + "loss": 0.2353, + "step": 9060 + }, + { + "epoch": 0.38, + "grad_norm": 0.318359375, + "learning_rate": 0.0004998107504757451, + "loss": 0.2586, + "step": 9070 + }, + { + "epoch": 0.38, + "grad_norm": 0.74609375, + "learning_rate": 0.0004998103283328971, + "loss": 0.2965, + "step": 9080 + }, + { + "epoch": 0.38, + "grad_norm": 0.0, + "learning_rate": 0.0004998099057199329, + "loss": 0.2697, + "step": 9090 + }, + { + "epoch": 0.38, + "grad_norm": 0.8828125, + "learning_rate": 0.0004998094826368535, + "loss": 0.2726, + "step": 9100 + }, + { + "epoch": 0.38, + "grad_norm": 1.0859375, + "learning_rate": 0.0004998090590836596, + "loss": 0.2333, + "step": 9110 + }, + { + "epoch": 0.38, + "grad_norm": 0.359375, + "learning_rate": 0.0004998086350603521, + "loss": 0.2468, + "step": 9120 + }, + { + "epoch": 0.38, + "grad_norm": 0.6015625, + "learning_rate": 0.0004998082105669316, + "loss": 0.2026, + "step": 9130 + }, + { + "epoch": 0.38, + "grad_norm": 0.498046875, + "learning_rate": 0.0004998077856033991, + "loss": 0.2667, + "step": 9140 + }, + { + "epoch": 0.38, + "grad_norm": 0.37109375, + "learning_rate": 0.0004998073601697554, + "loss": 0.2211, + "step": 9150 + }, + { + "epoch": 0.38, + "grad_norm": 1.3515625, + "learning_rate": 0.0004998069342660011, + "loss": 0.2342, + "step": 9160 + }, + { + "epoch": 0.38, + "grad_norm": 0.53125, + "learning_rate": 0.0004998065078921372, + "loss": 0.3042, + "step": 9170 + }, + { + "epoch": 0.38, + "grad_norm": 0.419921875, + "learning_rate": 0.0004998060810481644, + "loss": 0.2268, + "step": 9180 + }, + { + "epoch": 0.38, + "grad_norm": 1.59375, + "learning_rate": 0.0004998056537340836, + "loss": 0.3005, + "step": 9190 + }, + { + "epoch": 0.38, + "grad_norm": 1.4140625, + "learning_rate": 0.0004998052259498954, + "loss": 0.2707, + "step": 9200 + }, + { + "epoch": 0.38, + "grad_norm": 1.421875, + "learning_rate": 0.0004998047976956008, + "loss": 0.2413, + "step": 9210 + }, + { + "epoch": 0.38, + "grad_norm": 0.6171875, + "learning_rate": 0.0004998043689712007, + "loss": 0.2952, + "step": 9220 + }, + { + "epoch": 0.38, + "grad_norm": 0.25, + "learning_rate": 0.0004998039397766955, + "loss": 0.2654, + "step": 9230 + }, + { + "epoch": 0.38, + "grad_norm": 0.8671875, + "learning_rate": 0.0004998035101120865, + "loss": 0.1688, + "step": 9240 + }, + { + "epoch": 0.38, + "grad_norm": 0.50390625, + "learning_rate": 0.0004998030799773741, + "loss": 0.274, + "step": 9250 + }, + { + "epoch": 0.38, + "grad_norm": 0.66796875, + "learning_rate": 0.0004998026493725593, + "loss": 0.2148, + "step": 9260 + }, + { + "epoch": 0.38, + "grad_norm": 0.57421875, + "learning_rate": 0.0004998022182976429, + "loss": 0.2488, + "step": 9270 + }, + { + "epoch": 0.38, + "grad_norm": 1.015625, + "learning_rate": 0.0004998017867526257, + "loss": 0.2633, + "step": 9280 + }, + { + "epoch": 0.38, + "grad_norm": 0.80859375, + "learning_rate": 0.0004998013547375086, + "loss": 0.313, + "step": 9290 + }, + { + "epoch": 0.39, + "grad_norm": 0.6640625, + "learning_rate": 0.0004998009222522922, + "loss": 0.2938, + "step": 9300 + }, + { + "epoch": 0.39, + "grad_norm": 0.5078125, + "learning_rate": 0.0004998004892969776, + "loss": 0.2695, + "step": 9310 + }, + { + "epoch": 0.39, + "grad_norm": 0.62109375, + "learning_rate": 0.0004998000558715653, + "loss": 0.2811, + "step": 9320 + }, + { + "epoch": 0.39, + "grad_norm": 0.91015625, + "learning_rate": 0.0004997996219760564, + "loss": 0.2205, + "step": 9330 + }, + { + "epoch": 0.39, + "grad_norm": 1.234375, + "learning_rate": 0.0004997991876104515, + "loss": 0.2902, + "step": 9340 + }, + { + "epoch": 0.39, + "grad_norm": 2.3125, + "learning_rate": 0.0004997987527747515, + "loss": 0.2575, + "step": 9350 + }, + { + "epoch": 0.39, + "grad_norm": 1.2265625, + "learning_rate": 0.0004997983174689572, + "loss": 0.2652, + "step": 9360 + }, + { + "epoch": 0.39, + "grad_norm": 1.0234375, + "learning_rate": 0.0004997978816930695, + "loss": 0.2387, + "step": 9370 + }, + { + "epoch": 0.39, + "grad_norm": 0.921875, + "learning_rate": 0.0004997974454470891, + "loss": 0.2588, + "step": 9380 + }, + { + "epoch": 0.39, + "grad_norm": 0.67578125, + "learning_rate": 0.0004997970087310171, + "loss": 0.3239, + "step": 9390 + }, + { + "epoch": 0.39, + "grad_norm": 0.61328125, + "learning_rate": 0.0004997965715448539, + "loss": 0.2368, + "step": 9400 + }, + { + "epoch": 0.39, + "grad_norm": 0.50390625, + "learning_rate": 0.0004997961338886006, + "loss": 0.2535, + "step": 9410 + }, + { + "epoch": 0.39, + "grad_norm": 0.5234375, + "learning_rate": 0.0004997956957622578, + "loss": 0.291, + "step": 9420 + }, + { + "epoch": 0.39, + "grad_norm": 0.80859375, + "learning_rate": 0.0004997952571658266, + "loss": 0.2872, + "step": 9430 + }, + { + "epoch": 0.39, + "grad_norm": 0.82421875, + "learning_rate": 0.0004997948180993077, + "loss": 0.3157, + "step": 9440 + }, + { + "epoch": 0.39, + "grad_norm": 0.384765625, + "learning_rate": 0.000499794378562702, + "loss": 0.2278, + "step": 9450 + }, + { + "epoch": 0.39, + "grad_norm": 0.734375, + "learning_rate": 0.0004997939385560101, + "loss": 0.2595, + "step": 9460 + }, + { + "epoch": 0.39, + "grad_norm": 1.6875, + "learning_rate": 0.0004997934980792331, + "loss": 0.2786, + "step": 9470 + }, + { + "epoch": 0.39, + "grad_norm": 0.8125, + "learning_rate": 0.0004997930571323718, + "loss": 0.2605, + "step": 9480 + }, + { + "epoch": 0.39, + "grad_norm": 0.64453125, + "learning_rate": 0.0004997926157154268, + "loss": 0.2685, + "step": 9490 + }, + { + "epoch": 0.39, + "grad_norm": 0.5625, + "learning_rate": 0.000499792173828399, + "loss": 0.2057, + "step": 9500 + }, + { + "epoch": 0.39, + "grad_norm": 0.53125, + "learning_rate": 0.0004997917314712894, + "loss": 0.2852, + "step": 9510 + }, + { + "epoch": 0.39, + "grad_norm": 1.125, + "learning_rate": 0.0004997912886440987, + "loss": 0.204, + "step": 9520 + }, + { + "epoch": 0.39, + "grad_norm": 0.8046875, + "learning_rate": 0.0004997908453468279, + "loss": 0.2967, + "step": 9530 + }, + { + "epoch": 0.4, + "grad_norm": 0.984375, + "learning_rate": 0.0004997904015794775, + "loss": 0.3227, + "step": 9540 + }, + { + "epoch": 0.4, + "grad_norm": 0.478515625, + "learning_rate": 0.0004997899573420487, + "loss": 0.2689, + "step": 9550 + }, + { + "epoch": 0.4, + "grad_norm": 1.75, + "learning_rate": 0.0004997895126345421, + "loss": 0.2408, + "step": 9560 + }, + { + "epoch": 0.4, + "grad_norm": 0.4609375, + "learning_rate": 0.0004997890674569586, + "loss": 0.2939, + "step": 9570 + }, + { + "epoch": 0.4, + "grad_norm": 1.609375, + "learning_rate": 0.0004997886218092992, + "loss": 0.3197, + "step": 9580 + }, + { + "epoch": 0.4, + "grad_norm": 0.51171875, + "learning_rate": 0.0004997881756915644, + "loss": 0.2507, + "step": 9590 + }, + { + "epoch": 0.4, + "grad_norm": 0.65234375, + "learning_rate": 0.0004997877291037553, + "loss": 0.2775, + "step": 9600 + }, + { + "epoch": 0.4, + "grad_norm": 0.60546875, + "learning_rate": 0.0004997872820458727, + "loss": 0.237, + "step": 9610 + }, + { + "epoch": 0.4, + "grad_norm": 0.490234375, + "learning_rate": 0.0004997868345179173, + "loss": 0.2885, + "step": 9620 + }, + { + "epoch": 0.4, + "grad_norm": 0.5625, + "learning_rate": 0.0004997863865198902, + "loss": 0.236, + "step": 9630 + }, + { + "epoch": 0.4, + "grad_norm": 0.91015625, + "learning_rate": 0.000499785938051792, + "loss": 0.2855, + "step": 9640 + }, + { + "epoch": 0.4, + "grad_norm": 0.6875, + "learning_rate": 0.0004997854891136236, + "loss": 0.2697, + "step": 9650 + }, + { + "epoch": 0.4, + "grad_norm": 0.63671875, + "learning_rate": 0.000499785039705386, + "loss": 0.2297, + "step": 9660 + }, + { + "epoch": 0.4, + "grad_norm": 0.72265625, + "learning_rate": 0.0004997845898270798, + "loss": 0.2423, + "step": 9670 + }, + { + "epoch": 0.4, + "grad_norm": 0.54296875, + "learning_rate": 0.0004997841394787061, + "loss": 0.272, + "step": 9680 + }, + { + "epoch": 0.4, + "grad_norm": 0.70703125, + "learning_rate": 0.0004997836886602656, + "loss": 0.3032, + "step": 9690 + }, + { + "epoch": 0.4, + "grad_norm": 0.7421875, + "learning_rate": 0.0004997832373717591, + "loss": 0.258, + "step": 9700 + }, + { + "epoch": 0.4, + "grad_norm": 0.8359375, + "learning_rate": 0.0004997827856131876, + "loss": 0.3609, + "step": 9710 + }, + { + "epoch": 0.4, + "grad_norm": 0.42578125, + "learning_rate": 0.0004997823333845519, + "loss": 0.3004, + "step": 9720 + }, + { + "epoch": 0.4, + "grad_norm": 0.376953125, + "learning_rate": 0.0004997818806858527, + "loss": 0.2483, + "step": 9730 + }, + { + "epoch": 0.4, + "grad_norm": 0.98046875, + "learning_rate": 0.0004997814275170911, + "loss": 0.2509, + "step": 9740 + }, + { + "epoch": 0.4, + "grad_norm": 1.453125, + "learning_rate": 0.0004997809738782678, + "loss": 0.2446, + "step": 9750 + }, + { + "epoch": 0.4, + "grad_norm": 0.734375, + "learning_rate": 0.0004997805197693836, + "loss": 0.224, + "step": 9760 + }, + { + "epoch": 0.4, + "grad_norm": 0.80859375, + "learning_rate": 0.0004997800651904395, + "loss": 0.2761, + "step": 9770 + }, + { + "epoch": 0.41, + "grad_norm": 0.248046875, + "learning_rate": 0.0004997796101414363, + "loss": 0.2133, + "step": 9780 + }, + { + "epoch": 0.41, + "grad_norm": 2.59375, + "learning_rate": 0.0004997791546223748, + "loss": 0.2887, + "step": 9790 + }, + { + "epoch": 0.41, + "grad_norm": 0.4375, + "learning_rate": 0.0004997786986332559, + "loss": 0.2445, + "step": 9800 + }, + { + "epoch": 0.41, + "grad_norm": 0.97265625, + "learning_rate": 0.0004997782421740805, + "loss": 0.2669, + "step": 9810 + }, + { + "epoch": 0.41, + "grad_norm": 1.0546875, + "learning_rate": 0.0004997777852448494, + "loss": 0.2651, + "step": 9820 + }, + { + "epoch": 0.41, + "grad_norm": 0.62890625, + "learning_rate": 0.0004997773278455635, + "loss": 0.2704, + "step": 9830 + }, + { + "epoch": 0.41, + "grad_norm": 0.55859375, + "learning_rate": 0.0004997768699762236, + "loss": 0.2474, + "step": 9840 + }, + { + "epoch": 0.41, + "grad_norm": 0.84375, + "learning_rate": 0.0004997764116368307, + "loss": 0.3074, + "step": 9850 + }, + { + "epoch": 0.41, + "grad_norm": 1.46875, + "learning_rate": 0.0004997759528273855, + "loss": 0.2811, + "step": 9860 + }, + { + "epoch": 0.41, + "grad_norm": 0.81640625, + "learning_rate": 0.000499775493547889, + "loss": 0.2775, + "step": 9870 + }, + { + "epoch": 0.41, + "grad_norm": 0.90234375, + "learning_rate": 0.0004997750337983419, + "loss": 0.2756, + "step": 9880 + }, + { + "epoch": 0.41, + "grad_norm": 1.0390625, + "learning_rate": 0.0004997745735787452, + "loss": 0.2703, + "step": 9890 + }, + { + "epoch": 0.41, + "grad_norm": 0.310546875, + "learning_rate": 0.0004997741128890997, + "loss": 0.2818, + "step": 9900 + }, + { + "epoch": 0.41, + "grad_norm": 0.61328125, + "learning_rate": 0.0004997736517294064, + "loss": 0.1997, + "step": 9910 + }, + { + "epoch": 0.41, + "grad_norm": 0.83203125, + "learning_rate": 0.0004997731900996658, + "loss": 0.2691, + "step": 9920 + }, + { + "epoch": 0.41, + "grad_norm": 2.296875, + "learning_rate": 0.0004997727279998792, + "loss": 0.2514, + "step": 9930 + }, + { + "epoch": 0.41, + "grad_norm": 0.298828125, + "learning_rate": 0.0004997722654300474, + "loss": 0.1862, + "step": 9940 + }, + { + "epoch": 0.41, + "grad_norm": 0.5, + "learning_rate": 0.0004997718023901711, + "loss": 0.3287, + "step": 9950 + }, + { + "epoch": 0.41, + "grad_norm": 0.84765625, + "learning_rate": 0.0004997713388802512, + "loss": 0.2876, + "step": 9960 + }, + { + "epoch": 0.41, + "grad_norm": 0.330078125, + "learning_rate": 0.0004997708749002886, + "loss": 0.2067, + "step": 9970 + }, + { + "epoch": 0.41, + "grad_norm": 0.51171875, + "learning_rate": 0.0004997704104502842, + "loss": 0.2545, + "step": 9980 + }, + { + "epoch": 0.41, + "grad_norm": 0.5078125, + "learning_rate": 0.0004997699455302389, + "loss": 0.2143, + "step": 9990 + }, + { + "epoch": 0.41, + "grad_norm": 1.046875, + "learning_rate": 0.0004997694801401534, + "loss": 0.2766, + "step": 10000 + }, + { + "epoch": 0.41, + "grad_norm": 0.62890625, + "learning_rate": 0.0004997690142800288, + "loss": 0.2988, + "step": 10010 + }, + { + "epoch": 0.42, + "grad_norm": 1.234375, + "learning_rate": 0.0004997685479498661, + "loss": 0.2779, + "step": 10020 + }, + { + "epoch": 0.42, + "grad_norm": 1.1953125, + "learning_rate": 0.0004997680811496657, + "loss": 0.2556, + "step": 10030 + }, + { + "epoch": 0.42, + "grad_norm": 0.40625, + "learning_rate": 0.0004997676138794288, + "loss": 0.2489, + "step": 10040 + }, + { + "epoch": 0.42, + "grad_norm": 0.51171875, + "learning_rate": 0.0004997671461391561, + "loss": 0.2324, + "step": 10050 + }, + { + "epoch": 0.42, + "grad_norm": 0.298828125, + "learning_rate": 0.0004997666779288489, + "loss": 0.2369, + "step": 10060 + }, + { + "epoch": 0.42, + "grad_norm": 0.40625, + "learning_rate": 0.0004997662092485075, + "loss": 0.2779, + "step": 10070 + }, + { + "epoch": 0.42, + "grad_norm": 0.73828125, + "learning_rate": 0.0004997657400981333, + "loss": 0.2431, + "step": 10080 + }, + { + "epoch": 0.42, + "grad_norm": 0.65625, + "learning_rate": 0.0004997652704777268, + "loss": 0.2992, + "step": 10090 + }, + { + "epoch": 0.42, + "grad_norm": 0.90625, + "learning_rate": 0.0004997648003872891, + "loss": 0.285, + "step": 10100 + }, + { + "epoch": 0.42, + "grad_norm": 0.86328125, + "learning_rate": 0.000499764329826821, + "loss": 0.3023, + "step": 10110 + }, + { + "epoch": 0.42, + "grad_norm": 2.828125, + "learning_rate": 0.0004997638587963234, + "loss": 0.3089, + "step": 10120 + }, + { + "epoch": 0.42, + "grad_norm": 0.796875, + "learning_rate": 0.0004997633872957972, + "loss": 0.1711, + "step": 10130 + }, + { + "epoch": 0.42, + "grad_norm": 2.5625, + "learning_rate": 0.0004997629153252433, + "loss": 0.2699, + "step": 10140 + }, + { + "epoch": 0.42, + "grad_norm": 1.4765625, + "learning_rate": 0.0004997624428846625, + "loss": 0.2359, + "step": 10150 + }, + { + "epoch": 0.42, + "grad_norm": 0.91015625, + "learning_rate": 0.0004997619699740558, + "loss": 0.2554, + "step": 10160 + }, + { + "epoch": 0.42, + "grad_norm": 0.546875, + "learning_rate": 0.000499761496593424, + "loss": 0.293, + "step": 10170 + }, + { + "epoch": 0.42, + "grad_norm": 0.8359375, + "learning_rate": 0.000499761022742768, + "loss": 0.3406, + "step": 10180 + }, + { + "epoch": 0.42, + "grad_norm": 0.3984375, + "learning_rate": 0.0004997605484220888, + "loss": 0.2979, + "step": 10190 + }, + { + "epoch": 0.42, + "grad_norm": 0.73828125, + "learning_rate": 0.0004997600736313873, + "loss": 0.2138, + "step": 10200 + }, + { + "epoch": 0.42, + "grad_norm": 0.73046875, + "learning_rate": 0.0004997595983706642, + "loss": 0.2211, + "step": 10210 + }, + { + "epoch": 0.42, + "grad_norm": 0.47265625, + "learning_rate": 0.0004997591226399205, + "loss": 0.2854, + "step": 10220 + }, + { + "epoch": 0.42, + "grad_norm": 0.6171875, + "learning_rate": 0.0004997586464391572, + "loss": 0.2612, + "step": 10230 + }, + { + "epoch": 0.42, + "grad_norm": 0.703125, + "learning_rate": 0.0004997581697683749, + "loss": 0.2516, + "step": 10240 + }, + { + "epoch": 0.42, + "grad_norm": 0.859375, + "learning_rate": 0.000499757692627575, + "loss": 0.296, + "step": 10250 + }, + { + "epoch": 0.42, + "grad_norm": 0.71484375, + "learning_rate": 0.0004997572150167578, + "loss": 0.24, + "step": 10260 + }, + { + "epoch": 0.43, + "grad_norm": 0.326171875, + "learning_rate": 0.0004997567369359247, + "loss": 0.2024, + "step": 10270 + }, + { + "epoch": 0.43, + "grad_norm": 0.5234375, + "learning_rate": 0.0004997562583850763, + "loss": 0.2612, + "step": 10280 + }, + { + "epoch": 0.43, + "grad_norm": 0.310546875, + "learning_rate": 0.0004997557793642135, + "loss": 0.324, + "step": 10290 + }, + { + "epoch": 0.43, + "grad_norm": 0.9296875, + "learning_rate": 0.0004997552998733375, + "loss": 0.2599, + "step": 10300 + }, + { + "epoch": 0.43, + "grad_norm": 0.55078125, + "learning_rate": 0.0004997548199124488, + "loss": 0.2054, + "step": 10310 + }, + { + "epoch": 0.43, + "grad_norm": 0.72265625, + "learning_rate": 0.0004997543394815486, + "loss": 0.2677, + "step": 10320 + }, + { + "epoch": 0.43, + "grad_norm": 0.333984375, + "learning_rate": 0.0004997538585806377, + "loss": 0.2574, + "step": 10330 + }, + { + "epoch": 0.43, + "grad_norm": 0.353515625, + "learning_rate": 0.000499753377209717, + "loss": 0.2911, + "step": 10340 + }, + { + "epoch": 0.43, + "grad_norm": 0.388671875, + "learning_rate": 0.0004997528953687875, + "loss": 0.2893, + "step": 10350 + }, + { + "epoch": 0.43, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004997524130578499, + "loss": 0.2223, + "step": 10360 + }, + { + "epoch": 0.43, + "grad_norm": 0.609375, + "learning_rate": 0.0004997519302769053, + "loss": 0.2902, + "step": 10370 + }, + { + "epoch": 0.43, + "grad_norm": 0.55859375, + "learning_rate": 0.0004997514470259545, + "loss": 0.3172, + "step": 10380 + }, + { + "epoch": 0.43, + "grad_norm": 1.359375, + "learning_rate": 0.0004997509633049985, + "loss": 0.2653, + "step": 10390 + }, + { + "epoch": 0.43, + "grad_norm": 0.4921875, + "learning_rate": 0.000499750479114038, + "loss": 0.2742, + "step": 10400 + }, + { + "epoch": 0.43, + "grad_norm": 1.109375, + "learning_rate": 0.0004997499944530742, + "loss": 0.255, + "step": 10410 + }, + { + "epoch": 0.43, + "grad_norm": 0.470703125, + "learning_rate": 0.000499749509322108, + "loss": 0.2522, + "step": 10420 + }, + { + "epoch": 0.43, + "grad_norm": 0.72265625, + "learning_rate": 0.00049974902372114, + "loss": 0.2381, + "step": 10430 + }, + { + "epoch": 0.43, + "grad_norm": 0.734375, + "learning_rate": 0.0004997485376501714, + "loss": 0.217, + "step": 10440 + }, + { + "epoch": 0.43, + "grad_norm": 1.9453125, + "learning_rate": 0.000499748051109203, + "loss": 0.2247, + "step": 10450 + }, + { + "epoch": 0.43, + "grad_norm": 0.67578125, + "learning_rate": 0.0004997475640982357, + "loss": 0.2944, + "step": 10460 + }, + { + "epoch": 0.43, + "grad_norm": 0.8984375, + "learning_rate": 0.0004997470766172705, + "loss": 0.2701, + "step": 10470 + }, + { + "epoch": 0.43, + "grad_norm": 0.9765625, + "learning_rate": 0.0004997465886663082, + "loss": 0.2348, + "step": 10480 + }, + { + "epoch": 0.43, + "grad_norm": 1.2578125, + "learning_rate": 0.0004997461002453498, + "loss": 0.1593, + "step": 10490 + }, + { + "epoch": 0.43, + "grad_norm": 0.72265625, + "learning_rate": 0.0004997456113543964, + "loss": 0.2726, + "step": 10500 + }, + { + "epoch": 0.44, + "grad_norm": 1.34375, + "learning_rate": 0.0004997451219934486, + "loss": 0.2785, + "step": 10510 + }, + { + "epoch": 0.44, + "grad_norm": 0.6953125, + "learning_rate": 0.0004997446321625073, + "loss": 0.2246, + "step": 10520 + }, + { + "epoch": 0.44, + "grad_norm": 1.4296875, + "learning_rate": 0.0004997441418615738, + "loss": 0.2917, + "step": 10530 + }, + { + "epoch": 0.44, + "grad_norm": 1.34375, + "learning_rate": 0.0004997436510906487, + "loss": 0.2464, + "step": 10540 + }, + { + "epoch": 0.44, + "grad_norm": 0.82421875, + "learning_rate": 0.0004997431598497329, + "loss": 0.3073, + "step": 10550 + }, + { + "epoch": 0.44, + "grad_norm": 0.287109375, + "learning_rate": 0.0004997426681388276, + "loss": 0.2618, + "step": 10560 + }, + { + "epoch": 0.44, + "grad_norm": 1.28125, + "learning_rate": 0.0004997421759579336, + "loss": 0.3002, + "step": 10570 + }, + { + "epoch": 0.44, + "grad_norm": 0.7578125, + "learning_rate": 0.0004997416833070517, + "loss": 0.3009, + "step": 10580 + }, + { + "epoch": 0.44, + "grad_norm": 0.72265625, + "learning_rate": 0.0004997411901861829, + "loss": 0.2736, + "step": 10590 + }, + { + "epoch": 0.44, + "grad_norm": 0.357421875, + "learning_rate": 0.0004997406965953283, + "loss": 0.3414, + "step": 10600 + }, + { + "epoch": 0.44, + "grad_norm": 0.72265625, + "learning_rate": 0.0004997402025344886, + "loss": 0.2631, + "step": 10610 + }, + { + "epoch": 0.44, + "grad_norm": 2.203125, + "learning_rate": 0.0004997397080036647, + "loss": 0.271, + "step": 10620 + }, + { + "epoch": 0.44, + "grad_norm": 1.4921875, + "learning_rate": 0.0004997392130028578, + "loss": 0.2247, + "step": 10630 + }, + { + "epoch": 0.44, + "grad_norm": 0.6875, + "learning_rate": 0.0004997387175320686, + "loss": 0.2633, + "step": 10640 + }, + { + "epoch": 0.44, + "grad_norm": 1.59375, + "learning_rate": 0.000499738221591298, + "loss": 0.237, + "step": 10650 + }, + { + "epoch": 0.44, + "grad_norm": 0.73828125, + "learning_rate": 0.0004997377251805471, + "loss": 0.279, + "step": 10660 + }, + { + "epoch": 0.44, + "grad_norm": 0.400390625, + "learning_rate": 0.000499737228299817, + "loss": 0.287, + "step": 10670 + }, + { + "epoch": 0.44, + "grad_norm": 0.94921875, + "learning_rate": 0.0004997367309491081, + "loss": 0.2665, + "step": 10680 + }, + { + "epoch": 0.44, + "grad_norm": 0.6640625, + "learning_rate": 0.0004997362331284217, + "loss": 0.3324, + "step": 10690 + }, + { + "epoch": 0.44, + "grad_norm": 0.8046875, + "learning_rate": 0.0004997357348377589, + "loss": 0.3001, + "step": 10700 + }, + { + "epoch": 0.44, + "grad_norm": 0.85546875, + "learning_rate": 0.0004997352360771202, + "loss": 0.3396, + "step": 10710 + }, + { + "epoch": 0.44, + "grad_norm": 0.9140625, + "learning_rate": 0.0004997347368465068, + "loss": 0.272, + "step": 10720 + }, + { + "epoch": 0.44, + "grad_norm": 0.396484375, + "learning_rate": 0.0004997342371459196, + "loss": 0.266, + "step": 10730 + }, + { + "epoch": 0.44, + "grad_norm": 2.34375, + "learning_rate": 0.0004997337369753595, + "loss": 0.303, + "step": 10740 + }, + { + "epoch": 0.45, + "grad_norm": 0.515625, + "learning_rate": 0.0004997332363348275, + "loss": 0.2887, + "step": 10750 + }, + { + "epoch": 0.45, + "grad_norm": 0.5859375, + "learning_rate": 0.0004997327352243245, + "loss": 0.2902, + "step": 10760 + }, + { + "epoch": 0.45, + "grad_norm": 1.4609375, + "learning_rate": 0.0004997322336438515, + "loss": 0.2258, + "step": 10770 + }, + { + "epoch": 0.45, + "grad_norm": 0.5234375, + "learning_rate": 0.0004997317315934094, + "loss": 0.2614, + "step": 10780 + }, + { + "epoch": 0.45, + "grad_norm": 0.5078125, + "learning_rate": 0.0004997312290729992, + "loss": 0.2558, + "step": 10790 + }, + { + "epoch": 0.45, + "grad_norm": 0.73046875, + "learning_rate": 0.0004997307260826217, + "loss": 0.2918, + "step": 10800 + }, + { + "epoch": 0.45, + "grad_norm": 0.34375, + "learning_rate": 0.0004997302226222779, + "loss": 0.2496, + "step": 10810 + }, + { + "epoch": 0.45, + "grad_norm": 0.59765625, + "learning_rate": 0.000499729718691969, + "loss": 0.2286, + "step": 10820 + }, + { + "epoch": 0.45, + "grad_norm": 0.470703125, + "learning_rate": 0.0004997292142916956, + "loss": 0.2398, + "step": 10830 + }, + { + "epoch": 0.45, + "grad_norm": 0.3984375, + "learning_rate": 0.0004997287094214587, + "loss": 0.1762, + "step": 10840 + }, + { + "epoch": 0.45, + "grad_norm": 1.65625, + "learning_rate": 0.0004997282040812596, + "loss": 0.1878, + "step": 10850 + }, + { + "epoch": 0.45, + "grad_norm": 0.4765625, + "learning_rate": 0.0004997276982710988, + "loss": 0.2679, + "step": 10860 + }, + { + "epoch": 0.45, + "grad_norm": 0.77734375, + "learning_rate": 0.0004997271919909774, + "loss": 0.2761, + "step": 10870 + }, + { + "epoch": 0.45, + "grad_norm": 0.6484375, + "learning_rate": 0.0004997266852408964, + "loss": 0.265, + "step": 10880 + }, + { + "epoch": 0.45, + "grad_norm": 0.416015625, + "learning_rate": 0.0004997261780208569, + "loss": 0.3033, + "step": 10890 + }, + { + "epoch": 0.45, + "grad_norm": 0.0, + "learning_rate": 0.0004997256703308595, + "loss": 0.2449, + "step": 10900 + }, + { + "epoch": 0.45, + "grad_norm": 0.69140625, + "learning_rate": 0.0004997251621709055, + "loss": 0.2679, + "step": 10910 + }, + { + "epoch": 0.45, + "grad_norm": 1.0234375, + "learning_rate": 0.0004997246535409956, + "loss": 0.2798, + "step": 10920 + }, + { + "epoch": 0.45, + "grad_norm": 0.453125, + "learning_rate": 0.000499724144441131, + "loss": 0.2605, + "step": 10930 + }, + { + "epoch": 0.45, + "grad_norm": 0.181640625, + "learning_rate": 0.0004997236348713124, + "loss": 0.2455, + "step": 10940 + }, + { + "epoch": 0.45, + "grad_norm": 0.79296875, + "learning_rate": 0.000499723124831541, + "loss": 0.2581, + "step": 10950 + }, + { + "epoch": 0.45, + "grad_norm": 1.453125, + "learning_rate": 0.0004997226143218177, + "loss": 0.2667, + "step": 10960 + }, + { + "epoch": 0.45, + "grad_norm": 0.98828125, + "learning_rate": 0.0004997221033421432, + "loss": 0.2716, + "step": 10970 + }, + { + "epoch": 0.45, + "grad_norm": 1.1171875, + "learning_rate": 0.0004997215918925188, + "loss": 0.2767, + "step": 10980 + }, + { + "epoch": 0.46, + "grad_norm": 0.80078125, + "learning_rate": 0.0004997210799729453, + "loss": 0.2445, + "step": 10990 + }, + { + "epoch": 0.46, + "grad_norm": 0.55859375, + "learning_rate": 0.0004997205675834237, + "loss": 0.2607, + "step": 11000 + }, + { + "epoch": 0.46, + "grad_norm": 0.59765625, + "learning_rate": 0.000499720054723955, + "loss": 0.2562, + "step": 11010 + }, + { + "epoch": 0.46, + "grad_norm": 1.015625, + "learning_rate": 0.00049971954139454, + "loss": 0.2781, + "step": 11020 + }, + { + "epoch": 0.46, + "grad_norm": 0.7734375, + "learning_rate": 0.00049971902759518, + "loss": 0.3021, + "step": 11030 + }, + { + "epoch": 0.46, + "grad_norm": 0.94140625, + "learning_rate": 0.0004997185133258756, + "loss": 0.2691, + "step": 11040 + }, + { + "epoch": 0.46, + "grad_norm": 0.5078125, + "learning_rate": 0.0004997179985866279, + "loss": 0.219, + "step": 11050 + }, + { + "epoch": 0.46, + "grad_norm": 0.55859375, + "learning_rate": 0.000499717483377438, + "loss": 0.2806, + "step": 11060 + }, + { + "epoch": 0.46, + "grad_norm": 0.97265625, + "learning_rate": 0.0004997169676983068, + "loss": 0.2534, + "step": 11070 + }, + { + "epoch": 0.46, + "grad_norm": 1.96875, + "learning_rate": 0.000499716451549235, + "loss": 0.3137, + "step": 11080 + }, + { + "epoch": 0.46, + "grad_norm": 0.56640625, + "learning_rate": 0.000499715934930224, + "loss": 0.2147, + "step": 11090 + }, + { + "epoch": 0.46, + "grad_norm": 0.93359375, + "learning_rate": 0.0004997154178412746, + "loss": 0.2544, + "step": 11100 + }, + { + "epoch": 0.46, + "grad_norm": 0.27734375, + "learning_rate": 0.0004997149002823877, + "loss": 0.2752, + "step": 11110 + }, + { + "epoch": 0.46, + "grad_norm": 0.734375, + "learning_rate": 0.0004997143822535643, + "loss": 0.2133, + "step": 11120 + }, + { + "epoch": 0.46, + "grad_norm": 0.353515625, + "learning_rate": 0.0004997138637548055, + "loss": 0.2792, + "step": 11130 + }, + { + "epoch": 0.46, + "grad_norm": 0.55859375, + "learning_rate": 0.0004997133447861119, + "loss": 0.2426, + "step": 11140 + }, + { + "epoch": 0.46, + "grad_norm": 1.3359375, + "learning_rate": 0.000499712825347485, + "loss": 0.2457, + "step": 11150 + }, + { + "epoch": 0.46, + "grad_norm": 0.6640625, + "learning_rate": 0.0004997123054389255, + "loss": 0.2882, + "step": 11160 + }, + { + "epoch": 0.46, + "grad_norm": 3.234375, + "learning_rate": 0.0004997117850604343, + "loss": 0.2904, + "step": 11170 + }, + { + "epoch": 0.46, + "grad_norm": 0.859375, + "learning_rate": 0.0004997112642120126, + "loss": 0.2814, + "step": 11180 + }, + { + "epoch": 0.46, + "grad_norm": 1.1328125, + "learning_rate": 0.0004997107428936613, + "loss": 0.3676, + "step": 11190 + }, + { + "epoch": 0.46, + "grad_norm": 1.21875, + "learning_rate": 0.0004997102211053812, + "loss": 0.1906, + "step": 11200 + }, + { + "epoch": 0.46, + "grad_norm": 0.416015625, + "learning_rate": 0.0004997096988471736, + "loss": 0.2253, + "step": 11210 + }, + { + "epoch": 0.46, + "grad_norm": 0.80859375, + "learning_rate": 0.0004997091761190391, + "loss": 0.2462, + "step": 11220 + }, + { + "epoch": 0.47, + "grad_norm": 0.68359375, + "learning_rate": 0.0004997086529209791, + "loss": 0.2329, + "step": 11230 + }, + { + "epoch": 0.47, + "grad_norm": 1.09375, + "learning_rate": 0.0004997081292529942, + "loss": 0.2471, + "step": 11240 + }, + { + "epoch": 0.47, + "grad_norm": 1.3046875, + "learning_rate": 0.0004997076051150857, + "loss": 0.2515, + "step": 11250 + }, + { + "epoch": 0.47, + "grad_norm": 2.15625, + "learning_rate": 0.0004997070805072545, + "loss": 0.2529, + "step": 11260 + }, + { + "epoch": 0.47, + "grad_norm": 0.91015625, + "learning_rate": 0.0004997065554295014, + "loss": 0.3051, + "step": 11270 + }, + { + "epoch": 0.47, + "grad_norm": 0.77734375, + "learning_rate": 0.0004997060298818276, + "loss": 0.3082, + "step": 11280 + }, + { + "epoch": 0.47, + "grad_norm": 1.21875, + "learning_rate": 0.000499705503864234, + "loss": 0.2656, + "step": 11290 + }, + { + "epoch": 0.47, + "grad_norm": 0.427734375, + "learning_rate": 0.0004997049773767216, + "loss": 0.2069, + "step": 11300 + }, + { + "epoch": 0.47, + "grad_norm": 0.59375, + "learning_rate": 0.0004997044504192915, + "loss": 0.2724, + "step": 11310 + }, + { + "epoch": 0.47, + "grad_norm": 0.4921875, + "learning_rate": 0.0004997039229919445, + "loss": 0.2362, + "step": 11320 + }, + { + "epoch": 0.47, + "grad_norm": 0.984375, + "learning_rate": 0.0004997033950946817, + "loss": 0.279, + "step": 11330 + }, + { + "epoch": 0.47, + "grad_norm": 0.5234375, + "learning_rate": 0.000499702866727504, + "loss": 0.2233, + "step": 11340 + }, + { + "epoch": 0.47, + "grad_norm": 0.51953125, + "learning_rate": 0.0004997023378904126, + "loss": 0.2283, + "step": 11350 + }, + { + "epoch": 0.47, + "grad_norm": 0.7734375, + "learning_rate": 0.0004997018085834082, + "loss": 0.2311, + "step": 11360 + }, + { + "epoch": 0.47, + "grad_norm": 0.4765625, + "learning_rate": 0.0004997012788064921, + "loss": 0.2468, + "step": 11370 + }, + { + "epoch": 0.47, + "grad_norm": 0.392578125, + "learning_rate": 0.000499700748559665, + "loss": 0.2575, + "step": 11380 + }, + { + "epoch": 0.47, + "grad_norm": 0.6953125, + "learning_rate": 0.0004997002178429283, + "loss": 0.211, + "step": 11390 + }, + { + "epoch": 0.47, + "grad_norm": 0.8671875, + "learning_rate": 0.0004996996866562827, + "loss": 0.2062, + "step": 11400 + }, + { + "epoch": 0.47, + "grad_norm": 0.796875, + "learning_rate": 0.000499699154999729, + "loss": 0.2423, + "step": 11410 + }, + { + "epoch": 0.47, + "grad_norm": 0.75390625, + "learning_rate": 0.0004996986228732687, + "loss": 0.1886, + "step": 11420 + }, + { + "epoch": 0.47, + "grad_norm": 0.96484375, + "learning_rate": 0.0004996980902769025, + "loss": 0.2616, + "step": 11430 + }, + { + "epoch": 0.47, + "grad_norm": 0.0, + "learning_rate": 0.0004996975572106315, + "loss": 0.2525, + "step": 11440 + }, + { + "epoch": 0.47, + "grad_norm": 0.74609375, + "learning_rate": 0.0004996970236744566, + "loss": 0.2928, + "step": 11450 + }, + { + "epoch": 0.47, + "grad_norm": 0.5703125, + "learning_rate": 0.0004996964896683789, + "loss": 0.2477, + "step": 11460 + }, + { + "epoch": 0.48, + "grad_norm": 0.55078125, + "learning_rate": 0.0004996959551923993, + "loss": 0.2405, + "step": 11470 + }, + { + "epoch": 0.48, + "grad_norm": 0.84765625, + "learning_rate": 0.0004996954202465189, + "loss": 0.244, + "step": 11480 + }, + { + "epoch": 0.48, + "grad_norm": 1.015625, + "learning_rate": 0.0004996948848307388, + "loss": 0.2727, + "step": 11490 + }, + { + "epoch": 0.48, + "grad_norm": 0.55859375, + "learning_rate": 0.0004996943489450599, + "loss": 0.1946, + "step": 11500 + }, + { + "epoch": 0.48, + "grad_norm": 0.92578125, + "learning_rate": 0.000499693812589483, + "loss": 0.2017, + "step": 11510 + }, + { + "epoch": 0.48, + "grad_norm": 0.392578125, + "learning_rate": 0.0004996932757640094, + "loss": 0.303, + "step": 11520 + }, + { + "epoch": 0.48, + "grad_norm": 1.09375, + "learning_rate": 0.00049969273846864, + "loss": 0.2459, + "step": 11530 + }, + { + "epoch": 0.48, + "grad_norm": 0.349609375, + "learning_rate": 0.0004996922007033759, + "loss": 0.2526, + "step": 11540 + }, + { + "epoch": 0.48, + "grad_norm": 1.390625, + "learning_rate": 0.0004996916624682181, + "loss": 0.2446, + "step": 11550 + }, + { + "epoch": 0.48, + "grad_norm": 0.6328125, + "learning_rate": 0.0004996911237631674, + "loss": 0.233, + "step": 11560 + }, + { + "epoch": 0.48, + "grad_norm": 1.171875, + "learning_rate": 0.0004996905845882251, + "loss": 0.2742, + "step": 11570 + }, + { + "epoch": 0.48, + "grad_norm": 0.80078125, + "learning_rate": 0.000499690044943392, + "loss": 0.2908, + "step": 11580 + }, + { + "epoch": 0.48, + "grad_norm": 0.984375, + "learning_rate": 0.0004996895048286692, + "loss": 0.2879, + "step": 11590 + }, + { + "epoch": 0.48, + "grad_norm": 0.2890625, + "learning_rate": 0.0004996889642440577, + "loss": 0.2929, + "step": 11600 + }, + { + "epoch": 0.48, + "grad_norm": 0.318359375, + "learning_rate": 0.0004996884231895586, + "loss": 0.3438, + "step": 11610 + }, + { + "epoch": 0.48, + "grad_norm": 0.88671875, + "learning_rate": 0.0004996878816651728, + "loss": 0.2925, + "step": 11620 + }, + { + "epoch": 0.48, + "grad_norm": 2.875, + "learning_rate": 0.0004996873396709014, + "loss": 0.2603, + "step": 11630 + }, + { + "epoch": 0.48, + "grad_norm": 0.7890625, + "learning_rate": 0.0004996867972067453, + "loss": 0.2788, + "step": 11640 + }, + { + "epoch": 0.48, + "grad_norm": 0.7109375, + "learning_rate": 0.0004996862542727057, + "loss": 0.2439, + "step": 11650 + }, + { + "epoch": 0.48, + "grad_norm": 0.333984375, + "learning_rate": 0.0004996857108687836, + "loss": 0.2506, + "step": 11660 + }, + { + "epoch": 0.48, + "grad_norm": 0.5703125, + "learning_rate": 0.00049968516699498, + "loss": 0.2646, + "step": 11670 + }, + { + "epoch": 0.48, + "grad_norm": 0.65625, + "learning_rate": 0.0004996846226512957, + "loss": 0.2454, + "step": 11680 + }, + { + "epoch": 0.48, + "grad_norm": 0.45703125, + "learning_rate": 0.0004996840778377319, + "loss": 0.2306, + "step": 11690 + }, + { + "epoch": 0.48, + "grad_norm": 0.59375, + "learning_rate": 0.0004996835325542896, + "loss": 0.1964, + "step": 11700 + }, + { + "epoch": 0.49, + "grad_norm": 0.41796875, + "learning_rate": 0.00049968298680097, + "loss": 0.2394, + "step": 11710 + }, + { + "epoch": 0.49, + "grad_norm": 0.63671875, + "learning_rate": 0.000499682440577774, + "loss": 0.2437, + "step": 11720 + }, + { + "epoch": 0.49, + "grad_norm": 0.3125, + "learning_rate": 0.0004996818938847026, + "loss": 0.2254, + "step": 11730 + }, + { + "epoch": 0.49, + "grad_norm": 0.5703125, + "learning_rate": 0.0004996813467217566, + "loss": 0.2711, + "step": 11740 + }, + { + "epoch": 0.49, + "grad_norm": 0.9609375, + "learning_rate": 0.0004996807990889376, + "loss": 0.2165, + "step": 11750 + }, + { + "epoch": 0.49, + "grad_norm": 0.4140625, + "learning_rate": 0.0004996802509862461, + "loss": 0.2162, + "step": 11760 + }, + { + "epoch": 0.49, + "grad_norm": 1.6640625, + "learning_rate": 0.0004996797024136834, + "loss": 0.2604, + "step": 11770 + }, + { + "epoch": 0.49, + "grad_norm": 0.6796875, + "learning_rate": 0.0004996791533712504, + "loss": 0.2313, + "step": 11780 + }, + { + "epoch": 0.49, + "grad_norm": 2.15625, + "learning_rate": 0.0004996786038589482, + "loss": 0.2979, + "step": 11790 + }, + { + "epoch": 0.49, + "grad_norm": 1.1328125, + "learning_rate": 0.0004996780538767779, + "loss": 0.2763, + "step": 11800 + }, + { + "epoch": 0.49, + "grad_norm": 0.96484375, + "learning_rate": 0.0004996775034247405, + "loss": 0.2545, + "step": 11810 + }, + { + "epoch": 0.49, + "grad_norm": 0.65234375, + "learning_rate": 0.0004996769525028369, + "loss": 0.2526, + "step": 11820 + }, + { + "epoch": 0.49, + "grad_norm": 0.796875, + "learning_rate": 0.0004996764011110683, + "loss": 0.2658, + "step": 11830 + }, + { + "epoch": 0.49, + "grad_norm": 0.5859375, + "learning_rate": 0.0004996758492494356, + "loss": 0.2253, + "step": 11840 + }, + { + "epoch": 0.49, + "grad_norm": 0.75, + "learning_rate": 0.00049967529691794, + "loss": 0.2808, + "step": 11850 + }, + { + "epoch": 0.49, + "grad_norm": 1.328125, + "learning_rate": 0.0004996747441165825, + "loss": 0.2238, + "step": 11860 + }, + { + "epoch": 0.49, + "grad_norm": 0.8046875, + "learning_rate": 0.0004996741908453639, + "loss": 0.2057, + "step": 11870 + }, + { + "epoch": 0.49, + "grad_norm": 0.54296875, + "learning_rate": 0.0004996736371042856, + "loss": 0.2924, + "step": 11880 + }, + { + "epoch": 0.49, + "grad_norm": 0.73046875, + "learning_rate": 0.0004996730828933484, + "loss": 0.2492, + "step": 11890 + }, + { + "epoch": 0.49, + "grad_norm": 0.419921875, + "learning_rate": 0.0004996725282125534, + "loss": 0.3119, + "step": 11900 + }, + { + "epoch": 0.49, + "grad_norm": 0.546875, + "learning_rate": 0.0004996719730619017, + "loss": 0.2726, + "step": 11910 + }, + { + "epoch": 0.49, + "grad_norm": 1.3203125, + "learning_rate": 0.0004996714174413943, + "loss": 0.2744, + "step": 11920 + }, + { + "epoch": 0.49, + "grad_norm": 0.9140625, + "learning_rate": 0.0004996708613510323, + "loss": 0.2227, + "step": 11930 + }, + { + "epoch": 0.49, + "grad_norm": 1.4609375, + "learning_rate": 0.0004996703047908167, + "loss": 0.2705, + "step": 11940 + }, + { + "epoch": 0.49, + "grad_norm": 0.2138671875, + "learning_rate": 0.0004996697477607485, + "loss": 0.2188, + "step": 11950 + }, + { + "epoch": 0.5, + "grad_norm": 0.75390625, + "learning_rate": 0.0004996691902608289, + "loss": 0.2519, + "step": 11960 + }, + { + "epoch": 0.5, + "grad_norm": 0.62890625, + "learning_rate": 0.0004996686322910587, + "loss": 0.2057, + "step": 11970 + }, + { + "epoch": 0.5, + "grad_norm": 0.86328125, + "learning_rate": 0.0004996680738514392, + "loss": 0.2539, + "step": 11980 + }, + { + "epoch": 0.5, + "grad_norm": 0.73828125, + "learning_rate": 0.0004996675149419713, + "loss": 0.2428, + "step": 11990 + }, + { + "epoch": 0.5, + "grad_norm": 1.3515625, + "learning_rate": 0.0004996669555626561, + "loss": 0.2226, + "step": 12000 + }, + { + "epoch": 0.5, + "grad_norm": 0.451171875, + "learning_rate": 0.0004996663957134947, + "loss": 0.2298, + "step": 12010 + }, + { + "epoch": 0.5, + "grad_norm": 0.47265625, + "learning_rate": 0.0004996658353944881, + "loss": 0.221, + "step": 12020 + }, + { + "epoch": 0.5, + "grad_norm": 0.306640625, + "learning_rate": 0.0004996652746056372, + "loss": 0.243, + "step": 12030 + }, + { + "epoch": 0.5, + "grad_norm": 0.640625, + "learning_rate": 0.0004996647133469434, + "loss": 0.3031, + "step": 12040 + }, + { + "epoch": 0.5, + "grad_norm": 0.84765625, + "learning_rate": 0.0004996641516184075, + "loss": 0.2261, + "step": 12050 + }, + { + "epoch": 0.5, + "grad_norm": 0.5625, + "learning_rate": 0.0004996635894200307, + "loss": 0.2584, + "step": 12060 + }, + { + "epoch": 0.5, + "grad_norm": 0.76171875, + "learning_rate": 0.0004996630267518139, + "loss": 0.169, + "step": 12070 + }, + { + "epoch": 0.5, + "grad_norm": 0.9375, + "learning_rate": 0.0004996624636137582, + "loss": 0.2332, + "step": 12080 + }, + { + "epoch": 0.5, + "grad_norm": 0.79296875, + "learning_rate": 0.0004996619000058647, + "loss": 0.2623, + "step": 12090 + }, + { + "epoch": 0.5, + "grad_norm": 0.5234375, + "learning_rate": 0.0004996613359281346, + "loss": 0.321, + "step": 12100 + }, + { + "epoch": 0.5, + "grad_norm": 0.6484375, + "learning_rate": 0.0004996607713805688, + "loss": 0.2758, + "step": 12110 + }, + { + "epoch": 0.5, + "grad_norm": 1.5625, + "learning_rate": 0.0004996602063631684, + "loss": 0.2643, + "step": 12120 + }, + { + "epoch": 0.5, + "grad_norm": 0.5078125, + "learning_rate": 0.0004996596408759343, + "loss": 0.2369, + "step": 12130 + }, + { + "epoch": 0.5, + "grad_norm": 0.42578125, + "learning_rate": 0.0004996590749188678, + "loss": 0.3099, + "step": 12140 + }, + { + "epoch": 0.5, + "grad_norm": 0.82421875, + "learning_rate": 0.0004996585084919699, + "loss": 0.2804, + "step": 12150 + }, + { + "epoch": 0.5, + "grad_norm": 0.59765625, + "learning_rate": 0.0004996579415952417, + "loss": 0.2636, + "step": 12160 + }, + { + "epoch": 0.5, + "grad_norm": 0.494140625, + "learning_rate": 0.0004996573742286842, + "loss": 0.2652, + "step": 12170 + }, + { + "epoch": 0.5, + "grad_norm": 1.7890625, + "learning_rate": 0.0004996568063922984, + "loss": 0.2606, + "step": 12180 + }, + { + "epoch": 0.5, + "grad_norm": 0.34375, + "learning_rate": 0.0004996562380860855, + "loss": 0.2746, + "step": 12190 + }, + { + "epoch": 0.51, + "grad_norm": 0.91796875, + "learning_rate": 0.0004996556693100466, + "loss": 0.2494, + "step": 12200 + }, + { + "epoch": 0.51, + "grad_norm": 0.98046875, + "learning_rate": 0.0004996551000641825, + "loss": 0.2228, + "step": 12210 + }, + { + "epoch": 0.51, + "grad_norm": 0.65625, + "learning_rate": 0.0004996545303484947, + "loss": 0.2498, + "step": 12220 + }, + { + "epoch": 0.51, + "grad_norm": 0.40625, + "learning_rate": 0.0004996539601629839, + "loss": 0.2589, + "step": 12230 + }, + { + "epoch": 0.51, + "grad_norm": 0.90625, + "learning_rate": 0.0004996533895076513, + "loss": 0.3008, + "step": 12240 + }, + { + "epoch": 0.51, + "grad_norm": 0.625, + "learning_rate": 0.000499652818382498, + "loss": 0.2513, + "step": 12250 + }, + { + "epoch": 0.51, + "grad_norm": 1.03125, + "learning_rate": 0.000499652246787525, + "loss": 0.2084, + "step": 12260 + }, + { + "epoch": 0.51, + "grad_norm": 0.578125, + "learning_rate": 0.0004996516747227336, + "loss": 0.196, + "step": 12270 + }, + { + "epoch": 0.51, + "grad_norm": 1.078125, + "learning_rate": 0.0004996511021881244, + "loss": 0.2265, + "step": 12280 + }, + { + "epoch": 0.51, + "grad_norm": 0.80078125, + "learning_rate": 0.0004996505291836991, + "loss": 0.2067, + "step": 12290 + }, + { + "epoch": 0.51, + "grad_norm": 1.84375, + "learning_rate": 0.0004996499557094584, + "loss": 0.1859, + "step": 12300 + }, + { + "epoch": 0.51, + "grad_norm": 0.48046875, + "learning_rate": 0.0004996493817654033, + "loss": 0.2147, + "step": 12310 + }, + { + "epoch": 0.51, + "grad_norm": 0.5703125, + "learning_rate": 0.0004996488073515351, + "loss": 0.2567, + "step": 12320 + }, + { + "epoch": 0.51, + "grad_norm": 0.470703125, + "learning_rate": 0.0004996482324678549, + "loss": 0.259, + "step": 12330 + }, + { + "epoch": 0.51, + "grad_norm": 0.6171875, + "learning_rate": 0.0004996476571143636, + "loss": 0.1846, + "step": 12340 + }, + { + "epoch": 0.51, + "grad_norm": 0.55859375, + "learning_rate": 0.0004996470812910623, + "loss": 0.256, + "step": 12350 + }, + { + "epoch": 0.51, + "grad_norm": 0.890625, + "learning_rate": 0.0004996465049979523, + "loss": 0.2945, + "step": 12360 + }, + { + "epoch": 0.51, + "grad_norm": 0.65234375, + "learning_rate": 0.0004996459282350344, + "loss": 0.2705, + "step": 12370 + }, + { + "epoch": 0.51, + "grad_norm": 0.6484375, + "learning_rate": 0.0004996453510023098, + "loss": 0.3059, + "step": 12380 + }, + { + "epoch": 0.51, + "grad_norm": 0.388671875, + "learning_rate": 0.0004996447732997797, + "loss": 0.2624, + "step": 12390 + }, + { + "epoch": 0.51, + "grad_norm": 0.361328125, + "learning_rate": 0.0004996441951274452, + "loss": 0.249, + "step": 12400 + }, + { + "epoch": 0.51, + "grad_norm": 0.421875, + "learning_rate": 0.0004996436164853071, + "loss": 0.2587, + "step": 12410 + }, + { + "epoch": 0.51, + "grad_norm": 0.76171875, + "learning_rate": 0.0004996430373733668, + "loss": 0.2674, + "step": 12420 + }, + { + "epoch": 0.51, + "grad_norm": 0.330078125, + "learning_rate": 0.0004996424577916251, + "loss": 0.2315, + "step": 12430 + }, + { + "epoch": 0.52, + "grad_norm": 0.7109375, + "learning_rate": 0.0004996418777400834, + "loss": 0.2058, + "step": 12440 + }, + { + "epoch": 0.52, + "grad_norm": 1.046875, + "learning_rate": 0.0004996412972187427, + "loss": 0.2426, + "step": 12450 + }, + { + "epoch": 0.52, + "grad_norm": 0.3671875, + "learning_rate": 0.0004996407162276039, + "loss": 0.2825, + "step": 12460 + }, + { + "epoch": 0.52, + "grad_norm": 0.240234375, + "learning_rate": 0.0004996401347666683, + "loss": 0.1898, + "step": 12470 + }, + { + "epoch": 0.52, + "grad_norm": 0.86328125, + "learning_rate": 0.0004996395528359368, + "loss": 0.267, + "step": 12480 + }, + { + "epoch": 0.52, + "grad_norm": 1.7109375, + "learning_rate": 0.0004996389704354107, + "loss": 0.2219, + "step": 12490 + }, + { + "epoch": 0.52, + "grad_norm": 0.67578125, + "learning_rate": 0.0004996383875650911, + "loss": 0.2615, + "step": 12500 + }, + { + "epoch": 0.52, + "grad_norm": 0.96484375, + "learning_rate": 0.000499637804224979, + "loss": 0.2419, + "step": 12510 + }, + { + "epoch": 0.52, + "grad_norm": 0.431640625, + "learning_rate": 0.0004996372204150754, + "loss": 0.2582, + "step": 12520 + }, + { + "epoch": 0.52, + "grad_norm": 0.62109375, + "learning_rate": 0.0004996366361353816, + "loss": 0.2359, + "step": 12530 + }, + { + "epoch": 0.52, + "grad_norm": 0.2197265625, + "learning_rate": 0.0004996360513858985, + "loss": 0.2114, + "step": 12540 + }, + { + "epoch": 0.52, + "grad_norm": 0.71875, + "learning_rate": 0.0004996354661666274, + "loss": 0.2562, + "step": 12550 + }, + { + "epoch": 0.52, + "grad_norm": 0.94140625, + "learning_rate": 0.0004996348804775693, + "loss": 0.2496, + "step": 12560 + }, + { + "epoch": 0.52, + "grad_norm": 0.48046875, + "learning_rate": 0.0004996342943187253, + "loss": 0.1683, + "step": 12570 + }, + { + "epoch": 0.52, + "grad_norm": 1.0625, + "learning_rate": 0.0004996337076900965, + "loss": 0.2598, + "step": 12580 + }, + { + "epoch": 0.52, + "grad_norm": 0.54296875, + "learning_rate": 0.0004996331205916841, + "loss": 0.2989, + "step": 12590 + }, + { + "epoch": 0.52, + "grad_norm": 0.47265625, + "learning_rate": 0.0004996325330234891, + "loss": 0.1701, + "step": 12600 + }, + { + "epoch": 0.52, + "grad_norm": 1.1015625, + "learning_rate": 0.0004996319449855125, + "loss": 0.2306, + "step": 12610 + }, + { + "epoch": 0.52, + "grad_norm": 0.91796875, + "learning_rate": 0.0004996313564777557, + "loss": 0.2561, + "step": 12620 + }, + { + "epoch": 0.52, + "grad_norm": 0.48828125, + "learning_rate": 0.0004996307675002197, + "loss": 0.2958, + "step": 12630 + }, + { + "epoch": 0.52, + "grad_norm": 0.412109375, + "learning_rate": 0.0004996301780529054, + "loss": 0.2615, + "step": 12640 + }, + { + "epoch": 0.52, + "grad_norm": 0.390625, + "learning_rate": 0.000499629588135814, + "loss": 0.2403, + "step": 12650 + }, + { + "epoch": 0.52, + "grad_norm": 0.88671875, + "learning_rate": 0.0004996289977489468, + "loss": 0.2499, + "step": 12660 + }, + { + "epoch": 0.52, + "grad_norm": 1.2109375, + "learning_rate": 0.0004996284068923048, + "loss": 0.2717, + "step": 12670 + }, + { + "epoch": 0.53, + "grad_norm": 0.77734375, + "learning_rate": 0.000499627815565889, + "loss": 0.2107, + "step": 12680 + }, + { + "epoch": 0.53, + "grad_norm": 0.82421875, + "learning_rate": 0.0004996272237697007, + "loss": 0.2736, + "step": 12690 + }, + { + "epoch": 0.53, + "grad_norm": 0.62890625, + "learning_rate": 0.0004996266315037409, + "loss": 0.2264, + "step": 12700 + }, + { + "epoch": 0.53, + "grad_norm": 0.271484375, + "learning_rate": 0.0004996260387680107, + "loss": 0.2796, + "step": 12710 + }, + { + "epoch": 0.53, + "grad_norm": 1.7890625, + "learning_rate": 0.0004996254455625112, + "loss": 0.2161, + "step": 12720 + }, + { + "epoch": 0.53, + "grad_norm": 0.5546875, + "learning_rate": 0.0004996248518872437, + "loss": 0.2829, + "step": 12730 + }, + { + "epoch": 0.53, + "grad_norm": 0.33984375, + "learning_rate": 0.0004996242577422091, + "loss": 0.2529, + "step": 12740 + }, + { + "epoch": 0.53, + "grad_norm": 0.640625, + "learning_rate": 0.0004996236631274087, + "loss": 0.2989, + "step": 12750 + }, + { + "epoch": 0.53, + "grad_norm": 0.92578125, + "learning_rate": 0.0004996230680428434, + "loss": 0.2907, + "step": 12760 + }, + { + "epoch": 0.53, + "grad_norm": 0.76953125, + "learning_rate": 0.0004996224724885146, + "loss": 0.2576, + "step": 12770 + }, + { + "epoch": 0.53, + "grad_norm": 0.40234375, + "learning_rate": 0.0004996218764644231, + "loss": 0.2536, + "step": 12780 + }, + { + "epoch": 0.53, + "grad_norm": 0.0, + "learning_rate": 0.0004996212799705702, + "loss": 0.2959, + "step": 12790 + }, + { + "epoch": 0.53, + "grad_norm": 0.58203125, + "learning_rate": 0.0004996206830069571, + "loss": 0.2481, + "step": 12800 + }, + { + "epoch": 0.53, + "grad_norm": 0.41015625, + "learning_rate": 0.0004996200855735848, + "loss": 0.2618, + "step": 12810 + }, + { + "epoch": 0.53, + "grad_norm": 1.2421875, + "learning_rate": 0.0004996194876704544, + "loss": 0.2346, + "step": 12820 + }, + { + "epoch": 0.53, + "grad_norm": 0.77734375, + "learning_rate": 0.0004996188892975672, + "loss": 0.2301, + "step": 12830 + }, + { + "epoch": 0.53, + "grad_norm": 0.7265625, + "learning_rate": 0.0004996182904549241, + "loss": 0.2844, + "step": 12840 + }, + { + "epoch": 0.53, + "grad_norm": 0.318359375, + "learning_rate": 0.0004996176911425263, + "loss": 0.2547, + "step": 12850 + }, + { + "epoch": 0.53, + "grad_norm": 0.75390625, + "learning_rate": 0.000499617091360375, + "loss": 0.2795, + "step": 12860 + }, + { + "epoch": 0.53, + "grad_norm": 2.0, + "learning_rate": 0.0004996164911084714, + "loss": 0.2046, + "step": 12870 + }, + { + "epoch": 0.53, + "grad_norm": 0.75390625, + "learning_rate": 0.0004996158903868164, + "loss": 0.2241, + "step": 12880 + }, + { + "epoch": 0.53, + "grad_norm": 1.6484375, + "learning_rate": 0.0004996152891954113, + "loss": 0.1926, + "step": 12890 + }, + { + "epoch": 0.53, + "grad_norm": 0.890625, + "learning_rate": 0.0004996146875342572, + "loss": 0.2101, + "step": 12900 + }, + { + "epoch": 0.53, + "grad_norm": 0.5234375, + "learning_rate": 0.0004996140854033552, + "loss": 0.2335, + "step": 12910 + }, + { + "epoch": 0.54, + "grad_norm": 0.412109375, + "learning_rate": 0.0004996134828027063, + "loss": 0.2318, + "step": 12920 + }, + { + "epoch": 0.54, + "grad_norm": 0.8203125, + "learning_rate": 0.0004996128797323119, + "loss": 0.179, + "step": 12930 + }, + { + "epoch": 0.54, + "grad_norm": 0.75390625, + "learning_rate": 0.0004996122761921731, + "loss": 0.2761, + "step": 12940 + }, + { + "epoch": 0.54, + "grad_norm": 0.59375, + "learning_rate": 0.0004996116721822909, + "loss": 0.3093, + "step": 12950 + }, + { + "epoch": 0.54, + "grad_norm": 1.1015625, + "learning_rate": 0.0004996110677026665, + "loss": 0.2626, + "step": 12960 + }, + { + "epoch": 0.54, + "grad_norm": 0.4296875, + "learning_rate": 0.000499610462753301, + "loss": 0.2832, + "step": 12970 + }, + { + "epoch": 0.54, + "grad_norm": 0.83984375, + "learning_rate": 0.0004996098573341955, + "loss": 0.2302, + "step": 12980 + }, + { + "epoch": 0.54, + "grad_norm": 0.328125, + "learning_rate": 0.0004996092514453513, + "loss": 0.2273, + "step": 12990 + }, + { + "epoch": 0.54, + "grad_norm": 0.384765625, + "learning_rate": 0.0004996086450867694, + "loss": 0.2482, + "step": 13000 + }, + { + "epoch": 0.54, + "grad_norm": 1.5625, + "learning_rate": 0.000499608038258451, + "loss": 0.2562, + "step": 13010 + }, + { + "epoch": 0.54, + "grad_norm": 0.60546875, + "learning_rate": 0.0004996074309603971, + "loss": 0.2624, + "step": 13020 + }, + { + "epoch": 0.54, + "grad_norm": 0.66796875, + "learning_rate": 0.0004996068231926092, + "loss": 0.2411, + "step": 13030 + }, + { + "epoch": 0.54, + "grad_norm": 1.546875, + "learning_rate": 0.0004996062149550881, + "loss": 0.237, + "step": 13040 + }, + { + "epoch": 0.54, + "grad_norm": 1.6875, + "learning_rate": 0.000499605606247835, + "loss": 0.2883, + "step": 13050 + }, + { + "epoch": 0.54, + "grad_norm": 0.8359375, + "learning_rate": 0.0004996049970708512, + "loss": 0.3037, + "step": 13060 + }, + { + "epoch": 0.54, + "grad_norm": 1.5078125, + "learning_rate": 0.0004996043874241378, + "loss": 0.2306, + "step": 13070 + }, + { + "epoch": 0.54, + "grad_norm": 0.51953125, + "learning_rate": 0.0004996037773076957, + "loss": 0.2694, + "step": 13080 + }, + { + "epoch": 0.54, + "grad_norm": 0.333984375, + "learning_rate": 0.0004996031667215264, + "loss": 0.1821, + "step": 13090 + }, + { + "epoch": 0.54, + "grad_norm": 0.86328125, + "learning_rate": 0.0004996025556656308, + "loss": 0.2322, + "step": 13100 + }, + { + "epoch": 0.54, + "grad_norm": 0.212890625, + "learning_rate": 0.0004996019441400102, + "loss": 0.2405, + "step": 13110 + }, + { + "epoch": 0.54, + "grad_norm": 1.1640625, + "learning_rate": 0.0004996013321446657, + "loss": 0.254, + "step": 13120 + }, + { + "epoch": 0.54, + "grad_norm": 0.58203125, + "learning_rate": 0.0004996007196795986, + "loss": 0.2997, + "step": 13130 + }, + { + "epoch": 0.54, + "grad_norm": 0.70703125, + "learning_rate": 0.0004996001067448096, + "loss": 0.2703, + "step": 13140 + }, + { + "epoch": 0.54, + "grad_norm": 0.375, + "learning_rate": 0.0004995994933403003, + "loss": 0.2569, + "step": 13150 + }, + { + "epoch": 0.55, + "grad_norm": 1.171875, + "learning_rate": 0.0004995988794660718, + "loss": 0.2327, + "step": 13160 + }, + { + "epoch": 0.55, + "grad_norm": 0.6953125, + "learning_rate": 0.0004995982651221251, + "loss": 0.2467, + "step": 13170 + }, + { + "epoch": 0.55, + "grad_norm": 0.77734375, + "learning_rate": 0.0004995976503084612, + "loss": 0.1784, + "step": 13180 + }, + { + "epoch": 0.55, + "grad_norm": 0.8359375, + "learning_rate": 0.0004995970350250819, + "loss": 0.2317, + "step": 13190 + }, + { + "epoch": 0.55, + "grad_norm": 1.1484375, + "learning_rate": 0.0004995964192719876, + "loss": 0.2616, + "step": 13200 + }, + { + "epoch": 0.55, + "grad_norm": 2.03125, + "learning_rate": 0.0004995958030491798, + "loss": 0.1955, + "step": 13210 + }, + { + "epoch": 0.55, + "grad_norm": 1.1796875, + "learning_rate": 0.0004995951863566598, + "loss": 0.215, + "step": 13220 + }, + { + "epoch": 0.55, + "grad_norm": 0.359375, + "learning_rate": 0.0004995945691944286, + "loss": 0.26, + "step": 13230 + }, + { + "epoch": 0.55, + "grad_norm": 1.1015625, + "learning_rate": 0.0004995939515624873, + "loss": 0.2212, + "step": 13240 + }, + { + "epoch": 0.55, + "grad_norm": 2.515625, + "learning_rate": 0.0004995933334608372, + "loss": 0.2671, + "step": 13250 + }, + { + "epoch": 0.55, + "grad_norm": 0.498046875, + "learning_rate": 0.0004995927148894793, + "loss": 0.226, + "step": 13260 + }, + { + "epoch": 0.55, + "grad_norm": 0.625, + "learning_rate": 0.000499592095848415, + "loss": 0.2084, + "step": 13270 + }, + { + "epoch": 0.55, + "grad_norm": 1.171875, + "learning_rate": 0.0004995914763376452, + "loss": 0.2098, + "step": 13280 + }, + { + "epoch": 0.55, + "grad_norm": 0.63671875, + "learning_rate": 0.0004995908563571713, + "loss": 0.2819, + "step": 13290 + }, + { + "epoch": 0.55, + "grad_norm": 0.51953125, + "learning_rate": 0.0004995902359069943, + "loss": 0.2602, + "step": 13300 + }, + { + "epoch": 0.55, + "grad_norm": 0.69140625, + "learning_rate": 0.0004995896149871154, + "loss": 0.2179, + "step": 13310 + }, + { + "epoch": 0.55, + "grad_norm": 0.578125, + "learning_rate": 0.0004995889935975359, + "loss": 0.2343, + "step": 13320 + }, + { + "epoch": 0.55, + "grad_norm": 0.87109375, + "learning_rate": 0.0004995883717382567, + "loss": 0.1691, + "step": 13330 + }, + { + "epoch": 0.55, + "grad_norm": 0.8984375, + "learning_rate": 0.0004995877494092793, + "loss": 0.3114, + "step": 13340 + }, + { + "epoch": 0.55, + "grad_norm": 0.515625, + "learning_rate": 0.0004995871266106047, + "loss": 0.2224, + "step": 13350 + }, + { + "epoch": 0.55, + "grad_norm": 0.72265625, + "learning_rate": 0.0004995865033422341, + "loss": 0.2726, + "step": 13360 + }, + { + "epoch": 0.55, + "grad_norm": 0.609375, + "learning_rate": 0.0004995858796041686, + "loss": 0.2317, + "step": 13370 + }, + { + "epoch": 0.55, + "grad_norm": 0.66015625, + "learning_rate": 0.0004995852553964094, + "loss": 0.2744, + "step": 13380 + }, + { + "epoch": 0.55, + "grad_norm": 0.421875, + "learning_rate": 0.0004995846307189577, + "loss": 0.2335, + "step": 13390 + }, + { + "epoch": 0.56, + "grad_norm": 0.81640625, + "learning_rate": 0.0004995840055718147, + "loss": 0.2615, + "step": 13400 + }, + { + "epoch": 0.56, + "grad_norm": 0.5, + "learning_rate": 0.0004995833799549816, + "loss": 0.2339, + "step": 13410 + }, + { + "epoch": 0.56, + "grad_norm": 0.79296875, + "learning_rate": 0.0004995827538684595, + "loss": 0.2743, + "step": 13420 + }, + { + "epoch": 0.56, + "grad_norm": 0.8984375, + "learning_rate": 0.0004995821273122495, + "loss": 0.2834, + "step": 13430 + }, + { + "epoch": 0.56, + "grad_norm": 0.47265625, + "learning_rate": 0.0004995815002863531, + "loss": 0.1678, + "step": 13440 + }, + { + "epoch": 0.56, + "grad_norm": 1.515625, + "learning_rate": 0.0004995808727907711, + "loss": 0.2685, + "step": 13450 + }, + { + "epoch": 0.56, + "grad_norm": 0.7421875, + "learning_rate": 0.0004995802448255049, + "loss": 0.2079, + "step": 13460 + }, + { + "epoch": 0.56, + "grad_norm": 1.71875, + "learning_rate": 0.0004995796163905557, + "loss": 0.1969, + "step": 13470 + }, + { + "epoch": 0.56, + "grad_norm": 0.53515625, + "learning_rate": 0.0004995789874859245, + "loss": 0.2641, + "step": 13480 + }, + { + "epoch": 0.56, + "grad_norm": 1.015625, + "learning_rate": 0.0004995783581116127, + "loss": 0.2001, + "step": 13490 + }, + { + "epoch": 0.56, + "grad_norm": 0.62109375, + "learning_rate": 0.0004995777282676213, + "loss": 0.2241, + "step": 13500 + }, + { + "epoch": 0.56, + "grad_norm": 0.62890625, + "learning_rate": 0.0004995770979539516, + "loss": 0.3077, + "step": 13510 + }, + { + "epoch": 0.56, + "grad_norm": 0.74609375, + "learning_rate": 0.0004995764671706048, + "loss": 0.2396, + "step": 13520 + }, + { + "epoch": 0.56, + "grad_norm": 0.388671875, + "learning_rate": 0.0004995758359175819, + "loss": 0.2242, + "step": 13530 + }, + { + "epoch": 0.56, + "grad_norm": 0.291015625, + "learning_rate": 0.0004995752041948843, + "loss": 0.255, + "step": 13540 + }, + { + "epoch": 0.56, + "grad_norm": 1.046875, + "learning_rate": 0.0004995745720025132, + "loss": 0.2285, + "step": 13550 + }, + { + "epoch": 0.56, + "grad_norm": 0.61328125, + "learning_rate": 0.0004995739393404697, + "loss": 0.2632, + "step": 13560 + }, + { + "epoch": 0.56, + "grad_norm": 1.25, + "learning_rate": 0.0004995733062087549, + "loss": 0.3018, + "step": 13570 + }, + { + "epoch": 0.56, + "grad_norm": 1.1328125, + "learning_rate": 0.0004995726726073701, + "loss": 0.2273, + "step": 13580 + }, + { + "epoch": 0.56, + "grad_norm": 0.75, + "learning_rate": 0.0004995720385363165, + "loss": 0.1924, + "step": 13590 + }, + { + "epoch": 0.56, + "grad_norm": 0.4453125, + "learning_rate": 0.0004995714039955953, + "loss": 0.2614, + "step": 13600 + }, + { + "epoch": 0.56, + "grad_norm": 1.4765625, + "learning_rate": 0.0004995707689852077, + "loss": 0.2057, + "step": 13610 + }, + { + "epoch": 0.56, + "grad_norm": 1.2265625, + "learning_rate": 0.0004995701335051548, + "loss": 0.2785, + "step": 13620 + }, + { + "epoch": 0.56, + "grad_norm": 0.93359375, + "learning_rate": 0.0004995694975554379, + "loss": 0.2519, + "step": 13630 + }, + { + "epoch": 0.56, + "grad_norm": 0.5546875, + "learning_rate": 0.0004995688611360581, + "loss": 0.3119, + "step": 13640 + }, + { + "epoch": 0.57, + "grad_norm": 0.54296875, + "learning_rate": 0.0004995682242470167, + "loss": 0.219, + "step": 13650 + }, + { + "epoch": 0.57, + "grad_norm": 0.640625, + "learning_rate": 0.0004995675868883149, + "loss": 0.2399, + "step": 13660 + }, + { + "epoch": 0.57, + "grad_norm": 1.546875, + "learning_rate": 0.0004995669490599538, + "loss": 0.2815, + "step": 13670 + }, + { + "epoch": 0.57, + "grad_norm": 0.62890625, + "learning_rate": 0.0004995663107619346, + "loss": 0.2547, + "step": 13680 + }, + { + "epoch": 0.57, + "grad_norm": 1.109375, + "learning_rate": 0.0004995656719942586, + "loss": 0.2507, + "step": 13690 + }, + { + "epoch": 0.57, + "grad_norm": 0.5078125, + "learning_rate": 0.000499565032756927, + "loss": 0.2775, + "step": 13700 + }, + { + "epoch": 0.57, + "grad_norm": 1.0703125, + "learning_rate": 0.000499564393049941, + "loss": 0.2631, + "step": 13710 + }, + { + "epoch": 0.57, + "grad_norm": 1.09375, + "learning_rate": 0.0004995637528733015, + "loss": 0.2436, + "step": 13720 + }, + { + "epoch": 0.57, + "grad_norm": 0.392578125, + "learning_rate": 0.0004995631122270102, + "loss": 0.1846, + "step": 13730 + }, + { + "epoch": 0.57, + "grad_norm": 0.6796875, + "learning_rate": 0.0004995624711110681, + "loss": 0.2283, + "step": 13740 + }, + { + "epoch": 0.57, + "grad_norm": 0.734375, + "learning_rate": 0.0004995618295254763, + "loss": 0.2332, + "step": 13750 + }, + { + "epoch": 0.57, + "grad_norm": 0.6640625, + "learning_rate": 0.0004995611874702361, + "loss": 0.2093, + "step": 13760 + }, + { + "epoch": 0.57, + "grad_norm": 1.1171875, + "learning_rate": 0.0004995605449453486, + "loss": 0.2575, + "step": 13770 + }, + { + "epoch": 0.57, + "grad_norm": 1.0859375, + "learning_rate": 0.0004995599019508153, + "loss": 0.2676, + "step": 13780 + }, + { + "epoch": 0.57, + "grad_norm": 1.6015625, + "learning_rate": 0.000499559258486637, + "loss": 0.2634, + "step": 13790 + }, + { + "epoch": 0.57, + "grad_norm": 0.322265625, + "learning_rate": 0.0004995586145528153, + "loss": 0.2509, + "step": 13800 + }, + { + "epoch": 0.57, + "grad_norm": 1.5390625, + "learning_rate": 0.0004995579701493511, + "loss": 0.2346, + "step": 13810 + }, + { + "epoch": 0.57, + "grad_norm": 0.65234375, + "learning_rate": 0.0004995573252762459, + "loss": 0.2452, + "step": 13820 + }, + { + "epoch": 0.57, + "grad_norm": 0.5, + "learning_rate": 0.0004995566799335006, + "loss": 0.2592, + "step": 13830 + }, + { + "epoch": 0.57, + "grad_norm": 0.52734375, + "learning_rate": 0.0004995560341211167, + "loss": 0.2139, + "step": 13840 + }, + { + "epoch": 0.57, + "grad_norm": 0.44921875, + "learning_rate": 0.0004995553878390953, + "loss": 0.3036, + "step": 13850 + }, + { + "epoch": 0.57, + "grad_norm": 1.3828125, + "learning_rate": 0.0004995547410874375, + "loss": 0.2706, + "step": 13860 + }, + { + "epoch": 0.57, + "grad_norm": 1.3125, + "learning_rate": 0.0004995540938661447, + "loss": 0.2251, + "step": 13870 + }, + { + "epoch": 0.57, + "grad_norm": 0.310546875, + "learning_rate": 0.0004995534461752181, + "loss": 0.2388, + "step": 13880 + }, + { + "epoch": 0.58, + "grad_norm": 0.486328125, + "learning_rate": 0.0004995527980146588, + "loss": 0.2894, + "step": 13890 + }, + { + "epoch": 0.58, + "grad_norm": 0.609375, + "learning_rate": 0.0004995521493844681, + "loss": 0.2616, + "step": 13900 + }, + { + "epoch": 0.58, + "grad_norm": 0.36328125, + "learning_rate": 0.0004995515002846471, + "loss": 0.2312, + "step": 13910 + }, + { + "epoch": 0.58, + "grad_norm": 1.09375, + "learning_rate": 0.0004995508507151973, + "loss": 0.25, + "step": 13920 + }, + { + "epoch": 0.58, + "grad_norm": 0.93359375, + "learning_rate": 0.0004995502006761197, + "loss": 0.2461, + "step": 13930 + }, + { + "epoch": 0.58, + "grad_norm": 0.53515625, + "learning_rate": 0.0004995495501674155, + "loss": 0.2224, + "step": 13940 + }, + { + "epoch": 0.58, + "grad_norm": 0.75, + "learning_rate": 0.000499548899189086, + "loss": 0.2569, + "step": 13950 + }, + { + "epoch": 0.58, + "grad_norm": 0.5625, + "learning_rate": 0.0004995482477411325, + "loss": 0.2508, + "step": 13960 + }, + { + "epoch": 0.58, + "grad_norm": 0.75390625, + "learning_rate": 0.0004995475958235561, + "loss": 0.2307, + "step": 13970 + }, + { + "epoch": 0.58, + "grad_norm": 0.5078125, + "learning_rate": 0.000499546943436358, + "loss": 0.2748, + "step": 13980 + }, + { + "epoch": 0.58, + "grad_norm": 0.65625, + "learning_rate": 0.0004995462905795396, + "loss": 0.2595, + "step": 13990 + }, + { + "epoch": 0.58, + "grad_norm": 0.30078125, + "learning_rate": 0.000499545637253102, + "loss": 0.2121, + "step": 14000 + }, + { + "epoch": 0.58, + "grad_norm": 0.5234375, + "learning_rate": 0.0004995449834570465, + "loss": 0.3073, + "step": 14010 + }, + { + "epoch": 0.58, + "grad_norm": 0.53125, + "learning_rate": 0.0004995443291913742, + "loss": 0.259, + "step": 14020 + }, + { + "epoch": 0.58, + "grad_norm": 0.734375, + "learning_rate": 0.0004995436744560865, + "loss": 0.2233, + "step": 14030 + }, + { + "epoch": 0.58, + "grad_norm": 0.546875, + "learning_rate": 0.0004995430192511845, + "loss": 0.2296, + "step": 14040 + }, + { + "epoch": 0.58, + "grad_norm": 1.8046875, + "learning_rate": 0.0004995423635766696, + "loss": 0.2673, + "step": 14050 + }, + { + "epoch": 0.58, + "grad_norm": 0.734375, + "learning_rate": 0.0004995417074325428, + "loss": 0.2613, + "step": 14060 + }, + { + "epoch": 0.58, + "grad_norm": 0.431640625, + "learning_rate": 0.0004995410508188054, + "loss": 0.302, + "step": 14070 + }, + { + "epoch": 0.58, + "grad_norm": 1.4765625, + "learning_rate": 0.0004995403937354588, + "loss": 0.2151, + "step": 14080 + }, + { + "epoch": 0.58, + "grad_norm": 0.81640625, + "learning_rate": 0.0004995397361825042, + "loss": 0.2904, + "step": 14090 + }, + { + "epoch": 0.58, + "grad_norm": 0.427734375, + "learning_rate": 0.0004995390781599426, + "loss": 0.2919, + "step": 14100 + }, + { + "epoch": 0.58, + "grad_norm": 0.5, + "learning_rate": 0.0004995384196677755, + "loss": 0.2536, + "step": 14110 + }, + { + "epoch": 0.58, + "grad_norm": 0.494140625, + "learning_rate": 0.000499537760706004, + "loss": 0.2561, + "step": 14120 + }, + { + "epoch": 0.59, + "grad_norm": 0.5625, + "learning_rate": 0.0004995371012746294, + "loss": 0.2942, + "step": 14130 + }, + { + "epoch": 0.59, + "grad_norm": 0.80078125, + "learning_rate": 0.0004995364413736529, + "loss": 0.2373, + "step": 14140 + }, + { + "epoch": 0.59, + "grad_norm": 0.5546875, + "learning_rate": 0.0004995357810030757, + "loss": 0.2727, + "step": 14150 + }, + { + "epoch": 0.59, + "grad_norm": 0.32421875, + "learning_rate": 0.0004995351201628992, + "loss": 0.232, + "step": 14160 + }, + { + "epoch": 0.59, + "grad_norm": 1.359375, + "learning_rate": 0.0004995344588531246, + "loss": 0.2614, + "step": 14170 + }, + { + "epoch": 0.59, + "grad_norm": 1.78125, + "learning_rate": 0.0004995337970737531, + "loss": 0.2045, + "step": 14180 + }, + { + "epoch": 0.59, + "grad_norm": 0.65234375, + "learning_rate": 0.0004995331348247858, + "loss": 0.284, + "step": 14190 + }, + { + "epoch": 0.59, + "grad_norm": 0.55078125, + "learning_rate": 0.0004995324721062242, + "loss": 0.246, + "step": 14200 + }, + { + "epoch": 0.59, + "grad_norm": 0.67578125, + "learning_rate": 0.0004995318089180694, + "loss": 0.2755, + "step": 14210 + }, + { + "epoch": 0.59, + "grad_norm": 0.62890625, + "learning_rate": 0.0004995311452603226, + "loss": 0.2174, + "step": 14220 + }, + { + "epoch": 0.59, + "grad_norm": 1.8828125, + "learning_rate": 0.0004995304811329853, + "loss": 0.2605, + "step": 14230 + }, + { + "epoch": 0.59, + "grad_norm": 0.64453125, + "learning_rate": 0.0004995298165360585, + "loss": 0.2819, + "step": 14240 + }, + { + "epoch": 0.59, + "grad_norm": 0.98046875, + "learning_rate": 0.0004995291514695435, + "loss": 0.3071, + "step": 14250 + }, + { + "epoch": 0.59, + "grad_norm": 0.384765625, + "learning_rate": 0.0004995284859334417, + "loss": 0.2543, + "step": 14260 + }, + { + "epoch": 0.59, + "grad_norm": 3.75, + "learning_rate": 0.0004995278199277541, + "loss": 0.2567, + "step": 14270 + }, + { + "epoch": 0.59, + "grad_norm": 0.39453125, + "learning_rate": 0.0004995271534524821, + "loss": 0.2601, + "step": 14280 + }, + { + "epoch": 0.59, + "grad_norm": 0.75, + "learning_rate": 0.000499526486507627, + "loss": 0.2741, + "step": 14290 + }, + { + "epoch": 0.59, + "grad_norm": 0.515625, + "learning_rate": 0.00049952581909319, + "loss": 0.2149, + "step": 14300 + }, + { + "epoch": 0.59, + "grad_norm": 1.3125, + "learning_rate": 0.0004995251512091723, + "loss": 0.247, + "step": 14310 + }, + { + "epoch": 0.59, + "grad_norm": 0.91015625, + "learning_rate": 0.0004995244828555753, + "loss": 0.2407, + "step": 14320 + }, + { + "epoch": 0.59, + "grad_norm": 0.66796875, + "learning_rate": 0.0004995238140324001, + "loss": 0.2207, + "step": 14330 + }, + { + "epoch": 0.59, + "grad_norm": 0.703125, + "learning_rate": 0.000499523144739648, + "loss": 0.2493, + "step": 14340 + }, + { + "epoch": 0.59, + "grad_norm": 0.765625, + "learning_rate": 0.0004995224749773204, + "loss": 0.3104, + "step": 14350 + }, + { + "epoch": 0.59, + "grad_norm": 0.314453125, + "learning_rate": 0.0004995218047454183, + "loss": 0.2288, + "step": 14360 + }, + { + "epoch": 0.6, + "grad_norm": 0.6328125, + "learning_rate": 0.0004995211340439432, + "loss": 0.2732, + "step": 14370 + }, + { + "epoch": 0.6, + "grad_norm": 0.77734375, + "learning_rate": 0.0004995204628728963, + "loss": 0.2424, + "step": 14380 + }, + { + "epoch": 0.6, + "grad_norm": 0.98046875, + "learning_rate": 0.0004995197912322787, + "loss": 0.2151, + "step": 14390 + }, + { + "epoch": 0.6, + "grad_norm": 0.703125, + "learning_rate": 0.000499519119122092, + "loss": 0.2538, + "step": 14400 + }, + { + "epoch": 0.6, + "grad_norm": 0.98046875, + "learning_rate": 0.0004995184465423372, + "loss": 0.2513, + "step": 14410 + }, + { + "epoch": 0.6, + "grad_norm": 1.265625, + "learning_rate": 0.0004995177734930155, + "loss": 0.2593, + "step": 14420 + }, + { + "epoch": 0.6, + "grad_norm": 0.291015625, + "learning_rate": 0.0004995170999741285, + "loss": 0.2505, + "step": 14430 + }, + { + "epoch": 0.6, + "grad_norm": 0.4765625, + "learning_rate": 0.000499516425985677, + "loss": 0.2398, + "step": 14440 + }, + { + "epoch": 0.6, + "grad_norm": 0.8125, + "learning_rate": 0.0004995157515276627, + "loss": 0.3041, + "step": 14450 + }, + { + "epoch": 0.6, + "grad_norm": 0.5390625, + "learning_rate": 0.0004995150766000867, + "loss": 0.2713, + "step": 14460 + }, + { + "epoch": 0.6, + "grad_norm": 0.486328125, + "learning_rate": 0.0004995144012029503, + "loss": 0.2588, + "step": 14470 + }, + { + "epoch": 0.6, + "grad_norm": 0.79296875, + "learning_rate": 0.0004995137253362546, + "loss": 0.2697, + "step": 14480 + }, + { + "epoch": 0.6, + "grad_norm": 0.65234375, + "learning_rate": 0.0004995130490000011, + "loss": 0.2235, + "step": 14490 + }, + { + "epoch": 0.6, + "grad_norm": 0.490234375, + "learning_rate": 0.0004995123721941911, + "loss": 0.1984, + "step": 14500 + }, + { + "epoch": 0.6, + "grad_norm": 0.8046875, + "learning_rate": 0.0004995116949188256, + "loss": 0.2747, + "step": 14510 + }, + { + "epoch": 0.6, + "grad_norm": 0.6484375, + "learning_rate": 0.0004995110171739061, + "loss": 0.2638, + "step": 14520 + }, + { + "epoch": 0.6, + "grad_norm": 0.50390625, + "learning_rate": 0.0004995103389594339, + "loss": 0.2858, + "step": 14530 + }, + { + "epoch": 0.6, + "grad_norm": 0.6875, + "learning_rate": 0.00049950966027541, + "loss": 0.2744, + "step": 14540 + }, + { + "epoch": 0.6, + "grad_norm": 0.625, + "learning_rate": 0.000499508981121836, + "loss": 0.2562, + "step": 14550 + }, + { + "epoch": 0.6, + "grad_norm": 0.796875, + "learning_rate": 0.000499508301498713, + "loss": 0.2537, + "step": 14560 + }, + { + "epoch": 0.6, + "grad_norm": 0.94140625, + "learning_rate": 0.0004995076214060422, + "loss": 0.2035, + "step": 14570 + }, + { + "epoch": 0.6, + "grad_norm": 0.5703125, + "learning_rate": 0.0004995069408438252, + "loss": 0.2859, + "step": 14580 + }, + { + "epoch": 0.6, + "grad_norm": 0.443359375, + "learning_rate": 0.000499506259812063, + "loss": 0.2544, + "step": 14590 + }, + { + "epoch": 0.6, + "grad_norm": 1.015625, + "learning_rate": 0.0004995055783107569, + "loss": 0.2407, + "step": 14600 + }, + { + "epoch": 0.61, + "grad_norm": 0.58203125, + "learning_rate": 0.0004995048963399083, + "loss": 0.2195, + "step": 14610 + }, + { + "epoch": 0.61, + "grad_norm": 0.515625, + "learning_rate": 0.0004995042138995185, + "loss": 0.2912, + "step": 14620 + }, + { + "epoch": 0.61, + "grad_norm": 0.376953125, + "learning_rate": 0.0004995035309895887, + "loss": 0.2738, + "step": 14630 + }, + { + "epoch": 0.61, + "grad_norm": 0.2890625, + "learning_rate": 0.0004995028476101201, + "loss": 0.2364, + "step": 14640 + }, + { + "epoch": 0.61, + "grad_norm": 0.8125, + "learning_rate": 0.0004995021637611141, + "loss": 0.2355, + "step": 14650 + }, + { + "epoch": 0.61, + "grad_norm": 0.240234375, + "learning_rate": 0.0004995014794425721, + "loss": 0.1927, + "step": 14660 + }, + { + "epoch": 0.61, + "grad_norm": 0.703125, + "learning_rate": 0.0004995007946544951, + "loss": 0.2553, + "step": 14670 + }, + { + "epoch": 0.61, + "grad_norm": 0.7265625, + "learning_rate": 0.0004995001093968846, + "loss": 0.2474, + "step": 14680 + }, + { + "epoch": 0.61, + "grad_norm": 0.7890625, + "learning_rate": 0.0004994994236697419, + "loss": 0.2573, + "step": 14690 + }, + { + "epoch": 0.61, + "grad_norm": 0.29296875, + "learning_rate": 0.0004994987374730682, + "loss": 0.2263, + "step": 14700 + }, + { + "epoch": 0.61, + "grad_norm": 0.875, + "learning_rate": 0.0004994980508068647, + "loss": 0.2805, + "step": 14710 + }, + { + "epoch": 0.61, + "grad_norm": 0.828125, + "learning_rate": 0.0004994973636711329, + "loss": 0.2899, + "step": 14720 + }, + { + "epoch": 0.61, + "grad_norm": 0.625, + "learning_rate": 0.0004994966760658741, + "loss": 0.2508, + "step": 14730 + }, + { + "epoch": 0.61, + "grad_norm": 0.423828125, + "learning_rate": 0.0004994959879910894, + "loss": 0.2679, + "step": 14740 + }, + { + "epoch": 0.61, + "grad_norm": 0.6796875, + "learning_rate": 0.0004994952994467802, + "loss": 0.2845, + "step": 14750 + }, + { + "epoch": 0.61, + "grad_norm": 0.44140625, + "learning_rate": 0.0004994946104329479, + "loss": 0.2588, + "step": 14760 + }, + { + "epoch": 0.61, + "grad_norm": 0.50390625, + "learning_rate": 0.0004994939209495934, + "loss": 0.2413, + "step": 14770 + }, + { + "epoch": 0.61, + "grad_norm": 0.333984375, + "learning_rate": 0.0004994932309967185, + "loss": 0.2333, + "step": 14780 + }, + { + "epoch": 0.61, + "grad_norm": 0.474609375, + "learning_rate": 0.0004994925405743243, + "loss": 0.2576, + "step": 14790 + }, + { + "epoch": 0.61, + "grad_norm": 0.2734375, + "learning_rate": 0.000499491849682412, + "loss": 0.211, + "step": 14800 + }, + { + "epoch": 0.61, + "grad_norm": 0.470703125, + "learning_rate": 0.000499491158320983, + "loss": 0.265, + "step": 14810 + }, + { + "epoch": 0.61, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004994904664900387, + "loss": 0.2789, + "step": 14820 + }, + { + "epoch": 0.61, + "grad_norm": 0.2734375, + "learning_rate": 0.00049948977418958, + "loss": 0.2975, + "step": 14830 + }, + { + "epoch": 0.61, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004994890814196088, + "loss": 0.2559, + "step": 14840 + }, + { + "epoch": 0.62, + "grad_norm": 0.671875, + "learning_rate": 0.000499488388180126, + "loss": 0.2279, + "step": 14850 + }, + { + "epoch": 0.62, + "grad_norm": 0.478515625, + "learning_rate": 0.0004994876944711329, + "loss": 0.2602, + "step": 14860 + }, + { + "epoch": 0.62, + "grad_norm": 0.453125, + "learning_rate": 0.0004994870002926309, + "loss": 0.2448, + "step": 14870 + }, + { + "epoch": 0.62, + "grad_norm": 0.61328125, + "learning_rate": 0.0004994863056446214, + "loss": 0.2266, + "step": 14880 + }, + { + "epoch": 0.62, + "grad_norm": 0.36328125, + "learning_rate": 0.0004994856105271056, + "loss": 0.214, + "step": 14890 + }, + { + "epoch": 0.62, + "grad_norm": 0.462890625, + "learning_rate": 0.0004994849149400848, + "loss": 0.2992, + "step": 14900 + }, + { + "epoch": 0.62, + "grad_norm": 0.8515625, + "learning_rate": 0.0004994842188835603, + "loss": 0.219, + "step": 14910 + }, + { + "epoch": 0.62, + "grad_norm": 0.482421875, + "learning_rate": 0.0004994835223575335, + "loss": 0.227, + "step": 14920 + }, + { + "epoch": 0.62, + "grad_norm": 0.380859375, + "learning_rate": 0.0004994828253620056, + "loss": 0.1893, + "step": 14930 + }, + { + "epoch": 0.62, + "grad_norm": 0.44921875, + "learning_rate": 0.000499482127896978, + "loss": 0.2657, + "step": 14940 + }, + { + "epoch": 0.62, + "grad_norm": 0.439453125, + "learning_rate": 0.000499481429962452, + "loss": 0.2463, + "step": 14950 + }, + { + "epoch": 0.62, + "grad_norm": 0.68359375, + "learning_rate": 0.0004994807315584289, + "loss": 0.2566, + "step": 14960 + }, + { + "epoch": 0.62, + "grad_norm": 0.765625, + "learning_rate": 0.00049948003268491, + "loss": 0.2261, + "step": 14970 + }, + { + "epoch": 0.62, + "grad_norm": 1.0390625, + "learning_rate": 0.0004994793333418966, + "loss": 0.2238, + "step": 14980 + }, + { + "epoch": 0.62, + "grad_norm": 0.5625, + "learning_rate": 0.0004994786335293899, + "loss": 0.2283, + "step": 14990 + }, + { + "epoch": 0.62, + "grad_norm": 0.49609375, + "learning_rate": 0.0004994779332473916, + "loss": 0.2071, + "step": 15000 + }, + { + "epoch": 0.62, + "grad_norm": 0.3359375, + "learning_rate": 0.0004994772324959026, + "loss": 0.2802, + "step": 15010 + }, + { + "epoch": 0.62, + "grad_norm": 0.6015625, + "learning_rate": 0.0004994765312749244, + "loss": 0.2558, + "step": 15020 + }, + { + "epoch": 0.62, + "grad_norm": 0.337890625, + "learning_rate": 0.0004994758295844584, + "loss": 0.2589, + "step": 15030 + }, + { + "epoch": 0.62, + "grad_norm": 1.5859375, + "learning_rate": 0.0004994751274245057, + "loss": 0.2529, + "step": 15040 + }, + { + "epoch": 0.62, + "grad_norm": 0.87109375, + "learning_rate": 0.0004994744247950679, + "loss": 0.239, + "step": 15050 + }, + { + "epoch": 0.62, + "grad_norm": 0.6796875, + "learning_rate": 0.000499473721696146, + "loss": 0.239, + "step": 15060 + }, + { + "epoch": 0.62, + "grad_norm": 0.74609375, + "learning_rate": 0.0004994730181277417, + "loss": 0.2289, + "step": 15070 + }, + { + "epoch": 0.62, + "grad_norm": 0.87109375, + "learning_rate": 0.000499472314089856, + "loss": 0.2264, + "step": 15080 + }, + { + "epoch": 0.63, + "grad_norm": 0.56640625, + "learning_rate": 0.0004994716095824903, + "loss": 0.2753, + "step": 15090 + }, + { + "epoch": 0.63, + "grad_norm": 0.6640625, + "learning_rate": 0.000499470904605646, + "loss": 0.28, + "step": 15100 + }, + { + "epoch": 0.63, + "grad_norm": 1.3671875, + "learning_rate": 0.0004994701991593245, + "loss": 0.1957, + "step": 15110 + }, + { + "epoch": 0.63, + "grad_norm": 0.5234375, + "learning_rate": 0.000499469493243527, + "loss": 0.2011, + "step": 15120 + }, + { + "epoch": 0.63, + "grad_norm": 0.41796875, + "learning_rate": 0.0004994687868582549, + "loss": 0.2223, + "step": 15130 + }, + { + "epoch": 0.63, + "grad_norm": 0.6953125, + "learning_rate": 0.0004994680800035093, + "loss": 0.2521, + "step": 15140 + }, + { + "epoch": 0.63, + "grad_norm": 0.2470703125, + "learning_rate": 0.0004994673726792918, + "loss": 0.2931, + "step": 15150 + }, + { + "epoch": 0.63, + "grad_norm": 0.3125, + "learning_rate": 0.0004994666648856037, + "loss": 0.2298, + "step": 15160 + }, + { + "epoch": 0.63, + "grad_norm": 0.443359375, + "learning_rate": 0.0004994659566224462, + "loss": 0.2806, + "step": 15170 + }, + { + "epoch": 0.63, + "grad_norm": 0.63671875, + "learning_rate": 0.0004994652478898207, + "loss": 0.1948, + "step": 15180 + }, + { + "epoch": 0.63, + "grad_norm": 0.7578125, + "learning_rate": 0.0004994645386877286, + "loss": 0.2482, + "step": 15190 + }, + { + "epoch": 0.63, + "grad_norm": 1.1328125, + "learning_rate": 0.0004994638290161712, + "loss": 0.2693, + "step": 15200 + }, + { + "epoch": 0.63, + "grad_norm": 0.51953125, + "learning_rate": 0.0004994631188751498, + "loss": 0.2988, + "step": 15210 + }, + { + "epoch": 0.63, + "grad_norm": 0.62109375, + "learning_rate": 0.0004994624082646657, + "loss": 0.2183, + "step": 15220 + }, + { + "epoch": 0.63, + "grad_norm": 0.79296875, + "learning_rate": 0.0004994616971847202, + "loss": 0.2744, + "step": 15230 + }, + { + "epoch": 0.63, + "grad_norm": 2.0, + "learning_rate": 0.0004994609856353149, + "loss": 0.2219, + "step": 15240 + }, + { + "epoch": 0.63, + "grad_norm": 0.85546875, + "learning_rate": 0.0004994602736164509, + "loss": 0.2668, + "step": 15250 + }, + { + "epoch": 0.63, + "grad_norm": 1.890625, + "learning_rate": 0.0004994595611281296, + "loss": 0.2662, + "step": 15260 + }, + { + "epoch": 0.63, + "grad_norm": 0.8046875, + "learning_rate": 0.0004994588481703524, + "loss": 0.2058, + "step": 15270 + }, + { + "epoch": 0.63, + "grad_norm": 0.341796875, + "learning_rate": 0.0004994581347431206, + "loss": 0.2371, + "step": 15280 + }, + { + "epoch": 0.63, + "grad_norm": 0.59375, + "learning_rate": 0.0004994574208464353, + "loss": 0.2333, + "step": 15290 + }, + { + "epoch": 0.63, + "grad_norm": 0.90625, + "learning_rate": 0.0004994567064802983, + "loss": 0.3015, + "step": 15300 + }, + { + "epoch": 0.63, + "grad_norm": 1.0859375, + "learning_rate": 0.0004994559916447107, + "loss": 0.1547, + "step": 15310 + }, + { + "epoch": 0.63, + "grad_norm": 0.5625, + "learning_rate": 0.0004994552763396738, + "loss": 0.2772, + "step": 15320 + }, + { + "epoch": 0.63, + "grad_norm": 0.294921875, + "learning_rate": 0.000499454560565189, + "loss": 0.2598, + "step": 15330 + }, + { + "epoch": 0.64, + "grad_norm": 0.5390625, + "learning_rate": 0.0004994538443212577, + "loss": 0.2779, + "step": 15340 + }, + { + "epoch": 0.64, + "grad_norm": 1.1015625, + "learning_rate": 0.0004994531276078811, + "loss": 0.2526, + "step": 15350 + }, + { + "epoch": 0.64, + "grad_norm": 1.5546875, + "learning_rate": 0.0004994524104250609, + "loss": 0.2667, + "step": 15360 + }, + { + "epoch": 0.64, + "grad_norm": 0.032958984375, + "learning_rate": 0.000499451692772798, + "loss": 0.23, + "step": 15370 + }, + { + "epoch": 0.64, + "grad_norm": 0.52734375, + "learning_rate": 0.000499450974651094, + "loss": 0.224, + "step": 15380 + }, + { + "epoch": 0.64, + "grad_norm": 0.578125, + "learning_rate": 0.0004994502560599502, + "loss": 0.2363, + "step": 15390 + }, + { + "epoch": 0.64, + "grad_norm": 0.82421875, + "learning_rate": 0.0004994495369993679, + "loss": 0.259, + "step": 15400 + }, + { + "epoch": 0.64, + "grad_norm": 0.58984375, + "learning_rate": 0.0004994488174693486, + "loss": 0.3012, + "step": 15410 + }, + { + "epoch": 0.64, + "grad_norm": 0.83984375, + "learning_rate": 0.0004994480974698935, + "loss": 0.3022, + "step": 15420 + }, + { + "epoch": 0.64, + "grad_norm": 1.6015625, + "learning_rate": 0.0004994473770010041, + "loss": 0.2567, + "step": 15430 + }, + { + "epoch": 0.64, + "grad_norm": 0.455078125, + "learning_rate": 0.0004994466560626816, + "loss": 0.2118, + "step": 15440 + }, + { + "epoch": 0.64, + "grad_norm": 0.298828125, + "learning_rate": 0.0004994459346549275, + "loss": 0.2449, + "step": 15450 + }, + { + "epoch": 0.64, + "grad_norm": 0.76953125, + "learning_rate": 0.0004994452127777431, + "loss": 0.2002, + "step": 15460 + }, + { + "epoch": 0.64, + "grad_norm": 1.4609375, + "learning_rate": 0.0004994444904311296, + "loss": 0.2414, + "step": 15470 + }, + { + "epoch": 0.64, + "grad_norm": 0.61328125, + "learning_rate": 0.0004994437676150886, + "loss": 0.2411, + "step": 15480 + }, + { + "epoch": 0.64, + "grad_norm": 0.8984375, + "learning_rate": 0.0004994430443296214, + "loss": 0.277, + "step": 15490 + }, + { + "epoch": 0.64, + "grad_norm": 1.4921875, + "learning_rate": 0.0004994423205747293, + "loss": 0.2178, + "step": 15500 + }, + { + "epoch": 0.64, + "grad_norm": 0.703125, + "learning_rate": 0.0004994415963504136, + "loss": 0.2688, + "step": 15510 + }, + { + "epoch": 0.64, + "grad_norm": 1.0703125, + "learning_rate": 0.0004994408716566758, + "loss": 0.2433, + "step": 15520 + }, + { + "epoch": 0.64, + "grad_norm": 0.53125, + "learning_rate": 0.0004994401464935173, + "loss": 0.2357, + "step": 15530 + }, + { + "epoch": 0.64, + "grad_norm": 1.03125, + "learning_rate": 0.0004994394208609394, + "loss": 0.2743, + "step": 15540 + }, + { + "epoch": 0.64, + "grad_norm": 0.95703125, + "learning_rate": 0.0004994386947589434, + "loss": 0.2447, + "step": 15550 + }, + { + "epoch": 0.64, + "grad_norm": 0.84765625, + "learning_rate": 0.0004994379681875306, + "loss": 0.1985, + "step": 15560 + }, + { + "epoch": 0.64, + "grad_norm": 0.70703125, + "learning_rate": 0.0004994372411467027, + "loss": 0.233, + "step": 15570 + }, + { + "epoch": 0.65, + "grad_norm": 0.640625, + "learning_rate": 0.0004994365136364606, + "loss": 0.2202, + "step": 15580 + }, + { + "epoch": 0.65, + "grad_norm": 0.490234375, + "learning_rate": 0.0004994357856568061, + "loss": 0.2524, + "step": 15590 + }, + { + "epoch": 0.65, + "grad_norm": 0.462890625, + "learning_rate": 0.0004994350572077403, + "loss": 0.2551, + "step": 15600 + }, + { + "epoch": 0.65, + "grad_norm": 1.0390625, + "learning_rate": 0.0004994343282892647, + "loss": 0.3086, + "step": 15610 + }, + { + "epoch": 0.65, + "grad_norm": 0.70703125, + "learning_rate": 0.0004994335989013806, + "loss": 0.2364, + "step": 15620 + }, + { + "epoch": 0.65, + "grad_norm": 0.0, + "learning_rate": 0.0004994328690440895, + "loss": 0.2736, + "step": 15630 + }, + { + "epoch": 0.65, + "grad_norm": 1.0390625, + "learning_rate": 0.0004994321387173926, + "loss": 0.1977, + "step": 15640 + }, + { + "epoch": 0.65, + "grad_norm": 2.671875, + "learning_rate": 0.0004994314079212913, + "loss": 0.2901, + "step": 15650 + }, + { + "epoch": 0.65, + "grad_norm": 0.494140625, + "learning_rate": 0.0004994306766557871, + "loss": 0.1955, + "step": 15660 + }, + { + "epoch": 0.65, + "grad_norm": 0.47265625, + "learning_rate": 0.0004994299449208813, + "loss": 0.2311, + "step": 15670 + }, + { + "epoch": 0.65, + "grad_norm": 0.640625, + "learning_rate": 0.0004994292127165753, + "loss": 0.2373, + "step": 15680 + }, + { + "epoch": 0.65, + "grad_norm": 0.86328125, + "learning_rate": 0.0004994284800428704, + "loss": 0.1971, + "step": 15690 + }, + { + "epoch": 0.65, + "grad_norm": 1.359375, + "learning_rate": 0.000499427746899768, + "loss": 0.2061, + "step": 15700 + }, + { + "epoch": 0.65, + "grad_norm": 0.84375, + "learning_rate": 0.0004994270132872696, + "loss": 0.2498, + "step": 15710 + }, + { + "epoch": 0.65, + "grad_norm": 0.5078125, + "learning_rate": 0.0004994262792053765, + "loss": 0.1932, + "step": 15720 + }, + { + "epoch": 0.65, + "grad_norm": 0.439453125, + "learning_rate": 0.0004994255446540899, + "loss": 0.2652, + "step": 15730 + }, + { + "epoch": 0.65, + "grad_norm": 0.373046875, + "learning_rate": 0.0004994248096334117, + "loss": 0.2972, + "step": 15740 + }, + { + "epoch": 0.65, + "grad_norm": 0.87890625, + "learning_rate": 0.0004994240741433427, + "loss": 0.239, + "step": 15750 + }, + { + "epoch": 0.65, + "grad_norm": 0.42578125, + "learning_rate": 0.0004994233381838846, + "loss": 0.3088, + "step": 15760 + }, + { + "epoch": 0.65, + "grad_norm": 0.515625, + "learning_rate": 0.0004994226017550386, + "loss": 0.2117, + "step": 15770 + }, + { + "epoch": 0.65, + "grad_norm": 0.9296875, + "learning_rate": 0.0004994218648568063, + "loss": 0.1854, + "step": 15780 + }, + { + "epoch": 0.65, + "grad_norm": 1.21875, + "learning_rate": 0.000499421127489189, + "loss": 0.2247, + "step": 15790 + }, + { + "epoch": 0.65, + "grad_norm": 0.72265625, + "learning_rate": 0.0004994203896521881, + "loss": 0.2258, + "step": 15800 + }, + { + "epoch": 0.65, + "grad_norm": 1.1171875, + "learning_rate": 0.0004994196513458048, + "loss": 0.2129, + "step": 15810 + }, + { + "epoch": 0.66, + "grad_norm": 0.5390625, + "learning_rate": 0.0004994189125700408, + "loss": 0.286, + "step": 15820 + }, + { + "epoch": 0.66, + "grad_norm": 0.78515625, + "learning_rate": 0.0004994181733248974, + "loss": 0.2557, + "step": 15830 + }, + { + "epoch": 0.66, + "grad_norm": 0.68359375, + "learning_rate": 0.0004994174336103757, + "loss": 0.2905, + "step": 15840 + }, + { + "epoch": 0.66, + "grad_norm": 0.62890625, + "learning_rate": 0.0004994166934264775, + "loss": 0.1584, + "step": 15850 + }, + { + "epoch": 0.66, + "grad_norm": 0.66015625, + "learning_rate": 0.000499415952773204, + "loss": 0.2625, + "step": 15860 + }, + { + "epoch": 0.66, + "grad_norm": 0.8203125, + "learning_rate": 0.0004994152116505566, + "loss": 0.183, + "step": 15870 + }, + { + "epoch": 0.66, + "grad_norm": 0.345703125, + "learning_rate": 0.0004994144700585367, + "loss": 0.1989, + "step": 15880 + }, + { + "epoch": 0.66, + "grad_norm": 0.8515625, + "learning_rate": 0.0004994137279971457, + "loss": 0.2535, + "step": 15890 + }, + { + "epoch": 0.66, + "grad_norm": 0.73828125, + "learning_rate": 0.000499412985466385, + "loss": 0.2947, + "step": 15900 + }, + { + "epoch": 0.66, + "grad_norm": 2.125, + "learning_rate": 0.0004994122424662559, + "loss": 0.282, + "step": 15910 + }, + { + "epoch": 0.66, + "grad_norm": 0.5546875, + "learning_rate": 0.0004994114989967601, + "loss": 0.2482, + "step": 15920 + }, + { + "epoch": 0.66, + "grad_norm": 0.294921875, + "learning_rate": 0.0004994107550578986, + "loss": 0.2317, + "step": 15930 + }, + { + "epoch": 0.66, + "grad_norm": 0.62890625, + "learning_rate": 0.0004994100106496731, + "loss": 0.2387, + "step": 15940 + }, + { + "epoch": 0.66, + "grad_norm": 1.15625, + "learning_rate": 0.0004994092657720849, + "loss": 0.3027, + "step": 15950 + }, + { + "epoch": 0.66, + "grad_norm": 0.8828125, + "learning_rate": 0.0004994085204251354, + "loss": 0.2657, + "step": 15960 + }, + { + "epoch": 0.66, + "grad_norm": 0.79296875, + "learning_rate": 0.0004994077746088259, + "loss": 0.2443, + "step": 15970 + }, + { + "epoch": 0.66, + "grad_norm": 0.41796875, + "learning_rate": 0.0004994070283231579, + "loss": 0.2834, + "step": 15980 + }, + { + "epoch": 0.66, + "grad_norm": 1.375, + "learning_rate": 0.0004994062815681329, + "loss": 0.249, + "step": 15990 + }, + { + "epoch": 0.66, + "grad_norm": 0.953125, + "learning_rate": 0.0004994055343437523, + "loss": 0.2528, + "step": 16000 + }, + { + "epoch": 0.66, + "grad_norm": 0.546875, + "learning_rate": 0.0004994047866500172, + "loss": 0.2185, + "step": 16010 + }, + { + "epoch": 0.66, + "grad_norm": 0.80859375, + "learning_rate": 0.0004994040384869293, + "loss": 0.249, + "step": 16020 + }, + { + "epoch": 0.66, + "grad_norm": 0.99609375, + "learning_rate": 0.0004994032898544901, + "loss": 0.2617, + "step": 16030 + }, + { + "epoch": 0.66, + "grad_norm": 0.8125, + "learning_rate": 0.0004994025407527006, + "loss": 0.2215, + "step": 16040 + }, + { + "epoch": 0.66, + "grad_norm": 0.6953125, + "learning_rate": 0.0004994017911815626, + "loss": 0.2899, + "step": 16050 + }, + { + "epoch": 0.67, + "grad_norm": 1.125, + "learning_rate": 0.0004994010411410772, + "loss": 0.2479, + "step": 16060 + }, + { + "epoch": 0.67, + "grad_norm": 1.7890625, + "learning_rate": 0.0004994002906312462, + "loss": 0.2787, + "step": 16070 + }, + { + "epoch": 0.67, + "grad_norm": 0.60546875, + "learning_rate": 0.0004993995396520706, + "loss": 0.2809, + "step": 16080 + }, + { + "epoch": 0.67, + "grad_norm": 0.451171875, + "learning_rate": 0.0004993987882035521, + "loss": 0.2344, + "step": 16090 + }, + { + "epoch": 0.67, + "grad_norm": 0.62890625, + "learning_rate": 0.000499398036285692, + "loss": 0.2347, + "step": 16100 + }, + { + "epoch": 0.67, + "grad_norm": 0.7734375, + "learning_rate": 0.0004993972838984917, + "loss": 0.238, + "step": 16110 + }, + { + "epoch": 0.67, + "grad_norm": 0.50390625, + "learning_rate": 0.0004993965310419526, + "loss": 0.2533, + "step": 16120 + }, + { + "epoch": 0.67, + "grad_norm": 1.3125, + "learning_rate": 0.0004993957777160762, + "loss": 0.3023, + "step": 16130 + }, + { + "epoch": 0.67, + "grad_norm": 0.0, + "learning_rate": 0.0004993950239208639, + "loss": 0.2277, + "step": 16140 + }, + { + "epoch": 0.67, + "grad_norm": 1.96875, + "learning_rate": 0.0004993942696563171, + "loss": 0.2834, + "step": 16150 + }, + { + "epoch": 0.67, + "grad_norm": 0.6015625, + "learning_rate": 0.0004993935149224373, + "loss": 0.2556, + "step": 16160 + }, + { + "epoch": 0.67, + "grad_norm": 0.5546875, + "learning_rate": 0.0004993927597192257, + "loss": 0.2114, + "step": 16170 + }, + { + "epoch": 0.67, + "grad_norm": 0.609375, + "learning_rate": 0.0004993920040466839, + "loss": 0.2365, + "step": 16180 + }, + { + "epoch": 0.67, + "grad_norm": 0.482421875, + "learning_rate": 0.0004993912479048133, + "loss": 0.2753, + "step": 16190 + }, + { + "epoch": 0.67, + "grad_norm": 1.03125, + "learning_rate": 0.0004993904912936152, + "loss": 0.2339, + "step": 16200 + }, + { + "epoch": 0.67, + "grad_norm": 0.62890625, + "learning_rate": 0.0004993897342130913, + "loss": 0.2429, + "step": 16210 + }, + { + "epoch": 0.67, + "grad_norm": 0.77734375, + "learning_rate": 0.0004993889766632428, + "loss": 0.2122, + "step": 16220 + }, + { + "epoch": 0.67, + "grad_norm": 0.65234375, + "learning_rate": 0.0004993882186440712, + "loss": 0.203, + "step": 16230 + }, + { + "epoch": 0.67, + "grad_norm": 1.4296875, + "learning_rate": 0.0004993874601555777, + "loss": 0.3366, + "step": 16240 + }, + { + "epoch": 0.67, + "grad_norm": 0.33984375, + "learning_rate": 0.0004993867011977641, + "loss": 0.2137, + "step": 16250 + }, + { + "epoch": 0.67, + "grad_norm": 0.59765625, + "learning_rate": 0.0004993859417706317, + "loss": 0.2576, + "step": 16260 + }, + { + "epoch": 0.67, + "grad_norm": 0.4609375, + "learning_rate": 0.0004993851818741818, + "loss": 0.2684, + "step": 16270 + }, + { + "epoch": 0.67, + "grad_norm": 0.546875, + "learning_rate": 0.0004993844215084159, + "loss": 0.2314, + "step": 16280 + }, + { + "epoch": 0.67, + "grad_norm": 0.53125, + "learning_rate": 0.0004993836606733355, + "loss": 0.2644, + "step": 16290 + }, + { + "epoch": 0.68, + "grad_norm": 1.1640625, + "learning_rate": 0.000499382899368942, + "loss": 0.337, + "step": 16300 + }, + { + "epoch": 0.68, + "grad_norm": 0.4375, + "learning_rate": 0.0004993821375952368, + "loss": 0.2295, + "step": 16310 + }, + { + "epoch": 0.68, + "grad_norm": 0.384765625, + "learning_rate": 0.0004993813753522214, + "loss": 0.2582, + "step": 16320 + }, + { + "epoch": 0.68, + "grad_norm": 0.859375, + "learning_rate": 0.000499380612639897, + "loss": 0.2319, + "step": 16330 + }, + { + "epoch": 0.68, + "grad_norm": 0.73046875, + "learning_rate": 0.0004993798494582654, + "loss": 0.2881, + "step": 16340 + }, + { + "epoch": 0.68, + "grad_norm": 0.1923828125, + "learning_rate": 0.0004993790858073278, + "loss": 0.2651, + "step": 16350 + }, + { + "epoch": 0.68, + "grad_norm": 0.53515625, + "learning_rate": 0.0004993783216870858, + "loss": 0.2049, + "step": 16360 + }, + { + "epoch": 0.68, + "grad_norm": 0.59765625, + "learning_rate": 0.0004993775570975405, + "loss": 0.2234, + "step": 16370 + }, + { + "epoch": 0.68, + "grad_norm": 2.875, + "learning_rate": 0.0004993767920386937, + "loss": 0.2635, + "step": 16380 + }, + { + "epoch": 0.68, + "grad_norm": 0.5390625, + "learning_rate": 0.0004993760265105467, + "loss": 0.2483, + "step": 16390 + }, + { + "epoch": 0.68, + "grad_norm": 0.90234375, + "learning_rate": 0.0004993752605131009, + "loss": 0.2294, + "step": 16400 + }, + { + "epoch": 0.68, + "grad_norm": 0.6640625, + "learning_rate": 0.0004993744940463579, + "loss": 0.2409, + "step": 16410 + }, + { + "epoch": 0.68, + "grad_norm": 0.6171875, + "learning_rate": 0.0004993737271103189, + "loss": 0.1956, + "step": 16420 + }, + { + "epoch": 0.68, + "grad_norm": 0.6171875, + "learning_rate": 0.0004993729597049856, + "loss": 0.2682, + "step": 16430 + }, + { + "epoch": 0.68, + "grad_norm": 0.75, + "learning_rate": 0.0004993721918303592, + "loss": 0.2362, + "step": 16440 + }, + { + "epoch": 0.68, + "grad_norm": 0.7890625, + "learning_rate": 0.0004993714234864414, + "loss": 0.2321, + "step": 16450 + }, + { + "epoch": 0.68, + "grad_norm": 0.61328125, + "learning_rate": 0.0004993706546732334, + "loss": 0.2932, + "step": 16460 + }, + { + "epoch": 0.68, + "grad_norm": 0.443359375, + "learning_rate": 0.0004993698853907368, + "loss": 0.212, + "step": 16470 + }, + { + "epoch": 0.68, + "grad_norm": 2.640625, + "learning_rate": 0.000499369115638953, + "loss": 0.2107, + "step": 16480 + }, + { + "epoch": 0.68, + "grad_norm": 0.8671875, + "learning_rate": 0.0004993683454178835, + "loss": 0.2679, + "step": 16490 + }, + { + "epoch": 0.68, + "grad_norm": 0.54296875, + "learning_rate": 0.0004993675747275296, + "loss": 0.2585, + "step": 16500 + }, + { + "epoch": 0.68, + "grad_norm": 0.9375, + "learning_rate": 0.0004993668035678929, + "loss": 0.2922, + "step": 16510 + }, + { + "epoch": 0.68, + "grad_norm": 0.62109375, + "learning_rate": 0.0004993660319389748, + "loss": 0.2259, + "step": 16520 + }, + { + "epoch": 0.68, + "grad_norm": 0.640625, + "learning_rate": 0.0004993652598407768, + "loss": 0.264, + "step": 16530 + }, + { + "epoch": 0.69, + "grad_norm": 0.5078125, + "learning_rate": 0.0004993644872733003, + "loss": 0.2662, + "step": 16540 + }, + { + "epoch": 0.69, + "grad_norm": 0.76171875, + "learning_rate": 0.0004993637142365467, + "loss": 0.2643, + "step": 16550 + }, + { + "epoch": 0.69, + "grad_norm": 0.330078125, + "learning_rate": 0.0004993629407305176, + "loss": 0.2561, + "step": 16560 + }, + { + "epoch": 0.69, + "grad_norm": 0.41015625, + "learning_rate": 0.0004993621667552143, + "loss": 0.2218, + "step": 16570 + }, + { + "epoch": 0.69, + "grad_norm": 0.890625, + "learning_rate": 0.0004993613923106383, + "loss": 0.2361, + "step": 16580 + }, + { + "epoch": 0.69, + "grad_norm": 0.421875, + "learning_rate": 0.0004993606173967913, + "loss": 0.2465, + "step": 16590 + }, + { + "epoch": 0.69, + "grad_norm": 0.33203125, + "learning_rate": 0.0004993598420136744, + "loss": 0.1965, + "step": 16600 + }, + { + "epoch": 0.69, + "grad_norm": 0.70703125, + "learning_rate": 0.0004993590661612891, + "loss": 0.2469, + "step": 16610 + }, + { + "epoch": 0.69, + "grad_norm": 0.29296875, + "learning_rate": 0.0004993582898396372, + "loss": 0.2099, + "step": 16620 + }, + { + "epoch": 0.69, + "grad_norm": 0.41796875, + "learning_rate": 0.0004993575130487197, + "loss": 0.1864, + "step": 16630 + }, + { + "epoch": 0.69, + "grad_norm": 0.71484375, + "learning_rate": 0.0004993567357885384, + "loss": 0.2479, + "step": 16640 + }, + { + "epoch": 0.69, + "grad_norm": 0.7265625, + "learning_rate": 0.0004993559580590947, + "loss": 0.2677, + "step": 16650 + }, + { + "epoch": 0.69, + "grad_norm": 0.56640625, + "learning_rate": 0.0004993551798603899, + "loss": 0.2738, + "step": 16660 + }, + { + "epoch": 0.69, + "grad_norm": 0.77734375, + "learning_rate": 0.0004993544011924257, + "loss": 0.2, + "step": 16670 + }, + { + "epoch": 0.69, + "grad_norm": 0.42578125, + "learning_rate": 0.0004993536220552034, + "loss": 0.2493, + "step": 16680 + }, + { + "epoch": 0.69, + "grad_norm": 0.8046875, + "learning_rate": 0.0004993528424487245, + "loss": 0.2753, + "step": 16690 + }, + { + "epoch": 0.69, + "grad_norm": 0.609375, + "learning_rate": 0.0004993520623729905, + "loss": 0.2631, + "step": 16700 + }, + { + "epoch": 0.69, + "grad_norm": 0.578125, + "learning_rate": 0.0004993512818280028, + "loss": 0.2295, + "step": 16710 + }, + { + "epoch": 0.69, + "grad_norm": 0.53515625, + "learning_rate": 0.0004993505008137628, + "loss": 0.2647, + "step": 16720 + }, + { + "epoch": 0.69, + "grad_norm": 0.494140625, + "learning_rate": 0.0004993497193302723, + "loss": 0.2539, + "step": 16730 + }, + { + "epoch": 0.69, + "grad_norm": 0.6328125, + "learning_rate": 0.0004993489373775324, + "loss": 0.2232, + "step": 16740 + }, + { + "epoch": 0.69, + "grad_norm": 0.279296875, + "learning_rate": 0.0004993481549555449, + "loss": 0.2289, + "step": 16750 + }, + { + "epoch": 0.69, + "grad_norm": 0.4609375, + "learning_rate": 0.0004993473720643109, + "loss": 0.291, + "step": 16760 + }, + { + "epoch": 0.69, + "grad_norm": 1.5859375, + "learning_rate": 0.0004993465887038322, + "loss": 0.2992, + "step": 16770 + }, + { + "epoch": 0.7, + "grad_norm": 0.86328125, + "learning_rate": 0.0004993458048741102, + "loss": 0.2626, + "step": 16780 + }, + { + "epoch": 0.7, + "grad_norm": 1.1328125, + "learning_rate": 0.0004993450205751462, + "loss": 0.2139, + "step": 16790 + }, + { + "epoch": 0.7, + "grad_norm": 0.984375, + "learning_rate": 0.0004993442358069419, + "loss": 0.243, + "step": 16800 + }, + { + "epoch": 0.7, + "grad_norm": 1.2265625, + "learning_rate": 0.0004993434505694986, + "loss": 0.2542, + "step": 16810 + }, + { + "epoch": 0.7, + "grad_norm": 1.0859375, + "learning_rate": 0.0004993426648628179, + "loss": 0.2659, + "step": 16820 + }, + { + "epoch": 0.7, + "grad_norm": 0.439453125, + "learning_rate": 0.0004993418786869012, + "loss": 0.2235, + "step": 16830 + }, + { + "epoch": 0.7, + "grad_norm": 0.87890625, + "learning_rate": 0.0004993410920417499, + "loss": 0.2194, + "step": 16840 + }, + { + "epoch": 0.7, + "grad_norm": 1.09375, + "learning_rate": 0.0004993403049273657, + "loss": 0.286, + "step": 16850 + }, + { + "epoch": 0.7, + "grad_norm": 0.1962890625, + "learning_rate": 0.0004993395173437501, + "loss": 0.261, + "step": 16860 + }, + { + "epoch": 0.7, + "grad_norm": 0.57421875, + "learning_rate": 0.0004993387292909042, + "loss": 0.2352, + "step": 16870 + }, + { + "epoch": 0.7, + "grad_norm": 1.3203125, + "learning_rate": 0.0004993379407688299, + "loss": 0.2301, + "step": 16880 + }, + { + "epoch": 0.7, + "grad_norm": 0.4609375, + "learning_rate": 0.0004993371517775285, + "loss": 0.2677, + "step": 16890 + }, + { + "epoch": 0.7, + "grad_norm": 0.81640625, + "learning_rate": 0.0004993363623170016, + "loss": 0.2974, + "step": 16900 + }, + { + "epoch": 0.7, + "grad_norm": 0.77734375, + "learning_rate": 0.0004993355723872504, + "loss": 0.2538, + "step": 16910 + }, + { + "epoch": 0.7, + "grad_norm": 0.5546875, + "learning_rate": 0.0004993347819882767, + "loss": 0.2575, + "step": 16920 + }, + { + "epoch": 0.7, + "grad_norm": 0.65234375, + "learning_rate": 0.0004993339911200819, + "loss": 0.2861, + "step": 16930 + }, + { + "epoch": 0.7, + "grad_norm": 0.423828125, + "learning_rate": 0.0004993331997826673, + "loss": 0.2344, + "step": 16940 + }, + { + "epoch": 0.7, + "grad_norm": 0.71875, + "learning_rate": 0.0004993324079760346, + "loss": 0.2709, + "step": 16950 + }, + { + "epoch": 0.7, + "grad_norm": 1.2734375, + "learning_rate": 0.0004993316157001853, + "loss": 0.2834, + "step": 16960 + }, + { + "epoch": 0.7, + "grad_norm": 1.15625, + "learning_rate": 0.0004993308229551208, + "loss": 0.2285, + "step": 16970 + }, + { + "epoch": 0.7, + "grad_norm": 0.79296875, + "learning_rate": 0.0004993300297408426, + "loss": 0.2872, + "step": 16980 + }, + { + "epoch": 0.7, + "grad_norm": 0.703125, + "learning_rate": 0.0004993292360573522, + "loss": 0.2266, + "step": 16990 + }, + { + "epoch": 0.7, + "grad_norm": 0.0, + "learning_rate": 0.0004993284419046511, + "loss": 0.2214, + "step": 17000 + }, + { + "epoch": 0.7, + "grad_norm": 0.275390625, + "learning_rate": 0.0004993276472827408, + "loss": 0.1889, + "step": 17010 + }, + { + "epoch": 0.7, + "grad_norm": 0.74609375, + "learning_rate": 0.0004993268521916228, + "loss": 0.2453, + "step": 17020 + }, + { + "epoch": 0.71, + "grad_norm": 1.0703125, + "learning_rate": 0.0004993260566312986, + "loss": 0.249, + "step": 17030 + }, + { + "epoch": 0.71, + "grad_norm": 0.2216796875, + "learning_rate": 0.0004993252606017698, + "loss": 0.165, + "step": 17040 + }, + { + "epoch": 0.71, + "grad_norm": 0.349609375, + "learning_rate": 0.0004993244641030376, + "loss": 0.2713, + "step": 17050 + }, + { + "epoch": 0.71, + "grad_norm": 0.58984375, + "learning_rate": 0.0004993236671351038, + "loss": 0.2168, + "step": 17060 + }, + { + "epoch": 0.71, + "grad_norm": 0.435546875, + "learning_rate": 0.0004993228696979697, + "loss": 0.2315, + "step": 17070 + }, + { + "epoch": 0.71, + "grad_norm": 2.109375, + "learning_rate": 0.0004993220717916369, + "loss": 0.2322, + "step": 17080 + }, + { + "epoch": 0.71, + "grad_norm": 1.0625, + "learning_rate": 0.0004993212734161069, + "loss": 0.2528, + "step": 17090 + }, + { + "epoch": 0.71, + "grad_norm": 1.0234375, + "learning_rate": 0.0004993204745713811, + "loss": 0.2223, + "step": 17100 + }, + { + "epoch": 0.71, + "grad_norm": 0.55078125, + "learning_rate": 0.0004993196752574613, + "loss": 0.3006, + "step": 17110 + }, + { + "epoch": 0.71, + "grad_norm": 1.5, + "learning_rate": 0.0004993188754743486, + "loss": 0.2415, + "step": 17120 + }, + { + "epoch": 0.71, + "grad_norm": 0.484375, + "learning_rate": 0.0004993180752220449, + "loss": 0.2322, + "step": 17130 + }, + { + "epoch": 0.71, + "grad_norm": 0.66796875, + "learning_rate": 0.0004993172745005513, + "loss": 0.1904, + "step": 17140 + }, + { + "epoch": 0.71, + "grad_norm": 1.1171875, + "learning_rate": 0.0004993164733098696, + "loss": 0.2586, + "step": 17150 + }, + { + "epoch": 0.71, + "grad_norm": 0.310546875, + "learning_rate": 0.0004993156716500012, + "loss": 0.2273, + "step": 17160 + }, + { + "epoch": 0.71, + "grad_norm": 0.67578125, + "learning_rate": 0.0004993148695209477, + "loss": 0.2098, + "step": 17170 + }, + { + "epoch": 0.71, + "grad_norm": 0.8984375, + "learning_rate": 0.0004993140669227105, + "loss": 0.2482, + "step": 17180 + }, + { + "epoch": 0.71, + "grad_norm": 0.94921875, + "learning_rate": 0.0004993132638552911, + "loss": 0.2544, + "step": 17190 + }, + { + "epoch": 0.71, + "grad_norm": 0.58984375, + "learning_rate": 0.0004993124603186911, + "loss": 0.1878, + "step": 17200 + }, + { + "epoch": 0.71, + "grad_norm": 0.55078125, + "learning_rate": 0.000499311656312912, + "loss": 0.2489, + "step": 17210 + }, + { + "epoch": 0.71, + "grad_norm": 0.9375, + "learning_rate": 0.0004993108518379552, + "loss": 0.2692, + "step": 17220 + }, + { + "epoch": 0.71, + "grad_norm": 0.80859375, + "learning_rate": 0.0004993100468938225, + "loss": 0.2249, + "step": 17230 + }, + { + "epoch": 0.71, + "grad_norm": 0.84375, + "learning_rate": 0.0004993092414805149, + "loss": 0.2435, + "step": 17240 + }, + { + "epoch": 0.71, + "grad_norm": 0.76953125, + "learning_rate": 0.0004993084355980345, + "loss": 0.194, + "step": 17250 + }, + { + "epoch": 0.71, + "grad_norm": 0.92578125, + "learning_rate": 0.0004993076292463824, + "loss": 0.2787, + "step": 17260 + }, + { + "epoch": 0.72, + "grad_norm": 0.5390625, + "learning_rate": 0.0004993068224255603, + "loss": 0.3071, + "step": 17270 + }, + { + "epoch": 0.72, + "grad_norm": 1.375, + "learning_rate": 0.0004993060151355697, + "loss": 0.2312, + "step": 17280 + }, + { + "epoch": 0.72, + "grad_norm": 0.796875, + "learning_rate": 0.0004993052073764122, + "loss": 0.2134, + "step": 17290 + }, + { + "epoch": 0.72, + "grad_norm": 1.8671875, + "learning_rate": 0.000499304399148089, + "loss": 0.2643, + "step": 17300 + }, + { + "epoch": 0.72, + "grad_norm": 0.59375, + "learning_rate": 0.000499303590450602, + "loss": 0.2533, + "step": 17310 + }, + { + "epoch": 0.72, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004993027812839525, + "loss": 0.2177, + "step": 17320 + }, + { + "epoch": 0.72, + "grad_norm": 0.59765625, + "learning_rate": 0.0004993019716481422, + "loss": 0.253, + "step": 17330 + }, + { + "epoch": 0.72, + "grad_norm": 0.515625, + "learning_rate": 0.0004993011615431723, + "loss": 0.2803, + "step": 17340 + }, + { + "epoch": 0.72, + "grad_norm": 0.376953125, + "learning_rate": 0.0004993003509690448, + "loss": 0.2332, + "step": 17350 + }, + { + "epoch": 0.72, + "grad_norm": 0.94140625, + "learning_rate": 0.0004992995399257608, + "loss": 0.2953, + "step": 17360 + }, + { + "epoch": 0.72, + "grad_norm": 0.68359375, + "learning_rate": 0.000499298728413322, + "loss": 0.2093, + "step": 17370 + }, + { + "epoch": 0.72, + "grad_norm": 0.6171875, + "learning_rate": 0.00049929791643173, + "loss": 0.2506, + "step": 17380 + }, + { + "epoch": 0.72, + "grad_norm": 0.37109375, + "learning_rate": 0.0004992971039809861, + "loss": 0.245, + "step": 17390 + }, + { + "epoch": 0.72, + "grad_norm": 1.6171875, + "learning_rate": 0.0004992962910610921, + "loss": 0.2056, + "step": 17400 + }, + { + "epoch": 0.72, + "grad_norm": 1.25, + "learning_rate": 0.0004992954776720493, + "loss": 0.2252, + "step": 17410 + }, + { + "epoch": 0.72, + "grad_norm": 0.671875, + "learning_rate": 0.0004992946638138594, + "loss": 0.1891, + "step": 17420 + }, + { + "epoch": 0.72, + "grad_norm": 0.52734375, + "learning_rate": 0.0004992938494865238, + "loss": 0.2621, + "step": 17430 + }, + { + "epoch": 0.72, + "grad_norm": 0.6171875, + "learning_rate": 0.0004992930346900442, + "loss": 0.2367, + "step": 17440 + }, + { + "epoch": 0.72, + "grad_norm": 0.107421875, + "learning_rate": 0.0004992922194244219, + "loss": 0.2166, + "step": 17450 + }, + { + "epoch": 0.72, + "grad_norm": 1.4765625, + "learning_rate": 0.0004992914036896586, + "loss": 0.2661, + "step": 17460 + }, + { + "epoch": 0.72, + "grad_norm": 0.4609375, + "learning_rate": 0.000499290587485756, + "loss": 0.2547, + "step": 17470 + }, + { + "epoch": 0.72, + "grad_norm": 1.2421875, + "learning_rate": 0.0004992897708127152, + "loss": 0.2819, + "step": 17480 + }, + { + "epoch": 0.72, + "grad_norm": 0.498046875, + "learning_rate": 0.000499288953670538, + "loss": 0.2292, + "step": 17490 + }, + { + "epoch": 0.72, + "grad_norm": 1.234375, + "learning_rate": 0.000499288136059226, + "loss": 0.255, + "step": 17500 + }, + { + "epoch": 0.73, + "grad_norm": 0.5859375, + "learning_rate": 0.0004992873179787806, + "loss": 0.2223, + "step": 17510 + }, + { + "epoch": 0.73, + "grad_norm": 1.3125, + "learning_rate": 0.0004992864994292034, + "loss": 0.2639, + "step": 17520 + }, + { + "epoch": 0.73, + "grad_norm": 0.5, + "learning_rate": 0.000499285680410496, + "loss": 0.2631, + "step": 17530 + }, + { + "epoch": 0.73, + "grad_norm": 0.6796875, + "learning_rate": 0.0004992848609226597, + "loss": 0.265, + "step": 17540 + }, + { + "epoch": 0.73, + "grad_norm": 1.0, + "learning_rate": 0.0004992840409656963, + "loss": 0.2531, + "step": 17550 + }, + { + "epoch": 0.73, + "grad_norm": 1.78125, + "learning_rate": 0.0004992832205396073, + "loss": 0.274, + "step": 17560 + }, + { + "epoch": 0.73, + "grad_norm": 0.81640625, + "learning_rate": 0.0004992823996443942, + "loss": 0.2619, + "step": 17570 + }, + { + "epoch": 0.73, + "grad_norm": 0.59375, + "learning_rate": 0.0004992815782800585, + "loss": 0.1951, + "step": 17580 + }, + { + "epoch": 0.73, + "grad_norm": 1.1015625, + "learning_rate": 0.0004992807564466017, + "loss": 0.2277, + "step": 17590 + }, + { + "epoch": 0.73, + "grad_norm": 0.5234375, + "learning_rate": 0.0004992799341440255, + "loss": 0.2044, + "step": 17600 + }, + { + "epoch": 0.73, + "grad_norm": 1.171875, + "learning_rate": 0.0004992791113723314, + "loss": 0.2635, + "step": 17610 + }, + { + "epoch": 0.73, + "grad_norm": 1.15625, + "learning_rate": 0.000499278288131521, + "loss": 0.234, + "step": 17620 + }, + { + "epoch": 0.73, + "grad_norm": 0.72265625, + "learning_rate": 0.0004992774644215957, + "loss": 0.2378, + "step": 17630 + }, + { + "epoch": 0.73, + "grad_norm": 0.625, + "learning_rate": 0.0004992766402425571, + "loss": 0.2155, + "step": 17640 + }, + { + "epoch": 0.73, + "grad_norm": 0.51171875, + "learning_rate": 0.000499275815594407, + "loss": 0.2374, + "step": 17650 + }, + { + "epoch": 0.73, + "grad_norm": 0.71875, + "learning_rate": 0.0004992749904771466, + "loss": 0.2154, + "step": 17660 + }, + { + "epoch": 0.73, + "grad_norm": 0.54296875, + "learning_rate": 0.0004992741648907775, + "loss": 0.2423, + "step": 17670 + }, + { + "epoch": 0.73, + "grad_norm": 0.451171875, + "learning_rate": 0.0004992733388353013, + "loss": 0.2596, + "step": 17680 + }, + { + "epoch": 0.73, + "grad_norm": 1.0234375, + "learning_rate": 0.0004992725123107198, + "loss": 0.2236, + "step": 17690 + }, + { + "epoch": 0.73, + "grad_norm": 0.828125, + "learning_rate": 0.0004992716853170342, + "loss": 0.1911, + "step": 17700 + }, + { + "epoch": 0.73, + "grad_norm": 0.435546875, + "learning_rate": 0.0004992708578542462, + "loss": 0.2483, + "step": 17710 + }, + { + "epoch": 0.73, + "grad_norm": 0.53125, + "learning_rate": 0.0004992700299223575, + "loss": 0.225, + "step": 17720 + }, + { + "epoch": 0.73, + "grad_norm": 1.3359375, + "learning_rate": 0.0004992692015213695, + "loss": 0.2102, + "step": 17730 + }, + { + "epoch": 0.73, + "grad_norm": 0.78515625, + "learning_rate": 0.0004992683726512836, + "loss": 0.2057, + "step": 17740 + }, + { + "epoch": 0.74, + "grad_norm": 0.59375, + "learning_rate": 0.0004992675433121017, + "loss": 0.2417, + "step": 17750 + }, + { + "epoch": 0.74, + "grad_norm": 1.109375, + "learning_rate": 0.0004992667135038252, + "loss": 0.2267, + "step": 17760 + }, + { + "epoch": 0.74, + "grad_norm": 1.453125, + "learning_rate": 0.0004992658832264557, + "loss": 0.1883, + "step": 17770 + }, + { + "epoch": 0.74, + "grad_norm": 0.5703125, + "learning_rate": 0.0004992650524799946, + "loss": 0.2357, + "step": 17780 + }, + { + "epoch": 0.74, + "grad_norm": 0.27734375, + "learning_rate": 0.0004992642212644436, + "loss": 0.249, + "step": 17790 + }, + { + "epoch": 0.74, + "grad_norm": 1.1953125, + "learning_rate": 0.0004992633895798043, + "loss": 0.2357, + "step": 17800 + }, + { + "epoch": 0.74, + "grad_norm": 0.68359375, + "learning_rate": 0.0004992625574260783, + "loss": 0.2548, + "step": 17810 + }, + { + "epoch": 0.74, + "grad_norm": 1.8125, + "learning_rate": 0.000499261724803267, + "loss": 0.2284, + "step": 17820 + }, + { + "epoch": 0.74, + "grad_norm": 0.53125, + "learning_rate": 0.000499260891711372, + "loss": 0.1952, + "step": 17830 + }, + { + "epoch": 0.74, + "grad_norm": 0.416015625, + "learning_rate": 0.0004992600581503949, + "loss": 0.1884, + "step": 17840 + }, + { + "epoch": 0.74, + "grad_norm": 0.765625, + "learning_rate": 0.0004992592241203375, + "loss": 0.2618, + "step": 17850 + }, + { + "epoch": 0.74, + "grad_norm": 0.6640625, + "learning_rate": 0.000499258389621201, + "loss": 0.2842, + "step": 17860 + }, + { + "epoch": 0.74, + "grad_norm": 0.74609375, + "learning_rate": 0.0004992575546529871, + "loss": 0.2695, + "step": 17870 + }, + { + "epoch": 0.74, + "grad_norm": 0.671875, + "learning_rate": 0.0004992567192156975, + "loss": 0.3269, + "step": 17880 + }, + { + "epoch": 0.74, + "grad_norm": 0.34375, + "learning_rate": 0.0004992558833093335, + "loss": 0.2118, + "step": 17890 + }, + { + "epoch": 0.74, + "grad_norm": 0.625, + "learning_rate": 0.000499255046933897, + "loss": 0.2652, + "step": 17900 + }, + { + "epoch": 0.74, + "grad_norm": 0.79296875, + "learning_rate": 0.0004992542100893894, + "loss": 0.2799, + "step": 17910 + }, + { + "epoch": 0.74, + "grad_norm": 1.1796875, + "learning_rate": 0.0004992533727758122, + "loss": 0.3002, + "step": 17920 + }, + { + "epoch": 0.74, + "grad_norm": 0.70703125, + "learning_rate": 0.0004992525349931672, + "loss": 0.2393, + "step": 17930 + }, + { + "epoch": 0.74, + "grad_norm": 2.34375, + "learning_rate": 0.0004992516967414559, + "loss": 0.2784, + "step": 17940 + }, + { + "epoch": 0.74, + "grad_norm": 0.52734375, + "learning_rate": 0.0004992508580206797, + "loss": 0.2165, + "step": 17950 + }, + { + "epoch": 0.74, + "grad_norm": 0.765625, + "learning_rate": 0.0004992500188308403, + "loss": 0.2561, + "step": 17960 + }, + { + "epoch": 0.74, + "grad_norm": 1.3046875, + "learning_rate": 0.0004992491791719393, + "loss": 0.3094, + "step": 17970 + }, + { + "epoch": 0.74, + "grad_norm": 0.640625, + "learning_rate": 0.0004992483390439782, + "loss": 0.2693, + "step": 17980 + }, + { + "epoch": 0.75, + "grad_norm": 0.67578125, + "learning_rate": 0.0004992474984469587, + "loss": 0.2246, + "step": 17990 + }, + { + "epoch": 0.75, + "grad_norm": 1.6953125, + "learning_rate": 0.0004992466573808823, + "loss": 0.2121, + "step": 18000 + }, + { + "epoch": 0.75, + "grad_norm": 0.63671875, + "learning_rate": 0.0004992458158457507, + "loss": 0.2536, + "step": 18010 + }, + { + "epoch": 0.75, + "grad_norm": 0.640625, + "learning_rate": 0.0004992449738415653, + "loss": 0.265, + "step": 18020 + }, + { + "epoch": 0.75, + "grad_norm": 0.82421875, + "learning_rate": 0.0004992441313683278, + "loss": 0.2412, + "step": 18030 + }, + { + "epoch": 0.75, + "grad_norm": 0.828125, + "learning_rate": 0.0004992432884260398, + "loss": 0.2425, + "step": 18040 + }, + { + "epoch": 0.75, + "grad_norm": 0.58203125, + "learning_rate": 0.0004992424450147028, + "loss": 0.1954, + "step": 18050 + }, + { + "epoch": 0.75, + "grad_norm": 1.609375, + "learning_rate": 0.0004992416011343185, + "loss": 0.2031, + "step": 18060 + }, + { + "epoch": 0.75, + "grad_norm": 1.34375, + "learning_rate": 0.0004992407567848883, + "loss": 0.2401, + "step": 18070 + }, + { + "epoch": 0.75, + "grad_norm": 1.234375, + "learning_rate": 0.000499239911966414, + "loss": 0.2886, + "step": 18080 + }, + { + "epoch": 0.75, + "grad_norm": 0.76953125, + "learning_rate": 0.0004992390666788971, + "loss": 0.3007, + "step": 18090 + }, + { + "epoch": 0.75, + "grad_norm": 0.51953125, + "learning_rate": 0.0004992382209223392, + "loss": 0.2466, + "step": 18100 + }, + { + "epoch": 0.75, + "grad_norm": 0.515625, + "learning_rate": 0.0004992373746967418, + "loss": 0.1946, + "step": 18110 + }, + { + "epoch": 0.75, + "grad_norm": 0.47265625, + "learning_rate": 0.0004992365280021066, + "loss": 0.2871, + "step": 18120 + }, + { + "epoch": 0.75, + "grad_norm": 0.8671875, + "learning_rate": 0.0004992356808384352, + "loss": 0.177, + "step": 18130 + }, + { + "epoch": 0.75, + "grad_norm": 0.255859375, + "learning_rate": 0.0004992348332057292, + "loss": 0.2067, + "step": 18140 + }, + { + "epoch": 0.75, + "grad_norm": 0.50390625, + "learning_rate": 0.0004992339851039901, + "loss": 0.2346, + "step": 18150 + }, + { + "epoch": 0.75, + "grad_norm": 0.5703125, + "learning_rate": 0.0004992331365332196, + "loss": 0.2302, + "step": 18160 + }, + { + "epoch": 0.75, + "grad_norm": 0.2890625, + "learning_rate": 0.0004992322874934192, + "loss": 0.2224, + "step": 18170 + }, + { + "epoch": 0.75, + "grad_norm": 0.36328125, + "learning_rate": 0.0004992314379845906, + "loss": 0.2123, + "step": 18180 + }, + { + "epoch": 0.75, + "grad_norm": 1.0546875, + "learning_rate": 0.0004992305880067353, + "loss": 0.2437, + "step": 18190 + }, + { + "epoch": 0.75, + "grad_norm": 0.5, + "learning_rate": 0.000499229737559855, + "loss": 0.2096, + "step": 18200 + }, + { + "epoch": 0.75, + "grad_norm": 0.25, + "learning_rate": 0.0004992288866439513, + "loss": 0.1769, + "step": 18210 + }, + { + "epoch": 0.75, + "grad_norm": 0.953125, + "learning_rate": 0.0004992280352590256, + "loss": 0.2369, + "step": 18220 + }, + { + "epoch": 0.76, + "grad_norm": 1.4453125, + "learning_rate": 0.0004992271834050797, + "loss": 0.2499, + "step": 18230 + }, + { + "epoch": 0.76, + "grad_norm": 0.6171875, + "learning_rate": 0.0004992263310821152, + "loss": 0.2691, + "step": 18240 + }, + { + "epoch": 0.76, + "grad_norm": 0.52734375, + "learning_rate": 0.0004992254782901337, + "loss": 0.2555, + "step": 18250 + }, + { + "epoch": 0.76, + "grad_norm": 1.265625, + "learning_rate": 0.0004992246250291367, + "loss": 0.2829, + "step": 18260 + }, + { + "epoch": 0.76, + "grad_norm": 0.62890625, + "learning_rate": 0.0004992237712991258, + "loss": 0.1797, + "step": 18270 + }, + { + "epoch": 0.76, + "grad_norm": 0.55078125, + "learning_rate": 0.0004992229171001028, + "loss": 0.2878, + "step": 18280 + }, + { + "epoch": 0.76, + "grad_norm": 0.74609375, + "learning_rate": 0.0004992220624320692, + "loss": 0.2282, + "step": 18290 + }, + { + "epoch": 0.76, + "grad_norm": 1.3515625, + "learning_rate": 0.0004992212072950265, + "loss": 0.2757, + "step": 18300 + }, + { + "epoch": 0.76, + "grad_norm": 0.4609375, + "learning_rate": 0.0004992203516889764, + "loss": 0.2269, + "step": 18310 + }, + { + "epoch": 0.76, + "grad_norm": 0.53515625, + "learning_rate": 0.0004992194956139205, + "loss": 0.2339, + "step": 18320 + }, + { + "epoch": 0.76, + "grad_norm": 0.423828125, + "learning_rate": 0.0004992186390698606, + "loss": 0.2265, + "step": 18330 + }, + { + "epoch": 0.76, + "grad_norm": 0.51953125, + "learning_rate": 0.0004992177820567979, + "loss": 0.2385, + "step": 18340 + }, + { + "epoch": 0.76, + "grad_norm": 0.71875, + "learning_rate": 0.0004992169245747343, + "loss": 0.3103, + "step": 18350 + }, + { + "epoch": 0.76, + "grad_norm": 0.98046875, + "learning_rate": 0.0004992160666236714, + "loss": 0.2667, + "step": 18360 + }, + { + "epoch": 0.76, + "grad_norm": 0.74609375, + "learning_rate": 0.0004992152082036108, + "loss": 0.2514, + "step": 18370 + }, + { + "epoch": 0.76, + "grad_norm": 0.435546875, + "learning_rate": 0.0004992143493145542, + "loss": 0.2051, + "step": 18380 + }, + { + "epoch": 0.76, + "grad_norm": 1.609375, + "learning_rate": 0.0004992134899565029, + "loss": 0.2016, + "step": 18390 + }, + { + "epoch": 0.76, + "grad_norm": 0.5859375, + "learning_rate": 0.0004992126301294588, + "loss": 0.243, + "step": 18400 + }, + { + "epoch": 0.76, + "grad_norm": 1.0078125, + "learning_rate": 0.0004992117698334234, + "loss": 0.2704, + "step": 18410 + }, + { + "epoch": 0.76, + "grad_norm": 0.6484375, + "learning_rate": 0.0004992109090683984, + "loss": 0.2169, + "step": 18420 + }, + { + "epoch": 0.76, + "grad_norm": 0.4375, + "learning_rate": 0.0004992100478343854, + "loss": 0.2469, + "step": 18430 + }, + { + "epoch": 0.76, + "grad_norm": 0.8671875, + "learning_rate": 0.000499209186131386, + "loss": 0.2797, + "step": 18440 + }, + { + "epoch": 0.76, + "grad_norm": 0.9140625, + "learning_rate": 0.0004992083239594018, + "loss": 0.2794, + "step": 18450 + }, + { + "epoch": 0.76, + "grad_norm": 0.8046875, + "learning_rate": 0.0004992074613184345, + "loss": 0.2631, + "step": 18460 + }, + { + "epoch": 0.77, + "grad_norm": 0.412109375, + "learning_rate": 0.0004992065982084857, + "loss": 0.224, + "step": 18470 + }, + { + "epoch": 0.77, + "grad_norm": 0.828125, + "learning_rate": 0.0004992057346295569, + "loss": 0.2397, + "step": 18480 + }, + { + "epoch": 0.77, + "grad_norm": 0.34765625, + "learning_rate": 0.0004992048705816498, + "loss": 0.2209, + "step": 18490 + }, + { + "epoch": 0.77, + "grad_norm": 0.69921875, + "learning_rate": 0.0004992040060647661, + "loss": 0.2347, + "step": 18500 + }, + { + "epoch": 0.77, + "grad_norm": 1.203125, + "learning_rate": 0.0004992031410789074, + "loss": 0.2044, + "step": 18510 + }, + { + "epoch": 0.77, + "grad_norm": 1.3359375, + "learning_rate": 0.0004992022756240752, + "loss": 0.2877, + "step": 18520 + }, + { + "epoch": 0.77, + "grad_norm": 0.5, + "learning_rate": 0.0004992014097002713, + "loss": 0.1885, + "step": 18530 + }, + { + "epoch": 0.77, + "grad_norm": 0.69140625, + "learning_rate": 0.0004992005433074973, + "loss": 0.2601, + "step": 18540 + }, + { + "epoch": 0.77, + "grad_norm": 0.087890625, + "learning_rate": 0.0004991996764457547, + "loss": 0.2354, + "step": 18550 + }, + { + "epoch": 0.77, + "grad_norm": 0.5703125, + "learning_rate": 0.0004991988091150453, + "loss": 0.245, + "step": 18560 + }, + { + "epoch": 0.77, + "grad_norm": 0.5, + "learning_rate": 0.0004991979413153705, + "loss": 0.2392, + "step": 18570 + }, + { + "epoch": 0.77, + "grad_norm": 0.8203125, + "learning_rate": 0.0004991970730467322, + "loss": 0.2513, + "step": 18580 + }, + { + "epoch": 0.77, + "grad_norm": 0.1640625, + "learning_rate": 0.000499196204309132, + "loss": 0.2127, + "step": 18590 + }, + { + "epoch": 0.77, + "grad_norm": 0.390625, + "learning_rate": 0.0004991953351025714, + "loss": 0.229, + "step": 18600 + }, + { + "epoch": 0.77, + "grad_norm": 0.578125, + "learning_rate": 0.000499194465427052, + "loss": 0.236, + "step": 18610 + }, + { + "epoch": 0.77, + "grad_norm": 0.96484375, + "learning_rate": 0.0004991935952825756, + "loss": 0.2351, + "step": 18620 + }, + { + "epoch": 0.77, + "grad_norm": 0.90234375, + "learning_rate": 0.0004991927246691438, + "loss": 0.229, + "step": 18630 + }, + { + "epoch": 0.77, + "grad_norm": 0.2421875, + "learning_rate": 0.0004991918535867581, + "loss": 0.2497, + "step": 18640 + }, + { + "epoch": 0.77, + "grad_norm": 0.439453125, + "learning_rate": 0.0004991909820354204, + "loss": 0.2857, + "step": 18650 + }, + { + "epoch": 0.77, + "grad_norm": 0.322265625, + "learning_rate": 0.000499190110015132, + "loss": 0.2132, + "step": 18660 + }, + { + "epoch": 0.77, + "grad_norm": 1.0390625, + "learning_rate": 0.0004991892375258948, + "loss": 0.2546, + "step": 18670 + }, + { + "epoch": 0.77, + "grad_norm": 0.48828125, + "learning_rate": 0.0004991883645677103, + "loss": 0.2352, + "step": 18680 + }, + { + "epoch": 0.77, + "grad_norm": 0.53515625, + "learning_rate": 0.0004991874911405804, + "loss": 0.1868, + "step": 18690 + }, + { + "epoch": 0.77, + "grad_norm": 1.2109375, + "learning_rate": 0.0004991866172445065, + "loss": 0.273, + "step": 18700 + }, + { + "epoch": 0.77, + "grad_norm": 0.5234375, + "learning_rate": 0.0004991857428794901, + "loss": 0.2649, + "step": 18710 + }, + { + "epoch": 0.78, + "grad_norm": 0.46875, + "learning_rate": 0.0004991848680455332, + "loss": 0.2093, + "step": 18720 + }, + { + "epoch": 0.78, + "grad_norm": 1.0078125, + "learning_rate": 0.0004991839927426373, + "loss": 0.2695, + "step": 18730 + }, + { + "epoch": 0.78, + "grad_norm": 2.984375, + "learning_rate": 0.0004991831169708039, + "loss": 0.2263, + "step": 18740 + }, + { + "epoch": 0.78, + "grad_norm": 0.90234375, + "learning_rate": 0.0004991822407300349, + "loss": 0.2336, + "step": 18750 + }, + { + "epoch": 0.78, + "grad_norm": 0.765625, + "learning_rate": 0.0004991813640203318, + "loss": 0.259, + "step": 18760 + }, + { + "epoch": 0.78, + "grad_norm": 3.359375, + "learning_rate": 0.0004991804868416963, + "loss": 0.1828, + "step": 18770 + }, + { + "epoch": 0.78, + "grad_norm": 0.4765625, + "learning_rate": 0.00049917960919413, + "loss": 0.2196, + "step": 18780 + }, + { + "epoch": 0.78, + "grad_norm": 0.796875, + "learning_rate": 0.0004991787310776346, + "loss": 0.1959, + "step": 18790 + }, + { + "epoch": 0.78, + "grad_norm": 1.3984375, + "learning_rate": 0.0004991778524922117, + "loss": 0.2624, + "step": 18800 + }, + { + "epoch": 0.78, + "grad_norm": 0.3984375, + "learning_rate": 0.000499176973437863, + "loss": 0.2861, + "step": 18810 + }, + { + "epoch": 0.78, + "grad_norm": 1.1953125, + "learning_rate": 0.0004991760939145902, + "loss": 0.2527, + "step": 18820 + }, + { + "epoch": 0.78, + "grad_norm": 0.76171875, + "learning_rate": 0.0004991752139223949, + "loss": 0.2685, + "step": 18830 + }, + { + "epoch": 0.78, + "grad_norm": 1.0859375, + "learning_rate": 0.0004991743334612787, + "loss": 0.2376, + "step": 18840 + }, + { + "epoch": 0.78, + "grad_norm": 0.5703125, + "learning_rate": 0.0004991734525312434, + "loss": 0.2262, + "step": 18850 + }, + { + "epoch": 0.78, + "grad_norm": 0.357421875, + "learning_rate": 0.0004991725711322905, + "loss": 0.2015, + "step": 18860 + }, + { + "epoch": 0.78, + "grad_norm": 0.63671875, + "learning_rate": 0.0004991716892644218, + "loss": 0.261, + "step": 18870 + }, + { + "epoch": 0.78, + "grad_norm": 0.41015625, + "learning_rate": 0.0004991708069276388, + "loss": 0.2486, + "step": 18880 + }, + { + "epoch": 0.78, + "grad_norm": 1.0078125, + "learning_rate": 0.0004991699241219433, + "loss": 0.2392, + "step": 18890 + }, + { + "epoch": 0.78, + "grad_norm": 0.85546875, + "learning_rate": 0.0004991690408473368, + "loss": 0.2374, + "step": 18900 + }, + { + "epoch": 0.78, + "grad_norm": 2.5625, + "learning_rate": 0.0004991681571038212, + "loss": 0.2469, + "step": 18910 + }, + { + "epoch": 0.78, + "grad_norm": 10.5625, + "learning_rate": 0.0004991672728913981, + "loss": 0.2868, + "step": 18920 + }, + { + "epoch": 0.78, + "grad_norm": 2.46875, + "learning_rate": 0.000499166388210069, + "loss": 0.2709, + "step": 18930 + }, + { + "epoch": 0.78, + "grad_norm": 0.34375, + "learning_rate": 0.0004991655030598356, + "loss": 0.276, + "step": 18940 + }, + { + "epoch": 0.78, + "grad_norm": 0.3359375, + "learning_rate": 0.0004991646174406998, + "loss": 0.2303, + "step": 18950 + }, + { + "epoch": 0.79, + "grad_norm": 0.6171875, + "learning_rate": 0.000499163731352663, + "loss": 0.1832, + "step": 18960 + }, + { + "epoch": 0.79, + "grad_norm": 0.69140625, + "learning_rate": 0.0004991628447957269, + "loss": 0.2499, + "step": 18970 + }, + { + "epoch": 0.79, + "grad_norm": 0.671875, + "learning_rate": 0.0004991619577698933, + "loss": 0.2411, + "step": 18980 + }, + { + "epoch": 0.79, + "grad_norm": 1.984375, + "learning_rate": 0.0004991610702751638, + "loss": 0.2897, + "step": 18990 + }, + { + "epoch": 0.79, + "grad_norm": 0.51953125, + "learning_rate": 0.00049916018231154, + "loss": 0.161, + "step": 19000 + }, + { + "epoch": 0.79, + "grad_norm": 0.65234375, + "learning_rate": 0.0004991592938790238, + "loss": 0.2286, + "step": 19010 + }, + { + "epoch": 0.79, + "grad_norm": 0.5078125, + "learning_rate": 0.0004991584049776165, + "loss": 0.2884, + "step": 19020 + }, + { + "epoch": 0.79, + "grad_norm": 0.90625, + "learning_rate": 0.0004991575156073202, + "loss": 0.1708, + "step": 19030 + }, + { + "epoch": 0.79, + "grad_norm": 0.458984375, + "learning_rate": 0.0004991566257681363, + "loss": 0.2228, + "step": 19040 + }, + { + "epoch": 0.79, + "grad_norm": 0.75390625, + "learning_rate": 0.0004991557354600666, + "loss": 0.2641, + "step": 19050 + }, + { + "epoch": 0.79, + "grad_norm": 0.54296875, + "learning_rate": 0.0004991548446831125, + "loss": 0.239, + "step": 19060 + }, + { + "epoch": 0.79, + "grad_norm": 0.6796875, + "learning_rate": 0.0004991539534372761, + "loss": 0.2876, + "step": 19070 + }, + { + "epoch": 0.79, + "grad_norm": 0.88671875, + "learning_rate": 0.0004991530617225587, + "loss": 0.2842, + "step": 19080 + }, + { + "epoch": 0.79, + "grad_norm": 0.95703125, + "learning_rate": 0.0004991521695389623, + "loss": 0.1881, + "step": 19090 + }, + { + "epoch": 0.79, + "grad_norm": 0.625, + "learning_rate": 0.0004991512768864883, + "loss": 0.2918, + "step": 19100 + }, + { + "epoch": 0.79, + "grad_norm": 0.73828125, + "learning_rate": 0.0004991503837651386, + "loss": 0.1823, + "step": 19110 + }, + { + "epoch": 0.79, + "grad_norm": 1.34375, + "learning_rate": 0.0004991494901749147, + "loss": 0.2628, + "step": 19120 + }, + { + "epoch": 0.79, + "grad_norm": 0.5625, + "learning_rate": 0.0004991485961158184, + "loss": 0.2186, + "step": 19130 + }, + { + "epoch": 0.79, + "grad_norm": 0.73046875, + "learning_rate": 0.0004991477015878514, + "loss": 0.2331, + "step": 19140 + }, + { + "epoch": 0.79, + "grad_norm": 1.6953125, + "learning_rate": 0.0004991468065910152, + "loss": 0.242, + "step": 19150 + }, + { + "epoch": 0.79, + "grad_norm": 0.52734375, + "learning_rate": 0.0004991459111253117, + "loss": 0.2394, + "step": 19160 + }, + { + "epoch": 0.79, + "grad_norm": 0.63671875, + "learning_rate": 0.0004991450151907425, + "loss": 0.2236, + "step": 19170 + }, + { + "epoch": 0.79, + "grad_norm": 0.7890625, + "learning_rate": 0.0004991441187873094, + "loss": 0.2406, + "step": 19180 + }, + { + "epoch": 0.79, + "grad_norm": 0.294921875, + "learning_rate": 0.0004991432219150138, + "loss": 0.2011, + "step": 19190 + }, + { + "epoch": 0.8, + "grad_norm": 1.1484375, + "learning_rate": 0.0004991423245738576, + "loss": 0.2481, + "step": 19200 + }, + { + "epoch": 0.8, + "grad_norm": 0.6328125, + "learning_rate": 0.0004991414267638425, + "loss": 0.2826, + "step": 19210 + }, + { + "epoch": 0.8, + "grad_norm": 0.8515625, + "learning_rate": 0.0004991405284849701, + "loss": 0.2581, + "step": 19220 + }, + { + "epoch": 0.8, + "grad_norm": 0.66796875, + "learning_rate": 0.0004991396297372422, + "loss": 0.274, + "step": 19230 + }, + { + "epoch": 0.8, + "grad_norm": 0.578125, + "learning_rate": 0.0004991387305206602, + "loss": 0.2679, + "step": 19240 + }, + { + "epoch": 0.8, + "grad_norm": 0.71484375, + "learning_rate": 0.0004991378308352263, + "loss": 0.2411, + "step": 19250 + }, + { + "epoch": 0.8, + "grad_norm": 1.375, + "learning_rate": 0.0004991369306809418, + "loss": 0.2653, + "step": 19260 + }, + { + "epoch": 0.8, + "grad_norm": 1.1015625, + "learning_rate": 0.0004991360300578084, + "loss": 0.2284, + "step": 19270 + }, + { + "epoch": 0.8, + "grad_norm": 0.59375, + "learning_rate": 0.000499135128965828, + "loss": 0.2059, + "step": 19280 + }, + { + "epoch": 0.8, + "grad_norm": 0.796875, + "learning_rate": 0.0004991342274050022, + "loss": 0.2395, + "step": 19290 + }, + { + "epoch": 0.8, + "grad_norm": 0.5625, + "learning_rate": 0.0004991333253753326, + "loss": 0.2862, + "step": 19300 + }, + { + "epoch": 0.8, + "grad_norm": 0.8359375, + "learning_rate": 0.0004991324228768211, + "loss": 0.2441, + "step": 19310 + }, + { + "epoch": 0.8, + "grad_norm": 0.58984375, + "learning_rate": 0.0004991315199094693, + "loss": 0.2864, + "step": 19320 + }, + { + "epoch": 0.8, + "grad_norm": 0.91796875, + "learning_rate": 0.0004991306164732788, + "loss": 0.2728, + "step": 19330 + }, + { + "epoch": 0.8, + "grad_norm": 0.5625, + "learning_rate": 0.0004991297125682513, + "loss": 0.2378, + "step": 19340 + }, + { + "epoch": 0.8, + "grad_norm": 0.71875, + "learning_rate": 0.0004991288081943887, + "loss": 0.2146, + "step": 19350 + }, + { + "epoch": 0.8, + "grad_norm": 1.8359375, + "learning_rate": 0.0004991279033516926, + "loss": 0.1939, + "step": 19360 + }, + { + "epoch": 0.8, + "grad_norm": 0.6484375, + "learning_rate": 0.0004991269980401646, + "loss": 0.2356, + "step": 19370 + }, + { + "epoch": 0.8, + "grad_norm": 0.96484375, + "learning_rate": 0.0004991260922598067, + "loss": 0.2703, + "step": 19380 + }, + { + "epoch": 0.8, + "grad_norm": 0.65625, + "learning_rate": 0.0004991251860106202, + "loss": 0.2, + "step": 19390 + }, + { + "epoch": 0.8, + "grad_norm": 0.890625, + "learning_rate": 0.000499124279292607, + "loss": 0.2227, + "step": 19400 + }, + { + "epoch": 0.8, + "grad_norm": 0.478515625, + "learning_rate": 0.0004991233721057689, + "loss": 0.2046, + "step": 19410 + }, + { + "epoch": 0.8, + "grad_norm": 0.546875, + "learning_rate": 0.0004991224644501075, + "loss": 0.2576, + "step": 19420 + }, + { + "epoch": 0.8, + "grad_norm": 0.439453125, + "learning_rate": 0.0004991215563256244, + "loss": 0.1826, + "step": 19430 + }, + { + "epoch": 0.81, + "grad_norm": 0.3828125, + "learning_rate": 0.0004991206477323216, + "loss": 0.246, + "step": 19440 + }, + { + "epoch": 0.81, + "grad_norm": 0.6640625, + "learning_rate": 0.0004991197386702005, + "loss": 0.2074, + "step": 19450 + }, + { + "epoch": 0.81, + "grad_norm": 0.86328125, + "learning_rate": 0.0004991188291392631, + "loss": 0.1499, + "step": 19460 + }, + { + "epoch": 0.81, + "grad_norm": 0.7109375, + "learning_rate": 0.0004991179191395109, + "loss": 0.2188, + "step": 19470 + }, + { + "epoch": 0.81, + "grad_norm": 0.63671875, + "learning_rate": 0.0004991170086709457, + "loss": 0.1868, + "step": 19480 + }, + { + "epoch": 0.81, + "grad_norm": 0.466796875, + "learning_rate": 0.0004991160977335691, + "loss": 0.2462, + "step": 19490 + }, + { + "epoch": 0.81, + "grad_norm": 0.474609375, + "learning_rate": 0.0004991151863273831, + "loss": 0.2261, + "step": 19500 + }, + { + "epoch": 0.81, + "grad_norm": 0.35546875, + "learning_rate": 0.000499114274452389, + "loss": 0.2167, + "step": 19510 + }, + { + "epoch": 0.81, + "grad_norm": 0.80859375, + "learning_rate": 0.0004991133621085889, + "loss": 0.3245, + "step": 19520 + }, + { + "epoch": 0.81, + "grad_norm": 0.78125, + "learning_rate": 0.0004991124492959842, + "loss": 0.2546, + "step": 19530 + }, + { + "epoch": 0.81, + "grad_norm": 0.71484375, + "learning_rate": 0.0004991115360145769, + "loss": 0.2279, + "step": 19540 + }, + { + "epoch": 0.81, + "grad_norm": 0.5546875, + "learning_rate": 0.0004991106222643685, + "loss": 0.1857, + "step": 19550 + }, + { + "epoch": 0.81, + "grad_norm": 0.80078125, + "learning_rate": 0.0004991097080453609, + "loss": 0.3075, + "step": 19560 + }, + { + "epoch": 0.81, + "grad_norm": 1.53125, + "learning_rate": 0.0004991087933575558, + "loss": 0.2187, + "step": 19570 + }, + { + "epoch": 0.81, + "grad_norm": 0.3046875, + "learning_rate": 0.0004991078782009547, + "loss": 0.191, + "step": 19580 + }, + { + "epoch": 0.81, + "grad_norm": 0.84375, + "learning_rate": 0.0004991069625755595, + "loss": 0.2783, + "step": 19590 + }, + { + "epoch": 0.81, + "grad_norm": 1.2734375, + "learning_rate": 0.000499106046481372, + "loss": 0.2072, + "step": 19600 + }, + { + "epoch": 0.81, + "grad_norm": 0.5625, + "learning_rate": 0.0004991051299183937, + "loss": 0.232, + "step": 19610 + }, + { + "epoch": 0.81, + "grad_norm": 0.0, + "learning_rate": 0.0004991042128866264, + "loss": 0.202, + "step": 19620 + }, + { + "epoch": 0.81, + "grad_norm": 3.09375, + "learning_rate": 0.000499103295386072, + "loss": 0.2601, + "step": 19630 + }, + { + "epoch": 0.81, + "grad_norm": 0.96875, + "learning_rate": 0.0004991023774167321, + "loss": 0.2078, + "step": 19640 + }, + { + "epoch": 0.81, + "grad_norm": 1.125, + "learning_rate": 0.0004991014589786083, + "loss": 0.2598, + "step": 19650 + }, + { + "epoch": 0.81, + "grad_norm": 2.078125, + "learning_rate": 0.0004991005400717026, + "loss": 0.2598, + "step": 19660 + }, + { + "epoch": 0.81, + "grad_norm": 0.5078125, + "learning_rate": 0.0004990996206960165, + "loss": 0.236, + "step": 19670 + }, + { + "epoch": 0.82, + "grad_norm": 0.625, + "learning_rate": 0.0004990987008515518, + "loss": 0.2201, + "step": 19680 + }, + { + "epoch": 0.82, + "grad_norm": 0.546875, + "learning_rate": 0.0004990977805383103, + "loss": 0.2803, + "step": 19690 + }, + { + "epoch": 0.82, + "grad_norm": 1.984375, + "learning_rate": 0.0004990968597562937, + "loss": 0.2678, + "step": 19700 + }, + { + "epoch": 0.82, + "grad_norm": 0.7578125, + "learning_rate": 0.0004990959385055037, + "loss": 0.248, + "step": 19710 + }, + { + "epoch": 0.82, + "grad_norm": 0.94921875, + "learning_rate": 0.000499095016785942, + "loss": 0.2298, + "step": 19720 + }, + { + "epoch": 0.82, + "grad_norm": 0.76171875, + "learning_rate": 0.0004990940945976104, + "loss": 0.246, + "step": 19730 + }, + { + "epoch": 0.82, + "grad_norm": 0.392578125, + "learning_rate": 0.0004990931719405106, + "loss": 0.2098, + "step": 19740 + }, + { + "epoch": 0.82, + "grad_norm": 1.3046875, + "learning_rate": 0.0004990922488146444, + "loss": 0.2199, + "step": 19750 + }, + { + "epoch": 0.82, + "grad_norm": 0.87109375, + "learning_rate": 0.0004990913252200135, + "loss": 0.2366, + "step": 19760 + }, + { + "epoch": 0.82, + "grad_norm": 1.1328125, + "learning_rate": 0.0004990904011566194, + "loss": 0.207, + "step": 19770 + }, + { + "epoch": 0.82, + "grad_norm": 0.259765625, + "learning_rate": 0.0004990894766244643, + "loss": 0.2101, + "step": 19780 + }, + { + "epoch": 0.82, + "grad_norm": 0.4296875, + "learning_rate": 0.0004990885516235496, + "loss": 0.2191, + "step": 19790 + }, + { + "epoch": 0.82, + "grad_norm": 0.251953125, + "learning_rate": 0.0004990876261538773, + "loss": 0.2621, + "step": 19800 + }, + { + "epoch": 0.82, + "grad_norm": 0.443359375, + "learning_rate": 0.0004990867002154488, + "loss": 0.2337, + "step": 19810 + }, + { + "epoch": 0.82, + "grad_norm": 0.31640625, + "learning_rate": 0.0004990857738082662, + "loss": 0.205, + "step": 19820 + }, + { + "epoch": 0.82, + "grad_norm": 0.5078125, + "learning_rate": 0.0004990848469323309, + "loss": 0.2116, + "step": 19830 + }, + { + "epoch": 0.82, + "grad_norm": 0.423828125, + "learning_rate": 0.0004990839195876448, + "loss": 0.2426, + "step": 19840 + }, + { + "epoch": 0.82, + "grad_norm": 0.71875, + "learning_rate": 0.0004990829917742098, + "loss": 0.2261, + "step": 19850 + }, + { + "epoch": 0.82, + "grad_norm": 1.09375, + "learning_rate": 0.0004990820634920275, + "loss": 0.2587, + "step": 19860 + }, + { + "epoch": 0.82, + "grad_norm": 0.66796875, + "learning_rate": 0.0004990811347410996, + "loss": 0.2729, + "step": 19870 + }, + { + "epoch": 0.82, + "grad_norm": 0.39453125, + "learning_rate": 0.000499080205521428, + "loss": 0.2079, + "step": 19880 + }, + { + "epoch": 0.82, + "grad_norm": 0.5078125, + "learning_rate": 0.0004990792758330141, + "loss": 0.242, + "step": 19890 + }, + { + "epoch": 0.82, + "grad_norm": 0.5078125, + "learning_rate": 0.0004990783456758601, + "loss": 0.2392, + "step": 19900 + }, + { + "epoch": 0.82, + "grad_norm": 0.310546875, + "learning_rate": 0.0004990774150499676, + "loss": 0.2243, + "step": 19910 + }, + { + "epoch": 0.83, + "grad_norm": 0.515625, + "learning_rate": 0.0004990764839553383, + "loss": 0.2701, + "step": 19920 + }, + { + "epoch": 0.83, + "grad_norm": 0.515625, + "learning_rate": 0.0004990755523919738, + "loss": 0.2857, + "step": 19930 + }, + { + "epoch": 0.83, + "grad_norm": 0.6953125, + "learning_rate": 0.0004990746203598761, + "loss": 0.2446, + "step": 19940 + }, + { + "epoch": 0.83, + "grad_norm": 0.99609375, + "learning_rate": 0.0004990736878590468, + "loss": 0.2501, + "step": 19950 + }, + { + "epoch": 0.83, + "grad_norm": 1.4453125, + "learning_rate": 0.0004990727548894878, + "loss": 0.3534, + "step": 19960 + }, + { + "epoch": 0.83, + "grad_norm": 0.7421875, + "learning_rate": 0.0004990718214512007, + "loss": 0.2235, + "step": 19970 + }, + { + "epoch": 0.83, + "grad_norm": 0.23828125, + "learning_rate": 0.0004990708875441873, + "loss": 0.3046, + "step": 19980 + }, + { + "epoch": 0.83, + "grad_norm": 1.15625, + "learning_rate": 0.0004990699531684495, + "loss": 0.2258, + "step": 19990 + }, + { + "epoch": 0.83, + "grad_norm": 0.828125, + "learning_rate": 0.000499069018323989, + "loss": 0.2502, + "step": 20000 + }, + { + "epoch": 0.83, + "grad_norm": 1.109375, + "learning_rate": 0.0004990680830108074, + "loss": 0.238, + "step": 20010 + }, + { + "epoch": 0.83, + "grad_norm": 0.625, + "learning_rate": 0.0004990671472289065, + "loss": 0.1479, + "step": 20020 + }, + { + "epoch": 0.83, + "grad_norm": 0.7890625, + "learning_rate": 0.0004990662109782882, + "loss": 0.2365, + "step": 20030 + }, + { + "epoch": 0.83, + "grad_norm": 0.78515625, + "learning_rate": 0.0004990652742589542, + "loss": 0.2575, + "step": 20040 + }, + { + "epoch": 0.83, + "grad_norm": 0.484375, + "learning_rate": 0.0004990643370709063, + "loss": 0.2933, + "step": 20050 + }, + { + "epoch": 0.83, + "grad_norm": 0.416015625, + "learning_rate": 0.0004990633994141462, + "loss": 0.1982, + "step": 20060 + }, + { + "epoch": 0.83, + "grad_norm": 0.59375, + "learning_rate": 0.0004990624612886755, + "loss": 0.2551, + "step": 20070 + }, + { + "epoch": 0.83, + "grad_norm": 1.046875, + "learning_rate": 0.0004990615226944964, + "loss": 0.2263, + "step": 20080 + }, + { + "epoch": 0.83, + "grad_norm": 0.73046875, + "learning_rate": 0.0004990605836316102, + "loss": 0.2009, + "step": 20090 + }, + { + "epoch": 0.83, + "grad_norm": 0.44921875, + "learning_rate": 0.0004990596441000189, + "loss": 0.1811, + "step": 20100 + }, + { + "epoch": 0.83, + "grad_norm": 0.484375, + "learning_rate": 0.0004990587040997244, + "loss": 0.2527, + "step": 20110 + }, + { + "epoch": 0.83, + "grad_norm": 0.640625, + "learning_rate": 0.0004990577636307282, + "loss": 0.2432, + "step": 20120 + }, + { + "epoch": 0.83, + "grad_norm": 0.58984375, + "learning_rate": 0.0004990568226930322, + "loss": 0.2788, + "step": 20130 + }, + { + "epoch": 0.83, + "grad_norm": 0.466796875, + "learning_rate": 0.0004990558812866382, + "loss": 0.2421, + "step": 20140 + }, + { + "epoch": 0.83, + "grad_norm": 0.76171875, + "learning_rate": 0.0004990549394115479, + "loss": 0.2436, + "step": 20150 + }, + { + "epoch": 0.84, + "grad_norm": 0.66796875, + "learning_rate": 0.000499053997067763, + "loss": 0.276, + "step": 20160 + }, + { + "epoch": 0.84, + "grad_norm": 0.8671875, + "learning_rate": 0.0004990530542552854, + "loss": 0.2846, + "step": 20170 + }, + { + "epoch": 0.84, + "grad_norm": 0.4296875, + "learning_rate": 0.000499052110974117, + "loss": 0.2138, + "step": 20180 + }, + { + "epoch": 0.84, + "grad_norm": 0.6484375, + "learning_rate": 0.0004990511672242593, + "loss": 0.24, + "step": 20190 + }, + { + "epoch": 0.84, + "grad_norm": 0.478515625, + "learning_rate": 0.0004990502230057143, + "loss": 0.2334, + "step": 20200 + }, + { + "epoch": 0.84, + "grad_norm": 1.0078125, + "learning_rate": 0.0004990492783184836, + "loss": 0.2342, + "step": 20210 + }, + { + "epoch": 0.84, + "grad_norm": 0.90625, + "learning_rate": 0.000499048333162569, + "loss": 0.2751, + "step": 20220 + }, + { + "epoch": 0.84, + "grad_norm": 0.78515625, + "learning_rate": 0.0004990473875379724, + "loss": 0.2884, + "step": 20230 + }, + { + "epoch": 0.84, + "grad_norm": 0.4921875, + "learning_rate": 0.0004990464414446955, + "loss": 0.1955, + "step": 20240 + }, + { + "epoch": 0.84, + "grad_norm": 0.8203125, + "learning_rate": 0.0004990454948827401, + "loss": 0.2349, + "step": 20250 + }, + { + "epoch": 0.84, + "grad_norm": 0.6640625, + "learning_rate": 0.000499044547852108, + "loss": 0.3031, + "step": 20260 + }, + { + "epoch": 0.84, + "grad_norm": 0.62109375, + "learning_rate": 0.0004990436003528009, + "loss": 0.2377, + "step": 20270 + }, + { + "epoch": 0.84, + "grad_norm": 0.28515625, + "learning_rate": 0.0004990426523848207, + "loss": 0.2532, + "step": 20280 + }, + { + "epoch": 0.84, + "grad_norm": 0.73828125, + "learning_rate": 0.000499041703948169, + "loss": 0.23, + "step": 20290 + }, + { + "epoch": 0.84, + "grad_norm": 0.287109375, + "learning_rate": 0.0004990407550428479, + "loss": 0.2619, + "step": 20300 + }, + { + "epoch": 0.84, + "grad_norm": 0.5625, + "learning_rate": 0.0004990398056688588, + "loss": 0.243, + "step": 20310 + }, + { + "epoch": 0.84, + "grad_norm": 2.53125, + "learning_rate": 0.0004990388558262038, + "loss": 0.2415, + "step": 20320 + }, + { + "epoch": 0.84, + "grad_norm": 0.609375, + "learning_rate": 0.0004990379055148846, + "loss": 0.2018, + "step": 20330 + }, + { + "epoch": 0.84, + "grad_norm": 0.47265625, + "learning_rate": 0.0004990369547349028, + "loss": 0.1937, + "step": 20340 + }, + { + "epoch": 0.84, + "grad_norm": 0.46484375, + "learning_rate": 0.0004990360034862604, + "loss": 0.2325, + "step": 20350 + }, + { + "epoch": 0.84, + "grad_norm": 5.25, + "learning_rate": 0.0004990350517689592, + "loss": 0.2597, + "step": 20360 + }, + { + "epoch": 0.84, + "grad_norm": 0.55859375, + "learning_rate": 0.0004990340995830009, + "loss": 0.2195, + "step": 20370 + }, + { + "epoch": 0.84, + "grad_norm": 0.46484375, + "learning_rate": 0.0004990331469283873, + "loss": 0.2135, + "step": 20380 + }, + { + "epoch": 0.84, + "grad_norm": 0.306640625, + "learning_rate": 0.0004990321938051202, + "loss": 0.2261, + "step": 20390 + }, + { + "epoch": 0.84, + "grad_norm": 0.73828125, + "learning_rate": 0.0004990312402132015, + "loss": 0.2466, + "step": 20400 + }, + { + "epoch": 0.85, + "grad_norm": 0.5, + "learning_rate": 0.0004990302861526328, + "loss": 0.2537, + "step": 20410 + }, + { + "epoch": 0.85, + "grad_norm": 0.4609375, + "learning_rate": 0.000499029331623416, + "loss": 0.2416, + "step": 20420 + }, + { + "epoch": 0.85, + "grad_norm": 0.87109375, + "learning_rate": 0.0004990283766255529, + "loss": 0.2524, + "step": 20430 + }, + { + "epoch": 0.85, + "grad_norm": 0.5859375, + "learning_rate": 0.0004990274211590453, + "loss": 0.2667, + "step": 20440 + }, + { + "epoch": 0.85, + "grad_norm": 1.203125, + "learning_rate": 0.0004990264652238951, + "loss": 0.2409, + "step": 20450 + }, + { + "epoch": 0.85, + "grad_norm": 0.419921875, + "learning_rate": 0.0004990255088201037, + "loss": 0.2658, + "step": 20460 + }, + { + "epoch": 0.85, + "grad_norm": 0.62890625, + "learning_rate": 0.0004990245519476735, + "loss": 0.2083, + "step": 20470 + }, + { + "epoch": 0.85, + "grad_norm": 1.3671875, + "learning_rate": 0.0004990235946066057, + "loss": 0.2787, + "step": 20480 + }, + { + "epoch": 0.85, + "grad_norm": 0.515625, + "learning_rate": 0.0004990226367969027, + "loss": 0.2239, + "step": 20490 + }, + { + "epoch": 0.85, + "grad_norm": 0.73046875, + "learning_rate": 0.0004990216785185658, + "loss": 0.2148, + "step": 20500 + }, + { + "epoch": 0.85, + "grad_norm": 1.015625, + "learning_rate": 0.0004990207197715969, + "loss": 0.2627, + "step": 20510 + }, + { + "epoch": 0.85, + "grad_norm": 0.6328125, + "learning_rate": 0.000499019760555998, + "loss": 0.2456, + "step": 20520 + }, + { + "epoch": 0.85, + "grad_norm": 0.58984375, + "learning_rate": 0.0004990188008717709, + "loss": 0.2327, + "step": 20530 + }, + { + "epoch": 0.85, + "grad_norm": 0.41015625, + "learning_rate": 0.0004990178407189172, + "loss": 0.2526, + "step": 20540 + }, + { + "epoch": 0.85, + "grad_norm": 0.68359375, + "learning_rate": 0.0004990168800974387, + "loss": 0.2834, + "step": 20550 + }, + { + "epoch": 0.85, + "grad_norm": 0.671875, + "learning_rate": 0.0004990159190073376, + "loss": 0.2664, + "step": 20560 + }, + { + "epoch": 0.85, + "grad_norm": 0.5546875, + "learning_rate": 0.0004990149574486153, + "loss": 0.2531, + "step": 20570 + }, + { + "epoch": 0.85, + "grad_norm": 0.73046875, + "learning_rate": 0.0004990139954212737, + "loss": 0.2605, + "step": 20580 + }, + { + "epoch": 0.85, + "grad_norm": 0.6015625, + "learning_rate": 0.0004990130329253147, + "loss": 0.2144, + "step": 20590 + }, + { + "epoch": 0.85, + "grad_norm": 0.61328125, + "learning_rate": 0.00049901206996074, + "loss": 0.2897, + "step": 20600 + }, + { + "epoch": 0.85, + "grad_norm": 0.69921875, + "learning_rate": 0.0004990111065275516, + "loss": 0.2405, + "step": 20610 + }, + { + "epoch": 0.85, + "grad_norm": 0.365234375, + "learning_rate": 0.0004990101426257511, + "loss": 0.2058, + "step": 20620 + }, + { + "epoch": 0.85, + "grad_norm": 0.78515625, + "learning_rate": 0.0004990091782553403, + "loss": 0.1949, + "step": 20630 + }, + { + "epoch": 0.85, + "grad_norm": 1.515625, + "learning_rate": 0.0004990082134163213, + "loss": 0.2814, + "step": 20640 + }, + { + "epoch": 0.86, + "grad_norm": 0.69921875, + "learning_rate": 0.0004990072481086957, + "loss": 0.2029, + "step": 20650 + }, + { + "epoch": 0.86, + "grad_norm": 0.294921875, + "learning_rate": 0.0004990062823324652, + "loss": 0.2556, + "step": 20660 + }, + { + "epoch": 0.86, + "grad_norm": 1.125, + "learning_rate": 0.0004990053160876319, + "loss": 0.2358, + "step": 20670 + }, + { + "epoch": 0.86, + "grad_norm": 0.828125, + "learning_rate": 0.0004990043493741975, + "loss": 0.2346, + "step": 20680 + }, + { + "epoch": 0.86, + "grad_norm": 0.478515625, + "learning_rate": 0.0004990033821921637, + "loss": 0.2793, + "step": 20690 + }, + { + "epoch": 0.86, + "grad_norm": 0.546875, + "learning_rate": 0.0004990024145415325, + "loss": 0.2692, + "step": 20700 + }, + { + "epoch": 0.86, + "grad_norm": 0.61328125, + "learning_rate": 0.0004990014464223057, + "loss": 0.252, + "step": 20710 + }, + { + "epoch": 0.86, + "grad_norm": 0.42578125, + "learning_rate": 0.000499000477834485, + "loss": 0.2624, + "step": 20720 + }, + { + "epoch": 0.86, + "grad_norm": 0.7578125, + "learning_rate": 0.0004989995087780723, + "loss": 0.2461, + "step": 20730 + }, + { + "epoch": 0.86, + "grad_norm": 2.03125, + "learning_rate": 0.0004989985392530693, + "loss": 0.2804, + "step": 20740 + }, + { + "epoch": 0.86, + "grad_norm": 0.59375, + "learning_rate": 0.0004989975692594781, + "loss": 0.1743, + "step": 20750 + }, + { + "epoch": 0.86, + "grad_norm": 0.470703125, + "learning_rate": 0.0004989965987973003, + "loss": 0.2024, + "step": 20760 + }, + { + "epoch": 0.86, + "grad_norm": 1.3984375, + "learning_rate": 0.0004989956278665379, + "loss": 0.203, + "step": 20770 + }, + { + "epoch": 0.86, + "grad_norm": 1.03125, + "learning_rate": 0.0004989946564671925, + "loss": 0.2304, + "step": 20780 + }, + { + "epoch": 0.86, + "grad_norm": 0.70703125, + "learning_rate": 0.000498993684599266, + "loss": 0.2517, + "step": 20790 + }, + { + "epoch": 0.86, + "grad_norm": 0.95703125, + "learning_rate": 0.0004989927122627604, + "loss": 0.2906, + "step": 20800 + }, + { + "epoch": 0.86, + "grad_norm": 0.796875, + "learning_rate": 0.0004989917394576773, + "loss": 0.275, + "step": 20810 + }, + { + "epoch": 0.86, + "grad_norm": 1.21875, + "learning_rate": 0.0004989907661840187, + "loss": 0.2351, + "step": 20820 + }, + { + "epoch": 0.86, + "grad_norm": 0.53125, + "learning_rate": 0.0004989897924417864, + "loss": 0.2409, + "step": 20830 + }, + { + "epoch": 0.86, + "grad_norm": 0.5625, + "learning_rate": 0.0004989888182309821, + "loss": 0.2208, + "step": 20840 + }, + { + "epoch": 0.86, + "grad_norm": 0.408203125, + "learning_rate": 0.0004989878435516078, + "loss": 0.293, + "step": 20850 + }, + { + "epoch": 0.86, + "grad_norm": 0.6484375, + "learning_rate": 0.0004989868684036653, + "loss": 0.255, + "step": 20860 + }, + { + "epoch": 0.86, + "grad_norm": 0.58984375, + "learning_rate": 0.0004989858927871562, + "loss": 0.235, + "step": 20870 + }, + { + "epoch": 0.86, + "grad_norm": 1.1328125, + "learning_rate": 0.0004989849167020827, + "loss": 0.1533, + "step": 20880 + }, + { + "epoch": 0.87, + "grad_norm": 2.1875, + "learning_rate": 0.0004989839401484466, + "loss": 0.2366, + "step": 20890 + }, + { + "epoch": 0.87, + "grad_norm": 0.609375, + "learning_rate": 0.0004989829631262494, + "loss": 0.2398, + "step": 20900 + }, + { + "epoch": 0.87, + "grad_norm": 0.80859375, + "learning_rate": 0.0004989819856354933, + "loss": 0.2641, + "step": 20910 + }, + { + "epoch": 0.87, + "grad_norm": 0.59375, + "learning_rate": 0.0004989810076761798, + "loss": 0.2017, + "step": 20920 + }, + { + "epoch": 0.87, + "grad_norm": 0.48828125, + "learning_rate": 0.0004989800292483111, + "loss": 0.2434, + "step": 20930 + }, + { + "epoch": 0.87, + "grad_norm": 0.625, + "learning_rate": 0.0004989790503518888, + "loss": 0.2618, + "step": 20940 + }, + { + "epoch": 0.87, + "grad_norm": 1.046875, + "learning_rate": 0.0004989780709869149, + "loss": 0.2838, + "step": 20950 + }, + { + "epoch": 0.87, + "grad_norm": 0.296875, + "learning_rate": 0.000498977091153391, + "loss": 0.2566, + "step": 20960 + }, + { + "epoch": 0.87, + "grad_norm": 1.3515625, + "learning_rate": 0.0004989761108513193, + "loss": 0.2233, + "step": 20970 + }, + { + "epoch": 0.87, + "grad_norm": 0.921875, + "learning_rate": 0.0004989751300807012, + "loss": 0.2666, + "step": 20980 + }, + { + "epoch": 0.87, + "grad_norm": 0.392578125, + "learning_rate": 0.0004989741488415389, + "loss": 0.2227, + "step": 20990 + }, + { + "epoch": 0.87, + "grad_norm": 1.234375, + "learning_rate": 0.0004989731671338342, + "loss": 0.2401, + "step": 21000 + }, + { + "epoch": 0.87, + "grad_norm": 0.61328125, + "learning_rate": 0.0004989721849575889, + "loss": 0.2427, + "step": 21010 + }, + { + "epoch": 0.87, + "grad_norm": 1.1171875, + "learning_rate": 0.0004989712023128048, + "loss": 0.172, + "step": 21020 + }, + { + "epoch": 0.87, + "grad_norm": 0.94921875, + "learning_rate": 0.0004989702191994838, + "loss": 0.2715, + "step": 21030 + }, + { + "epoch": 0.87, + "grad_norm": 1.78125, + "learning_rate": 0.0004989692356176277, + "loss": 0.2896, + "step": 21040 + }, + { + "epoch": 0.87, + "grad_norm": 0.75390625, + "learning_rate": 0.0004989682515672383, + "loss": 0.2351, + "step": 21050 + }, + { + "epoch": 0.87, + "grad_norm": 0.76171875, + "learning_rate": 0.0004989672670483177, + "loss": 0.1964, + "step": 21060 + }, + { + "epoch": 0.87, + "grad_norm": 0.99609375, + "learning_rate": 0.0004989662820608675, + "loss": 0.2369, + "step": 21070 + }, + { + "epoch": 0.87, + "grad_norm": 0.671875, + "learning_rate": 0.0004989652966048896, + "loss": 0.2799, + "step": 21080 + }, + { + "epoch": 0.87, + "grad_norm": 0.57421875, + "learning_rate": 0.0004989643106803861, + "loss": 0.2454, + "step": 21090 + }, + { + "epoch": 0.87, + "grad_norm": 0.71484375, + "learning_rate": 0.0004989633242873584, + "loss": 0.2181, + "step": 21100 + }, + { + "epoch": 0.87, + "grad_norm": 0.69140625, + "learning_rate": 0.0004989623374258088, + "loss": 0.2384, + "step": 21110 + }, + { + "epoch": 0.87, + "grad_norm": 0.69921875, + "learning_rate": 0.0004989613500957389, + "loss": 0.2445, + "step": 21120 + }, + { + "epoch": 0.88, + "grad_norm": 0.50390625, + "learning_rate": 0.0004989603622971506, + "loss": 0.2006, + "step": 21130 + }, + { + "epoch": 0.88, + "grad_norm": 1.296875, + "learning_rate": 0.0004989593740300458, + "loss": 0.22, + "step": 21140 + }, + { + "epoch": 0.88, + "grad_norm": 0.56640625, + "learning_rate": 0.0004989583852944262, + "loss": 0.2591, + "step": 21150 + }, + { + "epoch": 0.88, + "grad_norm": 0.97265625, + "learning_rate": 0.0004989573960902941, + "loss": 0.2459, + "step": 21160 + }, + { + "epoch": 0.88, + "grad_norm": 0.62890625, + "learning_rate": 0.0004989564064176508, + "loss": 0.2651, + "step": 21170 + }, + { + "epoch": 0.88, + "grad_norm": 0.67578125, + "learning_rate": 0.0004989554162764986, + "loss": 0.2378, + "step": 21180 + }, + { + "epoch": 0.88, + "grad_norm": 0.609375, + "learning_rate": 0.0004989544256668391, + "loss": 0.2396, + "step": 21190 + }, + { + "epoch": 0.88, + "grad_norm": 0.333984375, + "learning_rate": 0.0004989534345886743, + "loss": 0.2392, + "step": 21200 + }, + { + "epoch": 0.88, + "grad_norm": 0.98046875, + "learning_rate": 0.0004989524430420061, + "loss": 0.3005, + "step": 21210 + }, + { + "epoch": 0.88, + "grad_norm": 0.26953125, + "learning_rate": 0.0004989514510268362, + "loss": 0.2001, + "step": 21220 + }, + { + "epoch": 0.88, + "grad_norm": 0.57421875, + "learning_rate": 0.0004989504585431665, + "loss": 0.2396, + "step": 21230 + }, + { + "epoch": 0.88, + "grad_norm": 0.78125, + "learning_rate": 0.000498949465590999, + "loss": 0.251, + "step": 21240 + }, + { + "epoch": 0.88, + "grad_norm": 0.373046875, + "learning_rate": 0.0004989484721703354, + "loss": 0.2587, + "step": 21250 + }, + { + "epoch": 0.88, + "grad_norm": 0.451171875, + "learning_rate": 0.0004989474782811777, + "loss": 0.2229, + "step": 21260 + }, + { + "epoch": 0.88, + "grad_norm": 0.58984375, + "learning_rate": 0.0004989464839235278, + "loss": 0.2536, + "step": 21270 + }, + { + "epoch": 0.88, + "grad_norm": 0.87890625, + "learning_rate": 0.0004989454890973874, + "loss": 0.2368, + "step": 21280 + }, + { + "epoch": 0.88, + "grad_norm": 1.0625, + "learning_rate": 0.0004989444938027585, + "loss": 0.2342, + "step": 21290 + }, + { + "epoch": 0.88, + "grad_norm": 0.5234375, + "learning_rate": 0.000498943498039643, + "loss": 0.2571, + "step": 21300 + }, + { + "epoch": 0.88, + "grad_norm": 0.55078125, + "learning_rate": 0.0004989425018080427, + "loss": 0.1802, + "step": 21310 + }, + { + "epoch": 0.88, + "grad_norm": 0.46875, + "learning_rate": 0.0004989415051079594, + "loss": 0.2183, + "step": 21320 + }, + { + "epoch": 0.88, + "grad_norm": 0.57421875, + "learning_rate": 0.0004989405079393953, + "loss": 0.2718, + "step": 21330 + }, + { + "epoch": 0.88, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004989395103023518, + "loss": 0.2058, + "step": 21340 + }, + { + "epoch": 0.88, + "grad_norm": 0.88671875, + "learning_rate": 0.0004989385121968312, + "loss": 0.2207, + "step": 21350 + }, + { + "epoch": 0.88, + "grad_norm": 0.91796875, + "learning_rate": 0.0004989375136228351, + "loss": 0.2458, + "step": 21360 + }, + { + "epoch": 0.89, + "grad_norm": 0.578125, + "learning_rate": 0.0004989365145803655, + "loss": 0.2085, + "step": 21370 + }, + { + "epoch": 0.89, + "grad_norm": 0.64453125, + "learning_rate": 0.0004989355150694242, + "loss": 0.2545, + "step": 21380 + }, + { + "epoch": 0.89, + "grad_norm": 1.4375, + "learning_rate": 0.0004989345150900133, + "loss": 0.2374, + "step": 21390 + }, + { + "epoch": 0.89, + "grad_norm": 0.921875, + "learning_rate": 0.0004989335146421345, + "loss": 0.2098, + "step": 21400 + }, + { + "epoch": 0.89, + "grad_norm": 0.6640625, + "learning_rate": 0.0004989325137257897, + "loss": 0.2977, + "step": 21410 + }, + { + "epoch": 0.89, + "grad_norm": 1.265625, + "learning_rate": 0.0004989315123409807, + "loss": 0.2245, + "step": 21420 + }, + { + "epoch": 0.89, + "grad_norm": 1.5859375, + "learning_rate": 0.0004989305104877095, + "loss": 0.2064, + "step": 21430 + }, + { + "epoch": 0.89, + "grad_norm": 1.1171875, + "learning_rate": 0.0004989295081659779, + "loss": 0.2853, + "step": 21440 + }, + { + "epoch": 0.89, + "grad_norm": 1.765625, + "learning_rate": 0.0004989285053757879, + "loss": 0.2333, + "step": 21450 + }, + { + "epoch": 0.89, + "grad_norm": 0.5, + "learning_rate": 0.0004989275021171414, + "loss": 0.2301, + "step": 21460 + }, + { + "epoch": 0.89, + "grad_norm": 0.90625, + "learning_rate": 0.0004989264983900402, + "loss": 0.2032, + "step": 21470 + }, + { + "epoch": 0.89, + "grad_norm": 0.640625, + "learning_rate": 0.0004989254941944862, + "loss": 0.2627, + "step": 21480 + }, + { + "epoch": 0.89, + "grad_norm": 0.4921875, + "learning_rate": 0.0004989244895304813, + "loss": 0.254, + "step": 21490 + }, + { + "epoch": 0.89, + "grad_norm": 0.400390625, + "learning_rate": 0.0004989234843980274, + "loss": 0.2247, + "step": 21500 + }, + { + "epoch": 0.89, + "grad_norm": 0.51171875, + "learning_rate": 0.0004989224787971264, + "loss": 0.2412, + "step": 21510 + }, + { + "epoch": 0.89, + "grad_norm": 0.3984375, + "learning_rate": 0.0004989214727277801, + "loss": 0.2496, + "step": 21520 + }, + { + "epoch": 0.89, + "grad_norm": 0.640625, + "learning_rate": 0.0004989204661899905, + "loss": 0.2019, + "step": 21530 + }, + { + "epoch": 0.89, + "grad_norm": 0.734375, + "learning_rate": 0.0004989194591837595, + "loss": 0.2503, + "step": 21540 + }, + { + "epoch": 0.89, + "grad_norm": 0.90234375, + "learning_rate": 0.000498918451709089, + "loss": 0.2004, + "step": 21550 + }, + { + "epoch": 0.89, + "grad_norm": 0.828125, + "learning_rate": 0.0004989174437659808, + "loss": 0.2082, + "step": 21560 + }, + { + "epoch": 0.89, + "grad_norm": 0.73046875, + "learning_rate": 0.0004989164353544368, + "loss": 0.2393, + "step": 21570 + }, + { + "epoch": 0.89, + "grad_norm": 1.09375, + "learning_rate": 0.000498915426474459, + "loss": 0.2451, + "step": 21580 + }, + { + "epoch": 0.89, + "grad_norm": 0.671875, + "learning_rate": 0.0004989144171260492, + "loss": 0.2657, + "step": 21590 + }, + { + "epoch": 0.89, + "grad_norm": 1.8203125, + "learning_rate": 0.0004989134073092094, + "loss": 0.25, + "step": 21600 + }, + { + "epoch": 0.9, + "grad_norm": 0.625, + "learning_rate": 0.0004989123970239415, + "loss": 0.2551, + "step": 21610 + }, + { + "epoch": 0.9, + "grad_norm": 0.7109375, + "learning_rate": 0.0004989113862702473, + "loss": 0.2555, + "step": 21620 + }, + { + "epoch": 0.9, + "grad_norm": 0.263671875, + "learning_rate": 0.0004989103750481287, + "loss": 0.2005, + "step": 21630 + }, + { + "epoch": 0.9, + "grad_norm": 0.47265625, + "learning_rate": 0.0004989093633575878, + "loss": 0.2904, + "step": 21640 + }, + { + "epoch": 0.9, + "grad_norm": 0.4296875, + "learning_rate": 0.0004989083511986262, + "loss": 0.2356, + "step": 21650 + }, + { + "epoch": 0.9, + "grad_norm": 1.109375, + "learning_rate": 0.000498907338571246, + "loss": 0.2532, + "step": 21660 + }, + { + "epoch": 0.9, + "grad_norm": 0.51953125, + "learning_rate": 0.000498906325475449, + "loss": 0.2469, + "step": 21670 + }, + { + "epoch": 0.9, + "grad_norm": 0.7109375, + "learning_rate": 0.0004989053119112373, + "loss": 0.2741, + "step": 21680 + }, + { + "epoch": 0.9, + "grad_norm": 0.875, + "learning_rate": 0.0004989042978786127, + "loss": 0.2891, + "step": 21690 + }, + { + "epoch": 0.9, + "grad_norm": 0.72265625, + "learning_rate": 0.000498903283377577, + "loss": 0.2078, + "step": 21700 + }, + { + "epoch": 0.9, + "grad_norm": 0.5234375, + "learning_rate": 0.0004989022684081323, + "loss": 0.2253, + "step": 21710 + }, + { + "epoch": 0.9, + "grad_norm": 0.57421875, + "learning_rate": 0.0004989012529702803, + "loss": 0.2536, + "step": 21720 + }, + { + "epoch": 0.9, + "grad_norm": 0.62890625, + "learning_rate": 0.0004989002370640231, + "loss": 0.2416, + "step": 21730 + }, + { + "epoch": 0.9, + "grad_norm": 0.75, + "learning_rate": 0.0004988992206893624, + "loss": 0.2148, + "step": 21740 + }, + { + "epoch": 0.9, + "grad_norm": 0.72265625, + "learning_rate": 0.0004988982038463003, + "loss": 0.2604, + "step": 21750 + }, + { + "epoch": 0.9, + "grad_norm": 0.69140625, + "learning_rate": 0.0004988971865348388, + "loss": 0.2215, + "step": 21760 + }, + { + "epoch": 0.9, + "grad_norm": 0.53515625, + "learning_rate": 0.0004988961687549796, + "loss": 0.2187, + "step": 21770 + }, + { + "epoch": 0.9, + "grad_norm": 0.7734375, + "learning_rate": 0.0004988951505067247, + "loss": 0.2722, + "step": 21780 + }, + { + "epoch": 0.9, + "grad_norm": 0.56640625, + "learning_rate": 0.000498894131790076, + "loss": 0.2064, + "step": 21790 + }, + { + "epoch": 0.9, + "grad_norm": 0.640625, + "learning_rate": 0.0004988931126050352, + "loss": 0.2452, + "step": 21800 + }, + { + "epoch": 0.9, + "grad_norm": 1.140625, + "learning_rate": 0.0004988920929516048, + "loss": 0.2675, + "step": 21810 + }, + { + "epoch": 0.9, + "grad_norm": 0.43359375, + "learning_rate": 0.000498891072829786, + "loss": 0.2287, + "step": 21820 + }, + { + "epoch": 0.9, + "grad_norm": 0.640625, + "learning_rate": 0.0004988900522395814, + "loss": 0.2285, + "step": 21830 + }, + { + "epoch": 0.9, + "grad_norm": 0.953125, + "learning_rate": 0.0004988890311809924, + "loss": 0.2738, + "step": 21840 + }, + { + "epoch": 0.91, + "grad_norm": 0.921875, + "learning_rate": 0.0004988880096540212, + "loss": 0.2145, + "step": 21850 + }, + { + "epoch": 0.91, + "grad_norm": 0.294921875, + "learning_rate": 0.0004988869876586697, + "loss": 0.2395, + "step": 21860 + }, + { + "epoch": 0.91, + "grad_norm": 0.38671875, + "learning_rate": 0.0004988859651949397, + "loss": 0.2613, + "step": 21870 + }, + { + "epoch": 0.91, + "grad_norm": 0.5078125, + "learning_rate": 0.0004988849422628332, + "loss": 0.2207, + "step": 21880 + }, + { + "epoch": 0.91, + "grad_norm": 0.5078125, + "learning_rate": 0.0004988839188623521, + "loss": 0.207, + "step": 21890 + }, + { + "epoch": 0.91, + "grad_norm": 0.6484375, + "learning_rate": 0.0004988828949934983, + "loss": 0.2436, + "step": 21900 + }, + { + "epoch": 0.91, + "grad_norm": 0.5234375, + "learning_rate": 0.0004988818706562738, + "loss": 0.2769, + "step": 21910 + }, + { + "epoch": 0.91, + "grad_norm": 0.228515625, + "learning_rate": 0.0004988808458506806, + "loss": 0.2418, + "step": 21920 + }, + { + "epoch": 0.91, + "grad_norm": 0.8203125, + "learning_rate": 0.0004988798205767204, + "loss": 0.2441, + "step": 21930 + }, + { + "epoch": 0.91, + "grad_norm": 0.498046875, + "learning_rate": 0.0004988787948343953, + "loss": 0.289, + "step": 21940 + }, + { + "epoch": 0.91, + "grad_norm": 0.67578125, + "learning_rate": 0.0004988777686237071, + "loss": 0.2456, + "step": 21950 + }, + { + "epoch": 0.91, + "grad_norm": 0.875, + "learning_rate": 0.0004988767419446579, + "loss": 0.2212, + "step": 21960 + }, + { + "epoch": 0.91, + "grad_norm": 0.79296875, + "learning_rate": 0.0004988757147972496, + "loss": 0.2029, + "step": 21970 + }, + { + "epoch": 0.91, + "grad_norm": 0.55078125, + "learning_rate": 0.000498874687181484, + "loss": 0.2465, + "step": 21980 + }, + { + "epoch": 0.91, + "grad_norm": 0.4140625, + "learning_rate": 0.0004988736590973631, + "loss": 0.2228, + "step": 21990 + }, + { + "epoch": 0.91, + "grad_norm": 0.416015625, + "learning_rate": 0.000498872630544889, + "loss": 0.2229, + "step": 22000 + }, + { + "epoch": 0.91, + "grad_norm": 0.64453125, + "learning_rate": 0.0004988716015240633, + "loss": 0.1896, + "step": 22010 + }, + { + "epoch": 0.91, + "grad_norm": 0.53125, + "learning_rate": 0.0004988705720348882, + "loss": 0.2493, + "step": 22020 + }, + { + "epoch": 0.91, + "grad_norm": 1.8828125, + "learning_rate": 0.0004988695420773656, + "loss": 0.2748, + "step": 22030 + }, + { + "epoch": 0.91, + "grad_norm": 0.72265625, + "learning_rate": 0.0004988685116514973, + "loss": 0.2328, + "step": 22040 + }, + { + "epoch": 0.91, + "grad_norm": 0.79296875, + "learning_rate": 0.0004988674807572854, + "loss": 0.268, + "step": 22050 + }, + { + "epoch": 0.91, + "grad_norm": 0.62109375, + "learning_rate": 0.0004988664493947318, + "loss": 0.2159, + "step": 22060 + }, + { + "epoch": 0.91, + "grad_norm": 0.48828125, + "learning_rate": 0.0004988654175638384, + "loss": 0.2065, + "step": 22070 + }, + { + "epoch": 0.91, + "grad_norm": 0.5859375, + "learning_rate": 0.0004988643852646071, + "loss": 0.2188, + "step": 22080 + }, + { + "epoch": 0.91, + "grad_norm": 0.7109375, + "learning_rate": 0.0004988633524970399, + "loss": 0.2031, + "step": 22090 + }, + { + "epoch": 0.92, + "grad_norm": 0.87890625, + "learning_rate": 0.0004988623192611388, + "loss": 0.2534, + "step": 22100 + }, + { + "epoch": 0.92, + "grad_norm": 0.76171875, + "learning_rate": 0.0004988612855569057, + "loss": 0.275, + "step": 22110 + }, + { + "epoch": 0.92, + "grad_norm": 0.57421875, + "learning_rate": 0.0004988602513843425, + "loss": 0.1833, + "step": 22120 + }, + { + "epoch": 0.92, + "grad_norm": 0.71875, + "learning_rate": 0.0004988592167434512, + "loss": 0.2381, + "step": 22130 + }, + { + "epoch": 0.92, + "grad_norm": 0.73046875, + "learning_rate": 0.0004988581816342337, + "loss": 0.2008, + "step": 22140 + }, + { + "epoch": 0.92, + "grad_norm": 1.6640625, + "learning_rate": 0.000498857146056692, + "loss": 0.2488, + "step": 22150 + }, + { + "epoch": 0.92, + "grad_norm": 0.2490234375, + "learning_rate": 0.000498856110010828, + "loss": 0.2451, + "step": 22160 + }, + { + "epoch": 0.92, + "grad_norm": 1.859375, + "learning_rate": 0.0004988550734966438, + "loss": 0.2201, + "step": 22170 + }, + { + "epoch": 0.92, + "grad_norm": 0.9609375, + "learning_rate": 0.0004988540365141411, + "loss": 0.2595, + "step": 22180 + }, + { + "epoch": 0.92, + "grad_norm": 0.482421875, + "learning_rate": 0.000498852999063322, + "loss": 0.25, + "step": 22190 + }, + { + "epoch": 0.92, + "grad_norm": 1.4609375, + "learning_rate": 0.0004988519611441884, + "loss": 0.2211, + "step": 22200 + }, + { + "epoch": 0.92, + "grad_norm": 0.3984375, + "learning_rate": 0.0004988509227567423, + "loss": 0.2355, + "step": 22210 + }, + { + "epoch": 0.92, + "grad_norm": 0.6015625, + "learning_rate": 0.0004988498839009857, + "loss": 0.3228, + "step": 22220 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390625, + "learning_rate": 0.0004988488445769204, + "loss": 0.2455, + "step": 22230 + }, + { + "epoch": 0.92, + "grad_norm": 0.2392578125, + "learning_rate": 0.0004988478047845485, + "loss": 0.2133, + "step": 22240 + }, + { + "epoch": 0.92, + "grad_norm": 0.64453125, + "learning_rate": 0.0004988467645238719, + "loss": 0.2049, + "step": 22250 + }, + { + "epoch": 0.92, + "grad_norm": 0.193359375, + "learning_rate": 0.0004988457237948925, + "loss": 0.2623, + "step": 22260 + }, + { + "epoch": 0.92, + "grad_norm": 0.890625, + "learning_rate": 0.0004988446825976125, + "loss": 0.2319, + "step": 22270 + }, + { + "epoch": 0.92, + "grad_norm": 0.59375, + "learning_rate": 0.0004988436409320335, + "loss": 0.2191, + "step": 22280 + }, + { + "epoch": 0.92, + "grad_norm": 1.0234375, + "learning_rate": 0.0004988425987981578, + "loss": 0.229, + "step": 22290 + }, + { + "epoch": 0.92, + "grad_norm": 0.458984375, + "learning_rate": 0.000498841556195987, + "loss": 0.2768, + "step": 22300 + }, + { + "epoch": 0.92, + "grad_norm": 2.03125, + "learning_rate": 0.0004988405131255234, + "loss": 0.2441, + "step": 22310 + }, + { + "epoch": 0.92, + "grad_norm": 0.59375, + "learning_rate": 0.0004988394695867687, + "loss": 0.2294, + "step": 22320 + }, + { + "epoch": 0.92, + "grad_norm": 0.625, + "learning_rate": 0.0004988384255797251, + "loss": 0.2555, + "step": 22330 + }, + { + "epoch": 0.93, + "grad_norm": 0.51171875, + "learning_rate": 0.0004988373811043945, + "loss": 0.2099, + "step": 22340 + }, + { + "epoch": 0.93, + "grad_norm": 1.5234375, + "learning_rate": 0.0004988363361607787, + "loss": 0.2079, + "step": 22350 + }, + { + "epoch": 0.93, + "grad_norm": 0.71875, + "learning_rate": 0.0004988352907488799, + "loss": 0.2964, + "step": 22360 + }, + { + "epoch": 0.93, + "grad_norm": 0.23828125, + "learning_rate": 0.0004988342448686998, + "loss": 0.1908, + "step": 22370 + }, + { + "epoch": 0.93, + "grad_norm": 0.984375, + "learning_rate": 0.0004988331985202407, + "loss": 0.2985, + "step": 22380 + }, + { + "epoch": 0.93, + "grad_norm": 0.45703125, + "learning_rate": 0.0004988321517035044, + "loss": 0.2589, + "step": 22390 + }, + { + "epoch": 0.93, + "grad_norm": 0.95703125, + "learning_rate": 0.0004988311044184928, + "loss": 0.2479, + "step": 22400 + }, + { + "epoch": 0.93, + "grad_norm": 0.359375, + "learning_rate": 0.0004988300566652079, + "loss": 0.2502, + "step": 22410 + }, + { + "epoch": 0.93, + "grad_norm": 0.796875, + "learning_rate": 0.0004988290084436516, + "loss": 0.2057, + "step": 22420 + }, + { + "epoch": 0.93, + "grad_norm": 0.68359375, + "learning_rate": 0.0004988279597538261, + "loss": 0.2225, + "step": 22430 + }, + { + "epoch": 0.93, + "grad_norm": 0.26171875, + "learning_rate": 0.0004988269105957332, + "loss": 0.2804, + "step": 22440 + }, + { + "epoch": 0.93, + "grad_norm": 2.109375, + "learning_rate": 0.000498825860969375, + "loss": 0.2564, + "step": 22450 + }, + { + "epoch": 0.93, + "grad_norm": 0.296875, + "learning_rate": 0.0004988248108747534, + "loss": 0.2594, + "step": 22460 + }, + { + "epoch": 0.93, + "grad_norm": 1.3671875, + "learning_rate": 0.0004988237603118703, + "loss": 0.2572, + "step": 22470 + }, + { + "epoch": 0.93, + "grad_norm": 1.0859375, + "learning_rate": 0.0004988227092807279, + "loss": 0.2041, + "step": 22480 + }, + { + "epoch": 0.93, + "grad_norm": 1.0, + "learning_rate": 0.000498821657781328, + "loss": 0.2166, + "step": 22490 + }, + { + "epoch": 0.93, + "grad_norm": 0.90625, + "learning_rate": 0.0004988206058136724, + "loss": 0.2643, + "step": 22500 + }, + { + "epoch": 0.93, + "grad_norm": 0.71484375, + "learning_rate": 0.0004988195533777635, + "loss": 0.2241, + "step": 22510 + }, + { + "epoch": 0.93, + "grad_norm": 0.76953125, + "learning_rate": 0.000498818500473603, + "loss": 0.2173, + "step": 22520 + }, + { + "epoch": 0.93, + "grad_norm": 0.181640625, + "learning_rate": 0.0004988174471011929, + "loss": 0.2408, + "step": 22530 + }, + { + "epoch": 0.93, + "grad_norm": 0.46484375, + "learning_rate": 0.0004988163932605353, + "loss": 0.2781, + "step": 22540 + }, + { + "epoch": 0.93, + "grad_norm": 0.57421875, + "learning_rate": 0.0004988153389516321, + "loss": 0.2378, + "step": 22550 + }, + { + "epoch": 0.93, + "grad_norm": 0.671875, + "learning_rate": 0.0004988142841744854, + "loss": 0.273, + "step": 22560 + }, + { + "epoch": 0.93, + "grad_norm": 0.333984375, + "learning_rate": 0.000498813228929097, + "loss": 0.2437, + "step": 22570 + }, + { + "epoch": 0.94, + "grad_norm": 0.7890625, + "learning_rate": 0.0004988121732154689, + "loss": 0.2297, + "step": 22580 + }, + { + "epoch": 0.94, + "grad_norm": 0.478515625, + "learning_rate": 0.0004988111170336032, + "loss": 0.249, + "step": 22590 + }, + { + "epoch": 0.94, + "grad_norm": 2.0, + "learning_rate": 0.0004988100603835019, + "loss": 0.249, + "step": 22600 + }, + { + "epoch": 0.94, + "grad_norm": 0.625, + "learning_rate": 0.0004988090032651669, + "loss": 0.3069, + "step": 22610 + }, + { + "epoch": 0.94, + "grad_norm": 0.72265625, + "learning_rate": 0.0004988079456786003, + "loss": 0.2585, + "step": 22620 + }, + { + "epoch": 0.94, + "grad_norm": 1.28125, + "learning_rate": 0.0004988068876238039, + "loss": 0.261, + "step": 22630 + }, + { + "epoch": 0.94, + "grad_norm": 0.50390625, + "learning_rate": 0.0004988058291007798, + "loss": 0.2032, + "step": 22640 + }, + { + "epoch": 0.94, + "grad_norm": 1.625, + "learning_rate": 0.0004988047701095301, + "loss": 0.1875, + "step": 22650 + }, + { + "epoch": 0.94, + "grad_norm": 0.94921875, + "learning_rate": 0.0004988037106500567, + "loss": 0.3067, + "step": 22660 + }, + { + "epoch": 0.94, + "grad_norm": 0.439453125, + "learning_rate": 0.0004988026507223615, + "loss": 0.1534, + "step": 22670 + }, + { + "epoch": 0.94, + "grad_norm": 0.52734375, + "learning_rate": 0.0004988015903264466, + "loss": 0.23, + "step": 22680 + }, + { + "epoch": 0.94, + "grad_norm": 0.255859375, + "learning_rate": 0.000498800529462314, + "loss": 0.2227, + "step": 22690 + }, + { + "epoch": 0.94, + "grad_norm": 0.447265625, + "learning_rate": 0.0004987994681299656, + "loss": 0.2536, + "step": 22700 + }, + { + "epoch": 0.94, + "grad_norm": 0.3671875, + "learning_rate": 0.0004987984063294036, + "loss": 0.2128, + "step": 22710 + }, + { + "epoch": 0.94, + "grad_norm": 0.80078125, + "learning_rate": 0.0004987973440606299, + "loss": 0.2331, + "step": 22720 + }, + { + "epoch": 0.94, + "grad_norm": 0.54296875, + "learning_rate": 0.0004987962813236463, + "loss": 0.2089, + "step": 22730 + }, + { + "epoch": 0.94, + "grad_norm": 0.62890625, + "learning_rate": 0.000498795218118455, + "loss": 0.2931, + "step": 22740 + }, + { + "epoch": 0.94, + "grad_norm": 0.59765625, + "learning_rate": 0.000498794154445058, + "loss": 0.2273, + "step": 22750 + }, + { + "epoch": 0.94, + "grad_norm": 0.431640625, + "learning_rate": 0.0004987930903034572, + "loss": 0.2744, + "step": 22760 + }, + { + "epoch": 0.94, + "grad_norm": 0.291015625, + "learning_rate": 0.0004987920256936547, + "loss": 0.239, + "step": 22770 + }, + { + "epoch": 0.94, + "grad_norm": 0.61328125, + "learning_rate": 0.0004987909606156526, + "loss": 0.2316, + "step": 22780 + }, + { + "epoch": 0.94, + "grad_norm": 0.390625, + "learning_rate": 0.0004987898950694527, + "loss": 0.2124, + "step": 22790 + }, + { + "epoch": 0.94, + "grad_norm": 0.0, + "learning_rate": 0.000498788829055057, + "loss": 0.2282, + "step": 22800 + }, + { + "epoch": 0.94, + "grad_norm": 1.0078125, + "learning_rate": 0.0004987877625724677, + "loss": 0.2254, + "step": 22810 + }, + { + "epoch": 0.95, + "grad_norm": 0.79296875, + "learning_rate": 0.0004987866956216866, + "loss": 0.2256, + "step": 22820 + }, + { + "epoch": 0.95, + "grad_norm": 0.63671875, + "learning_rate": 0.0004987856282027159, + "loss": 0.2298, + "step": 22830 + }, + { + "epoch": 0.95, + "grad_norm": 0.26953125, + "learning_rate": 0.0004987845603155576, + "loss": 0.2776, + "step": 22840 + }, + { + "epoch": 0.95, + "grad_norm": 0.23828125, + "learning_rate": 0.0004987834919602135, + "loss": 0.2323, + "step": 22850 + }, + { + "epoch": 0.95, + "grad_norm": 0.8046875, + "learning_rate": 0.0004987824231366857, + "loss": 0.227, + "step": 22860 + }, + { + "epoch": 0.95, + "grad_norm": 1.0703125, + "learning_rate": 0.0004987813538449762, + "loss": 0.2821, + "step": 22870 + }, + { + "epoch": 0.95, + "grad_norm": 0.65234375, + "learning_rate": 0.0004987802840850873, + "loss": 0.2506, + "step": 22880 + }, + { + "epoch": 0.95, + "grad_norm": 0.703125, + "learning_rate": 0.0004987792138570205, + "loss": 0.2492, + "step": 22890 + }, + { + "epoch": 0.95, + "grad_norm": 1.734375, + "learning_rate": 0.0004987781431607782, + "loss": 0.2658, + "step": 22900 + }, + { + "epoch": 0.95, + "grad_norm": 0.75390625, + "learning_rate": 0.0004987770719963624, + "loss": 0.2242, + "step": 22910 + }, + { + "epoch": 0.95, + "grad_norm": 0.6171875, + "learning_rate": 0.0004987760003637748, + "loss": 0.271, + "step": 22920 + }, + { + "epoch": 0.95, + "grad_norm": 1.65625, + "learning_rate": 0.0004987749282630178, + "loss": 0.2293, + "step": 22930 + }, + { + "epoch": 0.95, + "grad_norm": 0.625, + "learning_rate": 0.0004987738556940932, + "loss": 0.2205, + "step": 22940 + }, + { + "epoch": 0.95, + "grad_norm": 0.73828125, + "learning_rate": 0.0004987727826570031, + "loss": 0.2698, + "step": 22950 + }, + { + "epoch": 0.95, + "grad_norm": 0.44921875, + "learning_rate": 0.0004987717091517494, + "loss": 0.2448, + "step": 22960 + }, + { + "epoch": 0.95, + "grad_norm": 0.6328125, + "learning_rate": 0.0004987706351783342, + "loss": 0.2364, + "step": 22970 + }, + { + "epoch": 0.95, + "grad_norm": 0.68359375, + "learning_rate": 0.0004987695607367597, + "loss": 0.2803, + "step": 22980 + }, + { + "epoch": 0.95, + "grad_norm": 0.8046875, + "learning_rate": 0.0004987684858270276, + "loss": 0.2736, + "step": 22990 + }, + { + "epoch": 0.95, + "grad_norm": 0.53515625, + "learning_rate": 0.0004987674104491402, + "loss": 0.2311, + "step": 23000 + }, + { + "epoch": 0.95, + "grad_norm": 1.8515625, + "learning_rate": 0.0004987663346030992, + "loss": 0.2691, + "step": 23010 + }, + { + "epoch": 0.95, + "grad_norm": 0.66796875, + "learning_rate": 0.000498765258288907, + "loss": 0.2785, + "step": 23020 + }, + { + "epoch": 0.95, + "grad_norm": 0.796875, + "learning_rate": 0.0004987641815065654, + "loss": 0.243, + "step": 23030 + }, + { + "epoch": 0.95, + "grad_norm": 0.470703125, + "learning_rate": 0.0004987631042560765, + "loss": 0.2225, + "step": 23040 + }, + { + "epoch": 0.95, + "grad_norm": 1.0859375, + "learning_rate": 0.0004987620265374422, + "loss": 0.245, + "step": 23050 + }, + { + "epoch": 0.96, + "grad_norm": 1.5078125, + "learning_rate": 0.0004987609483506647, + "loss": 0.2737, + "step": 23060 + }, + { + "epoch": 0.96, + "grad_norm": 0.73828125, + "learning_rate": 0.000498759869695746, + "loss": 0.2513, + "step": 23070 + }, + { + "epoch": 0.96, + "grad_norm": 0.7109375, + "learning_rate": 0.000498758790572688, + "loss": 0.2705, + "step": 23080 + }, + { + "epoch": 0.96, + "grad_norm": 1.0234375, + "learning_rate": 0.0004987577109814929, + "loss": 0.2242, + "step": 23090 + }, + { + "epoch": 0.96, + "grad_norm": 0.53515625, + "learning_rate": 0.0004987566309221627, + "loss": 0.2138, + "step": 23100 + }, + { + "epoch": 0.96, + "grad_norm": 0.310546875, + "learning_rate": 0.0004987555503946992, + "loss": 0.2022, + "step": 23110 + }, + { + "epoch": 0.96, + "grad_norm": 0.62890625, + "learning_rate": 0.0004987544693991048, + "loss": 0.2156, + "step": 23120 + }, + { + "epoch": 0.96, + "grad_norm": 0.5, + "learning_rate": 0.0004987533879353812, + "loss": 0.2547, + "step": 23130 + }, + { + "epoch": 0.96, + "grad_norm": 1.03125, + "learning_rate": 0.0004987523060035307, + "loss": 0.2308, + "step": 23140 + }, + { + "epoch": 0.96, + "grad_norm": 0.59765625, + "learning_rate": 0.000498751223603555, + "loss": 0.2416, + "step": 23150 + }, + { + "epoch": 0.96, + "grad_norm": 0.5625, + "learning_rate": 0.0004987501407354567, + "loss": 0.2232, + "step": 23160 + }, + { + "epoch": 0.96, + "grad_norm": 0.96484375, + "learning_rate": 0.0004987490573992372, + "loss": 0.1981, + "step": 23170 + }, + { + "epoch": 0.96, + "grad_norm": 0.703125, + "learning_rate": 0.000498747973594899, + "loss": 0.2365, + "step": 23180 + }, + { + "epoch": 0.96, + "grad_norm": 0.4609375, + "learning_rate": 0.000498746889322444, + "loss": 0.279, + "step": 23190 + }, + { + "epoch": 0.96, + "grad_norm": 0.96875, + "learning_rate": 0.0004987458045818742, + "loss": 0.2141, + "step": 23200 + }, + { + "epoch": 0.96, + "grad_norm": 0.001708984375, + "learning_rate": 0.0004987447193731916, + "loss": 0.2838, + "step": 23210 + }, + { + "epoch": 0.96, + "grad_norm": 0.8203125, + "learning_rate": 0.0004987436336963983, + "loss": 0.2849, + "step": 23220 + }, + { + "epoch": 0.96, + "grad_norm": 0.48046875, + "learning_rate": 0.0004987425475514964, + "loss": 0.2162, + "step": 23230 + }, + { + "epoch": 0.96, + "grad_norm": 0.671875, + "learning_rate": 0.0004987414609384878, + "loss": 0.2428, + "step": 23240 + }, + { + "epoch": 0.96, + "grad_norm": 0.1923828125, + "learning_rate": 0.0004987403738573746, + "loss": 0.2512, + "step": 23250 + }, + { + "epoch": 0.96, + "grad_norm": 1.0703125, + "learning_rate": 0.0004987392863081591, + "loss": 0.2453, + "step": 23260 + }, + { + "epoch": 0.96, + "grad_norm": 0.287109375, + "learning_rate": 0.000498738198290843, + "loss": 0.2459, + "step": 23270 + }, + { + "epoch": 0.96, + "grad_norm": 0.29296875, + "learning_rate": 0.0004987371098054284, + "loss": 0.2693, + "step": 23280 + }, + { + "epoch": 0.96, + "grad_norm": 0.75, + "learning_rate": 0.0004987360208519175, + "loss": 0.2442, + "step": 23290 + }, + { + "epoch": 0.97, + "grad_norm": 0.41796875, + "learning_rate": 0.0004987349314303122, + "loss": 0.2562, + "step": 23300 + }, + { + "epoch": 0.97, + "grad_norm": 0.578125, + "learning_rate": 0.0004987338415406148, + "loss": 0.2055, + "step": 23310 + }, + { + "epoch": 0.97, + "grad_norm": 0.9609375, + "learning_rate": 0.000498732751182827, + "loss": 0.2026, + "step": 23320 + }, + { + "epoch": 0.97, + "grad_norm": 0.373046875, + "learning_rate": 0.0004987316603569511, + "loss": 0.2181, + "step": 23330 + }, + { + "epoch": 0.97, + "grad_norm": 0.7734375, + "learning_rate": 0.000498730569062989, + "loss": 0.1899, + "step": 23340 + }, + { + "epoch": 0.97, + "grad_norm": 0.69140625, + "learning_rate": 0.0004987294773009429, + "loss": 0.2513, + "step": 23350 + }, + { + "epoch": 0.97, + "grad_norm": 0.267578125, + "learning_rate": 0.0004987283850708148, + "loss": 0.271, + "step": 23360 + }, + { + "epoch": 0.97, + "grad_norm": 1.2578125, + "learning_rate": 0.0004987272923726066, + "loss": 0.232, + "step": 23370 + }, + { + "epoch": 0.97, + "grad_norm": 0.953125, + "learning_rate": 0.0004987261992063206, + "loss": 0.2412, + "step": 23380 + }, + { + "epoch": 0.97, + "grad_norm": 0.2890625, + "learning_rate": 0.0004987251055719587, + "loss": 0.1915, + "step": 23390 + }, + { + "epoch": 0.97, + "grad_norm": 0.74609375, + "learning_rate": 0.0004987240114695231, + "loss": 0.2093, + "step": 23400 + }, + { + "epoch": 0.97, + "grad_norm": 0.55859375, + "learning_rate": 0.0004987229168990158, + "loss": 0.2179, + "step": 23410 + }, + { + "epoch": 0.97, + "grad_norm": 0.8671875, + "learning_rate": 0.0004987218218604387, + "loss": 0.2628, + "step": 23420 + }, + { + "epoch": 0.97, + "grad_norm": 4.59375, + "learning_rate": 0.0004987207263537941, + "loss": 0.2355, + "step": 23430 + }, + { + "epoch": 0.97, + "grad_norm": 1.71875, + "learning_rate": 0.0004987196303790839, + "loss": 0.2455, + "step": 23440 + }, + { + "epoch": 0.97, + "grad_norm": 0.86328125, + "learning_rate": 0.0004987185339363102, + "loss": 0.251, + "step": 23450 + }, + { + "epoch": 0.97, + "grad_norm": 1.5625, + "learning_rate": 0.000498717437025475, + "loss": 0.2525, + "step": 23460 + }, + { + "epoch": 0.97, + "grad_norm": 0.384765625, + "learning_rate": 0.0004987163396465806, + "loss": 0.24, + "step": 23470 + }, + { + "epoch": 0.97, + "grad_norm": 0.86328125, + "learning_rate": 0.0004987152417996289, + "loss": 0.1614, + "step": 23480 + }, + { + "epoch": 0.97, + "grad_norm": 1.2734375, + "learning_rate": 0.0004987141434846219, + "loss": 0.2353, + "step": 23490 + }, + { + "epoch": 0.97, + "grad_norm": 0.21484375, + "learning_rate": 0.0004987130447015618, + "loss": 0.2821, + "step": 23500 + }, + { + "epoch": 0.97, + "grad_norm": 0.4921875, + "learning_rate": 0.0004987119454504506, + "loss": 0.2933, + "step": 23510 + }, + { + "epoch": 0.97, + "grad_norm": 0.267578125, + "learning_rate": 0.0004987108457312902, + "loss": 0.2002, + "step": 23520 + }, + { + "epoch": 0.97, + "grad_norm": 0.95703125, + "learning_rate": 0.000498709745544083, + "loss": 0.2254, + "step": 23530 + }, + { + "epoch": 0.98, + "grad_norm": 1.40625, + "learning_rate": 0.000498708644888831, + "loss": 0.2543, + "step": 23540 + }, + { + "epoch": 0.98, + "grad_norm": 0.8671875, + "learning_rate": 0.0004987075437655361, + "loss": 0.2748, + "step": 23550 + }, + { + "epoch": 0.98, + "grad_norm": 0.00162506103515625, + "learning_rate": 0.0004987064421742004, + "loss": 0.2203, + "step": 23560 + }, + { + "epoch": 0.98, + "grad_norm": 0.484375, + "learning_rate": 0.0004987053401148261, + "loss": 0.2146, + "step": 23570 + }, + { + "epoch": 0.98, + "grad_norm": 1.15625, + "learning_rate": 0.0004987042375874152, + "loss": 0.2351, + "step": 23580 + }, + { + "epoch": 0.98, + "grad_norm": 0.44921875, + "learning_rate": 0.0004987031345919698, + "loss": 0.2165, + "step": 23590 + }, + { + "epoch": 0.98, + "grad_norm": 1.0625, + "learning_rate": 0.0004987020311284919, + "loss": 0.1965, + "step": 23600 + }, + { + "epoch": 0.98, + "grad_norm": 0.921875, + "learning_rate": 0.0004987009271969837, + "loss": 0.2556, + "step": 23610 + }, + { + "epoch": 0.98, + "grad_norm": 0.4296875, + "learning_rate": 0.0004986998227974472, + "loss": 0.2642, + "step": 23620 + }, + { + "epoch": 0.98, + "grad_norm": 0.56640625, + "learning_rate": 0.0004986987179298844, + "loss": 0.1995, + "step": 23630 + }, + { + "epoch": 0.98, + "grad_norm": 1.046875, + "learning_rate": 0.0004986976125942975, + "loss": 0.2606, + "step": 23640 + }, + { + "epoch": 0.98, + "grad_norm": 0.578125, + "learning_rate": 0.0004986965067906887, + "loss": 0.2867, + "step": 23650 + }, + { + "epoch": 0.98, + "grad_norm": 0.3984375, + "learning_rate": 0.0004986954005190598, + "loss": 0.2742, + "step": 23660 + }, + { + "epoch": 0.98, + "grad_norm": 0.51953125, + "learning_rate": 0.0004986942937794131, + "loss": 0.2334, + "step": 23670 + }, + { + "epoch": 0.98, + "grad_norm": 0.41796875, + "learning_rate": 0.0004986931865717505, + "loss": 0.2529, + "step": 23680 + }, + { + "epoch": 0.98, + "grad_norm": 0.64453125, + "learning_rate": 0.0004986920788960743, + "loss": 0.2189, + "step": 23690 + }, + { + "epoch": 0.98, + "grad_norm": 0.65625, + "learning_rate": 0.0004986909707523863, + "loss": 0.2164, + "step": 23700 + }, + { + "epoch": 0.98, + "grad_norm": 0.65234375, + "learning_rate": 0.0004986898621406889, + "loss": 0.2239, + "step": 23710 + }, + { + "epoch": 0.98, + "grad_norm": 0.6171875, + "learning_rate": 0.000498688753060984, + "loss": 0.2364, + "step": 23720 + }, + { + "epoch": 0.98, + "grad_norm": 0.0, + "learning_rate": 0.0004986876435132736, + "loss": 0.2487, + "step": 23730 + }, + { + "epoch": 0.98, + "grad_norm": 0.59375, + "learning_rate": 0.00049868653349756, + "loss": 0.2025, + "step": 23740 + }, + { + "epoch": 0.98, + "grad_norm": 0.66015625, + "learning_rate": 0.0004986854230138452, + "loss": 0.2527, + "step": 23750 + }, + { + "epoch": 0.98, + "grad_norm": 0.546875, + "learning_rate": 0.0004986843120621312, + "loss": 0.227, + "step": 23760 + }, + { + "epoch": 0.98, + "grad_norm": 0.373046875, + "learning_rate": 0.0004986832006424203, + "loss": 0.2317, + "step": 23770 + }, + { + "epoch": 0.98, + "grad_norm": 0.55859375, + "learning_rate": 0.0004986820887547145, + "loss": 0.2462, + "step": 23780 + }, + { + "epoch": 0.99, + "grad_norm": 0.60546875, + "learning_rate": 0.0004986809763990157, + "loss": 0.2071, + "step": 23790 + }, + { + "epoch": 0.99, + "grad_norm": 0.53125, + "learning_rate": 0.0004986798635753264, + "loss": 0.2287, + "step": 23800 + }, + { + "epoch": 0.99, + "grad_norm": 0.796875, + "learning_rate": 0.0004986787502836483, + "loss": 0.2148, + "step": 23810 + }, + { + "epoch": 0.99, + "grad_norm": 1.125, + "learning_rate": 0.0004986776365239837, + "loss": 0.203, + "step": 23820 + }, + { + "epoch": 0.99, + "grad_norm": 1.2890625, + "learning_rate": 0.0004986765222963345, + "loss": 0.3097, + "step": 23830 + }, + { + "epoch": 0.99, + "grad_norm": 0.9296875, + "learning_rate": 0.000498675407600703, + "loss": 0.228, + "step": 23840 + }, + { + "epoch": 0.99, + "grad_norm": 0.359375, + "learning_rate": 0.0004986742924370914, + "loss": 0.2305, + "step": 23850 + }, + { + "epoch": 0.99, + "grad_norm": 0.78515625, + "learning_rate": 0.0004986731768055015, + "loss": 0.2455, + "step": 23860 + }, + { + "epoch": 0.99, + "grad_norm": 1.0703125, + "learning_rate": 0.0004986720607059355, + "loss": 0.2323, + "step": 23870 + }, + { + "epoch": 0.99, + "grad_norm": 0.5078125, + "learning_rate": 0.0004986709441383956, + "loss": 0.2459, + "step": 23880 + }, + { + "epoch": 0.99, + "grad_norm": 0.3671875, + "learning_rate": 0.0004986698271028839, + "loss": 0.2676, + "step": 23890 + }, + { + "epoch": 0.99, + "grad_norm": 0.427734375, + "learning_rate": 0.0004986687095994023, + "loss": 0.3124, + "step": 23900 + }, + { + "epoch": 0.99, + "grad_norm": 1.4140625, + "learning_rate": 0.0004986675916279532, + "loss": 0.2653, + "step": 23910 + }, + { + "epoch": 0.99, + "grad_norm": 0.255859375, + "learning_rate": 0.0004986664731885384, + "loss": 0.2098, + "step": 23920 + }, + { + "epoch": 0.99, + "grad_norm": 0.6015625, + "learning_rate": 0.0004986653542811602, + "loss": 0.2359, + "step": 23930 + }, + { + "epoch": 0.99, + "grad_norm": 1.8125, + "learning_rate": 0.0004986642349058207, + "loss": 0.1903, + "step": 23940 + }, + { + "epoch": 0.99, + "grad_norm": 0.50390625, + "learning_rate": 0.0004986631150625219, + "loss": 0.2406, + "step": 23950 + }, + { + "epoch": 0.99, + "grad_norm": 0.86328125, + "learning_rate": 0.000498661994751266, + "loss": 0.2797, + "step": 23960 + }, + { + "epoch": 0.99, + "grad_norm": 0.515625, + "learning_rate": 0.000498660873972055, + "loss": 0.2223, + "step": 23970 + }, + { + "epoch": 0.99, + "grad_norm": 0.39453125, + "learning_rate": 0.0004986597527248912, + "loss": 0.2132, + "step": 23980 + }, + { + "epoch": 0.99, + "grad_norm": 0.66015625, + "learning_rate": 0.0004986586310097766, + "loss": 0.2734, + "step": 23990 + }, + { + "epoch": 0.99, + "grad_norm": 1.328125, + "learning_rate": 0.0004986575088267133, + "loss": 0.2672, + "step": 24000 + }, + { + "epoch": 0.99, + "grad_norm": 1.0625, + "learning_rate": 0.0004986563861757034, + "loss": 0.2256, + "step": 24010 + }, + { + "epoch": 0.99, + "grad_norm": 0.474609375, + "learning_rate": 0.000498655263056749, + "loss": 0.2494, + "step": 24020 + }, + { + "epoch": 1.0, + "grad_norm": 1.2578125, + "learning_rate": 0.0004986541394698523, + "loss": 0.2434, + "step": 24030 + }, + { + "epoch": 1.0, + "grad_norm": 0.765625, + "learning_rate": 0.0004986530154150152, + "loss": 0.1956, + "step": 24040 + }, + { + "epoch": 1.0, + "grad_norm": 0.69140625, + "learning_rate": 0.0004986518908922403, + "loss": 0.2116, + "step": 24050 + }, + { + "epoch": 1.0, + "grad_norm": 1.5625, + "learning_rate": 0.0004986507659015292, + "loss": 0.1953, + "step": 24060 + }, + { + "epoch": 1.0, + "grad_norm": 0.7734375, + "learning_rate": 0.0004986496404428842, + "loss": 0.269, + "step": 24070 + }, + { + "epoch": 1.0, + "grad_norm": 0.640625, + "learning_rate": 0.0004986485145163075, + "loss": 0.2372, + "step": 24080 + }, + { + "epoch": 1.0, + "grad_norm": 0.57421875, + "learning_rate": 0.000498647388121801, + "loss": 0.3138, + "step": 24090 + }, + { + "epoch": 1.0, + "grad_norm": 1.8515625, + "learning_rate": 0.0004986462612593671, + "loss": 0.2377, + "step": 24100 + }, + { + "epoch": 1.0, + "grad_norm": 1.2734375, + "learning_rate": 0.0004986451339290077, + "loss": 0.3302, + "step": 24110 + }, + { + "epoch": 1.0, + "grad_norm": 0.388671875, + "learning_rate": 0.0004986440061307251, + "loss": 0.2444, + "step": 24120 + }, + { + "epoch": 1.0, + "grad_norm": 0.70703125, + "learning_rate": 0.0004986428778645212, + "loss": 0.2929, + "step": 24130 + }, + { + "epoch": 1.0, + "grad_norm": 1.171875, + "learning_rate": 0.0004986417491303984, + "loss": 0.2487, + "step": 24140 + }, + { + "epoch": 1.0, + "grad_norm": 0.73046875, + "learning_rate": 0.0004986406199283586, + "loss": 0.2258, + "step": 24150 + }, + { + "epoch": 1.0, + "grad_norm": 0.78515625, + "learning_rate": 0.000498639490258404, + "loss": 0.2111, + "step": 24160 + }, + { + "epoch": 1.0, + "grad_norm": 0.486328125, + "learning_rate": 0.0004986383601205368, + "loss": 0.2215, + "step": 24170 + }, + { + "epoch": 1.0, + "grad_norm": 1.9140625, + "learning_rate": 0.000498637229514759, + "loss": 0.2223, + "step": 24180 + }, + { + "epoch": 1.0, + "grad_norm": 1.125, + "learning_rate": 0.0004986360984410728, + "loss": 0.2282, + "step": 24190 + }, + { + "epoch": 1.0, + "grad_norm": 1.2109375, + "learning_rate": 0.0004986349668994804, + "loss": 0.2304, + "step": 24200 + }, + { + "epoch": 1.0, + "grad_norm": 0.609375, + "learning_rate": 0.0004986338348899837, + "loss": 0.1814, + "step": 24210 + }, + { + "epoch": 1.0, + "grad_norm": 0.57421875, + "learning_rate": 0.0004986327024125851, + "loss": 0.1663, + "step": 24220 + }, + { + "epoch": 1.0, + "grad_norm": 1.2421875, + "learning_rate": 0.0004986315694672865, + "loss": 0.2139, + "step": 24230 + }, + { + "epoch": 1.0, + "grad_norm": 0.5546875, + "learning_rate": 0.0004986304360540901, + "loss": 0.2459, + "step": 24240 + }, + { + "epoch": 1.0, + "grad_norm": 1.0390625, + "learning_rate": 0.0004986293021729982, + "loss": 0.209, + "step": 24250 + }, + { + "epoch": 1.0, + "grad_norm": 0.93359375, + "learning_rate": 0.0004986281678240127, + "loss": 0.2936, + "step": 24260 + }, + { + "epoch": 1.01, + "grad_norm": 0.84765625, + "learning_rate": 0.0004986270330071358, + "loss": 0.2534, + "step": 24270 + }, + { + "epoch": 1.01, + "grad_norm": 0.921875, + "learning_rate": 0.0004986258977223699, + "loss": 0.2146, + "step": 24280 + }, + { + "epoch": 1.01, + "grad_norm": 0.921875, + "learning_rate": 0.0004986247619697167, + "loss": 0.2468, + "step": 24290 + }, + { + "epoch": 1.01, + "grad_norm": 0.322265625, + "learning_rate": 0.0004986236257491786, + "loss": 0.2621, + "step": 24300 + }, + { + "epoch": 1.01, + "grad_norm": 0.466796875, + "learning_rate": 0.0004986224890607577, + "loss": 0.2088, + "step": 24310 + }, + { + "epoch": 1.01, + "grad_norm": 1.171875, + "learning_rate": 0.0004986213519044561, + "loss": 0.2528, + "step": 24320 + }, + { + "epoch": 1.01, + "grad_norm": 1.5390625, + "learning_rate": 0.000498620214280276, + "loss": 0.2117, + "step": 24330 + }, + { + "epoch": 1.01, + "grad_norm": 1.0078125, + "learning_rate": 0.0004986190761882195, + "loss": 0.2141, + "step": 24340 + }, + { + "epoch": 1.01, + "grad_norm": 1.6796875, + "learning_rate": 0.0004986179376282887, + "loss": 0.2227, + "step": 24350 + }, + { + "epoch": 1.01, + "grad_norm": 0.9921875, + "learning_rate": 0.0004986167986004859, + "loss": 0.1563, + "step": 24360 + }, + { + "epoch": 1.01, + "grad_norm": 0.8046875, + "learning_rate": 0.000498615659104813, + "loss": 0.2138, + "step": 24370 + }, + { + "epoch": 1.01, + "grad_norm": 0.390625, + "learning_rate": 0.0004986145191412723, + "loss": 0.2027, + "step": 24380 + }, + { + "epoch": 1.01, + "grad_norm": 0.90234375, + "learning_rate": 0.0004986133787098661, + "loss": 0.2699, + "step": 24390 + }, + { + "epoch": 1.01, + "grad_norm": 1.2734375, + "learning_rate": 0.0004986122378105961, + "loss": 0.2732, + "step": 24400 + }, + { + "epoch": 1.01, + "grad_norm": 0.337890625, + "learning_rate": 0.0004986110964434649, + "loss": 0.3105, + "step": 24410 + }, + { + "epoch": 1.01, + "grad_norm": 0.54296875, + "learning_rate": 0.0004986099546084743, + "loss": 0.2134, + "step": 24420 + }, + { + "epoch": 1.01, + "grad_norm": 0.56640625, + "learning_rate": 0.0004986088123056268, + "loss": 0.2006, + "step": 24430 + }, + { + "epoch": 1.01, + "grad_norm": 0.78125, + "learning_rate": 0.0004986076695349243, + "loss": 0.238, + "step": 24440 + }, + { + "epoch": 1.01, + "grad_norm": 0.4921875, + "learning_rate": 0.000498606526296369, + "loss": 0.2244, + "step": 24450 + }, + { + "epoch": 1.01, + "grad_norm": 0.8828125, + "learning_rate": 0.000498605382589963, + "loss": 0.22, + "step": 24460 + }, + { + "epoch": 1.01, + "grad_norm": 0.82421875, + "learning_rate": 0.0004986042384157087, + "loss": 0.2206, + "step": 24470 + }, + { + "epoch": 1.01, + "grad_norm": 0.54296875, + "learning_rate": 0.0004986030937736079, + "loss": 0.2453, + "step": 24480 + }, + { + "epoch": 1.01, + "grad_norm": 0.94921875, + "learning_rate": 0.0004986019486636629, + "loss": 0.2871, + "step": 24490 + }, + { + "epoch": 1.01, + "grad_norm": 0.7578125, + "learning_rate": 0.000498600803085876, + "loss": 0.2127, + "step": 24500 + }, + { + "epoch": 1.02, + "grad_norm": 0.81640625, + "learning_rate": 0.0004985996570402492, + "loss": 0.2489, + "step": 24510 + }, + { + "epoch": 1.02, + "grad_norm": 1.0859375, + "learning_rate": 0.0004985985105267846, + "loss": 0.1922, + "step": 24520 + }, + { + "epoch": 1.02, + "grad_norm": 1.3125, + "learning_rate": 0.0004985973635454847, + "loss": 0.23, + "step": 24530 + }, + { + "epoch": 1.02, + "grad_norm": 0.80078125, + "learning_rate": 0.0004985962160963512, + "loss": 0.2032, + "step": 24540 + }, + { + "epoch": 1.02, + "grad_norm": 0.47265625, + "learning_rate": 0.0004985950681793865, + "loss": 0.1951, + "step": 24550 + }, + { + "epoch": 1.02, + "grad_norm": 1.6640625, + "learning_rate": 0.0004985939197945927, + "loss": 0.2382, + "step": 24560 + }, + { + "epoch": 1.02, + "grad_norm": 1.0078125, + "learning_rate": 0.000498592770941972, + "loss": 0.1994, + "step": 24570 + }, + { + "epoch": 1.02, + "grad_norm": 0.8125, + "learning_rate": 0.0004985916216215267, + "loss": 0.2084, + "step": 24580 + }, + { + "epoch": 1.02, + "grad_norm": 0.51171875, + "learning_rate": 0.0004985904718332586, + "loss": 0.1868, + "step": 24590 + }, + { + "epoch": 1.02, + "grad_norm": 0.78125, + "learning_rate": 0.0004985893215771701, + "loss": 0.2304, + "step": 24600 + }, + { + "epoch": 1.02, + "grad_norm": 0.271484375, + "learning_rate": 0.0004985881708532635, + "loss": 0.2139, + "step": 24610 + }, + { + "epoch": 1.02, + "grad_norm": 0.58203125, + "learning_rate": 0.0004985870196615406, + "loss": 0.2683, + "step": 24620 + }, + { + "epoch": 1.02, + "grad_norm": 1.0546875, + "learning_rate": 0.000498585868002004, + "loss": 0.3117, + "step": 24630 + }, + { + "epoch": 1.02, + "grad_norm": 0.4140625, + "learning_rate": 0.0004985847158746555, + "loss": 0.285, + "step": 24640 + }, + { + "epoch": 1.02, + "grad_norm": 0.83984375, + "learning_rate": 0.0004985835632794974, + "loss": 0.2129, + "step": 24650 + }, + { + "epoch": 1.02, + "grad_norm": 0.478515625, + "learning_rate": 0.000498582410216532, + "loss": 0.2616, + "step": 24660 + }, + { + "epoch": 1.02, + "grad_norm": 0.474609375, + "learning_rate": 0.0004985812566857612, + "loss": 0.31, + "step": 24670 + }, + { + "epoch": 1.02, + "grad_norm": 0.84375, + "learning_rate": 0.0004985801026871873, + "loss": 0.234, + "step": 24680 + }, + { + "epoch": 1.02, + "grad_norm": 0.765625, + "learning_rate": 0.0004985789482208126, + "loss": 0.2255, + "step": 24690 + }, + { + "epoch": 1.02, + "grad_norm": 0.31640625, + "learning_rate": 0.0004985777932866392, + "loss": 0.2532, + "step": 24700 + }, + { + "epoch": 1.02, + "grad_norm": 0.671875, + "learning_rate": 0.0004985766378846692, + "loss": 0.1965, + "step": 24710 + }, + { + "epoch": 1.02, + "grad_norm": 0.51171875, + "learning_rate": 0.0004985754820149048, + "loss": 0.2427, + "step": 24720 + }, + { + "epoch": 1.02, + "grad_norm": 1.3359375, + "learning_rate": 0.0004985743256773482, + "loss": 0.2719, + "step": 24730 + }, + { + "epoch": 1.02, + "grad_norm": 0.859375, + "learning_rate": 0.0004985731688720014, + "loss": 0.2388, + "step": 24740 + }, + { + "epoch": 1.03, + "grad_norm": 0.55078125, + "learning_rate": 0.0004985720115988669, + "loss": 0.2692, + "step": 24750 + }, + { + "epoch": 1.03, + "grad_norm": 0.33203125, + "learning_rate": 0.0004985708538579467, + "loss": 0.2172, + "step": 24760 + }, + { + "epoch": 1.03, + "grad_norm": 2.078125, + "learning_rate": 0.000498569695649243, + "loss": 0.2383, + "step": 24770 + }, + { + "epoch": 1.03, + "grad_norm": 0.251953125, + "learning_rate": 0.000498568536972758, + "loss": 0.1863, + "step": 24780 + }, + { + "epoch": 1.03, + "grad_norm": 0.578125, + "learning_rate": 0.0004985673778284939, + "loss": 0.1834, + "step": 24790 + }, + { + "epoch": 1.03, + "grad_norm": 1.421875, + "learning_rate": 0.0004985662182164527, + "loss": 0.2113, + "step": 24800 + }, + { + "epoch": 1.03, + "grad_norm": 1.25, + "learning_rate": 0.0004985650581366367, + "loss": 0.2573, + "step": 24810 + }, + { + "epoch": 1.03, + "grad_norm": 0.66015625, + "learning_rate": 0.0004985638975890483, + "loss": 0.2534, + "step": 24820 + }, + { + "epoch": 1.03, + "grad_norm": 0.359375, + "learning_rate": 0.0004985627365736893, + "loss": 0.2591, + "step": 24830 + }, + { + "epoch": 1.03, + "grad_norm": 0.88671875, + "learning_rate": 0.0004985615750905622, + "loss": 0.211, + "step": 24840 + }, + { + "epoch": 1.03, + "grad_norm": 1.0234375, + "learning_rate": 0.000498560413139669, + "loss": 0.2526, + "step": 24850 + }, + { + "epoch": 1.03, + "grad_norm": 0.2578125, + "learning_rate": 0.000498559250721012, + "loss": 0.2631, + "step": 24860 + }, + { + "epoch": 1.03, + "grad_norm": 0.890625, + "learning_rate": 0.0004985580878345932, + "loss": 0.1916, + "step": 24870 + }, + { + "epoch": 1.03, + "grad_norm": 0.6484375, + "learning_rate": 0.000498556924480415, + "loss": 0.3054, + "step": 24880 + }, + { + "epoch": 1.03, + "grad_norm": 0.71875, + "learning_rate": 0.0004985557606584795, + "loss": 0.2222, + "step": 24890 + }, + { + "epoch": 1.03, + "grad_norm": 0.81640625, + "learning_rate": 0.0004985545963687889, + "loss": 0.2524, + "step": 24900 + }, + { + "epoch": 1.03, + "grad_norm": 0.490234375, + "learning_rate": 0.0004985534316113454, + "loss": 0.2549, + "step": 24910 + }, + { + "epoch": 1.03, + "grad_norm": 1.1875, + "learning_rate": 0.0004985522663861513, + "loss": 0.2242, + "step": 24920 + }, + { + "epoch": 1.03, + "grad_norm": 1.109375, + "learning_rate": 0.0004985511006932085, + "loss": 0.2472, + "step": 24930 + }, + { + "epoch": 1.03, + "grad_norm": 0.45703125, + "learning_rate": 0.0004985499345325194, + "loss": 0.2193, + "step": 24940 + }, + { + "epoch": 1.03, + "grad_norm": 0.5390625, + "learning_rate": 0.0004985487679040862, + "loss": 0.2175, + "step": 24950 + }, + { + "epoch": 1.03, + "grad_norm": 0.73828125, + "learning_rate": 0.000498547600807911, + "loss": 0.2424, + "step": 24960 + }, + { + "epoch": 1.03, + "grad_norm": 0.37109375, + "learning_rate": 0.0004985464332439962, + "loss": 0.192, + "step": 24970 + }, + { + "epoch": 1.03, + "grad_norm": 0.796875, + "learning_rate": 0.0004985452652123437, + "loss": 0.2209, + "step": 24980 + }, + { + "epoch": 1.04, + "grad_norm": 0.75390625, + "learning_rate": 0.000498544096712956, + "loss": 0.2286, + "step": 24990 + }, + { + "epoch": 1.04, + "grad_norm": 1.4453125, + "learning_rate": 0.000498542927745835, + "loss": 0.2529, + "step": 25000 + }, + { + "epoch": 1.04, + "grad_norm": 0.55078125, + "learning_rate": 0.0004985417583109831, + "loss": 0.2017, + "step": 25010 + }, + { + "epoch": 1.04, + "grad_norm": 0.859375, + "learning_rate": 0.0004985405884084025, + "loss": 0.2254, + "step": 25020 + }, + { + "epoch": 1.04, + "grad_norm": 0.439453125, + "learning_rate": 0.0004985394180380953, + "loss": 0.2436, + "step": 25030 + }, + { + "epoch": 1.04, + "grad_norm": 0.69140625, + "learning_rate": 0.0004985382472000638, + "loss": 0.2412, + "step": 25040 + }, + { + "epoch": 1.04, + "grad_norm": 0.84765625, + "learning_rate": 0.0004985370758943101, + "loss": 0.2231, + "step": 25050 + }, + { + "epoch": 1.04, + "grad_norm": 0.8359375, + "learning_rate": 0.0004985359041208365, + "loss": 0.2159, + "step": 25060 + }, + { + "epoch": 1.04, + "grad_norm": 2.078125, + "learning_rate": 0.0004985347318796451, + "loss": 0.2401, + "step": 25070 + }, + { + "epoch": 1.04, + "grad_norm": 0.0, + "learning_rate": 0.0004985335591707383, + "loss": 0.2661, + "step": 25080 + }, + { + "epoch": 1.04, + "grad_norm": 0.77734375, + "learning_rate": 0.0004985323859941182, + "loss": 0.2408, + "step": 25090 + }, + { + "epoch": 1.04, + "grad_norm": 1.0, + "learning_rate": 0.0004985312123497868, + "loss": 0.275, + "step": 25100 + }, + { + "epoch": 1.04, + "grad_norm": 0.65625, + "learning_rate": 0.0004985300382377467, + "loss": 0.2611, + "step": 25110 + }, + { + "epoch": 1.04, + "grad_norm": 0.58203125, + "learning_rate": 0.0004985288636579998, + "loss": 0.1907, + "step": 25120 + }, + { + "epoch": 1.04, + "grad_norm": 0.88671875, + "learning_rate": 0.0004985276886105484, + "loss": 0.2402, + "step": 25130 + }, + { + "epoch": 1.04, + "grad_norm": 0.734375, + "learning_rate": 0.0004985265130953947, + "loss": 0.2019, + "step": 25140 + }, + { + "epoch": 1.04, + "grad_norm": 0.546875, + "learning_rate": 0.0004985253371125411, + "loss": 0.2776, + "step": 25150 + }, + { + "epoch": 1.04, + "grad_norm": 1.109375, + "learning_rate": 0.0004985241606619895, + "loss": 0.2922, + "step": 25160 + }, + { + "epoch": 1.04, + "grad_norm": 0.470703125, + "learning_rate": 0.0004985229837437423, + "loss": 0.2887, + "step": 25170 + }, + { + "epoch": 1.04, + "grad_norm": 0.546875, + "learning_rate": 0.0004985218063578017, + "loss": 0.2304, + "step": 25180 + }, + { + "epoch": 1.04, + "grad_norm": 0.298828125, + "learning_rate": 0.00049852062850417, + "loss": 0.2465, + "step": 25190 + }, + { + "epoch": 1.04, + "grad_norm": 1.015625, + "learning_rate": 0.0004985194501828491, + "loss": 0.2214, + "step": 25200 + }, + { + "epoch": 1.04, + "grad_norm": 0.423828125, + "learning_rate": 0.0004985182713938416, + "loss": 0.2468, + "step": 25210 + }, + { + "epoch": 1.04, + "grad_norm": 0.9609375, + "learning_rate": 0.0004985170921371496, + "loss": 0.2467, + "step": 25220 + }, + { + "epoch": 1.05, + "grad_norm": 1.1171875, + "learning_rate": 0.000498515912412775, + "loss": 0.1967, + "step": 25230 + }, + { + "epoch": 1.05, + "grad_norm": 0.3046875, + "learning_rate": 0.0004985147322207205, + "loss": 0.2804, + "step": 25240 + }, + { + "epoch": 1.05, + "grad_norm": 1.4921875, + "learning_rate": 0.0004985135515609881, + "loss": 0.2346, + "step": 25250 + }, + { + "epoch": 1.05, + "grad_norm": 0.435546875, + "learning_rate": 0.00049851237043358, + "loss": 0.2188, + "step": 25260 + }, + { + "epoch": 1.05, + "grad_norm": 0.259765625, + "learning_rate": 0.0004985111888384984, + "loss": 0.2616, + "step": 25270 + }, + { + "epoch": 1.05, + "grad_norm": 0.1171875, + "learning_rate": 0.0004985100067757457, + "loss": 0.2291, + "step": 25280 + }, + { + "epoch": 1.05, + "grad_norm": 2.8125, + "learning_rate": 0.000498508824245324, + "loss": 0.2097, + "step": 25290 + }, + { + "epoch": 1.05, + "grad_norm": 1.046875, + "learning_rate": 0.0004985076412472355, + "loss": 0.2195, + "step": 25300 + }, + { + "epoch": 1.05, + "grad_norm": 0.0, + "learning_rate": 0.0004985064577814824, + "loss": 0.2122, + "step": 25310 + }, + { + "epoch": 1.05, + "grad_norm": 1.0625, + "learning_rate": 0.0004985052738480669, + "loss": 0.2578, + "step": 25320 + }, + { + "epoch": 1.05, + "grad_norm": 0.7109375, + "learning_rate": 0.0004985040894469916, + "loss": 0.2659, + "step": 25330 + }, + { + "epoch": 1.05, + "grad_norm": 0.75, + "learning_rate": 0.0004985029045782582, + "loss": 0.186, + "step": 25340 + }, + { + "epoch": 1.05, + "grad_norm": 0.4375, + "learning_rate": 0.0004985017192418692, + "loss": 0.1916, + "step": 25350 + }, + { + "epoch": 1.05, + "grad_norm": 0.56640625, + "learning_rate": 0.0004985005334378268, + "loss": 0.2396, + "step": 25360 + }, + { + "epoch": 1.05, + "grad_norm": 0.94921875, + "learning_rate": 0.0004984993471661333, + "loss": 0.1813, + "step": 25370 + }, + { + "epoch": 1.05, + "grad_norm": 1.2578125, + "learning_rate": 0.0004984981604267909, + "loss": 0.2819, + "step": 25380 + }, + { + "epoch": 1.05, + "grad_norm": 0.3671875, + "learning_rate": 0.0004984969732198017, + "loss": 0.2509, + "step": 25390 + }, + { + "epoch": 1.05, + "grad_norm": 0.91796875, + "learning_rate": 0.000498495785545168, + "loss": 0.2134, + "step": 25400 + }, + { + "epoch": 1.05, + "grad_norm": 0.55859375, + "learning_rate": 0.0004984945974028922, + "loss": 0.2235, + "step": 25410 + }, + { + "epoch": 1.05, + "grad_norm": 0.40625, + "learning_rate": 0.0004984934087929763, + "loss": 0.2015, + "step": 25420 + }, + { + "epoch": 1.05, + "grad_norm": 0.400390625, + "learning_rate": 0.0004984922197154226, + "loss": 0.2269, + "step": 25430 + }, + { + "epoch": 1.05, + "grad_norm": 0.625, + "learning_rate": 0.0004984910301702335, + "loss": 0.2544, + "step": 25440 + }, + { + "epoch": 1.05, + "grad_norm": 2.3125, + "learning_rate": 0.000498489840157411, + "loss": 0.1949, + "step": 25450 + }, + { + "epoch": 1.05, + "grad_norm": 1.2109375, + "learning_rate": 0.0004984886496769576, + "loss": 0.2898, + "step": 25460 + }, + { + "epoch": 1.05, + "grad_norm": 0.72265625, + "learning_rate": 0.0004984874587288752, + "loss": 0.2528, + "step": 25470 + }, + { + "epoch": 1.06, + "grad_norm": 0.640625, + "learning_rate": 0.0004984862673131664, + "loss": 0.1861, + "step": 25480 + }, + { + "epoch": 1.06, + "grad_norm": 1.0859375, + "learning_rate": 0.0004984850754298333, + "loss": 0.2238, + "step": 25490 + }, + { + "epoch": 1.06, + "grad_norm": 0.490234375, + "learning_rate": 0.000498483883078878, + "loss": 0.2343, + "step": 25500 + }, + { + "epoch": 1.06, + "grad_norm": 0.4453125, + "learning_rate": 0.0004984826902603029, + "loss": 0.271, + "step": 25510 + }, + { + "epoch": 1.06, + "grad_norm": 0.6484375, + "learning_rate": 0.0004984814969741102, + "loss": 0.243, + "step": 25520 + }, + { + "epoch": 1.06, + "grad_norm": 0.5546875, + "learning_rate": 0.0004984803032203022, + "loss": 0.2179, + "step": 25530 + }, + { + "epoch": 1.06, + "grad_norm": 0.73828125, + "learning_rate": 0.0004984791089988811, + "loss": 0.1896, + "step": 25540 + }, + { + "epoch": 1.06, + "grad_norm": 0.53125, + "learning_rate": 0.0004984779143098492, + "loss": 0.2187, + "step": 25550 + }, + { + "epoch": 1.06, + "grad_norm": 0.7734375, + "learning_rate": 0.0004984767191532085, + "loss": 0.2065, + "step": 25560 + }, + { + "epoch": 1.06, + "grad_norm": 0.30859375, + "learning_rate": 0.0004984755235289617, + "loss": 0.2297, + "step": 25570 + }, + { + "epoch": 1.06, + "grad_norm": 0.8671875, + "learning_rate": 0.0004984743274371107, + "loss": 0.2037, + "step": 25580 + }, + { + "epoch": 1.06, + "grad_norm": 0.6640625, + "learning_rate": 0.0004984731308776578, + "loss": 0.2273, + "step": 25590 + }, + { + "epoch": 1.06, + "grad_norm": 0.384765625, + "learning_rate": 0.0004984719338506052, + "loss": 0.2308, + "step": 25600 + }, + { + "epoch": 1.06, + "grad_norm": 0.83203125, + "learning_rate": 0.0004984707363559555, + "loss": 0.293, + "step": 25610 + }, + { + "epoch": 1.06, + "grad_norm": 0.73828125, + "learning_rate": 0.0004984695383937106, + "loss": 0.247, + "step": 25620 + }, + { + "epoch": 1.06, + "grad_norm": 0.94921875, + "learning_rate": 0.0004984683399638729, + "loss": 0.2381, + "step": 25630 + }, + { + "epoch": 1.06, + "grad_norm": 0.78125, + "learning_rate": 0.0004984671410664445, + "loss": 0.254, + "step": 25640 + }, + { + "epoch": 1.06, + "grad_norm": 0.32421875, + "learning_rate": 0.0004984659417014279, + "loss": 0.2633, + "step": 25650 + }, + { + "epoch": 1.06, + "grad_norm": 0.59765625, + "learning_rate": 0.0004984647418688252, + "loss": 0.2071, + "step": 25660 + }, + { + "epoch": 1.06, + "grad_norm": 0.408203125, + "learning_rate": 0.0004984635415686386, + "loss": 0.3069, + "step": 25670 + }, + { + "epoch": 1.06, + "grad_norm": 0.6484375, + "learning_rate": 0.0004984623408008705, + "loss": 0.2815, + "step": 25680 + }, + { + "epoch": 1.06, + "grad_norm": 0.84765625, + "learning_rate": 0.000498461139565523, + "loss": 0.2435, + "step": 25690 + }, + { + "epoch": 1.06, + "grad_norm": 0.42578125, + "learning_rate": 0.0004984599378625987, + "loss": 0.2302, + "step": 25700 + }, + { + "epoch": 1.06, + "grad_norm": 0.65625, + "learning_rate": 0.0004984587356920995, + "loss": 0.2512, + "step": 25710 + }, + { + "epoch": 1.07, + "grad_norm": 0.2265625, + "learning_rate": 0.0004984575330540278, + "loss": 0.2526, + "step": 25720 + }, + { + "epoch": 1.07, + "grad_norm": 0.373046875, + "learning_rate": 0.0004984563299483859, + "loss": 0.2371, + "step": 25730 + }, + { + "epoch": 1.07, + "grad_norm": 1.421875, + "learning_rate": 0.0004984551263751759, + "loss": 0.204, + "step": 25740 + }, + { + "epoch": 1.07, + "grad_norm": 0.240234375, + "learning_rate": 0.0004984539223344002, + "loss": 0.2621, + "step": 25750 + }, + { + "epoch": 1.07, + "grad_norm": 0.828125, + "learning_rate": 0.0004984527178260612, + "loss": 0.2347, + "step": 25760 + }, + { + "epoch": 1.07, + "grad_norm": 0.466796875, + "learning_rate": 0.0004984515128501609, + "loss": 0.2598, + "step": 25770 + }, + { + "epoch": 1.07, + "grad_norm": 0.9921875, + "learning_rate": 0.0004984503074067017, + "loss": 0.2191, + "step": 25780 + }, + { + "epoch": 1.07, + "grad_norm": 1.0546875, + "learning_rate": 0.0004984491014956858, + "loss": 0.2482, + "step": 25790 + }, + { + "epoch": 1.07, + "grad_norm": 1.046875, + "learning_rate": 0.0004984478951171156, + "loss": 0.2523, + "step": 25800 + }, + { + "epoch": 1.07, + "grad_norm": 0.6015625, + "learning_rate": 0.0004984466882709933, + "loss": 0.223, + "step": 25810 + }, + { + "epoch": 1.07, + "grad_norm": 0.5546875, + "learning_rate": 0.0004984454809573211, + "loss": 0.2211, + "step": 25820 + }, + { + "epoch": 1.07, + "grad_norm": 1.8203125, + "learning_rate": 0.0004984442731761013, + "loss": 0.2373, + "step": 25830 + }, + { + "epoch": 1.07, + "grad_norm": 0.71484375, + "learning_rate": 0.0004984430649273363, + "loss": 0.2201, + "step": 25840 + }, + { + "epoch": 1.07, + "grad_norm": 0.9765625, + "learning_rate": 0.0004984418562110283, + "loss": 0.2155, + "step": 25850 + }, + { + "epoch": 1.07, + "grad_norm": 0.578125, + "learning_rate": 0.0004984406470271794, + "loss": 0.2077, + "step": 25860 + }, + { + "epoch": 1.07, + "grad_norm": 0.58203125, + "learning_rate": 0.0004984394373757922, + "loss": 0.2463, + "step": 25870 + }, + { + "epoch": 1.07, + "grad_norm": 0.1943359375, + "learning_rate": 0.0004984382272568687, + "loss": 0.2043, + "step": 25880 + }, + { + "epoch": 1.07, + "grad_norm": 0.396484375, + "learning_rate": 0.0004984370166704114, + "loss": 0.2843, + "step": 25890 + }, + { + "epoch": 1.07, + "grad_norm": 0.53515625, + "learning_rate": 0.0004984358056164223, + "loss": 0.218, + "step": 25900 + }, + { + "epoch": 1.07, + "grad_norm": 0.45703125, + "learning_rate": 0.000498434594094904, + "loss": 0.217, + "step": 25910 + }, + { + "epoch": 1.07, + "grad_norm": 34.5, + "learning_rate": 0.0004984333821058585, + "loss": 0.1736, + "step": 25920 + }, + { + "epoch": 1.07, + "grad_norm": 5.5625, + "learning_rate": 0.0004984321696492883, + "loss": 0.2811, + "step": 25930 + }, + { + "epoch": 1.07, + "grad_norm": 1.015625, + "learning_rate": 0.0004984309567251956, + "loss": 0.2473, + "step": 25940 + }, + { + "epoch": 1.07, + "grad_norm": 0.9921875, + "learning_rate": 0.0004984297433335826, + "loss": 0.2599, + "step": 25950 + }, + { + "epoch": 1.08, + "grad_norm": 0.734375, + "learning_rate": 0.0004984285294744516, + "loss": 0.2219, + "step": 25960 + }, + { + "epoch": 1.08, + "grad_norm": 2.71875, + "learning_rate": 0.0004984273151478051, + "loss": 0.1902, + "step": 25970 + }, + { + "epoch": 1.08, + "grad_norm": 0.58984375, + "learning_rate": 0.0004984261003536452, + "loss": 0.2356, + "step": 25980 + }, + { + "epoch": 1.08, + "grad_norm": 1.390625, + "learning_rate": 0.000498424885091974, + "loss": 0.2375, + "step": 25990 + }, + { + "epoch": 1.08, + "grad_norm": 0.35546875, + "learning_rate": 0.0004984236693627942, + "loss": 0.2711, + "step": 26000 + }, + { + "epoch": 1.08, + "grad_norm": 0.5234375, + "learning_rate": 0.0004984224531661077, + "loss": 0.2149, + "step": 26010 + }, + { + "epoch": 1.08, + "grad_norm": 1.234375, + "learning_rate": 0.0004984212365019171, + "loss": 0.2324, + "step": 26020 + }, + { + "epoch": 1.08, + "grad_norm": 0.4921875, + "learning_rate": 0.0004984200193702246, + "loss": 0.2714, + "step": 26030 + }, + { + "epoch": 1.08, + "grad_norm": 1.8671875, + "learning_rate": 0.0004984188017710323, + "loss": 0.2363, + "step": 26040 + }, + { + "epoch": 1.08, + "grad_norm": 0.5078125, + "learning_rate": 0.0004984175837043428, + "loss": 0.2355, + "step": 26050 + }, + { + "epoch": 1.08, + "grad_norm": 1.734375, + "learning_rate": 0.0004984163651701581, + "loss": 0.2401, + "step": 26060 + }, + { + "epoch": 1.08, + "grad_norm": 0.78125, + "learning_rate": 0.0004984151461684807, + "loss": 0.2344, + "step": 26070 + }, + { + "epoch": 1.08, + "grad_norm": 1.2265625, + "learning_rate": 0.0004984139266993128, + "loss": 0.2555, + "step": 26080 + }, + { + "epoch": 1.08, + "grad_norm": 0.447265625, + "learning_rate": 0.0004984127067626567, + "loss": 0.2222, + "step": 26090 + }, + { + "epoch": 1.08, + "grad_norm": 0.63671875, + "learning_rate": 0.0004984114863585147, + "loss": 0.2663, + "step": 26100 + }, + { + "epoch": 1.08, + "grad_norm": 0.9765625, + "learning_rate": 0.0004984102654868892, + "loss": 0.1978, + "step": 26110 + }, + { + "epoch": 1.08, + "grad_norm": 0.490234375, + "learning_rate": 0.0004984090441477823, + "loss": 0.2236, + "step": 26120 + }, + { + "epoch": 1.08, + "grad_norm": 0.84375, + "learning_rate": 0.0004984078223411965, + "loss": 0.2353, + "step": 26130 + }, + { + "epoch": 1.08, + "grad_norm": 0.57421875, + "learning_rate": 0.000498406600067134, + "loss": 0.2403, + "step": 26140 + }, + { + "epoch": 1.08, + "grad_norm": 0.59765625, + "learning_rate": 0.0004984053773255971, + "loss": 0.2254, + "step": 26150 + }, + { + "epoch": 1.08, + "grad_norm": 0.41796875, + "learning_rate": 0.000498404154116588, + "loss": 0.2294, + "step": 26160 + }, + { + "epoch": 1.08, + "grad_norm": 0.890625, + "learning_rate": 0.0004984029304401092, + "loss": 0.2697, + "step": 26170 + }, + { + "epoch": 1.08, + "grad_norm": 0.68359375, + "learning_rate": 0.0004984017062961629, + "loss": 0.2785, + "step": 26180 + }, + { + "epoch": 1.08, + "grad_norm": 0.796875, + "learning_rate": 0.0004984004816847514, + "loss": 0.2034, + "step": 26190 + }, + { + "epoch": 1.09, + "grad_norm": 0.384765625, + "learning_rate": 0.000498399256605877, + "loss": 0.2074, + "step": 26200 + }, + { + "epoch": 1.09, + "grad_norm": 1.2734375, + "learning_rate": 0.000498398031059542, + "loss": 0.2264, + "step": 26210 + }, + { + "epoch": 1.09, + "grad_norm": 0.54296875, + "learning_rate": 0.0004983968050457488, + "loss": 0.2372, + "step": 26220 + }, + { + "epoch": 1.09, + "grad_norm": 0.287109375, + "learning_rate": 0.0004983955785644997, + "loss": 0.2186, + "step": 26230 + }, + { + "epoch": 1.09, + "grad_norm": 0.3203125, + "learning_rate": 0.0004983943516157968, + "loss": 0.2465, + "step": 26240 + }, + { + "epoch": 1.09, + "grad_norm": 0.53515625, + "learning_rate": 0.0004983931241996426, + "loss": 0.2564, + "step": 26250 + }, + { + "epoch": 1.09, + "grad_norm": 0.53125, + "learning_rate": 0.0004983918963160394, + "loss": 0.2439, + "step": 26260 + }, + { + "epoch": 1.09, + "grad_norm": 1.15625, + "learning_rate": 0.0004983906679649894, + "loss": 0.2192, + "step": 26270 + }, + { + "epoch": 1.09, + "grad_norm": 0.87890625, + "learning_rate": 0.0004983894391464951, + "loss": 0.214, + "step": 26280 + }, + { + "epoch": 1.09, + "grad_norm": 0.80859375, + "learning_rate": 0.0004983882098605586, + "loss": 0.2495, + "step": 26290 + }, + { + "epoch": 1.09, + "grad_norm": 0.0, + "learning_rate": 0.0004983869801071824, + "loss": 0.24, + "step": 26300 + }, + { + "epoch": 1.09, + "grad_norm": 0.68359375, + "learning_rate": 0.0004983857498863687, + "loss": 0.2398, + "step": 26310 + }, + { + "epoch": 1.09, + "grad_norm": 0.94140625, + "learning_rate": 0.0004983845191981198, + "loss": 0.2574, + "step": 26320 + }, + { + "epoch": 1.09, + "grad_norm": 0.73046875, + "learning_rate": 0.0004983832880424381, + "loss": 0.2269, + "step": 26330 + }, + { + "epoch": 1.09, + "grad_norm": 0.65234375, + "learning_rate": 0.0004983820564193259, + "loss": 0.259, + "step": 26340 + }, + { + "epoch": 1.09, + "grad_norm": 0.984375, + "learning_rate": 0.0004983808243287854, + "loss": 0.2154, + "step": 26350 + }, + { + "epoch": 1.09, + "grad_norm": 0.7734375, + "learning_rate": 0.000498379591770819, + "loss": 0.2453, + "step": 26360 + }, + { + "epoch": 1.09, + "grad_norm": 0.44921875, + "learning_rate": 0.0004983783587454291, + "loss": 0.2271, + "step": 26370 + }, + { + "epoch": 1.09, + "grad_norm": 0.83203125, + "learning_rate": 0.000498377125252618, + "loss": 0.2121, + "step": 26380 + }, + { + "epoch": 1.09, + "grad_norm": 0.85546875, + "learning_rate": 0.0004983758912923879, + "loss": 0.3093, + "step": 26390 + }, + { + "epoch": 1.09, + "grad_norm": 1.1953125, + "learning_rate": 0.0004983746568647412, + "loss": 0.1661, + "step": 26400 + }, + { + "epoch": 1.09, + "grad_norm": 1.40625, + "learning_rate": 0.0004983734219696803, + "loss": 0.1865, + "step": 26410 + }, + { + "epoch": 1.09, + "grad_norm": 0.87890625, + "learning_rate": 0.0004983721866072073, + "loss": 0.2314, + "step": 26420 + }, + { + "epoch": 1.09, + "grad_norm": 0.6796875, + "learning_rate": 0.0004983709507773248, + "loss": 0.2095, + "step": 26430 + }, + { + "epoch": 1.1, + "grad_norm": 0.32421875, + "learning_rate": 0.0004983697144800348, + "loss": 0.262, + "step": 26440 + }, + { + "epoch": 1.1, + "grad_norm": 1.4296875, + "learning_rate": 0.00049836847771534, + "loss": 0.2708, + "step": 26450 + }, + { + "epoch": 1.1, + "grad_norm": 0.69921875, + "learning_rate": 0.0004983672404832425, + "loss": 0.2501, + "step": 26460 + }, + { + "epoch": 1.1, + "grad_norm": 0.4375, + "learning_rate": 0.0004983660027837447, + "loss": 0.1925, + "step": 26470 + }, + { + "epoch": 1.1, + "grad_norm": 0.458984375, + "learning_rate": 0.0004983647646168488, + "loss": 0.2765, + "step": 26480 + }, + { + "epoch": 1.1, + "grad_norm": 1.28125, + "learning_rate": 0.0004983635259825574, + "loss": 0.2878, + "step": 26490 + }, + { + "epoch": 1.1, + "grad_norm": 0.46875, + "learning_rate": 0.0004983622868808725, + "loss": 0.2154, + "step": 26500 + }, + { + "epoch": 1.1, + "grad_norm": 0.400390625, + "learning_rate": 0.0004983610473117966, + "loss": 0.2358, + "step": 26510 + }, + { + "epoch": 1.1, + "grad_norm": 0.63671875, + "learning_rate": 0.0004983598072753322, + "loss": 0.2472, + "step": 26520 + }, + { + "epoch": 1.1, + "grad_norm": 0.5390625, + "learning_rate": 0.0004983585667714814, + "loss": 0.2317, + "step": 26530 + }, + { + "epoch": 1.1, + "grad_norm": 0.6328125, + "learning_rate": 0.0004983573258002465, + "loss": 0.2166, + "step": 26540 + }, + { + "epoch": 1.1, + "grad_norm": 0.62890625, + "learning_rate": 0.00049835608436163, + "loss": 0.2662, + "step": 26550 + }, + { + "epoch": 1.1, + "grad_norm": 1.1171875, + "learning_rate": 0.0004983548424556341, + "loss": 0.2778, + "step": 26560 + }, + { + "epoch": 1.1, + "grad_norm": 0.7734375, + "learning_rate": 0.0004983536000822613, + "loss": 0.255, + "step": 26570 + }, + { + "epoch": 1.1, + "grad_norm": 0.7578125, + "learning_rate": 0.0004983523572415138, + "loss": 0.2826, + "step": 26580 + }, + { + "epoch": 1.1, + "grad_norm": 0.69140625, + "learning_rate": 0.000498351113933394, + "loss": 0.224, + "step": 26590 + }, + { + "epoch": 1.1, + "grad_norm": 1.8828125, + "learning_rate": 0.0004983498701579042, + "loss": 0.2482, + "step": 26600 + }, + { + "epoch": 1.1, + "grad_norm": 2.46875, + "learning_rate": 0.0004983486259150468, + "loss": 0.195, + "step": 26610 + }, + { + "epoch": 1.1, + "grad_norm": 0.58203125, + "learning_rate": 0.0004983473812048241, + "loss": 0.2282, + "step": 26620 + }, + { + "epoch": 1.1, + "grad_norm": 0.640625, + "learning_rate": 0.0004983461360272384, + "loss": 0.2456, + "step": 26630 + }, + { + "epoch": 1.1, + "grad_norm": 0.419921875, + "learning_rate": 0.0004983448903822922, + "loss": 0.2869, + "step": 26640 + }, + { + "epoch": 1.1, + "grad_norm": 0.416015625, + "learning_rate": 0.0004983436442699877, + "loss": 0.2751, + "step": 26650 + }, + { + "epoch": 1.1, + "grad_norm": 0.69921875, + "learning_rate": 0.0004983423976903272, + "loss": 0.2094, + "step": 26660 + }, + { + "epoch": 1.1, + "grad_norm": 0.43359375, + "learning_rate": 0.0004983411506433132, + "loss": 0.2628, + "step": 26670 + }, + { + "epoch": 1.11, + "grad_norm": 0.52734375, + "learning_rate": 0.0004983399031289479, + "loss": 0.1648, + "step": 26680 + }, + { + "epoch": 1.11, + "grad_norm": 0.77734375, + "learning_rate": 0.0004983386551472337, + "loss": 0.2174, + "step": 26690 + }, + { + "epoch": 1.11, + "grad_norm": 0.353515625, + "learning_rate": 0.000498337406698173, + "loss": 0.1865, + "step": 26700 + }, + { + "epoch": 1.11, + "grad_norm": 1.34375, + "learning_rate": 0.0004983361577817682, + "loss": 0.2267, + "step": 26710 + }, + { + "epoch": 1.11, + "grad_norm": 0.69140625, + "learning_rate": 0.0004983349083980215, + "loss": 0.2391, + "step": 26720 + }, + { + "epoch": 1.11, + "grad_norm": 0.66015625, + "learning_rate": 0.0004983336585469353, + "loss": 0.2327, + "step": 26730 + }, + { + "epoch": 1.11, + "grad_norm": 0.91796875, + "learning_rate": 0.0004983324082285121, + "loss": 0.1692, + "step": 26740 + }, + { + "epoch": 1.11, + "grad_norm": 0.5234375, + "learning_rate": 0.000498331157442754, + "loss": 0.2429, + "step": 26750 + }, + { + "epoch": 1.11, + "grad_norm": 0.67578125, + "learning_rate": 0.0004983299061896636, + "loss": 0.2452, + "step": 26760 + }, + { + "epoch": 1.11, + "grad_norm": 2.4375, + "learning_rate": 0.000498328654469243, + "loss": 0.2623, + "step": 26770 + }, + { + "epoch": 1.11, + "grad_norm": 0.49609375, + "learning_rate": 0.0004983274022814948, + "loss": 0.2316, + "step": 26780 + }, + { + "epoch": 1.11, + "grad_norm": 1.1328125, + "learning_rate": 0.0004983261496264211, + "loss": 0.249, + "step": 26790 + }, + { + "epoch": 1.11, + "grad_norm": 1.0546875, + "learning_rate": 0.0004983248965040245, + "loss": 0.2533, + "step": 26800 + }, + { + "epoch": 1.11, + "grad_norm": 0.98828125, + "learning_rate": 0.0004983236429143072, + "loss": 0.2216, + "step": 26810 + }, + { + "epoch": 1.11, + "grad_norm": 0.384765625, + "learning_rate": 0.0004983223888572717, + "loss": 0.2477, + "step": 26820 + }, + { + "epoch": 1.11, + "grad_norm": 0.9453125, + "learning_rate": 0.0004983211343329203, + "loss": 0.2172, + "step": 26830 + }, + { + "epoch": 1.11, + "grad_norm": 0.5078125, + "learning_rate": 0.0004983198793412553, + "loss": 0.27, + "step": 26840 + }, + { + "epoch": 1.11, + "grad_norm": 1.28125, + "learning_rate": 0.000498318623882279, + "loss": 0.2004, + "step": 26850 + }, + { + "epoch": 1.11, + "grad_norm": 0.4765625, + "learning_rate": 0.000498317367955994, + "loss": 0.2293, + "step": 26860 + }, + { + "epoch": 1.11, + "grad_norm": 0.57421875, + "learning_rate": 0.0004983161115624025, + "loss": 0.2498, + "step": 26870 + }, + { + "epoch": 1.11, + "grad_norm": 0.470703125, + "learning_rate": 0.0004983148547015069, + "loss": 0.1956, + "step": 26880 + }, + { + "epoch": 1.11, + "grad_norm": 0.6484375, + "learning_rate": 0.0004983135973733096, + "loss": 0.2447, + "step": 26890 + }, + { + "epoch": 1.11, + "grad_norm": 0.7578125, + "learning_rate": 0.0004983123395778128, + "loss": 0.2482, + "step": 26900 + }, + { + "epoch": 1.11, + "grad_norm": 0.82421875, + "learning_rate": 0.000498311081315019, + "loss": 0.2299, + "step": 26910 + }, + { + "epoch": 1.12, + "grad_norm": 0.71875, + "learning_rate": 0.0004983098225849308, + "loss": 0.2294, + "step": 26920 + }, + { + "epoch": 1.12, + "grad_norm": 0.921875, + "learning_rate": 0.0004983085633875501, + "loss": 0.2448, + "step": 26930 + }, + { + "epoch": 1.12, + "grad_norm": 0.8203125, + "learning_rate": 0.0004983073037228794, + "loss": 0.2398, + "step": 26940 + }, + { + "epoch": 1.12, + "grad_norm": 1.1171875, + "learning_rate": 0.0004983060435909215, + "loss": 0.2361, + "step": 26950 + }, + { + "epoch": 1.12, + "grad_norm": 0.56640625, + "learning_rate": 0.0004983047829916781, + "loss": 0.236, + "step": 26960 + }, + { + "epoch": 1.12, + "grad_norm": 1.1484375, + "learning_rate": 0.0004983035219251521, + "loss": 0.2421, + "step": 26970 + }, + { + "epoch": 1.12, + "grad_norm": 1.1640625, + "learning_rate": 0.0004983022603913457, + "loss": 0.2284, + "step": 26980 + }, + { + "epoch": 1.12, + "grad_norm": 0.341796875, + "learning_rate": 0.0004983009983902612, + "loss": 0.1811, + "step": 26990 + }, + { + "epoch": 1.12, + "grad_norm": 0.6484375, + "learning_rate": 0.0004982997359219011, + "loss": 0.2267, + "step": 27000 + }, + { + "epoch": 1.12, + "grad_norm": 0.48828125, + "learning_rate": 0.0004982984729862677, + "loss": 0.2362, + "step": 27010 + }, + { + "epoch": 1.12, + "grad_norm": 0.44140625, + "learning_rate": 0.0004982972095833633, + "loss": 0.3276, + "step": 27020 + }, + { + "epoch": 1.12, + "grad_norm": 0.8203125, + "learning_rate": 0.0004982959457131904, + "loss": 0.1894, + "step": 27030 + }, + { + "epoch": 1.12, + "grad_norm": 1.1484375, + "learning_rate": 0.0004982946813757514, + "loss": 0.1884, + "step": 27040 + }, + { + "epoch": 1.12, + "grad_norm": 0.19921875, + "learning_rate": 0.0004982934165710485, + "loss": 0.1988, + "step": 27050 + }, + { + "epoch": 1.12, + "grad_norm": 0.51171875, + "learning_rate": 0.0004982921512990844, + "loss": 0.1859, + "step": 27060 + }, + { + "epoch": 1.12, + "grad_norm": 2.046875, + "learning_rate": 0.0004982908855598611, + "loss": 0.2269, + "step": 27070 + }, + { + "epoch": 1.12, + "grad_norm": 0.447265625, + "learning_rate": 0.0004982896193533813, + "loss": 0.2167, + "step": 27080 + }, + { + "epoch": 1.12, + "grad_norm": 0.56640625, + "learning_rate": 0.0004982883526796471, + "loss": 0.2252, + "step": 27090 + }, + { + "epoch": 1.12, + "grad_norm": 1.0625, + "learning_rate": 0.0004982870855386612, + "loss": 0.2265, + "step": 27100 + }, + { + "epoch": 1.12, + "grad_norm": 0.55078125, + "learning_rate": 0.0004982858179304258, + "loss": 0.2738, + "step": 27110 + }, + { + "epoch": 1.12, + "grad_norm": 0.62890625, + "learning_rate": 0.0004982845498549432, + "loss": 0.2714, + "step": 27120 + }, + { + "epoch": 1.12, + "grad_norm": 0.54296875, + "learning_rate": 0.0004982832813122159, + "loss": 0.2506, + "step": 27130 + }, + { + "epoch": 1.12, + "grad_norm": 0.83984375, + "learning_rate": 0.0004982820123022463, + "loss": 0.2627, + "step": 27140 + }, + { + "epoch": 1.12, + "grad_norm": 0.9609375, + "learning_rate": 0.0004982807428250368, + "loss": 0.267, + "step": 27150 + }, + { + "epoch": 1.12, + "grad_norm": 0.58984375, + "learning_rate": 0.0004982794728805897, + "loss": 0.2109, + "step": 27160 + }, + { + "epoch": 1.13, + "grad_norm": 1.4453125, + "learning_rate": 0.0004982782024689075, + "loss": 0.2534, + "step": 27170 + }, + { + "epoch": 1.13, + "grad_norm": 0.65234375, + "learning_rate": 0.0004982769315899925, + "loss": 0.2894, + "step": 27180 + }, + { + "epoch": 1.13, + "grad_norm": 0.466796875, + "learning_rate": 0.0004982756602438472, + "loss": 0.2376, + "step": 27190 + }, + { + "epoch": 1.13, + "grad_norm": 0.6171875, + "learning_rate": 0.0004982743884304738, + "loss": 0.1934, + "step": 27200 + }, + { + "epoch": 1.13, + "grad_norm": 1.328125, + "learning_rate": 0.0004982731161498748, + "loss": 0.2285, + "step": 27210 + }, + { + "epoch": 1.13, + "grad_norm": 0.53515625, + "learning_rate": 0.0004982718434020527, + "loss": 0.224, + "step": 27220 + }, + { + "epoch": 1.13, + "grad_norm": 0.390625, + "learning_rate": 0.0004982705701870098, + "loss": 0.2883, + "step": 27230 + }, + { + "epoch": 1.13, + "grad_norm": 0.259765625, + "learning_rate": 0.0004982692965047485, + "loss": 0.1746, + "step": 27240 + }, + { + "epoch": 1.13, + "grad_norm": 0.546875, + "learning_rate": 0.0004982680223552711, + "loss": 0.2221, + "step": 27250 + }, + { + "epoch": 1.13, + "grad_norm": 0.37890625, + "learning_rate": 0.0004982667477385802, + "loss": 0.1898, + "step": 27260 + }, + { + "epoch": 1.13, + "grad_norm": 0.66796875, + "learning_rate": 0.000498265472654678, + "loss": 0.233, + "step": 27270 + }, + { + "epoch": 1.13, + "grad_norm": 1.8828125, + "learning_rate": 0.0004982641971035671, + "loss": 0.2934, + "step": 27280 + }, + { + "epoch": 1.13, + "grad_norm": 0.44921875, + "learning_rate": 0.0004982629210852497, + "loss": 0.2226, + "step": 27290 + }, + { + "epoch": 1.13, + "grad_norm": 0.953125, + "learning_rate": 0.0004982616445997284, + "loss": 0.2761, + "step": 27300 + }, + { + "epoch": 1.13, + "grad_norm": 0.41015625, + "learning_rate": 0.0004982603676470053, + "loss": 0.2155, + "step": 27310 + }, + { + "epoch": 1.13, + "grad_norm": 0.62890625, + "learning_rate": 0.0004982590902270831, + "loss": 0.2101, + "step": 27320 + }, + { + "epoch": 1.13, + "grad_norm": 0.7265625, + "learning_rate": 0.0004982578123399642, + "loss": 0.1988, + "step": 27330 + }, + { + "epoch": 1.13, + "grad_norm": 1.2109375, + "learning_rate": 0.0004982565339856508, + "loss": 0.2135, + "step": 27340 + }, + { + "epoch": 1.13, + "grad_norm": 0.431640625, + "learning_rate": 0.0004982552551641454, + "loss": 0.2179, + "step": 27350 + }, + { + "epoch": 1.13, + "grad_norm": 0.796875, + "learning_rate": 0.0004982539758754505, + "loss": 0.2589, + "step": 27360 + }, + { + "epoch": 1.13, + "grad_norm": 0.490234375, + "learning_rate": 0.0004982526961195684, + "loss": 0.2246, + "step": 27370 + }, + { + "epoch": 1.13, + "grad_norm": 0.62890625, + "learning_rate": 0.0004982514158965015, + "loss": 0.2175, + "step": 27380 + }, + { + "epoch": 1.13, + "grad_norm": 1.125, + "learning_rate": 0.0004982501352062523, + "loss": 0.218, + "step": 27390 + }, + { + "epoch": 1.13, + "grad_norm": 1.0546875, + "learning_rate": 0.000498248854048823, + "loss": 0.2496, + "step": 27400 + }, + { + "epoch": 1.14, + "grad_norm": 1.7578125, + "learning_rate": 0.0004982475724242164, + "loss": 0.3228, + "step": 27410 + }, + { + "epoch": 1.14, + "grad_norm": 0.296875, + "learning_rate": 0.0004982462903324346, + "loss": 0.2298, + "step": 27420 + }, + { + "epoch": 1.14, + "grad_norm": 3.28125, + "learning_rate": 0.00049824500777348, + "loss": 0.2346, + "step": 27430 + }, + { + "epoch": 1.14, + "grad_norm": 0.345703125, + "learning_rate": 0.0004982437247473551, + "loss": 0.1856, + "step": 27440 + }, + { + "epoch": 1.14, + "grad_norm": 0.875, + "learning_rate": 0.0004982424412540623, + "loss": 0.2186, + "step": 27450 + }, + { + "epoch": 1.14, + "grad_norm": 0.359375, + "learning_rate": 0.0004982411572936042, + "loss": 0.2222, + "step": 27460 + }, + { + "epoch": 1.14, + "grad_norm": 1.046875, + "learning_rate": 0.000498239872865983, + "loss": 0.2181, + "step": 27470 + }, + { + "epoch": 1.14, + "grad_norm": 0.64453125, + "learning_rate": 0.000498238587971201, + "loss": 0.1939, + "step": 27480 + }, + { + "epoch": 1.14, + "grad_norm": 0.3984375, + "learning_rate": 0.000498237302609261, + "loss": 0.1706, + "step": 27490 + }, + { + "epoch": 1.14, + "grad_norm": 1.1171875, + "learning_rate": 0.000498236016780165, + "loss": 0.1697, + "step": 27500 + }, + { + "epoch": 1.14, + "grad_norm": 0.8125, + "learning_rate": 0.0004982347304839158, + "loss": 0.2384, + "step": 27510 + }, + { + "epoch": 1.14, + "grad_norm": 0.6796875, + "learning_rate": 0.0004982334437205156, + "loss": 0.2748, + "step": 27520 + }, + { + "epoch": 1.14, + "grad_norm": 0.37109375, + "learning_rate": 0.0004982321564899669, + "loss": 0.2225, + "step": 27530 + }, + { + "epoch": 1.14, + "grad_norm": 0.375, + "learning_rate": 0.000498230868792272, + "loss": 0.2237, + "step": 27540 + }, + { + "epoch": 1.14, + "grad_norm": 0.52734375, + "learning_rate": 0.0004982295806274334, + "loss": 0.202, + "step": 27550 + }, + { + "epoch": 1.14, + "grad_norm": 0.953125, + "learning_rate": 0.0004982282919954536, + "loss": 0.1905, + "step": 27560 + }, + { + "epoch": 1.14, + "grad_norm": 0.44921875, + "learning_rate": 0.0004982270028963349, + "loss": 0.2387, + "step": 27570 + }, + { + "epoch": 1.14, + "grad_norm": 0.7265625, + "learning_rate": 0.0004982257133300798, + "loss": 0.2911, + "step": 27580 + }, + { + "epoch": 1.14, + "grad_norm": 0.478515625, + "learning_rate": 0.0004982244232966908, + "loss": 0.2126, + "step": 27590 + }, + { + "epoch": 1.14, + "grad_norm": 0.60546875, + "learning_rate": 0.0004982231327961701, + "loss": 0.2711, + "step": 27600 + }, + { + "epoch": 1.14, + "grad_norm": 0.76171875, + "learning_rate": 0.0004982218418285204, + "loss": 0.1746, + "step": 27610 + }, + { + "epoch": 1.14, + "grad_norm": 0.56640625, + "learning_rate": 0.000498220550393744, + "loss": 0.2865, + "step": 27620 + }, + { + "epoch": 1.14, + "grad_norm": 0.6953125, + "learning_rate": 0.0004982192584918432, + "loss": 0.2234, + "step": 27630 + }, + { + "epoch": 1.14, + "grad_norm": 1.1484375, + "learning_rate": 0.0004982179661228207, + "loss": 0.2296, + "step": 27640 + }, + { + "epoch": 1.15, + "grad_norm": 2.0, + "learning_rate": 0.0004982166732866788, + "loss": 0.2522, + "step": 27650 + }, + { + "epoch": 1.15, + "grad_norm": 0.87109375, + "learning_rate": 0.0004982153799834198, + "loss": 0.2316, + "step": 27660 + }, + { + "epoch": 1.15, + "grad_norm": 0.462890625, + "learning_rate": 0.0004982140862130463, + "loss": 0.233, + "step": 27670 + }, + { + "epoch": 1.15, + "grad_norm": 0.7578125, + "learning_rate": 0.0004982127919755607, + "loss": 0.2187, + "step": 27680 + }, + { + "epoch": 1.15, + "grad_norm": 0.51953125, + "learning_rate": 0.0004982114972709655, + "loss": 0.2635, + "step": 27690 + }, + { + "epoch": 1.15, + "grad_norm": 0.76953125, + "learning_rate": 0.000498210202099263, + "loss": 0.2341, + "step": 27700 + }, + { + "epoch": 1.15, + "grad_norm": 0.94140625, + "learning_rate": 0.0004982089064604557, + "loss": 0.2396, + "step": 27710 + }, + { + "epoch": 1.15, + "grad_norm": 0.52734375, + "learning_rate": 0.0004982076103545461, + "loss": 0.2598, + "step": 27720 + }, + { + "epoch": 1.15, + "grad_norm": 0.796875, + "learning_rate": 0.0004982063137815365, + "loss": 0.2848, + "step": 27730 + }, + { + "epoch": 1.15, + "grad_norm": 0.41015625, + "learning_rate": 0.0004982050167414295, + "loss": 0.172, + "step": 27740 + }, + { + "epoch": 1.15, + "grad_norm": 0.458984375, + "learning_rate": 0.0004982037192342273, + "loss": 0.1948, + "step": 27750 + }, + { + "epoch": 1.15, + "grad_norm": 0.50390625, + "learning_rate": 0.0004982024212599328, + "loss": 0.2309, + "step": 27760 + }, + { + "epoch": 1.15, + "grad_norm": 0.578125, + "learning_rate": 0.0004982011228185478, + "loss": 0.2205, + "step": 27770 + }, + { + "epoch": 1.15, + "grad_norm": 0.89453125, + "learning_rate": 0.0004981998239100753, + "loss": 0.1954, + "step": 27780 + }, + { + "epoch": 1.15, + "grad_norm": 0.98046875, + "learning_rate": 0.0004981985245345175, + "loss": 0.2255, + "step": 27790 + }, + { + "epoch": 1.15, + "grad_norm": 1.1875, + "learning_rate": 0.0004981972246918769, + "loss": 0.2177, + "step": 27800 + }, + { + "epoch": 1.15, + "grad_norm": 0.6328125, + "learning_rate": 0.0004981959243821559, + "loss": 0.2429, + "step": 27810 + }, + { + "epoch": 1.15, + "grad_norm": 0.73046875, + "learning_rate": 0.0004981946236053569, + "loss": 0.2768, + "step": 27820 + }, + { + "epoch": 1.15, + "grad_norm": 0.69140625, + "learning_rate": 0.0004981933223614825, + "loss": 0.234, + "step": 27830 + }, + { + "epoch": 1.15, + "grad_norm": 0.443359375, + "learning_rate": 0.0004981920206505352, + "loss": 0.2157, + "step": 27840 + }, + { + "epoch": 1.15, + "grad_norm": 0.75, + "learning_rate": 0.0004981907184725172, + "loss": 0.2798, + "step": 27850 + }, + { + "epoch": 1.15, + "grad_norm": 0.375, + "learning_rate": 0.000498189415827431, + "loss": 0.1793, + "step": 27860 + }, + { + "epoch": 1.15, + "grad_norm": 1.03125, + "learning_rate": 0.0004981881127152792, + "loss": 0.2022, + "step": 27870 + }, + { + "epoch": 1.15, + "grad_norm": 0.2294921875, + "learning_rate": 0.0004981868091360641, + "loss": 0.1746, + "step": 27880 + }, + { + "epoch": 1.16, + "grad_norm": 0.48046875, + "learning_rate": 0.0004981855050897883, + "loss": 0.2474, + "step": 27890 + }, + { + "epoch": 1.16, + "grad_norm": 0.484375, + "learning_rate": 0.0004981842005764541, + "loss": 0.2451, + "step": 27900 + }, + { + "epoch": 1.16, + "grad_norm": 0.455078125, + "learning_rate": 0.0004981828955960641, + "loss": 0.2206, + "step": 27910 + }, + { + "epoch": 1.16, + "grad_norm": 1.6875, + "learning_rate": 0.0004981815901486207, + "loss": 0.2313, + "step": 27920 + }, + { + "epoch": 1.16, + "grad_norm": 0.345703125, + "learning_rate": 0.0004981802842341264, + "loss": 0.2276, + "step": 27930 + }, + { + "epoch": 1.16, + "grad_norm": 0.76171875, + "learning_rate": 0.0004981789778525836, + "loss": 0.2152, + "step": 27940 + }, + { + "epoch": 1.16, + "grad_norm": 0.408203125, + "learning_rate": 0.0004981776710039946, + "loss": 0.1877, + "step": 27950 + }, + { + "epoch": 1.16, + "grad_norm": 0.96484375, + "learning_rate": 0.0004981763636883622, + "loss": 0.2116, + "step": 27960 + }, + { + "epoch": 1.16, + "grad_norm": 0.486328125, + "learning_rate": 0.0004981750559056886, + "loss": 0.2487, + "step": 27970 + }, + { + "epoch": 1.16, + "grad_norm": 0.85546875, + "learning_rate": 0.0004981737476559764, + "loss": 0.2205, + "step": 27980 + }, + { + "epoch": 1.16, + "grad_norm": 1.1640625, + "learning_rate": 0.0004981724389392279, + "loss": 0.271, + "step": 27990 + }, + { + "epoch": 1.16, + "grad_norm": 1.4609375, + "learning_rate": 0.0004981711297554458, + "loss": 0.1956, + "step": 28000 + }, + { + "epoch": 1.16, + "grad_norm": 0.578125, + "learning_rate": 0.0004981698201046323, + "loss": 0.1974, + "step": 28010 + }, + { + "epoch": 1.16, + "grad_norm": 0.48828125, + "learning_rate": 0.0004981685099867901, + "loss": 0.2166, + "step": 28020 + }, + { + "epoch": 1.16, + "grad_norm": 0.3515625, + "learning_rate": 0.0004981671994019216, + "loss": 0.1933, + "step": 28030 + }, + { + "epoch": 1.16, + "grad_norm": 0.56640625, + "learning_rate": 0.0004981658883500291, + "loss": 0.2398, + "step": 28040 + }, + { + "epoch": 1.16, + "grad_norm": 0.1904296875, + "learning_rate": 0.0004981645768311153, + "loss": 0.2739, + "step": 28050 + }, + { + "epoch": 1.16, + "grad_norm": 0.7421875, + "learning_rate": 0.0004981632648451825, + "loss": 0.2209, + "step": 28060 + }, + { + "epoch": 1.16, + "grad_norm": 0.6015625, + "learning_rate": 0.0004981619523922332, + "loss": 0.2574, + "step": 28070 + }, + { + "epoch": 1.16, + "grad_norm": 0.369140625, + "learning_rate": 0.00049816063947227, + "loss": 0.2166, + "step": 28080 + }, + { + "epoch": 1.16, + "grad_norm": 0.6328125, + "learning_rate": 0.0004981593260852953, + "loss": 0.2473, + "step": 28090 + }, + { + "epoch": 1.16, + "grad_norm": 0.451171875, + "learning_rate": 0.0004981580122313115, + "loss": 0.2462, + "step": 28100 + }, + { + "epoch": 1.16, + "grad_norm": 0.88671875, + "learning_rate": 0.000498156697910321, + "loss": 0.2027, + "step": 28110 + }, + { + "epoch": 1.16, + "grad_norm": 1.0859375, + "learning_rate": 0.0004981553831223266, + "loss": 0.253, + "step": 28120 + }, + { + "epoch": 1.17, + "grad_norm": 0.859375, + "learning_rate": 0.0004981540678673305, + "loss": 0.2102, + "step": 28130 + }, + { + "epoch": 1.17, + "grad_norm": 0.91015625, + "learning_rate": 0.0004981527521453351, + "loss": 0.2186, + "step": 28140 + }, + { + "epoch": 1.17, + "grad_norm": 0.42578125, + "learning_rate": 0.0004981514359563432, + "loss": 0.2606, + "step": 28150 + }, + { + "epoch": 1.17, + "grad_norm": 0.98046875, + "learning_rate": 0.0004981501193003571, + "loss": 0.2215, + "step": 28160 + }, + { + "epoch": 1.17, + "grad_norm": 0.546875, + "learning_rate": 0.0004981488021773791, + "loss": 0.2298, + "step": 28170 + }, + { + "epoch": 1.17, + "grad_norm": 0.75, + "learning_rate": 0.000498147484587412, + "loss": 0.2383, + "step": 28180 + }, + { + "epoch": 1.17, + "grad_norm": 0.78125, + "learning_rate": 0.000498146166530458, + "loss": 0.2266, + "step": 28190 + }, + { + "epoch": 1.17, + "grad_norm": 1.171875, + "learning_rate": 0.0004981448480065199, + "loss": 0.2783, + "step": 28200 + }, + { + "epoch": 1.17, + "grad_norm": 0.5234375, + "learning_rate": 0.0004981435290155999, + "loss": 0.2549, + "step": 28210 + }, + { + "epoch": 1.17, + "grad_norm": 1.125, + "learning_rate": 0.0004981422095577005, + "loss": 0.2298, + "step": 28220 + }, + { + "epoch": 1.17, + "grad_norm": 0.373046875, + "learning_rate": 0.0004981408896328244, + "loss": 0.2497, + "step": 28230 + }, + { + "epoch": 1.17, + "grad_norm": 0.79296875, + "learning_rate": 0.0004981395692409739, + "loss": 0.228, + "step": 28240 + }, + { + "epoch": 1.17, + "grad_norm": 0.3125, + "learning_rate": 0.0004981382483821515, + "loss": 0.2216, + "step": 28250 + }, + { + "epoch": 1.17, + "grad_norm": 0.56640625, + "learning_rate": 0.0004981369270563597, + "loss": 0.2908, + "step": 28260 + }, + { + "epoch": 1.17, + "grad_norm": 0.193359375, + "learning_rate": 0.000498135605263601, + "loss": 0.2213, + "step": 28270 + }, + { + "epoch": 1.17, + "grad_norm": 0.7265625, + "learning_rate": 0.0004981342830038778, + "loss": 0.2642, + "step": 28280 + }, + { + "epoch": 1.17, + "grad_norm": 0.439453125, + "learning_rate": 0.0004981329602771928, + "loss": 0.1845, + "step": 28290 + }, + { + "epoch": 1.17, + "grad_norm": 0.578125, + "learning_rate": 0.0004981316370835483, + "loss": 0.2037, + "step": 28300 + }, + { + "epoch": 1.17, + "grad_norm": 0.6015625, + "learning_rate": 0.000498130313422947, + "loss": 0.1987, + "step": 28310 + }, + { + "epoch": 1.17, + "grad_norm": 0.9140625, + "learning_rate": 0.0004981289892953911, + "loss": 0.2535, + "step": 28320 + }, + { + "epoch": 1.17, + "grad_norm": 0.94140625, + "learning_rate": 0.0004981276647008833, + "loss": 0.231, + "step": 28330 + }, + { + "epoch": 1.17, + "grad_norm": 0.80859375, + "learning_rate": 0.000498126339639426, + "loss": 0.2164, + "step": 28340 + }, + { + "epoch": 1.17, + "grad_norm": 0.70703125, + "learning_rate": 0.0004981250141110217, + "loss": 0.2739, + "step": 28350 + }, + { + "epoch": 1.17, + "grad_norm": 1.4296875, + "learning_rate": 0.000498123688115673, + "loss": 0.2009, + "step": 28360 + }, + { + "epoch": 1.18, + "grad_norm": 0.80859375, + "learning_rate": 0.0004981223616533822, + "loss": 0.2212, + "step": 28370 + }, + { + "epoch": 1.18, + "grad_norm": 0.427734375, + "learning_rate": 0.0004981210347241521, + "loss": 0.2718, + "step": 28380 + }, + { + "epoch": 1.18, + "grad_norm": 0.5390625, + "learning_rate": 0.0004981197073279848, + "loss": 0.2101, + "step": 28390 + }, + { + "epoch": 1.18, + "grad_norm": 0.65625, + "learning_rate": 0.0004981183794648831, + "loss": 0.2382, + "step": 28400 + }, + { + "epoch": 1.18, + "grad_norm": 0.53515625, + "learning_rate": 0.0004981170511348494, + "loss": 0.1893, + "step": 28410 + }, + { + "epoch": 1.18, + "grad_norm": 0.5, + "learning_rate": 0.0004981157223378862, + "loss": 0.2185, + "step": 28420 + }, + { + "epoch": 1.18, + "grad_norm": 0.78125, + "learning_rate": 0.000498114393073996, + "loss": 0.2393, + "step": 28430 + }, + { + "epoch": 1.18, + "grad_norm": 0.6484375, + "learning_rate": 0.0004981130633431813, + "loss": 0.1904, + "step": 28440 + }, + { + "epoch": 1.18, + "grad_norm": 0.83984375, + "learning_rate": 0.0004981117331454446, + "loss": 0.2106, + "step": 28450 + }, + { + "epoch": 1.18, + "grad_norm": 0.65625, + "learning_rate": 0.0004981104024807885, + "loss": 0.245, + "step": 28460 + }, + { + "epoch": 1.18, + "grad_norm": 0.7421875, + "learning_rate": 0.0004981090713492152, + "loss": 0.2262, + "step": 28470 + }, + { + "epoch": 1.18, + "grad_norm": 0.443359375, + "learning_rate": 0.0004981077397507276, + "loss": 0.2228, + "step": 28480 + }, + { + "epoch": 1.18, + "grad_norm": 0.546875, + "learning_rate": 0.000498106407685328, + "loss": 0.2363, + "step": 28490 + }, + { + "epoch": 1.18, + "grad_norm": 0.734375, + "learning_rate": 0.000498105075153019, + "loss": 0.2264, + "step": 28500 + }, + { + "epoch": 1.18, + "grad_norm": 0.5, + "learning_rate": 0.0004981037421538029, + "loss": 0.2238, + "step": 28510 + }, + { + "epoch": 1.18, + "grad_norm": 0.455078125, + "learning_rate": 0.0004981024086876824, + "loss": 0.2203, + "step": 28520 + }, + { + "epoch": 1.18, + "grad_norm": 0.294921875, + "learning_rate": 0.00049810107475466, + "loss": 0.2494, + "step": 28530 + }, + { + "epoch": 1.18, + "grad_norm": 0.859375, + "learning_rate": 0.0004980997403547381, + "loss": 0.2573, + "step": 28540 + }, + { + "epoch": 1.18, + "grad_norm": 0.54296875, + "learning_rate": 0.0004980984054879194, + "loss": 0.249, + "step": 28550 + }, + { + "epoch": 1.18, + "grad_norm": 0.66015625, + "learning_rate": 0.0004980970701542062, + "loss": 0.2303, + "step": 28560 + }, + { + "epoch": 1.18, + "grad_norm": 0.55859375, + "learning_rate": 0.0004980957343536011, + "loss": 0.2076, + "step": 28570 + }, + { + "epoch": 1.18, + "grad_norm": 0.671875, + "learning_rate": 0.0004980943980861066, + "loss": 0.2473, + "step": 28580 + }, + { + "epoch": 1.18, + "grad_norm": 0.87890625, + "learning_rate": 0.0004980930613517254, + "loss": 0.2257, + "step": 28590 + }, + { + "epoch": 1.18, + "grad_norm": 0.73046875, + "learning_rate": 0.0004980917241504596, + "loss": 0.207, + "step": 28600 + }, + { + "epoch": 1.19, + "grad_norm": 0.83984375, + "learning_rate": 0.0004980903864823122, + "loss": 0.2119, + "step": 28610 + }, + { + "epoch": 1.19, + "grad_norm": 0.96484375, + "learning_rate": 0.0004980890483472853, + "loss": 0.2546, + "step": 28620 + }, + { + "epoch": 1.19, + "grad_norm": 0.5703125, + "learning_rate": 0.0004980877097453817, + "loss": 0.2255, + "step": 28630 + }, + { + "epoch": 1.19, + "grad_norm": 0.72265625, + "learning_rate": 0.0004980863706766038, + "loss": 0.25, + "step": 28640 + }, + { + "epoch": 1.19, + "grad_norm": 0.439453125, + "learning_rate": 0.0004980850311409542, + "loss": 0.2986, + "step": 28650 + }, + { + "epoch": 1.19, + "grad_norm": 0.625, + "learning_rate": 0.0004980836911384353, + "loss": 0.242, + "step": 28660 + }, + { + "epoch": 1.19, + "grad_norm": 1.0703125, + "learning_rate": 0.0004980823506690497, + "loss": 0.2655, + "step": 28670 + }, + { + "epoch": 1.19, + "grad_norm": 0.439453125, + "learning_rate": 0.0004980810097327999, + "loss": 0.2492, + "step": 28680 + }, + { + "epoch": 1.19, + "grad_norm": 0.87890625, + "learning_rate": 0.0004980796683296885, + "loss": 0.2119, + "step": 28690 + }, + { + "epoch": 1.19, + "grad_norm": 0.8046875, + "learning_rate": 0.0004980783264597179, + "loss": 0.1917, + "step": 28700 + }, + { + "epoch": 1.19, + "grad_norm": 0.4140625, + "learning_rate": 0.0004980769841228906, + "loss": 0.2613, + "step": 28710 + }, + { + "epoch": 1.19, + "grad_norm": 0.5625, + "learning_rate": 0.0004980756413192093, + "loss": 0.1942, + "step": 28720 + }, + { + "epoch": 1.19, + "grad_norm": 1.7265625, + "learning_rate": 0.0004980742980486763, + "loss": 0.2592, + "step": 28730 + }, + { + "epoch": 1.19, + "grad_norm": 0.68359375, + "learning_rate": 0.0004980729543112944, + "loss": 0.2298, + "step": 28740 + }, + { + "epoch": 1.19, + "grad_norm": 0.546875, + "learning_rate": 0.0004980716101070659, + "loss": 0.2283, + "step": 28750 + }, + { + "epoch": 1.19, + "grad_norm": 2.078125, + "learning_rate": 0.0004980702654359936, + "loss": 0.2228, + "step": 28760 + }, + { + "epoch": 1.19, + "grad_norm": 1.1171875, + "learning_rate": 0.0004980689202980797, + "loss": 0.2853, + "step": 28770 + }, + { + "epoch": 1.19, + "grad_norm": 0.65234375, + "learning_rate": 0.0004980675746933268, + "loss": 0.2371, + "step": 28780 + }, + { + "epoch": 1.19, + "grad_norm": 0.66015625, + "learning_rate": 0.0004980662286217377, + "loss": 0.2193, + "step": 28790 + }, + { + "epoch": 1.19, + "grad_norm": 0.8671875, + "learning_rate": 0.0004980648820833146, + "loss": 0.2839, + "step": 28800 + }, + { + "epoch": 1.19, + "grad_norm": 0.7265625, + "learning_rate": 0.0004980635350780603, + "loss": 0.2333, + "step": 28810 + }, + { + "epoch": 1.19, + "grad_norm": 0.5390625, + "learning_rate": 0.0004980621876059772, + "loss": 0.2467, + "step": 28820 + }, + { + "epoch": 1.19, + "grad_norm": 0.49609375, + "learning_rate": 0.0004980608396670677, + "loss": 0.194, + "step": 28830 + }, + { + "epoch": 1.19, + "grad_norm": 1.125, + "learning_rate": 0.0004980594912613347, + "loss": 0.2031, + "step": 28840 + }, + { + "epoch": 1.19, + "grad_norm": 0.52734375, + "learning_rate": 0.0004980581423887804, + "loss": 0.2147, + "step": 28850 + }, + { + "epoch": 1.2, + "grad_norm": 0.68359375, + "learning_rate": 0.0004980567930494075, + "loss": 0.3, + "step": 28860 + }, + { + "epoch": 1.2, + "grad_norm": 1.1875, + "learning_rate": 0.0004980554432432185, + "loss": 0.287, + "step": 28870 + }, + { + "epoch": 1.2, + "grad_norm": 1.2421875, + "learning_rate": 0.0004980540929702159, + "loss": 0.1967, + "step": 28880 + }, + { + "epoch": 1.2, + "grad_norm": 0.205078125, + "learning_rate": 0.0004980527422304023, + "loss": 0.2325, + "step": 28890 + }, + { + "epoch": 1.2, + "grad_norm": 0.6328125, + "learning_rate": 0.0004980513910237803, + "loss": 0.2311, + "step": 28900 + }, + { + "epoch": 1.2, + "grad_norm": 0.318359375, + "learning_rate": 0.0004980500393503523, + "loss": 0.2961, + "step": 28910 + }, + { + "epoch": 1.2, + "grad_norm": 0.62890625, + "learning_rate": 0.0004980486872101209, + "loss": 0.2118, + "step": 28920 + }, + { + "epoch": 1.2, + "grad_norm": 0.9921875, + "learning_rate": 0.0004980473346030887, + "loss": 0.2, + "step": 28930 + }, + { + "epoch": 1.2, + "grad_norm": 0.51953125, + "learning_rate": 0.0004980459815292582, + "loss": 0.1977, + "step": 28940 + }, + { + "epoch": 1.2, + "grad_norm": 0.59765625, + "learning_rate": 0.0004980446279886319, + "loss": 0.2552, + "step": 28950 + }, + { + "epoch": 1.2, + "grad_norm": 0.298828125, + "learning_rate": 0.0004980432739812125, + "loss": 0.2229, + "step": 28960 + }, + { + "epoch": 1.2, + "grad_norm": 1.1484375, + "learning_rate": 0.0004980419195070023, + "loss": 0.213, + "step": 28970 + }, + { + "epoch": 1.2, + "grad_norm": 0.8828125, + "learning_rate": 0.0004980405645660041, + "loss": 0.2325, + "step": 28980 + }, + { + "epoch": 1.2, + "grad_norm": 0.375, + "learning_rate": 0.0004980392091582203, + "loss": 0.1737, + "step": 28990 + }, + { + "epoch": 1.2, + "grad_norm": 0.59375, + "learning_rate": 0.0004980378532836535, + "loss": 0.2489, + "step": 29000 + }, + { + "epoch": 1.2, + "grad_norm": 0.09326171875, + "learning_rate": 0.0004980364969423063, + "loss": 0.2361, + "step": 29010 + }, + { + "epoch": 1.2, + "grad_norm": 0.890625, + "learning_rate": 0.0004980351401341811, + "loss": 0.2159, + "step": 29020 + }, + { + "epoch": 1.2, + "grad_norm": 1.0546875, + "learning_rate": 0.0004980337828592807, + "loss": 0.2182, + "step": 29030 + }, + { + "epoch": 1.2, + "grad_norm": 0.82421875, + "learning_rate": 0.0004980324251176074, + "loss": 0.202, + "step": 29040 + }, + { + "epoch": 1.2, + "grad_norm": 1.2265625, + "learning_rate": 0.0004980310669091639, + "loss": 0.2665, + "step": 29050 + }, + { + "epoch": 1.2, + "grad_norm": 1.75, + "learning_rate": 0.0004980297082339527, + "loss": 0.2497, + "step": 29060 + }, + { + "epoch": 1.2, + "grad_norm": 1.2578125, + "learning_rate": 0.0004980283490919763, + "loss": 0.2067, + "step": 29070 + }, + { + "epoch": 1.2, + "grad_norm": 0.404296875, + "learning_rate": 0.0004980269894832375, + "loss": 0.2599, + "step": 29080 + }, + { + "epoch": 1.2, + "grad_norm": 0.8125, + "learning_rate": 0.0004980256294077385, + "loss": 0.2182, + "step": 29090 + }, + { + "epoch": 1.21, + "grad_norm": 1.1875, + "learning_rate": 0.0004980242688654821, + "loss": 0.2453, + "step": 29100 + }, + { + "epoch": 1.21, + "grad_norm": 0.375, + "learning_rate": 0.0004980229078564709, + "loss": 0.2369, + "step": 29110 + }, + { + "epoch": 1.21, + "grad_norm": 0.828125, + "learning_rate": 0.0004980215463807072, + "loss": 0.2605, + "step": 29120 + }, + { + "epoch": 1.21, + "grad_norm": 0.7890625, + "learning_rate": 0.0004980201844381939, + "loss": 0.1893, + "step": 29130 + }, + { + "epoch": 1.21, + "grad_norm": 0.58984375, + "learning_rate": 0.0004980188220289333, + "loss": 0.2269, + "step": 29140 + }, + { + "epoch": 1.21, + "grad_norm": 0.0341796875, + "learning_rate": 0.0004980174591529281, + "loss": 0.2225, + "step": 29150 + }, + { + "epoch": 1.21, + "grad_norm": 0.66015625, + "learning_rate": 0.000498016095810181, + "loss": 0.2793, + "step": 29160 + }, + { + "epoch": 1.21, + "grad_norm": 0.6328125, + "learning_rate": 0.0004980147320006942, + "loss": 0.2693, + "step": 29170 + }, + { + "epoch": 1.21, + "grad_norm": 0.484375, + "learning_rate": 0.0004980133677244705, + "loss": 0.2415, + "step": 29180 + }, + { + "epoch": 1.21, + "grad_norm": 0.8046875, + "learning_rate": 0.0004980120029815124, + "loss": 0.2184, + "step": 29190 + }, + { + "epoch": 1.21, + "grad_norm": 0.546875, + "learning_rate": 0.0004980106377718225, + "loss": 0.1811, + "step": 29200 + }, + { + "epoch": 1.21, + "grad_norm": 0.52734375, + "learning_rate": 0.0004980092720954034, + "loss": 0.2422, + "step": 29210 + }, + { + "epoch": 1.21, + "grad_norm": 0.8671875, + "learning_rate": 0.0004980079059522575, + "loss": 0.2151, + "step": 29220 + }, + { + "epoch": 1.21, + "grad_norm": 0.3671875, + "learning_rate": 0.0004980065393423876, + "loss": 0.2712, + "step": 29230 + }, + { + "epoch": 1.21, + "grad_norm": 0.39453125, + "learning_rate": 0.0004980051722657962, + "loss": 0.2402, + "step": 29240 + }, + { + "epoch": 1.21, + "grad_norm": 0.69140625, + "learning_rate": 0.0004980038047224858, + "loss": 0.2688, + "step": 29250 + }, + { + "epoch": 1.21, + "grad_norm": 0.8515625, + "learning_rate": 0.000498002436712459, + "loss": 0.1967, + "step": 29260 + }, + { + "epoch": 1.21, + "grad_norm": 0.63671875, + "learning_rate": 0.0004980010682357186, + "loss": 0.2869, + "step": 29270 + }, + { + "epoch": 1.21, + "grad_norm": 0.75, + "learning_rate": 0.0004979996992922667, + "loss": 0.2712, + "step": 29280 + }, + { + "epoch": 1.21, + "grad_norm": 1.4765625, + "learning_rate": 0.0004979983298821063, + "loss": 0.2418, + "step": 29290 + }, + { + "epoch": 1.21, + "grad_norm": 0.53125, + "learning_rate": 0.0004979969600052398, + "loss": 0.2228, + "step": 29300 + }, + { + "epoch": 1.21, + "grad_norm": 0.85546875, + "learning_rate": 0.0004979955896616699, + "loss": 0.2276, + "step": 29310 + }, + { + "epoch": 1.21, + "grad_norm": 0.310546875, + "learning_rate": 0.0004979942188513989, + "loss": 0.2477, + "step": 29320 + }, + { + "epoch": 1.21, + "grad_norm": 0.28125, + "learning_rate": 0.0004979928475744296, + "loss": 0.2218, + "step": 29330 + }, + { + "epoch": 1.22, + "grad_norm": 0.416015625, + "learning_rate": 0.0004979914758307646, + "loss": 0.2827, + "step": 29340 + }, + { + "epoch": 1.22, + "grad_norm": 2.0625, + "learning_rate": 0.0004979901036204063, + "loss": 0.2734, + "step": 29350 + }, + { + "epoch": 1.22, + "grad_norm": 0.6328125, + "learning_rate": 0.0004979887309433576, + "loss": 0.2028, + "step": 29360 + }, + { + "epoch": 1.22, + "grad_norm": 0.59375, + "learning_rate": 0.0004979873577996207, + "loss": 0.2287, + "step": 29370 + }, + { + "epoch": 1.22, + "grad_norm": 0.54296875, + "learning_rate": 0.0004979859841891985, + "loss": 0.1985, + "step": 29380 + }, + { + "epoch": 1.22, + "grad_norm": 0.89453125, + "learning_rate": 0.0004979846101120935, + "loss": 0.2328, + "step": 29390 + }, + { + "epoch": 1.22, + "grad_norm": 0.2890625, + "learning_rate": 0.0004979832355683082, + "loss": 0.1834, + "step": 29400 + }, + { + "epoch": 1.22, + "grad_norm": 0.65234375, + "learning_rate": 0.0004979818605578451, + "loss": 0.223, + "step": 29410 + }, + { + "epoch": 1.22, + "grad_norm": 1.2265625, + "learning_rate": 0.000497980485080707, + "loss": 0.2235, + "step": 29420 + }, + { + "epoch": 1.22, + "grad_norm": 0.24609375, + "learning_rate": 0.0004979791091368965, + "loss": 0.2304, + "step": 29430 + }, + { + "epoch": 1.22, + "grad_norm": 0.484375, + "learning_rate": 0.000497977732726416, + "loss": 0.3291, + "step": 29440 + }, + { + "epoch": 1.22, + "grad_norm": 0.470703125, + "learning_rate": 0.0004979763558492681, + "loss": 0.2457, + "step": 29450 + }, + { + "epoch": 1.22, + "grad_norm": 0.49609375, + "learning_rate": 0.0004979749785054557, + "loss": 0.2418, + "step": 29460 + }, + { + "epoch": 1.22, + "grad_norm": 1.84375, + "learning_rate": 0.000497973600694981, + "loss": 0.2519, + "step": 29470 + }, + { + "epoch": 1.22, + "grad_norm": 1.140625, + "learning_rate": 0.0004979722224178468, + "loss": 0.2319, + "step": 29480 + }, + { + "epoch": 1.22, + "grad_norm": 0.69921875, + "learning_rate": 0.0004979708436740557, + "loss": 0.2939, + "step": 29490 + }, + { + "epoch": 1.22, + "grad_norm": 0.81640625, + "learning_rate": 0.0004979694644636103, + "loss": 0.2502, + "step": 29500 + }, + { + "epoch": 1.22, + "grad_norm": 1.65625, + "learning_rate": 0.000497968084786513, + "loss": 0.2621, + "step": 29510 + }, + { + "epoch": 1.22, + "grad_norm": 3.03125, + "learning_rate": 0.0004979667046427666, + "loss": 0.2467, + "step": 29520 + }, + { + "epoch": 1.22, + "grad_norm": 0.734375, + "learning_rate": 0.0004979653240323736, + "loss": 0.2286, + "step": 29530 + }, + { + "epoch": 1.22, + "grad_norm": 0.84765625, + "learning_rate": 0.0004979639429553367, + "loss": 0.2387, + "step": 29540 + }, + { + "epoch": 1.22, + "grad_norm": 0.7890625, + "learning_rate": 0.0004979625614116584, + "loss": 0.2107, + "step": 29550 + }, + { + "epoch": 1.22, + "grad_norm": 0.8203125, + "learning_rate": 0.0004979611794013414, + "loss": 0.2857, + "step": 29560 + }, + { + "epoch": 1.22, + "grad_norm": 0.91015625, + "learning_rate": 0.0004979597969243882, + "loss": 0.1905, + "step": 29570 + }, + { + "epoch": 1.23, + "grad_norm": 0.734375, + "learning_rate": 0.0004979584139808014, + "loss": 0.2208, + "step": 29580 + }, + { + "epoch": 1.23, + "grad_norm": 1.03125, + "learning_rate": 0.0004979570305705838, + "loss": 0.1802, + "step": 29590 + }, + { + "epoch": 1.23, + "grad_norm": 0.7890625, + "learning_rate": 0.0004979556466937376, + "loss": 0.252, + "step": 29600 + }, + { + "epoch": 1.23, + "grad_norm": 0.796875, + "learning_rate": 0.0004979542623502659, + "loss": 0.2337, + "step": 29610 + }, + { + "epoch": 1.23, + "grad_norm": 0.8828125, + "learning_rate": 0.0004979528775401708, + "loss": 0.2237, + "step": 29620 + }, + { + "epoch": 1.23, + "grad_norm": 1.546875, + "learning_rate": 0.0004979514922634554, + "loss": 0.219, + "step": 29630 + }, + { + "epoch": 1.23, + "grad_norm": 0.5234375, + "learning_rate": 0.0004979501065201219, + "loss": 0.2679, + "step": 29640 + }, + { + "epoch": 1.23, + "grad_norm": 0.357421875, + "learning_rate": 0.0004979487203101732, + "loss": 0.1649, + "step": 29650 + }, + { + "epoch": 1.23, + "grad_norm": 0.75390625, + "learning_rate": 0.0004979473336336116, + "loss": 0.257, + "step": 29660 + }, + { + "epoch": 1.23, + "grad_norm": 0.63671875, + "learning_rate": 0.0004979459464904401, + "loss": 0.2116, + "step": 29670 + }, + { + "epoch": 1.23, + "grad_norm": 1.296875, + "learning_rate": 0.000497944558880661, + "loss": 0.207, + "step": 29680 + }, + { + "epoch": 1.23, + "grad_norm": 0.287109375, + "learning_rate": 0.000497943170804277, + "loss": 0.1607, + "step": 29690 + }, + { + "epoch": 1.23, + "grad_norm": 0.38671875, + "learning_rate": 0.0004979417822612908, + "loss": 0.2356, + "step": 29700 + }, + { + "epoch": 1.23, + "grad_norm": 0.328125, + "learning_rate": 0.000497940393251705, + "loss": 0.2094, + "step": 29710 + }, + { + "epoch": 1.23, + "grad_norm": 0.62109375, + "learning_rate": 0.000497939003775522, + "loss": 0.2159, + "step": 29720 + }, + { + "epoch": 1.23, + "grad_norm": 0.7890625, + "learning_rate": 0.0004979376138327446, + "loss": 0.184, + "step": 29730 + }, + { + "epoch": 1.23, + "grad_norm": 0.67578125, + "learning_rate": 0.0004979362234233754, + "loss": 0.2702, + "step": 29740 + }, + { + "epoch": 1.23, + "grad_norm": 0.61328125, + "learning_rate": 0.000497934832547417, + "loss": 0.2802, + "step": 29750 + }, + { + "epoch": 1.23, + "grad_norm": 0.494140625, + "learning_rate": 0.000497933441204872, + "loss": 0.24, + "step": 29760 + }, + { + "epoch": 1.23, + "grad_norm": 0.474609375, + "learning_rate": 0.0004979320493957431, + "loss": 0.2459, + "step": 29770 + }, + { + "epoch": 1.23, + "grad_norm": 0.333984375, + "learning_rate": 0.0004979306571200328, + "loss": 0.2767, + "step": 29780 + }, + { + "epoch": 1.23, + "grad_norm": 0.5703125, + "learning_rate": 0.0004979292643777439, + "loss": 0.2225, + "step": 29790 + }, + { + "epoch": 1.23, + "grad_norm": 0.828125, + "learning_rate": 0.0004979278711688788, + "loss": 0.2414, + "step": 29800 + }, + { + "epoch": 1.23, + "grad_norm": 0.640625, + "learning_rate": 0.0004979264774934402, + "loss": 0.2055, + "step": 29810 + }, + { + "epoch": 1.24, + "grad_norm": 0.578125, + "learning_rate": 0.0004979250833514308, + "loss": 0.271, + "step": 29820 + }, + { + "epoch": 1.24, + "grad_norm": 0.90234375, + "learning_rate": 0.0004979236887428531, + "loss": 0.2563, + "step": 29830 + }, + { + "epoch": 1.24, + "grad_norm": 0.59765625, + "learning_rate": 0.0004979222936677099, + "loss": 0.2031, + "step": 29840 + }, + { + "epoch": 1.24, + "grad_norm": 0.83203125, + "learning_rate": 0.0004979208981260036, + "loss": 0.213, + "step": 29850 + }, + { + "epoch": 1.24, + "grad_norm": 0.359375, + "learning_rate": 0.0004979195021177369, + "loss": 0.2208, + "step": 29860 + }, + { + "epoch": 1.24, + "grad_norm": 0.68359375, + "learning_rate": 0.0004979181056429126, + "loss": 0.2327, + "step": 29870 + }, + { + "epoch": 1.24, + "grad_norm": 0.2119140625, + "learning_rate": 0.0004979167087015332, + "loss": 0.3257, + "step": 29880 + }, + { + "epoch": 1.24, + "grad_norm": 1.0859375, + "learning_rate": 0.0004979153112936013, + "loss": 0.3074, + "step": 29890 + }, + { + "epoch": 1.24, + "grad_norm": 1.1484375, + "learning_rate": 0.0004979139134191195, + "loss": 0.264, + "step": 29900 + }, + { + "epoch": 1.24, + "grad_norm": 0.859375, + "learning_rate": 0.0004979125150780905, + "loss": 0.2336, + "step": 29910 + }, + { + "epoch": 1.24, + "grad_norm": 1.0703125, + "learning_rate": 0.000497911116270517, + "loss": 0.2675, + "step": 29920 + }, + { + "epoch": 1.24, + "grad_norm": 0.263671875, + "learning_rate": 0.0004979097169964015, + "loss": 0.1898, + "step": 29930 + }, + { + "epoch": 1.24, + "grad_norm": 0.302734375, + "learning_rate": 0.0004979083172557467, + "loss": 0.269, + "step": 29940 + }, + { + "epoch": 1.24, + "grad_norm": 0.462890625, + "learning_rate": 0.0004979069170485551, + "loss": 0.2398, + "step": 29950 + }, + { + "epoch": 1.24, + "grad_norm": 0.55078125, + "learning_rate": 0.0004979055163748296, + "loss": 0.2173, + "step": 29960 + }, + { + "epoch": 1.24, + "grad_norm": 0.421875, + "learning_rate": 0.0004979041152345727, + "loss": 0.2699, + "step": 29970 + }, + { + "epoch": 1.24, + "grad_norm": 0.7890625, + "learning_rate": 0.0004979027136277869, + "loss": 0.214, + "step": 29980 + }, + { + "epoch": 1.24, + "grad_norm": 0.52734375, + "learning_rate": 0.0004979013115544751, + "loss": 0.2756, + "step": 29990 + }, + { + "epoch": 1.24, + "grad_norm": 0.361328125, + "learning_rate": 0.0004978999090146398, + "loss": 0.1941, + "step": 30000 + }, + { + "epoch": 1.24, + "grad_norm": 1.328125, + "learning_rate": 0.0004978985060082835, + "loss": 0.2228, + "step": 30010 + }, + { + "epoch": 1.24, + "grad_norm": 1.375, + "learning_rate": 0.0004978971025354091, + "loss": 0.2229, + "step": 30020 + }, + { + "epoch": 1.24, + "grad_norm": 1.2265625, + "learning_rate": 0.0004978956985960191, + "loss": 0.2719, + "step": 30030 + }, + { + "epoch": 1.24, + "grad_norm": 0.7734375, + "learning_rate": 0.0004978942941901161, + "loss": 0.2044, + "step": 30040 + }, + { + "epoch": 1.24, + "grad_norm": 4.15625, + "learning_rate": 0.0004978928893177029, + "loss": 0.2158, + "step": 30050 + }, + { + "epoch": 1.25, + "grad_norm": 1.5625, + "learning_rate": 0.000497891483978782, + "loss": 0.27, + "step": 30060 + }, + { + "epoch": 1.25, + "grad_norm": 0.70703125, + "learning_rate": 0.0004978900781733561, + "loss": 0.2107, + "step": 30070 + }, + { + "epoch": 1.25, + "grad_norm": 2.0625, + "learning_rate": 0.0004978886719014279, + "loss": 0.2385, + "step": 30080 + }, + { + "epoch": 1.25, + "grad_norm": 0.29296875, + "learning_rate": 0.0004978872651629999, + "loss": 0.2621, + "step": 30090 + }, + { + "epoch": 1.25, + "grad_norm": 0.921875, + "learning_rate": 0.0004978858579580749, + "loss": 0.2922, + "step": 30100 + }, + { + "epoch": 1.25, + "grad_norm": 0.3828125, + "learning_rate": 0.0004978844502866554, + "loss": 0.1946, + "step": 30110 + }, + { + "epoch": 1.25, + "grad_norm": 0.9296875, + "learning_rate": 0.0004978830421487442, + "loss": 0.2284, + "step": 30120 + }, + { + "epoch": 1.25, + "grad_norm": 0.484375, + "learning_rate": 0.0004978816335443439, + "loss": 0.1997, + "step": 30130 + }, + { + "epoch": 1.25, + "grad_norm": 0.4921875, + "learning_rate": 0.0004978802244734572, + "loss": 0.201, + "step": 30140 + }, + { + "epoch": 1.25, + "grad_norm": 1.6328125, + "learning_rate": 0.0004978788149360866, + "loss": 0.1695, + "step": 30150 + }, + { + "epoch": 1.25, + "grad_norm": 0.71484375, + "learning_rate": 0.0004978774049322347, + "loss": 0.2006, + "step": 30160 + }, + { + "epoch": 1.25, + "grad_norm": 0.89453125, + "learning_rate": 0.0004978759944619045, + "loss": 0.2059, + "step": 30170 + }, + { + "epoch": 1.25, + "grad_norm": 1.375, + "learning_rate": 0.0004978745835250984, + "loss": 0.2352, + "step": 30180 + }, + { + "epoch": 1.25, + "grad_norm": 0.4296875, + "learning_rate": 0.000497873172121819, + "loss": 0.2065, + "step": 30190 + }, + { + "epoch": 1.25, + "grad_norm": 0.9453125, + "learning_rate": 0.0004978717602520692, + "loss": 0.2704, + "step": 30200 + }, + { + "epoch": 1.25, + "grad_norm": 0.76171875, + "learning_rate": 0.0004978703479158515, + "loss": 0.2881, + "step": 30210 + }, + { + "epoch": 1.25, + "grad_norm": 1.2421875, + "learning_rate": 0.0004978689351131685, + "loss": 0.2765, + "step": 30220 + }, + { + "epoch": 1.25, + "grad_norm": 0.55078125, + "learning_rate": 0.000497867521844023, + "loss": 0.2729, + "step": 30230 + }, + { + "epoch": 1.25, + "grad_norm": 0.64453125, + "learning_rate": 0.0004978661081084175, + "loss": 0.2388, + "step": 30240 + }, + { + "epoch": 1.25, + "grad_norm": 0.404296875, + "learning_rate": 0.0004978646939063548, + "loss": 0.2523, + "step": 30250 + }, + { + "epoch": 1.25, + "grad_norm": 0.77734375, + "learning_rate": 0.0004978632792378376, + "loss": 0.2598, + "step": 30260 + }, + { + "epoch": 1.25, + "grad_norm": 0.80078125, + "learning_rate": 0.0004978618641028683, + "loss": 0.2673, + "step": 30270 + }, + { + "epoch": 1.25, + "grad_norm": 0.67578125, + "learning_rate": 0.0004978604485014499, + "loss": 0.2065, + "step": 30280 + }, + { + "epoch": 1.25, + "grad_norm": 0.42578125, + "learning_rate": 0.0004978590324335848, + "loss": 0.2479, + "step": 30290 + }, + { + "epoch": 1.26, + "grad_norm": 0.431640625, + "learning_rate": 0.0004978576158992758, + "loss": 0.2577, + "step": 30300 + }, + { + "epoch": 1.26, + "grad_norm": 0.52734375, + "learning_rate": 0.0004978561988985256, + "loss": 0.2384, + "step": 30310 + }, + { + "epoch": 1.26, + "grad_norm": 1.9296875, + "learning_rate": 0.0004978547814313367, + "loss": 0.2085, + "step": 30320 + }, + { + "epoch": 1.26, + "grad_norm": 0.88671875, + "learning_rate": 0.0004978533634977118, + "loss": 0.226, + "step": 30330 + }, + { + "epoch": 1.26, + "grad_norm": 0.62109375, + "learning_rate": 0.0004978519450976538, + "loss": 0.2493, + "step": 30340 + }, + { + "epoch": 1.26, + "grad_norm": 0.54296875, + "learning_rate": 0.0004978505262311651, + "loss": 0.2145, + "step": 30350 + }, + { + "epoch": 1.26, + "grad_norm": 0.56640625, + "learning_rate": 0.0004978491068982486, + "loss": 0.2402, + "step": 30360 + }, + { + "epoch": 1.26, + "grad_norm": 0.400390625, + "learning_rate": 0.0004978476870989067, + "loss": 0.2039, + "step": 30370 + }, + { + "epoch": 1.26, + "grad_norm": 1.53125, + "learning_rate": 0.0004978462668331423, + "loss": 0.2623, + "step": 30380 + }, + { + "epoch": 1.26, + "grad_norm": 0.8359375, + "learning_rate": 0.000497844846100958, + "loss": 0.2162, + "step": 30390 + }, + { + "epoch": 1.26, + "grad_norm": 0.80859375, + "learning_rate": 0.0004978434249023563, + "loss": 0.2081, + "step": 30400 + }, + { + "epoch": 1.26, + "grad_norm": 0.609375, + "learning_rate": 0.0004978420032373403, + "loss": 0.2303, + "step": 30410 + }, + { + "epoch": 1.26, + "grad_norm": 0.63671875, + "learning_rate": 0.0004978405811059122, + "loss": 0.2476, + "step": 30420 + }, + { + "epoch": 1.26, + "grad_norm": 0.51171875, + "learning_rate": 0.0004978391585080751, + "loss": 0.1821, + "step": 30430 + }, + { + "epoch": 1.26, + "grad_norm": 0.65625, + "learning_rate": 0.0004978377354438313, + "loss": 0.264, + "step": 30440 + }, + { + "epoch": 1.26, + "grad_norm": 0.92578125, + "learning_rate": 0.0004978363119131836, + "loss": 0.2086, + "step": 30450 + }, + { + "epoch": 1.26, + "grad_norm": 0.765625, + "learning_rate": 0.0004978348879161349, + "loss": 0.2325, + "step": 30460 + }, + { + "epoch": 1.26, + "grad_norm": 0.85546875, + "learning_rate": 0.0004978334634526876, + "loss": 0.2222, + "step": 30470 + }, + { + "epoch": 1.26, + "grad_norm": 0.640625, + "learning_rate": 0.0004978320385228446, + "loss": 0.2264, + "step": 30480 + }, + { + "epoch": 1.26, + "grad_norm": 0.796875, + "learning_rate": 0.0004978306131266083, + "loss": 0.1996, + "step": 30490 + }, + { + "epoch": 1.26, + "grad_norm": 0.984375, + "learning_rate": 0.0004978291872639816, + "loss": 0.2725, + "step": 30500 + }, + { + "epoch": 1.26, + "grad_norm": 0.921875, + "learning_rate": 0.0004978277609349672, + "loss": 0.2362, + "step": 30510 + }, + { + "epoch": 1.26, + "grad_norm": 0.404296875, + "learning_rate": 0.0004978263341395677, + "loss": 0.2578, + "step": 30520 + }, + { + "epoch": 1.26, + "grad_norm": 1.0390625, + "learning_rate": 0.0004978249068777857, + "loss": 0.2669, + "step": 30530 + }, + { + "epoch": 1.26, + "grad_norm": 1.0703125, + "learning_rate": 0.0004978234791496242, + "loss": 0.2051, + "step": 30540 + }, + { + "epoch": 1.27, + "grad_norm": 0.37890625, + "learning_rate": 0.0004978220509550855, + "loss": 0.2342, + "step": 30550 + }, + { + "epoch": 1.27, + "grad_norm": 0.66015625, + "learning_rate": 0.0004978206222941725, + "loss": 0.2161, + "step": 30560 + }, + { + "epoch": 1.27, + "grad_norm": 1.0078125, + "learning_rate": 0.0004978191931668878, + "loss": 0.1928, + "step": 30570 + }, + { + "epoch": 1.27, + "grad_norm": 0.69140625, + "learning_rate": 0.0004978177635732342, + "loss": 0.225, + "step": 30580 + }, + { + "epoch": 1.27, + "grad_norm": 0.90625, + "learning_rate": 0.0004978163335132143, + "loss": 0.2134, + "step": 30590 + }, + { + "epoch": 1.27, + "grad_norm": 0.671875, + "learning_rate": 0.0004978149029868309, + "loss": 0.2774, + "step": 30600 + }, + { + "epoch": 1.27, + "grad_norm": 1.484375, + "learning_rate": 0.0004978134719940866, + "loss": 0.211, + "step": 30610 + }, + { + "epoch": 1.27, + "grad_norm": 0.734375, + "learning_rate": 0.0004978120405349839, + "loss": 0.181, + "step": 30620 + }, + { + "epoch": 1.27, + "grad_norm": 0.2373046875, + "learning_rate": 0.000497810608609526, + "loss": 0.2473, + "step": 30630 + }, + { + "epoch": 1.27, + "grad_norm": 0.5234375, + "learning_rate": 0.0004978091762177151, + "loss": 0.2203, + "step": 30640 + }, + { + "epoch": 1.27, + "grad_norm": 0.484375, + "learning_rate": 0.0004978077433595542, + "loss": 0.2202, + "step": 30650 + }, + { + "epoch": 1.27, + "grad_norm": 0.423828125, + "learning_rate": 0.0004978063100350459, + "loss": 0.2101, + "step": 30660 + }, + { + "epoch": 1.27, + "grad_norm": 0.86328125, + "learning_rate": 0.0004978048762441928, + "loss": 0.2421, + "step": 30670 + }, + { + "epoch": 1.27, + "grad_norm": 0.68359375, + "learning_rate": 0.0004978034419869977, + "loss": 0.28, + "step": 30680 + }, + { + "epoch": 1.27, + "grad_norm": 0.8046875, + "learning_rate": 0.0004978020072634633, + "loss": 0.254, + "step": 30690 + }, + { + "epoch": 1.27, + "grad_norm": 0.259765625, + "learning_rate": 0.0004978005720735923, + "loss": 0.2182, + "step": 30700 + }, + { + "epoch": 1.27, + "grad_norm": 0.7109375, + "learning_rate": 0.0004977991364173873, + "loss": 0.1637, + "step": 30710 + }, + { + "epoch": 1.27, + "grad_norm": 0.546875, + "learning_rate": 0.0004977977002948512, + "loss": 0.2135, + "step": 30720 + }, + { + "epoch": 1.27, + "grad_norm": 0.57421875, + "learning_rate": 0.0004977962637059865, + "loss": 0.2245, + "step": 30730 + }, + { + "epoch": 1.27, + "grad_norm": 0.72265625, + "learning_rate": 0.000497794826650796, + "loss": 0.2122, + "step": 30740 + }, + { + "epoch": 1.27, + "grad_norm": 0.439453125, + "learning_rate": 0.0004977933891292825, + "loss": 0.2489, + "step": 30750 + }, + { + "epoch": 1.27, + "grad_norm": 0.625, + "learning_rate": 0.0004977919511414485, + "loss": 0.2401, + "step": 30760 + }, + { + "epoch": 1.27, + "grad_norm": 0.458984375, + "learning_rate": 0.0004977905126872968, + "loss": 0.2235, + "step": 30770 + }, + { + "epoch": 1.27, + "grad_norm": 0.41796875, + "learning_rate": 0.0004977890737668301, + "loss": 0.2183, + "step": 30780 + }, + { + "epoch": 1.28, + "grad_norm": 0.369140625, + "learning_rate": 0.0004977876343800513, + "loss": 0.2103, + "step": 30790 + }, + { + "epoch": 1.28, + "grad_norm": 0.515625, + "learning_rate": 0.0004977861945269627, + "loss": 0.2056, + "step": 30800 + }, + { + "epoch": 1.28, + "grad_norm": 0.70703125, + "learning_rate": 0.0004977847542075674, + "loss": 0.1752, + "step": 30810 + }, + { + "epoch": 1.28, + "grad_norm": 0.5390625, + "learning_rate": 0.0004977833134218679, + "loss": 0.241, + "step": 30820 + }, + { + "epoch": 1.28, + "grad_norm": 0.294921875, + "learning_rate": 0.0004977818721698669, + "loss": 0.2131, + "step": 30830 + }, + { + "epoch": 1.28, + "grad_norm": 0.478515625, + "learning_rate": 0.0004977804304515673, + "loss": 0.2196, + "step": 30840 + }, + { + "epoch": 1.28, + "grad_norm": 1.1484375, + "learning_rate": 0.0004977789882669716, + "loss": 0.2553, + "step": 30850 + }, + { + "epoch": 1.28, + "grad_norm": 3.5, + "learning_rate": 0.0004977775456160826, + "loss": 0.2252, + "step": 30860 + }, + { + "epoch": 1.28, + "grad_norm": 0.5234375, + "learning_rate": 0.000497776102498903, + "loss": 0.2233, + "step": 30870 + }, + { + "epoch": 1.28, + "grad_norm": 0.5703125, + "learning_rate": 0.0004977746589154357, + "loss": 0.2417, + "step": 30880 + }, + { + "epoch": 1.28, + "grad_norm": 1.0546875, + "learning_rate": 0.000497773214865683, + "loss": 0.1938, + "step": 30890 + }, + { + "epoch": 1.28, + "grad_norm": 0.6171875, + "learning_rate": 0.0004977717703496479, + "loss": 0.2588, + "step": 30900 + }, + { + "epoch": 1.28, + "grad_norm": 0.79296875, + "learning_rate": 0.0004977703253673333, + "loss": 0.2528, + "step": 30910 + }, + { + "epoch": 1.28, + "grad_norm": 1.3125, + "learning_rate": 0.0004977688799187415, + "loss": 0.2798, + "step": 30920 + }, + { + "epoch": 1.28, + "grad_norm": 0.64453125, + "learning_rate": 0.0004977674340038756, + "loss": 0.2145, + "step": 30930 + }, + { + "epoch": 1.28, + "grad_norm": 0.46875, + "learning_rate": 0.000497765987622738, + "loss": 0.2988, + "step": 30940 + }, + { + "epoch": 1.28, + "grad_norm": 0.48046875, + "learning_rate": 0.0004977645407753316, + "loss": 0.2353, + "step": 30950 + }, + { + "epoch": 1.28, + "grad_norm": 0.197265625, + "learning_rate": 0.000497763093461659, + "loss": 0.2396, + "step": 30960 + }, + { + "epoch": 1.28, + "grad_norm": 0.7265625, + "learning_rate": 0.0004977616456817233, + "loss": 0.2279, + "step": 30970 + }, + { + "epoch": 1.28, + "grad_norm": 0.92578125, + "learning_rate": 0.0004977601974355267, + "loss": 0.2165, + "step": 30980 + }, + { + "epoch": 1.28, + "grad_norm": 0.458984375, + "learning_rate": 0.0004977587487230721, + "loss": 0.2786, + "step": 30990 + }, + { + "epoch": 1.28, + "grad_norm": 0.53125, + "learning_rate": 0.0004977572995443624, + "loss": 0.2199, + "step": 31000 + }, + { + "epoch": 1.28, + "grad_norm": 2.0, + "learning_rate": 0.0004977558498994003, + "loss": 0.2207, + "step": 31010 + }, + { + "epoch": 1.28, + "grad_norm": 0.6484375, + "learning_rate": 0.0004977543997881883, + "loss": 0.2411, + "step": 31020 + }, + { + "epoch": 1.29, + "grad_norm": 0.47265625, + "learning_rate": 0.0004977529492107294, + "loss": 0.1872, + "step": 31030 + }, + { + "epoch": 1.29, + "grad_norm": 0.49609375, + "learning_rate": 0.0004977514981670261, + "loss": 0.2231, + "step": 31040 + }, + { + "epoch": 1.29, + "grad_norm": 0.33203125, + "learning_rate": 0.0004977500466570813, + "loss": 0.2462, + "step": 31050 + }, + { + "epoch": 1.29, + "grad_norm": 1.8359375, + "learning_rate": 0.0004977485946808977, + "loss": 0.2055, + "step": 31060 + }, + { + "epoch": 1.29, + "grad_norm": 0.69140625, + "learning_rate": 0.0004977471422384779, + "loss": 0.2724, + "step": 31070 + }, + { + "epoch": 1.29, + "grad_norm": 0.74609375, + "learning_rate": 0.0004977456893298248, + "loss": 0.2735, + "step": 31080 + }, + { + "epoch": 1.29, + "grad_norm": 1.4609375, + "learning_rate": 0.0004977442359549411, + "loss": 0.1824, + "step": 31090 + }, + { + "epoch": 1.29, + "grad_norm": 0.66015625, + "learning_rate": 0.0004977427821138296, + "loss": 0.2358, + "step": 31100 + }, + { + "epoch": 1.29, + "grad_norm": 1.1796875, + "learning_rate": 0.0004977413278064927, + "loss": 0.2254, + "step": 31110 + }, + { + "epoch": 1.29, + "grad_norm": 0.353515625, + "learning_rate": 0.0004977398730329335, + "loss": 0.1862, + "step": 31120 + }, + { + "epoch": 1.29, + "grad_norm": 2.203125, + "learning_rate": 0.0004977384177931545, + "loss": 0.2135, + "step": 31130 + }, + { + "epoch": 1.29, + "grad_norm": 0.64453125, + "learning_rate": 0.0004977369620871587, + "loss": 0.253, + "step": 31140 + }, + { + "epoch": 1.29, + "grad_norm": 1.03125, + "learning_rate": 0.0004977355059149485, + "loss": 0.2312, + "step": 31150 + }, + { + "epoch": 1.29, + "grad_norm": 0.9375, + "learning_rate": 0.000497734049276527, + "loss": 0.3005, + "step": 31160 + }, + { + "epoch": 1.29, + "grad_norm": 0.8046875, + "learning_rate": 0.0004977325921718967, + "loss": 0.2577, + "step": 31170 + }, + { + "epoch": 1.29, + "grad_norm": 0.890625, + "learning_rate": 0.0004977311346010605, + "loss": 0.2061, + "step": 31180 + }, + { + "epoch": 1.29, + "grad_norm": 0.55078125, + "learning_rate": 0.000497729676564021, + "loss": 0.2037, + "step": 31190 + }, + { + "epoch": 1.29, + "grad_norm": 1.1328125, + "learning_rate": 0.0004977282180607809, + "loss": 0.2276, + "step": 31200 + }, + { + "epoch": 1.29, + "grad_norm": 0.283203125, + "learning_rate": 0.0004977267590913433, + "loss": 0.2374, + "step": 31210 + }, + { + "epoch": 1.29, + "grad_norm": 0.75390625, + "learning_rate": 0.0004977252996557105, + "loss": 0.2287, + "step": 31220 + }, + { + "epoch": 1.29, + "grad_norm": 2.40625, + "learning_rate": 0.0004977238397538855, + "loss": 0.2339, + "step": 31230 + }, + { + "epoch": 1.29, + "grad_norm": 0.77734375, + "learning_rate": 0.000497722379385871, + "loss": 0.1991, + "step": 31240 + }, + { + "epoch": 1.29, + "grad_norm": 0.60546875, + "learning_rate": 0.0004977209185516695, + "loss": 0.222, + "step": 31250 + }, + { + "epoch": 1.29, + "grad_norm": 2.078125, + "learning_rate": 0.0004977194572512842, + "loss": 0.2635, + "step": 31260 + }, + { + "epoch": 1.3, + "grad_norm": 1.40625, + "learning_rate": 0.0004977179954847176, + "loss": 0.2397, + "step": 31270 + }, + { + "epoch": 1.3, + "grad_norm": 1.4375, + "learning_rate": 0.0004977165332519726, + "loss": 0.2012, + "step": 31280 + }, + { + "epoch": 1.3, + "grad_norm": 2.140625, + "learning_rate": 0.0004977150705530516, + "loss": 0.2454, + "step": 31290 + }, + { + "epoch": 1.3, + "grad_norm": 0.984375, + "learning_rate": 0.0004977136073879577, + "loss": 0.2254, + "step": 31300 + }, + { + "epoch": 1.3, + "grad_norm": 0.7578125, + "learning_rate": 0.0004977121437566937, + "loss": 0.1934, + "step": 31310 + }, + { + "epoch": 1.3, + "grad_norm": 0.6328125, + "learning_rate": 0.000497710679659262, + "loss": 0.2675, + "step": 31320 + }, + { + "epoch": 1.3, + "grad_norm": 0.451171875, + "learning_rate": 0.0004977092150956656, + "loss": 0.2104, + "step": 31330 + }, + { + "epoch": 1.3, + "grad_norm": 0.54296875, + "learning_rate": 0.0004977077500659073, + "loss": 0.253, + "step": 31340 + }, + { + "epoch": 1.3, + "grad_norm": 0.47265625, + "learning_rate": 0.0004977062845699896, + "loss": 0.2455, + "step": 31350 + }, + { + "epoch": 1.3, + "grad_norm": 0.5546875, + "learning_rate": 0.0004977048186079155, + "loss": 0.2399, + "step": 31360 + }, + { + "epoch": 1.3, + "grad_norm": 0.35546875, + "learning_rate": 0.0004977033521796877, + "loss": 0.2204, + "step": 31370 + }, + { + "epoch": 1.3, + "grad_norm": 1.7890625, + "learning_rate": 0.000497701885285309, + "loss": 0.229, + "step": 31380 + }, + { + "epoch": 1.3, + "grad_norm": 4.6875, + "learning_rate": 0.0004977004179247819, + "loss": 0.2091, + "step": 31390 + }, + { + "epoch": 1.3, + "grad_norm": 0.26953125, + "learning_rate": 0.0004976989500981095, + "loss": 0.2801, + "step": 31400 + }, + { + "epoch": 1.3, + "grad_norm": 1.5078125, + "learning_rate": 0.0004976974818052944, + "loss": 0.191, + "step": 31410 + }, + { + "epoch": 1.3, + "grad_norm": 0.98046875, + "learning_rate": 0.0004976960130463395, + "loss": 0.2488, + "step": 31420 + }, + { + "epoch": 1.3, + "grad_norm": 0.56640625, + "learning_rate": 0.0004976945438212473, + "loss": 0.2148, + "step": 31430 + }, + { + "epoch": 1.3, + "grad_norm": 0.30859375, + "learning_rate": 0.0004976930741300207, + "loss": 0.2022, + "step": 31440 + }, + { + "epoch": 1.3, + "grad_norm": 0.419921875, + "learning_rate": 0.0004976916039726627, + "loss": 0.2821, + "step": 31450 + }, + { + "epoch": 1.3, + "grad_norm": 0.90625, + "learning_rate": 0.0004976901333491755, + "loss": 0.253, + "step": 31460 + }, + { + "epoch": 1.3, + "grad_norm": 0.859375, + "learning_rate": 0.0004976886622595625, + "loss": 0.2097, + "step": 31470 + }, + { + "epoch": 1.3, + "grad_norm": 0.71484375, + "learning_rate": 0.0004976871907038261, + "loss": 0.2773, + "step": 31480 + }, + { + "epoch": 1.3, + "grad_norm": 1.03125, + "learning_rate": 0.0004976857186819692, + "loss": 0.2952, + "step": 31490 + }, + { + "epoch": 1.3, + "grad_norm": 0.61328125, + "learning_rate": 0.0004976842461939944, + "loss": 0.2015, + "step": 31500 + }, + { + "epoch": 1.31, + "grad_norm": 0.392578125, + "learning_rate": 0.0004976827732399048, + "loss": 0.2325, + "step": 31510 + }, + { + "epoch": 1.31, + "grad_norm": 0.890625, + "learning_rate": 0.0004976812998197027, + "loss": 0.2291, + "step": 31520 + }, + { + "epoch": 1.31, + "grad_norm": 1.1953125, + "learning_rate": 0.0004976798259333913, + "loss": 0.2584, + "step": 31530 + }, + { + "epoch": 1.31, + "grad_norm": 0.64453125, + "learning_rate": 0.0004976783515809733, + "loss": 0.1883, + "step": 31540 + }, + { + "epoch": 1.31, + "grad_norm": 0.9453125, + "learning_rate": 0.0004976768767624513, + "loss": 0.2004, + "step": 31550 + }, + { + "epoch": 1.31, + "grad_norm": 1.25, + "learning_rate": 0.0004976754014778281, + "loss": 0.319, + "step": 31560 + }, + { + "epoch": 1.31, + "grad_norm": 1.53125, + "learning_rate": 0.0004976739257271066, + "loss": 0.2449, + "step": 31570 + }, + { + "epoch": 1.31, + "grad_norm": 0.578125, + "learning_rate": 0.0004976724495102896, + "loss": 0.2172, + "step": 31580 + }, + { + "epoch": 1.31, + "grad_norm": 0.3203125, + "learning_rate": 0.0004976709728273797, + "loss": 0.222, + "step": 31590 + }, + { + "epoch": 1.31, + "grad_norm": 0.47265625, + "learning_rate": 0.0004976694956783798, + "loss": 0.1728, + "step": 31600 + }, + { + "epoch": 1.31, + "grad_norm": 0.84375, + "learning_rate": 0.0004976680180632927, + "loss": 0.215, + "step": 31610 + }, + { + "epoch": 1.31, + "grad_norm": 0.5703125, + "learning_rate": 0.000497666539982121, + "loss": 0.1708, + "step": 31620 + }, + { + "epoch": 1.31, + "grad_norm": 1.84375, + "learning_rate": 0.0004976650614348677, + "loss": 0.1994, + "step": 31630 + }, + { + "epoch": 1.31, + "grad_norm": 0.435546875, + "learning_rate": 0.0004976635824215356, + "loss": 0.2158, + "step": 31640 + }, + { + "epoch": 1.31, + "grad_norm": 0.47265625, + "learning_rate": 0.0004976621029421272, + "loss": 0.2836, + "step": 31650 + }, + { + "epoch": 1.31, + "grad_norm": 0.5859375, + "learning_rate": 0.0004976606229966455, + "loss": 0.2073, + "step": 31660 + }, + { + "epoch": 1.31, + "grad_norm": 1.234375, + "learning_rate": 0.0004976591425850933, + "loss": 0.2665, + "step": 31670 + }, + { + "epoch": 1.31, + "grad_norm": 0.5625, + "learning_rate": 0.0004976576617074734, + "loss": 0.2502, + "step": 31680 + }, + { + "epoch": 1.31, + "grad_norm": 0.52734375, + "learning_rate": 0.0004976561803637884, + "loss": 0.2074, + "step": 31690 + }, + { + "epoch": 1.31, + "grad_norm": 1.1171875, + "learning_rate": 0.0004976546985540414, + "loss": 0.2976, + "step": 31700 + }, + { + "epoch": 1.31, + "grad_norm": 1.125, + "learning_rate": 0.0004976532162782348, + "loss": 0.1696, + "step": 31710 + }, + { + "epoch": 1.31, + "grad_norm": 0.625, + "learning_rate": 0.0004976517335363716, + "loss": 0.3082, + "step": 31720 + }, + { + "epoch": 1.31, + "grad_norm": 0.35546875, + "learning_rate": 0.0004976502503284547, + "loss": 0.2133, + "step": 31730 + }, + { + "epoch": 1.31, + "grad_norm": 0.380859375, + "learning_rate": 0.0004976487666544868, + "loss": 0.2313, + "step": 31740 + }, + { + "epoch": 1.32, + "grad_norm": 0.6640625, + "learning_rate": 0.0004976472825144705, + "loss": 0.2453, + "step": 31750 + }, + { + "epoch": 1.32, + "grad_norm": 0.9453125, + "learning_rate": 0.000497645797908409, + "loss": 0.1966, + "step": 31760 + }, + { + "epoch": 1.32, + "grad_norm": 1.8359375, + "learning_rate": 0.0004976443128363046, + "loss": 0.2073, + "step": 31770 + }, + { + "epoch": 1.32, + "grad_norm": 0.474609375, + "learning_rate": 0.0004976428272981605, + "loss": 0.2297, + "step": 31780 + }, + { + "epoch": 1.32, + "grad_norm": 0.515625, + "learning_rate": 0.0004976413412939792, + "loss": 0.1888, + "step": 31790 + }, + { + "epoch": 1.32, + "grad_norm": 1.015625, + "learning_rate": 0.0004976398548237638, + "loss": 0.2413, + "step": 31800 + }, + { + "epoch": 1.32, + "grad_norm": 0.54296875, + "learning_rate": 0.0004976383678875168, + "loss": 0.2598, + "step": 31810 + }, + { + "epoch": 1.32, + "grad_norm": 0.875, + "learning_rate": 0.0004976368804852412, + "loss": 0.3119, + "step": 31820 + }, + { + "epoch": 1.32, + "grad_norm": 0.51171875, + "learning_rate": 0.0004976353926169398, + "loss": 0.2339, + "step": 31830 + }, + { + "epoch": 1.32, + "grad_norm": 0.546875, + "learning_rate": 0.0004976339042826152, + "loss": 0.2267, + "step": 31840 + }, + { + "epoch": 1.32, + "grad_norm": 0.79296875, + "learning_rate": 0.0004976324154822704, + "loss": 0.2861, + "step": 31850 + }, + { + "epoch": 1.32, + "grad_norm": 1.34375, + "learning_rate": 0.0004976309262159081, + "loss": 0.205, + "step": 31860 + }, + { + "epoch": 1.32, + "grad_norm": 0.6171875, + "learning_rate": 0.0004976294364835313, + "loss": 0.2636, + "step": 31870 + }, + { + "epoch": 1.32, + "grad_norm": 0.6328125, + "learning_rate": 0.0004976279462851424, + "loss": 0.2281, + "step": 31880 + }, + { + "epoch": 1.32, + "grad_norm": 0.326171875, + "learning_rate": 0.0004976264556207446, + "loss": 0.2523, + "step": 31890 + }, + { + "epoch": 1.32, + "grad_norm": 0.77734375, + "learning_rate": 0.0004976249644903405, + "loss": 0.2484, + "step": 31900 + }, + { + "epoch": 1.32, + "grad_norm": 0.55078125, + "learning_rate": 0.000497623472893933, + "loss": 0.2287, + "step": 31910 + }, + { + "epoch": 1.32, + "grad_norm": 1.59375, + "learning_rate": 0.0004976219808315248, + "loss": 0.2188, + "step": 31920 + }, + { + "epoch": 1.32, + "grad_norm": 0.359375, + "learning_rate": 0.0004976204883031188, + "loss": 0.2556, + "step": 31930 + }, + { + "epoch": 1.32, + "grad_norm": 0.376953125, + "learning_rate": 0.0004976189953087178, + "loss": 0.2056, + "step": 31940 + }, + { + "epoch": 1.32, + "grad_norm": 1.0390625, + "learning_rate": 0.0004976175018483245, + "loss": 0.2676, + "step": 31950 + }, + { + "epoch": 1.32, + "grad_norm": 0.400390625, + "learning_rate": 0.000497616007921942, + "loss": 0.2145, + "step": 31960 + }, + { + "epoch": 1.32, + "grad_norm": 0.859375, + "learning_rate": 0.0004976145135295728, + "loss": 0.245, + "step": 31970 + }, + { + "epoch": 1.32, + "grad_norm": 0.5234375, + "learning_rate": 0.0004976130186712198, + "loss": 0.2102, + "step": 31980 + }, + { + "epoch": 1.33, + "grad_norm": 0.9921875, + "learning_rate": 0.0004976115233468858, + "loss": 0.1795, + "step": 31990 + }, + { + "epoch": 1.33, + "grad_norm": 0.54296875, + "learning_rate": 0.0004976100275565738, + "loss": 0.208, + "step": 32000 + }, + { + "epoch": 1.33, + "grad_norm": 0.380859375, + "learning_rate": 0.0004976085313002864, + "loss": 0.2244, + "step": 32010 + }, + { + "epoch": 1.33, + "grad_norm": 0.443359375, + "learning_rate": 0.0004976070345780264, + "loss": 0.262, + "step": 32020 + }, + { + "epoch": 1.33, + "grad_norm": 0.6875, + "learning_rate": 0.0004976055373897968, + "loss": 0.1825, + "step": 32030 + }, + { + "epoch": 1.33, + "grad_norm": 4.59375, + "learning_rate": 0.0004976040397356004, + "loss": 0.3612, + "step": 32040 + }, + { + "epoch": 1.33, + "grad_norm": 0.5546875, + "learning_rate": 0.0004976025416154398, + "loss": 0.24, + "step": 32050 + }, + { + "epoch": 1.33, + "grad_norm": 0.220703125, + "learning_rate": 0.0004976010430293178, + "loss": 0.2125, + "step": 32060 + }, + { + "epoch": 1.33, + "grad_norm": 0.58203125, + "learning_rate": 0.0004975995439772376, + "loss": 0.2179, + "step": 32070 + }, + { + "epoch": 1.33, + "grad_norm": 0.427734375, + "learning_rate": 0.0004975980444592018, + "loss": 0.2522, + "step": 32080 + }, + { + "epoch": 1.33, + "grad_norm": 1.0234375, + "learning_rate": 0.0004975965444752132, + "loss": 0.2569, + "step": 32090 + }, + { + "epoch": 1.33, + "grad_norm": 0.494140625, + "learning_rate": 0.0004975950440252745, + "loss": 0.2787, + "step": 32100 + }, + { + "epoch": 1.33, + "grad_norm": 0.83203125, + "learning_rate": 0.0004975935431093888, + "loss": 0.1881, + "step": 32110 + }, + { + "epoch": 1.33, + "grad_norm": 0.453125, + "learning_rate": 0.0004975920417275587, + "loss": 0.2643, + "step": 32120 + }, + { + "epoch": 1.33, + "grad_norm": 0.5078125, + "learning_rate": 0.0004975905398797873, + "loss": 0.2513, + "step": 32130 + }, + { + "epoch": 1.33, + "grad_norm": 1.828125, + "learning_rate": 0.0004975890375660771, + "loss": 0.2441, + "step": 32140 + }, + { + "epoch": 1.33, + "grad_norm": 0.443359375, + "learning_rate": 0.000497587534786431, + "loss": 0.2425, + "step": 32150 + }, + { + "epoch": 1.33, + "grad_norm": 0.5546875, + "learning_rate": 0.000497586031540852, + "loss": 0.2449, + "step": 32160 + }, + { + "epoch": 1.33, + "grad_norm": 0.6171875, + "learning_rate": 0.0004975845278293429, + "loss": 0.2251, + "step": 32170 + }, + { + "epoch": 1.33, + "grad_norm": 0.52734375, + "learning_rate": 0.0004975830236519063, + "loss": 0.2257, + "step": 32180 + }, + { + "epoch": 1.33, + "grad_norm": 0.80859375, + "learning_rate": 0.0004975815190085453, + "loss": 0.298, + "step": 32190 + }, + { + "epoch": 1.33, + "grad_norm": 0.890625, + "learning_rate": 0.0004975800138992626, + "loss": 0.2576, + "step": 32200 + }, + { + "epoch": 1.33, + "grad_norm": 0.2734375, + "learning_rate": 0.0004975785083240609, + "loss": 0.178, + "step": 32210 + }, + { + "epoch": 1.33, + "grad_norm": 0.91796875, + "learning_rate": 0.0004975770022829434, + "loss": 0.2709, + "step": 32220 + }, + { + "epoch": 1.33, + "grad_norm": 1.1875, + "learning_rate": 0.0004975754957759126, + "loss": 0.2237, + "step": 32230 + }, + { + "epoch": 1.34, + "grad_norm": 1.8203125, + "learning_rate": 0.0004975739888029714, + "loss": 0.2505, + "step": 32240 + }, + { + "epoch": 1.34, + "grad_norm": 0.8359375, + "learning_rate": 0.0004975724813641228, + "loss": 0.2173, + "step": 32250 + }, + { + "epoch": 1.34, + "grad_norm": 0.55078125, + "learning_rate": 0.0004975709734593696, + "loss": 0.2095, + "step": 32260 + }, + { + "epoch": 1.34, + "grad_norm": 0.5859375, + "learning_rate": 0.0004975694650887144, + "loss": 0.237, + "step": 32270 + }, + { + "epoch": 1.34, + "grad_norm": 0.447265625, + "learning_rate": 0.0004975679562521602, + "loss": 0.2681, + "step": 32280 + }, + { + "epoch": 1.34, + "grad_norm": 0.875, + "learning_rate": 0.00049756644694971, + "loss": 0.2306, + "step": 32290 + }, + { + "epoch": 1.34, + "grad_norm": 0.78125, + "learning_rate": 0.0004975649371813663, + "loss": 0.2532, + "step": 32300 + }, + { + "epoch": 1.34, + "grad_norm": 0.79296875, + "learning_rate": 0.0004975634269471322, + "loss": 0.2358, + "step": 32310 + }, + { + "epoch": 1.34, + "grad_norm": 0.62890625, + "learning_rate": 0.0004975619162470104, + "loss": 0.2668, + "step": 32320 + }, + { + "epoch": 1.34, + "grad_norm": 0.400390625, + "learning_rate": 0.000497560405081004, + "loss": 0.2721, + "step": 32330 + }, + { + "epoch": 1.34, + "grad_norm": 0.326171875, + "learning_rate": 0.0004975588934491154, + "loss": 0.1985, + "step": 32340 + }, + { + "epoch": 1.34, + "grad_norm": 1.3984375, + "learning_rate": 0.0004975573813513479, + "loss": 0.2611, + "step": 32350 + }, + { + "epoch": 1.34, + "grad_norm": 0.67578125, + "learning_rate": 0.000497555868787704, + "loss": 0.2644, + "step": 32360 + }, + { + "epoch": 1.34, + "grad_norm": 0.74609375, + "learning_rate": 0.0004975543557581868, + "loss": 0.2361, + "step": 32370 + }, + { + "epoch": 1.34, + "grad_norm": 0.455078125, + "learning_rate": 0.000497552842262799, + "loss": 0.213, + "step": 32380 + }, + { + "epoch": 1.34, + "grad_norm": 0.46484375, + "learning_rate": 0.0004975513283015434, + "loss": 0.29, + "step": 32390 + }, + { + "epoch": 1.34, + "grad_norm": 0.357421875, + "learning_rate": 0.0004975498138744231, + "loss": 0.1909, + "step": 32400 + }, + { + "epoch": 1.34, + "grad_norm": 0.283203125, + "learning_rate": 0.0004975482989814406, + "loss": 0.2191, + "step": 32410 + }, + { + "epoch": 1.34, + "grad_norm": 0.69140625, + "learning_rate": 0.0004975467836225991, + "loss": 0.2252, + "step": 32420 + }, + { + "epoch": 1.34, + "grad_norm": 0.890625, + "learning_rate": 0.0004975452677979012, + "loss": 0.2582, + "step": 32430 + }, + { + "epoch": 1.34, + "grad_norm": 1.2890625, + "learning_rate": 0.0004975437515073499, + "loss": 0.2052, + "step": 32440 + }, + { + "epoch": 1.34, + "grad_norm": 0.255859375, + "learning_rate": 0.0004975422347509479, + "loss": 0.2016, + "step": 32450 + }, + { + "epoch": 1.34, + "grad_norm": 0.2197265625, + "learning_rate": 0.0004975407175286981, + "loss": 0.1805, + "step": 32460 + }, + { + "epoch": 1.34, + "grad_norm": 0.5546875, + "learning_rate": 0.0004975391998406036, + "loss": 0.207, + "step": 32470 + }, + { + "epoch": 1.35, + "grad_norm": 0.890625, + "learning_rate": 0.0004975376816866669, + "loss": 0.2258, + "step": 32480 + }, + { + "epoch": 1.35, + "grad_norm": 0.50390625, + "learning_rate": 0.0004975361630668911, + "loss": 0.2221, + "step": 32490 + }, + { + "epoch": 1.35, + "grad_norm": 0.6875, + "learning_rate": 0.0004975346439812789, + "loss": 0.2446, + "step": 32500 + }, + { + "epoch": 1.35, + "grad_norm": 0.369140625, + "learning_rate": 0.0004975331244298333, + "loss": 0.248, + "step": 32510 + }, + { + "epoch": 1.35, + "grad_norm": 0.419921875, + "learning_rate": 0.0004975316044125571, + "loss": 0.1889, + "step": 32520 + }, + { + "epoch": 1.35, + "grad_norm": 0.9140625, + "learning_rate": 0.000497530083929453, + "loss": 0.2818, + "step": 32530 + }, + { + "epoch": 1.35, + "grad_norm": 0.7421875, + "learning_rate": 0.0004975285629805241, + "loss": 0.2763, + "step": 32540 + }, + { + "epoch": 1.35, + "grad_norm": 0.48046875, + "learning_rate": 0.0004975270415657732, + "loss": 0.2207, + "step": 32550 + }, + { + "epoch": 1.35, + "grad_norm": 0.76171875, + "learning_rate": 0.0004975255196852031, + "loss": 0.2306, + "step": 32560 + }, + { + "epoch": 1.35, + "grad_norm": 0.365234375, + "learning_rate": 0.0004975239973388168, + "loss": 0.1824, + "step": 32570 + }, + { + "epoch": 1.35, + "grad_norm": 0.52734375, + "learning_rate": 0.000497522474526617, + "loss": 0.2707, + "step": 32580 + }, + { + "epoch": 1.35, + "grad_norm": 0.3984375, + "learning_rate": 0.0004975209512486067, + "loss": 0.2436, + "step": 32590 + }, + { + "epoch": 1.35, + "grad_norm": 0.65234375, + "learning_rate": 0.0004975194275047886, + "loss": 0.2286, + "step": 32600 + }, + { + "epoch": 1.35, + "grad_norm": 0.419921875, + "learning_rate": 0.0004975179032951657, + "loss": 0.2406, + "step": 32610 + }, + { + "epoch": 1.35, + "grad_norm": 0.6640625, + "learning_rate": 0.0004975163786197409, + "loss": 0.1771, + "step": 32620 + }, + { + "epoch": 1.35, + "grad_norm": 0.2177734375, + "learning_rate": 0.0004975148534785169, + "loss": 0.2174, + "step": 32630 + }, + { + "epoch": 1.35, + "grad_norm": 0.6484375, + "learning_rate": 0.0004975133278714967, + "loss": 0.2257, + "step": 32640 + }, + { + "epoch": 1.35, + "grad_norm": 1.0625, + "learning_rate": 0.0004975118017986832, + "loss": 0.266, + "step": 32650 + }, + { + "epoch": 1.35, + "grad_norm": 0.515625, + "learning_rate": 0.0004975102752600791, + "loss": 0.2381, + "step": 32660 + }, + { + "epoch": 1.35, + "grad_norm": 0.27734375, + "learning_rate": 0.0004975087482556875, + "loss": 0.2707, + "step": 32670 + }, + { + "epoch": 1.35, + "grad_norm": 0.58984375, + "learning_rate": 0.0004975072207855112, + "loss": 0.242, + "step": 32680 + }, + { + "epoch": 1.35, + "grad_norm": 0.29296875, + "learning_rate": 0.000497505692849553, + "loss": 0.2092, + "step": 32690 + }, + { + "epoch": 1.35, + "grad_norm": 0.279296875, + "learning_rate": 0.0004975041644478157, + "loss": 0.2647, + "step": 32700 + }, + { + "epoch": 1.35, + "grad_norm": 0.6171875, + "learning_rate": 0.0004975026355803024, + "loss": 0.2037, + "step": 32710 + }, + { + "epoch": 1.36, + "grad_norm": 0.302734375, + "learning_rate": 0.0004975011062470159, + "loss": 0.2289, + "step": 32720 + }, + { + "epoch": 1.36, + "grad_norm": 0.53515625, + "learning_rate": 0.0004974995764479589, + "loss": 0.1803, + "step": 32730 + }, + { + "epoch": 1.36, + "grad_norm": 0.56640625, + "learning_rate": 0.0004974980461831345, + "loss": 0.1966, + "step": 32740 + }, + { + "epoch": 1.36, + "grad_norm": 1.03125, + "learning_rate": 0.0004974965154525456, + "loss": 0.26, + "step": 32750 + }, + { + "epoch": 1.36, + "grad_norm": 0.53125, + "learning_rate": 0.0004974949842561949, + "loss": 0.2281, + "step": 32760 + }, + { + "epoch": 1.36, + "grad_norm": 0.703125, + "learning_rate": 0.0004974934525940854, + "loss": 0.249, + "step": 32770 + }, + { + "epoch": 1.36, + "grad_norm": 0.75, + "learning_rate": 0.0004974919204662199, + "loss": 0.2527, + "step": 32780 + }, + { + "epoch": 1.36, + "grad_norm": 0.3125, + "learning_rate": 0.0004974903878726014, + "loss": 0.2104, + "step": 32790 + }, + { + "epoch": 1.36, + "grad_norm": 0.7578125, + "learning_rate": 0.0004974888548132326, + "loss": 0.2529, + "step": 32800 + }, + { + "epoch": 1.36, + "grad_norm": 0.97265625, + "learning_rate": 0.0004974873212881166, + "loss": 0.2916, + "step": 32810 + }, + { + "epoch": 1.36, + "grad_norm": 0.455078125, + "learning_rate": 0.0004974857872972562, + "loss": 0.266, + "step": 32820 + }, + { + "epoch": 1.36, + "grad_norm": 0.96484375, + "learning_rate": 0.0004974842528406542, + "loss": 0.2606, + "step": 32830 + }, + { + "epoch": 1.36, + "grad_norm": 0.49609375, + "learning_rate": 0.0004974827179183136, + "loss": 0.1918, + "step": 32840 + }, + { + "epoch": 1.36, + "grad_norm": 1.328125, + "learning_rate": 0.0004974811825302373, + "loss": 0.2369, + "step": 32850 + }, + { + "epoch": 1.36, + "grad_norm": 0.703125, + "learning_rate": 0.0004974796466764281, + "loss": 0.2363, + "step": 32860 + }, + { + "epoch": 1.36, + "grad_norm": 0.71484375, + "learning_rate": 0.0004974781103568889, + "loss": 0.2438, + "step": 32870 + }, + { + "epoch": 1.36, + "grad_norm": 0.58984375, + "learning_rate": 0.0004974765735716227, + "loss": 0.1976, + "step": 32880 + }, + { + "epoch": 1.36, + "grad_norm": 1.0078125, + "learning_rate": 0.0004974750363206322, + "loss": 0.2125, + "step": 32890 + }, + { + "epoch": 1.36, + "grad_norm": 0.462890625, + "learning_rate": 0.0004974734986039204, + "loss": 0.2248, + "step": 32900 + }, + { + "epoch": 1.36, + "grad_norm": 0.71484375, + "learning_rate": 0.0004974719604214904, + "loss": 0.2357, + "step": 32910 + }, + { + "epoch": 1.36, + "grad_norm": 0.77734375, + "learning_rate": 0.0004974704217733446, + "loss": 0.2727, + "step": 32920 + }, + { + "epoch": 1.36, + "grad_norm": 0.78125, + "learning_rate": 0.0004974688826594865, + "loss": 0.2314, + "step": 32930 + }, + { + "epoch": 1.36, + "grad_norm": 0.2314453125, + "learning_rate": 0.0004974673430799185, + "loss": 0.2381, + "step": 32940 + }, + { + "epoch": 1.36, + "grad_norm": 0.46875, + "learning_rate": 0.0004974658030346438, + "loss": 0.2457, + "step": 32950 + }, + { + "epoch": 1.37, + "grad_norm": 0.80078125, + "learning_rate": 0.000497464262523665, + "loss": 0.2398, + "step": 32960 + }, + { + "epoch": 1.37, + "grad_norm": 0.65625, + "learning_rate": 0.0004974627215469853, + "loss": 0.1927, + "step": 32970 + }, + { + "epoch": 1.37, + "grad_norm": 0.43359375, + "learning_rate": 0.0004974611801046075, + "loss": 0.2194, + "step": 32980 + }, + { + "epoch": 1.37, + "grad_norm": 0.91015625, + "learning_rate": 0.0004974596381965344, + "loss": 0.267, + "step": 32990 + }, + { + "epoch": 1.37, + "grad_norm": 0.765625, + "learning_rate": 0.000497458095822769, + "loss": 0.2736, + "step": 33000 + }, + { + "epoch": 1.37, + "grad_norm": 0.5078125, + "learning_rate": 0.0004974565529833143, + "loss": 0.2545, + "step": 33010 + }, + { + "epoch": 1.37, + "grad_norm": 0.51953125, + "learning_rate": 0.0004974550096781729, + "loss": 0.1434, + "step": 33020 + }, + { + "epoch": 1.37, + "grad_norm": 0.384765625, + "learning_rate": 0.0004974534659073481, + "loss": 0.254, + "step": 33030 + }, + { + "epoch": 1.37, + "grad_norm": 1.0390625, + "learning_rate": 0.0004974519216708425, + "loss": 0.2121, + "step": 33040 + }, + { + "epoch": 1.37, + "grad_norm": 0.34765625, + "learning_rate": 0.0004974503769686592, + "loss": 0.2612, + "step": 33050 + }, + { + "epoch": 1.37, + "grad_norm": 0.75390625, + "learning_rate": 0.0004974488318008008, + "loss": 0.2154, + "step": 33060 + }, + { + "epoch": 1.37, + "grad_norm": 0.76171875, + "learning_rate": 0.0004974472861672706, + "loss": 0.2612, + "step": 33070 + }, + { + "epoch": 1.37, + "grad_norm": 0.59765625, + "learning_rate": 0.0004974457400680713, + "loss": 0.2274, + "step": 33080 + }, + { + "epoch": 1.37, + "grad_norm": 0.404296875, + "learning_rate": 0.0004974441935032059, + "loss": 0.2118, + "step": 33090 + }, + { + "epoch": 1.37, + "grad_norm": 0.53125, + "learning_rate": 0.000497442646472677, + "loss": 0.2302, + "step": 33100 + }, + { + "epoch": 1.37, + "grad_norm": 0.310546875, + "learning_rate": 0.000497441098976488, + "loss": 0.1775, + "step": 33110 + }, + { + "epoch": 1.37, + "grad_norm": 0.74609375, + "learning_rate": 0.0004974395510146415, + "loss": 0.2501, + "step": 33120 + }, + { + "epoch": 1.37, + "grad_norm": 0.875, + "learning_rate": 0.0004974380025871405, + "loss": 0.2409, + "step": 33130 + }, + { + "epoch": 1.37, + "grad_norm": 0.52734375, + "learning_rate": 0.0004974364536939878, + "loss": 0.2412, + "step": 33140 + }, + { + "epoch": 1.37, + "grad_norm": 0.65625, + "learning_rate": 0.0004974349043351865, + "loss": 0.2513, + "step": 33150 + }, + { + "epoch": 1.37, + "grad_norm": 0.74609375, + "learning_rate": 0.0004974333545107395, + "loss": 0.2502, + "step": 33160 + }, + { + "epoch": 1.37, + "grad_norm": 0.88671875, + "learning_rate": 0.0004974318042206495, + "loss": 0.1957, + "step": 33170 + }, + { + "epoch": 1.37, + "grad_norm": 1.171875, + "learning_rate": 0.0004974302534649196, + "loss": 0.2504, + "step": 33180 + }, + { + "epoch": 1.37, + "grad_norm": 0.609375, + "learning_rate": 0.0004974287022435528, + "loss": 0.2253, + "step": 33190 + }, + { + "epoch": 1.38, + "grad_norm": 0.7421875, + "learning_rate": 0.0004974271505565519, + "loss": 0.2018, + "step": 33200 + }, + { + "epoch": 1.38, + "grad_norm": 1.9453125, + "learning_rate": 0.0004974255984039196, + "loss": 0.1939, + "step": 33210 + }, + { + "epoch": 1.38, + "grad_norm": 0.9609375, + "learning_rate": 0.0004974240457856592, + "loss": 0.2707, + "step": 33220 + }, + { + "epoch": 1.38, + "grad_norm": 0.78515625, + "learning_rate": 0.0004974224927017736, + "loss": 0.2298, + "step": 33230 + }, + { + "epoch": 1.38, + "grad_norm": 0.3984375, + "learning_rate": 0.0004974209391522653, + "loss": 0.2173, + "step": 33240 + }, + { + "epoch": 1.38, + "grad_norm": 0.140625, + "learning_rate": 0.0004974193851371377, + "loss": 0.3204, + "step": 33250 + }, + { + "epoch": 1.38, + "grad_norm": 0.76953125, + "learning_rate": 0.0004974178306563936, + "loss": 0.2167, + "step": 33260 + }, + { + "epoch": 1.38, + "grad_norm": 0.53125, + "learning_rate": 0.0004974162757100356, + "loss": 0.224, + "step": 33270 + }, + { + "epoch": 1.38, + "grad_norm": 0.66015625, + "learning_rate": 0.0004974147202980671, + "loss": 0.2371, + "step": 33280 + }, + { + "epoch": 1.38, + "grad_norm": 0.69921875, + "learning_rate": 0.0004974131644204908, + "loss": 0.1639, + "step": 33290 + }, + { + "epoch": 1.38, + "grad_norm": 1.2578125, + "learning_rate": 0.0004974116080773095, + "loss": 0.1847, + "step": 33300 + }, + { + "epoch": 1.38, + "grad_norm": 0.400390625, + "learning_rate": 0.0004974100512685264, + "loss": 0.2106, + "step": 33310 + }, + { + "epoch": 1.38, + "grad_norm": 0.79296875, + "learning_rate": 0.0004974084939941444, + "loss": 0.2672, + "step": 33320 + }, + { + "epoch": 1.38, + "grad_norm": 0.4921875, + "learning_rate": 0.0004974069362541661, + "loss": 0.1985, + "step": 33330 + }, + { + "epoch": 1.38, + "grad_norm": 1.1015625, + "learning_rate": 0.0004974053780485948, + "loss": 0.2302, + "step": 33340 + }, + { + "epoch": 1.38, + "grad_norm": 0.8671875, + "learning_rate": 0.0004974038193774333, + "loss": 0.3218, + "step": 33350 + }, + { + "epoch": 1.38, + "grad_norm": 0.703125, + "learning_rate": 0.0004974022602406844, + "loss": 0.2386, + "step": 33360 + }, + { + "epoch": 1.38, + "grad_norm": 0.81640625, + "learning_rate": 0.0004974007006383513, + "loss": 0.2612, + "step": 33370 + }, + { + "epoch": 1.38, + "grad_norm": 0.6875, + "learning_rate": 0.0004973991405704367, + "loss": 0.2085, + "step": 33380 + }, + { + "epoch": 1.38, + "grad_norm": 0.3671875, + "learning_rate": 0.0004973975800369438, + "loss": 0.2308, + "step": 33390 + }, + { + "epoch": 1.38, + "grad_norm": 0.76953125, + "learning_rate": 0.0004973960190378752, + "loss": 0.2609, + "step": 33400 + }, + { + "epoch": 1.38, + "grad_norm": 0.66015625, + "learning_rate": 0.0004973944575732341, + "loss": 0.215, + "step": 33410 + }, + { + "epoch": 1.38, + "grad_norm": 0.546875, + "learning_rate": 0.0004973928956430233, + "loss": 0.2207, + "step": 33420 + }, + { + "epoch": 1.38, + "grad_norm": 0.515625, + "learning_rate": 0.0004973913332472458, + "loss": 0.2002, + "step": 33430 + }, + { + "epoch": 1.39, + "grad_norm": 0.443359375, + "learning_rate": 0.0004973897703859046, + "loss": 0.2622, + "step": 33440 + }, + { + "epoch": 1.39, + "grad_norm": 0.40625, + "learning_rate": 0.0004973882070590025, + "loss": 0.1508, + "step": 33450 + }, + { + "epoch": 1.39, + "grad_norm": 0.90234375, + "learning_rate": 0.0004973866432665425, + "loss": 0.2182, + "step": 33460 + }, + { + "epoch": 1.39, + "grad_norm": 0.97265625, + "learning_rate": 0.0004973850790085276, + "loss": 0.2433, + "step": 33470 + }, + { + "epoch": 1.39, + "grad_norm": 0.28125, + "learning_rate": 0.0004973835142849607, + "loss": 0.2446, + "step": 33480 + }, + { + "epoch": 1.39, + "grad_norm": 0.73828125, + "learning_rate": 0.0004973819490958446, + "loss": 0.2571, + "step": 33490 + }, + { + "epoch": 1.39, + "grad_norm": 0.8515625, + "learning_rate": 0.0004973803834411825, + "loss": 0.266, + "step": 33500 + }, + { + "epoch": 1.39, + "grad_norm": 0.94140625, + "learning_rate": 0.0004973788173209772, + "loss": 0.2425, + "step": 33510 + }, + { + "epoch": 1.39, + "grad_norm": 0.86328125, + "learning_rate": 0.0004973772507352317, + "loss": 0.2241, + "step": 33520 + }, + { + "epoch": 1.39, + "grad_norm": 0.7421875, + "learning_rate": 0.000497375683683949, + "loss": 0.1952, + "step": 33530 + }, + { + "epoch": 1.39, + "grad_norm": 0.1982421875, + "learning_rate": 0.0004973741161671319, + "loss": 0.254, + "step": 33540 + }, + { + "epoch": 1.39, + "grad_norm": 0.765625, + "learning_rate": 0.0004973725481847834, + "loss": 0.2539, + "step": 33550 + }, + { + "epoch": 1.39, + "grad_norm": 0.953125, + "learning_rate": 0.0004973709797369065, + "loss": 0.2909, + "step": 33560 + }, + { + "epoch": 1.39, + "grad_norm": 0.58203125, + "learning_rate": 0.0004973694108235041, + "loss": 0.273, + "step": 33570 + }, + { + "epoch": 1.39, + "grad_norm": 1.3671875, + "learning_rate": 0.0004973678414445793, + "loss": 0.2238, + "step": 33580 + }, + { + "epoch": 1.39, + "grad_norm": 0.439453125, + "learning_rate": 0.0004973662716001349, + "loss": 0.2471, + "step": 33590 + }, + { + "epoch": 1.39, + "grad_norm": 2.5625, + "learning_rate": 0.0004973647012901739, + "loss": 0.2268, + "step": 33600 + }, + { + "epoch": 1.39, + "grad_norm": 1.625, + "learning_rate": 0.0004973631305146991, + "loss": 0.218, + "step": 33610 + }, + { + "epoch": 1.39, + "grad_norm": 0.6640625, + "learning_rate": 0.0004973615592737137, + "loss": 0.2226, + "step": 33620 + }, + { + "epoch": 1.39, + "grad_norm": 0.462890625, + "learning_rate": 0.0004973599875672206, + "loss": 0.2764, + "step": 33630 + }, + { + "epoch": 1.39, + "grad_norm": 0.58203125, + "learning_rate": 0.0004973584153952226, + "loss": 0.1981, + "step": 33640 + }, + { + "epoch": 1.39, + "grad_norm": 0.50390625, + "learning_rate": 0.0004973568427577229, + "loss": 0.198, + "step": 33650 + }, + { + "epoch": 1.39, + "grad_norm": 0.51953125, + "learning_rate": 0.0004973552696547242, + "loss": 0.1861, + "step": 33660 + }, + { + "epoch": 1.39, + "grad_norm": 0.65625, + "learning_rate": 0.0004973536960862297, + "loss": 0.2692, + "step": 33670 + }, + { + "epoch": 1.4, + "grad_norm": 0.78515625, + "learning_rate": 0.0004973521220522422, + "loss": 0.2121, + "step": 33680 + }, + { + "epoch": 1.4, + "grad_norm": 0.953125, + "learning_rate": 0.0004973505475527648, + "loss": 0.2738, + "step": 33690 + }, + { + "epoch": 1.4, + "grad_norm": 0.79296875, + "learning_rate": 0.0004973489725878003, + "loss": 0.2514, + "step": 33700 + }, + { + "epoch": 1.4, + "grad_norm": 0.93359375, + "learning_rate": 0.0004973473971573518, + "loss": 0.2225, + "step": 33710 + }, + { + "epoch": 1.4, + "grad_norm": 0.54296875, + "learning_rate": 0.0004973458212614222, + "loss": 0.2376, + "step": 33720 + }, + { + "epoch": 1.4, + "grad_norm": 0.5078125, + "learning_rate": 0.0004973442449000145, + "loss": 0.242, + "step": 33730 + }, + { + "epoch": 1.4, + "grad_norm": 0.703125, + "learning_rate": 0.0004973426680731315, + "loss": 0.2222, + "step": 33740 + }, + { + "epoch": 1.4, + "grad_norm": 1.4453125, + "learning_rate": 0.0004973410907807764, + "loss": 0.1971, + "step": 33750 + }, + { + "epoch": 1.4, + "grad_norm": 2.046875, + "learning_rate": 0.0004973395130229522, + "loss": 0.2418, + "step": 33760 + }, + { + "epoch": 1.4, + "grad_norm": 0.7265625, + "learning_rate": 0.0004973379347996617, + "loss": 0.2073, + "step": 33770 + }, + { + "epoch": 1.4, + "grad_norm": 0.5234375, + "learning_rate": 0.0004973363561109078, + "loss": 0.1941, + "step": 33780 + }, + { + "epoch": 1.4, + "grad_norm": 0.703125, + "learning_rate": 0.0004973347769566936, + "loss": 0.2276, + "step": 33790 + }, + { + "epoch": 1.4, + "grad_norm": 0.41796875, + "learning_rate": 0.0004973331973370221, + "loss": 0.1788, + "step": 33800 + }, + { + "epoch": 1.4, + "grad_norm": 0.875, + "learning_rate": 0.0004973316172518962, + "loss": 0.2064, + "step": 33810 + }, + { + "epoch": 1.4, + "grad_norm": 0.625, + "learning_rate": 0.000497330036701319, + "loss": 0.2022, + "step": 33820 + }, + { + "epoch": 1.4, + "grad_norm": 0.40625, + "learning_rate": 0.0004973284556852933, + "loss": 0.2576, + "step": 33830 + }, + { + "epoch": 1.4, + "grad_norm": 1.0234375, + "learning_rate": 0.0004973268742038222, + "loss": 0.238, + "step": 33840 + }, + { + "epoch": 1.4, + "grad_norm": 1.2265625, + "learning_rate": 0.0004973252922569086, + "loss": 0.2106, + "step": 33850 + }, + { + "epoch": 1.4, + "grad_norm": 0.251953125, + "learning_rate": 0.0004973237098445555, + "loss": 0.2035, + "step": 33860 + }, + { + "epoch": 1.4, + "grad_norm": 2.109375, + "learning_rate": 0.0004973221269667659, + "loss": 0.234, + "step": 33870 + }, + { + "epoch": 1.4, + "grad_norm": 1.8125, + "learning_rate": 0.0004973205436235428, + "loss": 0.2361, + "step": 33880 + }, + { + "epoch": 1.4, + "grad_norm": 0.60546875, + "learning_rate": 0.0004973189598148891, + "loss": 0.2585, + "step": 33890 + }, + { + "epoch": 1.4, + "grad_norm": 0.62890625, + "learning_rate": 0.0004973173755408078, + "loss": 0.206, + "step": 33900 + }, + { + "epoch": 1.4, + "grad_norm": 0.90234375, + "learning_rate": 0.000497315790801302, + "loss": 0.272, + "step": 33910 + }, + { + "epoch": 1.4, + "grad_norm": 0.5, + "learning_rate": 0.0004973142055963746, + "loss": 0.2236, + "step": 33920 + }, + { + "epoch": 1.41, + "grad_norm": 0.462890625, + "learning_rate": 0.0004973126199260283, + "loss": 0.2354, + "step": 33930 + }, + { + "epoch": 1.41, + "grad_norm": 0.71484375, + "learning_rate": 0.0004973110337902667, + "loss": 0.2378, + "step": 33940 + }, + { + "epoch": 1.41, + "grad_norm": 0.51953125, + "learning_rate": 0.0004973094471890923, + "loss": 0.2366, + "step": 33950 + }, + { + "epoch": 1.41, + "grad_norm": 0.671875, + "learning_rate": 0.0004973078601225082, + "loss": 0.2232, + "step": 33960 + }, + { + "epoch": 1.41, + "grad_norm": 0.51953125, + "learning_rate": 0.0004973062725905174, + "loss": 0.2387, + "step": 33970 + }, + { + "epoch": 1.41, + "grad_norm": 1.296875, + "learning_rate": 0.000497304684593123, + "loss": 0.2257, + "step": 33980 + }, + { + "epoch": 1.41, + "grad_norm": 0.83203125, + "learning_rate": 0.0004973030961303279, + "loss": 0.2441, + "step": 33990 + }, + { + "epoch": 1.41, + "grad_norm": 0.47265625, + "learning_rate": 0.000497301507202135, + "loss": 0.2468, + "step": 34000 + }, + { + "epoch": 1.41, + "grad_norm": 0.66796875, + "learning_rate": 0.0004972999178085474, + "loss": 0.2508, + "step": 34010 + }, + { + "epoch": 1.41, + "grad_norm": 0.7109375, + "learning_rate": 0.000497298327949568, + "loss": 0.2095, + "step": 34020 + }, + { + "epoch": 1.41, + "grad_norm": 0.40234375, + "learning_rate": 0.0004972967376251999, + "loss": 0.1829, + "step": 34030 + }, + { + "epoch": 1.41, + "grad_norm": 1.1796875, + "learning_rate": 0.0004972951468354461, + "loss": 0.2092, + "step": 34040 + }, + { + "epoch": 1.41, + "grad_norm": 0.55859375, + "learning_rate": 0.0004972935555803094, + "loss": 0.216, + "step": 34050 + }, + { + "epoch": 1.41, + "grad_norm": 0.400390625, + "learning_rate": 0.000497291963859793, + "loss": 0.2133, + "step": 34060 + }, + { + "epoch": 1.41, + "grad_norm": 0.7421875, + "learning_rate": 0.0004972903716738999, + "loss": 0.2427, + "step": 34070 + }, + { + "epoch": 1.41, + "grad_norm": 0.80078125, + "learning_rate": 0.000497288779022633, + "loss": 0.2072, + "step": 34080 + }, + { + "epoch": 1.41, + "grad_norm": 0.74609375, + "learning_rate": 0.0004972871859059954, + "loss": 0.2428, + "step": 34090 + }, + { + "epoch": 1.41, + "grad_norm": 0.5078125, + "learning_rate": 0.0004972855923239899, + "loss": 0.1908, + "step": 34100 + }, + { + "epoch": 1.41, + "grad_norm": 1.0390625, + "learning_rate": 0.0004972839982766195, + "loss": 0.2338, + "step": 34110 + }, + { + "epoch": 1.41, + "grad_norm": 0.546875, + "learning_rate": 0.0004972824037638875, + "loss": 0.1912, + "step": 34120 + }, + { + "epoch": 1.41, + "grad_norm": 3.125, + "learning_rate": 0.0004972808087857967, + "loss": 0.1916, + "step": 34130 + }, + { + "epoch": 1.41, + "grad_norm": 0.89453125, + "learning_rate": 0.0004972792133423501, + "loss": 0.2105, + "step": 34140 + }, + { + "epoch": 1.41, + "grad_norm": 0.61328125, + "learning_rate": 0.0004972776174335508, + "loss": 0.1574, + "step": 34150 + }, + { + "epoch": 1.41, + "grad_norm": 0.73046875, + "learning_rate": 0.0004972760210594016, + "loss": 0.2259, + "step": 34160 + }, + { + "epoch": 1.42, + "grad_norm": 0.703125, + "learning_rate": 0.0004972744242199056, + "loss": 0.2312, + "step": 34170 + }, + { + "epoch": 1.42, + "grad_norm": 0.74609375, + "learning_rate": 0.0004972728269150659, + "loss": 0.2505, + "step": 34180 + }, + { + "epoch": 1.42, + "grad_norm": 0.53125, + "learning_rate": 0.0004972712291448856, + "loss": 0.2035, + "step": 34190 + }, + { + "epoch": 1.42, + "grad_norm": 0.7109375, + "learning_rate": 0.0004972696309093673, + "loss": 0.1633, + "step": 34200 + }, + { + "epoch": 1.42, + "grad_norm": 1.5625, + "learning_rate": 0.0004972680322085144, + "loss": 0.2578, + "step": 34210 + }, + { + "epoch": 1.42, + "grad_norm": 0.33984375, + "learning_rate": 0.0004972664330423298, + "loss": 0.2219, + "step": 34220 + }, + { + "epoch": 1.42, + "grad_norm": 0.7578125, + "learning_rate": 0.0004972648334108165, + "loss": 0.2388, + "step": 34230 + }, + { + "epoch": 1.42, + "grad_norm": 0.64453125, + "learning_rate": 0.0004972632333139773, + "loss": 0.179, + "step": 34240 + }, + { + "epoch": 1.42, + "grad_norm": 0.82421875, + "learning_rate": 0.0004972616327518155, + "loss": 0.2271, + "step": 34250 + }, + { + "epoch": 1.42, + "grad_norm": 0.60546875, + "learning_rate": 0.000497260031724334, + "loss": 0.3013, + "step": 34260 + }, + { + "epoch": 1.42, + "grad_norm": 2.296875, + "learning_rate": 0.0004972584302315358, + "loss": 0.1855, + "step": 34270 + }, + { + "epoch": 1.42, + "grad_norm": 0.341796875, + "learning_rate": 0.000497256828273424, + "loss": 0.2565, + "step": 34280 + }, + { + "epoch": 1.42, + "grad_norm": 0.55078125, + "learning_rate": 0.0004972552258500015, + "loss": 0.1978, + "step": 34290 + }, + { + "epoch": 1.42, + "grad_norm": 0.67578125, + "learning_rate": 0.0004972536229612713, + "loss": 0.1941, + "step": 34300 + }, + { + "epoch": 1.42, + "grad_norm": 0.94921875, + "learning_rate": 0.0004972520196072366, + "loss": 0.2048, + "step": 34310 + }, + { + "epoch": 1.42, + "grad_norm": 0.62890625, + "learning_rate": 0.0004972504157879003, + "loss": 0.2495, + "step": 34320 + }, + { + "epoch": 1.42, + "grad_norm": 0.455078125, + "learning_rate": 0.0004972488115032653, + "loss": 0.2151, + "step": 34330 + }, + { + "epoch": 1.42, + "grad_norm": 0.55859375, + "learning_rate": 0.0004972472067533349, + "loss": 0.2775, + "step": 34340 + }, + { + "epoch": 1.42, + "grad_norm": 0.53125, + "learning_rate": 0.0004972456015381119, + "loss": 0.259, + "step": 34350 + }, + { + "epoch": 1.42, + "grad_norm": 0.2060546875, + "learning_rate": 0.0004972439958575993, + "loss": 0.1731, + "step": 34360 + }, + { + "epoch": 1.42, + "grad_norm": 0.75390625, + "learning_rate": 0.0004972423897118002, + "loss": 0.2107, + "step": 34370 + }, + { + "epoch": 1.42, + "grad_norm": 0.66796875, + "learning_rate": 0.0004972407831007178, + "loss": 0.255, + "step": 34380 + }, + { + "epoch": 1.42, + "grad_norm": 0.31640625, + "learning_rate": 0.0004972391760243547, + "loss": 0.2457, + "step": 34390 + }, + { + "epoch": 1.42, + "grad_norm": 0.92578125, + "learning_rate": 0.0004972375684827144, + "loss": 0.2535, + "step": 34400 + }, + { + "epoch": 1.43, + "grad_norm": 0.453125, + "learning_rate": 0.0004972359604757995, + "loss": 0.1854, + "step": 34410 + }, + { + "epoch": 1.43, + "grad_norm": 1.328125, + "learning_rate": 0.0004972343520036134, + "loss": 0.2135, + "step": 34420 + }, + { + "epoch": 1.43, + "grad_norm": 1.078125, + "learning_rate": 0.0004972327430661589, + "loss": 0.2137, + "step": 34430 + }, + { + "epoch": 1.43, + "grad_norm": 0.2197265625, + "learning_rate": 0.0004972311336634389, + "loss": 0.25, + "step": 34440 + }, + { + "epoch": 1.43, + "grad_norm": 0.6171875, + "learning_rate": 0.0004972295237954567, + "loss": 0.2761, + "step": 34450 + }, + { + "epoch": 1.43, + "grad_norm": 0.83203125, + "learning_rate": 0.0004972279134622153, + "loss": 0.2822, + "step": 34460 + }, + { + "epoch": 1.43, + "grad_norm": 0.71484375, + "learning_rate": 0.0004972263026637177, + "loss": 0.207, + "step": 34470 + }, + { + "epoch": 1.43, + "grad_norm": 0.47265625, + "learning_rate": 0.0004972246913999669, + "loss": 0.24, + "step": 34480 + }, + { + "epoch": 1.43, + "grad_norm": 0.462890625, + "learning_rate": 0.0004972230796709658, + "loss": 0.1965, + "step": 34490 + }, + { + "epoch": 1.43, + "grad_norm": 0.453125, + "learning_rate": 0.0004972214674767175, + "loss": 0.2433, + "step": 34500 + }, + { + "epoch": 1.43, + "grad_norm": 0.61328125, + "learning_rate": 0.0004972198548172253, + "loss": 0.1926, + "step": 34510 + }, + { + "epoch": 1.43, + "grad_norm": 0.498046875, + "learning_rate": 0.000497218241692492, + "loss": 0.2038, + "step": 34520 + }, + { + "epoch": 1.43, + "grad_norm": 0.498046875, + "learning_rate": 0.0004972166281025206, + "loss": 0.2651, + "step": 34530 + }, + { + "epoch": 1.43, + "grad_norm": 2.359375, + "learning_rate": 0.0004972150140473143, + "loss": 0.2405, + "step": 34540 + }, + { + "epoch": 1.43, + "grad_norm": 0.216796875, + "learning_rate": 0.0004972133995268759, + "loss": 0.2277, + "step": 34550 + }, + { + "epoch": 1.43, + "grad_norm": 1.9453125, + "learning_rate": 0.0004972117845412086, + "loss": 0.2697, + "step": 34560 + }, + { + "epoch": 1.43, + "grad_norm": 0.4921875, + "learning_rate": 0.0004972101690903154, + "loss": 0.2268, + "step": 34570 + }, + { + "epoch": 1.43, + "grad_norm": 0.90234375, + "learning_rate": 0.0004972085531741995, + "loss": 0.2333, + "step": 34580 + }, + { + "epoch": 1.43, + "grad_norm": 0.43359375, + "learning_rate": 0.0004972069367928638, + "loss": 0.2426, + "step": 34590 + }, + { + "epoch": 1.43, + "grad_norm": 1.0703125, + "learning_rate": 0.0004972053199463112, + "loss": 0.2353, + "step": 34600 + }, + { + "epoch": 1.43, + "grad_norm": 1.21875, + "learning_rate": 0.0004972037026345449, + "loss": 0.2012, + "step": 34610 + }, + { + "epoch": 1.43, + "grad_norm": 0.23828125, + "learning_rate": 0.000497202084857568, + "loss": 0.2463, + "step": 34620 + }, + { + "epoch": 1.43, + "grad_norm": 0.75390625, + "learning_rate": 0.0004972004666153834, + "loss": 0.2445, + "step": 34630 + }, + { + "epoch": 1.43, + "grad_norm": 0.40234375, + "learning_rate": 0.0004971988479079943, + "loss": 0.2766, + "step": 34640 + }, + { + "epoch": 1.44, + "grad_norm": 0.82421875, + "learning_rate": 0.0004971972287354037, + "loss": 0.2183, + "step": 34650 + }, + { + "epoch": 1.44, + "grad_norm": 0.61328125, + "learning_rate": 0.0004971956090976144, + "loss": 0.1914, + "step": 34660 + }, + { + "epoch": 1.44, + "grad_norm": 0.2490234375, + "learning_rate": 0.0004971939889946298, + "loss": 0.2588, + "step": 34670 + }, + { + "epoch": 1.44, + "grad_norm": 0.76953125, + "learning_rate": 0.0004971923684264529, + "loss": 0.1932, + "step": 34680 + }, + { + "epoch": 1.44, + "grad_norm": 0.94921875, + "learning_rate": 0.0004971907473930865, + "loss": 0.229, + "step": 34690 + }, + { + "epoch": 1.44, + "grad_norm": 0.69921875, + "learning_rate": 0.0004971891258945338, + "loss": 0.2208, + "step": 34700 + }, + { + "epoch": 1.44, + "grad_norm": 0.97265625, + "learning_rate": 0.000497187503930798, + "loss": 0.2328, + "step": 34710 + }, + { + "epoch": 1.44, + "grad_norm": 0.54296875, + "learning_rate": 0.0004971858815018819, + "loss": 0.2035, + "step": 34720 + }, + { + "epoch": 1.44, + "grad_norm": 0.8203125, + "learning_rate": 0.0004971842586077887, + "loss": 0.196, + "step": 34730 + }, + { + "epoch": 1.44, + "grad_norm": 0.58984375, + "learning_rate": 0.0004971826352485214, + "loss": 0.2227, + "step": 34740 + }, + { + "epoch": 1.44, + "grad_norm": 0.462890625, + "learning_rate": 0.0004971810114240831, + "loss": 0.1942, + "step": 34750 + }, + { + "epoch": 1.44, + "grad_norm": 0.546875, + "learning_rate": 0.0004971793871344769, + "loss": 0.2463, + "step": 34760 + }, + { + "epoch": 1.44, + "grad_norm": 0.7421875, + "learning_rate": 0.0004971777623797057, + "loss": 0.2441, + "step": 34770 + }, + { + "epoch": 1.44, + "grad_norm": 0.58984375, + "learning_rate": 0.0004971761371597726, + "loss": 0.2323, + "step": 34780 + }, + { + "epoch": 1.44, + "grad_norm": 0.51953125, + "learning_rate": 0.0004971745114746807, + "loss": 0.2159, + "step": 34790 + }, + { + "epoch": 1.44, + "grad_norm": 0.341796875, + "learning_rate": 0.0004971728853244332, + "loss": 0.2276, + "step": 34800 + }, + { + "epoch": 1.44, + "grad_norm": 0.78125, + "learning_rate": 0.000497171258709033, + "loss": 0.3041, + "step": 34810 + }, + { + "epoch": 1.44, + "grad_norm": 0.81640625, + "learning_rate": 0.000497169631628483, + "loss": 0.1959, + "step": 34820 + }, + { + "epoch": 1.44, + "grad_norm": 0.74609375, + "learning_rate": 0.0004971680040827866, + "loss": 0.2056, + "step": 34830 + }, + { + "epoch": 1.44, + "grad_norm": 2.46875, + "learning_rate": 0.0004971663760719468, + "loss": 0.2596, + "step": 34840 + }, + { + "epoch": 1.44, + "grad_norm": 0.44921875, + "learning_rate": 0.0004971647475959663, + "loss": 0.1424, + "step": 34850 + }, + { + "epoch": 1.44, + "grad_norm": 0.46484375, + "learning_rate": 0.0004971631186548485, + "loss": 0.2479, + "step": 34860 + }, + { + "epoch": 1.44, + "grad_norm": 0.65234375, + "learning_rate": 0.0004971614892485966, + "loss": 0.2249, + "step": 34870 + }, + { + "epoch": 1.44, + "grad_norm": 0.3984375, + "learning_rate": 0.0004971598593772133, + "loss": 0.2682, + "step": 34880 + }, + { + "epoch": 1.45, + "grad_norm": 1.0546875, + "learning_rate": 0.0004971582290407019, + "loss": 0.2116, + "step": 34890 + }, + { + "epoch": 1.45, + "grad_norm": 0.404296875, + "learning_rate": 0.0004971565982390655, + "loss": 0.2536, + "step": 34900 + }, + { + "epoch": 1.45, + "grad_norm": 0.494140625, + "learning_rate": 0.0004971549669723069, + "loss": 0.2457, + "step": 34910 + }, + { + "epoch": 1.45, + "grad_norm": 0.435546875, + "learning_rate": 0.0004971533352404293, + "loss": 0.2349, + "step": 34920 + }, + { + "epoch": 1.45, + "grad_norm": 0.5234375, + "learning_rate": 0.000497151703043436, + "loss": 0.237, + "step": 34930 + }, + { + "epoch": 1.45, + "grad_norm": 0.65625, + "learning_rate": 0.0004971500703813298, + "loss": 0.2297, + "step": 34940 + }, + { + "epoch": 1.45, + "grad_norm": 1.5, + "learning_rate": 0.0004971484372541139, + "loss": 0.2623, + "step": 34950 + }, + { + "epoch": 1.45, + "grad_norm": 0.55078125, + "learning_rate": 0.0004971468036617913, + "loss": 0.1688, + "step": 34960 + }, + { + "epoch": 1.45, + "grad_norm": 0.54296875, + "learning_rate": 0.0004971451696043651, + "loss": 0.2036, + "step": 34970 + }, + { + "epoch": 1.45, + "grad_norm": 0.671875, + "learning_rate": 0.0004971435350818383, + "loss": 0.2725, + "step": 34980 + }, + { + "epoch": 1.45, + "grad_norm": 0.61328125, + "learning_rate": 0.0004971419000942142, + "loss": 0.2261, + "step": 34990 + }, + { + "epoch": 1.45, + "grad_norm": 1.3671875, + "learning_rate": 0.0004971402646414956, + "loss": 0.2527, + "step": 35000 + }, + { + "epoch": 1.45, + "grad_norm": 1.1171875, + "learning_rate": 0.0004971386287236859, + "loss": 0.1944, + "step": 35010 + }, + { + "epoch": 1.45, + "grad_norm": 0.392578125, + "learning_rate": 0.0004971369923407878, + "loss": 0.2435, + "step": 35020 + }, + { + "epoch": 1.45, + "grad_norm": 2.25, + "learning_rate": 0.0004971353554928047, + "loss": 0.2256, + "step": 35030 + }, + { + "epoch": 1.45, + "grad_norm": 0.54296875, + "learning_rate": 0.0004971337181797394, + "loss": 0.2237, + "step": 35040 + }, + { + "epoch": 1.45, + "grad_norm": 0.66796875, + "learning_rate": 0.0004971320804015953, + "loss": 0.2559, + "step": 35050 + }, + { + "epoch": 1.45, + "grad_norm": 0.8984375, + "learning_rate": 0.0004971304421583753, + "loss": 0.2025, + "step": 35060 + }, + { + "epoch": 1.45, + "grad_norm": 0.66015625, + "learning_rate": 0.0004971288034500823, + "loss": 0.2184, + "step": 35070 + }, + { + "epoch": 1.45, + "grad_norm": 0.6484375, + "learning_rate": 0.0004971271642767197, + "loss": 0.2229, + "step": 35080 + }, + { + "epoch": 1.45, + "grad_norm": 1.09375, + "learning_rate": 0.0004971255246382904, + "loss": 0.1613, + "step": 35090 + }, + { + "epoch": 1.45, + "grad_norm": 0.78125, + "learning_rate": 0.0004971238845347978, + "loss": 0.2497, + "step": 35100 + }, + { + "epoch": 1.45, + "grad_norm": 0.80078125, + "learning_rate": 0.0004971222439662445, + "loss": 0.1692, + "step": 35110 + }, + { + "epoch": 1.45, + "grad_norm": 0.67578125, + "learning_rate": 0.0004971206029326338, + "loss": 0.2397, + "step": 35120 + }, + { + "epoch": 1.46, + "grad_norm": 0.6171875, + "learning_rate": 0.000497118961433969, + "loss": 0.2188, + "step": 35130 + }, + { + "epoch": 1.46, + "grad_norm": 0.54296875, + "learning_rate": 0.0004971173194702528, + "loss": 0.2466, + "step": 35140 + }, + { + "epoch": 1.46, + "grad_norm": 0.8046875, + "learning_rate": 0.0004971156770414886, + "loss": 0.2407, + "step": 35150 + }, + { + "epoch": 1.46, + "grad_norm": 1.0390625, + "learning_rate": 0.0004971140341476794, + "loss": 0.2371, + "step": 35160 + }, + { + "epoch": 1.46, + "grad_norm": 0.67578125, + "learning_rate": 0.0004971123907888282, + "loss": 0.2381, + "step": 35170 + }, + { + "epoch": 1.46, + "grad_norm": 0.478515625, + "learning_rate": 0.0004971107469649382, + "loss": 0.1977, + "step": 35180 + }, + { + "epoch": 1.46, + "grad_norm": 0.498046875, + "learning_rate": 0.0004971091026760125, + "loss": 0.2276, + "step": 35190 + }, + { + "epoch": 1.46, + "grad_norm": 0.70703125, + "learning_rate": 0.0004971074579220541, + "loss": 0.2475, + "step": 35200 + }, + { + "epoch": 1.46, + "grad_norm": 0.6484375, + "learning_rate": 0.0004971058127030662, + "loss": 0.2072, + "step": 35210 + }, + { + "epoch": 1.46, + "grad_norm": 0.640625, + "learning_rate": 0.0004971041670190519, + "loss": 0.2476, + "step": 35220 + }, + { + "epoch": 1.46, + "grad_norm": 0.625, + "learning_rate": 0.000497102520870014, + "loss": 0.2155, + "step": 35230 + }, + { + "epoch": 1.46, + "grad_norm": 0.6875, + "learning_rate": 0.000497100874255956, + "loss": 0.2769, + "step": 35240 + }, + { + "epoch": 1.46, + "grad_norm": 0.34375, + "learning_rate": 0.0004970992271768809, + "loss": 0.1927, + "step": 35250 + }, + { + "epoch": 1.46, + "grad_norm": 0.30078125, + "learning_rate": 0.0004970975796327917, + "loss": 0.2523, + "step": 35260 + }, + { + "epoch": 1.46, + "grad_norm": 0.953125, + "learning_rate": 0.0004970959316236915, + "loss": 0.2251, + "step": 35270 + }, + { + "epoch": 1.46, + "grad_norm": 0.384765625, + "learning_rate": 0.0004970942831495834, + "loss": 0.238, + "step": 35280 + }, + { + "epoch": 1.46, + "grad_norm": 0.63671875, + "learning_rate": 0.0004970926342104706, + "loss": 0.2836, + "step": 35290 + }, + { + "epoch": 1.46, + "grad_norm": 0.7578125, + "learning_rate": 0.0004970909848063562, + "loss": 0.2309, + "step": 35300 + }, + { + "epoch": 1.46, + "grad_norm": 0.44140625, + "learning_rate": 0.0004970893349372431, + "loss": 0.2853, + "step": 35310 + }, + { + "epoch": 1.46, + "grad_norm": 0.66796875, + "learning_rate": 0.0004970876846031347, + "loss": 0.228, + "step": 35320 + }, + { + "epoch": 1.46, + "grad_norm": 0.302734375, + "learning_rate": 0.0004970860338040339, + "loss": 0.2123, + "step": 35330 + }, + { + "epoch": 1.46, + "grad_norm": 0.73828125, + "learning_rate": 0.0004970843825399439, + "loss": 0.1809, + "step": 35340 + }, + { + "epoch": 1.46, + "grad_norm": 1.390625, + "learning_rate": 0.0004970827308108677, + "loss": 0.2515, + "step": 35350 + }, + { + "epoch": 1.46, + "grad_norm": 0.97265625, + "learning_rate": 0.0004970810786168086, + "loss": 0.2247, + "step": 35360 + }, + { + "epoch": 1.47, + "grad_norm": 0.75, + "learning_rate": 0.0004970794259577696, + "loss": 0.2446, + "step": 35370 + }, + { + "epoch": 1.47, + "grad_norm": 0.60546875, + "learning_rate": 0.0004970777728337537, + "loss": 0.2018, + "step": 35380 + }, + { + "epoch": 1.47, + "grad_norm": 0.46484375, + "learning_rate": 0.0004970761192447642, + "loss": 0.2285, + "step": 35390 + }, + { + "epoch": 1.47, + "grad_norm": 0.486328125, + "learning_rate": 0.000497074465190804, + "loss": 0.2045, + "step": 35400 + }, + { + "epoch": 1.47, + "grad_norm": 1.5, + "learning_rate": 0.0004970728106718764, + "loss": 0.2028, + "step": 35410 + }, + { + "epoch": 1.47, + "grad_norm": 0.52734375, + "learning_rate": 0.0004970711556879844, + "loss": 0.2483, + "step": 35420 + }, + { + "epoch": 1.47, + "grad_norm": 1.5234375, + "learning_rate": 0.0004970695002391313, + "loss": 0.2751, + "step": 35430 + }, + { + "epoch": 1.47, + "grad_norm": 0.828125, + "learning_rate": 0.00049706784432532, + "loss": 0.2645, + "step": 35440 + }, + { + "epoch": 1.47, + "grad_norm": 0.69921875, + "learning_rate": 0.0004970661879465537, + "loss": 0.2082, + "step": 35450 + }, + { + "epoch": 1.47, + "grad_norm": 0.59765625, + "learning_rate": 0.0004970645311028355, + "loss": 0.2525, + "step": 35460 + }, + { + "epoch": 1.47, + "grad_norm": 1.0078125, + "learning_rate": 0.0004970628737941686, + "loss": 0.3155, + "step": 35470 + }, + { + "epoch": 1.47, + "grad_norm": 0.53125, + "learning_rate": 0.000497061216020556, + "loss": 0.2763, + "step": 35480 + }, + { + "epoch": 1.47, + "grad_norm": 0.58203125, + "learning_rate": 0.0004970595577820008, + "loss": 0.2597, + "step": 35490 + }, + { + "epoch": 1.47, + "grad_norm": 0.71484375, + "learning_rate": 0.0004970578990785063, + "loss": 0.2336, + "step": 35500 + }, + { + "epoch": 1.47, + "grad_norm": 0.376953125, + "learning_rate": 0.0004970562399100756, + "loss": 0.2178, + "step": 35510 + }, + { + "epoch": 1.47, + "grad_norm": 0.33984375, + "learning_rate": 0.0004970545802767116, + "loss": 0.1701, + "step": 35520 + }, + { + "epoch": 1.47, + "grad_norm": 0.33203125, + "learning_rate": 0.0004970529201784175, + "loss": 0.2332, + "step": 35530 + }, + { + "epoch": 1.47, + "grad_norm": 0.8046875, + "learning_rate": 0.0004970512596151966, + "loss": 0.2382, + "step": 35540 + }, + { + "epoch": 1.47, + "grad_norm": 0.98046875, + "learning_rate": 0.0004970495985870519, + "loss": 0.3322, + "step": 35550 + }, + { + "epoch": 1.47, + "grad_norm": 0.51953125, + "learning_rate": 0.0004970479370939864, + "loss": 0.2165, + "step": 35560 + }, + { + "epoch": 1.47, + "grad_norm": 0.58203125, + "learning_rate": 0.0004970462751360035, + "loss": 0.2535, + "step": 35570 + }, + { + "epoch": 1.47, + "grad_norm": 1.046875, + "learning_rate": 0.0004970446127131061, + "loss": 0.1974, + "step": 35580 + }, + { + "epoch": 1.47, + "grad_norm": 0.76953125, + "learning_rate": 0.0004970429498252976, + "loss": 0.2901, + "step": 35590 + }, + { + "epoch": 1.47, + "grad_norm": 0.38671875, + "learning_rate": 0.0004970412864725808, + "loss": 0.1826, + "step": 35600 + }, + { + "epoch": 1.47, + "grad_norm": 0.484375, + "learning_rate": 0.000497039622654959, + "loss": 0.2422, + "step": 35610 + }, + { + "epoch": 1.48, + "grad_norm": 0.48828125, + "learning_rate": 0.0004970379583724352, + "loss": 0.1997, + "step": 35620 + }, + { + "epoch": 1.48, + "grad_norm": 0.259765625, + "learning_rate": 0.0004970362936250128, + "loss": 0.1904, + "step": 35630 + }, + { + "epoch": 1.48, + "grad_norm": 1.609375, + "learning_rate": 0.0004970346284126947, + "loss": 0.2623, + "step": 35640 + }, + { + "epoch": 1.48, + "grad_norm": 0.50390625, + "learning_rate": 0.0004970329627354842, + "loss": 0.2432, + "step": 35650 + }, + { + "epoch": 1.48, + "grad_norm": 0.76171875, + "learning_rate": 0.0004970312965933843, + "loss": 0.2873, + "step": 35660 + }, + { + "epoch": 1.48, + "grad_norm": 0.609375, + "learning_rate": 0.0004970296299863981, + "loss": 0.2219, + "step": 35670 + }, + { + "epoch": 1.48, + "grad_norm": 0.1884765625, + "learning_rate": 0.0004970279629145289, + "loss": 0.1834, + "step": 35680 + }, + { + "epoch": 1.48, + "grad_norm": 0.79296875, + "learning_rate": 0.0004970262953777797, + "loss": 0.2484, + "step": 35690 + }, + { + "epoch": 1.48, + "grad_norm": 1.5703125, + "learning_rate": 0.0004970246273761536, + "loss": 0.1831, + "step": 35700 + }, + { + "epoch": 1.48, + "grad_norm": 0.33203125, + "learning_rate": 0.000497022958909654, + "loss": 0.2141, + "step": 35710 + }, + { + "epoch": 1.48, + "grad_norm": 0.396484375, + "learning_rate": 0.0004970212899782837, + "loss": 0.2746, + "step": 35720 + }, + { + "epoch": 1.48, + "grad_norm": 0.96875, + "learning_rate": 0.0004970196205820462, + "loss": 0.246, + "step": 35730 + }, + { + "epoch": 1.48, + "grad_norm": 0.39453125, + "learning_rate": 0.0004970179507209443, + "loss": 0.181, + "step": 35740 + }, + { + "epoch": 1.48, + "grad_norm": 0.71484375, + "learning_rate": 0.0004970162803949814, + "loss": 0.238, + "step": 35750 + }, + { + "epoch": 1.48, + "grad_norm": 0.380859375, + "learning_rate": 0.0004970146096041605, + "loss": 0.2028, + "step": 35760 + }, + { + "epoch": 1.48, + "grad_norm": 0.435546875, + "learning_rate": 0.0004970129383484848, + "loss": 0.2708, + "step": 35770 + }, + { + "epoch": 1.48, + "grad_norm": 0.34375, + "learning_rate": 0.0004970112666279573, + "loss": 0.1731, + "step": 35780 + }, + { + "epoch": 1.48, + "grad_norm": 0.78125, + "learning_rate": 0.0004970095944425814, + "loss": 0.2642, + "step": 35790 + }, + { + "epoch": 1.48, + "grad_norm": 0.625, + "learning_rate": 0.00049700792179236, + "loss": 0.2619, + "step": 35800 + }, + { + "epoch": 1.48, + "grad_norm": 0.404296875, + "learning_rate": 0.0004970062486772965, + "loss": 0.2, + "step": 35810 + }, + { + "epoch": 1.48, + "grad_norm": 1.3515625, + "learning_rate": 0.0004970045750973939, + "loss": 0.2821, + "step": 35820 + }, + { + "epoch": 1.48, + "grad_norm": 0.578125, + "learning_rate": 0.0004970029010526552, + "loss": 0.2358, + "step": 35830 + }, + { + "epoch": 1.48, + "grad_norm": 1.9453125, + "learning_rate": 0.0004970012265430838, + "loss": 0.2242, + "step": 35840 + }, + { + "epoch": 1.48, + "grad_norm": 0.5, + "learning_rate": 0.0004969995515686829, + "loss": 0.2079, + "step": 35850 + }, + { + "epoch": 1.49, + "grad_norm": 0.4921875, + "learning_rate": 0.0004969978761294554, + "loss": 0.2329, + "step": 35860 + }, + { + "epoch": 1.49, + "grad_norm": 0.6953125, + "learning_rate": 0.0004969962002254046, + "loss": 0.2288, + "step": 35870 + }, + { + "epoch": 1.49, + "grad_norm": 0.703125, + "learning_rate": 0.0004969945238565336, + "loss": 0.1897, + "step": 35880 + }, + { + "epoch": 1.49, + "grad_norm": 0.6015625, + "learning_rate": 0.0004969928470228456, + "loss": 0.2305, + "step": 35890 + }, + { + "epoch": 1.49, + "grad_norm": 0.392578125, + "learning_rate": 0.0004969911697243437, + "loss": 0.2042, + "step": 35900 + }, + { + "epoch": 1.49, + "grad_norm": 0.458984375, + "learning_rate": 0.0004969894919610311, + "loss": 0.2139, + "step": 35910 + }, + { + "epoch": 1.49, + "grad_norm": 0.353515625, + "learning_rate": 0.000496987813732911, + "loss": 0.223, + "step": 35920 + }, + { + "epoch": 1.49, + "grad_norm": 0.0, + "learning_rate": 0.0004969861350399865, + "loss": 0.2379, + "step": 35930 + }, + { + "epoch": 1.49, + "grad_norm": 0.39453125, + "learning_rate": 0.0004969844558822607, + "loss": 0.2024, + "step": 35940 + }, + { + "epoch": 1.49, + "grad_norm": 0.443359375, + "learning_rate": 0.0004969827762597369, + "loss": 0.1963, + "step": 35950 + }, + { + "epoch": 1.49, + "grad_norm": 0.84765625, + "learning_rate": 0.0004969810961724181, + "loss": 0.2576, + "step": 35960 + }, + { + "epoch": 1.49, + "grad_norm": 0.357421875, + "learning_rate": 0.0004969794156203075, + "loss": 0.2436, + "step": 35970 + }, + { + "epoch": 1.49, + "grad_norm": 1.1875, + "learning_rate": 0.0004969777346034086, + "loss": 0.2286, + "step": 35980 + }, + { + "epoch": 1.49, + "grad_norm": 0.68359375, + "learning_rate": 0.000496976053121724, + "loss": 0.2788, + "step": 35990 + }, + { + "epoch": 1.49, + "grad_norm": 0.29296875, + "learning_rate": 0.0004969743711752573, + "loss": 0.244, + "step": 36000 + }, + { + "epoch": 1.49, + "grad_norm": 0.76953125, + "learning_rate": 0.0004969726887640114, + "loss": 0.2035, + "step": 36010 + }, + { + "epoch": 1.49, + "grad_norm": 0.8828125, + "learning_rate": 0.0004969710058879896, + "loss": 0.252, + "step": 36020 + }, + { + "epoch": 1.49, + "grad_norm": 1.375, + "learning_rate": 0.000496969322547195, + "loss": 0.2548, + "step": 36030 + }, + { + "epoch": 1.49, + "grad_norm": 1.171875, + "learning_rate": 0.0004969676387416308, + "loss": 0.2445, + "step": 36040 + }, + { + "epoch": 1.49, + "grad_norm": 0.59765625, + "learning_rate": 0.0004969659544713002, + "loss": 0.2681, + "step": 36050 + }, + { + "epoch": 1.49, + "grad_norm": 1.1953125, + "learning_rate": 0.0004969642697362064, + "loss": 0.2438, + "step": 36060 + }, + { + "epoch": 1.49, + "grad_norm": 0.470703125, + "learning_rate": 0.0004969625845363525, + "loss": 0.2305, + "step": 36070 + }, + { + "epoch": 1.49, + "grad_norm": 0.498046875, + "learning_rate": 0.0004969608988717416, + "loss": 0.2489, + "step": 36080 + }, + { + "epoch": 1.49, + "grad_norm": 0.2578125, + "learning_rate": 0.0004969592127423772, + "loss": 0.2736, + "step": 36090 + }, + { + "epoch": 1.5, + "grad_norm": 0.453125, + "learning_rate": 0.0004969575261482619, + "loss": 0.231, + "step": 36100 + }, + { + "epoch": 1.5, + "grad_norm": 1.1953125, + "learning_rate": 0.0004969558390893993, + "loss": 0.1996, + "step": 36110 + }, + { + "epoch": 1.5, + "grad_norm": 0.353515625, + "learning_rate": 0.0004969541515657925, + "loss": 0.1583, + "step": 36120 + }, + { + "epoch": 1.5, + "grad_norm": 0.84765625, + "learning_rate": 0.0004969524635774448, + "loss": 0.2699, + "step": 36130 + }, + { + "epoch": 1.5, + "grad_norm": 0.5546875, + "learning_rate": 0.0004969507751243591, + "loss": 0.2391, + "step": 36140 + }, + { + "epoch": 1.5, + "grad_norm": 0.4921875, + "learning_rate": 0.0004969490862065388, + "loss": 0.2557, + "step": 36150 + }, + { + "epoch": 1.5, + "grad_norm": 0.498046875, + "learning_rate": 0.0004969473968239868, + "loss": 0.2922, + "step": 36160 + }, + { + "epoch": 1.5, + "grad_norm": 0.67578125, + "learning_rate": 0.0004969457069767066, + "loss": 0.2295, + "step": 36170 + }, + { + "epoch": 1.5, + "grad_norm": 0.2578125, + "learning_rate": 0.0004969440166647012, + "loss": 0.191, + "step": 36180 + }, + { + "epoch": 1.5, + "grad_norm": 1.5703125, + "learning_rate": 0.0004969423258879739, + "loss": 0.2075, + "step": 36190 + }, + { + "epoch": 1.5, + "grad_norm": 1.046875, + "learning_rate": 0.0004969406346465278, + "loss": 0.2522, + "step": 36200 + }, + { + "epoch": 1.5, + "grad_norm": 1.0078125, + "learning_rate": 0.000496938942940366, + "loss": 0.262, + "step": 36210 + }, + { + "epoch": 1.5, + "grad_norm": 0.83203125, + "learning_rate": 0.0004969372507694919, + "loss": 0.1914, + "step": 36220 + }, + { + "epoch": 1.5, + "grad_norm": 0.8984375, + "learning_rate": 0.0004969355581339086, + "loss": 0.2356, + "step": 36230 + }, + { + "epoch": 1.5, + "grad_norm": 0.828125, + "learning_rate": 0.000496933865033619, + "loss": 0.2753, + "step": 36240 + }, + { + "epoch": 1.5, + "grad_norm": 0.6015625, + "learning_rate": 0.0004969321714686267, + "loss": 0.2507, + "step": 36250 + }, + { + "epoch": 1.5, + "grad_norm": 1.140625, + "learning_rate": 0.0004969304774389347, + "loss": 0.2512, + "step": 36260 + }, + { + "epoch": 1.5, + "grad_norm": 0.72265625, + "learning_rate": 0.0004969287829445462, + "loss": 0.1798, + "step": 36270 + }, + { + "epoch": 1.5, + "grad_norm": 1.109375, + "learning_rate": 0.0004969270879854644, + "loss": 0.2185, + "step": 36280 + }, + { + "epoch": 1.5, + "grad_norm": 0.64453125, + "learning_rate": 0.0004969253925616925, + "loss": 0.2104, + "step": 36290 + }, + { + "epoch": 1.5, + "grad_norm": 0.69140625, + "learning_rate": 0.0004969236966732337, + "loss": 0.2116, + "step": 36300 + }, + { + "epoch": 1.5, + "grad_norm": 0.96875, + "learning_rate": 0.0004969220003200912, + "loss": 0.2689, + "step": 36310 + }, + { + "epoch": 1.5, + "grad_norm": 1.828125, + "learning_rate": 0.000496920303502268, + "loss": 0.2033, + "step": 36320 + }, + { + "epoch": 1.5, + "grad_norm": 0.765625, + "learning_rate": 0.0004969186062197676, + "loss": 0.2434, + "step": 36330 + }, + { + "epoch": 1.51, + "grad_norm": 0.5703125, + "learning_rate": 0.000496916908472593, + "loss": 0.2484, + "step": 36340 + }, + { + "epoch": 1.51, + "grad_norm": 0.5, + "learning_rate": 0.0004969152102607474, + "loss": 0.2215, + "step": 36350 + }, + { + "epoch": 1.51, + "grad_norm": 0.63671875, + "learning_rate": 0.000496913511584234, + "loss": 0.223, + "step": 36360 + }, + { + "epoch": 1.51, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004969118124430561, + "loss": 0.2212, + "step": 36370 + }, + { + "epoch": 1.51, + "grad_norm": 0.1689453125, + "learning_rate": 0.000496910112837217, + "loss": 0.2423, + "step": 36380 + }, + { + "epoch": 1.51, + "grad_norm": 1.5, + "learning_rate": 0.0004969084127667195, + "loss": 0.2109, + "step": 36390 + }, + { + "epoch": 1.51, + "grad_norm": 0.5, + "learning_rate": 0.0004969067122315671, + "loss": 0.2117, + "step": 36400 + }, + { + "epoch": 1.51, + "grad_norm": 0.68359375, + "learning_rate": 0.000496905011231763, + "loss": 0.1734, + "step": 36410 + }, + { + "epoch": 1.51, + "grad_norm": 0.322265625, + "learning_rate": 0.0004969033097673103, + "loss": 0.2013, + "step": 36420 + }, + { + "epoch": 1.51, + "grad_norm": 0.890625, + "learning_rate": 0.0004969016078382122, + "loss": 0.2323, + "step": 36430 + }, + { + "epoch": 1.51, + "grad_norm": 0.38671875, + "learning_rate": 0.000496899905444472, + "loss": 0.2059, + "step": 36440 + }, + { + "epoch": 1.51, + "grad_norm": 1.328125, + "learning_rate": 0.0004968982025860927, + "loss": 0.2723, + "step": 36450 + }, + { + "epoch": 1.51, + "grad_norm": 0.83984375, + "learning_rate": 0.0004968964992630777, + "loss": 0.2475, + "step": 36460 + }, + { + "epoch": 1.51, + "grad_norm": 0.78515625, + "learning_rate": 0.0004968947954754302, + "loss": 0.2396, + "step": 36470 + }, + { + "epoch": 1.51, + "grad_norm": 0.609375, + "learning_rate": 0.0004968930912231534, + "loss": 0.2173, + "step": 36480 + }, + { + "epoch": 1.51, + "grad_norm": 0.404296875, + "learning_rate": 0.0004968913865062504, + "loss": 0.2017, + "step": 36490 + }, + { + "epoch": 1.51, + "grad_norm": 0.361328125, + "learning_rate": 0.0004968896813247244, + "loss": 0.292, + "step": 36500 + }, + { + "epoch": 1.51, + "grad_norm": 0.70703125, + "learning_rate": 0.0004968879756785788, + "loss": 0.2642, + "step": 36510 + }, + { + "epoch": 1.51, + "grad_norm": 0.6015625, + "learning_rate": 0.0004968862695678166, + "loss": 0.2348, + "step": 36520 + }, + { + "epoch": 1.51, + "grad_norm": 0.94140625, + "learning_rate": 0.0004968845629924412, + "loss": 0.1444, + "step": 36530 + }, + { + "epoch": 1.51, + "grad_norm": 1.0234375, + "learning_rate": 0.0004968828559524556, + "loss": 0.2514, + "step": 36540 + }, + { + "epoch": 1.51, + "grad_norm": 0.55859375, + "learning_rate": 0.0004968811484478633, + "loss": 0.2797, + "step": 36550 + }, + { + "epoch": 1.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0004968794404786672, + "loss": 0.2657, + "step": 36560 + }, + { + "epoch": 1.51, + "grad_norm": 0.89453125, + "learning_rate": 0.0004968777320448706, + "loss": 0.2141, + "step": 36570 + }, + { + "epoch": 1.52, + "grad_norm": 0.4453125, + "learning_rate": 0.000496876023146477, + "loss": 0.2147, + "step": 36580 + }, + { + "epoch": 1.52, + "grad_norm": 0.6328125, + "learning_rate": 0.0004968743137834891, + "loss": 0.2236, + "step": 36590 + }, + { + "epoch": 1.52, + "grad_norm": 0.25, + "learning_rate": 0.0004968726039559107, + "loss": 0.2681, + "step": 36600 + }, + { + "epoch": 1.52, + "grad_norm": 0.546875, + "learning_rate": 0.0004968708936637445, + "loss": 0.1938, + "step": 36610 + }, + { + "epoch": 1.52, + "grad_norm": 0.41015625, + "learning_rate": 0.000496869182906994, + "loss": 0.2323, + "step": 36620 + }, + { + "epoch": 1.52, + "grad_norm": 0.8515625, + "learning_rate": 0.0004968674716856623, + "loss": 0.2188, + "step": 36630 + }, + { + "epoch": 1.52, + "grad_norm": 0.294921875, + "learning_rate": 0.0004968657599997528, + "loss": 0.2, + "step": 36640 + }, + { + "epoch": 1.52, + "grad_norm": 0.8203125, + "learning_rate": 0.0004968640478492685, + "loss": 0.2652, + "step": 36650 + }, + { + "epoch": 1.52, + "grad_norm": 0.17578125, + "learning_rate": 0.0004968623352342127, + "loss": 0.2348, + "step": 36660 + }, + { + "epoch": 1.52, + "grad_norm": 1.109375, + "learning_rate": 0.0004968606221545887, + "loss": 0.2693, + "step": 36670 + }, + { + "epoch": 1.52, + "grad_norm": 0.447265625, + "learning_rate": 0.0004968589086103997, + "loss": 0.2154, + "step": 36680 + }, + { + "epoch": 1.52, + "grad_norm": 0.328125, + "learning_rate": 0.0004968571946016488, + "loss": 0.224, + "step": 36690 + }, + { + "epoch": 1.52, + "grad_norm": 1.8046875, + "learning_rate": 0.0004968554801283395, + "loss": 0.2436, + "step": 36700 + }, + { + "epoch": 1.52, + "grad_norm": 0.5625, + "learning_rate": 0.0004968537651904747, + "loss": 0.2207, + "step": 36710 + }, + { + "epoch": 1.52, + "grad_norm": 0.59765625, + "learning_rate": 0.0004968520497880578, + "loss": 0.2341, + "step": 36720 + }, + { + "epoch": 1.52, + "grad_norm": 0.9296875, + "learning_rate": 0.0004968503339210919, + "loss": 0.2874, + "step": 36730 + }, + { + "epoch": 1.52, + "grad_norm": 0.8359375, + "learning_rate": 0.0004968486175895804, + "loss": 0.2627, + "step": 36740 + }, + { + "epoch": 1.52, + "grad_norm": 0.486328125, + "learning_rate": 0.0004968469007935265, + "loss": 0.2494, + "step": 36750 + }, + { + "epoch": 1.52, + "grad_norm": 0.84375, + "learning_rate": 0.0004968451835329334, + "loss": 0.2541, + "step": 36760 + }, + { + "epoch": 1.52, + "grad_norm": 0.55078125, + "learning_rate": 0.0004968434658078043, + "loss": 0.2114, + "step": 36770 + }, + { + "epoch": 1.52, + "grad_norm": 0.75, + "learning_rate": 0.0004968417476181423, + "loss": 0.2307, + "step": 36780 + }, + { + "epoch": 1.52, + "grad_norm": 0.89453125, + "learning_rate": 0.000496840028963951, + "loss": 0.2277, + "step": 36790 + }, + { + "epoch": 1.52, + "grad_norm": 0.7890625, + "learning_rate": 0.0004968383098452333, + "loss": 0.2088, + "step": 36800 + }, + { + "epoch": 1.52, + "grad_norm": 0.341796875, + "learning_rate": 0.0004968365902619927, + "loss": 0.1827, + "step": 36810 + }, + { + "epoch": 1.53, + "grad_norm": 0.625, + "learning_rate": 0.0004968348702142322, + "loss": 0.2659, + "step": 36820 + }, + { + "epoch": 1.53, + "grad_norm": 0.5078125, + "learning_rate": 0.0004968331497019552, + "loss": 0.2071, + "step": 36830 + }, + { + "epoch": 1.53, + "grad_norm": 0.72265625, + "learning_rate": 0.0004968314287251647, + "loss": 0.2174, + "step": 36840 + }, + { + "epoch": 1.53, + "grad_norm": 0.62890625, + "learning_rate": 0.0004968297072838642, + "loss": 0.232, + "step": 36850 + }, + { + "epoch": 1.53, + "grad_norm": 0.51953125, + "learning_rate": 0.0004968279853780569, + "loss": 0.1905, + "step": 36860 + }, + { + "epoch": 1.53, + "grad_norm": 0.54296875, + "learning_rate": 0.0004968262630077459, + "loss": 0.2117, + "step": 36870 + }, + { + "epoch": 1.53, + "grad_norm": 0.56640625, + "learning_rate": 0.0004968245401729347, + "loss": 0.2297, + "step": 36880 + }, + { + "epoch": 1.53, + "grad_norm": 0.1591796875, + "learning_rate": 0.0004968228168736262, + "loss": 0.2349, + "step": 36890 + }, + { + "epoch": 1.53, + "grad_norm": 0.62109375, + "learning_rate": 0.0004968210931098238, + "loss": 0.2037, + "step": 36900 + }, + { + "epoch": 1.53, + "grad_norm": 0.5390625, + "learning_rate": 0.0004968193688815308, + "loss": 0.187, + "step": 36910 + }, + { + "epoch": 1.53, + "grad_norm": 0.357421875, + "learning_rate": 0.0004968176441887504, + "loss": 0.1582, + "step": 36920 + }, + { + "epoch": 1.53, + "grad_norm": 0.5234375, + "learning_rate": 0.000496815919031486, + "loss": 0.2276, + "step": 36930 + }, + { + "epoch": 1.53, + "grad_norm": 1.1484375, + "learning_rate": 0.0004968141934097404, + "loss": 0.1896, + "step": 36940 + }, + { + "epoch": 1.53, + "grad_norm": 0.35546875, + "learning_rate": 0.0004968124673235174, + "loss": 0.2401, + "step": 36950 + }, + { + "epoch": 1.53, + "grad_norm": 0.0, + "learning_rate": 0.0004968107407728198, + "loss": 0.2086, + "step": 36960 + }, + { + "epoch": 1.53, + "grad_norm": 1.625, + "learning_rate": 0.0004968090137576511, + "loss": 0.2328, + "step": 36970 + }, + { + "epoch": 1.53, + "grad_norm": 0.5078125, + "learning_rate": 0.0004968072862780146, + "loss": 0.195, + "step": 36980 + }, + { + "epoch": 1.53, + "grad_norm": 0.92578125, + "learning_rate": 0.0004968055583339133, + "loss": 0.254, + "step": 36990 + }, + { + "epoch": 1.53, + "grad_norm": 2.265625, + "learning_rate": 0.0004968038299253506, + "loss": 0.2602, + "step": 37000 + }, + { + "epoch": 1.53, + "grad_norm": 0.28125, + "learning_rate": 0.0004968021010523299, + "loss": 0.3009, + "step": 37010 + }, + { + "epoch": 1.53, + "grad_norm": 0.79296875, + "learning_rate": 0.0004968003717148541, + "loss": 0.2404, + "step": 37020 + }, + { + "epoch": 1.53, + "grad_norm": 1.3671875, + "learning_rate": 0.0004967986419129268, + "loss": 0.2237, + "step": 37030 + }, + { + "epoch": 1.53, + "grad_norm": 0.5859375, + "learning_rate": 0.0004967969116465511, + "loss": 0.2699, + "step": 37040 + }, + { + "epoch": 1.53, + "grad_norm": 0.318359375, + "learning_rate": 0.0004967951809157302, + "loss": 0.2567, + "step": 37050 + }, + { + "epoch": 1.54, + "grad_norm": 0.85546875, + "learning_rate": 0.0004967934497204674, + "loss": 0.2057, + "step": 37060 + }, + { + "epoch": 1.54, + "grad_norm": 0.306640625, + "learning_rate": 0.0004967917180607659, + "loss": 0.1745, + "step": 37070 + }, + { + "epoch": 1.54, + "grad_norm": 2.15625, + "learning_rate": 0.0004967899859366293, + "loss": 0.216, + "step": 37080 + }, + { + "epoch": 1.54, + "grad_norm": 0.55859375, + "learning_rate": 0.0004967882533480604, + "loss": 0.2533, + "step": 37090 + }, + { + "epoch": 1.54, + "grad_norm": 0.96875, + "learning_rate": 0.0004967865202950627, + "loss": 0.2454, + "step": 37100 + }, + { + "epoch": 1.54, + "grad_norm": 0.73046875, + "learning_rate": 0.0004967847867776394, + "loss": 0.237, + "step": 37110 + }, + { + "epoch": 1.54, + "grad_norm": 1.125, + "learning_rate": 0.0004967830527957939, + "loss": 0.2344, + "step": 37120 + }, + { + "epoch": 1.54, + "grad_norm": 0.734375, + "learning_rate": 0.0004967813183495292, + "loss": 0.2478, + "step": 37130 + }, + { + "epoch": 1.54, + "grad_norm": 0.890625, + "learning_rate": 0.0004967795834388488, + "loss": 0.2554, + "step": 37140 + }, + { + "epoch": 1.54, + "grad_norm": 2.875, + "learning_rate": 0.0004967778480637558, + "loss": 0.2315, + "step": 37150 + }, + { + "epoch": 1.54, + "grad_norm": 1.2578125, + "learning_rate": 0.0004967761122242535, + "loss": 0.2212, + "step": 37160 + }, + { + "epoch": 1.54, + "grad_norm": 0.52734375, + "learning_rate": 0.0004967743759203454, + "loss": 0.2574, + "step": 37170 + }, + { + "epoch": 1.54, + "grad_norm": 0.66015625, + "learning_rate": 0.0004967726391520344, + "loss": 0.2372, + "step": 37180 + }, + { + "epoch": 1.54, + "grad_norm": 1.984375, + "learning_rate": 0.000496770901919324, + "loss": 0.2245, + "step": 37190 + }, + { + "epoch": 1.54, + "grad_norm": 0.75, + "learning_rate": 0.0004967691642222174, + "loss": 0.2332, + "step": 37200 + }, + { + "epoch": 1.54, + "grad_norm": 0.8359375, + "learning_rate": 0.0004967674260607179, + "loss": 0.1794, + "step": 37210 + }, + { + "epoch": 1.54, + "grad_norm": 0.0, + "learning_rate": 0.0004967656874348287, + "loss": 0.197, + "step": 37220 + }, + { + "epoch": 1.54, + "grad_norm": 1.0546875, + "learning_rate": 0.0004967639483445532, + "loss": 0.2429, + "step": 37230 + }, + { + "epoch": 1.54, + "grad_norm": 0.458984375, + "learning_rate": 0.0004967622087898945, + "loss": 0.2377, + "step": 37240 + }, + { + "epoch": 1.54, + "grad_norm": 0.318359375, + "learning_rate": 0.0004967604687708561, + "loss": 0.277, + "step": 37250 + }, + { + "epoch": 1.54, + "grad_norm": 0.7578125, + "learning_rate": 0.000496758728287441, + "loss": 0.205, + "step": 37260 + }, + { + "epoch": 1.54, + "grad_norm": 0.7265625, + "learning_rate": 0.0004967569873396529, + "loss": 0.2005, + "step": 37270 + }, + { + "epoch": 1.54, + "grad_norm": 0.546875, + "learning_rate": 0.0004967552459274945, + "loss": 0.243, + "step": 37280 + }, + { + "epoch": 1.54, + "grad_norm": 0.8671875, + "learning_rate": 0.0004967535040509694, + "loss": 0.2718, + "step": 37290 + }, + { + "epoch": 1.54, + "grad_norm": 0.94921875, + "learning_rate": 0.000496751761710081, + "loss": 0.2361, + "step": 37300 + }, + { + "epoch": 1.55, + "grad_norm": 0.7578125, + "learning_rate": 0.0004967500189048324, + "loss": 0.2434, + "step": 37310 + }, + { + "epoch": 1.55, + "grad_norm": 0.3515625, + "learning_rate": 0.0004967482756352269, + "loss": 0.2048, + "step": 37320 + }, + { + "epoch": 1.55, + "grad_norm": 0.4453125, + "learning_rate": 0.0004967465319012677, + "loss": 0.2087, + "step": 37330 + }, + { + "epoch": 1.55, + "grad_norm": 0.462890625, + "learning_rate": 0.0004967447877029581, + "loss": 0.2013, + "step": 37340 + }, + { + "epoch": 1.55, + "grad_norm": 0.427734375, + "learning_rate": 0.0004967430430403017, + "loss": 0.243, + "step": 37350 + }, + { + "epoch": 1.55, + "grad_norm": 0.69140625, + "learning_rate": 0.0004967412979133014, + "loss": 0.2354, + "step": 37360 + }, + { + "epoch": 1.55, + "grad_norm": 0.486328125, + "learning_rate": 0.0004967395523219607, + "loss": 0.1833, + "step": 37370 + }, + { + "epoch": 1.55, + "grad_norm": 0.6875, + "learning_rate": 0.0004967378062662827, + "loss": 0.2784, + "step": 37380 + }, + { + "epoch": 1.55, + "grad_norm": 0.9375, + "learning_rate": 0.0004967360597462709, + "loss": 0.2154, + "step": 37390 + }, + { + "epoch": 1.55, + "grad_norm": 0.61328125, + "learning_rate": 0.0004967343127619284, + "loss": 0.2248, + "step": 37400 + }, + { + "epoch": 1.55, + "grad_norm": 0.25390625, + "learning_rate": 0.0004967325653132586, + "loss": 0.1726, + "step": 37410 + }, + { + "epoch": 1.55, + "grad_norm": 0.55859375, + "learning_rate": 0.0004967308174002648, + "loss": 0.2466, + "step": 37420 + }, + { + "epoch": 1.55, + "grad_norm": 2.8125, + "learning_rate": 0.0004967290690229502, + "loss": 0.217, + "step": 37430 + }, + { + "epoch": 1.55, + "grad_norm": 0.65234375, + "learning_rate": 0.0004967273201813182, + "loss": 0.2195, + "step": 37440 + }, + { + "epoch": 1.55, + "grad_norm": 0.41796875, + "learning_rate": 0.0004967255708753719, + "loss": 0.2434, + "step": 37450 + }, + { + "epoch": 1.55, + "grad_norm": 0.70703125, + "learning_rate": 0.0004967238211051148, + "loss": 0.1943, + "step": 37460 + }, + { + "epoch": 1.55, + "grad_norm": 1.0390625, + "learning_rate": 0.0004967220708705501, + "loss": 0.251, + "step": 37470 + }, + { + "epoch": 1.55, + "grad_norm": 0.400390625, + "learning_rate": 0.0004967203201716811, + "loss": 0.2268, + "step": 37480 + }, + { + "epoch": 1.55, + "grad_norm": 0.16796875, + "learning_rate": 0.000496718569008511, + "loss": 0.1789, + "step": 37490 + }, + { + "epoch": 1.55, + "grad_norm": 1.203125, + "learning_rate": 0.0004967168173810435, + "loss": 0.2521, + "step": 37500 + }, + { + "epoch": 1.55, + "grad_norm": 0.72265625, + "learning_rate": 0.0004967150652892814, + "loss": 0.2471, + "step": 37510 + }, + { + "epoch": 1.55, + "grad_norm": 0.296875, + "learning_rate": 0.0004967133127332281, + "loss": 0.206, + "step": 37520 + }, + { + "epoch": 1.55, + "grad_norm": 0.60546875, + "learning_rate": 0.0004967115597128871, + "loss": 0.2749, + "step": 37530 + }, + { + "epoch": 1.55, + "grad_norm": 0.7734375, + "learning_rate": 0.0004967098062282616, + "loss": 0.225, + "step": 37540 + }, + { + "epoch": 1.56, + "grad_norm": 0.7578125, + "learning_rate": 0.0004967080522793548, + "loss": 0.2028, + "step": 37550 + }, + { + "epoch": 1.56, + "grad_norm": 0.1923828125, + "learning_rate": 0.0004967062978661702, + "loss": 0.2156, + "step": 37560 + }, + { + "epoch": 1.56, + "grad_norm": 0.294921875, + "learning_rate": 0.0004967045429887109, + "loss": 0.1863, + "step": 37570 + }, + { + "epoch": 1.56, + "grad_norm": 0.99609375, + "learning_rate": 0.0004967027876469803, + "loss": 0.2772, + "step": 37580 + }, + { + "epoch": 1.56, + "grad_norm": 2.03125, + "learning_rate": 0.0004967010318409817, + "loss": 0.2052, + "step": 37590 + }, + { + "epoch": 1.56, + "grad_norm": 1.2578125, + "learning_rate": 0.0004966992755707184, + "loss": 0.2719, + "step": 37600 + }, + { + "epoch": 1.56, + "grad_norm": 1.6640625, + "learning_rate": 0.0004966975188361938, + "loss": 0.234, + "step": 37610 + }, + { + "epoch": 1.56, + "grad_norm": 0.59375, + "learning_rate": 0.000496695761637411, + "loss": 0.2031, + "step": 37620 + }, + { + "epoch": 1.56, + "grad_norm": 1.2578125, + "learning_rate": 0.0004966940039743735, + "loss": 0.2136, + "step": 37630 + }, + { + "epoch": 1.56, + "grad_norm": 1.2734375, + "learning_rate": 0.0004966922458470844, + "loss": 0.2123, + "step": 37640 + }, + { + "epoch": 1.56, + "grad_norm": 1.3984375, + "learning_rate": 0.0004966904872555473, + "loss": 0.2626, + "step": 37650 + }, + { + "epoch": 1.56, + "grad_norm": 1.4453125, + "learning_rate": 0.0004966887281997651, + "loss": 0.2525, + "step": 37660 + }, + { + "epoch": 1.56, + "grad_norm": 0.462890625, + "learning_rate": 0.0004966869686797415, + "loss": 0.2392, + "step": 37670 + }, + { + "epoch": 1.56, + "grad_norm": 3.375, + "learning_rate": 0.0004966852086954797, + "loss": 0.2299, + "step": 37680 + }, + { + "epoch": 1.56, + "grad_norm": 0.1904296875, + "learning_rate": 0.000496683448246983, + "loss": 0.1954, + "step": 37690 + }, + { + "epoch": 1.56, + "grad_norm": 1.40625, + "learning_rate": 0.0004966816873342546, + "loss": 0.2288, + "step": 37700 + }, + { + "epoch": 1.56, + "grad_norm": 0.74609375, + "learning_rate": 0.0004966799259572979, + "loss": 0.2336, + "step": 37710 + }, + { + "epoch": 1.56, + "grad_norm": 0.8515625, + "learning_rate": 0.0004966781641161163, + "loss": 0.2269, + "step": 37720 + }, + { + "epoch": 1.56, + "grad_norm": 0.62109375, + "learning_rate": 0.000496676401810713, + "loss": 0.2347, + "step": 37730 + }, + { + "epoch": 1.56, + "grad_norm": 0.359375, + "learning_rate": 0.0004966746390410913, + "loss": 0.2399, + "step": 37740 + }, + { + "epoch": 1.56, + "grad_norm": 0.3125, + "learning_rate": 0.0004966728758072546, + "loss": 0.2203, + "step": 37750 + }, + { + "epoch": 1.56, + "grad_norm": 0.48828125, + "learning_rate": 0.0004966711121092062, + "loss": 0.2431, + "step": 37760 + }, + { + "epoch": 1.56, + "grad_norm": 0.48828125, + "learning_rate": 0.0004966693479469495, + "loss": 0.2287, + "step": 37770 + }, + { + "epoch": 1.56, + "grad_norm": 0.765625, + "learning_rate": 0.0004966675833204875, + "loss": 0.2242, + "step": 37780 + }, + { + "epoch": 1.57, + "grad_norm": 1.1796875, + "learning_rate": 0.0004966658182298239, + "loss": 0.2299, + "step": 37790 + }, + { + "epoch": 1.57, + "grad_norm": 0.318359375, + "learning_rate": 0.0004966640526749619, + "loss": 0.2695, + "step": 37800 + }, + { + "epoch": 1.57, + "grad_norm": 0.48046875, + "learning_rate": 0.0004966622866559048, + "loss": 0.2649, + "step": 37810 + }, + { + "epoch": 1.57, + "grad_norm": 0.484375, + "learning_rate": 0.0004966605201726558, + "loss": 0.2424, + "step": 37820 + }, + { + "epoch": 1.57, + "grad_norm": 0.796875, + "learning_rate": 0.0004966587532252184, + "loss": 0.2689, + "step": 37830 + }, + { + "epoch": 1.57, + "grad_norm": 0.515625, + "learning_rate": 0.0004966569858135958, + "loss": 0.1895, + "step": 37840 + }, + { + "epoch": 1.57, + "grad_norm": 1.4453125, + "learning_rate": 0.0004966552179377914, + "loss": 0.2209, + "step": 37850 + }, + { + "epoch": 1.57, + "grad_norm": 0.98828125, + "learning_rate": 0.0004966534495978085, + "loss": 0.2127, + "step": 37860 + }, + { + "epoch": 1.57, + "grad_norm": 0.65234375, + "learning_rate": 0.0004966516807936506, + "loss": 0.1849, + "step": 37870 + }, + { + "epoch": 1.57, + "grad_norm": 1.421875, + "learning_rate": 0.0004966499115253208, + "loss": 0.1644, + "step": 37880 + }, + { + "epoch": 1.57, + "grad_norm": 0.80078125, + "learning_rate": 0.0004966481417928224, + "loss": 0.2558, + "step": 37890 + }, + { + "epoch": 1.57, + "grad_norm": 0.69921875, + "learning_rate": 0.0004966463715961589, + "loss": 0.2328, + "step": 37900 + }, + { + "epoch": 1.57, + "grad_norm": 0.61328125, + "learning_rate": 0.0004966446009353336, + "loss": 0.2219, + "step": 37910 + }, + { + "epoch": 1.57, + "grad_norm": 0.6484375, + "learning_rate": 0.0004966428298103497, + "loss": 0.1814, + "step": 37920 + }, + { + "epoch": 1.57, + "grad_norm": 0.6015625, + "learning_rate": 0.0004966410582212108, + "loss": 0.2724, + "step": 37930 + }, + { + "epoch": 1.57, + "grad_norm": 0.57421875, + "learning_rate": 0.0004966392861679199, + "loss": 0.2275, + "step": 37940 + }, + { + "epoch": 1.57, + "grad_norm": 0.67578125, + "learning_rate": 0.0004966375136504806, + "loss": 0.2155, + "step": 37950 + }, + { + "epoch": 1.57, + "grad_norm": 0.451171875, + "learning_rate": 0.000496635740668896, + "loss": 0.2427, + "step": 37960 + }, + { + "epoch": 1.57, + "grad_norm": 0.451171875, + "learning_rate": 0.0004966339672231697, + "loss": 0.2243, + "step": 37970 + }, + { + "epoch": 1.57, + "grad_norm": 0.81640625, + "learning_rate": 0.0004966321933133049, + "loss": 0.2224, + "step": 37980 + }, + { + "epoch": 1.57, + "grad_norm": 0.74609375, + "learning_rate": 0.0004966304189393049, + "loss": 0.2376, + "step": 37990 + }, + { + "epoch": 1.57, + "grad_norm": 0.365234375, + "learning_rate": 0.000496628644101173, + "loss": 0.2107, + "step": 38000 + }, + { + "epoch": 1.57, + "grad_norm": 0.51953125, + "learning_rate": 0.0004966268687989128, + "loss": 0.2843, + "step": 38010 + }, + { + "epoch": 1.57, + "grad_norm": 0.7578125, + "learning_rate": 0.0004966250930325274, + "loss": 0.2228, + "step": 38020 + }, + { + "epoch": 1.58, + "grad_norm": 0.5859375, + "learning_rate": 0.0004966233168020202, + "loss": 0.2624, + "step": 38030 + }, + { + "epoch": 1.58, + "grad_norm": 0.48046875, + "learning_rate": 0.0004966215401073946, + "loss": 0.2634, + "step": 38040 + }, + { + "epoch": 1.58, + "grad_norm": 0.53125, + "learning_rate": 0.0004966197629486538, + "loss": 0.1956, + "step": 38050 + }, + { + "epoch": 1.58, + "grad_norm": 0.5625, + "learning_rate": 0.0004966179853258013, + "loss": 0.1966, + "step": 38060 + }, + { + "epoch": 1.58, + "grad_norm": 0.3125, + "learning_rate": 0.0004966162072388404, + "loss": 0.1949, + "step": 38070 + }, + { + "epoch": 1.58, + "grad_norm": 0.384765625, + "learning_rate": 0.0004966144286877743, + "loss": 0.1764, + "step": 38080 + }, + { + "epoch": 1.58, + "grad_norm": 0.94921875, + "learning_rate": 0.0004966126496726066, + "loss": 0.2084, + "step": 38090 + }, + { + "epoch": 1.58, + "grad_norm": 0.796875, + "learning_rate": 0.0004966108701933405, + "loss": 0.2283, + "step": 38100 + }, + { + "epoch": 1.58, + "grad_norm": 0.46875, + "learning_rate": 0.0004966090902499793, + "loss": 0.2196, + "step": 38110 + }, + { + "epoch": 1.58, + "grad_norm": 0.37890625, + "learning_rate": 0.0004966073098425266, + "loss": 0.1983, + "step": 38120 + }, + { + "epoch": 1.58, + "grad_norm": 1.171875, + "learning_rate": 0.0004966055289709854, + "loss": 0.2531, + "step": 38130 + }, + { + "epoch": 1.58, + "grad_norm": 0.8046875, + "learning_rate": 0.0004966037476353593, + "loss": 0.2735, + "step": 38140 + }, + { + "epoch": 1.58, + "grad_norm": 0.64453125, + "learning_rate": 0.0004966019658356516, + "loss": 0.208, + "step": 38150 + }, + { + "epoch": 1.58, + "grad_norm": 0.72265625, + "learning_rate": 0.0004966001835718656, + "loss": 0.1757, + "step": 38160 + }, + { + "epoch": 1.58, + "grad_norm": 0.404296875, + "learning_rate": 0.0004965984008440048, + "loss": 0.2421, + "step": 38170 + }, + { + "epoch": 1.58, + "grad_norm": 1.0546875, + "learning_rate": 0.0004965966176520723, + "loss": 0.2151, + "step": 38180 + }, + { + "epoch": 1.58, + "grad_norm": 0.78125, + "learning_rate": 0.0004965948339960716, + "loss": 0.2561, + "step": 38190 + }, + { + "epoch": 1.58, + "grad_norm": 0.625, + "learning_rate": 0.000496593049876006, + "loss": 0.1962, + "step": 38200 + }, + { + "epoch": 1.58, + "grad_norm": 0.5546875, + "learning_rate": 0.0004965912652918791, + "loss": 0.1981, + "step": 38210 + }, + { + "epoch": 1.58, + "grad_norm": 0.95703125, + "learning_rate": 0.0004965894802436939, + "loss": 0.1889, + "step": 38220 + }, + { + "epoch": 1.58, + "grad_norm": 0.625, + "learning_rate": 0.0004965876947314539, + "loss": 0.256, + "step": 38230 + }, + { + "epoch": 1.58, + "grad_norm": 0.55859375, + "learning_rate": 0.0004965859087551626, + "loss": 0.2032, + "step": 38240 + }, + { + "epoch": 1.58, + "grad_norm": 0.30078125, + "learning_rate": 0.0004965841223148233, + "loss": 0.199, + "step": 38250 + }, + { + "epoch": 1.58, + "grad_norm": 0.53125, + "learning_rate": 0.0004965823354104392, + "loss": 0.2076, + "step": 38260 + }, + { + "epoch": 1.59, + "grad_norm": 0.474609375, + "learning_rate": 0.0004965805480420138, + "loss": 0.2183, + "step": 38270 + }, + { + "epoch": 1.59, + "grad_norm": 0.7421875, + "learning_rate": 0.0004965787602095505, + "loss": 0.206, + "step": 38280 + }, + { + "epoch": 1.59, + "grad_norm": 0.796875, + "learning_rate": 0.0004965769719130525, + "loss": 0.2385, + "step": 38290 + }, + { + "epoch": 1.59, + "grad_norm": 1.78125, + "learning_rate": 0.0004965751831525233, + "loss": 0.2316, + "step": 38300 + }, + { + "epoch": 1.59, + "grad_norm": 2.84375, + "learning_rate": 0.0004965733939279662, + "loss": 0.2565, + "step": 38310 + }, + { + "epoch": 1.59, + "grad_norm": 0.37109375, + "learning_rate": 0.0004965716042393847, + "loss": 0.2078, + "step": 38320 + }, + { + "epoch": 1.59, + "grad_norm": 0.77734375, + "learning_rate": 0.000496569814086782, + "loss": 0.2191, + "step": 38330 + }, + { + "epoch": 1.59, + "grad_norm": 0.259765625, + "learning_rate": 0.0004965680234701615, + "loss": 0.2227, + "step": 38340 + }, + { + "epoch": 1.59, + "grad_norm": 0.828125, + "learning_rate": 0.0004965662323895267, + "loss": 0.2208, + "step": 38350 + }, + { + "epoch": 1.59, + "grad_norm": 0.41015625, + "learning_rate": 0.0004965644408448808, + "loss": 0.2335, + "step": 38360 + }, + { + "epoch": 1.59, + "grad_norm": 0.56640625, + "learning_rate": 0.0004965626488362273, + "loss": 0.2146, + "step": 38370 + }, + { + "epoch": 1.59, + "grad_norm": 0.671875, + "learning_rate": 0.0004965608563635695, + "loss": 0.2357, + "step": 38380 + }, + { + "epoch": 1.59, + "grad_norm": 0.498046875, + "learning_rate": 0.0004965590634269108, + "loss": 0.2408, + "step": 38390 + }, + { + "epoch": 1.59, + "grad_norm": 2.234375, + "learning_rate": 0.0004965572700262546, + "loss": 0.2274, + "step": 38400 + }, + { + "epoch": 1.59, + "grad_norm": 0.7578125, + "learning_rate": 0.0004965554761616043, + "loss": 0.2149, + "step": 38410 + }, + { + "epoch": 1.59, + "grad_norm": 0.2216796875, + "learning_rate": 0.000496553681832963, + "loss": 0.2548, + "step": 38420 + }, + { + "epoch": 1.59, + "grad_norm": 0.47265625, + "learning_rate": 0.0004965518870403345, + "loss": 0.2672, + "step": 38430 + }, + { + "epoch": 1.59, + "grad_norm": 1.78125, + "learning_rate": 0.0004965500917837218, + "loss": 0.2331, + "step": 38440 + }, + { + "epoch": 1.59, + "grad_norm": 0.9453125, + "learning_rate": 0.0004965482960631286, + "loss": 0.2134, + "step": 38450 + }, + { + "epoch": 1.59, + "grad_norm": 0.61328125, + "learning_rate": 0.0004965464998785581, + "loss": 0.2122, + "step": 38460 + }, + { + "epoch": 1.59, + "grad_norm": 0.828125, + "learning_rate": 0.0004965447032300136, + "loss": 0.2234, + "step": 38470 + }, + { + "epoch": 1.59, + "grad_norm": 0.8515625, + "learning_rate": 0.0004965429061174987, + "loss": 0.2588, + "step": 38480 + }, + { + "epoch": 1.59, + "grad_norm": 0.375, + "learning_rate": 0.0004965411085410168, + "loss": 0.2769, + "step": 38490 + }, + { + "epoch": 1.59, + "grad_norm": 0.6328125, + "learning_rate": 0.0004965393105005709, + "loss": 0.2516, + "step": 38500 + }, + { + "epoch": 1.6, + "grad_norm": 0.5390625, + "learning_rate": 0.0004965375119961648, + "loss": 0.2536, + "step": 38510 + }, + { + "epoch": 1.6, + "grad_norm": 0.60546875, + "learning_rate": 0.0004965357130278017, + "loss": 0.2514, + "step": 38520 + }, + { + "epoch": 1.6, + "grad_norm": 0.177734375, + "learning_rate": 0.000496533913595485, + "loss": 0.2303, + "step": 38530 + }, + { + "epoch": 1.6, + "grad_norm": 0.73046875, + "learning_rate": 0.0004965321136992182, + "loss": 0.1954, + "step": 38540 + }, + { + "epoch": 1.6, + "grad_norm": 0.55078125, + "learning_rate": 0.0004965303133390044, + "loss": 0.2214, + "step": 38550 + }, + { + "epoch": 1.6, + "grad_norm": 0.87890625, + "learning_rate": 0.0004965285125148473, + "loss": 0.2906, + "step": 38560 + }, + { + "epoch": 1.6, + "grad_norm": 0.66015625, + "learning_rate": 0.0004965267112267501, + "loss": 0.2503, + "step": 38570 + }, + { + "epoch": 1.6, + "grad_norm": 5.09375, + "learning_rate": 0.0004965249094747163, + "loss": 0.2665, + "step": 38580 + }, + { + "epoch": 1.6, + "grad_norm": 0.62890625, + "learning_rate": 0.0004965231072587492, + "loss": 0.1989, + "step": 38590 + }, + { + "epoch": 1.6, + "grad_norm": 0.361328125, + "learning_rate": 0.0004965213045788523, + "loss": 0.2027, + "step": 38600 + }, + { + "epoch": 1.6, + "grad_norm": 0.390625, + "learning_rate": 0.0004965195014350288, + "loss": 0.2034, + "step": 38610 + }, + { + "epoch": 1.6, + "grad_norm": 0.41015625, + "learning_rate": 0.0004965176978272823, + "loss": 0.2163, + "step": 38620 + }, + { + "epoch": 1.6, + "grad_norm": 0.59375, + "learning_rate": 0.0004965158937556162, + "loss": 0.1841, + "step": 38630 + }, + { + "epoch": 1.6, + "grad_norm": 0.48046875, + "learning_rate": 0.0004965140892200336, + "loss": 0.1566, + "step": 38640 + }, + { + "epoch": 1.6, + "grad_norm": 0.6015625, + "learning_rate": 0.0004965122842205382, + "loss": 0.2586, + "step": 38650 + }, + { + "epoch": 1.6, + "grad_norm": 0.330078125, + "learning_rate": 0.0004965104787571334, + "loss": 0.2449, + "step": 38660 + }, + { + "epoch": 1.6, + "grad_norm": 0.62109375, + "learning_rate": 0.0004965086728298223, + "loss": 0.2156, + "step": 38670 + }, + { + "epoch": 1.6, + "grad_norm": 0.8984375, + "learning_rate": 0.0004965068664386088, + "loss": 0.2371, + "step": 38680 + }, + { + "epoch": 1.6, + "grad_norm": 0.55859375, + "learning_rate": 0.0004965050595834957, + "loss": 0.2011, + "step": 38690 + }, + { + "epoch": 1.6, + "grad_norm": 0.80859375, + "learning_rate": 0.0004965032522644869, + "loss": 0.2581, + "step": 38700 + }, + { + "epoch": 1.6, + "grad_norm": 1.03125, + "learning_rate": 0.0004965014444815855, + "loss": 0.2612, + "step": 38710 + }, + { + "epoch": 1.6, + "grad_norm": 0.63671875, + "learning_rate": 0.000496499636234795, + "loss": 0.1947, + "step": 38720 + }, + { + "epoch": 1.6, + "grad_norm": 0.92578125, + "learning_rate": 0.0004964978275241188, + "loss": 0.1904, + "step": 38730 + }, + { + "epoch": 1.6, + "grad_norm": 0.404296875, + "learning_rate": 0.0004964960183495604, + "loss": 0.2257, + "step": 38740 + }, + { + "epoch": 1.61, + "grad_norm": 0.205078125, + "learning_rate": 0.000496494208711123, + "loss": 0.2578, + "step": 38750 + }, + { + "epoch": 1.61, + "grad_norm": 0.734375, + "learning_rate": 0.0004964923986088102, + "loss": 0.2196, + "step": 38760 + }, + { + "epoch": 1.61, + "grad_norm": 0.91015625, + "learning_rate": 0.0004964905880426253, + "loss": 0.2292, + "step": 38770 + }, + { + "epoch": 1.61, + "grad_norm": 0.494140625, + "learning_rate": 0.0004964887770125717, + "loss": 0.2129, + "step": 38780 + }, + { + "epoch": 1.61, + "grad_norm": 2.0, + "learning_rate": 0.0004964869655186529, + "loss": 0.2124, + "step": 38790 + }, + { + "epoch": 1.61, + "grad_norm": 1.609375, + "learning_rate": 0.0004964851535608722, + "loss": 0.2011, + "step": 38800 + }, + { + "epoch": 1.61, + "grad_norm": 0.90625, + "learning_rate": 0.000496483341139233, + "loss": 0.238, + "step": 38810 + }, + { + "epoch": 1.61, + "grad_norm": 0.2109375, + "learning_rate": 0.0004964815282537388, + "loss": 0.2688, + "step": 38820 + }, + { + "epoch": 1.61, + "grad_norm": 0.42578125, + "learning_rate": 0.000496479714904393, + "loss": 0.1965, + "step": 38830 + }, + { + "epoch": 1.61, + "grad_norm": 0.5234375, + "learning_rate": 0.0004964779010911991, + "loss": 0.2136, + "step": 38840 + }, + { + "epoch": 1.61, + "grad_norm": 0.68359375, + "learning_rate": 0.0004964760868141604, + "loss": 0.26, + "step": 38850 + }, + { + "epoch": 1.61, + "grad_norm": 0.76171875, + "learning_rate": 0.0004964742720732803, + "loss": 0.2183, + "step": 38860 + }, + { + "epoch": 1.61, + "grad_norm": 0.640625, + "learning_rate": 0.000496472456868562, + "loss": 0.1965, + "step": 38870 + }, + { + "epoch": 1.61, + "grad_norm": 0.796875, + "learning_rate": 0.0004964706412000094, + "loss": 0.195, + "step": 38880 + }, + { + "epoch": 1.61, + "grad_norm": 0.482421875, + "learning_rate": 0.0004964688250676256, + "loss": 0.2827, + "step": 38890 + }, + { + "epoch": 1.61, + "grad_norm": 0.6875, + "learning_rate": 0.0004964670084714141, + "loss": 0.2533, + "step": 38900 + }, + { + "epoch": 1.61, + "grad_norm": 3.203125, + "learning_rate": 0.0004964651914113783, + "loss": 0.2906, + "step": 38910 + }, + { + "epoch": 1.61, + "grad_norm": 1.546875, + "learning_rate": 0.0004964633738875216, + "loss": 0.1613, + "step": 38920 + }, + { + "epoch": 1.61, + "grad_norm": 0.87109375, + "learning_rate": 0.0004964615558998474, + "loss": 0.2324, + "step": 38930 + }, + { + "epoch": 1.61, + "grad_norm": 0.2451171875, + "learning_rate": 0.0004964597374483593, + "loss": 0.1969, + "step": 38940 + }, + { + "epoch": 1.61, + "grad_norm": 0.71484375, + "learning_rate": 0.0004964579185330605, + "loss": 0.2234, + "step": 38950 + }, + { + "epoch": 1.61, + "grad_norm": 0.51953125, + "learning_rate": 0.0004964560991539546, + "loss": 0.2566, + "step": 38960 + }, + { + "epoch": 1.61, + "grad_norm": 0.51171875, + "learning_rate": 0.0004964542793110449, + "loss": 0.196, + "step": 38970 + }, + { + "epoch": 1.61, + "grad_norm": 0.50390625, + "learning_rate": 0.0004964524590043347, + "loss": 0.1914, + "step": 38980 + }, + { + "epoch": 1.61, + "grad_norm": 0.71875, + "learning_rate": 0.0004964506382338278, + "loss": 0.2505, + "step": 38990 + }, + { + "epoch": 1.62, + "grad_norm": 0.734375, + "learning_rate": 0.0004964488169995274, + "loss": 0.2008, + "step": 39000 + }, + { + "epoch": 1.62, + "grad_norm": 0.65625, + "learning_rate": 0.0004964469953014369, + "loss": 0.2866, + "step": 39010 + }, + { + "epoch": 1.62, + "grad_norm": 0.60546875, + "learning_rate": 0.0004964451731395597, + "loss": 0.213, + "step": 39020 + }, + { + "epoch": 1.62, + "grad_norm": 0.3671875, + "learning_rate": 0.0004964433505138993, + "loss": 0.2161, + "step": 39030 + }, + { + "epoch": 1.62, + "grad_norm": 0.66796875, + "learning_rate": 0.0004964415274244592, + "loss": 0.2642, + "step": 39040 + }, + { + "epoch": 1.62, + "grad_norm": 0.9453125, + "learning_rate": 0.0004964397038712428, + "loss": 0.2482, + "step": 39050 + }, + { + "epoch": 1.62, + "grad_norm": 0.65625, + "learning_rate": 0.0004964378798542533, + "loss": 0.2686, + "step": 39060 + }, + { + "epoch": 1.62, + "grad_norm": 1.2734375, + "learning_rate": 0.0004964360553734945, + "loss": 0.1995, + "step": 39070 + }, + { + "epoch": 1.62, + "grad_norm": 0.89453125, + "learning_rate": 0.0004964342304289697, + "loss": 0.2429, + "step": 39080 + }, + { + "epoch": 1.62, + "grad_norm": 0.48046875, + "learning_rate": 0.0004964324050206821, + "loss": 0.2909, + "step": 39090 + }, + { + "epoch": 1.62, + "grad_norm": 0.2578125, + "learning_rate": 0.0004964305791486355, + "loss": 0.2243, + "step": 39100 + }, + { + "epoch": 1.62, + "grad_norm": 0.875, + "learning_rate": 0.0004964287528128329, + "loss": 0.2537, + "step": 39110 + }, + { + "epoch": 1.62, + "grad_norm": 0.37890625, + "learning_rate": 0.0004964269260132782, + "loss": 0.2358, + "step": 39120 + }, + { + "epoch": 1.62, + "grad_norm": 0.40625, + "learning_rate": 0.0004964250987499747, + "loss": 0.2647, + "step": 39130 + }, + { + "epoch": 1.62, + "grad_norm": 0.73828125, + "learning_rate": 0.0004964232710229256, + "loss": 0.2449, + "step": 39140 + }, + { + "epoch": 1.62, + "grad_norm": 0.65625, + "learning_rate": 0.0004964214428321347, + "loss": 0.2553, + "step": 39150 + }, + { + "epoch": 1.62, + "grad_norm": 0.53125, + "learning_rate": 0.0004964196141776052, + "loss": 0.2214, + "step": 39160 + }, + { + "epoch": 1.62, + "grad_norm": 0.59765625, + "learning_rate": 0.0004964177850593405, + "loss": 0.187, + "step": 39170 + }, + { + "epoch": 1.62, + "grad_norm": 0.240234375, + "learning_rate": 0.0004964159554773442, + "loss": 0.2115, + "step": 39180 + }, + { + "epoch": 1.62, + "grad_norm": 1.1640625, + "learning_rate": 0.0004964141254316197, + "loss": 0.2459, + "step": 39190 + }, + { + "epoch": 1.62, + "grad_norm": 0.66015625, + "learning_rate": 0.0004964122949221705, + "loss": 0.2429, + "step": 39200 + }, + { + "epoch": 1.62, + "grad_norm": 0.48046875, + "learning_rate": 0.0004964104639489998, + "loss": 0.1995, + "step": 39210 + }, + { + "epoch": 1.62, + "grad_norm": 1.03125, + "learning_rate": 0.0004964086325121113, + "loss": 0.2204, + "step": 39220 + }, + { + "epoch": 1.62, + "grad_norm": 0.5703125, + "learning_rate": 0.0004964068006115083, + "loss": 0.2407, + "step": 39230 + }, + { + "epoch": 1.63, + "grad_norm": 0.3671875, + "learning_rate": 0.0004964049682471944, + "loss": 0.2388, + "step": 39240 + }, + { + "epoch": 1.63, + "grad_norm": 0.259765625, + "learning_rate": 0.0004964031354191729, + "loss": 0.1868, + "step": 39250 + }, + { + "epoch": 1.63, + "grad_norm": 0.76953125, + "learning_rate": 0.0004964013021274473, + "loss": 0.2624, + "step": 39260 + }, + { + "epoch": 1.63, + "grad_norm": 0.87109375, + "learning_rate": 0.0004963994683720212, + "loss": 0.1746, + "step": 39270 + }, + { + "epoch": 1.63, + "grad_norm": 0.310546875, + "learning_rate": 0.0004963976341528978, + "loss": 0.1977, + "step": 39280 + }, + { + "epoch": 1.63, + "grad_norm": 1.4296875, + "learning_rate": 0.0004963957994700806, + "loss": 0.2068, + "step": 39290 + }, + { + "epoch": 1.63, + "grad_norm": 0.609375, + "learning_rate": 0.0004963939643235732, + "loss": 0.2348, + "step": 39300 + }, + { + "epoch": 1.63, + "grad_norm": 1.4609375, + "learning_rate": 0.0004963921287133789, + "loss": 0.2653, + "step": 39310 + }, + { + "epoch": 1.63, + "grad_norm": 2.0625, + "learning_rate": 0.0004963902926395013, + "loss": 0.1923, + "step": 39320 + }, + { + "epoch": 1.63, + "grad_norm": 2.890625, + "learning_rate": 0.0004963884561019438, + "loss": 0.2815, + "step": 39330 + }, + { + "epoch": 1.63, + "grad_norm": 0.35546875, + "learning_rate": 0.0004963866191007099, + "loss": 0.2199, + "step": 39340 + }, + { + "epoch": 1.63, + "grad_norm": 0.89453125, + "learning_rate": 0.0004963847816358027, + "loss": 0.2558, + "step": 39350 + }, + { + "epoch": 1.63, + "grad_norm": 0.58984375, + "learning_rate": 0.0004963829437072262, + "loss": 0.236, + "step": 39360 + }, + { + "epoch": 1.63, + "grad_norm": 0.9453125, + "learning_rate": 0.0004963811053149835, + "loss": 0.2675, + "step": 39370 + }, + { + "epoch": 1.63, + "grad_norm": 1.9921875, + "learning_rate": 0.0004963792664590781, + "loss": 0.2238, + "step": 39380 + }, + { + "epoch": 1.63, + "grad_norm": 2.125, + "learning_rate": 0.0004963774271395137, + "loss": 0.2198, + "step": 39390 + }, + { + "epoch": 1.63, + "grad_norm": 0.84765625, + "learning_rate": 0.0004963755873562936, + "loss": 0.2511, + "step": 39400 + }, + { + "epoch": 1.63, + "grad_norm": 0.73046875, + "learning_rate": 0.000496373747109421, + "loss": 0.2361, + "step": 39410 + }, + { + "epoch": 1.63, + "grad_norm": 0.9453125, + "learning_rate": 0.0004963719063988998, + "loss": 0.2062, + "step": 39420 + }, + { + "epoch": 1.63, + "grad_norm": 0.2373046875, + "learning_rate": 0.0004963700652247333, + "loss": 0.242, + "step": 39430 + }, + { + "epoch": 1.63, + "grad_norm": 1.4140625, + "learning_rate": 0.0004963682235869249, + "loss": 0.2005, + "step": 39440 + }, + { + "epoch": 1.63, + "grad_norm": 1.640625, + "learning_rate": 0.0004963663814854781, + "loss": 0.2058, + "step": 39450 + }, + { + "epoch": 1.63, + "grad_norm": 0.97265625, + "learning_rate": 0.0004963645389203964, + "loss": 0.2506, + "step": 39460 + }, + { + "epoch": 1.63, + "grad_norm": 0.515625, + "learning_rate": 0.0004963626958916832, + "loss": 0.2263, + "step": 39470 + }, + { + "epoch": 1.64, + "grad_norm": 0.59765625, + "learning_rate": 0.000496360852399342, + "loss": 0.2359, + "step": 39480 + }, + { + "epoch": 1.64, + "grad_norm": 1.0703125, + "learning_rate": 0.0004963590084433764, + "loss": 0.2448, + "step": 39490 + }, + { + "epoch": 1.64, + "grad_norm": 0.94921875, + "learning_rate": 0.0004963571640237896, + "loss": 0.256, + "step": 39500 + }, + { + "epoch": 1.64, + "grad_norm": 0.37109375, + "learning_rate": 0.0004963553191405852, + "loss": 0.26, + "step": 39510 + }, + { + "epoch": 1.64, + "grad_norm": 0.31640625, + "learning_rate": 0.0004963534737937669, + "loss": 0.2365, + "step": 39520 + }, + { + "epoch": 1.64, + "grad_norm": 0.59375, + "learning_rate": 0.0004963516279833379, + "loss": 0.2344, + "step": 39530 + }, + { + "epoch": 1.64, + "grad_norm": 0.5859375, + "learning_rate": 0.0004963497817093016, + "loss": 0.2077, + "step": 39540 + }, + { + "epoch": 1.64, + "grad_norm": 0.7265625, + "learning_rate": 0.0004963479349716617, + "loss": 0.2021, + "step": 39550 + }, + { + "epoch": 1.64, + "grad_norm": 0.5625, + "learning_rate": 0.0004963460877704215, + "loss": 0.2187, + "step": 39560 + }, + { + "epoch": 1.64, + "grad_norm": 0.78125, + "learning_rate": 0.0004963442401055847, + "loss": 0.2754, + "step": 39570 + }, + { + "epoch": 1.64, + "grad_norm": 0.4296875, + "learning_rate": 0.0004963423919771546, + "loss": 0.2127, + "step": 39580 + }, + { + "epoch": 1.64, + "grad_norm": 0.435546875, + "learning_rate": 0.0004963405433851348, + "loss": 0.2511, + "step": 39590 + }, + { + "epoch": 1.64, + "grad_norm": 0.62890625, + "learning_rate": 0.0004963386943295286, + "loss": 0.2444, + "step": 39600 + }, + { + "epoch": 1.64, + "grad_norm": 0.4453125, + "learning_rate": 0.0004963368448103396, + "loss": 0.2241, + "step": 39610 + }, + { + "epoch": 1.64, + "grad_norm": 0.5234375, + "learning_rate": 0.0004963349948275713, + "loss": 0.2285, + "step": 39620 + }, + { + "epoch": 1.64, + "grad_norm": 1.0390625, + "learning_rate": 0.000496333144381227, + "loss": 0.2366, + "step": 39630 + }, + { + "epoch": 1.64, + "grad_norm": 0.427734375, + "learning_rate": 0.0004963312934713104, + "loss": 0.2541, + "step": 39640 + }, + { + "epoch": 1.64, + "grad_norm": 0.76171875, + "learning_rate": 0.000496329442097825, + "loss": 0.2934, + "step": 39650 + }, + { + "epoch": 1.64, + "grad_norm": 0.84765625, + "learning_rate": 0.0004963275902607741, + "loss": 0.1902, + "step": 39660 + }, + { + "epoch": 1.64, + "grad_norm": 0.63671875, + "learning_rate": 0.0004963257379601613, + "loss": 0.2151, + "step": 39670 + }, + { + "epoch": 1.64, + "grad_norm": 0.291015625, + "learning_rate": 0.00049632388519599, + "loss": 0.2226, + "step": 39680 + }, + { + "epoch": 1.64, + "grad_norm": 0.67578125, + "learning_rate": 0.0004963220319682639, + "loss": 0.1651, + "step": 39690 + }, + { + "epoch": 1.64, + "grad_norm": 0.6015625, + "learning_rate": 0.0004963201782769862, + "loss": 0.1702, + "step": 39700 + }, + { + "epoch": 1.64, + "grad_norm": 0.54296875, + "learning_rate": 0.0004963183241221606, + "loss": 0.2205, + "step": 39710 + }, + { + "epoch": 1.65, + "grad_norm": 0.734375, + "learning_rate": 0.0004963164695037905, + "loss": 0.2104, + "step": 39720 + }, + { + "epoch": 1.65, + "grad_norm": 0.53515625, + "learning_rate": 0.0004963146144218793, + "loss": 0.2174, + "step": 39730 + }, + { + "epoch": 1.65, + "grad_norm": 1.0546875, + "learning_rate": 0.0004963127588764307, + "loss": 0.2069, + "step": 39740 + }, + { + "epoch": 1.65, + "grad_norm": 1.078125, + "learning_rate": 0.0004963109028674481, + "loss": 0.2056, + "step": 39750 + }, + { + "epoch": 1.65, + "grad_norm": 3.953125, + "learning_rate": 0.000496309046394935, + "loss": 0.2357, + "step": 39760 + }, + { + "epoch": 1.65, + "grad_norm": 0.42578125, + "learning_rate": 0.0004963071894588948, + "loss": 0.2084, + "step": 39770 + }, + { + "epoch": 1.65, + "grad_norm": 0.474609375, + "learning_rate": 0.000496305332059331, + "loss": 0.2074, + "step": 39780 + }, + { + "epoch": 1.65, + "grad_norm": 0.6640625, + "learning_rate": 0.0004963034741962473, + "loss": 0.1605, + "step": 39790 + }, + { + "epoch": 1.65, + "grad_norm": 0.486328125, + "learning_rate": 0.000496301615869647, + "loss": 0.2223, + "step": 39800 + }, + { + "epoch": 1.65, + "grad_norm": 0.95703125, + "learning_rate": 0.0004962997570795337, + "loss": 0.2065, + "step": 39810 + }, + { + "epoch": 1.65, + "grad_norm": 0.6875, + "learning_rate": 0.0004962978978259109, + "loss": 0.2912, + "step": 39820 + }, + { + "epoch": 1.65, + "grad_norm": 0.5078125, + "learning_rate": 0.000496296038108782, + "loss": 0.2187, + "step": 39830 + }, + { + "epoch": 1.65, + "grad_norm": 1.0546875, + "learning_rate": 0.0004962941779281505, + "loss": 0.2004, + "step": 39840 + }, + { + "epoch": 1.65, + "grad_norm": 0.9375, + "learning_rate": 0.00049629231728402, + "loss": 0.1986, + "step": 39850 + }, + { + "epoch": 1.65, + "grad_norm": 1.3828125, + "learning_rate": 0.0004962904561763939, + "loss": 0.2279, + "step": 39860 + }, + { + "epoch": 1.65, + "grad_norm": 0.57421875, + "learning_rate": 0.0004962885946052758, + "loss": 0.2008, + "step": 39870 + }, + { + "epoch": 1.65, + "grad_norm": 1.671875, + "learning_rate": 0.0004962867325706692, + "loss": 0.2705, + "step": 39880 + }, + { + "epoch": 1.65, + "grad_norm": 0.6484375, + "learning_rate": 0.0004962848700725775, + "loss": 0.2342, + "step": 39890 + }, + { + "epoch": 1.65, + "grad_norm": 0.63671875, + "learning_rate": 0.0004962830071110044, + "loss": 0.276, + "step": 39900 + }, + { + "epoch": 1.65, + "grad_norm": 0.63671875, + "learning_rate": 0.0004962811436859531, + "loss": 0.2465, + "step": 39910 + }, + { + "epoch": 1.65, + "grad_norm": 0.400390625, + "learning_rate": 0.0004962792797974274, + "loss": 0.2014, + "step": 39920 + }, + { + "epoch": 1.65, + "grad_norm": 0.328125, + "learning_rate": 0.0004962774154454306, + "loss": 0.2205, + "step": 39930 + }, + { + "epoch": 1.65, + "grad_norm": 0.2392578125, + "learning_rate": 0.0004962755506299664, + "loss": 0.2614, + "step": 39940 + }, + { + "epoch": 1.65, + "grad_norm": 0.365234375, + "learning_rate": 0.0004962736853510382, + "loss": 0.2035, + "step": 39950 + }, + { + "epoch": 1.66, + "grad_norm": 1.21875, + "learning_rate": 0.0004962718196086495, + "loss": 0.2384, + "step": 39960 + }, + { + "epoch": 1.66, + "grad_norm": 1.2265625, + "learning_rate": 0.0004962699534028038, + "loss": 0.2214, + "step": 39970 + }, + { + "epoch": 1.66, + "grad_norm": 1.4296875, + "learning_rate": 0.0004962680867335046, + "loss": 0.1827, + "step": 39980 + }, + { + "epoch": 1.66, + "grad_norm": 0.63671875, + "learning_rate": 0.0004962662196007555, + "loss": 0.1924, + "step": 39990 + }, + { + "epoch": 1.66, + "grad_norm": 0.2890625, + "learning_rate": 0.00049626435200456, + "loss": 0.2233, + "step": 40000 + }, + { + "epoch": 1.66, + "grad_norm": 0.62890625, + "learning_rate": 0.0004962624839449216, + "loss": 0.2202, + "step": 40010 + }, + { + "epoch": 1.66, + "grad_norm": 0.498046875, + "learning_rate": 0.0004962606154218438, + "loss": 0.2659, + "step": 40020 + }, + { + "epoch": 1.66, + "grad_norm": 0.77734375, + "learning_rate": 0.0004962587464353301, + "loss": 0.2327, + "step": 40030 + }, + { + "epoch": 1.66, + "grad_norm": 0.83984375, + "learning_rate": 0.000496256876985384, + "loss": 0.2687, + "step": 40040 + }, + { + "epoch": 1.66, + "grad_norm": 1.0625, + "learning_rate": 0.000496255007072009, + "loss": 0.2227, + "step": 40050 + }, + { + "epoch": 1.66, + "grad_norm": 0.427734375, + "learning_rate": 0.0004962531366952087, + "loss": 0.2305, + "step": 40060 + }, + { + "epoch": 1.66, + "grad_norm": 0.72265625, + "learning_rate": 0.0004962512658549868, + "loss": 0.1808, + "step": 40070 + }, + { + "epoch": 1.66, + "grad_norm": 0.7265625, + "learning_rate": 0.0004962493945513463, + "loss": 0.242, + "step": 40080 + }, + { + "epoch": 1.66, + "grad_norm": 0.6640625, + "learning_rate": 0.0004962475227842912, + "loss": 0.2242, + "step": 40090 + }, + { + "epoch": 1.66, + "grad_norm": 0.294921875, + "learning_rate": 0.0004962456505538248, + "loss": 0.1929, + "step": 40100 + }, + { + "epoch": 1.66, + "grad_norm": 0.298828125, + "learning_rate": 0.0004962437778599508, + "loss": 0.2266, + "step": 40110 + }, + { + "epoch": 1.66, + "grad_norm": 0.54296875, + "learning_rate": 0.0004962419047026724, + "loss": 0.2184, + "step": 40120 + }, + { + "epoch": 1.66, + "grad_norm": 0.5546875, + "learning_rate": 0.0004962400310819934, + "loss": 0.187, + "step": 40130 + }, + { + "epoch": 1.66, + "grad_norm": 1.28125, + "learning_rate": 0.0004962381569979174, + "loss": 0.2801, + "step": 40140 + }, + { + "epoch": 1.66, + "grad_norm": 0.8046875, + "learning_rate": 0.0004962362824504476, + "loss": 0.3076, + "step": 40150 + }, + { + "epoch": 1.66, + "grad_norm": 0.8671875, + "learning_rate": 0.0004962344074395878, + "loss": 0.1774, + "step": 40160 + }, + { + "epoch": 1.66, + "grad_norm": 0.59765625, + "learning_rate": 0.0004962325319653413, + "loss": 0.2228, + "step": 40170 + }, + { + "epoch": 1.66, + "grad_norm": 2.4375, + "learning_rate": 0.0004962306560277119, + "loss": 0.2416, + "step": 40180 + }, + { + "epoch": 1.66, + "grad_norm": 0.69921875, + "learning_rate": 0.000496228779626703, + "loss": 0.1342, + "step": 40190 + }, + { + "epoch": 1.67, + "grad_norm": 0.70703125, + "learning_rate": 0.000496226902762318, + "loss": 0.2033, + "step": 40200 + }, + { + "epoch": 1.67, + "grad_norm": 0.74609375, + "learning_rate": 0.0004962250254345608, + "loss": 0.1869, + "step": 40210 + }, + { + "epoch": 1.67, + "grad_norm": 0.63671875, + "learning_rate": 0.0004962231476434345, + "loss": 0.1955, + "step": 40220 + }, + { + "epoch": 1.67, + "grad_norm": 0.326171875, + "learning_rate": 0.0004962212693889428, + "loss": 0.2172, + "step": 40230 + }, + { + "epoch": 1.67, + "grad_norm": 0.61328125, + "learning_rate": 0.0004962193906710894, + "loss": 0.2422, + "step": 40240 + }, + { + "epoch": 1.67, + "grad_norm": 0.392578125, + "learning_rate": 0.0004962175114898777, + "loss": 0.2461, + "step": 40250 + }, + { + "epoch": 1.67, + "grad_norm": 0.91796875, + "learning_rate": 0.0004962156318453111, + "loss": 0.267, + "step": 40260 + }, + { + "epoch": 1.67, + "grad_norm": 0.255859375, + "learning_rate": 0.0004962137517373934, + "loss": 0.2113, + "step": 40270 + }, + { + "epoch": 1.67, + "grad_norm": 1.3359375, + "learning_rate": 0.0004962118711661278, + "loss": 0.2289, + "step": 40280 + }, + { + "epoch": 1.67, + "grad_norm": 1.421875, + "learning_rate": 0.0004962099901315183, + "loss": 0.2154, + "step": 40290 + }, + { + "epoch": 1.67, + "grad_norm": 0.83203125, + "learning_rate": 0.000496208108633568, + "loss": 0.2358, + "step": 40300 + }, + { + "epoch": 1.67, + "grad_norm": 0.546875, + "learning_rate": 0.0004962062266722808, + "loss": 0.2707, + "step": 40310 + }, + { + "epoch": 1.67, + "grad_norm": 0.671875, + "learning_rate": 0.0004962043442476599, + "loss": 0.2025, + "step": 40320 + }, + { + "epoch": 1.67, + "grad_norm": 0.484375, + "learning_rate": 0.0004962024613597091, + "loss": 0.189, + "step": 40330 + }, + { + "epoch": 1.67, + "grad_norm": 0.765625, + "learning_rate": 0.0004962005780084318, + "loss": 0.1713, + "step": 40340 + }, + { + "epoch": 1.67, + "grad_norm": 2.09375, + "learning_rate": 0.0004961986941938316, + "loss": 0.2291, + "step": 40350 + }, + { + "epoch": 1.67, + "grad_norm": 0.498046875, + "learning_rate": 0.0004961968099159122, + "loss": 0.1799, + "step": 40360 + }, + { + "epoch": 1.67, + "grad_norm": 0.6328125, + "learning_rate": 0.0004961949251746768, + "loss": 0.2221, + "step": 40370 + }, + { + "epoch": 1.67, + "grad_norm": 0.458984375, + "learning_rate": 0.0004961930399701292, + "loss": 0.1898, + "step": 40380 + }, + { + "epoch": 1.67, + "grad_norm": 0.359375, + "learning_rate": 0.0004961911543022729, + "loss": 0.2391, + "step": 40390 + }, + { + "epoch": 1.67, + "grad_norm": 0.357421875, + "learning_rate": 0.0004961892681711115, + "loss": 0.2079, + "step": 40400 + }, + { + "epoch": 1.67, + "grad_norm": 0.73046875, + "learning_rate": 0.0004961873815766484, + "loss": 0.2173, + "step": 40410 + }, + { + "epoch": 1.67, + "grad_norm": 0.671875, + "learning_rate": 0.0004961854945188872, + "loss": 0.2676, + "step": 40420 + }, + { + "epoch": 1.67, + "grad_norm": 0.87890625, + "learning_rate": 0.0004961836069978316, + "loss": 0.2334, + "step": 40430 + }, + { + "epoch": 1.68, + "grad_norm": 0.98828125, + "learning_rate": 0.000496181719013485, + "loss": 0.2003, + "step": 40440 + }, + { + "epoch": 1.68, + "grad_norm": 0.60546875, + "learning_rate": 0.0004961798305658509, + "loss": 0.229, + "step": 40450 + }, + { + "epoch": 1.68, + "grad_norm": 0.58203125, + "learning_rate": 0.0004961779416549331, + "loss": 0.2484, + "step": 40460 + }, + { + "epoch": 1.68, + "grad_norm": 0.58203125, + "learning_rate": 0.0004961760522807348, + "loss": 0.2288, + "step": 40470 + }, + { + "epoch": 1.68, + "grad_norm": 0.53515625, + "learning_rate": 0.0004961741624432598, + "loss": 0.2432, + "step": 40480 + }, + { + "epoch": 1.68, + "grad_norm": 1.5625, + "learning_rate": 0.0004961722721425117, + "loss": 0.2313, + "step": 40490 + }, + { + "epoch": 1.68, + "grad_norm": 5.21875, + "learning_rate": 0.000496170381378494, + "loss": 0.2333, + "step": 40500 + }, + { + "epoch": 1.68, + "grad_norm": 0.86328125, + "learning_rate": 0.0004961684901512102, + "loss": 0.1828, + "step": 40510 + }, + { + "epoch": 1.68, + "grad_norm": 0.78515625, + "learning_rate": 0.0004961665984606638, + "loss": 0.2367, + "step": 40520 + }, + { + "epoch": 1.68, + "grad_norm": 0.2734375, + "learning_rate": 0.0004961647063068585, + "loss": 0.2041, + "step": 40530 + }, + { + "epoch": 1.68, + "grad_norm": 0.310546875, + "learning_rate": 0.0004961628136897978, + "loss": 0.2168, + "step": 40540 + }, + { + "epoch": 1.68, + "grad_norm": 0.458984375, + "learning_rate": 0.0004961609206094853, + "loss": 0.2084, + "step": 40550 + }, + { + "epoch": 1.68, + "grad_norm": 0.62890625, + "learning_rate": 0.0004961590270659244, + "loss": 0.2431, + "step": 40560 + }, + { + "epoch": 1.68, + "grad_norm": 0.5234375, + "learning_rate": 0.000496157133059119, + "loss": 0.2277, + "step": 40570 + }, + { + "epoch": 1.68, + "grad_norm": 0.890625, + "learning_rate": 0.0004961552385890723, + "loss": 0.2112, + "step": 40580 + }, + { + "epoch": 1.68, + "grad_norm": 0.84765625, + "learning_rate": 0.000496153343655788, + "loss": 0.2303, + "step": 40590 + }, + { + "epoch": 1.68, + "grad_norm": 0.55078125, + "learning_rate": 0.0004961514482592699, + "loss": 0.2913, + "step": 40600 + }, + { + "epoch": 1.68, + "grad_norm": 0.58984375, + "learning_rate": 0.0004961495523995211, + "loss": 0.251, + "step": 40610 + }, + { + "epoch": 1.68, + "grad_norm": 0.6796875, + "learning_rate": 0.0004961476560765456, + "loss": 0.2111, + "step": 40620 + }, + { + "epoch": 1.68, + "grad_norm": 2.3125, + "learning_rate": 0.0004961457592903467, + "loss": 0.2221, + "step": 40630 + }, + { + "epoch": 1.68, + "grad_norm": 0.703125, + "learning_rate": 0.0004961438620409281, + "loss": 0.2591, + "step": 40640 + }, + { + "epoch": 1.68, + "grad_norm": 0.84765625, + "learning_rate": 0.0004961419643282933, + "loss": 0.2459, + "step": 40650 + }, + { + "epoch": 1.68, + "grad_norm": 0.53515625, + "learning_rate": 0.000496140066152446, + "loss": 0.2709, + "step": 40660 + }, + { + "epoch": 1.68, + "grad_norm": 1.1328125, + "learning_rate": 0.0004961381675133896, + "loss": 0.2244, + "step": 40670 + }, + { + "epoch": 1.68, + "grad_norm": 0.83984375, + "learning_rate": 0.0004961362684111277, + "loss": 0.2174, + "step": 40680 + }, + { + "epoch": 1.69, + "grad_norm": 0.353515625, + "learning_rate": 0.000496134368845664, + "loss": 0.2403, + "step": 40690 + }, + { + "epoch": 1.69, + "grad_norm": 0.37890625, + "learning_rate": 0.0004961324688170021, + "loss": 0.1625, + "step": 40700 + }, + { + "epoch": 1.69, + "grad_norm": 0.71875, + "learning_rate": 0.0004961305683251452, + "loss": 0.1845, + "step": 40710 + }, + { + "epoch": 1.69, + "grad_norm": 2.28125, + "learning_rate": 0.0004961286673700974, + "loss": 0.1744, + "step": 40720 + }, + { + "epoch": 1.69, + "grad_norm": 0.8125, + "learning_rate": 0.0004961267659518619, + "loss": 0.2237, + "step": 40730 + }, + { + "epoch": 1.69, + "grad_norm": 0.490234375, + "learning_rate": 0.0004961248640704424, + "loss": 0.2316, + "step": 40740 + }, + { + "epoch": 1.69, + "grad_norm": 0.8359375, + "learning_rate": 0.0004961229617258426, + "loss": 0.2557, + "step": 40750 + }, + { + "epoch": 1.69, + "grad_norm": 0.8984375, + "learning_rate": 0.0004961210589180658, + "loss": 0.2861, + "step": 40760 + }, + { + "epoch": 1.69, + "grad_norm": 1.0546875, + "learning_rate": 0.0004961191556471159, + "loss": 0.1975, + "step": 40770 + }, + { + "epoch": 1.69, + "grad_norm": 1.5703125, + "learning_rate": 0.0004961172519129962, + "loss": 0.2069, + "step": 40780 + }, + { + "epoch": 1.69, + "grad_norm": 0.90625, + "learning_rate": 0.0004961153477157105, + "loss": 0.2684, + "step": 40790 + }, + { + "epoch": 1.69, + "grad_norm": 0.1484375, + "learning_rate": 0.0004961134430552622, + "loss": 0.2332, + "step": 40800 + }, + { + "epoch": 1.69, + "grad_norm": 0.796875, + "learning_rate": 0.0004961115379316551, + "loss": 0.2107, + "step": 40810 + }, + { + "epoch": 1.69, + "grad_norm": 0.71875, + "learning_rate": 0.0004961096323448927, + "loss": 0.1992, + "step": 40820 + }, + { + "epoch": 1.69, + "grad_norm": 2.65625, + "learning_rate": 0.0004961077262949784, + "loss": 0.2467, + "step": 40830 + }, + { + "epoch": 1.69, + "grad_norm": 0.86328125, + "learning_rate": 0.000496105819781916, + "loss": 0.2608, + "step": 40840 + }, + { + "epoch": 1.69, + "grad_norm": 0.6171875, + "learning_rate": 0.0004961039128057091, + "loss": 0.2217, + "step": 40850 + }, + { + "epoch": 1.69, + "grad_norm": 0.54296875, + "learning_rate": 0.0004961020053663611, + "loss": 0.2174, + "step": 40860 + }, + { + "epoch": 1.69, + "grad_norm": 0.380859375, + "learning_rate": 0.0004961000974638757, + "loss": 0.1511, + "step": 40870 + }, + { + "epoch": 1.69, + "grad_norm": 0.6953125, + "learning_rate": 0.0004960981890982565, + "loss": 0.2247, + "step": 40880 + }, + { + "epoch": 1.69, + "grad_norm": 0.55078125, + "learning_rate": 0.0004960962802695072, + "loss": 0.269, + "step": 40890 + }, + { + "epoch": 1.69, + "grad_norm": 0.69140625, + "learning_rate": 0.0004960943709776311, + "loss": 0.2349, + "step": 40900 + }, + { + "epoch": 1.69, + "grad_norm": 1.296875, + "learning_rate": 0.0004960924612226321, + "loss": 0.2036, + "step": 40910 + }, + { + "epoch": 1.69, + "grad_norm": 0.427734375, + "learning_rate": 0.0004960905510045136, + "loss": 0.1978, + "step": 40920 + }, + { + "epoch": 1.7, + "grad_norm": 0.50390625, + "learning_rate": 0.0004960886403232793, + "loss": 0.2312, + "step": 40930 + }, + { + "epoch": 1.7, + "grad_norm": 2.078125, + "learning_rate": 0.0004960867291789327, + "loss": 0.2576, + "step": 40940 + }, + { + "epoch": 1.7, + "grad_norm": 0.59765625, + "learning_rate": 0.0004960848175714775, + "loss": 0.2704, + "step": 40950 + }, + { + "epoch": 1.7, + "grad_norm": 0.2294921875, + "learning_rate": 0.0004960829055009173, + "loss": 0.2106, + "step": 40960 + }, + { + "epoch": 1.7, + "grad_norm": 1.1015625, + "learning_rate": 0.0004960809929672555, + "loss": 0.1571, + "step": 40970 + }, + { + "epoch": 1.7, + "grad_norm": 0.74609375, + "learning_rate": 0.0004960790799704959, + "loss": 0.234, + "step": 40980 + }, + { + "epoch": 1.7, + "grad_norm": 0.474609375, + "learning_rate": 0.0004960771665106421, + "loss": 0.2104, + "step": 40990 + }, + { + "epoch": 1.7, + "grad_norm": 0.5234375, + "learning_rate": 0.0004960752525876976, + "loss": 0.2817, + "step": 41000 + }, + { + "epoch": 1.7, + "grad_norm": 0.78515625, + "learning_rate": 0.0004960733382016661, + "loss": 0.181, + "step": 41010 + }, + { + "epoch": 1.7, + "grad_norm": 0.81640625, + "learning_rate": 0.0004960714233525512, + "loss": 0.2442, + "step": 41020 + }, + { + "epoch": 1.7, + "grad_norm": 2.125, + "learning_rate": 0.0004960695080403563, + "loss": 0.2439, + "step": 41030 + }, + { + "epoch": 1.7, + "grad_norm": 0.640625, + "learning_rate": 0.0004960675922650853, + "loss": 0.2108, + "step": 41040 + }, + { + "epoch": 1.7, + "grad_norm": 1.3125, + "learning_rate": 0.0004960656760267416, + "loss": 0.2781, + "step": 41050 + }, + { + "epoch": 1.7, + "grad_norm": 0.640625, + "learning_rate": 0.0004960637593253288, + "loss": 0.2701, + "step": 41060 + }, + { + "epoch": 1.7, + "grad_norm": 1.6875, + "learning_rate": 0.0004960618421608506, + "loss": 0.2248, + "step": 41070 + }, + { + "epoch": 1.7, + "grad_norm": 0.8671875, + "learning_rate": 0.0004960599245333107, + "loss": 0.256, + "step": 41080 + }, + { + "epoch": 1.7, + "grad_norm": 0.88671875, + "learning_rate": 0.0004960580064427125, + "loss": 0.1663, + "step": 41090 + }, + { + "epoch": 1.7, + "grad_norm": 0.58203125, + "learning_rate": 0.0004960560878890598, + "loss": 0.1989, + "step": 41100 + }, + { + "epoch": 1.7, + "grad_norm": 0.69921875, + "learning_rate": 0.000496054168872356, + "loss": 0.2101, + "step": 41110 + }, + { + "epoch": 1.7, + "grad_norm": 1.125, + "learning_rate": 0.0004960522493926048, + "loss": 0.2232, + "step": 41120 + }, + { + "epoch": 1.7, + "grad_norm": 0.5546875, + "learning_rate": 0.00049605032944981, + "loss": 0.2435, + "step": 41130 + }, + { + "epoch": 1.7, + "grad_norm": 1.28125, + "learning_rate": 0.0004960484090439749, + "loss": 0.2385, + "step": 41140 + }, + { + "epoch": 1.7, + "grad_norm": 0.81640625, + "learning_rate": 0.0004960464881751033, + "loss": 0.2434, + "step": 41150 + }, + { + "epoch": 1.7, + "grad_norm": 0.373046875, + "learning_rate": 0.0004960445668431989, + "loss": 0.2359, + "step": 41160 + }, + { + "epoch": 1.71, + "grad_norm": 0.451171875, + "learning_rate": 0.000496042645048265, + "loss": 0.2121, + "step": 41170 + }, + { + "epoch": 1.71, + "grad_norm": 0.9375, + "learning_rate": 0.0004960407227903056, + "loss": 0.2072, + "step": 41180 + }, + { + "epoch": 1.71, + "grad_norm": 0.44140625, + "learning_rate": 0.000496038800069324, + "loss": 0.2595, + "step": 41190 + }, + { + "epoch": 1.71, + "grad_norm": 0.97265625, + "learning_rate": 0.000496036876885324, + "loss": 0.26, + "step": 41200 + }, + { + "epoch": 1.71, + "grad_norm": 0.5390625, + "learning_rate": 0.0004960349532383092, + "loss": 0.1994, + "step": 41210 + }, + { + "epoch": 1.71, + "grad_norm": 0.7421875, + "learning_rate": 0.0004960330291282831, + "loss": 0.242, + "step": 41220 + }, + { + "epoch": 1.71, + "grad_norm": 0.67578125, + "learning_rate": 0.0004960311045552494, + "loss": 0.2226, + "step": 41230 + }, + { + "epoch": 1.71, + "grad_norm": 0.84375, + "learning_rate": 0.0004960291795192118, + "loss": 0.2278, + "step": 41240 + }, + { + "epoch": 1.71, + "grad_norm": 0.88671875, + "learning_rate": 0.0004960272540201738, + "loss": 0.2212, + "step": 41250 + }, + { + "epoch": 1.71, + "grad_norm": 0.5234375, + "learning_rate": 0.0004960253280581391, + "loss": 0.2162, + "step": 41260 + }, + { + "epoch": 1.71, + "grad_norm": 0.61328125, + "learning_rate": 0.0004960234016331114, + "loss": 0.1867, + "step": 41270 + }, + { + "epoch": 1.71, + "grad_norm": 0.31640625, + "learning_rate": 0.0004960214747450941, + "loss": 0.2237, + "step": 41280 + }, + { + "epoch": 1.71, + "grad_norm": 0.45703125, + "learning_rate": 0.000496019547394091, + "loss": 0.2632, + "step": 41290 + }, + { + "epoch": 1.71, + "grad_norm": 0.6953125, + "learning_rate": 0.0004960176195801056, + "loss": 0.2152, + "step": 41300 + }, + { + "epoch": 1.71, + "grad_norm": 0.6171875, + "learning_rate": 0.0004960156913031417, + "loss": 0.1876, + "step": 41310 + }, + { + "epoch": 1.71, + "grad_norm": 0.59375, + "learning_rate": 0.0004960137625632028, + "loss": 0.2508, + "step": 41320 + }, + { + "epoch": 1.71, + "grad_norm": 0.63671875, + "learning_rate": 0.0004960118333602926, + "loss": 0.2023, + "step": 41330 + }, + { + "epoch": 1.71, + "grad_norm": 0.2470703125, + "learning_rate": 0.0004960099036944147, + "loss": 0.2154, + "step": 41340 + }, + { + "epoch": 1.71, + "grad_norm": 0.74609375, + "learning_rate": 0.0004960079735655727, + "loss": 0.2096, + "step": 41350 + }, + { + "epoch": 1.71, + "grad_norm": 0.6953125, + "learning_rate": 0.0004960060429737703, + "loss": 0.1973, + "step": 41360 + }, + { + "epoch": 1.71, + "grad_norm": 0.33984375, + "learning_rate": 0.000496004111919011, + "loss": 0.1665, + "step": 41370 + }, + { + "epoch": 1.71, + "grad_norm": 0.6484375, + "learning_rate": 0.0004960021804012987, + "loss": 0.2249, + "step": 41380 + }, + { + "epoch": 1.71, + "grad_norm": 0.412109375, + "learning_rate": 0.0004960002484206367, + "loss": 0.2756, + "step": 41390 + }, + { + "epoch": 1.71, + "grad_norm": 0.859375, + "learning_rate": 0.0004959983159770288, + "loss": 0.233, + "step": 41400 + }, + { + "epoch": 1.72, + "grad_norm": 0.578125, + "learning_rate": 0.0004959963830704787, + "loss": 0.2335, + "step": 41410 + }, + { + "epoch": 1.72, + "grad_norm": 0.86328125, + "learning_rate": 0.0004959944497009899, + "loss": 0.2248, + "step": 41420 + }, + { + "epoch": 1.72, + "grad_norm": 1.46875, + "learning_rate": 0.0004959925158685662, + "loss": 0.1753, + "step": 41430 + }, + { + "epoch": 1.72, + "grad_norm": 0.5625, + "learning_rate": 0.0004959905815732111, + "loss": 0.2309, + "step": 41440 + }, + { + "epoch": 1.72, + "grad_norm": 0.50390625, + "learning_rate": 0.0004959886468149284, + "loss": 0.2341, + "step": 41450 + }, + { + "epoch": 1.72, + "grad_norm": 0.435546875, + "learning_rate": 0.0004959867115937215, + "loss": 0.2237, + "step": 41460 + }, + { + "epoch": 1.72, + "grad_norm": 0.8515625, + "learning_rate": 0.0004959847759095943, + "loss": 0.2236, + "step": 41470 + }, + { + "epoch": 1.72, + "grad_norm": 0.8984375, + "learning_rate": 0.0004959828397625501, + "loss": 0.2042, + "step": 41480 + }, + { + "epoch": 1.72, + "grad_norm": 1.0703125, + "learning_rate": 0.0004959809031525929, + "loss": 0.2633, + "step": 41490 + }, + { + "epoch": 1.72, + "grad_norm": 0.66796875, + "learning_rate": 0.0004959789660797262, + "loss": 0.2185, + "step": 41500 + }, + { + "epoch": 1.72, + "grad_norm": 0.5625, + "learning_rate": 0.0004959770285439536, + "loss": 0.198, + "step": 41510 + }, + { + "epoch": 1.72, + "grad_norm": 0.62890625, + "learning_rate": 0.0004959750905452789, + "loss": 0.2135, + "step": 41520 + }, + { + "epoch": 1.72, + "grad_norm": 1.0390625, + "learning_rate": 0.0004959731520837056, + "loss": 0.271, + "step": 41530 + }, + { + "epoch": 1.72, + "grad_norm": 1.1796875, + "learning_rate": 0.0004959712131592373, + "loss": 0.2113, + "step": 41540 + }, + { + "epoch": 1.72, + "grad_norm": 1.203125, + "learning_rate": 0.0004959692737718779, + "loss": 0.2323, + "step": 41550 + }, + { + "epoch": 1.72, + "grad_norm": 0.76171875, + "learning_rate": 0.0004959673339216307, + "loss": 0.2136, + "step": 41560 + }, + { + "epoch": 1.72, + "grad_norm": 0.74609375, + "learning_rate": 0.0004959653936084996, + "loss": 0.2218, + "step": 41570 + }, + { + "epoch": 1.72, + "grad_norm": 0.373046875, + "learning_rate": 0.0004959634528324883, + "loss": 0.2502, + "step": 41580 + }, + { + "epoch": 1.72, + "grad_norm": 0.953125, + "learning_rate": 0.0004959615115936002, + "loss": 0.2682, + "step": 41590 + }, + { + "epoch": 1.72, + "grad_norm": 0.93359375, + "learning_rate": 0.0004959595698918393, + "loss": 0.293, + "step": 41600 + }, + { + "epoch": 1.72, + "grad_norm": 0.2255859375, + "learning_rate": 0.0004959576277272089, + "loss": 0.2166, + "step": 41610 + }, + { + "epoch": 1.72, + "grad_norm": 0.9765625, + "learning_rate": 0.0004959556850997128, + "loss": 0.2392, + "step": 41620 + }, + { + "epoch": 1.72, + "grad_norm": 0.94140625, + "learning_rate": 0.0004959537420093546, + "loss": 0.2433, + "step": 41630 + }, + { + "epoch": 1.72, + "grad_norm": 0.72265625, + "learning_rate": 0.0004959517984561381, + "loss": 0.1845, + "step": 41640 + }, + { + "epoch": 1.73, + "grad_norm": 0.55078125, + "learning_rate": 0.0004959498544400669, + "loss": 0.2361, + "step": 41650 + }, + { + "epoch": 1.73, + "grad_norm": 1.109375, + "learning_rate": 0.0004959479099611447, + "loss": 0.2494, + "step": 41660 + }, + { + "epoch": 1.73, + "grad_norm": 0.310546875, + "learning_rate": 0.000495945965019375, + "loss": 0.2742, + "step": 41670 + }, + { + "epoch": 1.73, + "grad_norm": 1.25, + "learning_rate": 0.0004959440196147615, + "loss": 0.2101, + "step": 41680 + }, + { + "epoch": 1.73, + "grad_norm": 0.6953125, + "learning_rate": 0.000495942073747308, + "loss": 0.2704, + "step": 41690 + }, + { + "epoch": 1.73, + "grad_norm": 0.5546875, + "learning_rate": 0.000495940127417018, + "loss": 0.17, + "step": 41700 + }, + { + "epoch": 1.73, + "grad_norm": 0.60546875, + "learning_rate": 0.0004959381806238953, + "loss": 0.2616, + "step": 41710 + }, + { + "epoch": 1.73, + "grad_norm": 0.369140625, + "learning_rate": 0.0004959362333679436, + "loss": 0.181, + "step": 41720 + }, + { + "epoch": 1.73, + "grad_norm": 0.68359375, + "learning_rate": 0.0004959342856491663, + "loss": 0.2084, + "step": 41730 + }, + { + "epoch": 1.73, + "grad_norm": 0.984375, + "learning_rate": 0.0004959323374675673, + "loss": 0.2392, + "step": 41740 + }, + { + "epoch": 1.73, + "grad_norm": 0.84375, + "learning_rate": 0.0004959303888231502, + "loss": 0.1728, + "step": 41750 + }, + { + "epoch": 1.73, + "grad_norm": 0.69921875, + "learning_rate": 0.0004959284397159186, + "loss": 0.2796, + "step": 41760 + }, + { + "epoch": 1.73, + "grad_norm": 0.484375, + "learning_rate": 0.0004959264901458763, + "loss": 0.195, + "step": 41770 + }, + { + "epoch": 1.73, + "grad_norm": 0.7109375, + "learning_rate": 0.0004959245401130269, + "loss": 0.1989, + "step": 41780 + }, + { + "epoch": 1.73, + "grad_norm": 2.140625, + "learning_rate": 0.0004959225896173741, + "loss": 0.1937, + "step": 41790 + }, + { + "epoch": 1.73, + "grad_norm": 1.3359375, + "learning_rate": 0.0004959206386589215, + "loss": 0.1783, + "step": 41800 + }, + { + "epoch": 1.73, + "grad_norm": 0.79296875, + "learning_rate": 0.0004959186872376728, + "loss": 0.219, + "step": 41810 + }, + { + "epoch": 1.73, + "grad_norm": 0.396484375, + "learning_rate": 0.0004959167353536318, + "loss": 0.2117, + "step": 41820 + }, + { + "epoch": 1.73, + "grad_norm": 0.8671875, + "learning_rate": 0.0004959147830068019, + "loss": 0.241, + "step": 41830 + }, + { + "epoch": 1.73, + "grad_norm": 0.53515625, + "learning_rate": 0.000495912830197187, + "loss": 0.1972, + "step": 41840 + }, + { + "epoch": 1.73, + "grad_norm": 0.56640625, + "learning_rate": 0.0004959108769247907, + "loss": 0.1595, + "step": 41850 + }, + { + "epoch": 1.73, + "grad_norm": 2.390625, + "learning_rate": 0.0004959089231896167, + "loss": 0.2558, + "step": 41860 + }, + { + "epoch": 1.73, + "grad_norm": 0.4140625, + "learning_rate": 0.0004959069689916688, + "loss": 0.257, + "step": 41870 + }, + { + "epoch": 1.73, + "grad_norm": 1.2109375, + "learning_rate": 0.0004959050143309503, + "loss": 0.2214, + "step": 41880 + }, + { + "epoch": 1.74, + "grad_norm": 0.65234375, + "learning_rate": 0.0004959030592074653, + "loss": 0.2429, + "step": 41890 + }, + { + "epoch": 1.74, + "grad_norm": 0.003509521484375, + "learning_rate": 0.0004959011036212172, + "loss": 0.1937, + "step": 41900 + }, + { + "epoch": 1.74, + "grad_norm": 0.423828125, + "learning_rate": 0.0004958991475722098, + "loss": 0.274, + "step": 41910 + }, + { + "epoch": 1.74, + "grad_norm": 0.46484375, + "learning_rate": 0.0004958971910604468, + "loss": 0.2137, + "step": 41920 + }, + { + "epoch": 1.74, + "grad_norm": 0.6015625, + "learning_rate": 0.0004958952340859318, + "loss": 0.2522, + "step": 41930 + }, + { + "epoch": 1.74, + "grad_norm": 0.75, + "learning_rate": 0.0004958932766486686, + "loss": 0.2594, + "step": 41940 + }, + { + "epoch": 1.74, + "grad_norm": 0.91015625, + "learning_rate": 0.0004958913187486606, + "loss": 0.2274, + "step": 41950 + }, + { + "epoch": 1.74, + "grad_norm": 0.703125, + "learning_rate": 0.0004958893603859119, + "loss": 0.2133, + "step": 41960 + }, + { + "epoch": 1.74, + "grad_norm": 0.361328125, + "learning_rate": 0.0004958874015604258, + "loss": 0.2457, + "step": 41970 + }, + { + "epoch": 1.74, + "grad_norm": 0.359375, + "learning_rate": 0.0004958854422722064, + "loss": 0.1895, + "step": 41980 + }, + { + "epoch": 1.74, + "grad_norm": 0.79296875, + "learning_rate": 0.000495883482521257, + "loss": 0.2872, + "step": 41990 + }, + { + "epoch": 1.74, + "grad_norm": 0.9765625, + "learning_rate": 0.0004958815223075814, + "loss": 0.2408, + "step": 42000 + }, + { + "epoch": 1.74, + "grad_norm": 0.4765625, + "learning_rate": 0.0004958795616311835, + "loss": 0.2117, + "step": 42010 + }, + { + "epoch": 1.74, + "grad_norm": 0.87109375, + "learning_rate": 0.0004958776004920667, + "loss": 0.2947, + "step": 42020 + }, + { + "epoch": 1.74, + "grad_norm": 0.0, + "learning_rate": 0.0004958756388902348, + "loss": 0.2093, + "step": 42030 + }, + { + "epoch": 1.74, + "grad_norm": 0.5390625, + "learning_rate": 0.0004958736768256915, + "loss": 0.1959, + "step": 42040 + }, + { + "epoch": 1.74, + "grad_norm": 0.8046875, + "learning_rate": 0.0004958717142984404, + "loss": 0.2742, + "step": 42050 + }, + { + "epoch": 1.74, + "grad_norm": 0.90625, + "learning_rate": 0.0004958697513084855, + "loss": 0.2233, + "step": 42060 + }, + { + "epoch": 1.74, + "grad_norm": 0.42578125, + "learning_rate": 0.0004958677878558302, + "loss": 0.2052, + "step": 42070 + }, + { + "epoch": 1.74, + "grad_norm": 1.171875, + "learning_rate": 0.0004958658239404782, + "loss": 0.194, + "step": 42080 + }, + { + "epoch": 1.74, + "grad_norm": 0.474609375, + "learning_rate": 0.0004958638595624333, + "loss": 0.2168, + "step": 42090 + }, + { + "epoch": 1.74, + "grad_norm": 0.85546875, + "learning_rate": 0.0004958618947216992, + "loss": 0.2542, + "step": 42100 + }, + { + "epoch": 1.74, + "grad_norm": 0.76953125, + "learning_rate": 0.0004958599294182796, + "loss": 0.1895, + "step": 42110 + }, + { + "epoch": 1.74, + "grad_norm": 0.99609375, + "learning_rate": 0.0004958579636521781, + "loss": 0.2348, + "step": 42120 + }, + { + "epoch": 1.75, + "grad_norm": 0.734375, + "learning_rate": 0.0004958559974233984, + "loss": 0.2046, + "step": 42130 + }, + { + "epoch": 1.75, + "grad_norm": 0.91796875, + "learning_rate": 0.0004958540307319443, + "loss": 0.2373, + "step": 42140 + }, + { + "epoch": 1.75, + "grad_norm": 0.392578125, + "learning_rate": 0.0004958520635778196, + "loss": 0.23, + "step": 42150 + }, + { + "epoch": 1.75, + "grad_norm": 1.3203125, + "learning_rate": 0.0004958500959610278, + "loss": 0.243, + "step": 42160 + }, + { + "epoch": 1.75, + "grad_norm": 0.86328125, + "learning_rate": 0.0004958481278815726, + "loss": 0.205, + "step": 42170 + }, + { + "epoch": 1.75, + "grad_norm": 0.55078125, + "learning_rate": 0.0004958461593394578, + "loss": 0.2149, + "step": 42180 + }, + { + "epoch": 1.75, + "grad_norm": 0.59375, + "learning_rate": 0.0004958441903346871, + "loss": 0.1973, + "step": 42190 + }, + { + "epoch": 1.75, + "grad_norm": 0.96875, + "learning_rate": 0.0004958422208672642, + "loss": 0.2218, + "step": 42200 + }, + { + "epoch": 1.75, + "grad_norm": 0.55078125, + "learning_rate": 0.0004958402509371928, + "loss": 0.2736, + "step": 42210 + }, + { + "epoch": 1.75, + "grad_norm": 0.85546875, + "learning_rate": 0.0004958382805444764, + "loss": 0.2161, + "step": 42220 + }, + { + "epoch": 1.75, + "grad_norm": 0.62109375, + "learning_rate": 0.0004958363096891191, + "loss": 0.2555, + "step": 42230 + }, + { + "epoch": 1.75, + "grad_norm": 0.35546875, + "learning_rate": 0.0004958343383711244, + "loss": 0.2486, + "step": 42240 + }, + { + "epoch": 1.75, + "grad_norm": 0.388671875, + "learning_rate": 0.000495832366590496, + "loss": 0.2409, + "step": 42250 + }, + { + "epoch": 1.75, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004958303943472376, + "loss": 0.259, + "step": 42260 + }, + { + "epoch": 1.75, + "grad_norm": 0.427734375, + "learning_rate": 0.000495828421641353, + "loss": 0.2215, + "step": 42270 + }, + { + "epoch": 1.75, + "grad_norm": 0.5390625, + "learning_rate": 0.0004958264484728459, + "loss": 0.2313, + "step": 42280 + }, + { + "epoch": 1.75, + "grad_norm": 0.55078125, + "learning_rate": 0.0004958244748417198, + "loss": 0.2145, + "step": 42290 + }, + { + "epoch": 1.75, + "grad_norm": 0.54296875, + "learning_rate": 0.0004958225007479787, + "loss": 0.2573, + "step": 42300 + }, + { + "epoch": 1.75, + "grad_norm": 0.84765625, + "learning_rate": 0.0004958205261916262, + "loss": 0.2709, + "step": 42310 + }, + { + "epoch": 1.75, + "grad_norm": 0.189453125, + "learning_rate": 0.0004958185511726661, + "loss": 0.2335, + "step": 42320 + }, + { + "epoch": 1.75, + "grad_norm": 0.6875, + "learning_rate": 0.0004958165756911018, + "loss": 0.203, + "step": 42330 + }, + { + "epoch": 1.75, + "grad_norm": 1.484375, + "learning_rate": 0.0004958145997469375, + "loss": 0.2068, + "step": 42340 + }, + { + "epoch": 1.75, + "grad_norm": 0.84765625, + "learning_rate": 0.0004958126233401766, + "loss": 0.2758, + "step": 42350 + }, + { + "epoch": 1.75, + "grad_norm": 0.4765625, + "learning_rate": 0.0004958106464708228, + "loss": 0.2403, + "step": 42360 + }, + { + "epoch": 1.75, + "grad_norm": 1.3046875, + "learning_rate": 0.0004958086691388799, + "loss": 0.1822, + "step": 42370 + }, + { + "epoch": 1.76, + "grad_norm": 1.359375, + "learning_rate": 0.0004958066913443517, + "loss": 0.2124, + "step": 42380 + }, + { + "epoch": 1.76, + "grad_norm": 0.486328125, + "learning_rate": 0.0004958047130872419, + "loss": 0.2417, + "step": 42390 + }, + { + "epoch": 1.76, + "grad_norm": 0.51171875, + "learning_rate": 0.0004958027343675541, + "loss": 0.2178, + "step": 42400 + }, + { + "epoch": 1.76, + "grad_norm": 0.8828125, + "learning_rate": 0.000495800755185292, + "loss": 0.2033, + "step": 42410 + }, + { + "epoch": 1.76, + "grad_norm": 0.84375, + "learning_rate": 0.0004957987755404596, + "loss": 0.2539, + "step": 42420 + }, + { + "epoch": 1.76, + "grad_norm": 1.171875, + "learning_rate": 0.0004957967954330603, + "loss": 0.1937, + "step": 42430 + }, + { + "epoch": 1.76, + "grad_norm": 0.9609375, + "learning_rate": 0.000495794814863098, + "loss": 0.2562, + "step": 42440 + }, + { + "epoch": 1.76, + "grad_norm": 1.59375, + "learning_rate": 0.0004957928338305765, + "loss": 0.2323, + "step": 42450 + }, + { + "epoch": 1.76, + "grad_norm": 0.609375, + "learning_rate": 0.0004957908523354995, + "loss": 0.2053, + "step": 42460 + }, + { + "epoch": 1.76, + "grad_norm": 0.6875, + "learning_rate": 0.0004957888703778704, + "loss": 0.2252, + "step": 42470 + }, + { + "epoch": 1.76, + "grad_norm": 0.87109375, + "learning_rate": 0.0004957868879576935, + "loss": 0.2376, + "step": 42480 + }, + { + "epoch": 1.76, + "grad_norm": 1.0390625, + "learning_rate": 0.000495784905074972, + "loss": 0.2084, + "step": 42490 + }, + { + "epoch": 1.76, + "grad_norm": 1.03125, + "learning_rate": 0.0004957829217297099, + "loss": 0.2477, + "step": 42500 + }, + { + "epoch": 1.76, + "grad_norm": 0.73046875, + "learning_rate": 0.0004957809379219108, + "loss": 0.2022, + "step": 42510 + }, + { + "epoch": 1.76, + "grad_norm": 0.0006103515625, + "learning_rate": 0.0004957789536515787, + "loss": 0.2125, + "step": 42520 + }, + { + "epoch": 1.76, + "grad_norm": 0.875, + "learning_rate": 0.000495776968918717, + "loss": 0.2755, + "step": 42530 + }, + { + "epoch": 1.76, + "grad_norm": 0.65234375, + "learning_rate": 0.0004957749837233297, + "loss": 0.2386, + "step": 42540 + }, + { + "epoch": 1.76, + "grad_norm": 1.5703125, + "learning_rate": 0.0004957729980654204, + "loss": 0.2366, + "step": 42550 + }, + { + "epoch": 1.76, + "grad_norm": 0.94140625, + "learning_rate": 0.0004957710119449927, + "loss": 0.2244, + "step": 42560 + }, + { + "epoch": 1.76, + "grad_norm": 0.474609375, + "learning_rate": 0.0004957690253620507, + "loss": 0.2546, + "step": 42570 + }, + { + "epoch": 1.76, + "grad_norm": 0.310546875, + "learning_rate": 0.0004957670383165979, + "loss": 0.2512, + "step": 42580 + }, + { + "epoch": 1.76, + "grad_norm": 0.375, + "learning_rate": 0.000495765050808638, + "loss": 0.251, + "step": 42590 + }, + { + "epoch": 1.76, + "grad_norm": 0.44140625, + "learning_rate": 0.0004957630628381749, + "loss": 0.2292, + "step": 42600 + }, + { + "epoch": 1.76, + "grad_norm": 0.326171875, + "learning_rate": 0.0004957610744052121, + "loss": 0.27, + "step": 42610 + }, + { + "epoch": 1.77, + "grad_norm": 0.400390625, + "learning_rate": 0.0004957590855097536, + "loss": 0.2425, + "step": 42620 + }, + { + "epoch": 1.77, + "grad_norm": 1.3203125, + "learning_rate": 0.0004957570961518031, + "loss": 0.2005, + "step": 42630 + }, + { + "epoch": 1.77, + "grad_norm": 1.2734375, + "learning_rate": 0.0004957551063313641, + "loss": 0.2327, + "step": 42640 + }, + { + "epoch": 1.77, + "grad_norm": 0.373046875, + "learning_rate": 0.0004957531160484407, + "loss": 0.2174, + "step": 42650 + }, + { + "epoch": 1.77, + "grad_norm": 0.46875, + "learning_rate": 0.0004957511253030365, + "loss": 0.2434, + "step": 42660 + }, + { + "epoch": 1.77, + "grad_norm": 0.546875, + "learning_rate": 0.0004957491340951551, + "loss": 0.243, + "step": 42670 + }, + { + "epoch": 1.77, + "grad_norm": 0.6015625, + "learning_rate": 0.0004957471424248004, + "loss": 0.1998, + "step": 42680 + }, + { + "epoch": 1.77, + "grad_norm": 0.9609375, + "learning_rate": 0.0004957451502919761, + "loss": 0.233, + "step": 42690 + }, + { + "epoch": 1.77, + "grad_norm": 0.87890625, + "learning_rate": 0.000495743157696686, + "loss": 0.1985, + "step": 42700 + }, + { + "epoch": 1.77, + "grad_norm": 0.75, + "learning_rate": 0.0004957411646389338, + "loss": 0.3032, + "step": 42710 + }, + { + "epoch": 1.77, + "grad_norm": 0.3984375, + "learning_rate": 0.0004957391711187234, + "loss": 0.2709, + "step": 42720 + }, + { + "epoch": 1.77, + "grad_norm": 0.333984375, + "learning_rate": 0.0004957371771360582, + "loss": 0.2102, + "step": 42730 + }, + { + "epoch": 1.77, + "grad_norm": 1.2578125, + "learning_rate": 0.0004957351826909423, + "loss": 0.241, + "step": 42740 + }, + { + "epoch": 1.77, + "grad_norm": 0.640625, + "learning_rate": 0.0004957331877833793, + "loss": 0.2708, + "step": 42750 + }, + { + "epoch": 1.77, + "grad_norm": 0.72265625, + "learning_rate": 0.0004957311924133729, + "loss": 0.2049, + "step": 42760 + }, + { + "epoch": 1.77, + "grad_norm": 0.51953125, + "learning_rate": 0.0004957291965809271, + "loss": 0.2119, + "step": 42770 + }, + { + "epoch": 1.77, + "grad_norm": 0.4765625, + "learning_rate": 0.0004957272002860453, + "loss": 0.2159, + "step": 42780 + }, + { + "epoch": 1.77, + "grad_norm": 1.140625, + "learning_rate": 0.0004957252035287315, + "loss": 0.2189, + "step": 42790 + }, + { + "epoch": 1.77, + "grad_norm": 0.72265625, + "learning_rate": 0.0004957232063089895, + "loss": 0.2183, + "step": 42800 + }, + { + "epoch": 1.77, + "grad_norm": 0.64453125, + "learning_rate": 0.0004957212086268229, + "loss": 0.1821, + "step": 42810 + }, + { + "epoch": 1.77, + "grad_norm": 0.52734375, + "learning_rate": 0.0004957192104822355, + "loss": 0.2455, + "step": 42820 + }, + { + "epoch": 1.77, + "grad_norm": 0.6875, + "learning_rate": 0.0004957172118752311, + "loss": 0.1954, + "step": 42830 + }, + { + "epoch": 1.77, + "grad_norm": 0.37109375, + "learning_rate": 0.0004957152128058134, + "loss": 0.242, + "step": 42840 + }, + { + "epoch": 1.77, + "grad_norm": 2.40625, + "learning_rate": 0.0004957132132739863, + "loss": 0.2524, + "step": 42850 + }, + { + "epoch": 1.78, + "grad_norm": 0.5078125, + "learning_rate": 0.0004957112132797535, + "loss": 0.2472, + "step": 42860 + }, + { + "epoch": 1.78, + "grad_norm": 0.6875, + "learning_rate": 0.0004957092128231185, + "loss": 0.2487, + "step": 42870 + }, + { + "epoch": 1.78, + "grad_norm": 0.62890625, + "learning_rate": 0.0004957072119040855, + "loss": 0.2387, + "step": 42880 + }, + { + "epoch": 1.78, + "grad_norm": 0.60546875, + "learning_rate": 0.000495705210522658, + "loss": 0.2569, + "step": 42890 + }, + { + "epoch": 1.78, + "grad_norm": 0.6796875, + "learning_rate": 0.0004957032086788398, + "loss": 0.231, + "step": 42900 + }, + { + "epoch": 1.78, + "grad_norm": 3.25, + "learning_rate": 0.0004957012063726348, + "loss": 0.2282, + "step": 42910 + }, + { + "epoch": 1.78, + "grad_norm": 0.2216796875, + "learning_rate": 0.0004956992036040465, + "loss": 0.1809, + "step": 42920 + }, + { + "epoch": 1.78, + "grad_norm": 0.478515625, + "learning_rate": 0.0004956972003730789, + "loss": 0.2007, + "step": 42930 + }, + { + "epoch": 1.78, + "grad_norm": 0.361328125, + "learning_rate": 0.0004956951966797359, + "loss": 0.1502, + "step": 42940 + }, + { + "epoch": 1.78, + "grad_norm": 0.59375, + "learning_rate": 0.0004956931925240208, + "loss": 0.1813, + "step": 42950 + }, + { + "epoch": 1.78, + "grad_norm": 0.68359375, + "learning_rate": 0.0004956911879059377, + "loss": 0.2473, + "step": 42960 + }, + { + "epoch": 1.78, + "grad_norm": 0.5390625, + "learning_rate": 0.0004956891828254903, + "loss": 0.24, + "step": 42970 + }, + { + "epoch": 1.78, + "grad_norm": 0.4453125, + "learning_rate": 0.0004956871772826825, + "loss": 0.2179, + "step": 42980 + }, + { + "epoch": 1.78, + "grad_norm": 0.9296875, + "learning_rate": 0.0004956851712775179, + "loss": 0.2468, + "step": 42990 + }, + { + "epoch": 1.78, + "grad_norm": 0.40234375, + "learning_rate": 0.0004956831648100004, + "loss": 0.2439, + "step": 43000 + }, + { + "epoch": 1.78, + "grad_norm": 0.55859375, + "learning_rate": 0.0004956811578801336, + "loss": 0.1998, + "step": 43010 + }, + { + "epoch": 1.78, + "grad_norm": 1.28125, + "learning_rate": 0.0004956791504879216, + "loss": 0.2466, + "step": 43020 + }, + { + "epoch": 1.78, + "grad_norm": 0.421875, + "learning_rate": 0.0004956771426333677, + "loss": 0.2192, + "step": 43030 + }, + { + "epoch": 1.78, + "grad_norm": 1.7265625, + "learning_rate": 0.0004956751343164761, + "loss": 0.1666, + "step": 43040 + }, + { + "epoch": 1.78, + "grad_norm": 0.5546875, + "learning_rate": 0.0004956731255372504, + "loss": 0.2317, + "step": 43050 + }, + { + "epoch": 1.78, + "grad_norm": 0.42578125, + "learning_rate": 0.0004956711162956945, + "loss": 0.2165, + "step": 43060 + }, + { + "epoch": 1.78, + "grad_norm": 0.8671875, + "learning_rate": 0.000495669106591812, + "loss": 0.3197, + "step": 43070 + }, + { + "epoch": 1.78, + "grad_norm": 0.92578125, + "learning_rate": 0.0004956670964256067, + "loss": 0.2605, + "step": 43080 + }, + { + "epoch": 1.78, + "grad_norm": 0.828125, + "learning_rate": 0.0004956650857970825, + "loss": 0.209, + "step": 43090 + }, + { + "epoch": 1.79, + "grad_norm": 0.98046875, + "learning_rate": 0.0004956630747062432, + "loss": 0.2366, + "step": 43100 + }, + { + "epoch": 1.79, + "grad_norm": 0.369140625, + "learning_rate": 0.0004956610631530925, + "loss": 0.259, + "step": 43110 + }, + { + "epoch": 1.79, + "grad_norm": 0.34765625, + "learning_rate": 0.0004956590511376342, + "loss": 0.2248, + "step": 43120 + }, + { + "epoch": 1.79, + "grad_norm": 0.46484375, + "learning_rate": 0.0004956570386598721, + "loss": 0.1957, + "step": 43130 + }, + { + "epoch": 1.79, + "grad_norm": 1.1640625, + "learning_rate": 0.0004956550257198099, + "loss": 0.2649, + "step": 43140 + }, + { + "epoch": 1.79, + "grad_norm": 0.76171875, + "learning_rate": 0.0004956530123174516, + "loss": 0.2401, + "step": 43150 + }, + { + "epoch": 1.79, + "grad_norm": 0.33984375, + "learning_rate": 0.0004956509984528008, + "loss": 0.1975, + "step": 43160 + }, + { + "epoch": 1.79, + "grad_norm": 1.1875, + "learning_rate": 0.0004956489841258613, + "loss": 0.2014, + "step": 43170 + }, + { + "epoch": 1.79, + "grad_norm": 0.80078125, + "learning_rate": 0.000495646969336637, + "loss": 0.2124, + "step": 43180 + }, + { + "epoch": 1.79, + "grad_norm": 1.0625, + "learning_rate": 0.0004956449540851317, + "loss": 0.2114, + "step": 43190 + }, + { + "epoch": 1.79, + "grad_norm": 0.44921875, + "learning_rate": 0.000495642938371349, + "loss": 0.1994, + "step": 43200 + }, + { + "epoch": 1.79, + "grad_norm": 0.30078125, + "learning_rate": 0.0004956409221952929, + "loss": 0.271, + "step": 43210 + }, + { + "epoch": 1.79, + "grad_norm": 0.419921875, + "learning_rate": 0.000495638905556967, + "loss": 0.2151, + "step": 43220 + }, + { + "epoch": 1.79, + "grad_norm": 0.419921875, + "learning_rate": 0.0004956368884563753, + "loss": 0.2744, + "step": 43230 + }, + { + "epoch": 1.79, + "grad_norm": 0.0, + "learning_rate": 0.0004956348708935215, + "loss": 0.2196, + "step": 43240 + }, + { + "epoch": 1.79, + "grad_norm": 0.53515625, + "learning_rate": 0.0004956328528684093, + "loss": 0.1939, + "step": 43250 + }, + { + "epoch": 1.79, + "grad_norm": 0.4609375, + "learning_rate": 0.0004956308343810427, + "loss": 0.1633, + "step": 43260 + }, + { + "epoch": 1.79, + "grad_norm": 1.2265625, + "learning_rate": 0.0004956288154314254, + "loss": 0.2849, + "step": 43270 + }, + { + "epoch": 1.79, + "grad_norm": 0.765625, + "learning_rate": 0.0004956267960195611, + "loss": 0.2117, + "step": 43280 + }, + { + "epoch": 1.79, + "grad_norm": 0.4296875, + "learning_rate": 0.0004956247761454539, + "loss": 0.2161, + "step": 43290 + }, + { + "epoch": 1.79, + "grad_norm": 0.921875, + "learning_rate": 0.0004956227558091072, + "loss": 0.2415, + "step": 43300 + }, + { + "epoch": 1.79, + "grad_norm": 0.4453125, + "learning_rate": 0.0004956207350105251, + "loss": 0.2358, + "step": 43310 + }, + { + "epoch": 1.79, + "grad_norm": 0.42578125, + "learning_rate": 0.0004956187137497112, + "loss": 0.2698, + "step": 43320 + }, + { + "epoch": 1.79, + "grad_norm": 1.09375, + "learning_rate": 0.0004956166920266695, + "loss": 0.2137, + "step": 43330 + }, + { + "epoch": 1.8, + "grad_norm": 0.62890625, + "learning_rate": 0.0004956146698414037, + "loss": 0.2003, + "step": 43340 + }, + { + "epoch": 1.8, + "grad_norm": 1.109375, + "learning_rate": 0.0004956126471939175, + "loss": 0.2254, + "step": 43350 + }, + { + "epoch": 1.8, + "grad_norm": 0.90625, + "learning_rate": 0.000495610624084215, + "loss": 0.2851, + "step": 43360 + }, + { + "epoch": 1.8, + "grad_norm": 0.498046875, + "learning_rate": 0.0004956086005122997, + "loss": 0.2396, + "step": 43370 + }, + { + "epoch": 1.8, + "grad_norm": 0.18359375, + "learning_rate": 0.0004956065764781756, + "loss": 0.1669, + "step": 43380 + }, + { + "epoch": 1.8, + "grad_norm": 0.64453125, + "learning_rate": 0.0004956045519818464, + "loss": 0.2433, + "step": 43390 + }, + { + "epoch": 1.8, + "grad_norm": 0.453125, + "learning_rate": 0.000495602527023316, + "loss": 0.2686, + "step": 43400 + }, + { + "epoch": 1.8, + "grad_norm": 0.828125, + "learning_rate": 0.0004956005016025881, + "loss": 0.2149, + "step": 43410 + }, + { + "epoch": 1.8, + "grad_norm": 0.890625, + "learning_rate": 0.0004955984757196667, + "loss": 0.1936, + "step": 43420 + }, + { + "epoch": 1.8, + "grad_norm": 1.4765625, + "learning_rate": 0.0004955964493745554, + "loss": 0.1992, + "step": 43430 + }, + { + "epoch": 1.8, + "grad_norm": 0.5078125, + "learning_rate": 0.0004955944225672581, + "loss": 0.2408, + "step": 43440 + }, + { + "epoch": 1.8, + "grad_norm": 1.015625, + "learning_rate": 0.0004955923952977787, + "loss": 0.2, + "step": 43450 + }, + { + "epoch": 1.8, + "grad_norm": 0.212890625, + "learning_rate": 0.0004955903675661208, + "loss": 0.2006, + "step": 43460 + }, + { + "epoch": 1.8, + "grad_norm": 0.79296875, + "learning_rate": 0.0004955883393722884, + "loss": 0.2429, + "step": 43470 + }, + { + "epoch": 1.8, + "grad_norm": 0.4453125, + "learning_rate": 0.0004955863107162854, + "loss": 0.2313, + "step": 43480 + }, + { + "epoch": 1.8, + "grad_norm": 0.71484375, + "learning_rate": 0.0004955842815981154, + "loss": 0.2797, + "step": 43490 + }, + { + "epoch": 1.8, + "grad_norm": 0.75390625, + "learning_rate": 0.0004955822520177821, + "loss": 0.2398, + "step": 43500 + }, + { + "epoch": 1.8, + "grad_norm": 0.9296875, + "learning_rate": 0.0004955802219752898, + "loss": 0.2089, + "step": 43510 + }, + { + "epoch": 1.8, + "grad_norm": 0.66015625, + "learning_rate": 0.000495578191470642, + "loss": 0.2173, + "step": 43520 + }, + { + "epoch": 1.8, + "grad_norm": 3.609375, + "learning_rate": 0.0004955761605038425, + "loss": 0.254, + "step": 43530 + }, + { + "epoch": 1.8, + "grad_norm": 0.90625, + "learning_rate": 0.0004955741290748951, + "loss": 0.2643, + "step": 43540 + }, + { + "epoch": 1.8, + "grad_norm": 0.2490234375, + "learning_rate": 0.0004955720971838038, + "loss": 0.2381, + "step": 43550 + }, + { + "epoch": 1.8, + "grad_norm": 0.60546875, + "learning_rate": 0.0004955700648305724, + "loss": 0.2617, + "step": 43560 + }, + { + "epoch": 1.8, + "grad_norm": 0.5390625, + "learning_rate": 0.0004955680320152046, + "loss": 0.2635, + "step": 43570 + }, + { + "epoch": 1.81, + "grad_norm": 0.703125, + "learning_rate": 0.0004955659987377042, + "loss": 0.1755, + "step": 43580 + }, + { + "epoch": 1.81, + "grad_norm": 0.375, + "learning_rate": 0.0004955639649980752, + "loss": 0.2557, + "step": 43590 + }, + { + "epoch": 1.81, + "grad_norm": 0.61328125, + "learning_rate": 0.0004955619307963214, + "loss": 0.2342, + "step": 43600 + }, + { + "epoch": 1.81, + "grad_norm": 0.8125, + "learning_rate": 0.0004955598961324465, + "loss": 0.2141, + "step": 43610 + }, + { + "epoch": 1.81, + "grad_norm": 0.63671875, + "learning_rate": 0.0004955578610064543, + "loss": 0.2462, + "step": 43620 + }, + { + "epoch": 1.81, + "grad_norm": 0.7421875, + "learning_rate": 0.0004955558254183488, + "loss": 0.2507, + "step": 43630 + }, + { + "epoch": 1.81, + "grad_norm": 0.58984375, + "learning_rate": 0.0004955537893681338, + "loss": 0.2615, + "step": 43640 + }, + { + "epoch": 1.81, + "grad_norm": 1.890625, + "learning_rate": 0.000495551752855813, + "loss": 0.1688, + "step": 43650 + }, + { + "epoch": 1.81, + "grad_norm": 0.57421875, + "learning_rate": 0.0004955497158813904, + "loss": 0.2376, + "step": 43660 + }, + { + "epoch": 1.81, + "grad_norm": 0.5234375, + "learning_rate": 0.0004955476784448697, + "loss": 0.1974, + "step": 43670 + }, + { + "epoch": 1.81, + "grad_norm": 0.1494140625, + "learning_rate": 0.0004955456405462547, + "loss": 0.2362, + "step": 43680 + }, + { + "epoch": 1.81, + "grad_norm": 0.431640625, + "learning_rate": 0.0004955436021855495, + "loss": 0.2565, + "step": 43690 + }, + { + "epoch": 1.81, + "grad_norm": 0.46484375, + "learning_rate": 0.0004955415633627577, + "loss": 0.2483, + "step": 43700 + }, + { + "epoch": 1.81, + "grad_norm": 0.470703125, + "learning_rate": 0.0004955395240778831, + "loss": 0.2473, + "step": 43710 + }, + { + "epoch": 1.81, + "grad_norm": 0.90234375, + "learning_rate": 0.0004955374843309297, + "loss": 0.2016, + "step": 43720 + }, + { + "epoch": 1.81, + "grad_norm": 0.4765625, + "learning_rate": 0.0004955354441219013, + "loss": 0.2479, + "step": 43730 + }, + { + "epoch": 1.81, + "grad_norm": 0.734375, + "learning_rate": 0.0004955334034508017, + "loss": 0.2111, + "step": 43740 + }, + { + "epoch": 1.81, + "grad_norm": 0.40625, + "learning_rate": 0.0004955313623176348, + "loss": 0.2737, + "step": 43750 + }, + { + "epoch": 1.81, + "grad_norm": 0.5859375, + "learning_rate": 0.0004955293207224042, + "loss": 0.275, + "step": 43760 + }, + { + "epoch": 1.81, + "grad_norm": 0.734375, + "learning_rate": 0.0004955272786651142, + "loss": 0.2627, + "step": 43770 + }, + { + "epoch": 1.81, + "grad_norm": 0.6484375, + "learning_rate": 0.0004955252361457682, + "loss": 0.2234, + "step": 43780 + }, + { + "epoch": 1.81, + "grad_norm": 0.86328125, + "learning_rate": 0.0004955231931643702, + "loss": 0.1745, + "step": 43790 + }, + { + "epoch": 1.81, + "grad_norm": 0.2197265625, + "learning_rate": 0.0004955211497209241, + "loss": 0.2057, + "step": 43800 + }, + { + "epoch": 1.81, + "grad_norm": 1.125, + "learning_rate": 0.0004955191058154337, + "loss": 0.26, + "step": 43810 + }, + { + "epoch": 1.82, + "grad_norm": 0.5703125, + "learning_rate": 0.0004955170614479029, + "loss": 0.2292, + "step": 43820 + }, + { + "epoch": 1.82, + "grad_norm": 0.51171875, + "learning_rate": 0.0004955150166183356, + "loss": 0.2466, + "step": 43830 + }, + { + "epoch": 1.82, + "grad_norm": 1.0703125, + "learning_rate": 0.0004955129713267353, + "loss": 0.2303, + "step": 43840 + }, + { + "epoch": 1.82, + "grad_norm": 1.984375, + "learning_rate": 0.0004955109255731063, + "loss": 0.2313, + "step": 43850 + }, + { + "epoch": 1.82, + "grad_norm": 0.6875, + "learning_rate": 0.0004955088793574522, + "loss": 0.1699, + "step": 43860 + }, + { + "epoch": 1.82, + "grad_norm": 0.34375, + "learning_rate": 0.0004955068326797769, + "loss": 0.2266, + "step": 43870 + }, + { + "epoch": 1.82, + "grad_norm": 0.8671875, + "learning_rate": 0.0004955047855400842, + "loss": 0.2095, + "step": 43880 + }, + { + "epoch": 1.82, + "grad_norm": 0.6640625, + "learning_rate": 0.000495502737938378, + "loss": 0.2076, + "step": 43890 + }, + { + "epoch": 1.82, + "grad_norm": 0.78125, + "learning_rate": 0.0004955006898746623, + "loss": 0.2825, + "step": 43900 + }, + { + "epoch": 1.82, + "grad_norm": 0.7265625, + "learning_rate": 0.0004954986413489408, + "loss": 0.175, + "step": 43910 + }, + { + "epoch": 1.82, + "grad_norm": 0.59375, + "learning_rate": 0.0004954965923612172, + "loss": 0.1812, + "step": 43920 + }, + { + "epoch": 1.82, + "grad_norm": 0.4140625, + "learning_rate": 0.0004954945429114957, + "loss": 0.2217, + "step": 43930 + }, + { + "epoch": 1.82, + "grad_norm": 0.69921875, + "learning_rate": 0.0004954924929997798, + "loss": 0.1939, + "step": 43940 + }, + { + "epoch": 1.82, + "grad_norm": 0.50390625, + "learning_rate": 0.0004954904426260737, + "loss": 0.1914, + "step": 43950 + }, + { + "epoch": 1.82, + "grad_norm": 0.88671875, + "learning_rate": 0.0004954883917903809, + "loss": 0.2198, + "step": 43960 + }, + { + "epoch": 1.82, + "grad_norm": 0.361328125, + "learning_rate": 0.0004954863404927057, + "loss": 0.2406, + "step": 43970 + }, + { + "epoch": 1.82, + "grad_norm": 1.0078125, + "learning_rate": 0.0004954842887330516, + "loss": 0.2415, + "step": 43980 + }, + { + "epoch": 1.82, + "grad_norm": 0.66796875, + "learning_rate": 0.0004954822365114225, + "loss": 0.2691, + "step": 43990 + }, + { + "epoch": 1.82, + "grad_norm": 0.73828125, + "learning_rate": 0.0004954801838278225, + "loss": 0.2236, + "step": 44000 + }, + { + "epoch": 1.82, + "grad_norm": 0.51953125, + "learning_rate": 0.0004954781306822552, + "loss": 0.1991, + "step": 44010 + }, + { + "epoch": 1.82, + "grad_norm": 0.58203125, + "learning_rate": 0.0004954760770747247, + "loss": 0.1933, + "step": 44020 + }, + { + "epoch": 1.82, + "grad_norm": 0.466796875, + "learning_rate": 0.0004954740230052346, + "loss": 0.1856, + "step": 44030 + }, + { + "epoch": 1.82, + "grad_norm": 1.953125, + "learning_rate": 0.0004954719684737889, + "loss": 0.2359, + "step": 44040 + }, + { + "epoch": 1.82, + "grad_norm": 0.466796875, + "learning_rate": 0.0004954699134803915, + "loss": 0.2256, + "step": 44050 + }, + { + "epoch": 1.82, + "grad_norm": 0.99609375, + "learning_rate": 0.0004954678580250463, + "loss": 0.2034, + "step": 44060 + }, + { + "epoch": 1.83, + "grad_norm": 0.439453125, + "learning_rate": 0.0004954658021077569, + "loss": 0.2089, + "step": 44070 + }, + { + "epoch": 1.83, + "grad_norm": 0.62890625, + "learning_rate": 0.0004954637457285276, + "loss": 0.2472, + "step": 44080 + }, + { + "epoch": 1.83, + "grad_norm": 1.15625, + "learning_rate": 0.0004954616888873619, + "loss": 0.252, + "step": 44090 + }, + { + "epoch": 1.83, + "grad_norm": 0.6953125, + "learning_rate": 0.0004954596315842638, + "loss": 0.2166, + "step": 44100 + }, + { + "epoch": 1.83, + "grad_norm": 0.625, + "learning_rate": 0.0004954575738192373, + "loss": 0.2212, + "step": 44110 + }, + { + "epoch": 1.83, + "grad_norm": 0.53125, + "learning_rate": 0.000495455515592286, + "loss": 0.2023, + "step": 44120 + }, + { + "epoch": 1.83, + "grad_norm": 0.4453125, + "learning_rate": 0.0004954534569034141, + "loss": 0.2776, + "step": 44130 + }, + { + "epoch": 1.83, + "grad_norm": 0.87109375, + "learning_rate": 0.0004954513977526252, + "loss": 0.2232, + "step": 44140 + }, + { + "epoch": 1.83, + "grad_norm": 0.51171875, + "learning_rate": 0.0004954493381399232, + "loss": 0.2106, + "step": 44150 + }, + { + "epoch": 1.83, + "grad_norm": 0.8671875, + "learning_rate": 0.0004954472780653122, + "loss": 0.2479, + "step": 44160 + }, + { + "epoch": 1.83, + "grad_norm": 0.4765625, + "learning_rate": 0.0004954452175287958, + "loss": 0.2583, + "step": 44170 + }, + { + "epoch": 1.83, + "grad_norm": 1.421875, + "learning_rate": 0.0004954431565303781, + "loss": 0.2547, + "step": 44180 + }, + { + "epoch": 1.83, + "grad_norm": 0.5, + "learning_rate": 0.0004954410950700628, + "loss": 0.2991, + "step": 44190 + }, + { + "epoch": 1.83, + "grad_norm": 0.52734375, + "learning_rate": 0.0004954390331478539, + "loss": 0.2482, + "step": 44200 + }, + { + "epoch": 1.83, + "grad_norm": 0.5859375, + "learning_rate": 0.0004954369707637552, + "loss": 0.2443, + "step": 44210 + }, + { + "epoch": 1.83, + "grad_norm": 0.66796875, + "learning_rate": 0.0004954349079177708, + "loss": 0.2501, + "step": 44220 + }, + { + "epoch": 1.83, + "grad_norm": 0.70703125, + "learning_rate": 0.0004954328446099043, + "loss": 0.1853, + "step": 44230 + }, + { + "epoch": 1.83, + "grad_norm": 0.89453125, + "learning_rate": 0.0004954307808401597, + "loss": 0.2612, + "step": 44240 + }, + { + "epoch": 1.83, + "grad_norm": 0.640625, + "learning_rate": 0.0004954287166085409, + "loss": 0.2621, + "step": 44250 + }, + { + "epoch": 1.83, + "grad_norm": 1.0078125, + "learning_rate": 0.0004954266519150517, + "loss": 0.2205, + "step": 44260 + }, + { + "epoch": 1.83, + "grad_norm": 1.5, + "learning_rate": 0.000495424586759696, + "loss": 0.2468, + "step": 44270 + }, + { + "epoch": 1.83, + "grad_norm": 0.9765625, + "learning_rate": 0.0004954225211424777, + "loss": 0.2155, + "step": 44280 + }, + { + "epoch": 1.83, + "grad_norm": 0.8359375, + "learning_rate": 0.0004954204550634009, + "loss": 0.2007, + "step": 44290 + }, + { + "epoch": 1.83, + "grad_norm": 1.0859375, + "learning_rate": 0.0004954183885224693, + "loss": 0.2376, + "step": 44300 + }, + { + "epoch": 1.84, + "grad_norm": 0.46875, + "learning_rate": 0.0004954163215196867, + "loss": 0.2063, + "step": 44310 + }, + { + "epoch": 1.84, + "grad_norm": 0.921875, + "learning_rate": 0.0004954142540550571, + "loss": 0.2086, + "step": 44320 + }, + { + "epoch": 1.84, + "grad_norm": 0.6953125, + "learning_rate": 0.0004954121861285843, + "loss": 0.2228, + "step": 44330 + }, + { + "epoch": 1.84, + "grad_norm": 0.31640625, + "learning_rate": 0.0004954101177402723, + "loss": 0.2477, + "step": 44340 + }, + { + "epoch": 1.84, + "grad_norm": 0.59765625, + "learning_rate": 0.000495408048890125, + "loss": 0.2196, + "step": 44350 + }, + { + "epoch": 1.84, + "grad_norm": 0.275390625, + "learning_rate": 0.0004954059795781463, + "loss": 0.194, + "step": 44360 + }, + { + "epoch": 1.84, + "grad_norm": 2.984375, + "learning_rate": 0.00049540390980434, + "loss": 0.2281, + "step": 44370 + }, + { + "epoch": 1.84, + "grad_norm": 0.5390625, + "learning_rate": 0.0004954018395687101, + "loss": 0.2048, + "step": 44380 + }, + { + "epoch": 1.84, + "grad_norm": 0.408203125, + "learning_rate": 0.0004953997688712604, + "loss": 0.2483, + "step": 44390 + }, + { + "epoch": 1.84, + "grad_norm": 0.55859375, + "learning_rate": 0.0004953976977119948, + "loss": 0.2287, + "step": 44400 + }, + { + "epoch": 1.84, + "grad_norm": 0.30078125, + "learning_rate": 0.0004953956260909172, + "loss": 0.2485, + "step": 44410 + }, + { + "epoch": 1.84, + "grad_norm": 0.6640625, + "learning_rate": 0.0004953935540080316, + "loss": 0.2674, + "step": 44420 + }, + { + "epoch": 1.84, + "grad_norm": 1.0234375, + "learning_rate": 0.0004953914814633419, + "loss": 0.239, + "step": 44430 + }, + { + "epoch": 1.84, + "grad_norm": 0.734375, + "learning_rate": 0.0004953894084568518, + "loss": 0.2402, + "step": 44440 + }, + { + "epoch": 1.84, + "grad_norm": 0.9921875, + "learning_rate": 0.0004953873349885654, + "loss": 0.2455, + "step": 44450 + }, + { + "epoch": 1.84, + "grad_norm": 0.8515625, + "learning_rate": 0.0004953852610584866, + "loss": 0.2242, + "step": 44460 + }, + { + "epoch": 1.84, + "grad_norm": 1.2421875, + "learning_rate": 0.0004953831866666191, + "loss": 0.2361, + "step": 44470 + }, + { + "epoch": 1.84, + "grad_norm": 0.609375, + "learning_rate": 0.000495381111812967, + "loss": 0.2202, + "step": 44480 + }, + { + "epoch": 1.84, + "grad_norm": 1.0625, + "learning_rate": 0.0004953790364975342, + "loss": 0.2062, + "step": 44490 + }, + { + "epoch": 1.84, + "grad_norm": 0.375, + "learning_rate": 0.0004953769607203244, + "loss": 0.258, + "step": 44500 + }, + { + "epoch": 1.84, + "grad_norm": 1.296875, + "learning_rate": 0.0004953748844813418, + "loss": 0.1856, + "step": 44510 + }, + { + "epoch": 1.84, + "grad_norm": 0.421875, + "learning_rate": 0.0004953728077805901, + "loss": 0.2319, + "step": 44520 + }, + { + "epoch": 1.84, + "grad_norm": 0.82421875, + "learning_rate": 0.0004953707306180734, + "loss": 0.2879, + "step": 44530 + }, + { + "epoch": 1.84, + "grad_norm": 0.5703125, + "learning_rate": 0.0004953686529937953, + "loss": 0.2682, + "step": 44540 + }, + { + "epoch": 1.85, + "grad_norm": 0.498046875, + "learning_rate": 0.0004953665749077601, + "loss": 0.2218, + "step": 44550 + }, + { + "epoch": 1.85, + "grad_norm": 0.78515625, + "learning_rate": 0.0004953644963599712, + "loss": 0.1502, + "step": 44560 + }, + { + "epoch": 1.85, + "grad_norm": 2.796875, + "learning_rate": 0.0004953624173504331, + "loss": 0.2374, + "step": 44570 + }, + { + "epoch": 1.85, + "grad_norm": 0.64453125, + "learning_rate": 0.0004953603378791493, + "loss": 0.2302, + "step": 44580 + }, + { + "epoch": 1.85, + "grad_norm": 1.078125, + "learning_rate": 0.0004953582579461239, + "loss": 0.2375, + "step": 44590 + }, + { + "epoch": 1.85, + "grad_norm": 0.439453125, + "learning_rate": 0.0004953561775513606, + "loss": 0.1988, + "step": 44600 + }, + { + "epoch": 1.85, + "grad_norm": 3.125, + "learning_rate": 0.0004953540966948636, + "loss": 0.1656, + "step": 44610 + }, + { + "epoch": 1.85, + "grad_norm": 1.0859375, + "learning_rate": 0.0004953520153766367, + "loss": 0.2926, + "step": 44620 + }, + { + "epoch": 1.85, + "grad_norm": 0.45703125, + "learning_rate": 0.0004953499335966838, + "loss": 0.1837, + "step": 44630 + }, + { + "epoch": 1.85, + "grad_norm": 0.87109375, + "learning_rate": 0.0004953478513550088, + "loss": 0.2057, + "step": 44640 + }, + { + "epoch": 1.85, + "grad_norm": 1.3828125, + "learning_rate": 0.0004953457686516157, + "loss": 0.2636, + "step": 44650 + }, + { + "epoch": 1.85, + "grad_norm": 0.447265625, + "learning_rate": 0.0004953436854865082, + "loss": 0.2231, + "step": 44660 + }, + { + "epoch": 1.85, + "grad_norm": 0.75390625, + "learning_rate": 0.0004953416018596905, + "loss": 0.2063, + "step": 44670 + }, + { + "epoch": 1.85, + "grad_norm": 1.0859375, + "learning_rate": 0.0004953395177711664, + "loss": 0.2328, + "step": 44680 + }, + { + "epoch": 1.85, + "grad_norm": 0.66796875, + "learning_rate": 0.0004953374332209398, + "loss": 0.2513, + "step": 44690 + }, + { + "epoch": 1.85, + "grad_norm": 0.462890625, + "learning_rate": 0.0004953353482090146, + "loss": 0.299, + "step": 44700 + }, + { + "epoch": 1.85, + "grad_norm": 0.734375, + "learning_rate": 0.0004953332627353948, + "loss": 0.1974, + "step": 44710 + }, + { + "epoch": 1.85, + "grad_norm": 0.73046875, + "learning_rate": 0.0004953311768000843, + "loss": 0.1926, + "step": 44720 + }, + { + "epoch": 1.85, + "grad_norm": 0.47265625, + "learning_rate": 0.000495329090403087, + "loss": 0.1778, + "step": 44730 + }, + { + "epoch": 1.85, + "grad_norm": 0.8046875, + "learning_rate": 0.0004953270035444069, + "loss": 0.1957, + "step": 44740 + }, + { + "epoch": 1.85, + "grad_norm": 0.63671875, + "learning_rate": 0.0004953249162240478, + "loss": 0.1857, + "step": 44750 + }, + { + "epoch": 1.85, + "grad_norm": 0.6875, + "learning_rate": 0.0004953228284420137, + "loss": 0.1968, + "step": 44760 + }, + { + "epoch": 1.85, + "grad_norm": 0.359375, + "learning_rate": 0.0004953207401983086, + "loss": 0.2613, + "step": 44770 + }, + { + "epoch": 1.85, + "grad_norm": 1.7421875, + "learning_rate": 0.0004953186514929363, + "loss": 0.2342, + "step": 44780 + }, + { + "epoch": 1.86, + "grad_norm": 0.84765625, + "learning_rate": 0.0004953165623259008, + "loss": 0.262, + "step": 44790 + }, + { + "epoch": 1.86, + "grad_norm": 0.5703125, + "learning_rate": 0.0004953144726972061, + "loss": 0.2947, + "step": 44800 + }, + { + "epoch": 1.86, + "grad_norm": 0.51171875, + "learning_rate": 0.000495312382606856, + "loss": 0.2006, + "step": 44810 + }, + { + "epoch": 1.86, + "grad_norm": 0.396484375, + "learning_rate": 0.0004953102920548545, + "loss": 0.2011, + "step": 44820 + }, + { + "epoch": 1.86, + "grad_norm": 0.55859375, + "learning_rate": 0.0004953082010412055, + "loss": 0.2544, + "step": 44830 + }, + { + "epoch": 1.86, + "grad_norm": 1.3359375, + "learning_rate": 0.0004953061095659129, + "loss": 0.2301, + "step": 44840 + }, + { + "epoch": 1.86, + "grad_norm": 1.046875, + "learning_rate": 0.0004953040176289808, + "loss": 0.2041, + "step": 44850 + }, + { + "epoch": 1.86, + "grad_norm": 0.57421875, + "learning_rate": 0.000495301925230413, + "loss": 0.2749, + "step": 44860 + }, + { + "epoch": 1.86, + "grad_norm": 0.75, + "learning_rate": 0.0004952998323702134, + "loss": 0.2211, + "step": 44870 + }, + { + "epoch": 1.86, + "grad_norm": 0.50390625, + "learning_rate": 0.0004952977390483861, + "loss": 0.2429, + "step": 44880 + }, + { + "epoch": 1.86, + "grad_norm": 0.546875, + "learning_rate": 0.0004952956452649349, + "loss": 0.2398, + "step": 44890 + }, + { + "epoch": 1.86, + "grad_norm": 3.09375, + "learning_rate": 0.0004952935510198637, + "loss": 0.206, + "step": 44900 + }, + { + "epoch": 1.86, + "grad_norm": 1.453125, + "learning_rate": 0.0004952914563131766, + "loss": 0.224, + "step": 44910 + }, + { + "epoch": 1.86, + "grad_norm": 0.376953125, + "learning_rate": 0.0004952893611448775, + "loss": 0.1838, + "step": 44920 + }, + { + "epoch": 1.86, + "grad_norm": 2.125, + "learning_rate": 0.0004952872655149703, + "loss": 0.1794, + "step": 44930 + }, + { + "epoch": 1.86, + "grad_norm": 0.8515625, + "learning_rate": 0.0004952851694234589, + "loss": 0.2688, + "step": 44940 + }, + { + "epoch": 1.86, + "grad_norm": 0.443359375, + "learning_rate": 0.0004952830728703474, + "loss": 0.1989, + "step": 44950 + }, + { + "epoch": 1.86, + "grad_norm": 0.2109375, + "learning_rate": 0.0004952809758556395, + "loss": 0.2225, + "step": 44960 + }, + { + "epoch": 1.86, + "grad_norm": 0.458984375, + "learning_rate": 0.0004952788783793395, + "loss": 0.1707, + "step": 44970 + }, + { + "epoch": 1.86, + "grad_norm": 1.1640625, + "learning_rate": 0.000495276780441451, + "loss": 0.2445, + "step": 44980 + }, + { + "epoch": 1.86, + "grad_norm": 0.69921875, + "learning_rate": 0.0004952746820419781, + "loss": 0.245, + "step": 44990 + }, + { + "epoch": 1.86, + "grad_norm": 0.81640625, + "learning_rate": 0.0004952725831809247, + "loss": 0.2339, + "step": 45000 + }, + { + "epoch": 1.86, + "grad_norm": 0.140625, + "learning_rate": 0.0004952704838582948, + "loss": 0.1689, + "step": 45010 + }, + { + "epoch": 1.86, + "grad_norm": 0.4765625, + "learning_rate": 0.0004952683840740924, + "loss": 0.2491, + "step": 45020 + }, + { + "epoch": 1.87, + "grad_norm": 1.2421875, + "learning_rate": 0.0004952662838283214, + "loss": 0.2085, + "step": 45030 + }, + { + "epoch": 1.87, + "grad_norm": 0.423828125, + "learning_rate": 0.0004952641831209856, + "loss": 0.1871, + "step": 45040 + }, + { + "epoch": 1.87, + "grad_norm": 1.8828125, + "learning_rate": 0.0004952620819520892, + "loss": 0.2263, + "step": 45050 + }, + { + "epoch": 1.87, + "grad_norm": 0.71875, + "learning_rate": 0.000495259980321636, + "loss": 0.2186, + "step": 45060 + }, + { + "epoch": 1.87, + "grad_norm": 0.75, + "learning_rate": 0.0004952578782296301, + "loss": 0.1958, + "step": 45070 + }, + { + "epoch": 1.87, + "grad_norm": 0.59375, + "learning_rate": 0.0004952557756760752, + "loss": 0.2468, + "step": 45080 + }, + { + "epoch": 1.87, + "grad_norm": 0.73828125, + "learning_rate": 0.0004952536726609755, + "loss": 0.1879, + "step": 45090 + }, + { + "epoch": 1.87, + "grad_norm": 2.03125, + "learning_rate": 0.0004952515691843349, + "loss": 0.2318, + "step": 45100 + }, + { + "epoch": 1.87, + "grad_norm": 0.64453125, + "learning_rate": 0.0004952494652461574, + "loss": 0.2371, + "step": 45110 + }, + { + "epoch": 1.87, + "grad_norm": 0.88671875, + "learning_rate": 0.0004952473608464468, + "loss": 0.2354, + "step": 45120 + }, + { + "epoch": 1.87, + "grad_norm": 1.3046875, + "learning_rate": 0.0004952452559852072, + "loss": 0.2136, + "step": 45130 + }, + { + "epoch": 1.87, + "grad_norm": 0.6875, + "learning_rate": 0.0004952431506624424, + "loss": 0.1603, + "step": 45140 + }, + { + "epoch": 1.87, + "grad_norm": 0.62109375, + "learning_rate": 0.0004952410448781565, + "loss": 0.2231, + "step": 45150 + }, + { + "epoch": 1.87, + "grad_norm": 0.443359375, + "learning_rate": 0.0004952389386323535, + "loss": 0.2252, + "step": 45160 + }, + { + "epoch": 1.87, + "grad_norm": 0.408203125, + "learning_rate": 0.0004952368319250373, + "loss": 0.2271, + "step": 45170 + }, + { + "epoch": 1.87, + "grad_norm": 0.578125, + "learning_rate": 0.0004952347247562119, + "loss": 0.2114, + "step": 45180 + }, + { + "epoch": 1.87, + "grad_norm": 0.578125, + "learning_rate": 0.0004952326171258812, + "loss": 0.1812, + "step": 45190 + }, + { + "epoch": 1.87, + "grad_norm": 1.171875, + "learning_rate": 0.0004952305090340491, + "loss": 0.2429, + "step": 45200 + }, + { + "epoch": 1.87, + "grad_norm": 1.0234375, + "learning_rate": 0.0004952284004807197, + "loss": 0.2145, + "step": 45210 + }, + { + "epoch": 1.87, + "grad_norm": 0.94921875, + "learning_rate": 0.0004952262914658971, + "loss": 0.2121, + "step": 45220 + }, + { + "epoch": 1.87, + "grad_norm": 0.67578125, + "learning_rate": 0.0004952241819895848, + "loss": 0.218, + "step": 45230 + }, + { + "epoch": 1.87, + "grad_norm": 0.7734375, + "learning_rate": 0.0004952220720517873, + "loss": 0.1871, + "step": 45240 + }, + { + "epoch": 1.87, + "grad_norm": 2.09375, + "learning_rate": 0.0004952199616525084, + "loss": 0.1919, + "step": 45250 + }, + { + "epoch": 1.87, + "grad_norm": 0.40234375, + "learning_rate": 0.0004952178507917519, + "loss": 0.2278, + "step": 45260 + }, + { + "epoch": 1.88, + "grad_norm": 0.8046875, + "learning_rate": 0.0004952157394695218, + "loss": 0.1999, + "step": 45270 + }, + { + "epoch": 1.88, + "grad_norm": 0.41796875, + "learning_rate": 0.0004952136276858223, + "loss": 0.2051, + "step": 45280 + }, + { + "epoch": 1.88, + "grad_norm": 0.71484375, + "learning_rate": 0.0004952115154406571, + "loss": 0.2135, + "step": 45290 + }, + { + "epoch": 1.88, + "grad_norm": 0.31640625, + "learning_rate": 0.0004952094027340304, + "loss": 0.1683, + "step": 45300 + }, + { + "epoch": 1.88, + "grad_norm": 0.27734375, + "learning_rate": 0.0004952072895659461, + "loss": 0.2229, + "step": 45310 + }, + { + "epoch": 1.88, + "grad_norm": 0.80859375, + "learning_rate": 0.0004952051759364081, + "loss": 0.2462, + "step": 45320 + }, + { + "epoch": 1.88, + "grad_norm": 0.6328125, + "learning_rate": 0.0004952030618454205, + "loss": 0.2422, + "step": 45330 + }, + { + "epoch": 1.88, + "grad_norm": 0.80078125, + "learning_rate": 0.0004952009472929871, + "loss": 0.1991, + "step": 45340 + }, + { + "epoch": 1.88, + "grad_norm": 0.2236328125, + "learning_rate": 0.0004951988322791121, + "loss": 0.2089, + "step": 45350 + }, + { + "epoch": 1.88, + "grad_norm": 4.46875, + "learning_rate": 0.0004951967168037994, + "loss": 0.2536, + "step": 45360 + }, + { + "epoch": 1.88, + "grad_norm": 1.6328125, + "learning_rate": 0.000495194600867053, + "loss": 0.2529, + "step": 45370 + }, + { + "epoch": 1.88, + "grad_norm": 0.75390625, + "learning_rate": 0.0004951924844688767, + "loss": 0.2603, + "step": 45380 + }, + { + "epoch": 1.88, + "grad_norm": 0.53125, + "learning_rate": 0.0004951903676092747, + "loss": 0.2606, + "step": 45390 + }, + { + "epoch": 1.88, + "grad_norm": 0.4296875, + "learning_rate": 0.0004951882502882509, + "loss": 0.2183, + "step": 45400 + }, + { + "epoch": 1.88, + "grad_norm": 0.54296875, + "learning_rate": 0.0004951861325058093, + "loss": 0.1934, + "step": 45410 + }, + { + "epoch": 1.88, + "grad_norm": 0.765625, + "learning_rate": 0.0004951840142619538, + "loss": 0.2227, + "step": 45420 + }, + { + "epoch": 1.88, + "grad_norm": 1.0546875, + "learning_rate": 0.0004951818955566886, + "loss": 0.1898, + "step": 45430 + }, + { + "epoch": 1.88, + "grad_norm": 0.8125, + "learning_rate": 0.0004951797763900175, + "loss": 0.2063, + "step": 45440 + }, + { + "epoch": 1.88, + "grad_norm": 0.61328125, + "learning_rate": 0.0004951776567619444, + "loss": 0.2031, + "step": 45450 + }, + { + "epoch": 1.88, + "grad_norm": 0.95703125, + "learning_rate": 0.0004951755366724735, + "loss": 0.2215, + "step": 45460 + }, + { + "epoch": 1.88, + "grad_norm": 0.7421875, + "learning_rate": 0.0004951734161216089, + "loss": 0.2828, + "step": 45470 + }, + { + "epoch": 1.88, + "grad_norm": 0.77734375, + "learning_rate": 0.0004951712951093541, + "loss": 0.2082, + "step": 45480 + }, + { + "epoch": 1.88, + "grad_norm": 0.490234375, + "learning_rate": 0.0004951691736357136, + "loss": 0.2904, + "step": 45490 + }, + { + "epoch": 1.88, + "grad_norm": 0.6328125, + "learning_rate": 0.0004951670517006912, + "loss": 0.2252, + "step": 45500 + }, + { + "epoch": 1.89, + "grad_norm": 0.60546875, + "learning_rate": 0.0004951649293042908, + "loss": 0.2526, + "step": 45510 + }, + { + "epoch": 1.89, + "grad_norm": 0.9765625, + "learning_rate": 0.0004951628064465165, + "loss": 0.2061, + "step": 45520 + }, + { + "epoch": 1.89, + "grad_norm": 0.53515625, + "learning_rate": 0.0004951606831273723, + "loss": 0.1776, + "step": 45530 + }, + { + "epoch": 1.89, + "grad_norm": 1.4765625, + "learning_rate": 0.0004951585593468621, + "loss": 0.1996, + "step": 45540 + }, + { + "epoch": 1.89, + "grad_norm": 0.54296875, + "learning_rate": 0.0004951564351049901, + "loss": 0.187, + "step": 45550 + }, + { + "epoch": 1.89, + "grad_norm": 0.4609375, + "learning_rate": 0.0004951543104017601, + "loss": 0.1927, + "step": 45560 + }, + { + "epoch": 1.89, + "grad_norm": 0.8984375, + "learning_rate": 0.0004951521852371761, + "loss": 0.1802, + "step": 45570 + }, + { + "epoch": 1.89, + "grad_norm": 0.66015625, + "learning_rate": 0.0004951500596112422, + "loss": 0.1753, + "step": 45580 + }, + { + "epoch": 1.89, + "grad_norm": 0.0, + "learning_rate": 0.0004951479335239624, + "loss": 0.222, + "step": 45590 + }, + { + "epoch": 1.89, + "grad_norm": 1.1015625, + "learning_rate": 0.0004951458069753406, + "loss": 0.2526, + "step": 45600 + }, + { + "epoch": 1.89, + "grad_norm": 0.376953125, + "learning_rate": 0.0004951436799653808, + "loss": 0.2075, + "step": 45610 + }, + { + "epoch": 1.89, + "grad_norm": 1.3671875, + "learning_rate": 0.0004951415524940873, + "loss": 0.1669, + "step": 45620 + }, + { + "epoch": 1.89, + "grad_norm": 0.484375, + "learning_rate": 0.0004951394245614637, + "loss": 0.2027, + "step": 45630 + }, + { + "epoch": 1.89, + "grad_norm": 0.458984375, + "learning_rate": 0.0004951372961675142, + "loss": 0.2008, + "step": 45640 + }, + { + "epoch": 1.89, + "grad_norm": 0.64453125, + "learning_rate": 0.0004951351673122428, + "loss": 0.2652, + "step": 45650 + }, + { + "epoch": 1.89, + "grad_norm": 1.0859375, + "learning_rate": 0.0004951330379956535, + "loss": 0.2054, + "step": 45660 + }, + { + "epoch": 1.89, + "grad_norm": 0.6171875, + "learning_rate": 0.0004951309082177503, + "loss": 0.2443, + "step": 45670 + }, + { + "epoch": 1.89, + "grad_norm": 0.2314453125, + "learning_rate": 0.0004951287779785371, + "loss": 0.221, + "step": 45680 + }, + { + "epoch": 1.89, + "grad_norm": 1.9140625, + "learning_rate": 0.0004951266472780181, + "loss": 0.2148, + "step": 45690 + }, + { + "epoch": 1.89, + "grad_norm": 0.6953125, + "learning_rate": 0.0004951245161161972, + "loss": 0.2465, + "step": 45700 + }, + { + "epoch": 1.89, + "grad_norm": 1.140625, + "learning_rate": 0.0004951223844930785, + "loss": 0.2995, + "step": 45710 + }, + { + "epoch": 1.89, + "grad_norm": 0.232421875, + "learning_rate": 0.0004951202524086658, + "loss": 0.2289, + "step": 45720 + }, + { + "epoch": 1.89, + "grad_norm": 0.60546875, + "learning_rate": 0.0004951181198629634, + "loss": 0.1902, + "step": 45730 + }, + { + "epoch": 1.89, + "grad_norm": 0.6484375, + "learning_rate": 0.0004951159868559752, + "loss": 0.2424, + "step": 45740 + }, + { + "epoch": 1.89, + "grad_norm": 0.33984375, + "learning_rate": 0.000495113853387705, + "loss": 0.1497, + "step": 45750 + }, + { + "epoch": 1.9, + "grad_norm": 0.30078125, + "learning_rate": 0.0004951117194581571, + "loss": 0.2164, + "step": 45760 + }, + { + "epoch": 1.9, + "grad_norm": 0.546875, + "learning_rate": 0.0004951095850673355, + "loss": 0.1761, + "step": 45770 + }, + { + "epoch": 1.9, + "grad_norm": 0.322265625, + "learning_rate": 0.0004951074502152439, + "loss": 0.2076, + "step": 45780 + }, + { + "epoch": 1.9, + "grad_norm": 2.03125, + "learning_rate": 0.0004951053149018867, + "loss": 0.1829, + "step": 45790 + }, + { + "epoch": 1.9, + "grad_norm": 0.5390625, + "learning_rate": 0.0004951031791272678, + "loss": 0.1894, + "step": 45800 + }, + { + "epoch": 1.9, + "grad_norm": 0.54296875, + "learning_rate": 0.0004951010428913911, + "loss": 0.2705, + "step": 45810 + }, + { + "epoch": 1.9, + "grad_norm": 0.0, + "learning_rate": 0.0004950989061942608, + "loss": 0.2334, + "step": 45820 + }, + { + "epoch": 1.9, + "grad_norm": 0.56640625, + "learning_rate": 0.0004950967690358808, + "loss": 0.2491, + "step": 45830 + }, + { + "epoch": 1.9, + "grad_norm": 1.3046875, + "learning_rate": 0.0004950946314162551, + "loss": 0.2111, + "step": 45840 + }, + { + "epoch": 1.9, + "grad_norm": 0.3984375, + "learning_rate": 0.0004950924933353878, + "loss": 0.2307, + "step": 45850 + }, + { + "epoch": 1.9, + "grad_norm": 0.6328125, + "learning_rate": 0.0004950903547932829, + "loss": 0.2109, + "step": 45860 + }, + { + "epoch": 1.9, + "grad_norm": 0.70703125, + "learning_rate": 0.0004950882157899444, + "loss": 0.2225, + "step": 45870 + }, + { + "epoch": 1.9, + "grad_norm": 1.453125, + "learning_rate": 0.0004950860763253763, + "loss": 0.2136, + "step": 45880 + }, + { + "epoch": 1.9, + "grad_norm": 0.240234375, + "learning_rate": 0.0004950839363995827, + "loss": 0.2847, + "step": 45890 + }, + { + "epoch": 1.9, + "grad_norm": 0.69921875, + "learning_rate": 0.0004950817960125677, + "loss": 0.2307, + "step": 45900 + }, + { + "epoch": 1.9, + "grad_norm": 0.87890625, + "learning_rate": 0.000495079655164335, + "loss": 0.2482, + "step": 45910 + }, + { + "epoch": 1.9, + "grad_norm": 0.466796875, + "learning_rate": 0.0004950775138548891, + "loss": 0.2375, + "step": 45920 + }, + { + "epoch": 1.9, + "grad_norm": 0.5546875, + "learning_rate": 0.0004950753720842337, + "loss": 0.2304, + "step": 45930 + }, + { + "epoch": 1.9, + "grad_norm": 0.4375, + "learning_rate": 0.0004950732298523729, + "loss": 0.1987, + "step": 45940 + }, + { + "epoch": 1.9, + "grad_norm": 0.6796875, + "learning_rate": 0.0004950710871593107, + "loss": 0.2325, + "step": 45950 + }, + { + "epoch": 1.9, + "grad_norm": 2.453125, + "learning_rate": 0.0004950689440050512, + "loss": 0.2463, + "step": 45960 + }, + { + "epoch": 1.9, + "grad_norm": 1.78125, + "learning_rate": 0.0004950668003895985, + "loss": 0.2282, + "step": 45970 + }, + { + "epoch": 1.9, + "grad_norm": 0.482421875, + "learning_rate": 0.0004950646563129565, + "loss": 0.1856, + "step": 45980 + }, + { + "epoch": 1.9, + "grad_norm": 0.640625, + "learning_rate": 0.0004950625117751292, + "loss": 0.2082, + "step": 45990 + }, + { + "epoch": 1.91, + "grad_norm": 1.8828125, + "learning_rate": 0.0004950603667761208, + "loss": 0.1976, + "step": 46000 + }, + { + "epoch": 1.91, + "grad_norm": 0.54296875, + "learning_rate": 0.0004950582213159352, + "loss": 0.2325, + "step": 46010 + }, + { + "epoch": 1.91, + "grad_norm": 0.55078125, + "learning_rate": 0.0004950560753945766, + "loss": 0.2071, + "step": 46020 + }, + { + "epoch": 1.91, + "grad_norm": 0.306640625, + "learning_rate": 0.0004950539290120488, + "loss": 0.1911, + "step": 46030 + }, + { + "epoch": 1.91, + "grad_norm": 0.6328125, + "learning_rate": 0.0004950517821683562, + "loss": 0.2061, + "step": 46040 + }, + { + "epoch": 1.91, + "grad_norm": 1.1640625, + "learning_rate": 0.0004950496348635023, + "loss": 0.2183, + "step": 46050 + }, + { + "epoch": 1.91, + "grad_norm": 0.494140625, + "learning_rate": 0.0004950474870974916, + "loss": 0.1338, + "step": 46060 + }, + { + "epoch": 1.91, + "grad_norm": 0.61328125, + "learning_rate": 0.000495045338870328, + "loss": 0.1866, + "step": 46070 + }, + { + "epoch": 1.91, + "grad_norm": 0.63671875, + "learning_rate": 0.0004950431901820155, + "loss": 0.215, + "step": 46080 + }, + { + "epoch": 1.91, + "grad_norm": 1.109375, + "learning_rate": 0.0004950410410325582, + "loss": 0.2731, + "step": 46090 + }, + { + "epoch": 1.91, + "grad_norm": 1.5859375, + "learning_rate": 0.0004950388914219602, + "loss": 0.2963, + "step": 46100 + }, + { + "epoch": 1.91, + "grad_norm": 1.796875, + "learning_rate": 0.0004950367413502253, + "loss": 0.2189, + "step": 46110 + }, + { + "epoch": 1.91, + "grad_norm": 0.96875, + "learning_rate": 0.0004950345908173578, + "loss": 0.235, + "step": 46120 + }, + { + "epoch": 1.91, + "grad_norm": 0.53515625, + "learning_rate": 0.0004950324398233616, + "loss": 0.2128, + "step": 46130 + }, + { + "epoch": 1.91, + "grad_norm": 0.96875, + "learning_rate": 0.0004950302883682409, + "loss": 0.2331, + "step": 46140 + }, + { + "epoch": 1.91, + "grad_norm": 0.37890625, + "learning_rate": 0.0004950281364519997, + "loss": 0.2059, + "step": 46150 + }, + { + "epoch": 1.91, + "grad_norm": 0.89453125, + "learning_rate": 0.0004950259840746419, + "loss": 0.1896, + "step": 46160 + }, + { + "epoch": 1.91, + "grad_norm": 0.8984375, + "learning_rate": 0.0004950238312361717, + "loss": 0.1957, + "step": 46170 + }, + { + "epoch": 1.91, + "grad_norm": 0.7421875, + "learning_rate": 0.0004950216779365932, + "loss": 0.2029, + "step": 46180 + }, + { + "epoch": 1.91, + "grad_norm": 0.6328125, + "learning_rate": 0.0004950195241759102, + "loss": 0.2123, + "step": 46190 + }, + { + "epoch": 1.91, + "grad_norm": 0.48828125, + "learning_rate": 0.000495017369954127, + "loss": 0.1979, + "step": 46200 + }, + { + "epoch": 1.91, + "grad_norm": 0.38671875, + "learning_rate": 0.0004950152152712475, + "loss": 0.2659, + "step": 46210 + }, + { + "epoch": 1.91, + "grad_norm": 1.3359375, + "learning_rate": 0.0004950130601272759, + "loss": 0.2747, + "step": 46220 + }, + { + "epoch": 1.91, + "grad_norm": 0.482421875, + "learning_rate": 0.0004950109045222162, + "loss": 0.1854, + "step": 46230 + }, + { + "epoch": 1.92, + "grad_norm": 1.2265625, + "learning_rate": 0.0004950087484560723, + "loss": 0.2766, + "step": 46240 + }, + { + "epoch": 1.92, + "grad_norm": 0.5234375, + "learning_rate": 0.0004950065919288486, + "loss": 0.2172, + "step": 46250 + }, + { + "epoch": 1.92, + "grad_norm": 0.478515625, + "learning_rate": 0.0004950044349405488, + "loss": 0.162, + "step": 46260 + }, + { + "epoch": 1.92, + "grad_norm": 1.6015625, + "learning_rate": 0.0004950022774911771, + "loss": 0.2125, + "step": 46270 + }, + { + "epoch": 1.92, + "grad_norm": 0.63671875, + "learning_rate": 0.0004950001195807377, + "loss": 0.2237, + "step": 46280 + }, + { + "epoch": 1.92, + "grad_norm": 1.4296875, + "learning_rate": 0.0004949979612092344, + "loss": 0.1529, + "step": 46290 + }, + { + "epoch": 1.92, + "grad_norm": 0.6640625, + "learning_rate": 0.0004949958023766714, + "loss": 0.2296, + "step": 46300 + }, + { + "epoch": 1.92, + "grad_norm": 0.59765625, + "learning_rate": 0.0004949936430830528, + "loss": 0.224, + "step": 46310 + }, + { + "epoch": 1.92, + "grad_norm": 0.37890625, + "learning_rate": 0.0004949914833283827, + "loss": 0.1948, + "step": 46320 + }, + { + "epoch": 1.92, + "grad_norm": 0.9921875, + "learning_rate": 0.000494989323112665, + "loss": 0.2165, + "step": 46330 + }, + { + "epoch": 1.92, + "grad_norm": 1.3125, + "learning_rate": 0.0004949871624359038, + "loss": 0.18, + "step": 46340 + }, + { + "epoch": 1.92, + "grad_norm": 0.2578125, + "learning_rate": 0.0004949850012981033, + "loss": 0.2608, + "step": 46350 + }, + { + "epoch": 1.92, + "grad_norm": 0.9140625, + "learning_rate": 0.0004949828396992675, + "loss": 0.2633, + "step": 46360 + }, + { + "epoch": 1.92, + "grad_norm": 1.90625, + "learning_rate": 0.0004949806776394004, + "loss": 0.2338, + "step": 46370 + }, + { + "epoch": 1.92, + "grad_norm": 0.51953125, + "learning_rate": 0.0004949785151185062, + "loss": 0.2231, + "step": 46380 + }, + { + "epoch": 1.92, + "grad_norm": 0.5, + "learning_rate": 0.0004949763521365887, + "loss": 0.2529, + "step": 46390 + }, + { + "epoch": 1.92, + "grad_norm": 0.7734375, + "learning_rate": 0.0004949741886936524, + "loss": 0.2192, + "step": 46400 + }, + { + "epoch": 1.92, + "grad_norm": 0.373046875, + "learning_rate": 0.000494972024789701, + "loss": 0.2505, + "step": 46410 + }, + { + "epoch": 1.92, + "grad_norm": 0.69140625, + "learning_rate": 0.0004949698604247387, + "loss": 0.2232, + "step": 46420 + }, + { + "epoch": 1.92, + "grad_norm": 0.33203125, + "learning_rate": 0.0004949676955987697, + "loss": 0.2244, + "step": 46430 + }, + { + "epoch": 1.92, + "grad_norm": 0.3984375, + "learning_rate": 0.0004949655303117978, + "loss": 0.2253, + "step": 46440 + }, + { + "epoch": 1.92, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004949633645638272, + "loss": 0.2685, + "step": 46450 + }, + { + "epoch": 1.92, + "grad_norm": 0.6484375, + "learning_rate": 0.0004949611983548621, + "loss": 0.2142, + "step": 46460 + }, + { + "epoch": 1.92, + "grad_norm": 1.640625, + "learning_rate": 0.0004949590316849064, + "loss": 0.2475, + "step": 46470 + }, + { + "epoch": 1.93, + "grad_norm": 1.484375, + "learning_rate": 0.0004949568645539644, + "loss": 0.1797, + "step": 46480 + }, + { + "epoch": 1.93, + "grad_norm": 0.76171875, + "learning_rate": 0.00049495469696204, + "loss": 0.2523, + "step": 46490 + }, + { + "epoch": 1.93, + "grad_norm": 0.51953125, + "learning_rate": 0.0004949525289091372, + "loss": 0.1976, + "step": 46500 + }, + { + "epoch": 1.93, + "grad_norm": 0.859375, + "learning_rate": 0.0004949503603952603, + "loss": 0.2209, + "step": 46510 + }, + { + "epoch": 1.93, + "grad_norm": 0.7265625, + "learning_rate": 0.0004949481914204131, + "loss": 0.2363, + "step": 46520 + }, + { + "epoch": 1.93, + "grad_norm": 0.54296875, + "learning_rate": 0.0004949460219846, + "loss": 0.2378, + "step": 46530 + }, + { + "epoch": 1.93, + "grad_norm": 0.7421875, + "learning_rate": 0.000494943852087825, + "loss": 0.2644, + "step": 46540 + }, + { + "epoch": 1.93, + "grad_norm": 0.60546875, + "learning_rate": 0.000494941681730092, + "loss": 0.2739, + "step": 46550 + }, + { + "epoch": 1.93, + "grad_norm": 0.41796875, + "learning_rate": 0.0004949395109114053, + "loss": 0.219, + "step": 46560 + }, + { + "epoch": 1.93, + "grad_norm": 1.0546875, + "learning_rate": 0.0004949373396317688, + "loss": 0.2384, + "step": 46570 + }, + { + "epoch": 1.93, + "grad_norm": 0.734375, + "learning_rate": 0.0004949351678911868, + "loss": 0.2522, + "step": 46580 + }, + { + "epoch": 1.93, + "grad_norm": 0.4765625, + "learning_rate": 0.0004949329956896631, + "loss": 0.2475, + "step": 46590 + }, + { + "epoch": 1.93, + "grad_norm": 0.0, + "learning_rate": 0.000494930823027202, + "loss": 0.2511, + "step": 46600 + }, + { + "epoch": 1.93, + "grad_norm": 0.255859375, + "learning_rate": 0.0004949286499038076, + "loss": 0.2501, + "step": 46610 + }, + { + "epoch": 1.93, + "grad_norm": 1.1171875, + "learning_rate": 0.000494926476319484, + "loss": 0.275, + "step": 46620 + }, + { + "epoch": 1.93, + "grad_norm": 0.546875, + "learning_rate": 0.000494924302274235, + "loss": 0.2559, + "step": 46630 + }, + { + "epoch": 1.93, + "grad_norm": 0.57421875, + "learning_rate": 0.0004949221277680651, + "loss": 0.2224, + "step": 46640 + }, + { + "epoch": 1.93, + "grad_norm": 1.265625, + "learning_rate": 0.0004949199528009781, + "loss": 0.1925, + "step": 46650 + }, + { + "epoch": 1.93, + "grad_norm": 0.5078125, + "learning_rate": 0.0004949177773729783, + "loss": 0.1857, + "step": 46660 + }, + { + "epoch": 1.93, + "grad_norm": 1.359375, + "learning_rate": 0.0004949156014840697, + "loss": 0.2329, + "step": 46670 + }, + { + "epoch": 1.93, + "grad_norm": 0.640625, + "learning_rate": 0.0004949134251342563, + "loss": 0.2251, + "step": 46680 + }, + { + "epoch": 1.93, + "grad_norm": 0.0, + "learning_rate": 0.0004949112483235423, + "loss": 0.2388, + "step": 46690 + }, + { + "epoch": 1.93, + "grad_norm": 0.2080078125, + "learning_rate": 0.0004949090710519317, + "loss": 0.2138, + "step": 46700 + }, + { + "epoch": 1.93, + "grad_norm": 0.57421875, + "learning_rate": 0.0004949068933194289, + "loss": 0.1994, + "step": 46710 + }, + { + "epoch": 1.94, + "grad_norm": 0.474609375, + "learning_rate": 0.0004949047151260375, + "loss": 0.2329, + "step": 46720 + }, + { + "epoch": 1.94, + "grad_norm": 1.2421875, + "learning_rate": 0.0004949025364717621, + "loss": 0.2026, + "step": 46730 + }, + { + "epoch": 1.94, + "grad_norm": 0.396484375, + "learning_rate": 0.0004949003573566065, + "loss": 0.2817, + "step": 46740 + }, + { + "epoch": 1.94, + "grad_norm": 0.75390625, + "learning_rate": 0.0004948981777805748, + "loss": 0.243, + "step": 46750 + }, + { + "epoch": 1.94, + "grad_norm": 0.2373046875, + "learning_rate": 0.0004948959977436712, + "loss": 0.2324, + "step": 46760 + }, + { + "epoch": 1.94, + "grad_norm": 1.1640625, + "learning_rate": 0.0004948938172458999, + "loss": 0.1978, + "step": 46770 + }, + { + "epoch": 1.94, + "grad_norm": 0.671875, + "learning_rate": 0.0004948916362872648, + "loss": 0.2194, + "step": 46780 + }, + { + "epoch": 1.94, + "grad_norm": 0.83984375, + "learning_rate": 0.0004948894548677701, + "loss": 0.198, + "step": 46790 + }, + { + "epoch": 1.94, + "grad_norm": 0.71875, + "learning_rate": 0.0004948872729874197, + "loss": 0.2495, + "step": 46800 + }, + { + "epoch": 1.94, + "grad_norm": 0.69921875, + "learning_rate": 0.0004948850906462181, + "loss": 0.1812, + "step": 46810 + }, + { + "epoch": 1.94, + "grad_norm": 0.57421875, + "learning_rate": 0.0004948829078441692, + "loss": 0.1923, + "step": 46820 + }, + { + "epoch": 1.94, + "grad_norm": 0.55859375, + "learning_rate": 0.0004948807245812771, + "loss": 0.2381, + "step": 46830 + }, + { + "epoch": 1.94, + "grad_norm": 0.93359375, + "learning_rate": 0.000494878540857546, + "loss": 0.2327, + "step": 46840 + }, + { + "epoch": 1.94, + "grad_norm": 1.1953125, + "learning_rate": 0.0004948763566729798, + "loss": 0.2043, + "step": 46850 + }, + { + "epoch": 1.94, + "grad_norm": 0.5625, + "learning_rate": 0.0004948741720275828, + "loss": 0.2103, + "step": 46860 + }, + { + "epoch": 1.94, + "grad_norm": 0.65234375, + "learning_rate": 0.000494871986921359, + "loss": 0.2318, + "step": 46870 + }, + { + "epoch": 1.94, + "grad_norm": 1.0, + "learning_rate": 0.0004948698013543125, + "loss": 0.1933, + "step": 46880 + }, + { + "epoch": 1.94, + "grad_norm": 0.359375, + "learning_rate": 0.0004948676153264476, + "loss": 0.2348, + "step": 46890 + }, + { + "epoch": 1.94, + "grad_norm": 0.62890625, + "learning_rate": 0.0004948654288377683, + "loss": 0.2443, + "step": 46900 + }, + { + "epoch": 1.94, + "grad_norm": 1.109375, + "learning_rate": 0.0004948632418882787, + "loss": 0.2017, + "step": 46910 + }, + { + "epoch": 1.94, + "grad_norm": 0.984375, + "learning_rate": 0.0004948610544779829, + "loss": 0.2482, + "step": 46920 + }, + { + "epoch": 1.94, + "grad_norm": 0.86328125, + "learning_rate": 0.000494858866606885, + "loss": 0.2571, + "step": 46930 + }, + { + "epoch": 1.94, + "grad_norm": 0.361328125, + "learning_rate": 0.0004948566782749892, + "loss": 0.2302, + "step": 46940 + }, + { + "epoch": 1.94, + "grad_norm": 0.2734375, + "learning_rate": 0.0004948544894822995, + "loss": 0.2543, + "step": 46950 + }, + { + "epoch": 1.95, + "grad_norm": 0.69140625, + "learning_rate": 0.0004948523002288203, + "loss": 0.2115, + "step": 46960 + }, + { + "epoch": 1.95, + "grad_norm": 0.5703125, + "learning_rate": 0.0004948501105145553, + "loss": 0.227, + "step": 46970 + }, + { + "epoch": 1.95, + "grad_norm": 0.734375, + "learning_rate": 0.0004948479203395089, + "loss": 0.2494, + "step": 46980 + }, + { + "epoch": 1.95, + "grad_norm": 0.439453125, + "learning_rate": 0.0004948457297036851, + "loss": 0.1791, + "step": 46990 + }, + { + "epoch": 1.95, + "grad_norm": 0.328125, + "learning_rate": 0.0004948435386070882, + "loss": 0.2351, + "step": 47000 + }, + { + "epoch": 1.95, + "grad_norm": 0.51171875, + "learning_rate": 0.0004948413470497221, + "loss": 0.1721, + "step": 47010 + }, + { + "epoch": 1.95, + "grad_norm": 0.9453125, + "learning_rate": 0.000494839155031591, + "loss": 0.2856, + "step": 47020 + }, + { + "epoch": 1.95, + "grad_norm": 0.384765625, + "learning_rate": 0.0004948369625526991, + "loss": 0.1849, + "step": 47030 + }, + { + "epoch": 1.95, + "grad_norm": 0.404296875, + "learning_rate": 0.0004948347696130505, + "loss": 0.2958, + "step": 47040 + }, + { + "epoch": 1.95, + "grad_norm": 0.74609375, + "learning_rate": 0.0004948325762126493, + "loss": 0.2914, + "step": 47050 + }, + { + "epoch": 1.95, + "grad_norm": 0.25390625, + "learning_rate": 0.0004948303823514997, + "loss": 0.2159, + "step": 47060 + }, + { + "epoch": 1.95, + "grad_norm": 0.69140625, + "learning_rate": 0.0004948281880296056, + "loss": 0.2382, + "step": 47070 + }, + { + "epoch": 1.95, + "grad_norm": 0.8984375, + "learning_rate": 0.0004948259932469714, + "loss": 0.2152, + "step": 47080 + }, + { + "epoch": 1.95, + "grad_norm": 0.4921875, + "learning_rate": 0.0004948237980036012, + "loss": 0.2376, + "step": 47090 + }, + { + "epoch": 1.95, + "grad_norm": 1.2578125, + "learning_rate": 0.0004948216022994989, + "loss": 0.2115, + "step": 47100 + }, + { + "epoch": 1.95, + "grad_norm": 0.53515625, + "learning_rate": 0.0004948194061346688, + "loss": 0.204, + "step": 47110 + }, + { + "epoch": 1.95, + "grad_norm": 0.875, + "learning_rate": 0.0004948172095091151, + "loss": 0.182, + "step": 47120 + }, + { + "epoch": 1.95, + "grad_norm": 0.71875, + "learning_rate": 0.0004948150124228417, + "loss": 0.188, + "step": 47130 + }, + { + "epoch": 1.95, + "grad_norm": 0.515625, + "learning_rate": 0.000494812814875853, + "loss": 0.2406, + "step": 47140 + }, + { + "epoch": 1.95, + "grad_norm": 0.8359375, + "learning_rate": 0.000494810616868153, + "loss": 0.2605, + "step": 47150 + }, + { + "epoch": 1.95, + "grad_norm": 0.90625, + "learning_rate": 0.0004948084183997458, + "loss": 0.2217, + "step": 47160 + }, + { + "epoch": 1.95, + "grad_norm": 0.69921875, + "learning_rate": 0.0004948062194706357, + "loss": 0.2901, + "step": 47170 + }, + { + "epoch": 1.95, + "grad_norm": 0.77734375, + "learning_rate": 0.0004948040200808266, + "loss": 0.2331, + "step": 47180 + }, + { + "epoch": 1.95, + "grad_norm": 0.62890625, + "learning_rate": 0.0004948018202303229, + "loss": 0.2306, + "step": 47190 + }, + { + "epoch": 1.96, + "grad_norm": 1.15625, + "learning_rate": 0.0004947996199191285, + "loss": 0.2015, + "step": 47200 + }, + { + "epoch": 1.96, + "grad_norm": 1.140625, + "learning_rate": 0.0004947974191472478, + "loss": 0.2065, + "step": 47210 + }, + { + "epoch": 1.96, + "grad_norm": 0.384765625, + "learning_rate": 0.0004947952179146846, + "loss": 0.2276, + "step": 47220 + }, + { + "epoch": 1.96, + "grad_norm": 0.70703125, + "learning_rate": 0.0004947930162214433, + "loss": 0.2289, + "step": 47230 + }, + { + "epoch": 1.96, + "grad_norm": 0.9453125, + "learning_rate": 0.0004947908140675282, + "loss": 0.2242, + "step": 47240 + }, + { + "epoch": 1.96, + "grad_norm": 0.6484375, + "learning_rate": 0.000494788611452943, + "loss": 0.2046, + "step": 47250 + }, + { + "epoch": 1.96, + "grad_norm": 1.3359375, + "learning_rate": 0.0004947864083776921, + "loss": 0.195, + "step": 47260 + }, + { + "epoch": 1.96, + "grad_norm": 0.322265625, + "learning_rate": 0.0004947842048417796, + "loss": 0.2153, + "step": 47270 + }, + { + "epoch": 1.96, + "grad_norm": 0.416015625, + "learning_rate": 0.0004947820008452097, + "loss": 0.1993, + "step": 47280 + }, + { + "epoch": 1.96, + "grad_norm": 0.71484375, + "learning_rate": 0.0004947797963879865, + "loss": 0.2119, + "step": 47290 + }, + { + "epoch": 1.96, + "grad_norm": 0.8671875, + "learning_rate": 0.000494777591470114, + "loss": 0.2173, + "step": 47300 + }, + { + "epoch": 1.96, + "grad_norm": 1.3984375, + "learning_rate": 0.0004947753860915968, + "loss": 0.2539, + "step": 47310 + }, + { + "epoch": 1.96, + "grad_norm": 0.2490234375, + "learning_rate": 0.0004947731802524385, + "loss": 0.2189, + "step": 47320 + }, + { + "epoch": 1.96, + "grad_norm": 0.365234375, + "learning_rate": 0.0004947709739526437, + "loss": 0.2338, + "step": 47330 + }, + { + "epoch": 1.96, + "grad_norm": 0.421875, + "learning_rate": 0.0004947687671922163, + "loss": 0.2655, + "step": 47340 + }, + { + "epoch": 1.96, + "grad_norm": 0.5546875, + "learning_rate": 0.0004947665599711605, + "loss": 0.2284, + "step": 47350 + }, + { + "epoch": 1.96, + "grad_norm": 2.25, + "learning_rate": 0.0004947643522894803, + "loss": 0.2521, + "step": 47360 + }, + { + "epoch": 1.96, + "grad_norm": 0.54296875, + "learning_rate": 0.0004947621441471802, + "loss": 0.2137, + "step": 47370 + }, + { + "epoch": 1.96, + "grad_norm": 0.609375, + "learning_rate": 0.0004947599355442642, + "loss": 0.2098, + "step": 47380 + }, + { + "epoch": 1.96, + "grad_norm": 0.57421875, + "learning_rate": 0.0004947577264807364, + "loss": 0.2221, + "step": 47390 + }, + { + "epoch": 1.96, + "grad_norm": 0.65234375, + "learning_rate": 0.0004947555169566009, + "loss": 0.1779, + "step": 47400 + }, + { + "epoch": 1.96, + "grad_norm": 0.421875, + "learning_rate": 0.0004947533069718621, + "loss": 0.2085, + "step": 47410 + }, + { + "epoch": 1.96, + "grad_norm": 0.6328125, + "learning_rate": 0.000494751096526524, + "loss": 0.2256, + "step": 47420 + }, + { + "epoch": 1.96, + "grad_norm": 0.4765625, + "learning_rate": 0.0004947488856205907, + "loss": 0.2319, + "step": 47430 + }, + { + "epoch": 1.96, + "grad_norm": 1.0625, + "learning_rate": 0.0004947466742540664, + "loss": 0.2138, + "step": 47440 + }, + { + "epoch": 1.97, + "grad_norm": 0.9375, + "learning_rate": 0.0004947444624269553, + "loss": 0.2133, + "step": 47450 + }, + { + "epoch": 1.97, + "grad_norm": 0.37890625, + "learning_rate": 0.0004947422501392616, + "loss": 0.2613, + "step": 47460 + }, + { + "epoch": 1.97, + "grad_norm": 0.6484375, + "learning_rate": 0.0004947400373909894, + "loss": 0.1776, + "step": 47470 + }, + { + "epoch": 1.97, + "grad_norm": 0.298828125, + "learning_rate": 0.0004947378241821429, + "loss": 0.2339, + "step": 47480 + }, + { + "epoch": 1.97, + "grad_norm": 0.859375, + "learning_rate": 0.0004947356105127261, + "loss": 0.2308, + "step": 47490 + }, + { + "epoch": 1.97, + "grad_norm": 0.96875, + "learning_rate": 0.0004947333963827435, + "loss": 0.2348, + "step": 47500 + }, + { + "epoch": 1.97, + "grad_norm": 0.45703125, + "learning_rate": 0.000494731181792199, + "loss": 0.2173, + "step": 47510 + }, + { + "epoch": 1.97, + "grad_norm": 0.474609375, + "learning_rate": 0.0004947289667410968, + "loss": 0.2403, + "step": 47520 + }, + { + "epoch": 1.97, + "grad_norm": 0.421875, + "learning_rate": 0.0004947267512294412, + "loss": 0.2291, + "step": 47530 + }, + { + "epoch": 1.97, + "grad_norm": 0.48828125, + "learning_rate": 0.0004947245352572362, + "loss": 0.2197, + "step": 47540 + }, + { + "epoch": 1.97, + "grad_norm": 1.484375, + "learning_rate": 0.0004947223188244861, + "loss": 0.215, + "step": 47550 + }, + { + "epoch": 1.97, + "grad_norm": 0.4296875, + "learning_rate": 0.000494720101931195, + "loss": 0.2694, + "step": 47560 + }, + { + "epoch": 1.97, + "grad_norm": 0.796875, + "learning_rate": 0.0004947178845773671, + "loss": 0.2067, + "step": 47570 + }, + { + "epoch": 1.97, + "grad_norm": 0.6328125, + "learning_rate": 0.0004947156667630066, + "loss": 0.2187, + "step": 47580 + }, + { + "epoch": 1.97, + "grad_norm": 0.494140625, + "learning_rate": 0.0004947134484881177, + "loss": 0.2134, + "step": 47590 + }, + { + "epoch": 1.97, + "grad_norm": 0.984375, + "learning_rate": 0.0004947112297527043, + "loss": 0.2748, + "step": 47600 + }, + { + "epoch": 1.97, + "grad_norm": 0.248046875, + "learning_rate": 0.000494709010556771, + "loss": 0.2608, + "step": 47610 + }, + { + "epoch": 1.97, + "grad_norm": 0.5703125, + "learning_rate": 0.0004947067909003217, + "loss": 0.2342, + "step": 47620 + }, + { + "epoch": 1.97, + "grad_norm": 0.421875, + "learning_rate": 0.0004947045707833606, + "loss": 0.2213, + "step": 47630 + }, + { + "epoch": 1.97, + "grad_norm": 0.921875, + "learning_rate": 0.0004947023502058919, + "loss": 0.1901, + "step": 47640 + }, + { + "epoch": 1.97, + "grad_norm": 0.63671875, + "learning_rate": 0.00049470012916792, + "loss": 0.2026, + "step": 47650 + }, + { + "epoch": 1.97, + "grad_norm": 0.875, + "learning_rate": 0.0004946979076694487, + "loss": 0.2225, + "step": 47660 + }, + { + "epoch": 1.97, + "grad_norm": 1.8984375, + "learning_rate": 0.0004946956857104824, + "loss": 0.2581, + "step": 47670 + }, + { + "epoch": 1.97, + "grad_norm": 0.28125, + "learning_rate": 0.0004946934632910253, + "loss": 0.2145, + "step": 47680 + }, + { + "epoch": 1.98, + "grad_norm": 0.4765625, + "learning_rate": 0.0004946912404110815, + "loss": 0.22, + "step": 47690 + }, + { + "epoch": 1.98, + "grad_norm": 0.76171875, + "learning_rate": 0.0004946890170706552, + "loss": 0.2137, + "step": 47700 + }, + { + "epoch": 1.98, + "grad_norm": 0.56640625, + "learning_rate": 0.0004946867932697505, + "loss": 0.2377, + "step": 47710 + }, + { + "epoch": 1.98, + "grad_norm": 0.69140625, + "learning_rate": 0.0004946845690083718, + "loss": 0.2125, + "step": 47720 + }, + { + "epoch": 1.98, + "grad_norm": 0.99609375, + "learning_rate": 0.0004946823442865231, + "loss": 0.2209, + "step": 47730 + }, + { + "epoch": 1.98, + "grad_norm": 0.59765625, + "learning_rate": 0.0004946801191042088, + "loss": 0.2022, + "step": 47740 + }, + { + "epoch": 1.98, + "grad_norm": 0.1943359375, + "learning_rate": 0.0004946778934614328, + "loss": 0.2323, + "step": 47750 + }, + { + "epoch": 1.98, + "grad_norm": 0.55078125, + "learning_rate": 0.0004946756673581994, + "loss": 0.226, + "step": 47760 + }, + { + "epoch": 1.98, + "grad_norm": 0.5703125, + "learning_rate": 0.0004946734407945129, + "loss": 0.1827, + "step": 47770 + }, + { + "epoch": 1.98, + "grad_norm": 0.85546875, + "learning_rate": 0.0004946712137703773, + "loss": 0.2676, + "step": 47780 + }, + { + "epoch": 1.98, + "grad_norm": 0.482421875, + "learning_rate": 0.000494668986285797, + "loss": 0.1913, + "step": 47790 + }, + { + "epoch": 1.98, + "grad_norm": 0.51171875, + "learning_rate": 0.0004946667583407761, + "loss": 0.2006, + "step": 47800 + }, + { + "epoch": 1.98, + "grad_norm": 1.078125, + "learning_rate": 0.0004946645299353187, + "loss": 0.1687, + "step": 47810 + }, + { + "epoch": 1.98, + "grad_norm": 0.73828125, + "learning_rate": 0.0004946623010694291, + "loss": 0.1983, + "step": 47820 + }, + { + "epoch": 1.98, + "grad_norm": 1.015625, + "learning_rate": 0.0004946600717431115, + "loss": 0.2605, + "step": 47830 + }, + { + "epoch": 1.98, + "grad_norm": 0.70703125, + "learning_rate": 0.0004946578419563701, + "loss": 0.2547, + "step": 47840 + }, + { + "epoch": 1.98, + "grad_norm": 0.828125, + "learning_rate": 0.000494655611709209, + "loss": 0.2245, + "step": 47850 + }, + { + "epoch": 1.98, + "grad_norm": 0.80859375, + "learning_rate": 0.0004946533810016324, + "loss": 0.2224, + "step": 47860 + }, + { + "epoch": 1.98, + "grad_norm": 0.494140625, + "learning_rate": 0.0004946511498336447, + "loss": 0.2273, + "step": 47870 + }, + { + "epoch": 1.98, + "grad_norm": 2.078125, + "learning_rate": 0.0004946489182052499, + "loss": 0.21, + "step": 47880 + }, + { + "epoch": 1.98, + "grad_norm": 1.0859375, + "learning_rate": 0.0004946466861164522, + "loss": 0.2475, + "step": 47890 + }, + { + "epoch": 1.98, + "grad_norm": 1.03125, + "learning_rate": 0.0004946444535672559, + "loss": 0.24, + "step": 47900 + }, + { + "epoch": 1.98, + "grad_norm": 0.0, + "learning_rate": 0.0004946422205576652, + "loss": 0.2885, + "step": 47910 + }, + { + "epoch": 1.98, + "grad_norm": 0.64453125, + "learning_rate": 0.0004946399870876842, + "loss": 0.1566, + "step": 47920 + }, + { + "epoch": 1.99, + "grad_norm": 0.78125, + "learning_rate": 0.0004946377531573171, + "loss": 0.2706, + "step": 47930 + }, + { + "epoch": 1.99, + "grad_norm": 1.046875, + "learning_rate": 0.0004946355187665683, + "loss": 0.2698, + "step": 47940 + }, + { + "epoch": 1.99, + "grad_norm": 0.66796875, + "learning_rate": 0.0004946332839154419, + "loss": 0.227, + "step": 47950 + }, + { + "epoch": 1.99, + "grad_norm": 0.53125, + "learning_rate": 0.000494631048603942, + "loss": 0.2307, + "step": 47960 + }, + { + "epoch": 1.99, + "grad_norm": 0.55078125, + "learning_rate": 0.0004946288128320729, + "loss": 0.2276, + "step": 47970 + }, + { + "epoch": 1.99, + "grad_norm": 0.421875, + "learning_rate": 0.0004946265765998388, + "loss": 0.207, + "step": 47980 + }, + { + "epoch": 1.99, + "grad_norm": 0.373046875, + "learning_rate": 0.0004946243399072439, + "loss": 0.1841, + "step": 47990 + }, + { + "epoch": 1.99, + "grad_norm": 0.41015625, + "learning_rate": 0.0004946221027542923, + "loss": 0.2339, + "step": 48000 + }, + { + "epoch": 1.99, + "grad_norm": 0.25390625, + "learning_rate": 0.0004946198651409884, + "loss": 0.2165, + "step": 48010 + }, + { + "epoch": 1.99, + "grad_norm": 1.90625, + "learning_rate": 0.0004946176270673364, + "loss": 0.2022, + "step": 48020 + }, + { + "epoch": 1.99, + "grad_norm": 0.5703125, + "learning_rate": 0.0004946153885333403, + "loss": 0.2038, + "step": 48030 + }, + { + "epoch": 1.99, + "grad_norm": 0.455078125, + "learning_rate": 0.0004946131495390046, + "loss": 0.2059, + "step": 48040 + }, + { + "epoch": 1.99, + "grad_norm": 0.60546875, + "learning_rate": 0.0004946109100843332, + "loss": 0.2156, + "step": 48050 + }, + { + "epoch": 1.99, + "grad_norm": 0.88671875, + "learning_rate": 0.0004946086701693306, + "loss": 0.1869, + "step": 48060 + }, + { + "epoch": 1.99, + "grad_norm": 0.5078125, + "learning_rate": 0.0004946064297940009, + "loss": 0.2303, + "step": 48070 + }, + { + "epoch": 1.99, + "grad_norm": 0.625, + "learning_rate": 0.0004946041889583484, + "loss": 0.2298, + "step": 48080 + }, + { + "epoch": 1.99, + "grad_norm": 0.45703125, + "learning_rate": 0.0004946019476623771, + "loss": 0.1896, + "step": 48090 + }, + { + "epoch": 1.99, + "grad_norm": 0.60546875, + "learning_rate": 0.0004945997059060914, + "loss": 0.2256, + "step": 48100 + }, + { + "epoch": 1.99, + "grad_norm": 0.609375, + "learning_rate": 0.0004945974636894954, + "loss": 0.2093, + "step": 48110 + }, + { + "epoch": 1.99, + "grad_norm": 0.69140625, + "learning_rate": 0.0004945952210125934, + "loss": 0.2596, + "step": 48120 + }, + { + "epoch": 1.99, + "grad_norm": 1.0390625, + "learning_rate": 0.0004945929778753897, + "loss": 0.2697, + "step": 48130 + }, + { + "epoch": 1.99, + "grad_norm": 0.64453125, + "learning_rate": 0.0004945907342778883, + "loss": 0.204, + "step": 48140 + }, + { + "epoch": 1.99, + "grad_norm": 0.7109375, + "learning_rate": 0.0004945884902200937, + "loss": 0.2027, + "step": 48150 + }, + { + "epoch": 1.99, + "grad_norm": 0.8125, + "learning_rate": 0.0004945862457020099, + "loss": 0.2143, + "step": 48160 + }, + { + "epoch": 2.0, + "grad_norm": 0.56640625, + "learning_rate": 0.0004945840007236412, + "loss": 0.2398, + "step": 48170 + }, + { + "epoch": 2.0, + "grad_norm": 0.451171875, + "learning_rate": 0.0004945817552849918, + "loss": 0.1956, + "step": 48180 + }, + { + "epoch": 2.0, + "grad_norm": 0.21875, + "learning_rate": 0.000494579509386066, + "loss": 0.2506, + "step": 48190 + }, + { + "epoch": 2.0, + "grad_norm": 1.2890625, + "learning_rate": 0.0004945772630268679, + "loss": 0.2104, + "step": 48200 + }, + { + "epoch": 2.0, + "grad_norm": 3.09375, + "learning_rate": 0.0004945750162074019, + "loss": 0.2428, + "step": 48210 + }, + { + "epoch": 2.0, + "grad_norm": 0.361328125, + "learning_rate": 0.0004945727689276721, + "loss": 0.2399, + "step": 48220 + }, + { + "epoch": 2.0, + "grad_norm": 0.423828125, + "learning_rate": 0.0004945705211876827, + "loss": 0.2027, + "step": 48230 + }, + { + "epoch": 2.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.000494568272987438, + "loss": 0.2713, + "step": 48240 + }, + { + "epoch": 2.0, + "grad_norm": 0.546875, + "learning_rate": 0.0004945660243269423, + "loss": 0.2127, + "step": 48250 + }, + { + "epoch": 2.0, + "grad_norm": 0.56640625, + "learning_rate": 0.0004945637752061996, + "loss": 0.1366, + "step": 48260 + }, + { + "epoch": 2.0, + "grad_norm": 0.94140625, + "learning_rate": 0.0004945615256252145, + "loss": 0.2103, + "step": 48270 + }, + { + "epoch": 2.0, + "grad_norm": 0.65625, + "learning_rate": 0.0004945592755839908, + "loss": 0.2185, + "step": 48280 + }, + { + "epoch": 2.0, + "grad_norm": 0.439453125, + "learning_rate": 0.0004945570250825331, + "loss": 0.1365, + "step": 48290 + }, + { + "epoch": 2.0, + "grad_norm": 0.44921875, + "learning_rate": 0.0004945547741208455, + "loss": 0.249, + "step": 48300 + }, + { + "epoch": 2.0, + "grad_norm": 0.447265625, + "learning_rate": 0.0004945525226989321, + "loss": 0.2118, + "step": 48310 + }, + { + "epoch": 2.0, + "grad_norm": 0.921875, + "learning_rate": 0.0004945502708167973, + "loss": 0.1868, + "step": 48320 + }, + { + "epoch": 2.0, + "grad_norm": 2.203125, + "learning_rate": 0.0004945480184744454, + "loss": 0.1854, + "step": 48330 + }, + { + "epoch": 2.0, + "grad_norm": 0.76953125, + "learning_rate": 0.0004945457656718805, + "loss": 0.247, + "step": 48340 + }, + { + "epoch": 2.0, + "grad_norm": 1.2890625, + "learning_rate": 0.0004945435124091069, + "loss": 0.2494, + "step": 48350 + }, + { + "epoch": 2.0, + "grad_norm": 0.30078125, + "learning_rate": 0.0004945412586861288, + "loss": 0.2209, + "step": 48360 + }, + { + "epoch": 2.0, + "grad_norm": 1.015625, + "learning_rate": 0.0004945390045029503, + "loss": 0.2047, + "step": 48370 + }, + { + "epoch": 2.0, + "grad_norm": 0.35546875, + "learning_rate": 0.000494536749859576, + "loss": 0.2534, + "step": 48380 + }, + { + "epoch": 2.0, + "grad_norm": 0.125, + "learning_rate": 0.0004945344947560098, + "loss": 0.2828, + "step": 48390 + }, + { + "epoch": 2.0, + "grad_norm": 0.66796875, + "learning_rate": 0.0004945322391922562, + "loss": 0.2458, + "step": 48400 + }, + { + "epoch": 2.01, + "grad_norm": 0.56640625, + "learning_rate": 0.0004945299831683193, + "loss": 0.234, + "step": 48410 + }, + { + "epoch": 2.01, + "grad_norm": 0.98046875, + "learning_rate": 0.0004945277266842034, + "loss": 0.2698, + "step": 48420 + }, + { + "epoch": 2.01, + "grad_norm": 0.9296875, + "learning_rate": 0.0004945254697399128, + "loss": 0.1942, + "step": 48430 + }, + { + "epoch": 2.01, + "grad_norm": 0.6796875, + "learning_rate": 0.0004945232123354515, + "loss": 0.189, + "step": 48440 + }, + { + "epoch": 2.01, + "grad_norm": 0.6484375, + "learning_rate": 0.000494520954470824, + "loss": 0.2508, + "step": 48450 + }, + { + "epoch": 2.01, + "grad_norm": 0.6171875, + "learning_rate": 0.0004945186961460345, + "loss": 0.2017, + "step": 48460 + }, + { + "epoch": 2.01, + "grad_norm": 0.51171875, + "learning_rate": 0.0004945164373610872, + "loss": 0.2336, + "step": 48470 + }, + { + "epoch": 2.01, + "grad_norm": 0.67578125, + "learning_rate": 0.0004945141781159863, + "loss": 0.2272, + "step": 48480 + }, + { + "epoch": 2.01, + "grad_norm": 0.48046875, + "learning_rate": 0.0004945119184107361, + "loss": 0.1592, + "step": 48490 + }, + { + "epoch": 2.01, + "grad_norm": 0.4296875, + "learning_rate": 0.000494509658245341, + "loss": 0.196, + "step": 48500 + }, + { + "epoch": 2.01, + "grad_norm": 0.7578125, + "learning_rate": 0.0004945073976198051, + "loss": 0.1986, + "step": 48510 + }, + { + "epoch": 2.01, + "grad_norm": 0.9765625, + "learning_rate": 0.0004945051365341326, + "loss": 0.1975, + "step": 48520 + }, + { + "epoch": 2.01, + "grad_norm": 0.359375, + "learning_rate": 0.000494502874988328, + "loss": 0.2317, + "step": 48530 + }, + { + "epoch": 2.01, + "grad_norm": 0.875, + "learning_rate": 0.0004945006129823952, + "loss": 0.1878, + "step": 48540 + }, + { + "epoch": 2.01, + "grad_norm": 1.421875, + "learning_rate": 0.0004944983505163388, + "loss": 0.2836, + "step": 48550 + }, + { + "epoch": 2.01, + "grad_norm": 0.6328125, + "learning_rate": 0.0004944960875901629, + "loss": 0.2428, + "step": 48560 + }, + { + "epoch": 2.01, + "grad_norm": 1.296875, + "learning_rate": 0.0004944938242038716, + "loss": 0.1908, + "step": 48570 + }, + { + "epoch": 2.01, + "grad_norm": 0.2890625, + "learning_rate": 0.0004944915603574695, + "loss": 0.2207, + "step": 48580 + }, + { + "epoch": 2.01, + "grad_norm": 0.78515625, + "learning_rate": 0.0004944892960509606, + "loss": 0.2024, + "step": 48590 + }, + { + "epoch": 2.01, + "grad_norm": 0.455078125, + "learning_rate": 0.0004944870312843493, + "loss": 0.1939, + "step": 48600 + }, + { + "epoch": 2.01, + "grad_norm": 0.875, + "learning_rate": 0.0004944847660576399, + "loss": 0.1602, + "step": 48610 + }, + { + "epoch": 2.01, + "grad_norm": 0.2138671875, + "learning_rate": 0.0004944825003708363, + "loss": 0.2578, + "step": 48620 + }, + { + "epoch": 2.01, + "grad_norm": 0.5234375, + "learning_rate": 0.0004944802342239432, + "loss": 0.221, + "step": 48630 + }, + { + "epoch": 2.01, + "grad_norm": 0.59375, + "learning_rate": 0.0004944779676169647, + "loss": 0.2226, + "step": 48640 + }, + { + "epoch": 2.02, + "grad_norm": 0.609375, + "learning_rate": 0.0004944757005499051, + "loss": 0.1922, + "step": 48650 + }, + { + "epoch": 2.02, + "grad_norm": 0.890625, + "learning_rate": 0.0004944734330227686, + "loss": 0.2098, + "step": 48660 + }, + { + "epoch": 2.02, + "grad_norm": 0.435546875, + "learning_rate": 0.0004944711650355595, + "loss": 0.2507, + "step": 48670 + }, + { + "epoch": 2.02, + "grad_norm": 0.2890625, + "learning_rate": 0.000494468896588282, + "loss": 0.191, + "step": 48680 + }, + { + "epoch": 2.02, + "grad_norm": 0.70703125, + "learning_rate": 0.0004944666276809406, + "loss": 0.2866, + "step": 48690 + }, + { + "epoch": 2.02, + "grad_norm": 1.2890625, + "learning_rate": 0.0004944643583135392, + "loss": 0.215, + "step": 48700 + }, + { + "epoch": 2.02, + "grad_norm": 1.078125, + "learning_rate": 0.0004944620884860824, + "loss": 0.2296, + "step": 48710 + }, + { + "epoch": 2.02, + "grad_norm": 0.77734375, + "learning_rate": 0.0004944598181985744, + "loss": 0.2158, + "step": 48720 + }, + { + "epoch": 2.02, + "grad_norm": 1.921875, + "learning_rate": 0.0004944575474510193, + "loss": 0.1905, + "step": 48730 + }, + { + "epoch": 2.02, + "grad_norm": 0.365234375, + "learning_rate": 0.0004944552762434216, + "loss": 0.182, + "step": 48740 + }, + { + "epoch": 2.02, + "grad_norm": 0.52734375, + "learning_rate": 0.0004944530045757853, + "loss": 0.1883, + "step": 48750 + }, + { + "epoch": 2.02, + "grad_norm": 0.78515625, + "learning_rate": 0.000494450732448115, + "loss": 0.2682, + "step": 48760 + }, + { + "epoch": 2.02, + "grad_norm": 0.84765625, + "learning_rate": 0.0004944484598604148, + "loss": 0.2434, + "step": 48770 + }, + { + "epoch": 2.02, + "grad_norm": 1.015625, + "learning_rate": 0.000494446186812689, + "loss": 0.2874, + "step": 48780 + }, + { + "epoch": 2.02, + "grad_norm": 0.427734375, + "learning_rate": 0.0004944439133049419, + "loss": 0.2129, + "step": 48790 + }, + { + "epoch": 2.02, + "grad_norm": 0.478515625, + "learning_rate": 0.0004944416393371777, + "loss": 0.1454, + "step": 48800 + }, + { + "epoch": 2.02, + "grad_norm": 0.63671875, + "learning_rate": 0.0004944393649094007, + "loss": 0.2254, + "step": 48810 + }, + { + "epoch": 2.02, + "grad_norm": 0.2314453125, + "learning_rate": 0.0004944370900216153, + "loss": 0.23, + "step": 48820 + }, + { + "epoch": 2.02, + "grad_norm": 0.259765625, + "learning_rate": 0.0004944348146738256, + "loss": 0.188, + "step": 48830 + }, + { + "epoch": 2.02, + "grad_norm": 0.7890625, + "learning_rate": 0.000494432538866036, + "loss": 0.2141, + "step": 48840 + }, + { + "epoch": 2.02, + "grad_norm": 0.57421875, + "learning_rate": 0.0004944302625982508, + "loss": 0.2182, + "step": 48850 + }, + { + "epoch": 2.02, + "grad_norm": 0.37109375, + "learning_rate": 0.0004944279858704742, + "loss": 0.2396, + "step": 48860 + }, + { + "epoch": 2.02, + "grad_norm": 0.3203125, + "learning_rate": 0.0004944257086827105, + "loss": 0.2638, + "step": 48870 + }, + { + "epoch": 2.02, + "grad_norm": 2.25, + "learning_rate": 0.0004944234310349641, + "loss": 0.2432, + "step": 48880 + }, + { + "epoch": 2.03, + "grad_norm": 0.515625, + "learning_rate": 0.0004944211529272391, + "loss": 0.1975, + "step": 48890 + }, + { + "epoch": 2.03, + "grad_norm": 0.0, + "learning_rate": 0.00049441887435954, + "loss": 0.1927, + "step": 48900 + }, + { + "epoch": 2.03, + "grad_norm": 0.53515625, + "learning_rate": 0.0004944165953318708, + "loss": 0.2546, + "step": 48910 + }, + { + "epoch": 2.03, + "grad_norm": 0.5625, + "learning_rate": 0.0004944143158442362, + "loss": 0.2286, + "step": 48920 + }, + { + "epoch": 2.03, + "grad_norm": 1.1328125, + "learning_rate": 0.0004944120358966401, + "loss": 0.1972, + "step": 48930 + }, + { + "epoch": 2.03, + "grad_norm": 0.9296875, + "learning_rate": 0.000494409755489087, + "loss": 0.2412, + "step": 48940 + }, + { + "epoch": 2.03, + "grad_norm": 0.765625, + "learning_rate": 0.0004944074746215811, + "loss": 0.2467, + "step": 48950 + }, + { + "epoch": 2.03, + "grad_norm": 0.640625, + "learning_rate": 0.0004944051932941268, + "loss": 0.2035, + "step": 48960 + }, + { + "epoch": 2.03, + "grad_norm": 0.6953125, + "learning_rate": 0.0004944029115067282, + "loss": 0.1969, + "step": 48970 + }, + { + "epoch": 2.03, + "grad_norm": 0.8125, + "learning_rate": 0.0004944006292593898, + "loss": 0.2987, + "step": 48980 + }, + { + "epoch": 2.03, + "grad_norm": 0.357421875, + "learning_rate": 0.0004943983465521157, + "loss": 0.1789, + "step": 48990 + }, + { + "epoch": 2.03, + "grad_norm": 0.57421875, + "learning_rate": 0.0004943960633849104, + "loss": 0.2343, + "step": 49000 + }, + { + "epoch": 2.03, + "grad_norm": 0.47265625, + "learning_rate": 0.0004943937797577781, + "loss": 0.2174, + "step": 49010 + }, + { + "epoch": 2.03, + "grad_norm": 0.70703125, + "learning_rate": 0.0004943914956707231, + "loss": 0.2161, + "step": 49020 + }, + { + "epoch": 2.03, + "grad_norm": 0.314453125, + "learning_rate": 0.0004943892111237496, + "loss": 0.2067, + "step": 49030 + }, + { + "epoch": 2.03, + "grad_norm": 0.6484375, + "learning_rate": 0.0004943869261168621, + "loss": 0.2222, + "step": 49040 + }, + { + "epoch": 2.03, + "grad_norm": 0.44140625, + "learning_rate": 0.0004943846406500647, + "loss": 0.154, + "step": 49050 + }, + { + "epoch": 2.03, + "grad_norm": 0.859375, + "learning_rate": 0.0004943823547233619, + "loss": 0.2865, + "step": 49060 + }, + { + "epoch": 2.03, + "grad_norm": 0.63671875, + "learning_rate": 0.0004943800683367579, + "loss": 0.2163, + "step": 49070 + }, + { + "epoch": 2.03, + "grad_norm": 1.0546875, + "learning_rate": 0.000494377781490257, + "loss": 0.1648, + "step": 49080 + }, + { + "epoch": 2.03, + "grad_norm": 1.078125, + "learning_rate": 0.0004943754941838635, + "loss": 0.208, + "step": 49090 + }, + { + "epoch": 2.03, + "grad_norm": 1.1484375, + "learning_rate": 0.0004943732064175817, + "loss": 0.2083, + "step": 49100 + }, + { + "epoch": 2.03, + "grad_norm": 1.078125, + "learning_rate": 0.0004943709181914159, + "loss": 0.1809, + "step": 49110 + }, + { + "epoch": 2.03, + "grad_norm": 0.4453125, + "learning_rate": 0.0004943686295053703, + "loss": 0.2084, + "step": 49120 + }, + { + "epoch": 2.03, + "grad_norm": 1.171875, + "learning_rate": 0.0004943663403594495, + "loss": 0.2569, + "step": 49130 + }, + { + "epoch": 2.04, + "grad_norm": 0.58984375, + "learning_rate": 0.0004943640507536576, + "loss": 0.2593, + "step": 49140 + }, + { + "epoch": 2.04, + "grad_norm": 0.2177734375, + "learning_rate": 0.0004943617606879989, + "loss": 0.2497, + "step": 49150 + }, + { + "epoch": 2.04, + "grad_norm": 0.0, + "learning_rate": 0.0004943594701624778, + "loss": 0.2341, + "step": 49160 + }, + { + "epoch": 2.04, + "grad_norm": 0.458984375, + "learning_rate": 0.0004943571791770986, + "loss": 0.2219, + "step": 49170 + }, + { + "epoch": 2.04, + "grad_norm": 0.796875, + "learning_rate": 0.0004943548877318655, + "loss": 0.2303, + "step": 49180 + }, + { + "epoch": 2.04, + "grad_norm": 0.404296875, + "learning_rate": 0.000494352595826783, + "loss": 0.2435, + "step": 49190 + }, + { + "epoch": 2.04, + "grad_norm": 0.90234375, + "learning_rate": 0.0004943503034618552, + "loss": 0.2894, + "step": 49200 + }, + { + "epoch": 2.04, + "grad_norm": 0.7265625, + "learning_rate": 0.0004943480106370864, + "loss": 0.2392, + "step": 49210 + }, + { + "epoch": 2.04, + "grad_norm": 0.703125, + "learning_rate": 0.0004943457173524812, + "loss": 0.2164, + "step": 49220 + }, + { + "epoch": 2.04, + "grad_norm": 1.34375, + "learning_rate": 0.0004943434236080438, + "loss": 0.2377, + "step": 49230 + }, + { + "epoch": 2.04, + "grad_norm": 0.047607421875, + "learning_rate": 0.0004943411294037783, + "loss": 0.264, + "step": 49240 + }, + { + "epoch": 2.04, + "grad_norm": 0.4609375, + "learning_rate": 0.0004943388347396894, + "loss": 0.1667, + "step": 49250 + }, + { + "epoch": 2.04, + "grad_norm": 0.59765625, + "learning_rate": 0.0004943365396157809, + "loss": 0.1882, + "step": 49260 + }, + { + "epoch": 2.04, + "grad_norm": 1.4921875, + "learning_rate": 0.0004943342440320576, + "loss": 0.2255, + "step": 49270 + }, + { + "epoch": 2.04, + "grad_norm": 0.47265625, + "learning_rate": 0.0004943319479885237, + "loss": 0.2336, + "step": 49280 + }, + { + "epoch": 2.04, + "grad_norm": 1.4765625, + "learning_rate": 0.0004943296514851833, + "loss": 0.2048, + "step": 49290 + }, + { + "epoch": 2.04, + "grad_norm": 0.4765625, + "learning_rate": 0.000494327354522041, + "loss": 0.2155, + "step": 49300 + }, + { + "epoch": 2.04, + "grad_norm": 0.408203125, + "learning_rate": 0.0004943250570991009, + "loss": 0.2399, + "step": 49310 + }, + { + "epoch": 2.04, + "grad_norm": 0.7890625, + "learning_rate": 0.0004943227592163676, + "loss": 0.2306, + "step": 49320 + }, + { + "epoch": 2.04, + "grad_norm": 0.9296875, + "learning_rate": 0.0004943204608738451, + "loss": 0.2198, + "step": 49330 + }, + { + "epoch": 2.04, + "grad_norm": 1.171875, + "learning_rate": 0.0004943181620715379, + "loss": 0.2569, + "step": 49340 + }, + { + "epoch": 2.04, + "grad_norm": 0.9765625, + "learning_rate": 0.0004943158628094502, + "loss": 0.2108, + "step": 49350 + }, + { + "epoch": 2.04, + "grad_norm": 0.7109375, + "learning_rate": 0.0004943135630875867, + "loss": 0.2402, + "step": 49360 + }, + { + "epoch": 2.04, + "grad_norm": 0.39453125, + "learning_rate": 0.0004943112629059513, + "loss": 0.1931, + "step": 49370 + }, + { + "epoch": 2.05, + "grad_norm": 0.43359375, + "learning_rate": 0.0004943089622645485, + "loss": 0.2098, + "step": 49380 + }, + { + "epoch": 2.05, + "grad_norm": 0.326171875, + "learning_rate": 0.0004943066611633825, + "loss": 0.2129, + "step": 49390 + }, + { + "epoch": 2.05, + "grad_norm": 1.2265625, + "learning_rate": 0.0004943043596024579, + "loss": 0.2483, + "step": 49400 + }, + { + "epoch": 2.05, + "grad_norm": 0.8203125, + "learning_rate": 0.0004943020575817788, + "loss": 0.2252, + "step": 49410 + }, + { + "epoch": 2.05, + "grad_norm": 0.98828125, + "learning_rate": 0.0004942997551013497, + "loss": 0.1943, + "step": 49420 + }, + { + "epoch": 2.05, + "grad_norm": 1.203125, + "learning_rate": 0.0004942974521611748, + "loss": 0.2382, + "step": 49430 + }, + { + "epoch": 2.05, + "grad_norm": 0.8671875, + "learning_rate": 0.0004942951487612585, + "loss": 0.2459, + "step": 49440 + }, + { + "epoch": 2.05, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004942928449016051, + "loss": 0.263, + "step": 49450 + }, + { + "epoch": 2.05, + "grad_norm": 0.55078125, + "learning_rate": 0.000494290540582219, + "loss": 0.2295, + "step": 49460 + }, + { + "epoch": 2.05, + "grad_norm": 1.0390625, + "learning_rate": 0.0004942882358031043, + "loss": 0.1858, + "step": 49470 + }, + { + "epoch": 2.05, + "grad_norm": 0.52734375, + "learning_rate": 0.0004942859305642656, + "loss": 0.2206, + "step": 49480 + }, + { + "epoch": 2.05, + "grad_norm": 0.30078125, + "learning_rate": 0.0004942836248657072, + "loss": 0.2271, + "step": 49490 + }, + { + "epoch": 2.05, + "grad_norm": 0.65625, + "learning_rate": 0.0004942813187074334, + "loss": 0.2112, + "step": 49500 + }, + { + "epoch": 2.05, + "grad_norm": 1.15625, + "learning_rate": 0.0004942790120894485, + "loss": 0.2136, + "step": 49510 + }, + { + "epoch": 2.05, + "grad_norm": 0.328125, + "learning_rate": 0.000494276705011757, + "loss": 0.2349, + "step": 49520 + }, + { + "epoch": 2.05, + "grad_norm": 0.69921875, + "learning_rate": 0.0004942743974743629, + "loss": 0.2584, + "step": 49530 + }, + { + "epoch": 2.05, + "grad_norm": 0.6640625, + "learning_rate": 0.000494272089477271, + "loss": 0.245, + "step": 49540 + }, + { + "epoch": 2.05, + "grad_norm": 0.474609375, + "learning_rate": 0.0004942697810204852, + "loss": 0.1848, + "step": 49550 + }, + { + "epoch": 2.05, + "grad_norm": 0.265625, + "learning_rate": 0.0004942674721040102, + "loss": 0.1427, + "step": 49560 + }, + { + "epoch": 2.05, + "grad_norm": 0.87109375, + "learning_rate": 0.0004942651627278502, + "loss": 0.2091, + "step": 49570 + }, + { + "epoch": 2.05, + "grad_norm": 0.74609375, + "learning_rate": 0.0004942628528920094, + "loss": 0.2259, + "step": 49580 + }, + { + "epoch": 2.05, + "grad_norm": 0.578125, + "learning_rate": 0.0004942605425964924, + "loss": 0.1944, + "step": 49590 + }, + { + "epoch": 2.05, + "grad_norm": 0.71484375, + "learning_rate": 0.0004942582318413033, + "loss": 0.212, + "step": 49600 + }, + { + "epoch": 2.05, + "grad_norm": 0.375, + "learning_rate": 0.0004942559206264468, + "loss": 0.209, + "step": 49610 + }, + { + "epoch": 2.06, + "grad_norm": 0.4140625, + "learning_rate": 0.0004942536089519269, + "loss": 0.1804, + "step": 49620 + }, + { + "epoch": 2.06, + "grad_norm": 1.1328125, + "learning_rate": 0.0004942512968177481, + "loss": 0.2185, + "step": 49630 + }, + { + "epoch": 2.06, + "grad_norm": 0.431640625, + "learning_rate": 0.0004942489842239148, + "loss": 0.2257, + "step": 49640 + }, + { + "epoch": 2.06, + "grad_norm": 0.41015625, + "learning_rate": 0.0004942466711704312, + "loss": 0.2514, + "step": 49650 + }, + { + "epoch": 2.06, + "grad_norm": 0.58984375, + "learning_rate": 0.0004942443576573018, + "loss": 0.2314, + "step": 49660 + }, + { + "epoch": 2.06, + "grad_norm": 1.3671875, + "learning_rate": 0.0004942420436845308, + "loss": 0.2253, + "step": 49670 + }, + { + "epoch": 2.06, + "grad_norm": 0.6015625, + "learning_rate": 0.0004942397292521228, + "loss": 0.252, + "step": 49680 + }, + { + "epoch": 2.06, + "grad_norm": 1.53125, + "learning_rate": 0.000494237414360082, + "loss": 0.2454, + "step": 49690 + }, + { + "epoch": 2.06, + "grad_norm": 0.69140625, + "learning_rate": 0.0004942350990084127, + "loss": 0.2165, + "step": 49700 + }, + { + "epoch": 2.06, + "grad_norm": 0.412109375, + "learning_rate": 0.0004942327831971192, + "loss": 0.2309, + "step": 49710 + }, + { + "epoch": 2.06, + "grad_norm": 0.890625, + "learning_rate": 0.000494230466926206, + "loss": 0.2422, + "step": 49720 + }, + { + "epoch": 2.06, + "grad_norm": 0.3828125, + "learning_rate": 0.0004942281501956777, + "loss": 0.2286, + "step": 49730 + }, + { + "epoch": 2.06, + "grad_norm": 0.447265625, + "learning_rate": 0.0004942258330055382, + "loss": 0.1765, + "step": 49740 + }, + { + "epoch": 2.06, + "grad_norm": 0.625, + "learning_rate": 0.000494223515355792, + "loss": 0.1958, + "step": 49750 + }, + { + "epoch": 2.06, + "grad_norm": 0.421875, + "learning_rate": 0.0004942211972464437, + "loss": 0.2258, + "step": 49760 + }, + { + "epoch": 2.06, + "grad_norm": 0.34765625, + "learning_rate": 0.0004942188786774973, + "loss": 0.2126, + "step": 49770 + }, + { + "epoch": 2.06, + "grad_norm": 0.48046875, + "learning_rate": 0.0004942165596489574, + "loss": 0.2348, + "step": 49780 + }, + { + "epoch": 2.06, + "grad_norm": 0.56640625, + "learning_rate": 0.0004942142401608284, + "loss": 0.2713, + "step": 49790 + }, + { + "epoch": 2.06, + "grad_norm": 1.390625, + "learning_rate": 0.0004942119202131144, + "loss": 0.2454, + "step": 49800 + }, + { + "epoch": 2.06, + "grad_norm": 1.6875, + "learning_rate": 0.00049420959980582, + "loss": 0.204, + "step": 49810 + }, + { + "epoch": 2.06, + "grad_norm": 2.234375, + "learning_rate": 0.0004942072789389497, + "loss": 0.2641, + "step": 49820 + }, + { + "epoch": 2.06, + "grad_norm": 0.5390625, + "learning_rate": 0.0004942049576125075, + "loss": 0.2171, + "step": 49830 + }, + { + "epoch": 2.06, + "grad_norm": 0.3359375, + "learning_rate": 0.0004942026358264979, + "loss": 0.2118, + "step": 49840 + }, + { + "epoch": 2.06, + "grad_norm": 1.015625, + "learning_rate": 0.0004942003135809254, + "loss": 0.2425, + "step": 49850 + }, + { + "epoch": 2.07, + "grad_norm": 0.65234375, + "learning_rate": 0.0004941979908757942, + "loss": 0.2608, + "step": 49860 + }, + { + "epoch": 2.07, + "grad_norm": 0.640625, + "learning_rate": 0.0004941956677111089, + "loss": 0.2473, + "step": 49870 + }, + { + "epoch": 2.07, + "grad_norm": 0.3046875, + "learning_rate": 0.0004941933440868736, + "loss": 0.1622, + "step": 49880 + }, + { + "epoch": 2.07, + "grad_norm": 0.6015625, + "learning_rate": 0.0004941910200030928, + "loss": 0.1891, + "step": 49890 + }, + { + "epoch": 2.07, + "grad_norm": 0.92578125, + "learning_rate": 0.0004941886954597708, + "loss": 0.2386, + "step": 49900 + }, + { + "epoch": 2.07, + "grad_norm": 1.1796875, + "learning_rate": 0.0004941863704569122, + "loss": 0.2207, + "step": 49910 + }, + { + "epoch": 2.07, + "grad_norm": 0.6875, + "learning_rate": 0.0004941840449945212, + "loss": 0.2321, + "step": 49920 + }, + { + "epoch": 2.07, + "grad_norm": 0.62109375, + "learning_rate": 0.0004941817190726021, + "loss": 0.2576, + "step": 49930 + }, + { + "epoch": 2.07, + "grad_norm": 0.474609375, + "learning_rate": 0.0004941793926911595, + "loss": 0.205, + "step": 49940 + }, + { + "epoch": 2.07, + "grad_norm": 2.046875, + "learning_rate": 0.0004941770658501975, + "loss": 0.2438, + "step": 49950 + }, + { + "epoch": 2.07, + "grad_norm": 1.203125, + "learning_rate": 0.0004941747385497207, + "loss": 0.1659, + "step": 49960 + }, + { + "epoch": 2.07, + "grad_norm": 0.65234375, + "learning_rate": 0.0004941724107897335, + "loss": 0.1914, + "step": 49970 + }, + { + "epoch": 2.07, + "grad_norm": 0.65234375, + "learning_rate": 0.0004941700825702401, + "loss": 0.2053, + "step": 49980 + }, + { + "epoch": 2.07, + "grad_norm": 1.5078125, + "learning_rate": 0.000494167753891245, + "loss": 0.2276, + "step": 49990 + }, + { + "epoch": 2.07, + "grad_norm": 0.546875, + "learning_rate": 0.0004941654247527525, + "loss": 0.3057, + "step": 50000 + }, + { + "epoch": 2.07, + "grad_norm": 0.376953125, + "learning_rate": 0.0004941630951547671, + "loss": 0.2062, + "step": 50010 + }, + { + "epoch": 2.07, + "grad_norm": 0.43359375, + "learning_rate": 0.0004941607650972931, + "loss": 0.2105, + "step": 50020 + }, + { + "epoch": 2.07, + "grad_norm": 0.51953125, + "learning_rate": 0.0004941584345803349, + "loss": 0.2817, + "step": 50030 + }, + { + "epoch": 2.07, + "grad_norm": 0.9609375, + "learning_rate": 0.000494156103603897, + "loss": 0.2441, + "step": 50040 + }, + { + "epoch": 2.07, + "grad_norm": 0.625, + "learning_rate": 0.0004941537721679835, + "loss": 0.2286, + "step": 50050 + }, + { + "epoch": 2.07, + "grad_norm": 1.640625, + "learning_rate": 0.000494151440272599, + "loss": 0.2633, + "step": 50060 + }, + { + "epoch": 2.07, + "grad_norm": 1.9375, + "learning_rate": 0.000494149107917748, + "loss": 0.238, + "step": 50070 + }, + { + "epoch": 2.07, + "grad_norm": 1.109375, + "learning_rate": 0.0004941467751034347, + "loss": 0.2074, + "step": 50080 + }, + { + "epoch": 2.07, + "grad_norm": 1.59375, + "learning_rate": 0.0004941444418296634, + "loss": 0.2906, + "step": 50090 + }, + { + "epoch": 2.08, + "grad_norm": 0.55078125, + "learning_rate": 0.0004941421080964387, + "loss": 0.2629, + "step": 50100 + }, + { + "epoch": 2.08, + "grad_norm": 0.7109375, + "learning_rate": 0.000494139773903765, + "loss": 0.2423, + "step": 50110 + }, + { + "epoch": 2.08, + "grad_norm": 0.384765625, + "learning_rate": 0.0004941374392516465, + "loss": 0.2017, + "step": 50120 + }, + { + "epoch": 2.08, + "grad_norm": 0.490234375, + "learning_rate": 0.0004941351041400878, + "loss": 0.2469, + "step": 50130 + }, + { + "epoch": 2.08, + "grad_norm": 1.8046875, + "learning_rate": 0.000494132768569093, + "loss": 0.2535, + "step": 50140 + }, + { + "epoch": 2.08, + "grad_norm": 0.71875, + "learning_rate": 0.000494130432538667, + "loss": 0.2335, + "step": 50150 + }, + { + "epoch": 2.08, + "grad_norm": 0.8125, + "learning_rate": 0.0004941280960488137, + "loss": 0.2249, + "step": 50160 + }, + { + "epoch": 2.08, + "grad_norm": 0.453125, + "learning_rate": 0.0004941257590995376, + "loss": 0.2334, + "step": 50170 + }, + { + "epoch": 2.08, + "grad_norm": 1.3671875, + "learning_rate": 0.0004941234216908433, + "loss": 0.2899, + "step": 50180 + }, + { + "epoch": 2.08, + "grad_norm": 0.63671875, + "learning_rate": 0.0004941210838227351, + "loss": 0.1853, + "step": 50190 + }, + { + "epoch": 2.08, + "grad_norm": 0.79296875, + "learning_rate": 0.0004941187454952174, + "loss": 0.2252, + "step": 50200 + }, + { + "epoch": 2.08, + "grad_norm": 0.6015625, + "learning_rate": 0.0004941164067082945, + "loss": 0.2459, + "step": 50210 + }, + { + "epoch": 2.08, + "grad_norm": 1.1796875, + "learning_rate": 0.0004941140674619709, + "loss": 0.1834, + "step": 50220 + }, + { + "epoch": 2.08, + "grad_norm": 0.439453125, + "learning_rate": 0.0004941117277562511, + "loss": 0.2056, + "step": 50230 + }, + { + "epoch": 2.08, + "grad_norm": 0.48828125, + "learning_rate": 0.0004941093875911394, + "loss": 0.2326, + "step": 50240 + }, + { + "epoch": 2.08, + "grad_norm": 0.76953125, + "learning_rate": 0.00049410704696664, + "loss": 0.1751, + "step": 50250 + }, + { + "epoch": 2.08, + "grad_norm": 0.88671875, + "learning_rate": 0.0004941047058827576, + "loss": 0.2529, + "step": 50260 + }, + { + "epoch": 2.08, + "grad_norm": 0.9453125, + "learning_rate": 0.0004941023643394965, + "loss": 0.1978, + "step": 50270 + }, + { + "epoch": 2.08, + "grad_norm": 1.15625, + "learning_rate": 0.0004941000223368611, + "loss": 0.1842, + "step": 50280 + }, + { + "epoch": 2.08, + "grad_norm": 0.447265625, + "learning_rate": 0.0004940976798748558, + "loss": 0.2013, + "step": 50290 + }, + { + "epoch": 2.08, + "grad_norm": 0.58203125, + "learning_rate": 0.0004940953369534851, + "loss": 0.1912, + "step": 50300 + }, + { + "epoch": 2.08, + "grad_norm": 1.1328125, + "learning_rate": 0.0004940929935727533, + "loss": 0.1576, + "step": 50310 + }, + { + "epoch": 2.08, + "grad_norm": 0.75390625, + "learning_rate": 0.0004940906497326649, + "loss": 0.2352, + "step": 50320 + }, + { + "epoch": 2.08, + "grad_norm": 0.68359375, + "learning_rate": 0.0004940883054332241, + "loss": 0.242, + "step": 50330 + }, + { + "epoch": 2.09, + "grad_norm": 1.5390625, + "learning_rate": 0.0004940859606744356, + "loss": 0.2228, + "step": 50340 + }, + { + "epoch": 2.09, + "grad_norm": 0.64453125, + "learning_rate": 0.0004940836154563036, + "loss": 0.243, + "step": 50350 + }, + { + "epoch": 2.09, + "grad_norm": 2.0625, + "learning_rate": 0.0004940812697788327, + "loss": 0.1954, + "step": 50360 + }, + { + "epoch": 2.09, + "grad_norm": 0.65234375, + "learning_rate": 0.0004940789236420272, + "loss": 0.219, + "step": 50370 + }, + { + "epoch": 2.09, + "grad_norm": 0.5390625, + "learning_rate": 0.0004940765770458915, + "loss": 0.2333, + "step": 50380 + }, + { + "epoch": 2.09, + "grad_norm": 1.953125, + "learning_rate": 0.00049407422999043, + "loss": 0.2109, + "step": 50390 + }, + { + "epoch": 2.09, + "grad_norm": 0.50390625, + "learning_rate": 0.0004940718824756471, + "loss": 0.1952, + "step": 50400 + }, + { + "epoch": 2.09, + "grad_norm": 0.189453125, + "learning_rate": 0.0004940695345015474, + "loss": 0.201, + "step": 50410 + }, + { + "epoch": 2.09, + "grad_norm": 1.4921875, + "learning_rate": 0.0004940671860681351, + "loss": 0.2086, + "step": 50420 + }, + { + "epoch": 2.09, + "grad_norm": 0.60546875, + "learning_rate": 0.0004940648371754148, + "loss": 0.189, + "step": 50430 + }, + { + "epoch": 2.09, + "grad_norm": 0.6640625, + "learning_rate": 0.0004940624878233908, + "loss": 0.1735, + "step": 50440 + }, + { + "epoch": 2.09, + "grad_norm": 0.5859375, + "learning_rate": 0.0004940601380120675, + "loss": 0.2336, + "step": 50450 + }, + { + "epoch": 2.09, + "grad_norm": 3.578125, + "learning_rate": 0.0004940577877414494, + "loss": 0.2046, + "step": 50460 + }, + { + "epoch": 2.09, + "grad_norm": 1.0859375, + "learning_rate": 0.000494055437011541, + "loss": 0.236, + "step": 50470 + }, + { + "epoch": 2.09, + "grad_norm": 0.9296875, + "learning_rate": 0.0004940530858223466, + "loss": 0.2221, + "step": 50480 + }, + { + "epoch": 2.09, + "grad_norm": 0.6875, + "learning_rate": 0.0004940507341738705, + "loss": 0.2109, + "step": 50490 + }, + { + "epoch": 2.09, + "grad_norm": 0.60546875, + "learning_rate": 0.0004940483820661173, + "loss": 0.2817, + "step": 50500 + }, + { + "epoch": 2.09, + "grad_norm": 0.609375, + "learning_rate": 0.0004940460294990915, + "loss": 0.2714, + "step": 50510 + }, + { + "epoch": 2.09, + "grad_norm": 0.28515625, + "learning_rate": 0.0004940436764727974, + "loss": 0.2438, + "step": 50520 + }, + { + "epoch": 2.09, + "grad_norm": 0.67578125, + "learning_rate": 0.0004940413229872395, + "loss": 0.2444, + "step": 50530 + }, + { + "epoch": 2.09, + "grad_norm": 0.9375, + "learning_rate": 0.0004940389690424221, + "loss": 0.1883, + "step": 50540 + }, + { + "epoch": 2.09, + "grad_norm": 1.140625, + "learning_rate": 0.0004940366146383497, + "loss": 0.2571, + "step": 50550 + }, + { + "epoch": 2.09, + "grad_norm": 0.470703125, + "learning_rate": 0.0004940342597750268, + "loss": 0.198, + "step": 50560 + }, + { + "epoch": 2.09, + "grad_norm": 0.5078125, + "learning_rate": 0.0004940319044524578, + "loss": 0.2322, + "step": 50570 + }, + { + "epoch": 2.1, + "grad_norm": 0.69921875, + "learning_rate": 0.0004940295486706471, + "loss": 0.2399, + "step": 50580 + }, + { + "epoch": 2.1, + "grad_norm": 0.8671875, + "learning_rate": 0.0004940271924295991, + "loss": 0.2318, + "step": 50590 + }, + { + "epoch": 2.1, + "grad_norm": 0.59375, + "learning_rate": 0.0004940248357293182, + "loss": 0.2388, + "step": 50600 + }, + { + "epoch": 2.1, + "grad_norm": 0.77734375, + "learning_rate": 0.0004940224785698091, + "loss": 0.2297, + "step": 50610 + }, + { + "epoch": 2.1, + "grad_norm": 1.6484375, + "learning_rate": 0.0004940201209510759, + "loss": 0.2374, + "step": 50620 + }, + { + "epoch": 2.1, + "grad_norm": 0.70703125, + "learning_rate": 0.0004940177628731233, + "loss": 0.2486, + "step": 50630 + }, + { + "epoch": 2.1, + "grad_norm": 0.7109375, + "learning_rate": 0.0004940154043359556, + "loss": 0.2259, + "step": 50640 + }, + { + "epoch": 2.1, + "grad_norm": 0.73046875, + "learning_rate": 0.0004940130453395771, + "loss": 0.2143, + "step": 50650 + }, + { + "epoch": 2.1, + "grad_norm": 0.8828125, + "learning_rate": 0.0004940106858839925, + "loss": 0.2665, + "step": 50660 + }, + { + "epoch": 2.1, + "grad_norm": 0.66796875, + "learning_rate": 0.0004940083259692062, + "loss": 0.2306, + "step": 50670 + }, + { + "epoch": 2.1, + "grad_norm": 0.953125, + "learning_rate": 0.0004940059655952226, + "loss": 0.2066, + "step": 50680 + }, + { + "epoch": 2.1, + "grad_norm": 0.55859375, + "learning_rate": 0.000494003604762046, + "loss": 0.1983, + "step": 50690 + }, + { + "epoch": 2.1, + "grad_norm": 0.6484375, + "learning_rate": 0.000494001243469681, + "loss": 0.1774, + "step": 50700 + }, + { + "epoch": 2.1, + "grad_norm": 0.58203125, + "learning_rate": 0.0004939988817181322, + "loss": 0.267, + "step": 50710 + }, + { + "epoch": 2.1, + "grad_norm": 0.40625, + "learning_rate": 0.0004939965195074036, + "loss": 0.2549, + "step": 50720 + }, + { + "epoch": 2.1, + "grad_norm": 0.322265625, + "learning_rate": 0.0004939941568375, + "loss": 0.1989, + "step": 50730 + }, + { + "epoch": 2.1, + "grad_norm": 0.87109375, + "learning_rate": 0.0004939917937084257, + "loss": 0.2098, + "step": 50740 + }, + { + "epoch": 2.1, + "grad_norm": 1.2734375, + "learning_rate": 0.0004939894301201853, + "loss": 0.2349, + "step": 50750 + }, + { + "epoch": 2.1, + "grad_norm": 0.57421875, + "learning_rate": 0.0004939870660727831, + "loss": 0.2175, + "step": 50760 + }, + { + "epoch": 2.1, + "grad_norm": 2.96875, + "learning_rate": 0.0004939847015662235, + "loss": 0.2082, + "step": 50770 + }, + { + "epoch": 2.1, + "grad_norm": 0.34375, + "learning_rate": 0.000493982336600511, + "loss": 0.209, + "step": 50780 + }, + { + "epoch": 2.1, + "grad_norm": 0.625, + "learning_rate": 0.0004939799711756503, + "loss": 0.2473, + "step": 50790 + }, + { + "epoch": 2.1, + "grad_norm": 0.7421875, + "learning_rate": 0.0004939776052916455, + "loss": 0.2461, + "step": 50800 + }, + { + "epoch": 2.1, + "grad_norm": 0.54296875, + "learning_rate": 0.0004939752389485013, + "loss": 0.2006, + "step": 50810 + }, + { + "epoch": 2.1, + "grad_norm": 0.84765625, + "learning_rate": 0.0004939728721462219, + "loss": 0.2858, + "step": 50820 + }, + { + "epoch": 2.11, + "grad_norm": 0.4140625, + "learning_rate": 0.000493970504884812, + "loss": 0.2428, + "step": 50830 + }, + { + "epoch": 2.11, + "grad_norm": 0.46875, + "learning_rate": 0.0004939681371642759, + "loss": 0.2114, + "step": 50840 + }, + { + "epoch": 2.11, + "grad_norm": 0.72265625, + "learning_rate": 0.0004939657689846182, + "loss": 0.2401, + "step": 50850 + }, + { + "epoch": 2.11, + "grad_norm": 0.91796875, + "learning_rate": 0.0004939634003458432, + "loss": 0.1643, + "step": 50860 + }, + { + "epoch": 2.11, + "grad_norm": 0.7890625, + "learning_rate": 0.0004939610312479554, + "loss": 0.2461, + "step": 50870 + }, + { + "epoch": 2.11, + "grad_norm": 0.3125, + "learning_rate": 0.0004939586616909594, + "loss": 0.2138, + "step": 50880 + }, + { + "epoch": 2.11, + "grad_norm": 0.625, + "learning_rate": 0.0004939562916748594, + "loss": 0.2033, + "step": 50890 + }, + { + "epoch": 2.11, + "grad_norm": 0.57421875, + "learning_rate": 0.0004939539211996601, + "loss": 0.2618, + "step": 50900 + }, + { + "epoch": 2.11, + "grad_norm": 0.0, + "learning_rate": 0.0004939515502653659, + "loss": 0.1999, + "step": 50910 + }, + { + "epoch": 2.11, + "grad_norm": 0.439453125, + "learning_rate": 0.0004939491788719811, + "loss": 0.3103, + "step": 50920 + }, + { + "epoch": 2.11, + "grad_norm": 0.84375, + "learning_rate": 0.0004939468070195103, + "loss": 0.2411, + "step": 50930 + }, + { + "epoch": 2.11, + "grad_norm": 0.4453125, + "learning_rate": 0.0004939444347079581, + "loss": 0.232, + "step": 50940 + }, + { + "epoch": 2.11, + "grad_norm": 0.66796875, + "learning_rate": 0.0004939420619373287, + "loss": 0.2199, + "step": 50950 + }, + { + "epoch": 2.11, + "grad_norm": 1.15625, + "learning_rate": 0.0004939396887076267, + "loss": 0.1853, + "step": 50960 + }, + { + "epoch": 2.11, + "grad_norm": 0.39453125, + "learning_rate": 0.0004939373150188566, + "loss": 0.2119, + "step": 50970 + }, + { + "epoch": 2.11, + "grad_norm": 0.87890625, + "learning_rate": 0.0004939349408710227, + "loss": 0.2278, + "step": 50980 + }, + { + "epoch": 2.11, + "grad_norm": 0.40625, + "learning_rate": 0.0004939325662641296, + "loss": 0.2066, + "step": 50990 + }, + { + "epoch": 2.11, + "grad_norm": 0.36328125, + "learning_rate": 0.0004939301911981819, + "loss": 0.195, + "step": 51000 + }, + { + "epoch": 2.11, + "grad_norm": 0.546875, + "learning_rate": 0.0004939278156731839, + "loss": 0.1426, + "step": 51010 + }, + { + "epoch": 2.11, + "grad_norm": 0.5234375, + "learning_rate": 0.0004939254396891399, + "loss": 0.2441, + "step": 51020 + }, + { + "epoch": 2.11, + "grad_norm": 1.1953125, + "learning_rate": 0.0004939230632460547, + "loss": 0.2049, + "step": 51030 + }, + { + "epoch": 2.11, + "grad_norm": 0.81640625, + "learning_rate": 0.0004939206863439328, + "loss": 0.2141, + "step": 51040 + }, + { + "epoch": 2.11, + "grad_norm": 0.5390625, + "learning_rate": 0.0004939183089827783, + "loss": 0.2526, + "step": 51050 + }, + { + "epoch": 2.11, + "grad_norm": 0.734375, + "learning_rate": 0.0004939159311625959, + "loss": 0.2355, + "step": 51060 + }, + { + "epoch": 2.12, + "grad_norm": 0.482421875, + "learning_rate": 0.00049391355288339, + "loss": 0.1938, + "step": 51070 + }, + { + "epoch": 2.12, + "grad_norm": 0.396484375, + "learning_rate": 0.0004939111741451653, + "loss": 0.185, + "step": 51080 + }, + { + "epoch": 2.12, + "grad_norm": 0.9765625, + "learning_rate": 0.000493908794947926, + "loss": 0.1962, + "step": 51090 + }, + { + "epoch": 2.12, + "grad_norm": 0.65234375, + "learning_rate": 0.0004939064152916767, + "loss": 0.2588, + "step": 51100 + }, + { + "epoch": 2.12, + "grad_norm": 0.68359375, + "learning_rate": 0.0004939040351764219, + "loss": 0.2455, + "step": 51110 + }, + { + "epoch": 2.12, + "grad_norm": 0.1484375, + "learning_rate": 0.0004939016546021659, + "loss": 0.2357, + "step": 51120 + }, + { + "epoch": 2.12, + "grad_norm": 0.400390625, + "learning_rate": 0.0004938992735689135, + "loss": 0.219, + "step": 51130 + }, + { + "epoch": 2.12, + "grad_norm": 0.462890625, + "learning_rate": 0.000493896892076669, + "loss": 0.2225, + "step": 51140 + }, + { + "epoch": 2.12, + "grad_norm": 0.80078125, + "learning_rate": 0.0004938945101254368, + "loss": 0.1868, + "step": 51150 + }, + { + "epoch": 2.12, + "grad_norm": 0.3515625, + "learning_rate": 0.0004938921277152215, + "loss": 0.2105, + "step": 51160 + }, + { + "epoch": 2.12, + "grad_norm": 0.06640625, + "learning_rate": 0.0004938897448460276, + "loss": 0.2073, + "step": 51170 + }, + { + "epoch": 2.12, + "grad_norm": 0.625, + "learning_rate": 0.0004938873615178595, + "loss": 0.1916, + "step": 51180 + }, + { + "epoch": 2.12, + "grad_norm": 0.57421875, + "learning_rate": 0.0004938849777307217, + "loss": 0.2396, + "step": 51190 + }, + { + "epoch": 2.12, + "grad_norm": 0.419921875, + "learning_rate": 0.0004938825934846187, + "loss": 0.2257, + "step": 51200 + }, + { + "epoch": 2.12, + "grad_norm": 0.7734375, + "learning_rate": 0.0004938802087795551, + "loss": 0.2047, + "step": 51210 + }, + { + "epoch": 2.12, + "grad_norm": 1.4296875, + "learning_rate": 0.0004938778236155351, + "loss": 0.2799, + "step": 51220 + }, + { + "epoch": 2.12, + "grad_norm": 0.47265625, + "learning_rate": 0.0004938754379925636, + "loss": 0.1998, + "step": 51230 + }, + { + "epoch": 2.12, + "grad_norm": 0.498046875, + "learning_rate": 0.0004938730519106447, + "loss": 0.2026, + "step": 51240 + }, + { + "epoch": 2.12, + "grad_norm": 0.7734375, + "learning_rate": 0.0004938706653697831, + "loss": 0.2021, + "step": 51250 + }, + { + "epoch": 2.12, + "grad_norm": 2.640625, + "learning_rate": 0.0004938682783699833, + "loss": 0.1942, + "step": 51260 + }, + { + "epoch": 2.12, + "grad_norm": 1.46875, + "learning_rate": 0.0004938658909112496, + "loss": 0.197, + "step": 51270 + }, + { + "epoch": 2.12, + "grad_norm": 1.296875, + "learning_rate": 0.0004938635029935868, + "loss": 0.2361, + "step": 51280 + }, + { + "epoch": 2.12, + "grad_norm": 0.609375, + "learning_rate": 0.0004938611146169991, + "loss": 0.2649, + "step": 51290 + }, + { + "epoch": 2.12, + "grad_norm": 0.66796875, + "learning_rate": 0.0004938587257814911, + "loss": 0.2165, + "step": 51300 + }, + { + "epoch": 2.13, + "grad_norm": 0.78515625, + "learning_rate": 0.0004938563364870674, + "loss": 0.1958, + "step": 51310 + }, + { + "epoch": 2.13, + "grad_norm": 0.7890625, + "learning_rate": 0.0004938539467337324, + "loss": 0.2201, + "step": 51320 + }, + { + "epoch": 2.13, + "grad_norm": 0.5625, + "learning_rate": 0.0004938515565214905, + "loss": 0.1693, + "step": 51330 + }, + { + "epoch": 2.13, + "grad_norm": 1.3515625, + "learning_rate": 0.0004938491658503465, + "loss": 0.1595, + "step": 51340 + }, + { + "epoch": 2.13, + "grad_norm": 0.703125, + "learning_rate": 0.0004938467747203044, + "loss": 0.2336, + "step": 51350 + }, + { + "epoch": 2.13, + "grad_norm": 0.0, + "learning_rate": 0.0004938443831313693, + "loss": 0.2326, + "step": 51360 + }, + { + "epoch": 2.13, + "grad_norm": 0.5390625, + "learning_rate": 0.0004938419910835453, + "loss": 0.2013, + "step": 51370 + }, + { + "epoch": 2.13, + "grad_norm": 0.439453125, + "learning_rate": 0.000493839598576837, + "loss": 0.2684, + "step": 51380 + }, + { + "epoch": 2.13, + "grad_norm": 0.5859375, + "learning_rate": 0.0004938372056112488, + "loss": 0.2052, + "step": 51390 + }, + { + "epoch": 2.13, + "grad_norm": 0.63671875, + "learning_rate": 0.0004938348121867855, + "loss": 0.2178, + "step": 51400 + }, + { + "epoch": 2.13, + "grad_norm": 0.91796875, + "learning_rate": 0.0004938324183034514, + "loss": 0.217, + "step": 51410 + }, + { + "epoch": 2.13, + "grad_norm": 0.412109375, + "learning_rate": 0.0004938300239612509, + "loss": 0.2475, + "step": 51420 + }, + { + "epoch": 2.13, + "grad_norm": 0.62890625, + "learning_rate": 0.0004938276291601886, + "loss": 0.2199, + "step": 51430 + }, + { + "epoch": 2.13, + "grad_norm": 0.8671875, + "learning_rate": 0.0004938252339002691, + "loss": 0.2204, + "step": 51440 + }, + { + "epoch": 2.13, + "grad_norm": 0.3515625, + "learning_rate": 0.0004938228381814969, + "loss": 0.2056, + "step": 51450 + }, + { + "epoch": 2.13, + "grad_norm": 0.84375, + "learning_rate": 0.0004938204420038764, + "loss": 0.1846, + "step": 51460 + }, + { + "epoch": 2.13, + "grad_norm": 0.46875, + "learning_rate": 0.0004938180453674121, + "loss": 0.187, + "step": 51470 + }, + { + "epoch": 2.13, + "grad_norm": 0.44140625, + "learning_rate": 0.0004938156482721087, + "loss": 0.2049, + "step": 51480 + }, + { + "epoch": 2.13, + "grad_norm": 0.58984375, + "learning_rate": 0.0004938132507179704, + "loss": 0.192, + "step": 51490 + }, + { + "epoch": 2.13, + "grad_norm": 0.62109375, + "learning_rate": 0.0004938108527050021, + "loss": 0.2525, + "step": 51500 + }, + { + "epoch": 2.13, + "grad_norm": 1.3203125, + "learning_rate": 0.000493808454233208, + "loss": 0.2667, + "step": 51510 + }, + { + "epoch": 2.13, + "grad_norm": 0.50390625, + "learning_rate": 0.0004938060553025928, + "loss": 0.2005, + "step": 51520 + }, + { + "epoch": 2.13, + "grad_norm": 0.375, + "learning_rate": 0.0004938036559131608, + "loss": 0.2362, + "step": 51530 + }, + { + "epoch": 2.13, + "grad_norm": 0.4921875, + "learning_rate": 0.0004938012560649167, + "loss": 0.1669, + "step": 51540 + }, + { + "epoch": 2.14, + "grad_norm": 0.380859375, + "learning_rate": 0.0004937988557578649, + "loss": 0.2429, + "step": 51550 + }, + { + "epoch": 2.14, + "grad_norm": 3.5, + "learning_rate": 0.0004937964549920101, + "loss": 0.2225, + "step": 51560 + }, + { + "epoch": 2.14, + "grad_norm": 1.578125, + "learning_rate": 0.0004937940537673567, + "loss": 0.2159, + "step": 51570 + }, + { + "epoch": 2.14, + "grad_norm": 0.51171875, + "learning_rate": 0.000493791652083909, + "loss": 0.2891, + "step": 51580 + }, + { + "epoch": 2.14, + "grad_norm": 0.92578125, + "learning_rate": 0.000493789249941672, + "loss": 0.2143, + "step": 51590 + }, + { + "epoch": 2.14, + "grad_norm": 0.66015625, + "learning_rate": 0.0004937868473406499, + "loss": 0.2109, + "step": 51600 + }, + { + "epoch": 2.14, + "grad_norm": 0.51171875, + "learning_rate": 0.0004937844442808473, + "loss": 0.2093, + "step": 51610 + }, + { + "epoch": 2.14, + "grad_norm": 0.6484375, + "learning_rate": 0.0004937820407622685, + "loss": 0.2806, + "step": 51620 + }, + { + "epoch": 2.14, + "grad_norm": 0.2177734375, + "learning_rate": 0.0004937796367849185, + "loss": 0.2598, + "step": 51630 + }, + { + "epoch": 2.14, + "grad_norm": 0.890625, + "learning_rate": 0.0004937772323488014, + "loss": 0.2085, + "step": 51640 + }, + { + "epoch": 2.14, + "grad_norm": 0.255859375, + "learning_rate": 0.0004937748274539218, + "loss": 0.1986, + "step": 51650 + }, + { + "epoch": 2.14, + "grad_norm": 0.94921875, + "learning_rate": 0.0004937724221002844, + "loss": 0.1783, + "step": 51660 + }, + { + "epoch": 2.14, + "grad_norm": 1.9296875, + "learning_rate": 0.0004937700162878936, + "loss": 0.2158, + "step": 51670 + }, + { + "epoch": 2.14, + "grad_norm": 1.21875, + "learning_rate": 0.000493767610016754, + "loss": 0.216, + "step": 51680 + }, + { + "epoch": 2.14, + "grad_norm": 1.671875, + "learning_rate": 0.00049376520328687, + "loss": 0.2081, + "step": 51690 + }, + { + "epoch": 2.14, + "grad_norm": 0.53125, + "learning_rate": 0.0004937627960982464, + "loss": 0.2159, + "step": 51700 + }, + { + "epoch": 2.14, + "grad_norm": 0.51171875, + "learning_rate": 0.0004937603884508873, + "loss": 0.1636, + "step": 51710 + }, + { + "epoch": 2.14, + "grad_norm": 0.470703125, + "learning_rate": 0.0004937579803447976, + "loss": 0.1954, + "step": 51720 + }, + { + "epoch": 2.14, + "grad_norm": 0.83203125, + "learning_rate": 0.0004937555717799818, + "loss": 0.2322, + "step": 51730 + }, + { + "epoch": 2.14, + "grad_norm": 0.3203125, + "learning_rate": 0.0004937531627564442, + "loss": 0.2352, + "step": 51740 + }, + { + "epoch": 2.14, + "grad_norm": 0.5390625, + "learning_rate": 0.0004937507532741895, + "loss": 0.2329, + "step": 51750 + }, + { + "epoch": 2.14, + "grad_norm": 0.703125, + "learning_rate": 0.0004937483433332221, + "loss": 0.2117, + "step": 51760 + }, + { + "epoch": 2.14, + "grad_norm": 0.224609375, + "learning_rate": 0.0004937459329335468, + "loss": 0.189, + "step": 51770 + }, + { + "epoch": 2.14, + "grad_norm": 0.44140625, + "learning_rate": 0.000493743522075168, + "loss": 0.1761, + "step": 51780 + }, + { + "epoch": 2.15, + "grad_norm": 1.2265625, + "learning_rate": 0.00049374111075809, + "loss": 0.2219, + "step": 51790 + }, + { + "epoch": 2.15, + "grad_norm": 0.71484375, + "learning_rate": 0.0004937386989823177, + "loss": 0.1722, + "step": 51800 + }, + { + "epoch": 2.15, + "grad_norm": 0.63671875, + "learning_rate": 0.0004937362867478555, + "loss": 0.2382, + "step": 51810 + }, + { + "epoch": 2.15, + "grad_norm": 0.251953125, + "learning_rate": 0.000493733874054708, + "loss": 0.2264, + "step": 51820 + }, + { + "epoch": 2.15, + "grad_norm": 0.79296875, + "learning_rate": 0.0004937314609028794, + "loss": 0.1929, + "step": 51830 + }, + { + "epoch": 2.15, + "grad_norm": 0.99609375, + "learning_rate": 0.0004937290472923748, + "loss": 0.2439, + "step": 51840 + }, + { + "epoch": 2.15, + "grad_norm": 0.859375, + "learning_rate": 0.0004937266332231984, + "loss": 0.1988, + "step": 51850 + }, + { + "epoch": 2.15, + "grad_norm": 0.0, + "learning_rate": 0.0004937242186953547, + "loss": 0.2332, + "step": 51860 + }, + { + "epoch": 2.15, + "grad_norm": 0.486328125, + "learning_rate": 0.0004937218037088483, + "loss": 0.2383, + "step": 51870 + }, + { + "epoch": 2.15, + "grad_norm": 0.8671875, + "learning_rate": 0.0004937193882636839, + "loss": 0.226, + "step": 51880 + }, + { + "epoch": 2.15, + "grad_norm": 0.6171875, + "learning_rate": 0.0004937169723598659, + "loss": 0.2118, + "step": 51890 + }, + { + "epoch": 2.15, + "grad_norm": 0.4921875, + "learning_rate": 0.0004937145559973988, + "loss": 0.2322, + "step": 51900 + }, + { + "epoch": 2.15, + "grad_norm": 0.9609375, + "learning_rate": 0.0004937121391762873, + "loss": 0.292, + "step": 51910 + }, + { + "epoch": 2.15, + "grad_norm": 0.66015625, + "learning_rate": 0.0004937097218965359, + "loss": 0.188, + "step": 51920 + }, + { + "epoch": 2.15, + "grad_norm": 0.9375, + "learning_rate": 0.000493707304158149, + "loss": 0.2438, + "step": 51930 + }, + { + "epoch": 2.15, + "grad_norm": 0.431640625, + "learning_rate": 0.0004937048859611314, + "loss": 0.2257, + "step": 51940 + }, + { + "epoch": 2.15, + "grad_norm": 0.427734375, + "learning_rate": 0.0004937024673054875, + "loss": 0.2226, + "step": 51950 + }, + { + "epoch": 2.15, + "grad_norm": 0.65234375, + "learning_rate": 0.0004937000481912219, + "loss": 0.2707, + "step": 51960 + }, + { + "epoch": 2.15, + "grad_norm": 0.5625, + "learning_rate": 0.000493697628618339, + "loss": 0.2393, + "step": 51970 + }, + { + "epoch": 2.15, + "grad_norm": 0.546875, + "learning_rate": 0.0004936952085868436, + "loss": 0.2316, + "step": 51980 + }, + { + "epoch": 2.15, + "grad_norm": 0.734375, + "learning_rate": 0.0004936927880967401, + "loss": 0.2438, + "step": 51990 + }, + { + "epoch": 2.15, + "grad_norm": 1.953125, + "learning_rate": 0.0004936903671480331, + "loss": 0.2075, + "step": 52000 + }, + { + "epoch": 2.15, + "grad_norm": 0.71484375, + "learning_rate": 0.0004936879457407271, + "loss": 0.2527, + "step": 52010 + }, + { + "epoch": 2.15, + "grad_norm": 2.65625, + "learning_rate": 0.0004936855238748268, + "loss": 0.2141, + "step": 52020 + }, + { + "epoch": 2.16, + "grad_norm": 0.482421875, + "learning_rate": 0.0004936831015503365, + "loss": 0.1975, + "step": 52030 + }, + { + "epoch": 2.16, + "grad_norm": 1.8671875, + "learning_rate": 0.0004936806787672611, + "loss": 0.2369, + "step": 52040 + }, + { + "epoch": 2.16, + "grad_norm": 1.1171875, + "learning_rate": 0.0004936782555256049, + "loss": 0.2259, + "step": 52050 + }, + { + "epoch": 2.16, + "grad_norm": 0.39453125, + "learning_rate": 0.0004936758318253724, + "loss": 0.2333, + "step": 52060 + }, + { + "epoch": 2.16, + "grad_norm": 0.58984375, + "learning_rate": 0.0004936734076665685, + "loss": 0.2, + "step": 52070 + }, + { + "epoch": 2.16, + "grad_norm": 1.0546875, + "learning_rate": 0.0004936709830491975, + "loss": 0.2772, + "step": 52080 + }, + { + "epoch": 2.16, + "grad_norm": 1.703125, + "learning_rate": 0.0004936685579732639, + "loss": 0.279, + "step": 52090 + }, + { + "epoch": 2.16, + "grad_norm": 0.53515625, + "learning_rate": 0.0004936661324387725, + "loss": 0.2392, + "step": 52100 + }, + { + "epoch": 2.16, + "grad_norm": 1.0859375, + "learning_rate": 0.0004936637064457277, + "loss": 0.1893, + "step": 52110 + }, + { + "epoch": 2.16, + "grad_norm": 0.38671875, + "learning_rate": 0.0004936612799941343, + "loss": 0.1837, + "step": 52120 + }, + { + "epoch": 2.16, + "grad_norm": 0.79296875, + "learning_rate": 0.0004936588530839966, + "loss": 0.2011, + "step": 52130 + }, + { + "epoch": 2.16, + "grad_norm": 0.57421875, + "learning_rate": 0.0004936564257153192, + "loss": 0.2146, + "step": 52140 + }, + { + "epoch": 2.16, + "grad_norm": 0.703125, + "learning_rate": 0.0004936539978881067, + "loss": 0.2835, + "step": 52150 + }, + { + "epoch": 2.16, + "grad_norm": 0.416015625, + "learning_rate": 0.0004936515696023637, + "loss": 0.1877, + "step": 52160 + }, + { + "epoch": 2.16, + "grad_norm": 1.640625, + "learning_rate": 0.0004936491408580948, + "loss": 0.2226, + "step": 52170 + }, + { + "epoch": 2.16, + "grad_norm": 0.58984375, + "learning_rate": 0.0004936467116553045, + "loss": 0.1934, + "step": 52180 + }, + { + "epoch": 2.16, + "grad_norm": 0.353515625, + "learning_rate": 0.0004936442819939974, + "loss": 0.2327, + "step": 52190 + }, + { + "epoch": 2.16, + "grad_norm": 0.46875, + "learning_rate": 0.000493641851874178, + "loss": 0.2384, + "step": 52200 + }, + { + "epoch": 2.16, + "grad_norm": 0.000782012939453125, + "learning_rate": 0.000493639421295851, + "loss": 0.2103, + "step": 52210 + }, + { + "epoch": 2.16, + "grad_norm": 0.75, + "learning_rate": 0.0004936369902590209, + "loss": 0.2105, + "step": 52220 + }, + { + "epoch": 2.16, + "grad_norm": 0.67578125, + "learning_rate": 0.0004936345587636924, + "loss": 0.2159, + "step": 52230 + }, + { + "epoch": 2.16, + "grad_norm": 0.421875, + "learning_rate": 0.0004936321268098699, + "loss": 0.247, + "step": 52240 + }, + { + "epoch": 2.16, + "grad_norm": 1.25, + "learning_rate": 0.0004936296943975581, + "loss": 0.2472, + "step": 52250 + }, + { + "epoch": 2.16, + "grad_norm": 1.1875, + "learning_rate": 0.0004936272615267614, + "loss": 0.2329, + "step": 52260 + }, + { + "epoch": 2.17, + "grad_norm": 0.875, + "learning_rate": 0.0004936248281974846, + "loss": 0.2449, + "step": 52270 + }, + { + "epoch": 2.17, + "grad_norm": 2.0, + "learning_rate": 0.0004936223944097322, + "loss": 0.2098, + "step": 52280 + }, + { + "epoch": 2.17, + "grad_norm": 0.466796875, + "learning_rate": 0.0004936199601635086, + "loss": 0.2079, + "step": 52290 + }, + { + "epoch": 2.17, + "grad_norm": 0.271484375, + "learning_rate": 0.0004936175254588187, + "loss": 0.1866, + "step": 52300 + }, + { + "epoch": 2.17, + "grad_norm": 1.03125, + "learning_rate": 0.000493615090295667, + "loss": 0.2115, + "step": 52310 + }, + { + "epoch": 2.17, + "grad_norm": 0.4296875, + "learning_rate": 0.0004936126546740578, + "loss": 0.1771, + "step": 52320 + }, + { + "epoch": 2.17, + "grad_norm": 0.384765625, + "learning_rate": 0.000493610218593996, + "loss": 0.2378, + "step": 52330 + }, + { + "epoch": 2.17, + "grad_norm": 1.9296875, + "learning_rate": 0.0004936077820554861, + "loss": 0.2273, + "step": 52340 + }, + { + "epoch": 2.17, + "grad_norm": 0.93359375, + "learning_rate": 0.0004936053450585327, + "loss": 0.262, + "step": 52350 + }, + { + "epoch": 2.17, + "grad_norm": 2.5625, + "learning_rate": 0.0004936029076031402, + "loss": 0.2231, + "step": 52360 + }, + { + "epoch": 2.17, + "grad_norm": 0.9453125, + "learning_rate": 0.0004936004696893134, + "loss": 0.2332, + "step": 52370 + }, + { + "epoch": 2.17, + "grad_norm": 0.357421875, + "learning_rate": 0.0004935980313170568, + "loss": 0.2063, + "step": 52380 + }, + { + "epoch": 2.17, + "grad_norm": 0.6953125, + "learning_rate": 0.0004935955924863751, + "loss": 0.1799, + "step": 52390 + }, + { + "epoch": 2.17, + "grad_norm": 1.0390625, + "learning_rate": 0.0004935931531972728, + "loss": 0.2429, + "step": 52400 + }, + { + "epoch": 2.17, + "grad_norm": 1.7734375, + "learning_rate": 0.0004935907134497544, + "loss": 0.1863, + "step": 52410 + }, + { + "epoch": 2.17, + "grad_norm": 0.5625, + "learning_rate": 0.0004935882732438246, + "loss": 0.1884, + "step": 52420 + }, + { + "epoch": 2.17, + "grad_norm": 0.32421875, + "learning_rate": 0.000493585832579488, + "loss": 0.2377, + "step": 52430 + }, + { + "epoch": 2.17, + "grad_norm": 0.453125, + "learning_rate": 0.0004935833914567492, + "loss": 0.2394, + "step": 52440 + }, + { + "epoch": 2.17, + "grad_norm": 0.6640625, + "learning_rate": 0.0004935809498756127, + "loss": 0.2241, + "step": 52450 + }, + { + "epoch": 2.17, + "grad_norm": 0.65625, + "learning_rate": 0.0004935785078360832, + "loss": 0.2453, + "step": 52460 + }, + { + "epoch": 2.17, + "grad_norm": 0.3359375, + "learning_rate": 0.0004935760653381652, + "loss": 0.2557, + "step": 52470 + }, + { + "epoch": 2.17, + "grad_norm": 0.796875, + "learning_rate": 0.0004935736223818635, + "loss": 0.2323, + "step": 52480 + }, + { + "epoch": 2.17, + "grad_norm": 0.55859375, + "learning_rate": 0.0004935711789671824, + "loss": 0.1788, + "step": 52490 + }, + { + "epoch": 2.17, + "grad_norm": 0.66796875, + "learning_rate": 0.0004935687350941266, + "loss": 0.2426, + "step": 52500 + }, + { + "epoch": 2.17, + "grad_norm": 2.046875, + "learning_rate": 0.0004935662907627009, + "loss": 0.2449, + "step": 52510 + }, + { + "epoch": 2.18, + "grad_norm": 0.68359375, + "learning_rate": 0.0004935638459729096, + "loss": 0.2872, + "step": 52520 + }, + { + "epoch": 2.18, + "grad_norm": 1.046875, + "learning_rate": 0.0004935614007247575, + "loss": 0.2279, + "step": 52530 + }, + { + "epoch": 2.18, + "grad_norm": 1.6875, + "learning_rate": 0.0004935589550182492, + "loss": 0.2228, + "step": 52540 + }, + { + "epoch": 2.18, + "grad_norm": 0.27734375, + "learning_rate": 0.0004935565088533893, + "loss": 0.2599, + "step": 52550 + }, + { + "epoch": 2.18, + "grad_norm": 0.984375, + "learning_rate": 0.0004935540622301822, + "loss": 0.1946, + "step": 52560 + }, + { + "epoch": 2.18, + "grad_norm": 0.482421875, + "learning_rate": 0.0004935516151486327, + "loss": 0.2527, + "step": 52570 + }, + { + "epoch": 2.18, + "grad_norm": 0.80078125, + "learning_rate": 0.0004935491676087454, + "loss": 0.2165, + "step": 52580 + }, + { + "epoch": 2.18, + "grad_norm": 0.75390625, + "learning_rate": 0.0004935467196105248, + "loss": 0.2125, + "step": 52590 + }, + { + "epoch": 2.18, + "grad_norm": 0.46875, + "learning_rate": 0.0004935442711539756, + "loss": 0.25, + "step": 52600 + }, + { + "epoch": 2.18, + "grad_norm": 1.0390625, + "learning_rate": 0.0004935418222391023, + "loss": 0.2295, + "step": 52610 + }, + { + "epoch": 2.18, + "grad_norm": 0.328125, + "learning_rate": 0.0004935393728659098, + "loss": 0.243, + "step": 52620 + }, + { + "epoch": 2.18, + "grad_norm": 0.828125, + "learning_rate": 0.0004935369230344023, + "loss": 0.1958, + "step": 52630 + }, + { + "epoch": 2.18, + "grad_norm": 0.84375, + "learning_rate": 0.0004935344727445847, + "loss": 0.2024, + "step": 52640 + }, + { + "epoch": 2.18, + "grad_norm": 0.9765625, + "learning_rate": 0.0004935320219964616, + "loss": 0.2428, + "step": 52650 + }, + { + "epoch": 2.18, + "grad_norm": 0.71875, + "learning_rate": 0.0004935295707900375, + "loss": 0.1989, + "step": 52660 + }, + { + "epoch": 2.18, + "grad_norm": 0.65625, + "learning_rate": 0.0004935271191253169, + "loss": 0.1972, + "step": 52670 + }, + { + "epoch": 2.18, + "grad_norm": 0.74609375, + "learning_rate": 0.0004935246670023047, + "loss": 0.1945, + "step": 52680 + }, + { + "epoch": 2.18, + "grad_norm": 0.68359375, + "learning_rate": 0.0004935222144210053, + "loss": 0.2465, + "step": 52690 + }, + { + "epoch": 2.18, + "grad_norm": 0.734375, + "learning_rate": 0.0004935197613814235, + "loss": 0.191, + "step": 52700 + }, + { + "epoch": 2.18, + "grad_norm": 0.9765625, + "learning_rate": 0.0004935173078835637, + "loss": 0.236, + "step": 52710 + }, + { + "epoch": 2.18, + "grad_norm": 0.7734375, + "learning_rate": 0.0004935148539274306, + "loss": 0.2231, + "step": 52720 + }, + { + "epoch": 2.18, + "grad_norm": 0.75, + "learning_rate": 0.000493512399513029, + "loss": 0.1603, + "step": 52730 + }, + { + "epoch": 2.18, + "grad_norm": 0.57421875, + "learning_rate": 0.0004935099446403633, + "loss": 0.2145, + "step": 52740 + }, + { + "epoch": 2.18, + "grad_norm": 0.625, + "learning_rate": 0.000493507489309438, + "loss": 0.2159, + "step": 52750 + }, + { + "epoch": 2.19, + "grad_norm": 0.77734375, + "learning_rate": 0.0004935050335202581, + "loss": 0.2648, + "step": 52760 + }, + { + "epoch": 2.19, + "grad_norm": 1.0703125, + "learning_rate": 0.000493502577272828, + "loss": 0.2378, + "step": 52770 + }, + { + "epoch": 2.19, + "grad_norm": 0.0, + "learning_rate": 0.0004935001205671523, + "loss": 0.2466, + "step": 52780 + }, + { + "epoch": 2.19, + "grad_norm": 0.7734375, + "learning_rate": 0.0004934976634032356, + "loss": 0.2499, + "step": 52790 + }, + { + "epoch": 2.19, + "grad_norm": 1.984375, + "learning_rate": 0.0004934952057810828, + "loss": 0.2381, + "step": 52800 + }, + { + "epoch": 2.19, + "grad_norm": 1.0234375, + "learning_rate": 0.0004934927477006982, + "loss": 0.2509, + "step": 52810 + }, + { + "epoch": 2.19, + "grad_norm": 0.546875, + "learning_rate": 0.0004934902891620865, + "loss": 0.216, + "step": 52820 + }, + { + "epoch": 2.19, + "grad_norm": 0.66015625, + "learning_rate": 0.0004934878301652524, + "loss": 0.244, + "step": 52830 + }, + { + "epoch": 2.19, + "grad_norm": 1.0703125, + "learning_rate": 0.0004934853707102005, + "loss": 0.2027, + "step": 52840 + }, + { + "epoch": 2.19, + "grad_norm": 0.66796875, + "learning_rate": 0.0004934829107969354, + "loss": 0.2442, + "step": 52850 + }, + { + "epoch": 2.19, + "grad_norm": 0.6875, + "learning_rate": 0.0004934804504254618, + "loss": 0.2399, + "step": 52860 + }, + { + "epoch": 2.19, + "grad_norm": 0.3828125, + "learning_rate": 0.0004934779895957843, + "loss": 0.2186, + "step": 52870 + }, + { + "epoch": 2.19, + "grad_norm": 1.40625, + "learning_rate": 0.0004934755283079075, + "loss": 0.2634, + "step": 52880 + }, + { + "epoch": 2.19, + "grad_norm": 0.49609375, + "learning_rate": 0.000493473066561836, + "loss": 0.2261, + "step": 52890 + }, + { + "epoch": 2.19, + "grad_norm": 0.640625, + "learning_rate": 0.0004934706043575745, + "loss": 0.2478, + "step": 52900 + }, + { + "epoch": 2.19, + "grad_norm": 0.310546875, + "learning_rate": 0.0004934681416951277, + "loss": 0.2409, + "step": 52910 + }, + { + "epoch": 2.19, + "grad_norm": 0.62890625, + "learning_rate": 0.0004934656785745001, + "loss": 0.2494, + "step": 52920 + }, + { + "epoch": 2.19, + "grad_norm": 0.28515625, + "learning_rate": 0.0004934632149956964, + "loss": 0.204, + "step": 52930 + }, + { + "epoch": 2.19, + "grad_norm": 2.5625, + "learning_rate": 0.0004934607509587211, + "loss": 0.2215, + "step": 52940 + }, + { + "epoch": 2.19, + "grad_norm": 0.828125, + "learning_rate": 0.000493458286463579, + "loss": 0.2181, + "step": 52950 + }, + { + "epoch": 2.19, + "grad_norm": 0.828125, + "learning_rate": 0.0004934558215102748, + "loss": 0.2319, + "step": 52960 + }, + { + "epoch": 2.19, + "grad_norm": 0.6328125, + "learning_rate": 0.000493453356098813, + "loss": 0.1908, + "step": 52970 + }, + { + "epoch": 2.19, + "grad_norm": 0.3125, + "learning_rate": 0.0004934508902291983, + "loss": 0.2263, + "step": 52980 + }, + { + "epoch": 2.19, + "grad_norm": 0.55078125, + "learning_rate": 0.0004934484239014353, + "loss": 0.1992, + "step": 52990 + }, + { + "epoch": 2.2, + "grad_norm": 0.98046875, + "learning_rate": 0.0004934459571155286, + "loss": 0.1669, + "step": 53000 + }, + { + "epoch": 2.2, + "grad_norm": 0.53125, + "learning_rate": 0.0004934434898714829, + "loss": 0.2086, + "step": 53010 + }, + { + "epoch": 2.2, + "grad_norm": 0.77734375, + "learning_rate": 0.0004934410221693029, + "loss": 0.2099, + "step": 53020 + }, + { + "epoch": 2.2, + "grad_norm": 0.8671875, + "learning_rate": 0.0004934385540089931, + "loss": 0.2445, + "step": 53030 + }, + { + "epoch": 2.2, + "grad_norm": 0.796875, + "learning_rate": 0.0004934360853905583, + "loss": 0.2327, + "step": 53040 + }, + { + "epoch": 2.2, + "grad_norm": 0.92578125, + "learning_rate": 0.000493433616314003, + "loss": 0.2454, + "step": 53050 + }, + { + "epoch": 2.2, + "grad_norm": 0.4453125, + "learning_rate": 0.000493431146779332, + "loss": 0.2974, + "step": 53060 + }, + { + "epoch": 2.2, + "grad_norm": 0.40625, + "learning_rate": 0.0004934286767865499, + "loss": 0.1703, + "step": 53070 + }, + { + "epoch": 2.2, + "grad_norm": 0.92578125, + "learning_rate": 0.0004934262063356613, + "loss": 0.2196, + "step": 53080 + }, + { + "epoch": 2.2, + "grad_norm": 0.640625, + "learning_rate": 0.0004934237354266708, + "loss": 0.2456, + "step": 53090 + }, + { + "epoch": 2.2, + "grad_norm": 0.765625, + "learning_rate": 0.0004934212640595832, + "loss": 0.2432, + "step": 53100 + }, + { + "epoch": 2.2, + "grad_norm": 0.8671875, + "learning_rate": 0.000493418792234403, + "loss": 0.2196, + "step": 53110 + }, + { + "epoch": 2.2, + "grad_norm": 0.6484375, + "learning_rate": 0.0004934163199511349, + "loss": 0.18, + "step": 53120 + }, + { + "epoch": 2.2, + "grad_norm": 0.9296875, + "learning_rate": 0.0004934138472097836, + "loss": 0.2279, + "step": 53130 + }, + { + "epoch": 2.2, + "grad_norm": 0.60546875, + "learning_rate": 0.0004934113740103537, + "loss": 0.2316, + "step": 53140 + }, + { + "epoch": 2.2, + "grad_norm": 0.451171875, + "learning_rate": 0.00049340890035285, + "loss": 0.171, + "step": 53150 + }, + { + "epoch": 2.2, + "grad_norm": 1.5390625, + "learning_rate": 0.000493406426237277, + "loss": 0.283, + "step": 53160 + }, + { + "epoch": 2.2, + "grad_norm": 0.30078125, + "learning_rate": 0.0004934039516636393, + "loss": 0.1886, + "step": 53170 + }, + { + "epoch": 2.2, + "grad_norm": 1.1640625, + "learning_rate": 0.0004934014766319417, + "loss": 0.1802, + "step": 53180 + }, + { + "epoch": 2.2, + "grad_norm": 2.65625, + "learning_rate": 0.0004933990011421888, + "loss": 0.2677, + "step": 53190 + }, + { + "epoch": 2.2, + "grad_norm": 0.90625, + "learning_rate": 0.0004933965251943853, + "loss": 0.2344, + "step": 53200 + }, + { + "epoch": 2.2, + "grad_norm": 0.427734375, + "learning_rate": 0.0004933940487885358, + "loss": 0.2491, + "step": 53210 + }, + { + "epoch": 2.2, + "grad_norm": 0.96484375, + "learning_rate": 0.0004933915719246449, + "loss": 0.2234, + "step": 53220 + }, + { + "epoch": 2.2, + "grad_norm": 0.6796875, + "learning_rate": 0.0004933890946027175, + "loss": 0.224, + "step": 53230 + }, + { + "epoch": 2.21, + "grad_norm": 0.72265625, + "learning_rate": 0.000493386616822758, + "loss": 0.195, + "step": 53240 + }, + { + "epoch": 2.21, + "grad_norm": 1.0390625, + "learning_rate": 0.0004933841385847712, + "loss": 0.2243, + "step": 53250 + }, + { + "epoch": 2.21, + "grad_norm": 0.37890625, + "learning_rate": 0.0004933816598887617, + "loss": 0.1782, + "step": 53260 + }, + { + "epoch": 2.21, + "grad_norm": 0.80859375, + "learning_rate": 0.0004933791807347342, + "loss": 0.2563, + "step": 53270 + }, + { + "epoch": 2.21, + "grad_norm": 0.6875, + "learning_rate": 0.0004933767011226934, + "loss": 0.2576, + "step": 53280 + }, + { + "epoch": 2.21, + "grad_norm": 0.486328125, + "learning_rate": 0.0004933742210526439, + "loss": 0.1895, + "step": 53290 + }, + { + "epoch": 2.21, + "grad_norm": 0.31640625, + "learning_rate": 0.0004933717405245906, + "loss": 0.2082, + "step": 53300 + }, + { + "epoch": 2.21, + "grad_norm": 1.65625, + "learning_rate": 0.0004933692595385377, + "loss": 0.2459, + "step": 53310 + }, + { + "epoch": 2.21, + "grad_norm": 0.76953125, + "learning_rate": 0.0004933667780944903, + "loss": 0.2862, + "step": 53320 + }, + { + "epoch": 2.21, + "grad_norm": 1.1015625, + "learning_rate": 0.0004933642961924528, + "loss": 0.2437, + "step": 53330 + }, + { + "epoch": 2.21, + "grad_norm": 0.80078125, + "learning_rate": 0.0004933618138324299, + "loss": 0.2214, + "step": 53340 + }, + { + "epoch": 2.21, + "grad_norm": 0.6796875, + "learning_rate": 0.0004933593310144266, + "loss": 0.24, + "step": 53350 + }, + { + "epoch": 2.21, + "grad_norm": 0.59765625, + "learning_rate": 0.0004933568477384472, + "loss": 0.2496, + "step": 53360 + }, + { + "epoch": 2.21, + "grad_norm": 0.55859375, + "learning_rate": 0.0004933543640044964, + "loss": 0.2342, + "step": 53370 + }, + { + "epoch": 2.21, + "grad_norm": 0.7578125, + "learning_rate": 0.000493351879812579, + "loss": 0.1812, + "step": 53380 + }, + { + "epoch": 2.21, + "grad_norm": 0.625, + "learning_rate": 0.0004933493951626997, + "loss": 0.182, + "step": 53390 + }, + { + "epoch": 2.21, + "grad_norm": 1.6796875, + "learning_rate": 0.0004933469100548631, + "loss": 0.2348, + "step": 53400 + }, + { + "epoch": 2.21, + "grad_norm": 0.921875, + "learning_rate": 0.0004933444244890739, + "loss": 0.1832, + "step": 53410 + }, + { + "epoch": 2.21, + "grad_norm": 0.3203125, + "learning_rate": 0.0004933419384653368, + "loss": 0.2133, + "step": 53420 + }, + { + "epoch": 2.21, + "grad_norm": 0.51953125, + "learning_rate": 0.0004933394519836564, + "loss": 0.1934, + "step": 53430 + }, + { + "epoch": 2.21, + "grad_norm": 0.61328125, + "learning_rate": 0.0004933369650440374, + "loss": 0.2786, + "step": 53440 + }, + { + "epoch": 2.21, + "grad_norm": 0.419921875, + "learning_rate": 0.0004933344776464845, + "loss": 0.2363, + "step": 53450 + }, + { + "epoch": 2.21, + "grad_norm": 0.2734375, + "learning_rate": 0.0004933319897910025, + "loss": 0.2469, + "step": 53460 + }, + { + "epoch": 2.21, + "grad_norm": 0.5859375, + "learning_rate": 0.0004933295014775959, + "loss": 0.2353, + "step": 53470 + }, + { + "epoch": 2.22, + "grad_norm": 0.82421875, + "learning_rate": 0.0004933270127062694, + "loss": 0.1658, + "step": 53480 + }, + { + "epoch": 2.22, + "grad_norm": 0.6640625, + "learning_rate": 0.0004933245234770278, + "loss": 0.1933, + "step": 53490 + }, + { + "epoch": 2.22, + "grad_norm": 0.55078125, + "learning_rate": 0.0004933220337898759, + "loss": 0.2083, + "step": 53500 + }, + { + "epoch": 2.22, + "grad_norm": 2.5625, + "learning_rate": 0.0004933195436448181, + "loss": 0.2025, + "step": 53510 + }, + { + "epoch": 2.22, + "grad_norm": 0.81640625, + "learning_rate": 0.000493317053041859, + "loss": 0.1766, + "step": 53520 + }, + { + "epoch": 2.22, + "grad_norm": 0.56640625, + "learning_rate": 0.0004933145619810036, + "loss": 0.2283, + "step": 53530 + }, + { + "epoch": 2.22, + "grad_norm": 0.734375, + "learning_rate": 0.0004933120704622566, + "loss": 0.1926, + "step": 53540 + }, + { + "epoch": 2.22, + "grad_norm": 0.44140625, + "learning_rate": 0.0004933095784856224, + "loss": 0.2385, + "step": 53550 + }, + { + "epoch": 2.22, + "grad_norm": 1.078125, + "learning_rate": 0.0004933070860511058, + "loss": 0.1645, + "step": 53560 + }, + { + "epoch": 2.22, + "grad_norm": 1.125, + "learning_rate": 0.0004933045931587117, + "loss": 0.2347, + "step": 53570 + }, + { + "epoch": 2.22, + "grad_norm": 0.63671875, + "learning_rate": 0.0004933020998084445, + "loss": 0.1755, + "step": 53580 + }, + { + "epoch": 2.22, + "grad_norm": 1.625, + "learning_rate": 0.0004932996060003092, + "loss": 0.2081, + "step": 53590 + }, + { + "epoch": 2.22, + "grad_norm": 0.0830078125, + "learning_rate": 0.0004932971117343101, + "loss": 0.235, + "step": 53600 + }, + { + "epoch": 2.22, + "grad_norm": 0.6328125, + "learning_rate": 0.0004932946170104523, + "loss": 0.2522, + "step": 53610 + }, + { + "epoch": 2.22, + "grad_norm": 0.65625, + "learning_rate": 0.0004932921218287401, + "loss": 0.2138, + "step": 53620 + }, + { + "epoch": 2.22, + "grad_norm": 0.482421875, + "learning_rate": 0.0004932896261891786, + "loss": 0.202, + "step": 53630 + }, + { + "epoch": 2.22, + "grad_norm": 1.0234375, + "learning_rate": 0.0004932871300917722, + "loss": 0.1826, + "step": 53640 + }, + { + "epoch": 2.22, + "grad_norm": 0.8515625, + "learning_rate": 0.0004932846335365257, + "loss": 0.2381, + "step": 53650 + }, + { + "epoch": 2.22, + "grad_norm": 0.73828125, + "learning_rate": 0.0004932821365234439, + "loss": 0.2199, + "step": 53660 + }, + { + "epoch": 2.22, + "grad_norm": 1.328125, + "learning_rate": 0.0004932796390525313, + "loss": 0.2338, + "step": 53670 + }, + { + "epoch": 2.22, + "grad_norm": 0.51953125, + "learning_rate": 0.0004932771411237926, + "loss": 0.2634, + "step": 53680 + }, + { + "epoch": 2.22, + "grad_norm": 1.2578125, + "learning_rate": 0.0004932746427372327, + "loss": 0.1909, + "step": 53690 + }, + { + "epoch": 2.22, + "grad_norm": 0.63671875, + "learning_rate": 0.0004932721438928562, + "loss": 0.2314, + "step": 53700 + }, + { + "epoch": 2.22, + "grad_norm": 0.25, + "learning_rate": 0.0004932696445906678, + "loss": 0.262, + "step": 53710 + }, + { + "epoch": 2.23, + "grad_norm": 0.55078125, + "learning_rate": 0.0004932671448306721, + "loss": 0.1379, + "step": 53720 + }, + { + "epoch": 2.23, + "grad_norm": 0.77734375, + "learning_rate": 0.000493264644612874, + "loss": 0.2597, + "step": 53730 + }, + { + "epoch": 2.23, + "grad_norm": 0.92578125, + "learning_rate": 0.0004932621439372781, + "loss": 0.2273, + "step": 53740 + }, + { + "epoch": 2.23, + "grad_norm": 0.9453125, + "learning_rate": 0.000493259642803889, + "loss": 0.234, + "step": 53750 + }, + { + "epoch": 2.23, + "grad_norm": 1.1640625, + "learning_rate": 0.0004932571412127118, + "loss": 0.268, + "step": 53760 + }, + { + "epoch": 2.23, + "grad_norm": 2.84375, + "learning_rate": 0.0004932546391637506, + "loss": 0.1908, + "step": 53770 + }, + { + "epoch": 2.23, + "grad_norm": 0.486328125, + "learning_rate": 0.0004932521366570106, + "loss": 0.2034, + "step": 53780 + }, + { + "epoch": 2.23, + "grad_norm": 0.40625, + "learning_rate": 0.0004932496336924963, + "loss": 0.1829, + "step": 53790 + }, + { + "epoch": 2.23, + "grad_norm": 0.90234375, + "learning_rate": 0.0004932471302702125, + "loss": 0.2166, + "step": 53800 + }, + { + "epoch": 2.23, + "grad_norm": 0.8515625, + "learning_rate": 0.0004932446263901639, + "loss": 0.2562, + "step": 53810 + }, + { + "epoch": 2.23, + "grad_norm": 0.87890625, + "learning_rate": 0.0004932421220523552, + "loss": 0.2716, + "step": 53820 + }, + { + "epoch": 2.23, + "grad_norm": 0.69140625, + "learning_rate": 0.000493239617256791, + "loss": 0.2332, + "step": 53830 + }, + { + "epoch": 2.23, + "grad_norm": 1.0, + "learning_rate": 0.0004932371120034761, + "loss": 0.2755, + "step": 53840 + }, + { + "epoch": 2.23, + "grad_norm": 1.1015625, + "learning_rate": 0.0004932346062924152, + "loss": 0.1691, + "step": 53850 + }, + { + "epoch": 2.23, + "grad_norm": 0.326171875, + "learning_rate": 0.0004932321001236132, + "loss": 0.2145, + "step": 53860 + }, + { + "epoch": 2.23, + "grad_norm": 0.2353515625, + "learning_rate": 0.0004932295934970745, + "loss": 0.2513, + "step": 53870 + }, + { + "epoch": 2.23, + "grad_norm": 0.39453125, + "learning_rate": 0.000493227086412804, + "loss": 0.1747, + "step": 53880 + }, + { + "epoch": 2.23, + "grad_norm": 0.61328125, + "learning_rate": 0.0004932245788708065, + "loss": 0.2491, + "step": 53890 + }, + { + "epoch": 2.23, + "grad_norm": 0.98046875, + "learning_rate": 0.0004932220708710865, + "loss": 0.2067, + "step": 53900 + }, + { + "epoch": 2.23, + "grad_norm": 1.0625, + "learning_rate": 0.0004932195624136489, + "loss": 0.2129, + "step": 53910 + }, + { + "epoch": 2.23, + "grad_norm": 0.765625, + "learning_rate": 0.0004932170534984983, + "loss": 0.2198, + "step": 53920 + }, + { + "epoch": 2.23, + "grad_norm": 2.96875, + "learning_rate": 0.0004932145441256395, + "loss": 0.2326, + "step": 53930 + }, + { + "epoch": 2.23, + "grad_norm": 0.373046875, + "learning_rate": 0.0004932120342950771, + "loss": 0.221, + "step": 53940 + }, + { + "epoch": 2.23, + "grad_norm": 1.25, + "learning_rate": 0.000493209524006816, + "loss": 0.2129, + "step": 53950 + }, + { + "epoch": 2.24, + "grad_norm": 0.609375, + "learning_rate": 0.0004932070132608608, + "loss": 0.2464, + "step": 53960 + }, + { + "epoch": 2.24, + "grad_norm": 0.208984375, + "learning_rate": 0.0004932045020572163, + "loss": 0.2105, + "step": 53970 + }, + { + "epoch": 2.24, + "grad_norm": 0.7734375, + "learning_rate": 0.0004932019903958872, + "loss": 0.2813, + "step": 53980 + }, + { + "epoch": 2.24, + "grad_norm": 1.765625, + "learning_rate": 0.0004931994782768782, + "loss": 0.3199, + "step": 53990 + }, + { + "epoch": 2.24, + "grad_norm": 0.498046875, + "learning_rate": 0.000493196965700194, + "loss": 0.2121, + "step": 54000 + }, + { + "epoch": 2.24, + "grad_norm": 1.90625, + "learning_rate": 0.0004931944526658395, + "loss": 0.2079, + "step": 54010 + }, + { + "epoch": 2.24, + "grad_norm": 2.25, + "learning_rate": 0.0004931919391738191, + "loss": 0.193, + "step": 54020 + }, + { + "epoch": 2.24, + "grad_norm": 1.1640625, + "learning_rate": 0.0004931894252241379, + "loss": 0.2017, + "step": 54030 + }, + { + "epoch": 2.24, + "grad_norm": 0.5234375, + "learning_rate": 0.0004931869108168004, + "loss": 0.2038, + "step": 54040 + }, + { + "epoch": 2.24, + "grad_norm": 1.5078125, + "learning_rate": 0.0004931843959518114, + "loss": 0.2489, + "step": 54050 + }, + { + "epoch": 2.24, + "grad_norm": 0.76171875, + "learning_rate": 0.0004931818806291757, + "loss": 0.2454, + "step": 54060 + }, + { + "epoch": 2.24, + "grad_norm": 0.357421875, + "learning_rate": 0.0004931793648488979, + "loss": 0.214, + "step": 54070 + }, + { + "epoch": 2.24, + "grad_norm": 1.09375, + "learning_rate": 0.0004931768486109828, + "loss": 0.2385, + "step": 54080 + }, + { + "epoch": 2.24, + "grad_norm": 0.62109375, + "learning_rate": 0.0004931743319154351, + "loss": 0.2317, + "step": 54090 + }, + { + "epoch": 2.24, + "grad_norm": 0.796875, + "learning_rate": 0.0004931718147622595, + "loss": 0.2392, + "step": 54100 + }, + { + "epoch": 2.24, + "grad_norm": 0.7734375, + "learning_rate": 0.000493169297151461, + "loss": 0.2039, + "step": 54110 + }, + { + "epoch": 2.24, + "grad_norm": 0.51953125, + "learning_rate": 0.0004931667790830441, + "loss": 0.1881, + "step": 54120 + }, + { + "epoch": 2.24, + "grad_norm": 1.2578125, + "learning_rate": 0.0004931642605570135, + "loss": 0.1792, + "step": 54130 + }, + { + "epoch": 2.24, + "grad_norm": 0.44140625, + "learning_rate": 0.000493161741573374, + "loss": 0.2291, + "step": 54140 + }, + { + "epoch": 2.24, + "grad_norm": 0.75, + "learning_rate": 0.0004931592221321305, + "loss": 0.2433, + "step": 54150 + }, + { + "epoch": 2.24, + "grad_norm": 0.77734375, + "learning_rate": 0.0004931567022332875, + "loss": 0.2625, + "step": 54160 + }, + { + "epoch": 2.24, + "grad_norm": 1.0703125, + "learning_rate": 0.0004931541818768498, + "loss": 0.2197, + "step": 54170 + }, + { + "epoch": 2.24, + "grad_norm": 0.451171875, + "learning_rate": 0.0004931516610628223, + "loss": 0.1702, + "step": 54180 + }, + { + "epoch": 2.24, + "grad_norm": 0.412109375, + "learning_rate": 0.0004931491397912096, + "loss": 0.2098, + "step": 54190 + }, + { + "epoch": 2.24, + "grad_norm": 1.140625, + "learning_rate": 0.0004931466180620165, + "loss": 0.2339, + "step": 54200 + }, + { + "epoch": 2.25, + "grad_norm": 1.15625, + "learning_rate": 0.0004931440958752476, + "loss": 0.229, + "step": 54210 + }, + { + "epoch": 2.25, + "grad_norm": 0.5390625, + "learning_rate": 0.0004931415732309078, + "loss": 0.2393, + "step": 54220 + }, + { + "epoch": 2.25, + "grad_norm": 0.19140625, + "learning_rate": 0.0004931390501290019, + "loss": 0.2022, + "step": 54230 + }, + { + "epoch": 2.25, + "grad_norm": 0.79296875, + "learning_rate": 0.0004931365265695345, + "loss": 0.2098, + "step": 54240 + }, + { + "epoch": 2.25, + "grad_norm": 1.0546875, + "learning_rate": 0.0004931340025525105, + "loss": 0.2031, + "step": 54250 + }, + { + "epoch": 2.25, + "grad_norm": 0.99609375, + "learning_rate": 0.0004931314780779345, + "loss": 0.2017, + "step": 54260 + }, + { + "epoch": 2.25, + "grad_norm": 1.6640625, + "learning_rate": 0.0004931289531458113, + "loss": 0.2371, + "step": 54270 + }, + { + "epoch": 2.25, + "grad_norm": 1.515625, + "learning_rate": 0.0004931264277561457, + "loss": 0.2681, + "step": 54280 + }, + { + "epoch": 2.25, + "grad_norm": 1.28125, + "learning_rate": 0.0004931239019089424, + "loss": 0.232, + "step": 54290 + }, + { + "epoch": 2.25, + "grad_norm": 0.65234375, + "learning_rate": 0.0004931213756042062, + "loss": 0.2787, + "step": 54300 + }, + { + "epoch": 2.25, + "grad_norm": 0.53515625, + "learning_rate": 0.0004931188488419418, + "loss": 0.2134, + "step": 54310 + }, + { + "epoch": 2.25, + "grad_norm": 0.734375, + "learning_rate": 0.000493116321622154, + "loss": 0.2042, + "step": 54320 + }, + { + "epoch": 2.25, + "grad_norm": 0.53515625, + "learning_rate": 0.0004931137939448475, + "loss": 0.2353, + "step": 54330 + }, + { + "epoch": 2.25, + "grad_norm": 0.97265625, + "learning_rate": 0.000493111265810027, + "loss": 0.2269, + "step": 54340 + }, + { + "epoch": 2.25, + "grad_norm": 0.2734375, + "learning_rate": 0.0004931087372176976, + "loss": 0.2674, + "step": 54350 + }, + { + "epoch": 2.25, + "grad_norm": 0.5625, + "learning_rate": 0.0004931062081678636, + "loss": 0.2363, + "step": 54360 + }, + { + "epoch": 2.25, + "grad_norm": 0.30078125, + "learning_rate": 0.0004931036786605301, + "loss": 0.2364, + "step": 54370 + }, + { + "epoch": 2.25, + "grad_norm": 0.85546875, + "learning_rate": 0.0004931011486957016, + "loss": 0.2082, + "step": 54380 + }, + { + "epoch": 2.25, + "grad_norm": 0.671875, + "learning_rate": 0.0004930986182733831, + "loss": 0.2547, + "step": 54390 + }, + { + "epoch": 2.25, + "grad_norm": 0.70703125, + "learning_rate": 0.0004930960873935792, + "loss": 0.2168, + "step": 54400 + }, + { + "epoch": 2.25, + "grad_norm": 0.7734375, + "learning_rate": 0.0004930935560562948, + "loss": 0.249, + "step": 54410 + }, + { + "epoch": 2.25, + "grad_norm": 0.56640625, + "learning_rate": 0.0004930910242615344, + "loss": 0.2271, + "step": 54420 + }, + { + "epoch": 2.25, + "grad_norm": 0.76953125, + "learning_rate": 0.0004930884920093031, + "loss": 0.231, + "step": 54430 + }, + { + "epoch": 2.25, + "grad_norm": 0.8359375, + "learning_rate": 0.0004930859592996054, + "loss": 0.2003, + "step": 54440 + }, + { + "epoch": 2.26, + "grad_norm": 0.59765625, + "learning_rate": 0.0004930834261324462, + "loss": 0.2532, + "step": 54450 + }, + { + "epoch": 2.26, + "grad_norm": 0.287109375, + "learning_rate": 0.0004930808925078304, + "loss": 0.2248, + "step": 54460 + }, + { + "epoch": 2.26, + "grad_norm": 0.26171875, + "learning_rate": 0.0004930783584257625, + "loss": 0.2119, + "step": 54470 + }, + { + "epoch": 2.26, + "grad_norm": 0.6171875, + "learning_rate": 0.0004930758238862474, + "loss": 0.2623, + "step": 54480 + }, + { + "epoch": 2.26, + "grad_norm": 0.68359375, + "learning_rate": 0.0004930732888892898, + "loss": 0.1977, + "step": 54490 + }, + { + "epoch": 2.26, + "grad_norm": 0.66796875, + "learning_rate": 0.0004930707534348945, + "loss": 0.1727, + "step": 54500 + }, + { + "epoch": 2.26, + "grad_norm": 1.5234375, + "learning_rate": 0.0004930682175230663, + "loss": 0.2869, + "step": 54510 + }, + { + "epoch": 2.26, + "grad_norm": 0.66796875, + "learning_rate": 0.0004930656811538101, + "loss": 0.2388, + "step": 54520 + }, + { + "epoch": 2.26, + "grad_norm": 0.494140625, + "learning_rate": 0.0004930631443271304, + "loss": 0.1979, + "step": 54530 + }, + { + "epoch": 2.26, + "grad_norm": 0.734375, + "learning_rate": 0.0004930606070430321, + "loss": 0.234, + "step": 54540 + }, + { + "epoch": 2.26, + "grad_norm": 1.2890625, + "learning_rate": 0.0004930580693015201, + "loss": 0.2326, + "step": 54550 + }, + { + "epoch": 2.26, + "grad_norm": 0.984375, + "learning_rate": 0.0004930555311025989, + "loss": 0.1675, + "step": 54560 + }, + { + "epoch": 2.26, + "grad_norm": 1.390625, + "learning_rate": 0.0004930529924462735, + "loss": 0.2633, + "step": 54570 + }, + { + "epoch": 2.26, + "grad_norm": 0.78515625, + "learning_rate": 0.0004930504533325487, + "loss": 0.221, + "step": 54580 + }, + { + "epoch": 2.26, + "grad_norm": 0.55859375, + "learning_rate": 0.0004930479137614292, + "loss": 0.203, + "step": 54590 + }, + { + "epoch": 2.26, + "grad_norm": 0.58203125, + "learning_rate": 0.0004930453737329197, + "loss": 0.2567, + "step": 54600 + }, + { + "epoch": 2.26, + "grad_norm": 2.140625, + "learning_rate": 0.000493042833247025, + "loss": 0.217, + "step": 54610 + }, + { + "epoch": 2.26, + "grad_norm": 1.3671875, + "learning_rate": 0.00049304029230375, + "loss": 0.2078, + "step": 54620 + }, + { + "epoch": 2.26, + "grad_norm": 0.59765625, + "learning_rate": 0.0004930377509030995, + "loss": 0.2416, + "step": 54630 + }, + { + "epoch": 2.26, + "grad_norm": 1.1484375, + "learning_rate": 0.0004930352090450781, + "loss": 0.2078, + "step": 54640 + }, + { + "epoch": 2.26, + "grad_norm": 1.609375, + "learning_rate": 0.0004930326667296907, + "loss": 0.2247, + "step": 54650 + }, + { + "epoch": 2.26, + "grad_norm": 0.55078125, + "learning_rate": 0.000493030123956942, + "loss": 0.2255, + "step": 54660 + }, + { + "epoch": 2.26, + "grad_norm": 1.015625, + "learning_rate": 0.000493027580726837, + "loss": 0.2612, + "step": 54670 + }, + { + "epoch": 2.26, + "grad_norm": 1.3125, + "learning_rate": 0.0004930250370393803, + "loss": 0.2119, + "step": 54680 + }, + { + "epoch": 2.27, + "grad_norm": 0.51953125, + "learning_rate": 0.0004930224928945766, + "loss": 0.2113, + "step": 54690 + }, + { + "epoch": 2.27, + "grad_norm": 0.57421875, + "learning_rate": 0.0004930199482924309, + "loss": 0.277, + "step": 54700 + }, + { + "epoch": 2.27, + "grad_norm": 0.61328125, + "learning_rate": 0.0004930174032329479, + "loss": 0.2284, + "step": 54710 + }, + { + "epoch": 2.27, + "grad_norm": 0.34765625, + "learning_rate": 0.0004930148577161324, + "loss": 0.2524, + "step": 54720 + }, + { + "epoch": 2.27, + "grad_norm": 0.259765625, + "learning_rate": 0.0004930123117419891, + "loss": 0.2368, + "step": 54730 + }, + { + "epoch": 2.27, + "grad_norm": 0.67578125, + "learning_rate": 0.0004930097653105229, + "loss": 0.2679, + "step": 54740 + }, + { + "epoch": 2.27, + "grad_norm": 0.36328125, + "learning_rate": 0.0004930072184217387, + "loss": 0.2453, + "step": 54750 + }, + { + "epoch": 2.27, + "grad_norm": 0.0, + "learning_rate": 0.000493004671075641, + "loss": 0.1958, + "step": 54760 + }, + { + "epoch": 2.27, + "grad_norm": 0.45703125, + "learning_rate": 0.0004930021232722348, + "loss": 0.1862, + "step": 54770 + }, + { + "epoch": 2.27, + "grad_norm": 0.94921875, + "learning_rate": 0.0004929995750115249, + "loss": 0.2225, + "step": 54780 + }, + { + "epoch": 2.27, + "grad_norm": 0.3671875, + "learning_rate": 0.000492997026293516, + "loss": 0.2374, + "step": 54790 + }, + { + "epoch": 2.27, + "grad_norm": 0.66015625, + "learning_rate": 0.000492994477118213, + "loss": 0.1905, + "step": 54800 + }, + { + "epoch": 2.27, + "grad_norm": 0.421875, + "learning_rate": 0.0004929919274856204, + "loss": 0.2087, + "step": 54810 + }, + { + "epoch": 2.27, + "grad_norm": 0.51953125, + "learning_rate": 0.0004929893773957436, + "loss": 0.2121, + "step": 54820 + }, + { + "epoch": 2.27, + "grad_norm": 0.4921875, + "learning_rate": 0.0004929868268485867, + "loss": 0.2644, + "step": 54830 + }, + { + "epoch": 2.27, + "grad_norm": 1.21875, + "learning_rate": 0.000492984275844155, + "loss": 0.2388, + "step": 54840 + }, + { + "epoch": 2.27, + "grad_norm": 0.427734375, + "learning_rate": 0.0004929817243824532, + "loss": 0.2933, + "step": 54850 + }, + { + "epoch": 2.27, + "grad_norm": 0.76953125, + "learning_rate": 0.0004929791724634858, + "loss": 0.1678, + "step": 54860 + }, + { + "epoch": 2.27, + "grad_norm": 0.58203125, + "learning_rate": 0.000492976620087258, + "loss": 0.2067, + "step": 54870 + }, + { + "epoch": 2.27, + "grad_norm": 0.88671875, + "learning_rate": 0.0004929740672537745, + "loss": 0.2411, + "step": 54880 + }, + { + "epoch": 2.27, + "grad_norm": 0.78515625, + "learning_rate": 0.0004929715139630398, + "loss": 0.2204, + "step": 54890 + }, + { + "epoch": 2.27, + "grad_norm": 1.1171875, + "learning_rate": 0.0004929689602150593, + "loss": 0.2736, + "step": 54900 + }, + { + "epoch": 2.27, + "grad_norm": 0.5234375, + "learning_rate": 0.0004929664060098371, + "loss": 0.2387, + "step": 54910 + }, + { + "epoch": 2.27, + "grad_norm": 0.6171875, + "learning_rate": 0.0004929638513473786, + "loss": 0.213, + "step": 54920 + }, + { + "epoch": 2.28, + "grad_norm": 0.56640625, + "learning_rate": 0.0004929612962276883, + "loss": 0.2042, + "step": 54930 + }, + { + "epoch": 2.28, + "grad_norm": 0.99609375, + "learning_rate": 0.000492958740650771, + "loss": 0.232, + "step": 54940 + }, + { + "epoch": 2.28, + "grad_norm": 0.2353515625, + "learning_rate": 0.0004929561846166317, + "loss": 0.2396, + "step": 54950 + }, + { + "epoch": 2.28, + "grad_norm": 0.2041015625, + "learning_rate": 0.0004929536281252751, + "loss": 0.1805, + "step": 54960 + }, + { + "epoch": 2.28, + "grad_norm": 0.53125, + "learning_rate": 0.0004929510711767058, + "loss": 0.2413, + "step": 54970 + }, + { + "epoch": 2.28, + "grad_norm": 1.296875, + "learning_rate": 0.0004929485137709291, + "loss": 0.2233, + "step": 54980 + }, + { + "epoch": 2.28, + "grad_norm": 1.65625, + "learning_rate": 0.0004929459559079494, + "loss": 0.2202, + "step": 54990 + }, + { + "epoch": 2.28, + "grad_norm": 0.41015625, + "learning_rate": 0.0004929433975877718, + "loss": 0.2244, + "step": 55000 + }, + { + "epoch": 2.28, + "grad_norm": 0.5078125, + "learning_rate": 0.0004929408388104008, + "loss": 0.2361, + "step": 55010 + }, + { + "epoch": 2.28, + "grad_norm": 0.60546875, + "learning_rate": 0.0004929382795758414, + "loss": 0.2793, + "step": 55020 + }, + { + "epoch": 2.28, + "grad_norm": 0.8046875, + "learning_rate": 0.0004929357198840983, + "loss": 0.251, + "step": 55030 + }, + { + "epoch": 2.28, + "grad_norm": 0.25390625, + "learning_rate": 0.0004929331597351765, + "loss": 0.2333, + "step": 55040 + }, + { + "epoch": 2.28, + "grad_norm": 0.1962890625, + "learning_rate": 0.0004929305991290808, + "loss": 0.19, + "step": 55050 + }, + { + "epoch": 2.28, + "grad_norm": 0.462890625, + "learning_rate": 0.0004929280380658159, + "loss": 0.2485, + "step": 55060 + }, + { + "epoch": 2.28, + "grad_norm": 0.265625, + "learning_rate": 0.0004929254765453867, + "loss": 0.2082, + "step": 55070 + }, + { + "epoch": 2.28, + "grad_norm": 0.74609375, + "learning_rate": 0.0004929229145677978, + "loss": 0.248, + "step": 55080 + }, + { + "epoch": 2.28, + "grad_norm": 0.98828125, + "learning_rate": 0.0004929203521330544, + "loss": 0.1998, + "step": 55090 + }, + { + "epoch": 2.28, + "grad_norm": 0.828125, + "learning_rate": 0.0004929177892411611, + "loss": 0.2298, + "step": 55100 + }, + { + "epoch": 2.28, + "grad_norm": 1.375, + "learning_rate": 0.0004929152258921226, + "loss": 0.2165, + "step": 55110 + }, + { + "epoch": 2.28, + "grad_norm": 0.62890625, + "learning_rate": 0.0004929126620859441, + "loss": 0.2506, + "step": 55120 + }, + { + "epoch": 2.28, + "grad_norm": 0.74609375, + "learning_rate": 0.00049291009782263, + "loss": 0.27, + "step": 55130 + }, + { + "epoch": 2.28, + "grad_norm": 0.80859375, + "learning_rate": 0.0004929075331021856, + "loss": 0.1857, + "step": 55140 + }, + { + "epoch": 2.28, + "grad_norm": 4.5625, + "learning_rate": 0.0004929049679246152, + "loss": 0.1807, + "step": 55150 + }, + { + "epoch": 2.28, + "grad_norm": 1.609375, + "learning_rate": 0.000492902402289924, + "loss": 0.2213, + "step": 55160 + }, + { + "epoch": 2.29, + "grad_norm": 0.30859375, + "learning_rate": 0.0004928998361981166, + "loss": 0.2305, + "step": 55170 + }, + { + "epoch": 2.29, + "grad_norm": 0.466796875, + "learning_rate": 0.0004928972696491981, + "loss": 0.2455, + "step": 55180 + }, + { + "epoch": 2.29, + "grad_norm": 0.98828125, + "learning_rate": 0.000492894702643173, + "loss": 0.2317, + "step": 55190 + }, + { + "epoch": 2.29, + "grad_norm": 0.640625, + "learning_rate": 0.0004928921351800464, + "loss": 0.2592, + "step": 55200 + }, + { + "epoch": 2.29, + "grad_norm": 0.60546875, + "learning_rate": 0.000492889567259823, + "loss": 0.2299, + "step": 55210 + }, + { + "epoch": 2.29, + "grad_norm": 0.578125, + "learning_rate": 0.0004928869988825077, + "loss": 0.2445, + "step": 55220 + }, + { + "epoch": 2.29, + "grad_norm": 0.55859375, + "learning_rate": 0.0004928844300481054, + "loss": 0.2026, + "step": 55230 + }, + { + "epoch": 2.29, + "grad_norm": 0.85546875, + "learning_rate": 0.0004928818607566206, + "loss": 0.2392, + "step": 55240 + }, + { + "epoch": 2.29, + "grad_norm": 0.71484375, + "learning_rate": 0.0004928792910080585, + "loss": 0.2327, + "step": 55250 + }, + { + "epoch": 2.29, + "grad_norm": 0.5703125, + "learning_rate": 0.0004928767208024239, + "loss": 0.2287, + "step": 55260 + }, + { + "epoch": 2.29, + "grad_norm": 0.859375, + "learning_rate": 0.0004928741501397213, + "loss": 0.2414, + "step": 55270 + }, + { + "epoch": 2.29, + "grad_norm": 0.1962890625, + "learning_rate": 0.000492871579019956, + "loss": 0.1902, + "step": 55280 + }, + { + "epoch": 2.29, + "grad_norm": 0.7578125, + "learning_rate": 0.0004928690074431324, + "loss": 0.2089, + "step": 55290 + }, + { + "epoch": 2.29, + "grad_norm": 0.8203125, + "learning_rate": 0.0004928664354092557, + "loss": 0.1656, + "step": 55300 + }, + { + "epoch": 2.29, + "grad_norm": 0.73046875, + "learning_rate": 0.0004928638629183306, + "loss": 0.1711, + "step": 55310 + }, + { + "epoch": 2.29, + "grad_norm": 0.53515625, + "learning_rate": 0.000492861289970362, + "loss": 0.2582, + "step": 55320 + }, + { + "epoch": 2.29, + "grad_norm": 0.734375, + "learning_rate": 0.0004928587165653546, + "loss": 0.2612, + "step": 55330 + }, + { + "epoch": 2.29, + "grad_norm": 0.38671875, + "learning_rate": 0.0004928561427033132, + "loss": 0.2466, + "step": 55340 + }, + { + "epoch": 2.29, + "grad_norm": 0.98828125, + "learning_rate": 0.0004928535683842429, + "loss": 0.2304, + "step": 55350 + }, + { + "epoch": 2.29, + "grad_norm": 0.7890625, + "learning_rate": 0.0004928509936081483, + "loss": 0.2165, + "step": 55360 + }, + { + "epoch": 2.29, + "grad_norm": 0.6875, + "learning_rate": 0.0004928484183750344, + "loss": 0.2144, + "step": 55370 + }, + { + "epoch": 2.29, + "grad_norm": 1.203125, + "learning_rate": 0.0004928458426849061, + "loss": 0.2206, + "step": 55380 + }, + { + "epoch": 2.29, + "grad_norm": 0.734375, + "learning_rate": 0.0004928432665377682, + "loss": 0.2405, + "step": 55390 + }, + { + "epoch": 2.29, + "grad_norm": 0.81640625, + "learning_rate": 0.0004928406899336253, + "loss": 0.2157, + "step": 55400 + }, + { + "epoch": 2.3, + "grad_norm": 0.5390625, + "learning_rate": 0.0004928381128724826, + "loss": 0.2293, + "step": 55410 + }, + { + "epoch": 2.3, + "grad_norm": 1.46875, + "learning_rate": 0.0004928355353543447, + "loss": 0.2214, + "step": 55420 + }, + { + "epoch": 2.3, + "grad_norm": 0.890625, + "learning_rate": 0.0004928329573792165, + "loss": 0.2477, + "step": 55430 + }, + { + "epoch": 2.3, + "grad_norm": 0.25, + "learning_rate": 0.0004928303789471031, + "loss": 0.1631, + "step": 55440 + }, + { + "epoch": 2.3, + "grad_norm": 0.828125, + "learning_rate": 0.0004928278000580089, + "loss": 0.2524, + "step": 55450 + }, + { + "epoch": 2.3, + "grad_norm": 0.50390625, + "learning_rate": 0.0004928252207119392, + "loss": 0.2401, + "step": 55460 + }, + { + "epoch": 2.3, + "grad_norm": 0.71484375, + "learning_rate": 0.0004928226409088985, + "loss": 0.2053, + "step": 55470 + }, + { + "epoch": 2.3, + "grad_norm": 0.578125, + "learning_rate": 0.000492820060648892, + "loss": 0.2044, + "step": 55480 + }, + { + "epoch": 2.3, + "grad_norm": 0.59765625, + "learning_rate": 0.0004928174799319242, + "loss": 0.2432, + "step": 55490 + }, + { + "epoch": 2.3, + "grad_norm": 2.140625, + "learning_rate": 0.0004928148987580003, + "loss": 0.2456, + "step": 55500 + }, + { + "epoch": 2.3, + "grad_norm": 1.3125, + "learning_rate": 0.0004928123171271248, + "loss": 0.2343, + "step": 55510 + }, + { + "epoch": 2.3, + "grad_norm": 0.859375, + "learning_rate": 0.0004928097350393028, + "loss": 0.2001, + "step": 55520 + }, + { + "epoch": 2.3, + "grad_norm": 0.318359375, + "learning_rate": 0.0004928071524945391, + "loss": 0.2073, + "step": 55530 + }, + { + "epoch": 2.3, + "grad_norm": 0.470703125, + "learning_rate": 0.0004928045694928386, + "loss": 0.2104, + "step": 55540 + }, + { + "epoch": 2.3, + "grad_norm": 0.7265625, + "learning_rate": 0.0004928019860342061, + "loss": 0.1984, + "step": 55550 + }, + { + "epoch": 2.3, + "grad_norm": 0.6796875, + "learning_rate": 0.0004927994021186465, + "loss": 0.2232, + "step": 55560 + }, + { + "epoch": 2.3, + "grad_norm": 0.74609375, + "learning_rate": 0.0004927968177461646, + "loss": 0.2109, + "step": 55570 + }, + { + "epoch": 2.3, + "grad_norm": 1.4609375, + "learning_rate": 0.0004927942329167653, + "loss": 0.2462, + "step": 55580 + }, + { + "epoch": 2.3, + "grad_norm": 1.546875, + "learning_rate": 0.0004927916476304535, + "loss": 0.2604, + "step": 55590 + }, + { + "epoch": 2.3, + "grad_norm": 0.8828125, + "learning_rate": 0.000492789061887234, + "loss": 0.2692, + "step": 55600 + }, + { + "epoch": 2.3, + "grad_norm": 0.859375, + "learning_rate": 0.0004927864756871118, + "loss": 0.2319, + "step": 55610 + }, + { + "epoch": 2.3, + "grad_norm": 0.77734375, + "learning_rate": 0.0004927838890300916, + "loss": 0.2301, + "step": 55620 + }, + { + "epoch": 2.3, + "grad_norm": 1.6953125, + "learning_rate": 0.0004927813019161783, + "loss": 0.2275, + "step": 55630 + }, + { + "epoch": 2.3, + "grad_norm": 0.53125, + "learning_rate": 0.0004927787143453769, + "loss": 0.2501, + "step": 55640 + }, + { + "epoch": 2.31, + "grad_norm": 0.49609375, + "learning_rate": 0.0004927761263176922, + "loss": 0.2311, + "step": 55650 + }, + { + "epoch": 2.31, + "grad_norm": 1.1484375, + "learning_rate": 0.0004927735378331289, + "loss": 0.2014, + "step": 55660 + }, + { + "epoch": 2.31, + "grad_norm": 0.4453125, + "learning_rate": 0.0004927709488916921, + "loss": 0.2275, + "step": 55670 + }, + { + "epoch": 2.31, + "grad_norm": 0.1630859375, + "learning_rate": 0.0004927683594933866, + "loss": 0.2497, + "step": 55680 + }, + { + "epoch": 2.31, + "grad_norm": 1.03125, + "learning_rate": 0.0004927657696382173, + "loss": 0.2279, + "step": 55690 + }, + { + "epoch": 2.31, + "grad_norm": 1.3984375, + "learning_rate": 0.0004927631793261891, + "loss": 0.1955, + "step": 55700 + }, + { + "epoch": 2.31, + "grad_norm": 0.44140625, + "learning_rate": 0.0004927605885573067, + "loss": 0.2027, + "step": 55710 + }, + { + "epoch": 2.31, + "grad_norm": 0.3828125, + "learning_rate": 0.0004927579973315751, + "loss": 0.166, + "step": 55720 + }, + { + "epoch": 2.31, + "grad_norm": 0.5546875, + "learning_rate": 0.0004927554056489991, + "loss": 0.2388, + "step": 55730 + }, + { + "epoch": 2.31, + "grad_norm": 0.69140625, + "learning_rate": 0.0004927528135095838, + "loss": 0.2308, + "step": 55740 + }, + { + "epoch": 2.31, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004927502209133338, + "loss": 0.1735, + "step": 55750 + }, + { + "epoch": 2.31, + "grad_norm": 0.5234375, + "learning_rate": 0.0004927476278602541, + "loss": 0.2004, + "step": 55760 + }, + { + "epoch": 2.31, + "grad_norm": 0.5859375, + "learning_rate": 0.0004927450343503495, + "loss": 0.2219, + "step": 55770 + }, + { + "epoch": 2.31, + "grad_norm": 0.99609375, + "learning_rate": 0.0004927424403836252, + "loss": 0.2008, + "step": 55780 + }, + { + "epoch": 2.31, + "grad_norm": 0.76171875, + "learning_rate": 0.0004927398459600857, + "loss": 0.2717, + "step": 55790 + }, + { + "epoch": 2.31, + "grad_norm": 0.58203125, + "learning_rate": 0.000492737251079736, + "loss": 0.2122, + "step": 55800 + }, + { + "epoch": 2.31, + "grad_norm": 0.314453125, + "learning_rate": 0.000492734655742581, + "loss": 0.1942, + "step": 55810 + }, + { + "epoch": 2.31, + "grad_norm": 0.55078125, + "learning_rate": 0.0004927320599486257, + "loss": 0.2117, + "step": 55820 + }, + { + "epoch": 2.31, + "grad_norm": 0.447265625, + "learning_rate": 0.0004927294636978747, + "loss": 0.2366, + "step": 55830 + }, + { + "epoch": 2.31, + "grad_norm": 0.6171875, + "learning_rate": 0.0004927268669903332, + "loss": 0.2095, + "step": 55840 + }, + { + "epoch": 2.31, + "grad_norm": 0.412109375, + "learning_rate": 0.0004927242698260059, + "loss": 0.2432, + "step": 55850 + }, + { + "epoch": 2.31, + "grad_norm": 0.8671875, + "learning_rate": 0.0004927216722048979, + "loss": 0.2035, + "step": 55860 + }, + { + "epoch": 2.31, + "grad_norm": 1.0625, + "learning_rate": 0.0004927190741270136, + "loss": 0.213, + "step": 55870 + }, + { + "epoch": 2.31, + "grad_norm": 0.51953125, + "learning_rate": 0.0004927164755923585, + "loss": 0.2177, + "step": 55880 + }, + { + "epoch": 2.31, + "grad_norm": 1.09375, + "learning_rate": 0.0004927138766009371, + "loss": 0.2338, + "step": 55890 + }, + { + "epoch": 2.32, + "grad_norm": 0.69921875, + "learning_rate": 0.0004927112771527544, + "loss": 0.2044, + "step": 55900 + }, + { + "epoch": 2.32, + "grad_norm": 0.5859375, + "learning_rate": 0.0004927086772478152, + "loss": 0.2619, + "step": 55910 + }, + { + "epoch": 2.32, + "grad_norm": 0.49609375, + "learning_rate": 0.0004927060768861246, + "loss": 0.2274, + "step": 55920 + }, + { + "epoch": 2.32, + "grad_norm": 0.73046875, + "learning_rate": 0.0004927034760676873, + "loss": 0.2174, + "step": 55930 + }, + { + "epoch": 2.32, + "grad_norm": 0.73046875, + "learning_rate": 0.0004927008747925084, + "loss": 0.2163, + "step": 55940 + }, + { + "epoch": 2.32, + "grad_norm": 0.65234375, + "learning_rate": 0.0004926982730605924, + "loss": 0.1996, + "step": 55950 + }, + { + "epoch": 2.32, + "grad_norm": 1.1171875, + "learning_rate": 0.0004926956708719447, + "loss": 0.1784, + "step": 55960 + }, + { + "epoch": 2.32, + "grad_norm": 0.875, + "learning_rate": 0.0004926930682265699, + "loss": 0.2203, + "step": 55970 + }, + { + "epoch": 2.32, + "grad_norm": 0.5, + "learning_rate": 0.000492690465124473, + "loss": 0.2098, + "step": 55980 + }, + { + "epoch": 2.32, + "grad_norm": 0.396484375, + "learning_rate": 0.0004926878615656587, + "loss": 0.2008, + "step": 55990 + }, + { + "epoch": 2.32, + "grad_norm": 1.1328125, + "learning_rate": 0.0004926852575501321, + "loss": 0.2202, + "step": 56000 + }, + { + "epoch": 2.32, + "grad_norm": 0.79296875, + "learning_rate": 0.0004926826530778981, + "loss": 0.2119, + "step": 56010 + }, + { + "epoch": 2.32, + "grad_norm": 0.58203125, + "learning_rate": 0.0004926800481489616, + "loss": 0.2144, + "step": 56020 + }, + { + "epoch": 2.32, + "grad_norm": 0.94140625, + "learning_rate": 0.0004926774427633274, + "loss": 0.2, + "step": 56030 + }, + { + "epoch": 2.32, + "grad_norm": 0.68359375, + "learning_rate": 0.0004926748369210004, + "loss": 0.2134, + "step": 56040 + }, + { + "epoch": 2.32, + "grad_norm": 0.3046875, + "learning_rate": 0.0004926722306219856, + "loss": 0.2569, + "step": 56050 + }, + { + "epoch": 2.32, + "grad_norm": 0.80859375, + "learning_rate": 0.0004926696238662879, + "loss": 0.2215, + "step": 56060 + }, + { + "epoch": 2.32, + "grad_norm": 1.109375, + "learning_rate": 0.0004926670166539122, + "loss": 0.2515, + "step": 56070 + }, + { + "epoch": 2.32, + "grad_norm": 0.578125, + "learning_rate": 0.0004926644089848633, + "loss": 0.1996, + "step": 56080 + }, + { + "epoch": 2.32, + "grad_norm": 1.0703125, + "learning_rate": 0.0004926618008591463, + "loss": 0.2551, + "step": 56090 + }, + { + "epoch": 2.32, + "grad_norm": 0.7265625, + "learning_rate": 0.0004926591922767659, + "loss": 0.2461, + "step": 56100 + }, + { + "epoch": 2.32, + "grad_norm": 1.09375, + "learning_rate": 0.0004926565832377271, + "loss": 0.2267, + "step": 56110 + }, + { + "epoch": 2.32, + "grad_norm": 1.46875, + "learning_rate": 0.0004926539737420349, + "loss": 0.2067, + "step": 56120 + }, + { + "epoch": 2.32, + "grad_norm": 0.90234375, + "learning_rate": 0.0004926513637896941, + "loss": 0.1981, + "step": 56130 + }, + { + "epoch": 2.33, + "grad_norm": 0.59765625, + "learning_rate": 0.0004926487533807095, + "loss": 0.2522, + "step": 56140 + }, + { + "epoch": 2.33, + "grad_norm": 0.5390625, + "learning_rate": 0.0004926461425150863, + "loss": 0.2361, + "step": 56150 + }, + { + "epoch": 2.33, + "grad_norm": 0.65234375, + "learning_rate": 0.0004926435311928293, + "loss": 0.2013, + "step": 56160 + }, + { + "epoch": 2.33, + "grad_norm": 0.83203125, + "learning_rate": 0.0004926409194139433, + "loss": 0.2194, + "step": 56170 + }, + { + "epoch": 2.33, + "grad_norm": 0.45703125, + "learning_rate": 0.0004926383071784333, + "loss": 0.2262, + "step": 56180 + }, + { + "epoch": 2.33, + "grad_norm": 1.078125, + "learning_rate": 0.0004926356944863041, + "loss": 0.214, + "step": 56190 + }, + { + "epoch": 2.33, + "grad_norm": 0.62109375, + "learning_rate": 0.000492633081337561, + "loss": 0.2643, + "step": 56200 + }, + { + "epoch": 2.33, + "grad_norm": 0.31640625, + "learning_rate": 0.0004926304677322084, + "loss": 0.221, + "step": 56210 + }, + { + "epoch": 2.33, + "grad_norm": 1.875, + "learning_rate": 0.0004926278536702515, + "loss": 0.2245, + "step": 56220 + }, + { + "epoch": 2.33, + "grad_norm": 0.423828125, + "learning_rate": 0.0004926252391516952, + "loss": 0.1911, + "step": 56230 + }, + { + "epoch": 2.33, + "grad_norm": 0.1728515625, + "learning_rate": 0.0004926226241765445, + "loss": 0.1497, + "step": 56240 + }, + { + "epoch": 2.33, + "grad_norm": 0.29296875, + "learning_rate": 0.0004926200087448041, + "loss": 0.2163, + "step": 56250 + }, + { + "epoch": 2.33, + "grad_norm": 0.640625, + "learning_rate": 0.000492617392856479, + "loss": 0.1969, + "step": 56260 + }, + { + "epoch": 2.33, + "grad_norm": 0.388671875, + "learning_rate": 0.0004926147765115743, + "loss": 0.1931, + "step": 56270 + }, + { + "epoch": 2.33, + "grad_norm": 0.49609375, + "learning_rate": 0.0004926121597100948, + "loss": 0.2206, + "step": 56280 + }, + { + "epoch": 2.33, + "grad_norm": 0.53125, + "learning_rate": 0.0004926095424520453, + "loss": 0.2305, + "step": 56290 + }, + { + "epoch": 2.33, + "grad_norm": 0.29296875, + "learning_rate": 0.0004926069247374309, + "loss": 0.2401, + "step": 56300 + }, + { + "epoch": 2.33, + "grad_norm": 0.8671875, + "learning_rate": 0.0004926043065662564, + "loss": 0.2033, + "step": 56310 + }, + { + "epoch": 2.33, + "grad_norm": 0.58203125, + "learning_rate": 0.0004926016879385268, + "loss": 0.2192, + "step": 56320 + }, + { + "epoch": 2.33, + "grad_norm": 0.2421875, + "learning_rate": 0.0004925990688542472, + "loss": 0.1932, + "step": 56330 + }, + { + "epoch": 2.33, + "grad_norm": 0.765625, + "learning_rate": 0.0004925964493134223, + "loss": 0.2094, + "step": 56340 + }, + { + "epoch": 2.33, + "grad_norm": 0.30859375, + "learning_rate": 0.0004925938293160568, + "loss": 0.2229, + "step": 56350 + }, + { + "epoch": 2.33, + "grad_norm": 0.60546875, + "learning_rate": 0.0004925912088621562, + "loss": 0.2043, + "step": 56360 + }, + { + "epoch": 2.33, + "grad_norm": 0.36328125, + "learning_rate": 0.0004925885879517251, + "loss": 0.2087, + "step": 56370 + }, + { + "epoch": 2.34, + "grad_norm": 0.64453125, + "learning_rate": 0.0004925859665847684, + "loss": 0.2155, + "step": 56380 + }, + { + "epoch": 2.34, + "grad_norm": 0.76953125, + "learning_rate": 0.0004925833447612912, + "loss": 0.1741, + "step": 56390 + }, + { + "epoch": 2.34, + "grad_norm": 0.9609375, + "learning_rate": 0.0004925807224812983, + "loss": 0.256, + "step": 56400 + }, + { + "epoch": 2.34, + "grad_norm": 0.6953125, + "learning_rate": 0.0004925780997447947, + "loss": 0.2398, + "step": 56410 + }, + { + "epoch": 2.34, + "grad_norm": 0.6015625, + "learning_rate": 0.0004925754765517852, + "loss": 0.2005, + "step": 56420 + }, + { + "epoch": 2.34, + "grad_norm": 0.8828125, + "learning_rate": 0.0004925728529022749, + "loss": 0.2288, + "step": 56430 + }, + { + "epoch": 2.34, + "grad_norm": 0.51953125, + "learning_rate": 0.0004925702287962688, + "loss": 0.2592, + "step": 56440 + }, + { + "epoch": 2.34, + "grad_norm": 0.56640625, + "learning_rate": 0.0004925676042337716, + "loss": 0.2262, + "step": 56450 + }, + { + "epoch": 2.34, + "grad_norm": 0.474609375, + "learning_rate": 0.0004925649792147885, + "loss": 0.1733, + "step": 56460 + }, + { + "epoch": 2.34, + "grad_norm": 2.21875, + "learning_rate": 0.0004925623537393242, + "loss": 0.2132, + "step": 56470 + }, + { + "epoch": 2.34, + "grad_norm": 0.84765625, + "learning_rate": 0.0004925597278073836, + "loss": 0.2414, + "step": 56480 + }, + { + "epoch": 2.34, + "grad_norm": 0.8359375, + "learning_rate": 0.000492557101418972, + "loss": 0.2931, + "step": 56490 + }, + { + "epoch": 2.34, + "grad_norm": 1.2265625, + "learning_rate": 0.0004925544745740941, + "loss": 0.2627, + "step": 56500 + }, + { + "epoch": 2.34, + "grad_norm": 0.515625, + "learning_rate": 0.0004925518472727548, + "loss": 0.2331, + "step": 56510 + }, + { + "epoch": 2.34, + "grad_norm": 0.50390625, + "learning_rate": 0.0004925492195149592, + "loss": 0.2318, + "step": 56520 + }, + { + "epoch": 2.34, + "grad_norm": 1.6875, + "learning_rate": 0.0004925465913007121, + "loss": 0.2513, + "step": 56530 + }, + { + "epoch": 2.34, + "grad_norm": 0.4453125, + "learning_rate": 0.0004925439626300186, + "loss": 0.2406, + "step": 56540 + }, + { + "epoch": 2.34, + "grad_norm": 0.279296875, + "learning_rate": 0.0004925413335028834, + "loss": 0.2171, + "step": 56550 + }, + { + "epoch": 2.34, + "grad_norm": 0.78515625, + "learning_rate": 0.0004925387039193117, + "loss": 0.211, + "step": 56560 + }, + { + "epoch": 2.34, + "grad_norm": 0.578125, + "learning_rate": 0.0004925360738793083, + "loss": 0.2327, + "step": 56570 + }, + { + "epoch": 2.34, + "grad_norm": 0.84375, + "learning_rate": 0.0004925334433828782, + "loss": 0.213, + "step": 56580 + }, + { + "epoch": 2.34, + "grad_norm": 0.365234375, + "learning_rate": 0.0004925308124300264, + "loss": 0.24, + "step": 56590 + }, + { + "epoch": 2.34, + "grad_norm": 0.28515625, + "learning_rate": 0.0004925281810207578, + "loss": 0.2025, + "step": 56600 + }, + { + "epoch": 2.34, + "grad_norm": 2.140625, + "learning_rate": 0.0004925255491550774, + "loss": 0.2282, + "step": 56610 + }, + { + "epoch": 2.35, + "grad_norm": 0.5078125, + "learning_rate": 0.00049252291683299, + "loss": 0.2137, + "step": 56620 + }, + { + "epoch": 2.35, + "grad_norm": 0.5, + "learning_rate": 0.0004925202840545007, + "loss": 0.1993, + "step": 56630 + }, + { + "epoch": 2.35, + "grad_norm": 0.53125, + "learning_rate": 0.0004925176508196144, + "loss": 0.2118, + "step": 56640 + }, + { + "epoch": 2.35, + "grad_norm": 0.6328125, + "learning_rate": 0.0004925150171283361, + "loss": 0.2146, + "step": 56650 + }, + { + "epoch": 2.35, + "grad_norm": 1.1796875, + "learning_rate": 0.0004925123829806708, + "loss": 0.2385, + "step": 56660 + }, + { + "epoch": 2.35, + "grad_norm": 0.3046875, + "learning_rate": 0.0004925097483766233, + "loss": 0.2066, + "step": 56670 + }, + { + "epoch": 2.35, + "grad_norm": 0.474609375, + "learning_rate": 0.0004925071133161986, + "loss": 0.2529, + "step": 56680 + }, + { + "epoch": 2.35, + "grad_norm": 0.62109375, + "learning_rate": 0.0004925044777994018, + "loss": 0.2458, + "step": 56690 + }, + { + "epoch": 2.35, + "grad_norm": 0.546875, + "learning_rate": 0.0004925018418262377, + "loss": 0.1916, + "step": 56700 + }, + { + "epoch": 2.35, + "grad_norm": 0.6875, + "learning_rate": 0.0004924992053967113, + "loss": 0.1819, + "step": 56710 + }, + { + "epoch": 2.35, + "grad_norm": 0.26171875, + "learning_rate": 0.0004924965685108276, + "loss": 0.2381, + "step": 56720 + }, + { + "epoch": 2.35, + "grad_norm": 1.3359375, + "learning_rate": 0.0004924939311685915, + "loss": 0.2623, + "step": 56730 + }, + { + "epoch": 2.35, + "grad_norm": 0.181640625, + "learning_rate": 0.0004924912933700081, + "loss": 0.2096, + "step": 56740 + }, + { + "epoch": 2.35, + "grad_norm": 0.6796875, + "learning_rate": 0.0004924886551150823, + "loss": 0.247, + "step": 56750 + }, + { + "epoch": 2.35, + "grad_norm": 0.45703125, + "learning_rate": 0.0004924860164038189, + "loss": 0.2347, + "step": 56760 + }, + { + "epoch": 2.35, + "grad_norm": 0.859375, + "learning_rate": 0.0004924833772362232, + "loss": 0.2366, + "step": 56770 + }, + { + "epoch": 2.35, + "grad_norm": 1.03125, + "learning_rate": 0.0004924807376122998, + "loss": 0.1789, + "step": 56780 + }, + { + "epoch": 2.35, + "grad_norm": 0.60546875, + "learning_rate": 0.0004924780975320539, + "loss": 0.1752, + "step": 56790 + }, + { + "epoch": 2.35, + "grad_norm": 1.3125, + "learning_rate": 0.0004924754569954904, + "loss": 0.1929, + "step": 56800 + }, + { + "epoch": 2.35, + "grad_norm": 0.6796875, + "learning_rate": 0.0004924728160026143, + "loss": 0.2186, + "step": 56810 + }, + { + "epoch": 2.35, + "grad_norm": 0.85546875, + "learning_rate": 0.0004924701745534305, + "loss": 0.2241, + "step": 56820 + }, + { + "epoch": 2.35, + "grad_norm": 0.68359375, + "learning_rate": 0.000492467532647944, + "loss": 0.2193, + "step": 56830 + }, + { + "epoch": 2.35, + "grad_norm": 0.6640625, + "learning_rate": 0.0004924648902861599, + "loss": 0.1981, + "step": 56840 + }, + { + "epoch": 2.35, + "grad_norm": 0.6953125, + "learning_rate": 0.000492462247468083, + "loss": 0.1805, + "step": 56850 + }, + { + "epoch": 2.36, + "grad_norm": 0.208984375, + "learning_rate": 0.0004924596041937183, + "loss": 0.195, + "step": 56860 + }, + { + "epoch": 2.36, + "grad_norm": 0.5859375, + "learning_rate": 0.0004924569604630708, + "loss": 0.2678, + "step": 56870 + }, + { + "epoch": 2.36, + "grad_norm": 1.0234375, + "learning_rate": 0.0004924543162761455, + "loss": 0.2283, + "step": 56880 + }, + { + "epoch": 2.36, + "grad_norm": 0.76953125, + "learning_rate": 0.0004924516716329474, + "loss": 0.2315, + "step": 56890 + }, + { + "epoch": 2.36, + "grad_norm": 0.341796875, + "learning_rate": 0.0004924490265334813, + "loss": 0.1918, + "step": 56900 + }, + { + "epoch": 2.36, + "grad_norm": 0.8671875, + "learning_rate": 0.0004924463809777525, + "loss": 0.224, + "step": 56910 + }, + { + "epoch": 2.36, + "grad_norm": 0.40234375, + "learning_rate": 0.0004924437349657656, + "loss": 0.1886, + "step": 56920 + }, + { + "epoch": 2.36, + "grad_norm": 0.443359375, + "learning_rate": 0.000492441088497526, + "loss": 0.2391, + "step": 56930 + }, + { + "epoch": 2.36, + "grad_norm": 0.462890625, + "learning_rate": 0.0004924384415730383, + "loss": 0.2355, + "step": 56940 + }, + { + "epoch": 2.36, + "grad_norm": 0.51171875, + "learning_rate": 0.0004924357941923077, + "loss": 0.2521, + "step": 56950 + }, + { + "epoch": 2.36, + "grad_norm": 0.53125, + "learning_rate": 0.0004924331463553391, + "loss": 0.2344, + "step": 56960 + }, + { + "epoch": 2.36, + "grad_norm": 0.40625, + "learning_rate": 0.0004924304980621375, + "loss": 0.193, + "step": 56970 + }, + { + "epoch": 2.36, + "grad_norm": 0.55078125, + "learning_rate": 0.0004924278493127078, + "loss": 0.2024, + "step": 56980 + }, + { + "epoch": 2.36, + "grad_norm": 0.6640625, + "learning_rate": 0.0004924252001070552, + "loss": 0.2181, + "step": 56990 + }, + { + "epoch": 2.36, + "grad_norm": 0.494140625, + "learning_rate": 0.0004924225504451845, + "loss": 0.1681, + "step": 57000 + }, + { + "epoch": 2.36, + "grad_norm": 1.171875, + "learning_rate": 0.0004924199003271006, + "loss": 0.1974, + "step": 57010 + }, + { + "epoch": 2.36, + "grad_norm": 0.6171875, + "learning_rate": 0.0004924172497528088, + "loss": 0.2431, + "step": 57020 + }, + { + "epoch": 2.36, + "grad_norm": 0.390625, + "learning_rate": 0.0004924145987223139, + "loss": 0.1602, + "step": 57030 + }, + { + "epoch": 2.36, + "grad_norm": 0.65625, + "learning_rate": 0.0004924119472356209, + "loss": 0.1773, + "step": 57040 + }, + { + "epoch": 2.36, + "grad_norm": 0.91796875, + "learning_rate": 0.0004924092952927347, + "loss": 0.1908, + "step": 57050 + }, + { + "epoch": 2.36, + "grad_norm": 0.8984375, + "learning_rate": 0.0004924066428936604, + "loss": 0.234, + "step": 57060 + }, + { + "epoch": 2.36, + "grad_norm": 0.361328125, + "learning_rate": 0.0004924039900384031, + "loss": 0.2041, + "step": 57070 + }, + { + "epoch": 2.36, + "grad_norm": 0.83203125, + "learning_rate": 0.0004924013367269677, + "loss": 0.2157, + "step": 57080 + }, + { + "epoch": 2.36, + "grad_norm": 0.7109375, + "learning_rate": 0.000492398682959359, + "loss": 0.2218, + "step": 57090 + }, + { + "epoch": 2.37, + "grad_norm": 0.578125, + "learning_rate": 0.0004923960287355821, + "loss": 0.2733, + "step": 57100 + }, + { + "epoch": 2.37, + "grad_norm": 1.1484375, + "learning_rate": 0.0004923933740556422, + "loss": 0.2468, + "step": 57110 + }, + { + "epoch": 2.37, + "grad_norm": 0.609375, + "learning_rate": 0.0004923907189195441, + "loss": 0.1934, + "step": 57120 + }, + { + "epoch": 2.37, + "grad_norm": 1.6171875, + "learning_rate": 0.0004923880633272929, + "loss": 0.2, + "step": 57130 + }, + { + "epoch": 2.37, + "grad_norm": 0.796875, + "learning_rate": 0.0004923854072788935, + "loss": 0.1896, + "step": 57140 + }, + { + "epoch": 2.37, + "grad_norm": 0.53125, + "learning_rate": 0.0004923827507743509, + "loss": 0.2673, + "step": 57150 + }, + { + "epoch": 2.37, + "grad_norm": 0.6484375, + "learning_rate": 0.0004923800938136702, + "loss": 0.2337, + "step": 57160 + }, + { + "epoch": 2.37, + "grad_norm": 0.578125, + "learning_rate": 0.0004923774363968563, + "loss": 0.2149, + "step": 57170 + }, + { + "epoch": 2.37, + "grad_norm": 4.03125, + "learning_rate": 0.0004923747785239142, + "loss": 0.3006, + "step": 57180 + }, + { + "epoch": 2.37, + "grad_norm": 1.078125, + "learning_rate": 0.000492372120194849, + "loss": 0.1856, + "step": 57190 + }, + { + "epoch": 2.37, + "grad_norm": 1.0546875, + "learning_rate": 0.0004923694614096657, + "loss": 0.2317, + "step": 57200 + }, + { + "epoch": 2.37, + "grad_norm": 1.109375, + "learning_rate": 0.0004923668021683691, + "loss": 0.2048, + "step": 57210 + }, + { + "epoch": 2.37, + "grad_norm": 0.515625, + "learning_rate": 0.0004923641424709644, + "loss": 0.2477, + "step": 57220 + }, + { + "epoch": 2.37, + "grad_norm": 0.78515625, + "learning_rate": 0.0004923614823174567, + "loss": 0.2109, + "step": 57230 + }, + { + "epoch": 2.37, + "grad_norm": 0.392578125, + "learning_rate": 0.0004923588217078507, + "loss": 0.2351, + "step": 57240 + }, + { + "epoch": 2.37, + "grad_norm": 0.478515625, + "learning_rate": 0.0004923561606421516, + "loss": 0.138, + "step": 57250 + }, + { + "epoch": 2.37, + "grad_norm": 0.90234375, + "learning_rate": 0.0004923534991203645, + "loss": 0.2211, + "step": 57260 + }, + { + "epoch": 2.37, + "grad_norm": 0.248046875, + "learning_rate": 0.0004923508371424941, + "loss": 0.2189, + "step": 57270 + }, + { + "epoch": 2.37, + "grad_norm": 0.86328125, + "learning_rate": 0.0004923481747085457, + "loss": 0.1913, + "step": 57280 + }, + { + "epoch": 2.37, + "grad_norm": 0.2578125, + "learning_rate": 0.0004923455118185241, + "loss": 0.2331, + "step": 57290 + }, + { + "epoch": 2.37, + "grad_norm": 0.9375, + "learning_rate": 0.0004923428484724346, + "loss": 0.2218, + "step": 57300 + }, + { + "epoch": 2.37, + "grad_norm": 2.1875, + "learning_rate": 0.0004923401846702819, + "loss": 0.2475, + "step": 57310 + }, + { + "epoch": 2.37, + "grad_norm": 0.6015625, + "learning_rate": 0.0004923375204120711, + "loss": 0.2418, + "step": 57320 + }, + { + "epoch": 2.37, + "grad_norm": 0.353515625, + "learning_rate": 0.0004923348556978074, + "loss": 0.1531, + "step": 57330 + }, + { + "epoch": 2.38, + "grad_norm": 0.416015625, + "learning_rate": 0.0004923321905274956, + "loss": 0.2093, + "step": 57340 + }, + { + "epoch": 2.38, + "grad_norm": 0.5234375, + "learning_rate": 0.0004923295249011408, + "loss": 0.1796, + "step": 57350 + }, + { + "epoch": 2.38, + "grad_norm": 0.4375, + "learning_rate": 0.000492326858818748, + "loss": 0.2126, + "step": 57360 + }, + { + "epoch": 2.38, + "grad_norm": 0.8359375, + "learning_rate": 0.0004923241922803222, + "loss": 0.2103, + "step": 57370 + }, + { + "epoch": 2.38, + "grad_norm": 0.84765625, + "learning_rate": 0.0004923215252858685, + "loss": 0.2044, + "step": 57380 + }, + { + "epoch": 2.38, + "grad_norm": 0.8125, + "learning_rate": 0.0004923188578353917, + "loss": 0.1752, + "step": 57390 + }, + { + "epoch": 2.38, + "grad_norm": 1.171875, + "learning_rate": 0.0004923161899288972, + "loss": 0.2142, + "step": 57400 + }, + { + "epoch": 2.38, + "grad_norm": 0.7578125, + "learning_rate": 0.0004923135215663897, + "loss": 0.1583, + "step": 57410 + }, + { + "epoch": 2.38, + "grad_norm": 0.671875, + "learning_rate": 0.0004923108527478742, + "loss": 0.1843, + "step": 57420 + }, + { + "epoch": 2.38, + "grad_norm": 0.3046875, + "learning_rate": 0.0004923081834733561, + "loss": 0.2353, + "step": 57430 + }, + { + "epoch": 2.38, + "grad_norm": 0.81640625, + "learning_rate": 0.0004923055137428399, + "loss": 0.2397, + "step": 57440 + }, + { + "epoch": 2.38, + "grad_norm": 0.37890625, + "learning_rate": 0.000492302843556331, + "loss": 0.2243, + "step": 57450 + }, + { + "epoch": 2.38, + "grad_norm": 0.6171875, + "learning_rate": 0.0004923001729138343, + "loss": 0.235, + "step": 57460 + }, + { + "epoch": 2.38, + "grad_norm": 0.359375, + "learning_rate": 0.0004922975018153549, + "loss": 0.2471, + "step": 57470 + }, + { + "epoch": 2.38, + "grad_norm": 0.82421875, + "learning_rate": 0.0004922948302608978, + "loss": 0.2162, + "step": 57480 + }, + { + "epoch": 2.38, + "grad_norm": 1.65625, + "learning_rate": 0.0004922921582504679, + "loss": 0.2111, + "step": 57490 + }, + { + "epoch": 2.38, + "grad_norm": 0.96875, + "learning_rate": 0.0004922894857840703, + "loss": 0.2441, + "step": 57500 + }, + { + "epoch": 2.38, + "grad_norm": 0.62109375, + "learning_rate": 0.0004922868128617102, + "loss": 0.198, + "step": 57510 + }, + { + "epoch": 2.38, + "grad_norm": 0.4921875, + "learning_rate": 0.0004922841394833923, + "loss": 0.2198, + "step": 57520 + }, + { + "epoch": 2.38, + "grad_norm": 0.71875, + "learning_rate": 0.0004922814656491219, + "loss": 0.2423, + "step": 57530 + }, + { + "epoch": 2.38, + "grad_norm": 0.51171875, + "learning_rate": 0.0004922787913589039, + "loss": 0.1888, + "step": 57540 + }, + { + "epoch": 2.38, + "grad_norm": 1.8125, + "learning_rate": 0.0004922761166127435, + "loss": 0.2127, + "step": 57550 + }, + { + "epoch": 2.38, + "grad_norm": 0.5, + "learning_rate": 0.0004922734414106456, + "loss": 0.1795, + "step": 57560 + }, + { + "epoch": 2.38, + "grad_norm": 1.0625, + "learning_rate": 0.0004922707657526151, + "loss": 0.2226, + "step": 57570 + }, + { + "epoch": 2.38, + "grad_norm": 0.765625, + "learning_rate": 0.0004922680896386573, + "loss": 0.2149, + "step": 57580 + }, + { + "epoch": 2.39, + "grad_norm": 0.51171875, + "learning_rate": 0.0004922654130687771, + "loss": 0.2322, + "step": 57590 + }, + { + "epoch": 2.39, + "grad_norm": 0.52734375, + "learning_rate": 0.0004922627360429795, + "loss": 0.2321, + "step": 57600 + }, + { + "epoch": 2.39, + "grad_norm": 0.671875, + "learning_rate": 0.0004922600585612697, + "loss": 0.2374, + "step": 57610 + }, + { + "epoch": 2.39, + "grad_norm": 0.69921875, + "learning_rate": 0.0004922573806236525, + "loss": 0.1835, + "step": 57620 + }, + { + "epoch": 2.39, + "grad_norm": 0.6328125, + "learning_rate": 0.0004922547022301331, + "loss": 0.2544, + "step": 57630 + }, + { + "epoch": 2.39, + "grad_norm": 0.91796875, + "learning_rate": 0.0004922520233807165, + "loss": 0.2309, + "step": 57640 + }, + { + "epoch": 2.39, + "grad_norm": 0.625, + "learning_rate": 0.0004922493440754079, + "loss": 0.2778, + "step": 57650 + }, + { + "epoch": 2.39, + "grad_norm": 0.5390625, + "learning_rate": 0.000492246664314212, + "loss": 0.2441, + "step": 57660 + }, + { + "epoch": 2.39, + "grad_norm": 0.77734375, + "learning_rate": 0.0004922439840971341, + "loss": 0.2017, + "step": 57670 + }, + { + "epoch": 2.39, + "grad_norm": 0.58203125, + "learning_rate": 0.0004922413034241793, + "loss": 0.232, + "step": 57680 + }, + { + "epoch": 2.39, + "grad_norm": 0.515625, + "learning_rate": 0.0004922386222953524, + "loss": 0.2299, + "step": 57690 + }, + { + "epoch": 2.39, + "grad_norm": 0.498046875, + "learning_rate": 0.0004922359407106586, + "loss": 0.1858, + "step": 57700 + }, + { + "epoch": 2.39, + "grad_norm": 0.314453125, + "learning_rate": 0.0004922332586701029, + "loss": 0.2113, + "step": 57710 + }, + { + "epoch": 2.39, + "grad_norm": 0.85546875, + "learning_rate": 0.0004922305761736905, + "loss": 0.2311, + "step": 57720 + }, + { + "epoch": 2.39, + "grad_norm": 0.65234375, + "learning_rate": 0.0004922278932214262, + "loss": 0.2297, + "step": 57730 + }, + { + "epoch": 2.39, + "grad_norm": 0.5859375, + "learning_rate": 0.0004922252098133152, + "loss": 0.2417, + "step": 57740 + }, + { + "epoch": 2.39, + "grad_norm": 0.58203125, + "learning_rate": 0.0004922225259493625, + "loss": 0.2377, + "step": 57750 + }, + { + "epoch": 2.39, + "grad_norm": 0.9140625, + "learning_rate": 0.0004922198416295731, + "loss": 0.2339, + "step": 57760 + }, + { + "epoch": 2.39, + "grad_norm": 0.59375, + "learning_rate": 0.0004922171568539522, + "loss": 0.2342, + "step": 57770 + }, + { + "epoch": 2.39, + "grad_norm": 0.392578125, + "learning_rate": 0.0004922144716225047, + "loss": 0.217, + "step": 57780 + }, + { + "epoch": 2.39, + "grad_norm": 0.181640625, + "learning_rate": 0.0004922117859352357, + "loss": 0.1867, + "step": 57790 + }, + { + "epoch": 2.39, + "grad_norm": 0.490234375, + "learning_rate": 0.0004922090997921503, + "loss": 0.2016, + "step": 57800 + }, + { + "epoch": 2.39, + "grad_norm": 0.70703125, + "learning_rate": 0.0004922064131932536, + "loss": 0.1825, + "step": 57810 + }, + { + "epoch": 2.39, + "grad_norm": 0.55078125, + "learning_rate": 0.0004922037261385506, + "loss": 0.227, + "step": 57820 + }, + { + "epoch": 2.4, + "grad_norm": 0.294921875, + "learning_rate": 0.0004922010386280462, + "loss": 0.2237, + "step": 57830 + }, + { + "epoch": 2.4, + "grad_norm": 0.5234375, + "learning_rate": 0.0004921983506617457, + "loss": 0.2416, + "step": 57840 + }, + { + "epoch": 2.4, + "grad_norm": 0.69140625, + "learning_rate": 0.0004921956622396541, + "loss": 0.2162, + "step": 57850 + }, + { + "epoch": 2.4, + "grad_norm": 0.74609375, + "learning_rate": 0.0004921929733617764, + "loss": 0.1963, + "step": 57860 + }, + { + "epoch": 2.4, + "grad_norm": 0.61328125, + "learning_rate": 0.0004921902840281176, + "loss": 0.2397, + "step": 57870 + }, + { + "epoch": 2.4, + "grad_norm": 0.4140625, + "learning_rate": 0.0004921875942386829, + "loss": 0.1901, + "step": 57880 + }, + { + "epoch": 2.4, + "grad_norm": 0.61328125, + "learning_rate": 0.0004921849039934773, + "loss": 0.209, + "step": 57890 + }, + { + "epoch": 2.4, + "grad_norm": 0.5703125, + "learning_rate": 0.0004921822132925059, + "loss": 0.2359, + "step": 57900 + }, + { + "epoch": 2.4, + "grad_norm": 0.47265625, + "learning_rate": 0.0004921795221357737, + "loss": 0.2287, + "step": 57910 + }, + { + "epoch": 2.4, + "grad_norm": 0.298828125, + "learning_rate": 0.0004921768305232858, + "loss": 0.1984, + "step": 57920 + }, + { + "epoch": 2.4, + "grad_norm": 0.71484375, + "learning_rate": 0.0004921741384550472, + "loss": 0.26, + "step": 57930 + }, + { + "epoch": 2.4, + "grad_norm": 0.66796875, + "learning_rate": 0.000492171445931063, + "loss": 0.2435, + "step": 57940 + }, + { + "epoch": 2.4, + "grad_norm": 1.109375, + "learning_rate": 0.0004921687529513383, + "loss": 0.1944, + "step": 57950 + }, + { + "epoch": 2.4, + "grad_norm": 0.79296875, + "learning_rate": 0.0004921660595158783, + "loss": 0.2293, + "step": 57960 + }, + { + "epoch": 2.4, + "grad_norm": 0.60546875, + "learning_rate": 0.0004921633656246877, + "loss": 0.2294, + "step": 57970 + }, + { + "epoch": 2.4, + "grad_norm": 0.3828125, + "learning_rate": 0.000492160671277772, + "loss": 0.2027, + "step": 57980 + }, + { + "epoch": 2.4, + "grad_norm": 0.55859375, + "learning_rate": 0.000492157976475136, + "loss": 0.2228, + "step": 57990 + }, + { + "epoch": 2.4, + "grad_norm": 0.7734375, + "learning_rate": 0.0004921552812167849, + "loss": 0.2381, + "step": 58000 + }, + { + "epoch": 2.4, + "grad_norm": 0.25, + "learning_rate": 0.0004921525855027236, + "loss": 0.2455, + "step": 58010 + }, + { + "epoch": 2.4, + "grad_norm": 0.8984375, + "learning_rate": 0.0004921498893329573, + "loss": 0.197, + "step": 58020 + }, + { + "epoch": 2.4, + "grad_norm": 0.6640625, + "learning_rate": 0.0004921471927074911, + "loss": 0.2484, + "step": 58030 + }, + { + "epoch": 2.4, + "grad_norm": 0.765625, + "learning_rate": 0.00049214449562633, + "loss": 0.2072, + "step": 58040 + }, + { + "epoch": 2.4, + "grad_norm": 0.376953125, + "learning_rate": 0.0004921417980894792, + "loss": 0.2464, + "step": 58050 + }, + { + "epoch": 2.4, + "grad_norm": 0.345703125, + "learning_rate": 0.0004921391000969436, + "loss": 0.2275, + "step": 58060 + }, + { + "epoch": 2.41, + "grad_norm": 0.76171875, + "learning_rate": 0.0004921364016487284, + "loss": 0.2265, + "step": 58070 + }, + { + "epoch": 2.41, + "grad_norm": 0.71875, + "learning_rate": 0.0004921337027448386, + "loss": 0.2182, + "step": 58080 + }, + { + "epoch": 2.41, + "grad_norm": 0.9140625, + "learning_rate": 0.0004921310033852794, + "loss": 0.2331, + "step": 58090 + }, + { + "epoch": 2.41, + "grad_norm": 0.404296875, + "learning_rate": 0.0004921283035700557, + "loss": 0.2476, + "step": 58100 + }, + { + "epoch": 2.41, + "grad_norm": 0.44921875, + "learning_rate": 0.0004921256032991728, + "loss": 0.1948, + "step": 58110 + }, + { + "epoch": 2.41, + "grad_norm": 0.388671875, + "learning_rate": 0.0004921229025726354, + "loss": 0.1801, + "step": 58120 + }, + { + "epoch": 2.41, + "grad_norm": 0.58203125, + "learning_rate": 0.0004921202013904491, + "loss": 0.2708, + "step": 58130 + }, + { + "epoch": 2.41, + "grad_norm": 0.99609375, + "learning_rate": 0.0004921174997526187, + "loss": 0.2521, + "step": 58140 + }, + { + "epoch": 2.41, + "grad_norm": 2.28125, + "learning_rate": 0.0004921147976591492, + "loss": 0.2113, + "step": 58150 + }, + { + "epoch": 2.41, + "grad_norm": 0.6875, + "learning_rate": 0.000492112095110046, + "loss": 0.2355, + "step": 58160 + }, + { + "epoch": 2.41, + "grad_norm": 0.314453125, + "learning_rate": 0.0004921093921053138, + "loss": 0.2152, + "step": 58170 + }, + { + "epoch": 2.41, + "grad_norm": 0.60546875, + "learning_rate": 0.0004921066886449579, + "loss": 0.2789, + "step": 58180 + }, + { + "epoch": 2.41, + "grad_norm": 1.3125, + "learning_rate": 0.0004921039847289833, + "loss": 0.1992, + "step": 58190 + }, + { + "epoch": 2.41, + "grad_norm": 0.26171875, + "learning_rate": 0.0004921012803573953, + "loss": 0.2081, + "step": 58200 + }, + { + "epoch": 2.41, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004920985755301988, + "loss": 0.1916, + "step": 58210 + }, + { + "epoch": 2.41, + "grad_norm": 0.64453125, + "learning_rate": 0.0004920958702473988, + "loss": 0.2164, + "step": 58220 + }, + { + "epoch": 2.41, + "grad_norm": 0.66796875, + "learning_rate": 0.0004920931645090006, + "loss": 0.2504, + "step": 58230 + }, + { + "epoch": 2.41, + "grad_norm": 0.3671875, + "learning_rate": 0.0004920904583150092, + "loss": 0.2325, + "step": 58240 + }, + { + "epoch": 2.41, + "grad_norm": 0.70703125, + "learning_rate": 0.0004920877516654298, + "loss": 0.2512, + "step": 58250 + }, + { + "epoch": 2.41, + "grad_norm": 0.345703125, + "learning_rate": 0.0004920850445602672, + "loss": 0.1884, + "step": 58260 + }, + { + "epoch": 2.41, + "grad_norm": 1.2578125, + "learning_rate": 0.0004920823369995268, + "loss": 0.2514, + "step": 58270 + }, + { + "epoch": 2.41, + "grad_norm": 0.47265625, + "learning_rate": 0.0004920796289832136, + "loss": 0.1897, + "step": 58280 + }, + { + "epoch": 2.41, + "grad_norm": 1.390625, + "learning_rate": 0.0004920769205113327, + "loss": 0.1859, + "step": 58290 + }, + { + "epoch": 2.41, + "grad_norm": 0.59765625, + "learning_rate": 0.0004920742115838891, + "loss": 0.2543, + "step": 58300 + }, + { + "epoch": 2.42, + "grad_norm": 0.5, + "learning_rate": 0.0004920715022008879, + "loss": 0.1794, + "step": 58310 + }, + { + "epoch": 2.42, + "grad_norm": 0.51953125, + "learning_rate": 0.0004920687923623345, + "loss": 0.2343, + "step": 58320 + }, + { + "epoch": 2.42, + "grad_norm": 1.2109375, + "learning_rate": 0.0004920660820682336, + "loss": 0.2533, + "step": 58330 + }, + { + "epoch": 2.42, + "grad_norm": 0.90625, + "learning_rate": 0.0004920633713185906, + "loss": 0.2374, + "step": 58340 + }, + { + "epoch": 2.42, + "grad_norm": 0.306640625, + "learning_rate": 0.0004920606601134103, + "loss": 0.1632, + "step": 58350 + }, + { + "epoch": 2.42, + "grad_norm": 0.4140625, + "learning_rate": 0.0004920579484526982, + "loss": 0.2157, + "step": 58360 + }, + { + "epoch": 2.42, + "grad_norm": 0.2060546875, + "learning_rate": 0.000492055236336459, + "loss": 0.2414, + "step": 58370 + }, + { + "epoch": 2.42, + "grad_norm": 1.609375, + "learning_rate": 0.000492052523764698, + "loss": 0.193, + "step": 58380 + }, + { + "epoch": 2.42, + "grad_norm": 0.427734375, + "learning_rate": 0.0004920498107374204, + "loss": 0.1916, + "step": 58390 + }, + { + "epoch": 2.42, + "grad_norm": 0.890625, + "learning_rate": 0.0004920470972546311, + "loss": 0.2333, + "step": 58400 + }, + { + "epoch": 2.42, + "grad_norm": 1.7734375, + "learning_rate": 0.0004920443833163353, + "loss": 0.2674, + "step": 58410 + }, + { + "epoch": 2.42, + "grad_norm": 0.7734375, + "learning_rate": 0.0004920416689225382, + "loss": 0.2395, + "step": 58420 + }, + { + "epoch": 2.42, + "grad_norm": 0.58203125, + "learning_rate": 0.0004920389540732448, + "loss": 0.1942, + "step": 58430 + }, + { + "epoch": 2.42, + "grad_norm": 1.234375, + "learning_rate": 0.0004920362387684601, + "loss": 0.2225, + "step": 58440 + }, + { + "epoch": 2.42, + "grad_norm": 0.1591796875, + "learning_rate": 0.0004920335230081895, + "loss": 0.1782, + "step": 58450 + }, + { + "epoch": 2.42, + "grad_norm": 0.890625, + "learning_rate": 0.0004920308067924378, + "loss": 0.2845, + "step": 58460 + }, + { + "epoch": 2.42, + "grad_norm": 0.890625, + "learning_rate": 0.0004920280901212103, + "loss": 0.2516, + "step": 58470 + }, + { + "epoch": 2.42, + "grad_norm": 1.296875, + "learning_rate": 0.0004920253729945121, + "loss": 0.2478, + "step": 58480 + }, + { + "epoch": 2.42, + "grad_norm": 1.078125, + "learning_rate": 0.0004920226554123484, + "loss": 0.2678, + "step": 58490 + }, + { + "epoch": 2.42, + "grad_norm": 0.8984375, + "learning_rate": 0.0004920199373747241, + "loss": 0.1921, + "step": 58500 + }, + { + "epoch": 2.42, + "grad_norm": 0.46875, + "learning_rate": 0.0004920172188816443, + "loss": 0.2097, + "step": 58510 + }, + { + "epoch": 2.42, + "grad_norm": 0.388671875, + "learning_rate": 0.0004920144999331144, + "loss": 0.1718, + "step": 58520 + }, + { + "epoch": 2.42, + "grad_norm": 0.59765625, + "learning_rate": 0.0004920117805291392, + "loss": 0.2489, + "step": 58530 + }, + { + "epoch": 2.42, + "grad_norm": 1.1328125, + "learning_rate": 0.0004920090606697241, + "loss": 0.2269, + "step": 58540 + }, + { + "epoch": 2.43, + "grad_norm": 1.09375, + "learning_rate": 0.000492006340354874, + "loss": 0.2097, + "step": 58550 + }, + { + "epoch": 2.43, + "grad_norm": 0.3671875, + "learning_rate": 0.0004920036195845942, + "loss": 0.2339, + "step": 58560 + }, + { + "epoch": 2.43, + "grad_norm": 0.578125, + "learning_rate": 0.0004920008983588896, + "loss": 0.2025, + "step": 58570 + }, + { + "epoch": 2.43, + "grad_norm": 1.5390625, + "learning_rate": 0.0004919981766777655, + "loss": 0.2149, + "step": 58580 + }, + { + "epoch": 2.43, + "grad_norm": 0.8984375, + "learning_rate": 0.0004919954545412269, + "loss": 0.1969, + "step": 58590 + }, + { + "epoch": 2.43, + "grad_norm": 2.328125, + "learning_rate": 0.0004919927319492791, + "loss": 0.2449, + "step": 58600 + }, + { + "epoch": 2.43, + "grad_norm": 0.83984375, + "learning_rate": 0.000491990008901927, + "loss": 0.2345, + "step": 58610 + }, + { + "epoch": 2.43, + "grad_norm": 1.078125, + "learning_rate": 0.0004919872853991759, + "loss": 0.1765, + "step": 58620 + }, + { + "epoch": 2.43, + "grad_norm": 1.2578125, + "learning_rate": 0.0004919845614410309, + "loss": 0.2555, + "step": 58630 + }, + { + "epoch": 2.43, + "grad_norm": 0.3984375, + "learning_rate": 0.000491981837027497, + "loss": 0.2204, + "step": 58640 + }, + { + "epoch": 2.43, + "grad_norm": 0.6328125, + "learning_rate": 0.0004919791121585794, + "loss": 0.2216, + "step": 58650 + }, + { + "epoch": 2.43, + "grad_norm": 0.75, + "learning_rate": 0.0004919763868342833, + "loss": 0.1738, + "step": 58660 + }, + { + "epoch": 2.43, + "grad_norm": 0.9765625, + "learning_rate": 0.0004919736610546139, + "loss": 0.2434, + "step": 58670 + }, + { + "epoch": 2.43, + "grad_norm": 1.9140625, + "learning_rate": 0.000491970934819576, + "loss": 0.2191, + "step": 58680 + }, + { + "epoch": 2.43, + "grad_norm": 0.62890625, + "learning_rate": 0.000491968208129175, + "loss": 0.1824, + "step": 58690 + }, + { + "epoch": 2.43, + "grad_norm": 1.34375, + "learning_rate": 0.0004919654809834159, + "loss": 0.2052, + "step": 58700 + }, + { + "epoch": 2.43, + "grad_norm": 0.8515625, + "learning_rate": 0.000491962753382304, + "loss": 0.2271, + "step": 58710 + }, + { + "epoch": 2.43, + "grad_norm": 0.46875, + "learning_rate": 0.0004919600253258442, + "loss": 0.1954, + "step": 58720 + }, + { + "epoch": 2.43, + "grad_norm": 3.5625, + "learning_rate": 0.0004919572968140419, + "loss": 0.1639, + "step": 58730 + }, + { + "epoch": 2.43, + "grad_norm": 0.7421875, + "learning_rate": 0.0004919545678469021, + "loss": 0.2742, + "step": 58740 + }, + { + "epoch": 2.43, + "grad_norm": 0.6953125, + "learning_rate": 0.0004919518384244298, + "loss": 0.2171, + "step": 58750 + }, + { + "epoch": 2.43, + "grad_norm": 0.828125, + "learning_rate": 0.0004919491085466303, + "loss": 0.2722, + "step": 58760 + }, + { + "epoch": 2.43, + "grad_norm": 0.2265625, + "learning_rate": 0.0004919463782135088, + "loss": 0.2453, + "step": 58770 + }, + { + "epoch": 2.43, + "grad_norm": 0.6875, + "learning_rate": 0.0004919436474250703, + "loss": 0.2368, + "step": 58780 + }, + { + "epoch": 2.44, + "grad_norm": 0.8046875, + "learning_rate": 0.00049194091618132, + "loss": 0.1693, + "step": 58790 + }, + { + "epoch": 2.44, + "grad_norm": 0.89453125, + "learning_rate": 0.0004919381844822629, + "loss": 0.2475, + "step": 58800 + }, + { + "epoch": 2.44, + "grad_norm": 0.640625, + "learning_rate": 0.0004919354523279044, + "loss": 0.263, + "step": 58810 + }, + { + "epoch": 2.44, + "grad_norm": 0.69140625, + "learning_rate": 0.0004919327197182495, + "loss": 0.2022, + "step": 58820 + }, + { + "epoch": 2.44, + "grad_norm": 0.70703125, + "learning_rate": 0.0004919299866533033, + "loss": 0.1932, + "step": 58830 + }, + { + "epoch": 2.44, + "grad_norm": 0.54296875, + "learning_rate": 0.000491927253133071, + "loss": 0.2296, + "step": 58840 + }, + { + "epoch": 2.44, + "grad_norm": 0.69921875, + "learning_rate": 0.0004919245191575578, + "loss": 0.2003, + "step": 58850 + }, + { + "epoch": 2.44, + "grad_norm": 0.55078125, + "learning_rate": 0.0004919217847267687, + "loss": 0.1861, + "step": 58860 + }, + { + "epoch": 2.44, + "grad_norm": 0.296875, + "learning_rate": 0.000491919049840709, + "loss": 0.1423, + "step": 58870 + }, + { + "epoch": 2.44, + "grad_norm": 1.5078125, + "learning_rate": 0.0004919163144993837, + "loss": 0.2245, + "step": 58880 + }, + { + "epoch": 2.44, + "grad_norm": 0.1767578125, + "learning_rate": 0.000491913578702798, + "loss": 0.1927, + "step": 58890 + }, + { + "epoch": 2.44, + "grad_norm": 0.56640625, + "learning_rate": 0.0004919108424509571, + "loss": 0.2075, + "step": 58900 + }, + { + "epoch": 2.44, + "grad_norm": 0.42578125, + "learning_rate": 0.0004919081057438661, + "loss": 0.2708, + "step": 58910 + }, + { + "epoch": 2.44, + "grad_norm": 0.59375, + "learning_rate": 0.0004919053685815303, + "loss": 0.2194, + "step": 58920 + }, + { + "epoch": 2.44, + "grad_norm": 0.36328125, + "learning_rate": 0.0004919026309639546, + "loss": 0.2284, + "step": 58930 + }, + { + "epoch": 2.44, + "grad_norm": 0.5, + "learning_rate": 0.0004918998928911442, + "loss": 0.243, + "step": 58940 + }, + { + "epoch": 2.44, + "grad_norm": 0.58984375, + "learning_rate": 0.0004918971543631045, + "loss": 0.2252, + "step": 58950 + }, + { + "epoch": 2.44, + "grad_norm": 0.4609375, + "learning_rate": 0.0004918944153798403, + "loss": 0.218, + "step": 58960 + }, + { + "epoch": 2.44, + "grad_norm": 0.5234375, + "learning_rate": 0.0004918916759413571, + "loss": 0.2274, + "step": 58970 + }, + { + "epoch": 2.44, + "grad_norm": 2.453125, + "learning_rate": 0.0004918889360476599, + "loss": 0.244, + "step": 58980 + }, + { + "epoch": 2.44, + "grad_norm": 0.7734375, + "learning_rate": 0.0004918861956987537, + "loss": 0.252, + "step": 58990 + }, + { + "epoch": 2.44, + "grad_norm": 0.609375, + "learning_rate": 0.0004918834548946438, + "loss": 0.1551, + "step": 59000 + }, + { + "epoch": 2.44, + "grad_norm": 0.7421875, + "learning_rate": 0.0004918807136353355, + "loss": 0.2063, + "step": 59010 + }, + { + "epoch": 2.44, + "grad_norm": 0.91796875, + "learning_rate": 0.0004918779719208337, + "loss": 0.263, + "step": 59020 + }, + { + "epoch": 2.45, + "grad_norm": 0.376953125, + "learning_rate": 0.0004918752297511437, + "loss": 0.2356, + "step": 59030 + }, + { + "epoch": 2.45, + "grad_norm": 1.1484375, + "learning_rate": 0.0004918724871262706, + "loss": 0.2077, + "step": 59040 + }, + { + "epoch": 2.45, + "grad_norm": 1.1171875, + "learning_rate": 0.0004918697440462197, + "loss": 0.1856, + "step": 59050 + }, + { + "epoch": 2.45, + "grad_norm": 0.71484375, + "learning_rate": 0.000491867000510996, + "loss": 0.2494, + "step": 59060 + }, + { + "epoch": 2.45, + "grad_norm": 0.9296875, + "learning_rate": 0.0004918642565206047, + "loss": 0.2428, + "step": 59070 + }, + { + "epoch": 2.45, + "grad_norm": 0.7578125, + "learning_rate": 0.0004918615120750511, + "loss": 0.2049, + "step": 59080 + }, + { + "epoch": 2.45, + "grad_norm": 0.69140625, + "learning_rate": 0.0004918587671743402, + "loss": 0.2147, + "step": 59090 + }, + { + "epoch": 2.45, + "grad_norm": 1.1875, + "learning_rate": 0.0004918560218184772, + "loss": 0.2217, + "step": 59100 + }, + { + "epoch": 2.45, + "grad_norm": 0.8671875, + "learning_rate": 0.0004918532760074672, + "loss": 0.2469, + "step": 59110 + }, + { + "epoch": 2.45, + "grad_norm": 0.82421875, + "learning_rate": 0.0004918505297413156, + "loss": 0.2107, + "step": 59120 + }, + { + "epoch": 2.45, + "grad_norm": 0.7421875, + "learning_rate": 0.0004918477830200273, + "loss": 0.2088, + "step": 59130 + }, + { + "epoch": 2.45, + "grad_norm": 1.1640625, + "learning_rate": 0.0004918450358436077, + "loss": 0.1849, + "step": 59140 + }, + { + "epoch": 2.45, + "grad_norm": 0.84765625, + "learning_rate": 0.0004918422882120618, + "loss": 0.2202, + "step": 59150 + }, + { + "epoch": 2.45, + "grad_norm": 0.9375, + "learning_rate": 0.0004918395401253948, + "loss": 0.1882, + "step": 59160 + }, + { + "epoch": 2.45, + "grad_norm": 0.3046875, + "learning_rate": 0.000491836791583612, + "loss": 0.2669, + "step": 59170 + }, + { + "epoch": 2.45, + "grad_norm": 0.63671875, + "learning_rate": 0.0004918340425867184, + "loss": 0.2336, + "step": 59180 + }, + { + "epoch": 2.45, + "grad_norm": 1.1171875, + "learning_rate": 0.0004918312931347192, + "loss": 0.1796, + "step": 59190 + }, + { + "epoch": 2.45, + "grad_norm": 0.70703125, + "learning_rate": 0.0004918285432276197, + "loss": 0.1921, + "step": 59200 + }, + { + "epoch": 2.45, + "grad_norm": 0.416015625, + "learning_rate": 0.000491825792865425, + "loss": 0.2791, + "step": 59210 + }, + { + "epoch": 2.45, + "grad_norm": 0.625, + "learning_rate": 0.0004918230420481402, + "loss": 0.2219, + "step": 59220 + }, + { + "epoch": 2.45, + "grad_norm": 0.58984375, + "learning_rate": 0.0004918202907757707, + "loss": 0.1898, + "step": 59230 + }, + { + "epoch": 2.45, + "grad_norm": 0.546875, + "learning_rate": 0.0004918175390483214, + "loss": 0.2017, + "step": 59240 + }, + { + "epoch": 2.45, + "grad_norm": 1.40625, + "learning_rate": 0.0004918147868657976, + "loss": 0.1781, + "step": 59250 + }, + { + "epoch": 2.45, + "grad_norm": 0.765625, + "learning_rate": 0.0004918120342282046, + "loss": 0.1941, + "step": 59260 + }, + { + "epoch": 2.45, + "grad_norm": 0.48828125, + "learning_rate": 0.0004918092811355473, + "loss": 0.22, + "step": 59270 + }, + { + "epoch": 2.46, + "grad_norm": 0.859375, + "learning_rate": 0.0004918065275878311, + "loss": 0.2737, + "step": 59280 + }, + { + "epoch": 2.46, + "grad_norm": 1.1875, + "learning_rate": 0.0004918037735850612, + "loss": 0.2153, + "step": 59290 + }, + { + "epoch": 2.46, + "grad_norm": 0.99609375, + "learning_rate": 0.0004918010191272426, + "loss": 0.2438, + "step": 59300 + }, + { + "epoch": 2.46, + "grad_norm": 0.322265625, + "learning_rate": 0.0004917982642143806, + "loss": 0.2223, + "step": 59310 + }, + { + "epoch": 2.46, + "grad_norm": 1.109375, + "learning_rate": 0.0004917955088464805, + "loss": 0.2452, + "step": 59320 + }, + { + "epoch": 2.46, + "grad_norm": 0.58984375, + "learning_rate": 0.0004917927530235473, + "loss": 0.2396, + "step": 59330 + }, + { + "epoch": 2.46, + "grad_norm": 0.828125, + "learning_rate": 0.0004917899967455863, + "loss": 0.219, + "step": 59340 + }, + { + "epoch": 2.46, + "grad_norm": 0.60546875, + "learning_rate": 0.0004917872400126026, + "loss": 0.1752, + "step": 59350 + }, + { + "epoch": 2.46, + "grad_norm": 0.5, + "learning_rate": 0.0004917844828246015, + "loss": 0.2601, + "step": 59360 + }, + { + "epoch": 2.46, + "grad_norm": 0.248046875, + "learning_rate": 0.0004917817251815879, + "loss": 0.1862, + "step": 59370 + }, + { + "epoch": 2.46, + "grad_norm": 0.49609375, + "learning_rate": 0.0004917789670835674, + "loss": 0.2633, + "step": 59380 + }, + { + "epoch": 2.46, + "grad_norm": 0.578125, + "learning_rate": 0.0004917762085305449, + "loss": 0.196, + "step": 59390 + }, + { + "epoch": 2.46, + "grad_norm": 0.306640625, + "learning_rate": 0.0004917734495225257, + "loss": 0.2427, + "step": 59400 + }, + { + "epoch": 2.46, + "grad_norm": 0.48046875, + "learning_rate": 0.000491770690059515, + "loss": 0.2416, + "step": 59410 + }, + { + "epoch": 2.46, + "grad_norm": 1.5859375, + "learning_rate": 0.000491767930141518, + "loss": 0.2367, + "step": 59420 + }, + { + "epoch": 2.46, + "grad_norm": 0.54296875, + "learning_rate": 0.0004917651697685398, + "loss": 0.2163, + "step": 59430 + }, + { + "epoch": 2.46, + "grad_norm": 0.77734375, + "learning_rate": 0.0004917624089405857, + "loss": 0.2352, + "step": 59440 + }, + { + "epoch": 2.46, + "grad_norm": 0.341796875, + "learning_rate": 0.0004917596476576608, + "loss": 0.1979, + "step": 59450 + }, + { + "epoch": 2.46, + "grad_norm": 0.138671875, + "learning_rate": 0.0004917568859197704, + "loss": 0.1776, + "step": 59460 + }, + { + "epoch": 2.46, + "grad_norm": 1.109375, + "learning_rate": 0.0004917541237269196, + "loss": 0.204, + "step": 59470 + }, + { + "epoch": 2.46, + "grad_norm": 0.5625, + "learning_rate": 0.0004917513610791137, + "loss": 0.2029, + "step": 59480 + }, + { + "epoch": 2.46, + "grad_norm": 0.796875, + "learning_rate": 0.0004917485979763579, + "loss": 0.1755, + "step": 59490 + }, + { + "epoch": 2.46, + "grad_norm": 0.181640625, + "learning_rate": 0.0004917458344186572, + "loss": 0.1455, + "step": 59500 + }, + { + "epoch": 2.46, + "grad_norm": 0.671875, + "learning_rate": 0.0004917430704060171, + "loss": 0.228, + "step": 59510 + }, + { + "epoch": 2.47, + "grad_norm": 0.51953125, + "learning_rate": 0.0004917403059384425, + "loss": 0.2166, + "step": 59520 + }, + { + "epoch": 2.47, + "grad_norm": 0.310546875, + "learning_rate": 0.0004917375410159388, + "loss": 0.2472, + "step": 59530 + }, + { + "epoch": 2.47, + "grad_norm": 0.486328125, + "learning_rate": 0.0004917347756385112, + "loss": 0.2399, + "step": 59540 + }, + { + "epoch": 2.47, + "grad_norm": 0.35546875, + "learning_rate": 0.0004917320098061648, + "loss": 0.1622, + "step": 59550 + }, + { + "epoch": 2.47, + "grad_norm": 0.640625, + "learning_rate": 0.0004917292435189049, + "loss": 0.2126, + "step": 59560 + }, + { + "epoch": 2.47, + "grad_norm": 0.60546875, + "learning_rate": 0.0004917264767767366, + "loss": 0.199, + "step": 59570 + }, + { + "epoch": 2.47, + "grad_norm": 1.1953125, + "learning_rate": 0.0004917237095796653, + "loss": 0.2206, + "step": 59580 + }, + { + "epoch": 2.47, + "grad_norm": 0.2021484375, + "learning_rate": 0.000491720941927696, + "loss": 0.2047, + "step": 59590 + }, + { + "epoch": 2.47, + "grad_norm": 0.86328125, + "learning_rate": 0.000491718173820834, + "loss": 0.1928, + "step": 59600 + }, + { + "epoch": 2.47, + "grad_norm": 0.72265625, + "learning_rate": 0.0004917154052590845, + "loss": 0.2147, + "step": 59610 + }, + { + "epoch": 2.47, + "grad_norm": 0.78515625, + "learning_rate": 0.0004917126362424528, + "loss": 0.2315, + "step": 59620 + }, + { + "epoch": 2.47, + "grad_norm": 0.6328125, + "learning_rate": 0.0004917098667709439, + "loss": 0.2232, + "step": 59630 + }, + { + "epoch": 2.47, + "grad_norm": 0.2236328125, + "learning_rate": 0.0004917070968445632, + "loss": 0.2082, + "step": 59640 + }, + { + "epoch": 2.47, + "grad_norm": 0.5859375, + "learning_rate": 0.0004917043264633157, + "loss": 0.2122, + "step": 59650 + }, + { + "epoch": 2.47, + "grad_norm": 0.75, + "learning_rate": 0.0004917015556272069, + "loss": 0.2439, + "step": 59660 + }, + { + "epoch": 2.47, + "grad_norm": 0.287109375, + "learning_rate": 0.0004916987843362418, + "loss": 0.1509, + "step": 59670 + }, + { + "epoch": 2.47, + "grad_norm": 0.859375, + "learning_rate": 0.0004916960125904259, + "loss": 0.1954, + "step": 59680 + }, + { + "epoch": 2.47, + "grad_norm": 0.984375, + "learning_rate": 0.0004916932403897639, + "loss": 0.2675, + "step": 59690 + }, + { + "epoch": 2.47, + "grad_norm": 0.6796875, + "learning_rate": 0.0004916904677342615, + "loss": 0.2462, + "step": 59700 + }, + { + "epoch": 2.47, + "grad_norm": 0.640625, + "learning_rate": 0.0004916876946239237, + "loss": 0.2182, + "step": 59710 + }, + { + "epoch": 2.47, + "grad_norm": 0.67578125, + "learning_rate": 0.0004916849210587559, + "loss": 0.1832, + "step": 59720 + }, + { + "epoch": 2.47, + "grad_norm": 2.265625, + "learning_rate": 0.000491682147038763, + "loss": 0.2849, + "step": 59730 + }, + { + "epoch": 2.47, + "grad_norm": 0.412109375, + "learning_rate": 0.0004916793725639504, + "loss": 0.2239, + "step": 59740 + }, + { + "epoch": 2.47, + "grad_norm": 0.53125, + "learning_rate": 0.0004916765976343233, + "loss": 0.2235, + "step": 59750 + }, + { + "epoch": 2.48, + "grad_norm": 0.53515625, + "learning_rate": 0.0004916738222498871, + "loss": 0.2239, + "step": 59760 + }, + { + "epoch": 2.48, + "grad_norm": 0.578125, + "learning_rate": 0.0004916710464106468, + "loss": 0.2046, + "step": 59770 + }, + { + "epoch": 2.48, + "grad_norm": 0.703125, + "learning_rate": 0.0004916682701166076, + "loss": 0.1869, + "step": 59780 + }, + { + "epoch": 2.48, + "grad_norm": 1.453125, + "learning_rate": 0.0004916654933677749, + "loss": 0.2031, + "step": 59790 + }, + { + "epoch": 2.48, + "grad_norm": 0.5625, + "learning_rate": 0.0004916627161641537, + "loss": 0.2024, + "step": 59800 + }, + { + "epoch": 2.48, + "grad_norm": 0.79296875, + "learning_rate": 0.0004916599385057495, + "loss": 0.1714, + "step": 59810 + }, + { + "epoch": 2.48, + "grad_norm": 1.0, + "learning_rate": 0.0004916571603925674, + "loss": 0.218, + "step": 59820 + }, + { + "epoch": 2.48, + "grad_norm": 0.267578125, + "learning_rate": 0.0004916543818246126, + "loss": 0.2052, + "step": 59830 + }, + { + "epoch": 2.48, + "grad_norm": 1.0234375, + "learning_rate": 0.0004916516028018904, + "loss": 0.2098, + "step": 59840 + }, + { + "epoch": 2.48, + "grad_norm": 0.859375, + "learning_rate": 0.0004916488233244059, + "loss": 0.2346, + "step": 59850 + }, + { + "epoch": 2.48, + "grad_norm": 0.54296875, + "learning_rate": 0.0004916460433921644, + "loss": 0.2139, + "step": 59860 + }, + { + "epoch": 2.48, + "grad_norm": 0.609375, + "learning_rate": 0.0004916432630051712, + "loss": 0.2227, + "step": 59870 + }, + { + "epoch": 2.48, + "grad_norm": 0.59375, + "learning_rate": 0.0004916404821634314, + "loss": 0.2572, + "step": 59880 + }, + { + "epoch": 2.48, + "grad_norm": 0.62109375, + "learning_rate": 0.0004916377008669504, + "loss": 0.2039, + "step": 59890 + }, + { + "epoch": 2.48, + "grad_norm": 1.5625, + "learning_rate": 0.0004916349191157333, + "loss": 0.198, + "step": 59900 + }, + { + "epoch": 2.48, + "grad_norm": 0.8359375, + "learning_rate": 0.0004916321369097854, + "loss": 0.2805, + "step": 59910 + }, + { + "epoch": 2.48, + "grad_norm": 0.369140625, + "learning_rate": 0.0004916293542491119, + "loss": 0.241, + "step": 59920 + }, + { + "epoch": 2.48, + "grad_norm": 0.6953125, + "learning_rate": 0.0004916265711337179, + "loss": 0.2665, + "step": 59930 + }, + { + "epoch": 2.48, + "grad_norm": 0.322265625, + "learning_rate": 0.0004916237875636089, + "loss": 0.219, + "step": 59940 + }, + { + "epoch": 2.48, + "grad_norm": 0.392578125, + "learning_rate": 0.0004916210035387902, + "loss": 0.1856, + "step": 59950 + }, + { + "epoch": 2.48, + "grad_norm": 0.8671875, + "learning_rate": 0.0004916182190592666, + "loss": 0.2821, + "step": 59960 + }, + { + "epoch": 2.48, + "grad_norm": 0.65625, + "learning_rate": 0.0004916154341250437, + "loss": 0.203, + "step": 59970 + }, + { + "epoch": 2.48, + "grad_norm": 0.55859375, + "learning_rate": 0.0004916126487361267, + "loss": 0.1725, + "step": 59980 + }, + { + "epoch": 2.48, + "grad_norm": 0.70703125, + "learning_rate": 0.0004916098628925207, + "loss": 0.2144, + "step": 59990 + }, + { + "epoch": 2.49, + "grad_norm": 0.310546875, + "learning_rate": 0.0004916070765942311, + "loss": 0.1954, + "step": 60000 + }, + { + "epoch": 2.49, + "grad_norm": 0.953125, + "learning_rate": 0.0004916042898412631, + "loss": 0.2646, + "step": 60010 + }, + { + "epoch": 2.49, + "grad_norm": 0.43359375, + "learning_rate": 0.0004916015026336218, + "loss": 0.2277, + "step": 60020 + }, + { + "epoch": 2.49, + "grad_norm": 1.046875, + "learning_rate": 0.0004915987149713126, + "loss": 0.199, + "step": 60030 + }, + { + "epoch": 2.49, + "grad_norm": 0.490234375, + "learning_rate": 0.0004915959268543408, + "loss": 0.2519, + "step": 60040 + }, + { + "epoch": 2.49, + "grad_norm": 0.333984375, + "learning_rate": 0.0004915931382827115, + "loss": 0.2285, + "step": 60050 + }, + { + "epoch": 2.49, + "grad_norm": 1.328125, + "learning_rate": 0.00049159034925643, + "loss": 0.2026, + "step": 60060 + }, + { + "epoch": 2.49, + "grad_norm": 0.71484375, + "learning_rate": 0.0004915875597755015, + "loss": 0.2251, + "step": 60070 + }, + { + "epoch": 2.49, + "grad_norm": 0.52734375, + "learning_rate": 0.0004915847698399314, + "loss": 0.2059, + "step": 60080 + }, + { + "epoch": 2.49, + "grad_norm": 0.439453125, + "learning_rate": 0.0004915819794497247, + "loss": 0.2093, + "step": 60090 + }, + { + "epoch": 2.49, + "grad_norm": 0.5625, + "learning_rate": 0.0004915791886048869, + "loss": 0.2424, + "step": 60100 + }, + { + "epoch": 2.49, + "grad_norm": 0.546875, + "learning_rate": 0.0004915763973054232, + "loss": 0.1937, + "step": 60110 + }, + { + "epoch": 2.49, + "grad_norm": 1.4296875, + "learning_rate": 0.0004915736055513388, + "loss": 0.1795, + "step": 60120 + }, + { + "epoch": 2.49, + "grad_norm": 0.451171875, + "learning_rate": 0.000491570813342639, + "loss": 0.2453, + "step": 60130 + }, + { + "epoch": 2.49, + "grad_norm": 0.94140625, + "learning_rate": 0.0004915680206793289, + "loss": 0.1969, + "step": 60140 + }, + { + "epoch": 2.49, + "grad_norm": 0.45703125, + "learning_rate": 0.000491565227561414, + "loss": 0.1562, + "step": 60150 + }, + { + "epoch": 2.49, + "grad_norm": 0.86328125, + "learning_rate": 0.0004915624339888993, + "loss": 0.2261, + "step": 60160 + }, + { + "epoch": 2.49, + "grad_norm": 0.7421875, + "learning_rate": 0.0004915596399617902, + "loss": 0.2133, + "step": 60170 + }, + { + "epoch": 2.49, + "grad_norm": 1.03125, + "learning_rate": 0.000491556845480092, + "loss": 0.176, + "step": 60180 + }, + { + "epoch": 2.49, + "grad_norm": 0.703125, + "learning_rate": 0.0004915540505438099, + "loss": 0.2311, + "step": 60190 + }, + { + "epoch": 2.49, + "grad_norm": 0.365234375, + "learning_rate": 0.0004915512551529491, + "loss": 0.2346, + "step": 60200 + }, + { + "epoch": 2.49, + "grad_norm": 1.046875, + "learning_rate": 0.0004915484593075149, + "loss": 0.2563, + "step": 60210 + }, + { + "epoch": 2.49, + "grad_norm": 0.95703125, + "learning_rate": 0.0004915456630075127, + "loss": 0.2604, + "step": 60220 + }, + { + "epoch": 2.49, + "grad_norm": 0.6953125, + "learning_rate": 0.0004915428662529476, + "loss": 0.2657, + "step": 60230 + }, + { + "epoch": 2.5, + "grad_norm": 0.92578125, + "learning_rate": 0.000491540069043825, + "loss": 0.2, + "step": 60240 + }, + { + "epoch": 2.5, + "grad_norm": 0.66015625, + "learning_rate": 0.0004915372713801499, + "loss": 0.2342, + "step": 60250 + }, + { + "epoch": 2.5, + "grad_norm": 0.57421875, + "learning_rate": 0.0004915344732619278, + "loss": 0.1853, + "step": 60260 + }, + { + "epoch": 2.5, + "grad_norm": 0.94140625, + "learning_rate": 0.0004915316746891639, + "loss": 0.2303, + "step": 60270 + }, + { + "epoch": 2.5, + "grad_norm": 0.4140625, + "learning_rate": 0.0004915288756618635, + "loss": 0.2616, + "step": 60280 + }, + { + "epoch": 2.5, + "grad_norm": 0.23828125, + "learning_rate": 0.0004915260761800318, + "loss": 0.22, + "step": 60290 + }, + { + "epoch": 2.5, + "grad_norm": 0.494140625, + "learning_rate": 0.0004915232762436742, + "loss": 0.22, + "step": 60300 + }, + { + "epoch": 2.5, + "grad_norm": 0.62109375, + "learning_rate": 0.0004915204758527958, + "loss": 0.2164, + "step": 60310 + }, + { + "epoch": 2.5, + "grad_norm": 0.64453125, + "learning_rate": 0.000491517675007402, + "loss": 0.2333, + "step": 60320 + }, + { + "epoch": 2.5, + "grad_norm": 0.2578125, + "learning_rate": 0.000491514873707498, + "loss": 0.1946, + "step": 60330 + }, + { + "epoch": 2.5, + "grad_norm": 0.66015625, + "learning_rate": 0.000491512071953089, + "loss": 0.169, + "step": 60340 + }, + { + "epoch": 2.5, + "grad_norm": 0.43359375, + "learning_rate": 0.0004915092697441806, + "loss": 0.2201, + "step": 60350 + }, + { + "epoch": 2.5, + "grad_norm": 1.453125, + "learning_rate": 0.0004915064670807775, + "loss": 0.2287, + "step": 60360 + }, + { + "epoch": 2.5, + "grad_norm": 0.51953125, + "learning_rate": 0.0004915036639628856, + "loss": 0.2034, + "step": 60370 + }, + { + "epoch": 2.5, + "grad_norm": 0.55859375, + "learning_rate": 0.0004915008603905097, + "loss": 0.2781, + "step": 60380 + }, + { + "epoch": 2.5, + "grad_norm": 0.392578125, + "learning_rate": 0.0004914980563636553, + "loss": 0.2238, + "step": 60390 + }, + { + "epoch": 2.5, + "grad_norm": 0.515625, + "learning_rate": 0.0004914952518823277, + "loss": 0.1982, + "step": 60400 + }, + { + "epoch": 2.5, + "grad_norm": 0.7734375, + "learning_rate": 0.000491492446946532, + "loss": 0.1682, + "step": 60410 + }, + { + "epoch": 2.5, + "grad_norm": 0.416015625, + "learning_rate": 0.0004914896415562737, + "loss": 0.2021, + "step": 60420 + }, + { + "epoch": 2.5, + "grad_norm": 0.84765625, + "learning_rate": 0.000491486835711558, + "loss": 0.2059, + "step": 60430 + }, + { + "epoch": 2.5, + "grad_norm": 0.80859375, + "learning_rate": 0.0004914840294123901, + "loss": 0.1997, + "step": 60440 + }, + { + "epoch": 2.5, + "grad_norm": 1.0546875, + "learning_rate": 0.0004914812226587753, + "loss": 0.2056, + "step": 60450 + }, + { + "epoch": 2.5, + "grad_norm": 0.59375, + "learning_rate": 0.0004914784154507189, + "loss": 0.2128, + "step": 60460 + }, + { + "epoch": 2.5, + "grad_norm": 1.3046875, + "learning_rate": 0.0004914756077882263, + "loss": 0.2162, + "step": 60470 + }, + { + "epoch": 2.51, + "grad_norm": 0.45703125, + "learning_rate": 0.0004914727996713027, + "loss": 0.2675, + "step": 60480 + }, + { + "epoch": 2.51, + "grad_norm": 0.3203125, + "learning_rate": 0.0004914699910999532, + "loss": 0.2175, + "step": 60490 + }, + { + "epoch": 2.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0004914671820741834, + "loss": 0.2132, + "step": 60500 + }, + { + "epoch": 2.51, + "grad_norm": 0.53125, + "learning_rate": 0.0004914643725939983, + "loss": 0.2026, + "step": 60510 + }, + { + "epoch": 2.51, + "grad_norm": 0.50390625, + "learning_rate": 0.0004914615626594034, + "loss": 0.2279, + "step": 60520 + }, + { + "epoch": 2.51, + "grad_norm": 0.609375, + "learning_rate": 0.000491458752270404, + "loss": 0.198, + "step": 60530 + }, + { + "epoch": 2.51, + "grad_norm": 0.486328125, + "learning_rate": 0.0004914559414270053, + "loss": 0.284, + "step": 60540 + }, + { + "epoch": 2.51, + "grad_norm": 0.7109375, + "learning_rate": 0.0004914531301292124, + "loss": 0.2268, + "step": 60550 + }, + { + "epoch": 2.51, + "grad_norm": 1.6796875, + "learning_rate": 0.0004914503183770311, + "loss": 0.2176, + "step": 60560 + }, + { + "epoch": 2.51, + "grad_norm": 1.6171875, + "learning_rate": 0.0004914475061704661, + "loss": 0.2087, + "step": 60570 + }, + { + "epoch": 2.51, + "grad_norm": 0.48828125, + "learning_rate": 0.000491444693509523, + "loss": 0.1613, + "step": 60580 + }, + { + "epoch": 2.51, + "grad_norm": 0.71875, + "learning_rate": 0.0004914418803942071, + "loss": 0.2145, + "step": 60590 + }, + { + "epoch": 2.51, + "grad_norm": 1.265625, + "learning_rate": 0.0004914390668245238, + "loss": 0.201, + "step": 60600 + }, + { + "epoch": 2.51, + "grad_norm": 0.734375, + "learning_rate": 0.0004914362528004781, + "loss": 0.149, + "step": 60610 + }, + { + "epoch": 2.51, + "grad_norm": 0.62890625, + "learning_rate": 0.0004914334383220755, + "loss": 0.1871, + "step": 60620 + }, + { + "epoch": 2.51, + "grad_norm": 0.490234375, + "learning_rate": 0.0004914306233893211, + "loss": 0.1756, + "step": 60630 + }, + { + "epoch": 2.51, + "grad_norm": 0.79296875, + "learning_rate": 0.0004914278080022205, + "loss": 0.2346, + "step": 60640 + }, + { + "epoch": 2.51, + "grad_norm": 0.25, + "learning_rate": 0.0004914249921607787, + "loss": 0.2545, + "step": 60650 + }, + { + "epoch": 2.51, + "grad_norm": 0.314453125, + "learning_rate": 0.0004914221758650013, + "loss": 0.2867, + "step": 60660 + }, + { + "epoch": 2.51, + "grad_norm": 0.70703125, + "learning_rate": 0.0004914193591148933, + "loss": 0.2277, + "step": 60670 + }, + { + "epoch": 2.51, + "grad_norm": 0.3671875, + "learning_rate": 0.0004914165419104602, + "loss": 0.2082, + "step": 60680 + }, + { + "epoch": 2.51, + "grad_norm": 0.484375, + "learning_rate": 0.0004914137242517072, + "loss": 0.2047, + "step": 60690 + }, + { + "epoch": 2.51, + "grad_norm": 0.62890625, + "learning_rate": 0.0004914109061386397, + "loss": 0.2115, + "step": 60700 + }, + { + "epoch": 2.51, + "grad_norm": 0.5078125, + "learning_rate": 0.000491408087571263, + "loss": 0.2241, + "step": 60710 + }, + { + "epoch": 2.52, + "grad_norm": 0.71484375, + "learning_rate": 0.0004914052685495822, + "loss": 0.2209, + "step": 60720 + }, + { + "epoch": 2.52, + "grad_norm": 0.640625, + "learning_rate": 0.0004914024490736029, + "loss": 0.2666, + "step": 60730 + }, + { + "epoch": 2.52, + "grad_norm": 0.64453125, + "learning_rate": 0.0004913996291433301, + "loss": 0.2366, + "step": 60740 + }, + { + "epoch": 2.52, + "grad_norm": 0.96875, + "learning_rate": 0.0004913968087587693, + "loss": 0.2806, + "step": 60750 + }, + { + "epoch": 2.52, + "grad_norm": 0.44921875, + "learning_rate": 0.0004913939879199259, + "loss": 0.1849, + "step": 60760 + }, + { + "epoch": 2.52, + "grad_norm": 0.345703125, + "learning_rate": 0.000491391166626805, + "loss": 0.2133, + "step": 60770 + }, + { + "epoch": 2.52, + "grad_norm": 0.345703125, + "learning_rate": 0.000491388344879412, + "loss": 0.2416, + "step": 60780 + }, + { + "epoch": 2.52, + "grad_norm": 0.76953125, + "learning_rate": 0.0004913855226777522, + "loss": 0.1989, + "step": 60790 + }, + { + "epoch": 2.52, + "grad_norm": 0.52734375, + "learning_rate": 0.0004913827000218309, + "loss": 0.2459, + "step": 60800 + }, + { + "epoch": 2.52, + "grad_norm": 0.478515625, + "learning_rate": 0.0004913798769116534, + "loss": 0.2214, + "step": 60810 + }, + { + "epoch": 2.52, + "grad_norm": 0.53515625, + "learning_rate": 0.0004913770533472252, + "loss": 0.2413, + "step": 60820 + }, + { + "epoch": 2.52, + "grad_norm": 0.1689453125, + "learning_rate": 0.0004913742293285512, + "loss": 0.1982, + "step": 60830 + }, + { + "epoch": 2.52, + "grad_norm": 0.67578125, + "learning_rate": 0.0004913714048556372, + "loss": 0.2127, + "step": 60840 + }, + { + "epoch": 2.52, + "grad_norm": 0.6796875, + "learning_rate": 0.0004913685799284882, + "loss": 0.2567, + "step": 60850 + }, + { + "epoch": 2.52, + "grad_norm": 0.51171875, + "learning_rate": 0.0004913657545471095, + "loss": 0.2273, + "step": 60860 + }, + { + "epoch": 2.52, + "grad_norm": 0.640625, + "learning_rate": 0.0004913629287115065, + "loss": 0.2221, + "step": 60870 + }, + { + "epoch": 2.52, + "grad_norm": 1.5234375, + "learning_rate": 0.0004913601024216847, + "loss": 0.2608, + "step": 60880 + }, + { + "epoch": 2.52, + "grad_norm": 1.6484375, + "learning_rate": 0.0004913572756776492, + "loss": 0.2386, + "step": 60890 + }, + { + "epoch": 2.52, + "grad_norm": 1.171875, + "learning_rate": 0.0004913544484794054, + "loss": 0.1962, + "step": 60900 + }, + { + "epoch": 2.52, + "grad_norm": 0.5234375, + "learning_rate": 0.0004913516208269585, + "loss": 0.2656, + "step": 60910 + }, + { + "epoch": 2.52, + "grad_norm": 0.89453125, + "learning_rate": 0.0004913487927203139, + "loss": 0.187, + "step": 60920 + }, + { + "epoch": 2.52, + "grad_norm": 0.8984375, + "learning_rate": 0.0004913459641594771, + "loss": 0.2558, + "step": 60930 + }, + { + "epoch": 2.52, + "grad_norm": 0.68359375, + "learning_rate": 0.000491343135144453, + "loss": 0.2013, + "step": 60940 + }, + { + "epoch": 2.52, + "grad_norm": 1.5078125, + "learning_rate": 0.0004913403056752474, + "loss": 0.1974, + "step": 60950 + }, + { + "epoch": 2.52, + "grad_norm": 3.390625, + "learning_rate": 0.0004913374757518654, + "loss": 0.2279, + "step": 60960 + }, + { + "epoch": 2.53, + "grad_norm": 0.92578125, + "learning_rate": 0.0004913346453743122, + "loss": 0.2627, + "step": 60970 + }, + { + "epoch": 2.53, + "grad_norm": 2.421875, + "learning_rate": 0.0004913318145425933, + "loss": 0.25, + "step": 60980 + }, + { + "epoch": 2.53, + "grad_norm": 0.58203125, + "learning_rate": 0.0004913289832567141, + "loss": 0.2268, + "step": 60990 + }, + { + "epoch": 2.53, + "grad_norm": 0.404296875, + "learning_rate": 0.0004913261515166797, + "loss": 0.2364, + "step": 61000 + }, + { + "epoch": 2.53, + "grad_norm": 0.82421875, + "learning_rate": 0.0004913233193224955, + "loss": 0.1859, + "step": 61010 + }, + { + "epoch": 2.53, + "grad_norm": 0.765625, + "learning_rate": 0.0004913204866741671, + "loss": 0.2276, + "step": 61020 + }, + { + "epoch": 2.53, + "grad_norm": 0.671875, + "learning_rate": 0.0004913176535716994, + "loss": 0.2299, + "step": 61030 + }, + { + "epoch": 2.53, + "grad_norm": 0.3671875, + "learning_rate": 0.000491314820015098, + "loss": 0.2297, + "step": 61040 + }, + { + "epoch": 2.53, + "grad_norm": 1.1640625, + "learning_rate": 0.0004913119860043681, + "loss": 0.2018, + "step": 61050 + }, + { + "epoch": 2.53, + "grad_norm": 0.79296875, + "learning_rate": 0.0004913091515395152, + "loss": 0.2163, + "step": 61060 + }, + { + "epoch": 2.53, + "grad_norm": 0.0, + "learning_rate": 0.0004913063166205445, + "loss": 0.2314, + "step": 61070 + }, + { + "epoch": 2.53, + "grad_norm": 0.455078125, + "learning_rate": 0.0004913034812474614, + "loss": 0.2768, + "step": 61080 + }, + { + "epoch": 2.53, + "grad_norm": 0.0, + "learning_rate": 0.0004913006454202711, + "loss": 0.1717, + "step": 61090 + }, + { + "epoch": 2.53, + "grad_norm": 0.4453125, + "learning_rate": 0.0004912978091389791, + "loss": 0.2098, + "step": 61100 + }, + { + "epoch": 2.53, + "grad_norm": 1.5, + "learning_rate": 0.0004912949724035908, + "loss": 0.2822, + "step": 61110 + }, + { + "epoch": 2.53, + "grad_norm": 1.2109375, + "learning_rate": 0.0004912921352141112, + "loss": 0.2415, + "step": 61120 + }, + { + "epoch": 2.53, + "grad_norm": 0.482421875, + "learning_rate": 0.0004912892975705461, + "loss": 0.2286, + "step": 61130 + }, + { + "epoch": 2.53, + "grad_norm": 0.353515625, + "learning_rate": 0.0004912864594729004, + "loss": 0.1819, + "step": 61140 + }, + { + "epoch": 2.53, + "grad_norm": 0.98828125, + "learning_rate": 0.0004912836209211797, + "loss": 0.2252, + "step": 61150 + }, + { + "epoch": 2.53, + "grad_norm": 0.5859375, + "learning_rate": 0.0004912807819153893, + "loss": 0.2037, + "step": 61160 + }, + { + "epoch": 2.53, + "grad_norm": 0.921875, + "learning_rate": 0.0004912779424555345, + "loss": 0.1978, + "step": 61170 + }, + { + "epoch": 2.53, + "grad_norm": 0.51171875, + "learning_rate": 0.0004912751025416207, + "loss": 0.2221, + "step": 61180 + }, + { + "epoch": 2.53, + "grad_norm": 1.21875, + "learning_rate": 0.0004912722621736532, + "loss": 0.2351, + "step": 61190 + }, + { + "epoch": 2.53, + "grad_norm": 0.341796875, + "learning_rate": 0.0004912694213516374, + "loss": 0.2428, + "step": 61200 + }, + { + "epoch": 2.54, + "grad_norm": 0.4609375, + "learning_rate": 0.0004912665800755786, + "loss": 0.2275, + "step": 61210 + }, + { + "epoch": 2.54, + "grad_norm": 0.65625, + "learning_rate": 0.0004912637383454821, + "loss": 0.2273, + "step": 61220 + }, + { + "epoch": 2.54, + "grad_norm": 0.255859375, + "learning_rate": 0.0004912608961613532, + "loss": 0.1852, + "step": 61230 + }, + { + "epoch": 2.54, + "grad_norm": 1.1328125, + "learning_rate": 0.0004912580535231975, + "loss": 0.2304, + "step": 61240 + }, + { + "epoch": 2.54, + "grad_norm": 0.255859375, + "learning_rate": 0.0004912552104310202, + "loss": 0.1983, + "step": 61250 + }, + { + "epoch": 2.54, + "grad_norm": 2.328125, + "learning_rate": 0.0004912523668848266, + "loss": 0.1444, + "step": 61260 + }, + { + "epoch": 2.54, + "grad_norm": 0.47265625, + "learning_rate": 0.0004912495228846221, + "loss": 0.2435, + "step": 61270 + }, + { + "epoch": 2.54, + "grad_norm": 0.96484375, + "learning_rate": 0.0004912466784304121, + "loss": 0.2062, + "step": 61280 + }, + { + "epoch": 2.54, + "grad_norm": 0.73046875, + "learning_rate": 0.0004912438335222018, + "loss": 0.2594, + "step": 61290 + }, + { + "epoch": 2.54, + "grad_norm": 0.6953125, + "learning_rate": 0.0004912409881599967, + "loss": 0.2309, + "step": 61300 + }, + { + "epoch": 2.54, + "grad_norm": 0.419921875, + "learning_rate": 0.0004912381423438022, + "loss": 0.2408, + "step": 61310 + }, + { + "epoch": 2.54, + "grad_norm": 0.87109375, + "learning_rate": 0.0004912352960736235, + "loss": 0.2322, + "step": 61320 + }, + { + "epoch": 2.54, + "grad_norm": 0.70703125, + "learning_rate": 0.000491232449349466, + "loss": 0.223, + "step": 61330 + }, + { + "epoch": 2.54, + "grad_norm": 0.6171875, + "learning_rate": 0.0004912296021713351, + "loss": 0.2338, + "step": 61340 + }, + { + "epoch": 2.54, + "grad_norm": 0.431640625, + "learning_rate": 0.0004912267545392362, + "loss": 0.2247, + "step": 61350 + }, + { + "epoch": 2.54, + "grad_norm": 0.79296875, + "learning_rate": 0.0004912239064531745, + "loss": 0.2244, + "step": 61360 + }, + { + "epoch": 2.54, + "grad_norm": 0.50390625, + "learning_rate": 0.0004912210579131555, + "loss": 0.1863, + "step": 61370 + }, + { + "epoch": 2.54, + "grad_norm": 1.46875, + "learning_rate": 0.0004912182089191844, + "loss": 0.2363, + "step": 61380 + }, + { + "epoch": 2.54, + "grad_norm": 1.0625, + "learning_rate": 0.0004912153594712668, + "loss": 0.2178, + "step": 61390 + }, + { + "epoch": 2.54, + "grad_norm": 1.046875, + "learning_rate": 0.000491212509569408, + "loss": 0.2633, + "step": 61400 + }, + { + "epoch": 2.54, + "grad_norm": 0.56640625, + "learning_rate": 0.0004912096592136133, + "loss": 0.2362, + "step": 61410 + }, + { + "epoch": 2.54, + "grad_norm": 0.921875, + "learning_rate": 0.000491206808403888, + "loss": 0.207, + "step": 61420 + }, + { + "epoch": 2.54, + "grad_norm": 0.9375, + "learning_rate": 0.0004912039571402375, + "loss": 0.2181, + "step": 61430 + }, + { + "epoch": 2.54, + "grad_norm": 1.0546875, + "learning_rate": 0.0004912011054226673, + "loss": 0.2294, + "step": 61440 + }, + { + "epoch": 2.55, + "grad_norm": 0.875, + "learning_rate": 0.0004911982532511826, + "loss": 0.2187, + "step": 61450 + }, + { + "epoch": 2.55, + "grad_norm": 0.59375, + "learning_rate": 0.0004911954006257888, + "loss": 0.1879, + "step": 61460 + }, + { + "epoch": 2.55, + "grad_norm": 0.90625, + "learning_rate": 0.0004911925475464913, + "loss": 0.2114, + "step": 61470 + }, + { + "epoch": 2.55, + "grad_norm": 0.86328125, + "learning_rate": 0.0004911896940132956, + "loss": 0.2317, + "step": 61480 + }, + { + "epoch": 2.55, + "grad_norm": 0.80859375, + "learning_rate": 0.0004911868400262068, + "loss": 0.2348, + "step": 61490 + }, + { + "epoch": 2.55, + "grad_norm": 1.1328125, + "learning_rate": 0.0004911839855852306, + "loss": 0.2176, + "step": 61500 + }, + { + "epoch": 2.55, + "grad_norm": 0.48828125, + "learning_rate": 0.000491181130690372, + "loss": 0.1799, + "step": 61510 + }, + { + "epoch": 2.55, + "grad_norm": 0.494140625, + "learning_rate": 0.0004911782753416366, + "loss": 0.2293, + "step": 61520 + }, + { + "epoch": 2.55, + "grad_norm": 0.5703125, + "learning_rate": 0.0004911754195390298, + "loss": 0.2635, + "step": 61530 + }, + { + "epoch": 2.55, + "grad_norm": 0.29296875, + "learning_rate": 0.0004911725632825568, + "loss": 0.2325, + "step": 61540 + }, + { + "epoch": 2.55, + "grad_norm": 0.271484375, + "learning_rate": 0.0004911697065722231, + "loss": 0.1926, + "step": 61550 + }, + { + "epoch": 2.55, + "grad_norm": 1.4296875, + "learning_rate": 0.0004911668494080342, + "loss": 0.2368, + "step": 61560 + }, + { + "epoch": 2.55, + "grad_norm": 0.9140625, + "learning_rate": 0.0004911639917899952, + "loss": 0.1722, + "step": 61570 + }, + { + "epoch": 2.55, + "grad_norm": 0.412109375, + "learning_rate": 0.0004911611337181116, + "loss": 0.2476, + "step": 61580 + }, + { + "epoch": 2.55, + "grad_norm": 0.435546875, + "learning_rate": 0.0004911582751923888, + "loss": 0.2436, + "step": 61590 + }, + { + "epoch": 2.55, + "grad_norm": 0.51171875, + "learning_rate": 0.0004911554162128322, + "loss": 0.2396, + "step": 61600 + }, + { + "epoch": 2.55, + "grad_norm": 0.8984375, + "learning_rate": 0.0004911525567794471, + "loss": 0.2104, + "step": 61610 + }, + { + "epoch": 2.55, + "grad_norm": 2.78125, + "learning_rate": 0.0004911496968922391, + "loss": 0.234, + "step": 61620 + }, + { + "epoch": 2.55, + "grad_norm": 0.65625, + "learning_rate": 0.0004911468365512133, + "loss": 0.1905, + "step": 61630 + }, + { + "epoch": 2.55, + "grad_norm": 1.1171875, + "learning_rate": 0.0004911439757563751, + "loss": 0.2518, + "step": 61640 + }, + { + "epoch": 2.55, + "grad_norm": 0.625, + "learning_rate": 0.0004911411145077301, + "loss": 0.2245, + "step": 61650 + }, + { + "epoch": 2.55, + "grad_norm": 0.41796875, + "learning_rate": 0.0004911382528052836, + "loss": 0.2625, + "step": 61660 + }, + { + "epoch": 2.55, + "grad_norm": 0.5078125, + "learning_rate": 0.0004911353906490408, + "loss": 0.2327, + "step": 61670 + }, + { + "epoch": 2.55, + "grad_norm": 0.58984375, + "learning_rate": 0.0004911325280390073, + "loss": 0.2392, + "step": 61680 + }, + { + "epoch": 2.56, + "grad_norm": 0.49609375, + "learning_rate": 0.0004911296649751884, + "loss": 0.2153, + "step": 61690 + }, + { + "epoch": 2.56, + "grad_norm": 0.72265625, + "learning_rate": 0.0004911268014575896, + "loss": 0.25, + "step": 61700 + }, + { + "epoch": 2.56, + "grad_norm": 0.86328125, + "learning_rate": 0.000491123937486216, + "loss": 0.2492, + "step": 61710 + }, + { + "epoch": 2.56, + "grad_norm": 0.56640625, + "learning_rate": 0.0004911210730610734, + "loss": 0.2524, + "step": 61720 + }, + { + "epoch": 2.56, + "grad_norm": 0.51171875, + "learning_rate": 0.000491118208182167, + "loss": 0.1819, + "step": 61730 + }, + { + "epoch": 2.56, + "grad_norm": 0.796875, + "learning_rate": 0.000491115342849502, + "loss": 0.2055, + "step": 61740 + }, + { + "epoch": 2.56, + "grad_norm": 1.078125, + "learning_rate": 0.0004911124770630841, + "loss": 0.2195, + "step": 61750 + }, + { + "epoch": 2.56, + "grad_norm": 0.65625, + "learning_rate": 0.0004911096108229185, + "loss": 0.1851, + "step": 61760 + }, + { + "epoch": 2.56, + "grad_norm": 0.66796875, + "learning_rate": 0.0004911067441290106, + "loss": 0.2173, + "step": 61770 + }, + { + "epoch": 2.56, + "grad_norm": 0.2041015625, + "learning_rate": 0.0004911038769813659, + "loss": 0.2124, + "step": 61780 + }, + { + "epoch": 2.56, + "grad_norm": 0.494140625, + "learning_rate": 0.0004911010093799897, + "loss": 0.2446, + "step": 61790 + }, + { + "epoch": 2.56, + "grad_norm": 0.50390625, + "learning_rate": 0.0004910981413248875, + "loss": 0.2262, + "step": 61800 + }, + { + "epoch": 2.56, + "grad_norm": 0.90234375, + "learning_rate": 0.0004910952728160646, + "loss": 0.2128, + "step": 61810 + }, + { + "epoch": 2.56, + "grad_norm": 0.6875, + "learning_rate": 0.0004910924038535265, + "loss": 0.2394, + "step": 61820 + }, + { + "epoch": 2.56, + "grad_norm": 1.3203125, + "learning_rate": 0.0004910895344372784, + "loss": 0.2612, + "step": 61830 + }, + { + "epoch": 2.56, + "grad_norm": 1.015625, + "learning_rate": 0.0004910866645673259, + "loss": 0.2312, + "step": 61840 + }, + { + "epoch": 2.56, + "grad_norm": 0.640625, + "learning_rate": 0.0004910837942436744, + "loss": 0.2064, + "step": 61850 + }, + { + "epoch": 2.56, + "grad_norm": 0.53125, + "learning_rate": 0.0004910809234663292, + "loss": 0.2628, + "step": 61860 + }, + { + "epoch": 2.56, + "grad_norm": 1.578125, + "learning_rate": 0.0004910780522352957, + "loss": 0.1879, + "step": 61870 + }, + { + "epoch": 2.56, + "grad_norm": 0.8203125, + "learning_rate": 0.0004910751805505794, + "loss": 0.1951, + "step": 61880 + }, + { + "epoch": 2.56, + "grad_norm": 1.71875, + "learning_rate": 0.0004910723084121855, + "loss": 0.2322, + "step": 61890 + }, + { + "epoch": 2.56, + "grad_norm": 0.6640625, + "learning_rate": 0.0004910694358201197, + "loss": 0.2458, + "step": 61900 + }, + { + "epoch": 2.56, + "grad_norm": 0.57421875, + "learning_rate": 0.0004910665627743871, + "loss": 0.195, + "step": 61910 + }, + { + "epoch": 2.56, + "grad_norm": 0.7578125, + "learning_rate": 0.0004910636892749933, + "loss": 0.2174, + "step": 61920 + }, + { + "epoch": 2.57, + "grad_norm": 1.2109375, + "learning_rate": 0.0004910608153219438, + "loss": 0.2365, + "step": 61930 + }, + { + "epoch": 2.57, + "grad_norm": 1.0234375, + "learning_rate": 0.0004910579409152438, + "loss": 0.2063, + "step": 61940 + }, + { + "epoch": 2.57, + "grad_norm": 0.70703125, + "learning_rate": 0.0004910550660548988, + "loss": 0.1786, + "step": 61950 + }, + { + "epoch": 2.57, + "grad_norm": 0.6953125, + "learning_rate": 0.0004910521907409141, + "loss": 0.2283, + "step": 61960 + }, + { + "epoch": 2.57, + "grad_norm": 0.6328125, + "learning_rate": 0.0004910493149732954, + "loss": 0.2611, + "step": 61970 + }, + { + "epoch": 2.57, + "grad_norm": 0.9296875, + "learning_rate": 0.0004910464387520478, + "loss": 0.2276, + "step": 61980 + }, + { + "epoch": 2.57, + "grad_norm": 0.59375, + "learning_rate": 0.0004910435620771768, + "loss": 0.2134, + "step": 61990 + }, + { + "epoch": 2.57, + "grad_norm": 0.76953125, + "learning_rate": 0.0004910406849486879, + "loss": 0.1933, + "step": 62000 + }, + { + "epoch": 2.57, + "grad_norm": 1.0546875, + "learning_rate": 0.0004910378073665864, + "loss": 0.1609, + "step": 62010 + }, + { + "epoch": 2.57, + "grad_norm": 0.61328125, + "learning_rate": 0.0004910349293308779, + "loss": 0.1911, + "step": 62020 + }, + { + "epoch": 2.57, + "grad_norm": 0.6796875, + "learning_rate": 0.0004910320508415677, + "loss": 0.2387, + "step": 62030 + }, + { + "epoch": 2.57, + "grad_norm": 0.9140625, + "learning_rate": 0.000491029171898661, + "loss": 0.1584, + "step": 62040 + }, + { + "epoch": 2.57, + "grad_norm": 0.8125, + "learning_rate": 0.0004910262925021636, + "loss": 0.1855, + "step": 62050 + }, + { + "epoch": 2.57, + "grad_norm": 0.82421875, + "learning_rate": 0.0004910234126520807, + "loss": 0.2447, + "step": 62060 + }, + { + "epoch": 2.57, + "grad_norm": 1.1171875, + "learning_rate": 0.0004910205323484178, + "loss": 0.2262, + "step": 62070 + }, + { + "epoch": 2.57, + "grad_norm": 0.54296875, + "learning_rate": 0.0004910176515911803, + "loss": 0.2319, + "step": 62080 + }, + { + "epoch": 2.57, + "grad_norm": 0.69921875, + "learning_rate": 0.0004910147703803735, + "loss": 0.1908, + "step": 62090 + }, + { + "epoch": 2.57, + "grad_norm": 0.5078125, + "learning_rate": 0.000491011888716003, + "loss": 0.1534, + "step": 62100 + }, + { + "epoch": 2.57, + "grad_norm": 0.8984375, + "learning_rate": 0.0004910090065980742, + "loss": 0.2843, + "step": 62110 + }, + { + "epoch": 2.57, + "grad_norm": 0.890625, + "learning_rate": 0.0004910061240265924, + "loss": 0.2084, + "step": 62120 + }, + { + "epoch": 2.57, + "grad_norm": 0.400390625, + "learning_rate": 0.0004910032410015632, + "loss": 0.1613, + "step": 62130 + }, + { + "epoch": 2.57, + "grad_norm": 0.78515625, + "learning_rate": 0.0004910003575229918, + "loss": 0.2279, + "step": 62140 + }, + { + "epoch": 2.57, + "grad_norm": 0.95703125, + "learning_rate": 0.0004909974735908838, + "loss": 0.2486, + "step": 62150 + }, + { + "epoch": 2.57, + "grad_norm": 0.875, + "learning_rate": 0.0004909945892052445, + "loss": 0.1541, + "step": 62160 + }, + { + "epoch": 2.58, + "grad_norm": 0.54296875, + "learning_rate": 0.0004909917043660795, + "loss": 0.2431, + "step": 62170 + }, + { + "epoch": 2.58, + "grad_norm": 0.4375, + "learning_rate": 0.0004909888190733942, + "loss": 0.2435, + "step": 62180 + }, + { + "epoch": 2.58, + "grad_norm": 0.46875, + "learning_rate": 0.0004909859333271938, + "loss": 0.2469, + "step": 62190 + }, + { + "epoch": 2.58, + "grad_norm": 2.109375, + "learning_rate": 0.0004909830471274841, + "loss": 0.2465, + "step": 62200 + }, + { + "epoch": 2.58, + "grad_norm": 0.458984375, + "learning_rate": 0.0004909801604742702, + "loss": 0.2161, + "step": 62210 + }, + { + "epoch": 2.58, + "grad_norm": 1.078125, + "learning_rate": 0.0004909772733675577, + "loss": 0.2008, + "step": 62220 + }, + { + "epoch": 2.58, + "grad_norm": 0.6640625, + "learning_rate": 0.000490974385807352, + "loss": 0.228, + "step": 62230 + }, + { + "epoch": 2.58, + "grad_norm": 0.6953125, + "learning_rate": 0.0004909714977936585, + "loss": 0.2033, + "step": 62240 + }, + { + "epoch": 2.58, + "grad_norm": 0.0, + "learning_rate": 0.0004909686093264827, + "loss": 0.2119, + "step": 62250 + }, + { + "epoch": 2.58, + "grad_norm": 1.125, + "learning_rate": 0.0004909657204058299, + "loss": 0.2095, + "step": 62260 + }, + { + "epoch": 2.58, + "grad_norm": 1.3125, + "learning_rate": 0.0004909628310317058, + "loss": 0.2218, + "step": 62270 + }, + { + "epoch": 2.58, + "grad_norm": 1.3203125, + "learning_rate": 0.0004909599412041155, + "loss": 0.2857, + "step": 62280 + }, + { + "epoch": 2.58, + "grad_norm": 0.5546875, + "learning_rate": 0.0004909570509230648, + "loss": 0.251, + "step": 62290 + }, + { + "epoch": 2.58, + "grad_norm": 0.76953125, + "learning_rate": 0.0004909541601885587, + "loss": 0.2294, + "step": 62300 + }, + { + "epoch": 2.58, + "grad_norm": 0.625, + "learning_rate": 0.0004909512690006031, + "loss": 0.1956, + "step": 62310 + }, + { + "epoch": 2.58, + "grad_norm": 1.1328125, + "learning_rate": 0.0004909483773592032, + "loss": 0.2396, + "step": 62320 + }, + { + "epoch": 2.58, + "grad_norm": 0.302734375, + "learning_rate": 0.0004909454852643644, + "loss": 0.2105, + "step": 62330 + }, + { + "epoch": 2.58, + "grad_norm": 0.58203125, + "learning_rate": 0.0004909425927160923, + "loss": 0.2308, + "step": 62340 + }, + { + "epoch": 2.58, + "grad_norm": 0.33203125, + "learning_rate": 0.0004909396997143922, + "loss": 0.2032, + "step": 62350 + }, + { + "epoch": 2.58, + "grad_norm": 0.953125, + "learning_rate": 0.0004909368062592696, + "loss": 0.227, + "step": 62360 + }, + { + "epoch": 2.58, + "grad_norm": 0.5234375, + "learning_rate": 0.00049093391235073, + "loss": 0.1597, + "step": 62370 + }, + { + "epoch": 2.58, + "grad_norm": 0.98828125, + "learning_rate": 0.0004909310179887788, + "loss": 0.1965, + "step": 62380 + }, + { + "epoch": 2.58, + "grad_norm": 0.1708984375, + "learning_rate": 0.0004909281231734214, + "loss": 0.2364, + "step": 62390 + }, + { + "epoch": 2.58, + "grad_norm": 1.390625, + "learning_rate": 0.0004909252279046634, + "loss": 0.175, + "step": 62400 + }, + { + "epoch": 2.59, + "grad_norm": 0.6328125, + "learning_rate": 0.0004909223321825099, + "loss": 0.2075, + "step": 62410 + }, + { + "epoch": 2.59, + "grad_norm": 0.6484375, + "learning_rate": 0.0004909194360069668, + "loss": 0.2667, + "step": 62420 + }, + { + "epoch": 2.59, + "grad_norm": 0.7421875, + "learning_rate": 0.0004909165393780393, + "loss": 0.2515, + "step": 62430 + }, + { + "epoch": 2.59, + "grad_norm": 0.68359375, + "learning_rate": 0.0004909136422957328, + "loss": 0.2621, + "step": 62440 + }, + { + "epoch": 2.59, + "grad_norm": 2.328125, + "learning_rate": 0.0004909107447600529, + "loss": 0.2395, + "step": 62450 + }, + { + "epoch": 2.59, + "grad_norm": 0.1884765625, + "learning_rate": 0.0004909078467710049, + "loss": 0.2733, + "step": 62460 + }, + { + "epoch": 2.59, + "grad_norm": 0.5859375, + "learning_rate": 0.0004909049483285946, + "loss": 0.2062, + "step": 62470 + }, + { + "epoch": 2.59, + "grad_norm": 1.2421875, + "learning_rate": 0.0004909020494328271, + "loss": 0.1689, + "step": 62480 + }, + { + "epoch": 2.59, + "grad_norm": 1.65625, + "learning_rate": 0.0004908991500837078, + "loss": 0.2063, + "step": 62490 + }, + { + "epoch": 2.59, + "grad_norm": 0.419921875, + "learning_rate": 0.0004908962502812425, + "loss": 0.2063, + "step": 62500 + }, + { + "epoch": 2.59, + "grad_norm": 0.58984375, + "learning_rate": 0.0004908933500254363, + "loss": 0.2604, + "step": 62510 + }, + { + "epoch": 2.59, + "grad_norm": 1.078125, + "learning_rate": 0.0004908904493162949, + "loss": 0.2397, + "step": 62520 + }, + { + "epoch": 2.59, + "grad_norm": 1.03125, + "learning_rate": 0.0004908875481538237, + "loss": 0.2351, + "step": 62530 + }, + { + "epoch": 2.59, + "grad_norm": 0.6171875, + "learning_rate": 0.0004908846465380282, + "loss": 0.1626, + "step": 62540 + }, + { + "epoch": 2.59, + "grad_norm": 0.41796875, + "learning_rate": 0.0004908817444689138, + "loss": 0.2083, + "step": 62550 + }, + { + "epoch": 2.59, + "grad_norm": 0.71875, + "learning_rate": 0.0004908788419464859, + "loss": 0.2062, + "step": 62560 + }, + { + "epoch": 2.59, + "grad_norm": 0.51953125, + "learning_rate": 0.00049087593897075, + "loss": 0.2009, + "step": 62570 + }, + { + "epoch": 2.59, + "grad_norm": 0.94140625, + "learning_rate": 0.0004908730355417117, + "loss": 0.2519, + "step": 62580 + }, + { + "epoch": 2.59, + "grad_norm": 0.5390625, + "learning_rate": 0.0004908701316593763, + "loss": 0.2339, + "step": 62590 + }, + { + "epoch": 2.59, + "grad_norm": 0.458984375, + "learning_rate": 0.0004908672273237493, + "loss": 0.1651, + "step": 62600 + }, + { + "epoch": 2.59, + "grad_norm": 0.83984375, + "learning_rate": 0.0004908643225348362, + "loss": 0.216, + "step": 62610 + }, + { + "epoch": 2.59, + "grad_norm": 0.62109375, + "learning_rate": 0.0004908614172926426, + "loss": 0.2664, + "step": 62620 + }, + { + "epoch": 2.59, + "grad_norm": 0.94140625, + "learning_rate": 0.0004908585115971737, + "loss": 0.1787, + "step": 62630 + }, + { + "epoch": 2.59, + "grad_norm": 1.0625, + "learning_rate": 0.0004908556054484351, + "loss": 0.2295, + "step": 62640 + }, + { + "epoch": 2.59, + "grad_norm": 0.435546875, + "learning_rate": 0.0004908526988464323, + "loss": 0.2171, + "step": 62650 + }, + { + "epoch": 2.6, + "grad_norm": 0.408203125, + "learning_rate": 0.0004908497917911706, + "loss": 0.2427, + "step": 62660 + }, + { + "epoch": 2.6, + "grad_norm": 1.0390625, + "learning_rate": 0.0004908468842826557, + "loss": 0.1558, + "step": 62670 + }, + { + "epoch": 2.6, + "grad_norm": 1.1484375, + "learning_rate": 0.0004908439763208931, + "loss": 0.1729, + "step": 62680 + }, + { + "epoch": 2.6, + "grad_norm": 1.015625, + "learning_rate": 0.0004908410679058879, + "loss": 0.2673, + "step": 62690 + }, + { + "epoch": 2.6, + "grad_norm": 0.41796875, + "learning_rate": 0.000490838159037646, + "loss": 0.1841, + "step": 62700 + }, + { + "epoch": 2.6, + "grad_norm": 0.484375, + "learning_rate": 0.0004908352497161726, + "loss": 0.2279, + "step": 62710 + }, + { + "epoch": 2.6, + "grad_norm": 0.53515625, + "learning_rate": 0.0004908323399414733, + "loss": 0.1633, + "step": 62720 + }, + { + "epoch": 2.6, + "grad_norm": 1.21875, + "learning_rate": 0.0004908294297135535, + "loss": 0.2478, + "step": 62730 + }, + { + "epoch": 2.6, + "grad_norm": 0.73828125, + "learning_rate": 0.0004908265190324188, + "loss": 0.217, + "step": 62740 + }, + { + "epoch": 2.6, + "grad_norm": 0.4765625, + "learning_rate": 0.0004908236078980746, + "loss": 0.1685, + "step": 62750 + }, + { + "epoch": 2.6, + "grad_norm": 0.62109375, + "learning_rate": 0.0004908206963105263, + "loss": 0.2233, + "step": 62760 + }, + { + "epoch": 2.6, + "grad_norm": 0.5390625, + "learning_rate": 0.0004908177842697795, + "loss": 0.1983, + "step": 62770 + }, + { + "epoch": 2.6, + "grad_norm": 0.59765625, + "learning_rate": 0.0004908148717758396, + "loss": 0.2106, + "step": 62780 + }, + { + "epoch": 2.6, + "grad_norm": 0.515625, + "learning_rate": 0.0004908119588287121, + "loss": 0.1954, + "step": 62790 + }, + { + "epoch": 2.6, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004908090454284026, + "loss": 0.1737, + "step": 62800 + }, + { + "epoch": 2.6, + "grad_norm": 0.55078125, + "learning_rate": 0.0004908061315749164, + "loss": 0.2143, + "step": 62810 + }, + { + "epoch": 2.6, + "grad_norm": 0.5, + "learning_rate": 0.0004908032172682592, + "loss": 0.2289, + "step": 62820 + }, + { + "epoch": 2.6, + "grad_norm": 0.5859375, + "learning_rate": 0.0004908003025084362, + "loss": 0.2729, + "step": 62830 + }, + { + "epoch": 2.6, + "grad_norm": 0.57421875, + "learning_rate": 0.000490797387295453, + "loss": 0.2458, + "step": 62840 + }, + { + "epoch": 2.6, + "grad_norm": 0.48828125, + "learning_rate": 0.0004907944716293153, + "loss": 0.2012, + "step": 62850 + }, + { + "epoch": 2.6, + "grad_norm": 0.69140625, + "learning_rate": 0.0004907915555100282, + "loss": 0.1453, + "step": 62860 + }, + { + "epoch": 2.6, + "grad_norm": 1.1484375, + "learning_rate": 0.0004907886389375975, + "loss": 0.2141, + "step": 62870 + }, + { + "epoch": 2.6, + "grad_norm": 0.52734375, + "learning_rate": 0.0004907857219120286, + "loss": 0.2729, + "step": 62880 + }, + { + "epoch": 2.6, + "grad_norm": 0.5703125, + "learning_rate": 0.000490782804433327, + "loss": 0.2458, + "step": 62890 + }, + { + "epoch": 2.61, + "grad_norm": 0.78125, + "learning_rate": 0.0004907798865014981, + "loss": 0.1578, + "step": 62900 + }, + { + "epoch": 2.61, + "grad_norm": 1.9921875, + "learning_rate": 0.0004907769681165475, + "loss": 0.2324, + "step": 62910 + }, + { + "epoch": 2.61, + "grad_norm": 1.2578125, + "learning_rate": 0.0004907740492784805, + "loss": 0.2598, + "step": 62920 + }, + { + "epoch": 2.61, + "grad_norm": 0.62890625, + "learning_rate": 0.000490771129987303, + "loss": 0.2475, + "step": 62930 + }, + { + "epoch": 2.61, + "grad_norm": 0.7578125, + "learning_rate": 0.0004907682102430201, + "loss": 0.1894, + "step": 62940 + }, + { + "epoch": 2.61, + "grad_norm": 0.65234375, + "learning_rate": 0.0004907652900456375, + "loss": 0.2224, + "step": 62950 + }, + { + "epoch": 2.61, + "grad_norm": 0.494140625, + "learning_rate": 0.0004907623693951605, + "loss": 0.2297, + "step": 62960 + }, + { + "epoch": 2.61, + "grad_norm": 1.6171875, + "learning_rate": 0.0004907594482915947, + "loss": 0.2009, + "step": 62970 + }, + { + "epoch": 2.61, + "grad_norm": 0.73828125, + "learning_rate": 0.0004907565267349458, + "loss": 0.2164, + "step": 62980 + }, + { + "epoch": 2.61, + "grad_norm": 1.3515625, + "learning_rate": 0.0004907536047252189, + "loss": 0.217, + "step": 62990 + }, + { + "epoch": 2.61, + "grad_norm": 0.2451171875, + "learning_rate": 0.0004907506822624198, + "loss": 0.2261, + "step": 63000 + }, + { + "epoch": 2.61, + "grad_norm": 1.7265625, + "learning_rate": 0.000490747759346554, + "loss": 0.2278, + "step": 63010 + }, + { + "epoch": 2.61, + "grad_norm": 0.58203125, + "learning_rate": 0.0004907448359776268, + "loss": 0.2465, + "step": 63020 + }, + { + "epoch": 2.61, + "grad_norm": 0.85546875, + "learning_rate": 0.0004907419121556439, + "loss": 0.2233, + "step": 63030 + }, + { + "epoch": 2.61, + "grad_norm": 0.64453125, + "learning_rate": 0.0004907389878806107, + "loss": 0.2724, + "step": 63040 + }, + { + "epoch": 2.61, + "grad_norm": 1.0234375, + "learning_rate": 0.0004907360631525328, + "loss": 0.214, + "step": 63050 + }, + { + "epoch": 2.61, + "grad_norm": 0.90234375, + "learning_rate": 0.0004907331379714155, + "loss": 0.2144, + "step": 63060 + }, + { + "epoch": 2.61, + "grad_norm": 0.6328125, + "learning_rate": 0.0004907302123372644, + "loss": 0.2165, + "step": 63070 + }, + { + "epoch": 2.61, + "grad_norm": 0.67578125, + "learning_rate": 0.0004907272862500851, + "loss": 0.174, + "step": 63080 + }, + { + "epoch": 2.61, + "grad_norm": 0.87890625, + "learning_rate": 0.0004907243597098831, + "loss": 0.2097, + "step": 63090 + }, + { + "epoch": 2.61, + "grad_norm": 0.921875, + "learning_rate": 0.0004907214327166638, + "loss": 0.2439, + "step": 63100 + }, + { + "epoch": 2.61, + "grad_norm": 0.7265625, + "learning_rate": 0.0004907185052704327, + "loss": 0.2127, + "step": 63110 + }, + { + "epoch": 2.61, + "grad_norm": 0.640625, + "learning_rate": 0.0004907155773711955, + "loss": 0.1855, + "step": 63120 + }, + { + "epoch": 2.61, + "grad_norm": 0.8203125, + "learning_rate": 0.0004907126490189575, + "loss": 0.2715, + "step": 63130 + }, + { + "epoch": 2.62, + "grad_norm": 1.125, + "learning_rate": 0.0004907097202137243, + "loss": 0.2287, + "step": 63140 + }, + { + "epoch": 2.62, + "grad_norm": 0.9921875, + "learning_rate": 0.0004907067909555014, + "loss": 0.3, + "step": 63150 + }, + { + "epoch": 2.62, + "grad_norm": 0.6015625, + "learning_rate": 0.0004907038612442943, + "loss": 0.2183, + "step": 63160 + }, + { + "epoch": 2.62, + "grad_norm": 0.67578125, + "learning_rate": 0.0004907009310801085, + "loss": 0.2436, + "step": 63170 + }, + { + "epoch": 2.62, + "grad_norm": 0.54296875, + "learning_rate": 0.0004906980004629496, + "loss": 0.1849, + "step": 63180 + }, + { + "epoch": 2.62, + "grad_norm": 0.46875, + "learning_rate": 0.0004906950693928231, + "loss": 0.205, + "step": 63190 + }, + { + "epoch": 2.62, + "grad_norm": 0.27734375, + "learning_rate": 0.0004906921378697343, + "loss": 0.1752, + "step": 63200 + }, + { + "epoch": 2.62, + "grad_norm": 0.46875, + "learning_rate": 0.000490689205893689, + "loss": 0.2172, + "step": 63210 + }, + { + "epoch": 2.62, + "grad_norm": 1.515625, + "learning_rate": 0.0004906862734646927, + "loss": 0.2893, + "step": 63220 + }, + { + "epoch": 2.62, + "grad_norm": 0.85546875, + "learning_rate": 0.0004906833405827507, + "loss": 0.195, + "step": 63230 + }, + { + "epoch": 2.62, + "grad_norm": 0.51171875, + "learning_rate": 0.0004906804072478686, + "loss": 0.2369, + "step": 63240 + }, + { + "epoch": 2.62, + "grad_norm": 0.443359375, + "learning_rate": 0.0004906774734600521, + "loss": 0.1905, + "step": 63250 + }, + { + "epoch": 2.62, + "grad_norm": 3.296875, + "learning_rate": 0.0004906745392193065, + "loss": 0.2149, + "step": 63260 + }, + { + "epoch": 2.62, + "grad_norm": 0.52734375, + "learning_rate": 0.0004906716045256374, + "loss": 0.2138, + "step": 63270 + }, + { + "epoch": 2.62, + "grad_norm": 0.435546875, + "learning_rate": 0.0004906686693790504, + "loss": 0.1714, + "step": 63280 + }, + { + "epoch": 2.62, + "grad_norm": 1.515625, + "learning_rate": 0.000490665733779551, + "loss": 0.2053, + "step": 63290 + }, + { + "epoch": 2.62, + "grad_norm": 0.59375, + "learning_rate": 0.0004906627977271445, + "loss": 0.2, + "step": 63300 + }, + { + "epoch": 2.62, + "grad_norm": 0.369140625, + "learning_rate": 0.0004906598612218367, + "loss": 0.1564, + "step": 63310 + }, + { + "epoch": 2.62, + "grad_norm": 0.38671875, + "learning_rate": 0.0004906569242636331, + "loss": 0.2284, + "step": 63320 + }, + { + "epoch": 2.62, + "grad_norm": 0.85546875, + "learning_rate": 0.000490653986852539, + "loss": 0.2079, + "step": 63330 + }, + { + "epoch": 2.62, + "grad_norm": 0.4296875, + "learning_rate": 0.0004906510489885602, + "loss": 0.2003, + "step": 63340 + }, + { + "epoch": 2.62, + "grad_norm": 0.75390625, + "learning_rate": 0.0004906481106717022, + "loss": 0.239, + "step": 63350 + }, + { + "epoch": 2.62, + "grad_norm": 0.8125, + "learning_rate": 0.0004906451719019703, + "loss": 0.238, + "step": 63360 + }, + { + "epoch": 2.62, + "grad_norm": 0.8125, + "learning_rate": 0.0004906422326793701, + "loss": 0.229, + "step": 63370 + }, + { + "epoch": 2.63, + "grad_norm": 0.94921875, + "learning_rate": 0.0004906392930039073, + "loss": 0.1833, + "step": 63380 + }, + { + "epoch": 2.63, + "grad_norm": 0.95703125, + "learning_rate": 0.0004906363528755874, + "loss": 0.2206, + "step": 63390 + }, + { + "epoch": 2.63, + "grad_norm": 0.44921875, + "learning_rate": 0.0004906334122944158, + "loss": 0.1993, + "step": 63400 + }, + { + "epoch": 2.63, + "grad_norm": 1.015625, + "learning_rate": 0.0004906304712603981, + "loss": 0.2756, + "step": 63410 + }, + { + "epoch": 2.63, + "grad_norm": 0.75, + "learning_rate": 0.0004906275297735399, + "loss": 0.1887, + "step": 63420 + }, + { + "epoch": 2.63, + "grad_norm": 1.1328125, + "learning_rate": 0.0004906245878338466, + "loss": 0.2118, + "step": 63430 + }, + { + "epoch": 2.63, + "grad_norm": 1.0546875, + "learning_rate": 0.0004906216454413239, + "loss": 0.2344, + "step": 63440 + }, + { + "epoch": 2.63, + "grad_norm": 0.263671875, + "learning_rate": 0.0004906187025959772, + "loss": 0.2221, + "step": 63450 + }, + { + "epoch": 2.63, + "grad_norm": 0.65234375, + "learning_rate": 0.000490615759297812, + "loss": 0.2256, + "step": 63460 + }, + { + "epoch": 2.63, + "grad_norm": 1.4140625, + "learning_rate": 0.000490612815546834, + "loss": 0.2225, + "step": 63470 + }, + { + "epoch": 2.63, + "grad_norm": 0.5703125, + "learning_rate": 0.0004906098713430486, + "loss": 0.2295, + "step": 63480 + }, + { + "epoch": 2.63, + "grad_norm": 1.7109375, + "learning_rate": 0.0004906069266864616, + "loss": 0.1966, + "step": 63490 + }, + { + "epoch": 2.63, + "grad_norm": 0.66796875, + "learning_rate": 0.0004906039815770782, + "loss": 0.1744, + "step": 63500 + }, + { + "epoch": 2.63, + "grad_norm": 0.9375, + "learning_rate": 0.0004906010360149042, + "loss": 0.2185, + "step": 63510 + }, + { + "epoch": 2.63, + "grad_norm": 0.4609375, + "learning_rate": 0.0004905980899999449, + "loss": 0.1782, + "step": 63520 + }, + { + "epoch": 2.63, + "grad_norm": 0.353515625, + "learning_rate": 0.000490595143532206, + "loss": 0.2356, + "step": 63530 + }, + { + "epoch": 2.63, + "grad_norm": 0.361328125, + "learning_rate": 0.000490592196611693, + "loss": 0.2173, + "step": 63540 + }, + { + "epoch": 2.63, + "grad_norm": 0.421875, + "learning_rate": 0.0004905892492384115, + "loss": 0.1999, + "step": 63550 + }, + { + "epoch": 2.63, + "grad_norm": 0.373046875, + "learning_rate": 0.0004905863014123671, + "loss": 0.2545, + "step": 63560 + }, + { + "epoch": 2.63, + "grad_norm": 1.171875, + "learning_rate": 0.0004905833531335652, + "loss": 0.2289, + "step": 63570 + }, + { + "epoch": 2.63, + "grad_norm": 0.447265625, + "learning_rate": 0.0004905804044020113, + "loss": 0.2764, + "step": 63580 + }, + { + "epoch": 2.63, + "grad_norm": 0.5859375, + "learning_rate": 0.0004905774552177113, + "loss": 0.1965, + "step": 63590 + }, + { + "epoch": 2.63, + "grad_norm": 0.87109375, + "learning_rate": 0.0004905745055806703, + "loss": 0.2233, + "step": 63600 + }, + { + "epoch": 2.63, + "grad_norm": 0.58203125, + "learning_rate": 0.0004905715554908941, + "loss": 0.1802, + "step": 63610 + }, + { + "epoch": 2.64, + "grad_norm": 0.4375, + "learning_rate": 0.0004905686049483883, + "loss": 0.2223, + "step": 63620 + }, + { + "epoch": 2.64, + "grad_norm": 0.640625, + "learning_rate": 0.0004905656539531582, + "loss": 0.3025, + "step": 63630 + }, + { + "epoch": 2.64, + "grad_norm": 0.423828125, + "learning_rate": 0.0004905627025052097, + "loss": 0.2618, + "step": 63640 + }, + { + "epoch": 2.64, + "grad_norm": 0.2138671875, + "learning_rate": 0.000490559750604548, + "loss": 0.1626, + "step": 63650 + }, + { + "epoch": 2.64, + "grad_norm": 0.55078125, + "learning_rate": 0.000490556798251179, + "loss": 0.1787, + "step": 63660 + }, + { + "epoch": 2.64, + "grad_norm": 1.1484375, + "learning_rate": 0.0004905538454451079, + "loss": 0.1989, + "step": 63670 + }, + { + "epoch": 2.64, + "grad_norm": 1.0, + "learning_rate": 0.0004905508921863406, + "loss": 0.2063, + "step": 63680 + }, + { + "epoch": 2.64, + "grad_norm": 0.494140625, + "learning_rate": 0.0004905479384748825, + "loss": 0.2084, + "step": 63690 + }, + { + "epoch": 2.64, + "grad_norm": 0.4765625, + "learning_rate": 0.000490544984310739, + "loss": 0.1991, + "step": 63700 + }, + { + "epoch": 2.64, + "grad_norm": 0.55078125, + "learning_rate": 0.000490542029693916, + "loss": 0.196, + "step": 63710 + }, + { + "epoch": 2.64, + "grad_norm": 0.65625, + "learning_rate": 0.0004905390746244189, + "loss": 0.2449, + "step": 63720 + }, + { + "epoch": 2.64, + "grad_norm": 0.427734375, + "learning_rate": 0.0004905361191022531, + "loss": 0.2102, + "step": 63730 + }, + { + "epoch": 2.64, + "grad_norm": 0.64453125, + "learning_rate": 0.0004905331631274242, + "loss": 0.2091, + "step": 63740 + }, + { + "epoch": 2.64, + "grad_norm": 0.310546875, + "learning_rate": 0.0004905302066999381, + "loss": 0.2348, + "step": 63750 + }, + { + "epoch": 2.64, + "grad_norm": 0.75390625, + "learning_rate": 0.0004905272498198, + "loss": 0.1697, + "step": 63760 + }, + { + "epoch": 2.64, + "grad_norm": 0.4765625, + "learning_rate": 0.0004905242924870156, + "loss": 0.2552, + "step": 63770 + }, + { + "epoch": 2.64, + "grad_norm": 0.357421875, + "learning_rate": 0.0004905213347015904, + "loss": 0.223, + "step": 63780 + }, + { + "epoch": 2.64, + "grad_norm": 0.5078125, + "learning_rate": 0.0004905183764635302, + "loss": 0.1712, + "step": 63790 + }, + { + "epoch": 2.64, + "grad_norm": 0.546875, + "learning_rate": 0.0004905154177728402, + "loss": 0.2183, + "step": 63800 + }, + { + "epoch": 2.64, + "grad_norm": 1.5, + "learning_rate": 0.0004905124586295262, + "loss": 0.2101, + "step": 63810 + }, + { + "epoch": 2.64, + "grad_norm": 0.296875, + "learning_rate": 0.0004905094990335937, + "loss": 0.2152, + "step": 63820 + }, + { + "epoch": 2.64, + "grad_norm": 0.58203125, + "learning_rate": 0.0004905065389850485, + "loss": 0.2107, + "step": 63830 + }, + { + "epoch": 2.64, + "grad_norm": 0.79296875, + "learning_rate": 0.0004905035784838958, + "loss": 0.2416, + "step": 63840 + }, + { + "epoch": 2.64, + "grad_norm": 1.6953125, + "learning_rate": 0.0004905006175301413, + "loss": 0.1859, + "step": 63850 + }, + { + "epoch": 2.65, + "grad_norm": 0.349609375, + "learning_rate": 0.0004904976561237907, + "loss": 0.2065, + "step": 63860 + }, + { + "epoch": 2.65, + "grad_norm": 2.609375, + "learning_rate": 0.0004904946942648495, + "loss": 0.2588, + "step": 63870 + }, + { + "epoch": 2.65, + "grad_norm": 0.50390625, + "learning_rate": 0.0004904917319533232, + "loss": 0.2069, + "step": 63880 + }, + { + "epoch": 2.65, + "grad_norm": 0.453125, + "learning_rate": 0.0004904887691892174, + "loss": 0.1566, + "step": 63890 + }, + { + "epoch": 2.65, + "grad_norm": 0.0, + "learning_rate": 0.0004904858059725378, + "loss": 0.1738, + "step": 63900 + }, + { + "epoch": 2.65, + "grad_norm": 0.703125, + "learning_rate": 0.0004904828423032898, + "loss": 0.2273, + "step": 63910 + }, + { + "epoch": 2.65, + "grad_norm": 0.37890625, + "learning_rate": 0.0004904798781814791, + "loss": 0.2195, + "step": 63920 + }, + { + "epoch": 2.65, + "grad_norm": 1.375, + "learning_rate": 0.0004904769136071112, + "loss": 0.2235, + "step": 63930 + }, + { + "epoch": 2.65, + "grad_norm": 0.30859375, + "learning_rate": 0.0004904739485801918, + "loss": 0.2004, + "step": 63940 + }, + { + "epoch": 2.65, + "grad_norm": 0.6171875, + "learning_rate": 0.0004904709831007263, + "loss": 0.2121, + "step": 63950 + }, + { + "epoch": 2.65, + "grad_norm": 0.337890625, + "learning_rate": 0.0004904680171687204, + "loss": 0.2186, + "step": 63960 + }, + { + "epoch": 2.65, + "grad_norm": 1.28125, + "learning_rate": 0.0004904650507841796, + "loss": 0.1909, + "step": 63970 + }, + { + "epoch": 2.65, + "grad_norm": 0.59375, + "learning_rate": 0.0004904620839471097, + "loss": 0.2089, + "step": 63980 + }, + { + "epoch": 2.65, + "grad_norm": 0.51953125, + "learning_rate": 0.0004904591166575161, + "loss": 0.2741, + "step": 63990 + }, + { + "epoch": 2.65, + "grad_norm": 0.546875, + "learning_rate": 0.0004904561489154043, + "loss": 0.259, + "step": 64000 + }, + { + "epoch": 2.65, + "grad_norm": 0.546875, + "learning_rate": 0.0004904531807207801, + "loss": 0.1877, + "step": 64010 + }, + { + "epoch": 2.65, + "grad_norm": 0.58984375, + "learning_rate": 0.0004904502120736489, + "loss": 0.1594, + "step": 64020 + }, + { + "epoch": 2.65, + "grad_norm": 0.404296875, + "learning_rate": 0.0004904472429740164, + "loss": 0.215, + "step": 64030 + }, + { + "epoch": 2.65, + "grad_norm": 0.50390625, + "learning_rate": 0.0004904442734218881, + "loss": 0.2322, + "step": 64040 + }, + { + "epoch": 2.65, + "grad_norm": 0.34765625, + "learning_rate": 0.0004904413034172698, + "loss": 0.1667, + "step": 64050 + }, + { + "epoch": 2.65, + "grad_norm": 0.64453125, + "learning_rate": 0.0004904383329601669, + "loss": 0.1841, + "step": 64060 + }, + { + "epoch": 2.65, + "grad_norm": 0.71484375, + "learning_rate": 0.0004904353620505848, + "loss": 0.2318, + "step": 64070 + }, + { + "epoch": 2.65, + "grad_norm": 0.44140625, + "learning_rate": 0.0004904323906885296, + "loss": 0.197, + "step": 64080 + }, + { + "epoch": 2.65, + "grad_norm": 0.34765625, + "learning_rate": 0.0004904294188740064, + "loss": 0.2013, + "step": 64090 + }, + { + "epoch": 2.66, + "grad_norm": 0.46484375, + "learning_rate": 0.0004904264466070211, + "loss": 0.2123, + "step": 64100 + }, + { + "epoch": 2.66, + "grad_norm": 0.55859375, + "learning_rate": 0.0004904234738875792, + "loss": 0.2351, + "step": 64110 + }, + { + "epoch": 2.66, + "grad_norm": 0.53125, + "learning_rate": 0.0004904205007156862, + "loss": 0.1884, + "step": 64120 + }, + { + "epoch": 2.66, + "grad_norm": 0.65625, + "learning_rate": 0.0004904175270913478, + "loss": 0.208, + "step": 64130 + }, + { + "epoch": 2.66, + "grad_norm": 0.83984375, + "learning_rate": 0.0004904145530145696, + "loss": 0.218, + "step": 64140 + }, + { + "epoch": 2.66, + "grad_norm": 1.0, + "learning_rate": 0.0004904115784853572, + "loss": 0.1754, + "step": 64150 + }, + { + "epoch": 2.66, + "grad_norm": 0.62890625, + "learning_rate": 0.0004904086035037162, + "loss": 0.1666, + "step": 64160 + }, + { + "epoch": 2.66, + "grad_norm": 1.8203125, + "learning_rate": 0.000490405628069652, + "loss": 0.2314, + "step": 64170 + }, + { + "epoch": 2.66, + "grad_norm": 0.484375, + "learning_rate": 0.0004904026521831706, + "loss": 0.1955, + "step": 64180 + }, + { + "epoch": 2.66, + "grad_norm": 0.271484375, + "learning_rate": 0.0004903996758442772, + "loss": 0.2254, + "step": 64190 + }, + { + "epoch": 2.66, + "grad_norm": 0.55859375, + "learning_rate": 0.0004903966990529777, + "loss": 0.2567, + "step": 64200 + }, + { + "epoch": 2.66, + "grad_norm": 1.0390625, + "learning_rate": 0.0004903937218092775, + "loss": 0.214, + "step": 64210 + }, + { + "epoch": 2.66, + "grad_norm": 0.52734375, + "learning_rate": 0.0004903907441131823, + "loss": 0.2378, + "step": 64220 + }, + { + "epoch": 2.66, + "grad_norm": 2.1875, + "learning_rate": 0.0004903877659646976, + "loss": 0.2494, + "step": 64230 + }, + { + "epoch": 2.66, + "grad_norm": 0.66796875, + "learning_rate": 0.0004903847873638292, + "loss": 0.1876, + "step": 64240 + }, + { + "epoch": 2.66, + "grad_norm": 1.53125, + "learning_rate": 0.0004903818083105825, + "loss": 0.2505, + "step": 64250 + }, + { + "epoch": 2.66, + "grad_norm": 0.55859375, + "learning_rate": 0.0004903788288049632, + "loss": 0.2318, + "step": 64260 + }, + { + "epoch": 2.66, + "grad_norm": 0.5703125, + "learning_rate": 0.0004903758488469769, + "loss": 0.224, + "step": 64270 + }, + { + "epoch": 2.66, + "grad_norm": 0.765625, + "learning_rate": 0.0004903728684366293, + "loss": 0.2559, + "step": 64280 + }, + { + "epoch": 2.66, + "grad_norm": 0.53125, + "learning_rate": 0.0004903698875739259, + "loss": 0.2481, + "step": 64290 + }, + { + "epoch": 2.66, + "grad_norm": 0.91015625, + "learning_rate": 0.0004903669062588723, + "loss": 0.1993, + "step": 64300 + }, + { + "epoch": 2.66, + "grad_norm": 0.392578125, + "learning_rate": 0.0004903639244914741, + "loss": 0.2821, + "step": 64310 + }, + { + "epoch": 2.66, + "grad_norm": 0.64453125, + "learning_rate": 0.000490360942271737, + "loss": 0.1934, + "step": 64320 + }, + { + "epoch": 2.66, + "grad_norm": 0.59765625, + "learning_rate": 0.0004903579595996665, + "loss": 0.2139, + "step": 64330 + }, + { + "epoch": 2.66, + "grad_norm": 0.796875, + "learning_rate": 0.0004903549764752684, + "loss": 0.1683, + "step": 64340 + }, + { + "epoch": 2.67, + "grad_norm": 0.59375, + "learning_rate": 0.0004903519928985481, + "loss": 0.2297, + "step": 64350 + }, + { + "epoch": 2.67, + "grad_norm": 1.1171875, + "learning_rate": 0.0004903490088695113, + "loss": 0.2029, + "step": 64360 + }, + { + "epoch": 2.67, + "grad_norm": 0.78125, + "learning_rate": 0.0004903460243881637, + "loss": 0.2715, + "step": 64370 + }, + { + "epoch": 2.67, + "grad_norm": 0.5703125, + "learning_rate": 0.0004903430394545107, + "loss": 0.2539, + "step": 64380 + }, + { + "epoch": 2.67, + "grad_norm": 0.40234375, + "learning_rate": 0.0004903400540685581, + "loss": 0.2183, + "step": 64390 + }, + { + "epoch": 2.67, + "grad_norm": 0.4296875, + "learning_rate": 0.0004903370682303116, + "loss": 0.2018, + "step": 64400 + }, + { + "epoch": 2.67, + "grad_norm": 0.6953125, + "learning_rate": 0.0004903340819397766, + "loss": 0.2567, + "step": 64410 + }, + { + "epoch": 2.67, + "grad_norm": 1.2265625, + "learning_rate": 0.0004903310951969587, + "loss": 0.2563, + "step": 64420 + }, + { + "epoch": 2.67, + "grad_norm": 0.322265625, + "learning_rate": 0.0004903281080018638, + "loss": 0.1964, + "step": 64430 + }, + { + "epoch": 2.67, + "grad_norm": 0.9296875, + "learning_rate": 0.0004903251203544973, + "loss": 0.2237, + "step": 64440 + }, + { + "epoch": 2.67, + "grad_norm": 0.71484375, + "learning_rate": 0.0004903221322548649, + "loss": 0.2006, + "step": 64450 + }, + { + "epoch": 2.67, + "grad_norm": 1.234375, + "learning_rate": 0.0004903191437029722, + "loss": 0.2458, + "step": 64460 + }, + { + "epoch": 2.67, + "grad_norm": 0.447265625, + "learning_rate": 0.0004903161546988247, + "loss": 0.2303, + "step": 64470 + }, + { + "epoch": 2.67, + "grad_norm": 1.015625, + "learning_rate": 0.0004903131652424283, + "loss": 0.2703, + "step": 64480 + }, + { + "epoch": 2.67, + "grad_norm": 1.9453125, + "learning_rate": 0.0004903101753337884, + "loss": 0.2578, + "step": 64490 + }, + { + "epoch": 2.67, + "grad_norm": 0.828125, + "learning_rate": 0.0004903071849729107, + "loss": 0.2736, + "step": 64500 + }, + { + "epoch": 2.67, + "grad_norm": 0.40234375, + "learning_rate": 0.0004903041941598009, + "loss": 0.2046, + "step": 64510 + }, + { + "epoch": 2.67, + "grad_norm": 0.7578125, + "learning_rate": 0.0004903012028944644, + "loss": 0.2053, + "step": 64520 + }, + { + "epoch": 2.67, + "grad_norm": 0.392578125, + "learning_rate": 0.0004902982111769071, + "loss": 0.1671, + "step": 64530 + }, + { + "epoch": 2.67, + "grad_norm": 0.55859375, + "learning_rate": 0.0004902952190071346, + "loss": 0.216, + "step": 64540 + }, + { + "epoch": 2.67, + "grad_norm": 1.234375, + "learning_rate": 0.0004902922263851523, + "loss": 0.2614, + "step": 64550 + }, + { + "epoch": 2.67, + "grad_norm": 1.1328125, + "learning_rate": 0.000490289233310966, + "loss": 0.2192, + "step": 64560 + }, + { + "epoch": 2.67, + "grad_norm": 0.5703125, + "learning_rate": 0.0004902862397845814, + "loss": 0.2119, + "step": 64570 + }, + { + "epoch": 2.67, + "grad_norm": 0.75390625, + "learning_rate": 0.000490283245806004, + "loss": 0.2298, + "step": 64580 + }, + { + "epoch": 2.68, + "grad_norm": 0.46484375, + "learning_rate": 0.0004902802513752395, + "loss": 0.1971, + "step": 64590 + }, + { + "epoch": 2.68, + "grad_norm": 0.455078125, + "learning_rate": 0.0004902772564922935, + "loss": 0.2801, + "step": 64600 + }, + { + "epoch": 2.68, + "grad_norm": 0.77734375, + "learning_rate": 0.0004902742611571716, + "loss": 0.2325, + "step": 64610 + }, + { + "epoch": 2.68, + "grad_norm": 0.8828125, + "learning_rate": 0.0004902712653698795, + "loss": 0.1869, + "step": 64620 + }, + { + "epoch": 2.68, + "grad_norm": 1.1484375, + "learning_rate": 0.0004902682691304229, + "loss": 0.2086, + "step": 64630 + }, + { + "epoch": 2.68, + "grad_norm": 0.67578125, + "learning_rate": 0.0004902652724388072, + "loss": 0.2255, + "step": 64640 + }, + { + "epoch": 2.68, + "grad_norm": 0.349609375, + "learning_rate": 0.0004902622752950384, + "loss": 0.1838, + "step": 64650 + }, + { + "epoch": 2.68, + "grad_norm": 0.734375, + "learning_rate": 0.0004902592776991218, + "loss": 0.2266, + "step": 64660 + }, + { + "epoch": 2.68, + "grad_norm": 1.0, + "learning_rate": 0.0004902562796510633, + "loss": 0.2572, + "step": 64670 + }, + { + "epoch": 2.68, + "grad_norm": 0.6328125, + "learning_rate": 0.0004902532811508684, + "loss": 0.2209, + "step": 64680 + }, + { + "epoch": 2.68, + "grad_norm": 0.640625, + "learning_rate": 0.0004902502821985428, + "loss": 0.2473, + "step": 64690 + }, + { + "epoch": 2.68, + "grad_norm": 0.359375, + "learning_rate": 0.0004902472827940919, + "loss": 0.2338, + "step": 64700 + }, + { + "epoch": 2.68, + "grad_norm": 0.498046875, + "learning_rate": 0.0004902442829375218, + "loss": 0.1977, + "step": 64710 + }, + { + "epoch": 2.68, + "grad_norm": 0.40234375, + "learning_rate": 0.0004902412826288379, + "loss": 0.2265, + "step": 64720 + }, + { + "epoch": 2.68, + "grad_norm": 1.3515625, + "learning_rate": 0.0004902382818680457, + "loss": 0.1671, + "step": 64730 + }, + { + "epoch": 2.68, + "grad_norm": 0.259765625, + "learning_rate": 0.000490235280655151, + "loss": 0.1753, + "step": 64740 + }, + { + "epoch": 2.68, + "grad_norm": 0.50390625, + "learning_rate": 0.0004902322789901596, + "loss": 0.2168, + "step": 64750 + }, + { + "epoch": 2.68, + "grad_norm": 1.6484375, + "learning_rate": 0.0004902292768730769, + "loss": 0.1741, + "step": 64760 + }, + { + "epoch": 2.68, + "grad_norm": 1.03125, + "learning_rate": 0.0004902262743039087, + "loss": 0.2119, + "step": 64770 + }, + { + "epoch": 2.68, + "grad_norm": 0.54296875, + "learning_rate": 0.0004902232712826604, + "loss": 0.2401, + "step": 64780 + }, + { + "epoch": 2.68, + "grad_norm": 0.7421875, + "learning_rate": 0.000490220267809338, + "loss": 0.2342, + "step": 64790 + }, + { + "epoch": 2.68, + "grad_norm": 0.58203125, + "learning_rate": 0.0004902172638839471, + "loss": 0.2272, + "step": 64800 + }, + { + "epoch": 2.68, + "grad_norm": 0.451171875, + "learning_rate": 0.0004902142595064931, + "loss": 0.2024, + "step": 64810 + }, + { + "epoch": 2.68, + "grad_norm": 0.5546875, + "learning_rate": 0.0004902112546769818, + "loss": 0.2283, + "step": 64820 + }, + { + "epoch": 2.69, + "grad_norm": 0.380859375, + "learning_rate": 0.0004902082493954189, + "loss": 0.1925, + "step": 64830 + }, + { + "epoch": 2.69, + "grad_norm": 0.89453125, + "learning_rate": 0.00049020524366181, + "loss": 0.2521, + "step": 64840 + }, + { + "epoch": 2.69, + "grad_norm": 1.3203125, + "learning_rate": 0.0004902022374761608, + "loss": 0.2642, + "step": 64850 + }, + { + "epoch": 2.69, + "grad_norm": 1.5390625, + "learning_rate": 0.000490199230838477, + "loss": 0.2602, + "step": 64860 + }, + { + "epoch": 2.69, + "grad_norm": 1.1640625, + "learning_rate": 0.000490196223748764, + "loss": 0.1566, + "step": 64870 + }, + { + "epoch": 2.69, + "grad_norm": 0.59765625, + "learning_rate": 0.0004901932162070277, + "loss": 0.2391, + "step": 64880 + }, + { + "epoch": 2.69, + "grad_norm": 0.69921875, + "learning_rate": 0.0004901902082132738, + "loss": 0.1822, + "step": 64890 + }, + { + "epoch": 2.69, + "grad_norm": 0.703125, + "learning_rate": 0.0004901871997675079, + "loss": 0.2836, + "step": 64900 + }, + { + "epoch": 2.69, + "grad_norm": 0.80078125, + "learning_rate": 0.0004901841908697356, + "loss": 0.2073, + "step": 64910 + }, + { + "epoch": 2.69, + "grad_norm": 0.65234375, + "learning_rate": 0.0004901811815199625, + "loss": 0.211, + "step": 64920 + }, + { + "epoch": 2.69, + "grad_norm": 1.25, + "learning_rate": 0.0004901781717181943, + "loss": 0.2685, + "step": 64930 + }, + { + "epoch": 2.69, + "grad_norm": 1.5703125, + "learning_rate": 0.0004901751614644369, + "loss": 0.1822, + "step": 64940 + }, + { + "epoch": 2.69, + "grad_norm": 0.859375, + "learning_rate": 0.0004901721507586957, + "loss": 0.2542, + "step": 64950 + }, + { + "epoch": 2.69, + "grad_norm": 0.4140625, + "learning_rate": 0.0004901691396009763, + "loss": 0.1606, + "step": 64960 + }, + { + "epoch": 2.69, + "grad_norm": 0.546875, + "learning_rate": 0.0004901661279912848, + "loss": 0.2619, + "step": 64970 + }, + { + "epoch": 2.69, + "grad_norm": 0.26953125, + "learning_rate": 0.0004901631159296264, + "loss": 0.1958, + "step": 64980 + }, + { + "epoch": 2.69, + "grad_norm": 1.2265625, + "learning_rate": 0.000490160103416007, + "loss": 0.2038, + "step": 64990 + }, + { + "epoch": 2.69, + "grad_norm": 1.0703125, + "learning_rate": 0.0004901570904504323, + "loss": 0.222, + "step": 65000 + }, + { + "epoch": 2.69, + "grad_norm": 0.396484375, + "learning_rate": 0.0004901540770329077, + "loss": 0.2372, + "step": 65010 + }, + { + "epoch": 2.69, + "grad_norm": 1.203125, + "learning_rate": 0.0004901510631634392, + "loss": 0.2675, + "step": 65020 + }, + { + "epoch": 2.69, + "grad_norm": 0.9296875, + "learning_rate": 0.0004901480488420322, + "loss": 0.2547, + "step": 65030 + }, + { + "epoch": 2.69, + "grad_norm": 0.6875, + "learning_rate": 0.0004901450340686926, + "loss": 0.2184, + "step": 65040 + }, + { + "epoch": 2.69, + "grad_norm": 0.78515625, + "learning_rate": 0.000490142018843426, + "loss": 0.2603, + "step": 65050 + }, + { + "epoch": 2.69, + "grad_norm": 0.5234375, + "learning_rate": 0.000490139003166238, + "loss": 0.2769, + "step": 65060 + }, + { + "epoch": 2.7, + "grad_norm": 0.640625, + "learning_rate": 0.0004901359870371344, + "loss": 0.1893, + "step": 65070 + }, + { + "epoch": 2.7, + "grad_norm": 0.2578125, + "learning_rate": 0.0004901329704561208, + "loss": 0.1923, + "step": 65080 + }, + { + "epoch": 2.7, + "grad_norm": 1.0859375, + "learning_rate": 0.0004901299534232029, + "loss": 0.2026, + "step": 65090 + }, + { + "epoch": 2.7, + "grad_norm": 1.109375, + "learning_rate": 0.0004901269359383863, + "loss": 0.24, + "step": 65100 + }, + { + "epoch": 2.7, + "grad_norm": 0.291015625, + "learning_rate": 0.0004901239180016767, + "loss": 0.1785, + "step": 65110 + }, + { + "epoch": 2.7, + "grad_norm": 0.443359375, + "learning_rate": 0.0004901208996130799, + "loss": 0.231, + "step": 65120 + }, + { + "epoch": 2.7, + "grad_norm": 0.79296875, + "learning_rate": 0.0004901178807726014, + "loss": 0.1925, + "step": 65130 + }, + { + "epoch": 2.7, + "grad_norm": 0.92578125, + "learning_rate": 0.0004901148614802471, + "loss": 0.2177, + "step": 65140 + }, + { + "epoch": 2.7, + "grad_norm": 0.68359375, + "learning_rate": 0.0004901118417360225, + "loss": 0.2737, + "step": 65150 + }, + { + "epoch": 2.7, + "grad_norm": 0.76171875, + "learning_rate": 0.0004901088215399333, + "loss": 0.2241, + "step": 65160 + }, + { + "epoch": 2.7, + "grad_norm": 0.578125, + "learning_rate": 0.0004901058008919853, + "loss": 0.1877, + "step": 65170 + }, + { + "epoch": 2.7, + "grad_norm": 0.5546875, + "learning_rate": 0.0004901027797921841, + "loss": 0.2258, + "step": 65180 + }, + { + "epoch": 2.7, + "grad_norm": 1.0234375, + "learning_rate": 0.0004900997582405354, + "loss": 0.2045, + "step": 65190 + }, + { + "epoch": 2.7, + "grad_norm": 1.34375, + "learning_rate": 0.0004900967362370448, + "loss": 0.2667, + "step": 65200 + }, + { + "epoch": 2.7, + "grad_norm": 0.2080078125, + "learning_rate": 0.0004900937137817182, + "loss": 0.2035, + "step": 65210 + }, + { + "epoch": 2.7, + "grad_norm": 0.890625, + "learning_rate": 0.000490090690874561, + "loss": 0.1858, + "step": 65220 + }, + { + "epoch": 2.7, + "grad_norm": 0.91796875, + "learning_rate": 0.0004900876675155792, + "loss": 0.1278, + "step": 65230 + }, + { + "epoch": 2.7, + "grad_norm": 1.609375, + "learning_rate": 0.0004900846437047783, + "loss": 0.2188, + "step": 65240 + }, + { + "epoch": 2.7, + "grad_norm": 0.82421875, + "learning_rate": 0.000490081619442164, + "loss": 0.2323, + "step": 65250 + }, + { + "epoch": 2.7, + "grad_norm": 0.81640625, + "learning_rate": 0.000490078594727742, + "loss": 0.2281, + "step": 65260 + }, + { + "epoch": 2.7, + "grad_norm": 1.0703125, + "learning_rate": 0.000490075569561518, + "loss": 0.2037, + "step": 65270 + }, + { + "epoch": 2.7, + "grad_norm": 0.85546875, + "learning_rate": 0.0004900725439434978, + "loss": 0.2131, + "step": 65280 + }, + { + "epoch": 2.7, + "grad_norm": 0.609375, + "learning_rate": 0.000490069517873687, + "loss": 0.1801, + "step": 65290 + }, + { + "epoch": 2.7, + "grad_norm": 0.0, + "learning_rate": 0.0004900664913520912, + "loss": 0.1747, + "step": 65300 + }, + { + "epoch": 2.71, + "grad_norm": 0.373046875, + "learning_rate": 0.0004900634643787161, + "loss": 0.202, + "step": 65310 + }, + { + "epoch": 2.71, + "grad_norm": 1.015625, + "learning_rate": 0.0004900604369535677, + "loss": 0.277, + "step": 65320 + }, + { + "epoch": 2.71, + "grad_norm": 0.89453125, + "learning_rate": 0.0004900574090766514, + "loss": 0.2322, + "step": 65330 + }, + { + "epoch": 2.71, + "grad_norm": 0.376953125, + "learning_rate": 0.0004900543807479729, + "loss": 0.176, + "step": 65340 + }, + { + "epoch": 2.71, + "grad_norm": 0.609375, + "learning_rate": 0.0004900513519675382, + "loss": 0.23, + "step": 65350 + }, + { + "epoch": 2.71, + "grad_norm": 0.70703125, + "learning_rate": 0.0004900483227353525, + "loss": 0.2034, + "step": 65360 + }, + { + "epoch": 2.71, + "grad_norm": 0.6171875, + "learning_rate": 0.0004900452930514219, + "loss": 0.2026, + "step": 65370 + }, + { + "epoch": 2.71, + "grad_norm": 0.70703125, + "learning_rate": 0.0004900422629157521, + "loss": 0.1987, + "step": 65380 + }, + { + "epoch": 2.71, + "grad_norm": 0.55859375, + "learning_rate": 0.0004900392323283485, + "loss": 0.2483, + "step": 65390 + }, + { + "epoch": 2.71, + "grad_norm": 0.8984375, + "learning_rate": 0.0004900362012892171, + "loss": 0.1712, + "step": 65400 + }, + { + "epoch": 2.71, + "grad_norm": 0.5625, + "learning_rate": 0.0004900331697983635, + "loss": 0.2336, + "step": 65410 + }, + { + "epoch": 2.71, + "grad_norm": 0.337890625, + "learning_rate": 0.0004900301378557932, + "loss": 0.2268, + "step": 65420 + }, + { + "epoch": 2.71, + "grad_norm": 0.330078125, + "learning_rate": 0.0004900271054615123, + "loss": 0.1966, + "step": 65430 + }, + { + "epoch": 2.71, + "grad_norm": 0.79296875, + "learning_rate": 0.0004900240726155263, + "loss": 0.2551, + "step": 65440 + }, + { + "epoch": 2.71, + "grad_norm": 0.6484375, + "learning_rate": 0.0004900210393178409, + "loss": 0.2012, + "step": 65450 + }, + { + "epoch": 2.71, + "grad_norm": 0.380859375, + "learning_rate": 0.0004900180055684616, + "loss": 0.2687, + "step": 65460 + }, + { + "epoch": 2.71, + "grad_norm": 0.47265625, + "learning_rate": 0.0004900149713673946, + "loss": 0.2341, + "step": 65470 + }, + { + "epoch": 2.71, + "grad_norm": 0.46875, + "learning_rate": 0.0004900119367146452, + "loss": 0.1887, + "step": 65480 + }, + { + "epoch": 2.71, + "grad_norm": 0.6640625, + "learning_rate": 0.0004900089016102194, + "loss": 0.2567, + "step": 65490 + }, + { + "epoch": 2.71, + "grad_norm": 0.515625, + "learning_rate": 0.0004900058660541227, + "loss": 0.1905, + "step": 65500 + }, + { + "epoch": 2.71, + "grad_norm": 1.515625, + "learning_rate": 0.0004900028300463609, + "loss": 0.2144, + "step": 65510 + }, + { + "epoch": 2.71, + "grad_norm": 0.7890625, + "learning_rate": 0.0004899997935869396, + "loss": 0.1997, + "step": 65520 + }, + { + "epoch": 2.71, + "grad_norm": 0.609375, + "learning_rate": 0.0004899967566758647, + "loss": 0.176, + "step": 65530 + }, + { + "epoch": 2.71, + "grad_norm": 0.375, + "learning_rate": 0.0004899937193131417, + "loss": 0.2511, + "step": 65540 + }, + { + "epoch": 2.72, + "grad_norm": 0.95703125, + "learning_rate": 0.0004899906814987766, + "loss": 0.2138, + "step": 65550 + }, + { + "epoch": 2.72, + "grad_norm": 0.4296875, + "learning_rate": 0.0004899876432327749, + "loss": 0.2431, + "step": 65560 + }, + { + "epoch": 2.72, + "grad_norm": 0.98828125, + "learning_rate": 0.0004899846045151423, + "loss": 0.1867, + "step": 65570 + }, + { + "epoch": 2.72, + "grad_norm": 0.640625, + "learning_rate": 0.0004899815653458847, + "loss": 0.1927, + "step": 65580 + }, + { + "epoch": 2.72, + "grad_norm": 0.59375, + "learning_rate": 0.0004899785257250077, + "loss": 0.2468, + "step": 65590 + }, + { + "epoch": 2.72, + "grad_norm": 0.6796875, + "learning_rate": 0.0004899754856525169, + "loss": 0.2382, + "step": 65600 + }, + { + "epoch": 2.72, + "grad_norm": 0.66796875, + "learning_rate": 0.0004899724451284183, + "loss": 0.1941, + "step": 65610 + }, + { + "epoch": 2.72, + "grad_norm": 0.330078125, + "learning_rate": 0.0004899694041527173, + "loss": 0.2269, + "step": 65620 + }, + { + "epoch": 2.72, + "grad_norm": 0.84765625, + "learning_rate": 0.0004899663627254199, + "loss": 0.1983, + "step": 65630 + }, + { + "epoch": 2.72, + "grad_norm": 0.43359375, + "learning_rate": 0.0004899633208465317, + "loss": 0.2163, + "step": 65640 + }, + { + "epoch": 2.72, + "grad_norm": 0.388671875, + "learning_rate": 0.0004899602785160585, + "loss": 0.2406, + "step": 65650 + }, + { + "epoch": 2.72, + "grad_norm": 0.79296875, + "learning_rate": 0.000489957235734006, + "loss": 0.2222, + "step": 65660 + }, + { + "epoch": 2.72, + "grad_norm": 0.921875, + "learning_rate": 0.0004899541925003798, + "loss": 0.2759, + "step": 65670 + }, + { + "epoch": 2.72, + "grad_norm": 0.40625, + "learning_rate": 0.0004899511488151857, + "loss": 0.2014, + "step": 65680 + }, + { + "epoch": 2.72, + "grad_norm": 1.0234375, + "learning_rate": 0.0004899481046784296, + "loss": 0.2615, + "step": 65690 + }, + { + "epoch": 2.72, + "grad_norm": 0.451171875, + "learning_rate": 0.0004899450600901169, + "loss": 0.216, + "step": 65700 + }, + { + "epoch": 2.72, + "grad_norm": 0.78515625, + "learning_rate": 0.0004899420150502536, + "loss": 0.2697, + "step": 65710 + }, + { + "epoch": 2.72, + "grad_norm": 0.7265625, + "learning_rate": 0.0004899389695588453, + "loss": 0.1721, + "step": 65720 + }, + { + "epoch": 2.72, + "grad_norm": 0.51171875, + "learning_rate": 0.0004899359236158979, + "loss": 0.2208, + "step": 65730 + }, + { + "epoch": 2.72, + "grad_norm": 0.546875, + "learning_rate": 0.0004899328772214168, + "loss": 0.2189, + "step": 65740 + }, + { + "epoch": 2.72, + "grad_norm": 0.85546875, + "learning_rate": 0.000489929830375408, + "loss": 0.2113, + "step": 65750 + }, + { + "epoch": 2.72, + "grad_norm": 0.53125, + "learning_rate": 0.0004899267830778773, + "loss": 0.2538, + "step": 65760 + }, + { + "epoch": 2.72, + "grad_norm": 1.390625, + "learning_rate": 0.0004899237353288301, + "loss": 0.1961, + "step": 65770 + }, + { + "epoch": 2.72, + "grad_norm": 0.6640625, + "learning_rate": 0.0004899206871282725, + "loss": 0.2597, + "step": 65780 + }, + { + "epoch": 2.73, + "grad_norm": 0.32421875, + "learning_rate": 0.00048991763847621, + "loss": 0.2074, + "step": 65790 + }, + { + "epoch": 2.73, + "grad_norm": 0.25, + "learning_rate": 0.0004899145893726485, + "loss": 0.2205, + "step": 65800 + }, + { + "epoch": 2.73, + "grad_norm": 0.81640625, + "learning_rate": 0.0004899115398175936, + "loss": 0.2056, + "step": 65810 + }, + { + "epoch": 2.73, + "grad_norm": 0.474609375, + "learning_rate": 0.000489908489811051, + "loss": 0.1773, + "step": 65820 + }, + { + "epoch": 2.73, + "grad_norm": 0.51953125, + "learning_rate": 0.0004899054393530266, + "loss": 0.2303, + "step": 65830 + }, + { + "epoch": 2.73, + "grad_norm": 0.671875, + "learning_rate": 0.0004899023884435262, + "loss": 0.1698, + "step": 65840 + }, + { + "epoch": 2.73, + "grad_norm": 0.69921875, + "learning_rate": 0.0004898993370825552, + "loss": 0.2198, + "step": 65850 + }, + { + "epoch": 2.73, + "grad_norm": 0.4375, + "learning_rate": 0.0004898962852701197, + "loss": 0.2015, + "step": 65860 + }, + { + "epoch": 2.73, + "grad_norm": 0.5078125, + "learning_rate": 0.0004898932330062252, + "loss": 0.2094, + "step": 65870 + }, + { + "epoch": 2.73, + "grad_norm": 0.443359375, + "learning_rate": 0.0004898901802908776, + "loss": 0.1872, + "step": 65880 + }, + { + "epoch": 2.73, + "grad_norm": 0.478515625, + "learning_rate": 0.0004898871271240826, + "loss": 0.2241, + "step": 65890 + }, + { + "epoch": 2.73, + "grad_norm": 0.65234375, + "learning_rate": 0.0004898840735058459, + "loss": 0.2008, + "step": 65900 + }, + { + "epoch": 2.73, + "grad_norm": 0.265625, + "learning_rate": 0.0004898810194361733, + "loss": 0.1941, + "step": 65910 + }, + { + "epoch": 2.73, + "grad_norm": 1.09375, + "learning_rate": 0.0004898779649150705, + "loss": 0.1997, + "step": 65920 + }, + { + "epoch": 2.73, + "grad_norm": 0.37109375, + "learning_rate": 0.0004898749099425432, + "loss": 0.2374, + "step": 65930 + }, + { + "epoch": 2.73, + "grad_norm": 1.25, + "learning_rate": 0.0004898718545185974, + "loss": 0.1986, + "step": 65940 + }, + { + "epoch": 2.73, + "grad_norm": 0.56640625, + "learning_rate": 0.0004898687986432386, + "loss": 0.183, + "step": 65950 + }, + { + "epoch": 2.73, + "grad_norm": 1.75, + "learning_rate": 0.0004898657423164726, + "loss": 0.2511, + "step": 65960 + }, + { + "epoch": 2.73, + "grad_norm": 0.55859375, + "learning_rate": 0.0004898626855383052, + "loss": 0.1712, + "step": 65970 + }, + { + "epoch": 2.73, + "grad_norm": 0.55078125, + "learning_rate": 0.0004898596283087421, + "loss": 0.1912, + "step": 65980 + }, + { + "epoch": 2.73, + "grad_norm": 0.76171875, + "learning_rate": 0.000489856570627789, + "loss": 0.2211, + "step": 65990 + }, + { + "epoch": 2.73, + "grad_norm": 2.046875, + "learning_rate": 0.000489853512495452, + "loss": 0.2654, + "step": 66000 + }, + { + "epoch": 2.73, + "grad_norm": 0.953125, + "learning_rate": 0.0004898504539117363, + "loss": 0.1972, + "step": 66010 + }, + { + "epoch": 2.73, + "grad_norm": 0.69921875, + "learning_rate": 0.0004898473948766482, + "loss": 0.2439, + "step": 66020 + }, + { + "epoch": 2.73, + "grad_norm": 0.65234375, + "learning_rate": 0.0004898443353901931, + "loss": 0.2101, + "step": 66030 + }, + { + "epoch": 2.74, + "grad_norm": 0.7109375, + "learning_rate": 0.0004898412754523768, + "loss": 0.2153, + "step": 66040 + }, + { + "epoch": 2.74, + "grad_norm": 0.88671875, + "learning_rate": 0.0004898382150632051, + "loss": 0.2481, + "step": 66050 + }, + { + "epoch": 2.74, + "grad_norm": 0.82421875, + "learning_rate": 0.0004898351542226839, + "loss": 0.2676, + "step": 66060 + }, + { + "epoch": 2.74, + "grad_norm": 1.125, + "learning_rate": 0.0004898320929308187, + "loss": 0.1833, + "step": 66070 + }, + { + "epoch": 2.74, + "grad_norm": 0.439453125, + "learning_rate": 0.0004898290311876154, + "loss": 0.2148, + "step": 66080 + }, + { + "epoch": 2.74, + "grad_norm": 0.88671875, + "learning_rate": 0.0004898259689930799, + "loss": 0.2303, + "step": 66090 + }, + { + "epoch": 2.74, + "grad_norm": 0.88671875, + "learning_rate": 0.0004898229063472179, + "loss": 0.227, + "step": 66100 + }, + { + "epoch": 2.74, + "grad_norm": 0.546875, + "learning_rate": 0.0004898198432500349, + "loss": 0.2184, + "step": 66110 + }, + { + "epoch": 2.74, + "grad_norm": 0.69140625, + "learning_rate": 0.0004898167797015369, + "loss": 0.2214, + "step": 66120 + }, + { + "epoch": 2.74, + "grad_norm": 0.341796875, + "learning_rate": 0.0004898137157017296, + "loss": 0.1859, + "step": 66130 + }, + { + "epoch": 2.74, + "grad_norm": 1.28125, + "learning_rate": 0.0004898106512506188, + "loss": 0.2696, + "step": 66140 + }, + { + "epoch": 2.74, + "grad_norm": 0.2373046875, + "learning_rate": 0.0004898075863482103, + "loss": 0.2007, + "step": 66150 + }, + { + "epoch": 2.74, + "grad_norm": 0.234375, + "learning_rate": 0.0004898045209945097, + "loss": 0.1943, + "step": 66160 + }, + { + "epoch": 2.74, + "grad_norm": 2.1875, + "learning_rate": 0.000489801455189523, + "loss": 0.2213, + "step": 66170 + }, + { + "epoch": 2.74, + "grad_norm": 0.40234375, + "learning_rate": 0.0004897983889332559, + "loss": 0.207, + "step": 66180 + }, + { + "epoch": 2.74, + "grad_norm": 0.376953125, + "learning_rate": 0.000489795322225714, + "loss": 0.2231, + "step": 66190 + }, + { + "epoch": 2.74, + "grad_norm": 0.953125, + "learning_rate": 0.0004897922550669033, + "loss": 0.2329, + "step": 66200 + }, + { + "epoch": 2.74, + "grad_norm": 0.2451171875, + "learning_rate": 0.0004897891874568294, + "loss": 0.2301, + "step": 66210 + }, + { + "epoch": 2.74, + "grad_norm": 0.546875, + "learning_rate": 0.0004897861193954981, + "loss": 0.221, + "step": 66220 + }, + { + "epoch": 2.74, + "grad_norm": 0.353515625, + "learning_rate": 0.0004897830508829153, + "loss": 0.2039, + "step": 66230 + }, + { + "epoch": 2.74, + "grad_norm": 0.5703125, + "learning_rate": 0.0004897799819190867, + "loss": 0.2312, + "step": 66240 + }, + { + "epoch": 2.74, + "grad_norm": 0.50390625, + "learning_rate": 0.000489776912504018, + "loss": 0.1967, + "step": 66250 + }, + { + "epoch": 2.74, + "grad_norm": 1.671875, + "learning_rate": 0.0004897738426377152, + "loss": 0.2402, + "step": 66260 + }, + { + "epoch": 2.74, + "grad_norm": 1.375, + "learning_rate": 0.0004897707723201838, + "loss": 0.1885, + "step": 66270 + }, + { + "epoch": 2.75, + "grad_norm": 0.66796875, + "learning_rate": 0.0004897677015514297, + "loss": 0.1738, + "step": 66280 + }, + { + "epoch": 2.75, + "grad_norm": 0.52734375, + "learning_rate": 0.0004897646303314586, + "loss": 0.1957, + "step": 66290 + }, + { + "epoch": 2.75, + "grad_norm": 0.4296875, + "learning_rate": 0.0004897615586602765, + "loss": 0.2331, + "step": 66300 + }, + { + "epoch": 2.75, + "grad_norm": 0.57421875, + "learning_rate": 0.000489758486537889, + "loss": 0.1977, + "step": 66310 + }, + { + "epoch": 2.75, + "grad_norm": 0.44921875, + "learning_rate": 0.0004897554139643019, + "loss": 0.2046, + "step": 66320 + }, + { + "epoch": 2.75, + "grad_norm": 0.99609375, + "learning_rate": 0.0004897523409395211, + "loss": 0.1958, + "step": 66330 + }, + { + "epoch": 2.75, + "grad_norm": 0.9921875, + "learning_rate": 0.0004897492674635521, + "loss": 0.2593, + "step": 66340 + }, + { + "epoch": 2.75, + "grad_norm": 0.57421875, + "learning_rate": 0.000489746193536401, + "loss": 0.2239, + "step": 66350 + }, + { + "epoch": 2.75, + "grad_norm": 0.52734375, + "learning_rate": 0.0004897431191580734, + "loss": 0.2394, + "step": 66360 + }, + { + "epoch": 2.75, + "grad_norm": 0.8515625, + "learning_rate": 0.0004897400443285751, + "loss": 0.2177, + "step": 66370 + }, + { + "epoch": 2.75, + "grad_norm": 0.365234375, + "learning_rate": 0.0004897369690479121, + "loss": 0.231, + "step": 66380 + }, + { + "epoch": 2.75, + "grad_norm": 0.59765625, + "learning_rate": 0.0004897338933160898, + "loss": 0.2095, + "step": 66390 + }, + { + "epoch": 2.75, + "grad_norm": 0.3984375, + "learning_rate": 0.0004897308171331144, + "loss": 0.218, + "step": 66400 + }, + { + "epoch": 2.75, + "grad_norm": 0.390625, + "learning_rate": 0.0004897277404989914, + "loss": 0.2146, + "step": 66410 + }, + { + "epoch": 2.75, + "grad_norm": 2.109375, + "learning_rate": 0.0004897246634137266, + "loss": 0.1931, + "step": 66420 + }, + { + "epoch": 2.75, + "grad_norm": 0.43359375, + "learning_rate": 0.000489721585877326, + "loss": 0.1828, + "step": 66430 + }, + { + "epoch": 2.75, + "grad_norm": 1.09375, + "learning_rate": 0.0004897185078897952, + "loss": 0.222, + "step": 66440 + }, + { + "epoch": 2.75, + "grad_norm": 0.36328125, + "learning_rate": 0.00048971542945114, + "loss": 0.2344, + "step": 66450 + }, + { + "epoch": 2.75, + "grad_norm": 0.6015625, + "learning_rate": 0.0004897123505613663, + "loss": 0.1823, + "step": 66460 + }, + { + "epoch": 2.75, + "grad_norm": 0.5078125, + "learning_rate": 0.00048970927122048, + "loss": 0.2224, + "step": 66470 + }, + { + "epoch": 2.75, + "grad_norm": 0.83203125, + "learning_rate": 0.0004897061914284865, + "loss": 0.1823, + "step": 66480 + }, + { + "epoch": 2.75, + "grad_norm": 0.47265625, + "learning_rate": 0.0004897031111853919, + "loss": 0.2261, + "step": 66490 + }, + { + "epoch": 2.75, + "grad_norm": 0.80859375, + "learning_rate": 0.0004897000304912019, + "loss": 0.2442, + "step": 66500 + }, + { + "epoch": 2.75, + "grad_norm": 2.03125, + "learning_rate": 0.0004896969493459225, + "loss": 0.2498, + "step": 66510 + }, + { + "epoch": 2.76, + "grad_norm": 0.34765625, + "learning_rate": 0.0004896938677495592, + "loss": 0.2443, + "step": 66520 + }, + { + "epoch": 2.76, + "grad_norm": 0.875, + "learning_rate": 0.0004896907857021179, + "loss": 0.2346, + "step": 66530 + }, + { + "epoch": 2.76, + "grad_norm": 0.73828125, + "learning_rate": 0.0004896877032036044, + "loss": 0.2275, + "step": 66540 + }, + { + "epoch": 2.76, + "grad_norm": 0.7734375, + "learning_rate": 0.0004896846202540247, + "loss": 0.1967, + "step": 66550 + }, + { + "epoch": 2.76, + "grad_norm": 0.57421875, + "learning_rate": 0.0004896815368533843, + "loss": 0.2083, + "step": 66560 + }, + { + "epoch": 2.76, + "grad_norm": 0.515625, + "learning_rate": 0.0004896784530016891, + "loss": 0.219, + "step": 66570 + }, + { + "epoch": 2.76, + "grad_norm": 0.56640625, + "learning_rate": 0.000489675368698945, + "loss": 0.2243, + "step": 66580 + }, + { + "epoch": 2.76, + "grad_norm": 0.6875, + "learning_rate": 0.0004896722839451578, + "loss": 0.2223, + "step": 66590 + }, + { + "epoch": 2.76, + "grad_norm": 0.419921875, + "learning_rate": 0.0004896691987403331, + "loss": 0.1834, + "step": 66600 + }, + { + "epoch": 2.76, + "grad_norm": 1.90625, + "learning_rate": 0.0004896661130844769, + "loss": 0.1928, + "step": 66610 + }, + { + "epoch": 2.76, + "grad_norm": 2.390625, + "learning_rate": 0.0004896630269775951, + "loss": 0.2143, + "step": 66620 + }, + { + "epoch": 2.76, + "grad_norm": 1.1171875, + "learning_rate": 0.0004896599404196931, + "loss": 0.2185, + "step": 66630 + }, + { + "epoch": 2.76, + "grad_norm": 0.353515625, + "learning_rate": 0.0004896568534107771, + "loss": 0.2259, + "step": 66640 + }, + { + "epoch": 2.76, + "grad_norm": 0.76953125, + "learning_rate": 0.0004896537659508528, + "loss": 0.2442, + "step": 66650 + }, + { + "epoch": 2.76, + "grad_norm": 0.4765625, + "learning_rate": 0.0004896506780399261, + "loss": 0.2105, + "step": 66660 + }, + { + "epoch": 2.76, + "grad_norm": 0.73828125, + "learning_rate": 0.0004896475896780025, + "loss": 0.2442, + "step": 66670 + }, + { + "epoch": 2.76, + "grad_norm": 0.259765625, + "learning_rate": 0.0004896445008650881, + "loss": 0.2178, + "step": 66680 + }, + { + "epoch": 2.76, + "grad_norm": 0.8125, + "learning_rate": 0.0004896414116011886, + "loss": 0.2163, + "step": 66690 + }, + { + "epoch": 2.76, + "grad_norm": 0.59375, + "learning_rate": 0.00048963832188631, + "loss": 0.189, + "step": 66700 + }, + { + "epoch": 2.76, + "grad_norm": 0.55859375, + "learning_rate": 0.0004896352317204578, + "loss": 0.2209, + "step": 66710 + }, + { + "epoch": 2.76, + "grad_norm": 0.73046875, + "learning_rate": 0.0004896321411036379, + "loss": 0.2082, + "step": 66720 + }, + { + "epoch": 2.76, + "grad_norm": 0.96484375, + "learning_rate": 0.0004896290500358563, + "loss": 0.2549, + "step": 66730 + }, + { + "epoch": 2.76, + "grad_norm": 0.5859375, + "learning_rate": 0.0004896259585171188, + "loss": 0.1923, + "step": 66740 + }, + { + "epoch": 2.76, + "grad_norm": 0.65234375, + "learning_rate": 0.000489622866547431, + "loss": 0.2185, + "step": 66750 + }, + { + "epoch": 2.77, + "grad_norm": 0.5078125, + "learning_rate": 0.0004896197741267989, + "loss": 0.2551, + "step": 66760 + }, + { + "epoch": 2.77, + "grad_norm": 0.71484375, + "learning_rate": 0.0004896166812552281, + "loss": 0.1724, + "step": 66770 + }, + { + "epoch": 2.77, + "grad_norm": 0.98046875, + "learning_rate": 0.0004896135879327248, + "loss": 0.2074, + "step": 66780 + }, + { + "epoch": 2.77, + "grad_norm": 0.490234375, + "learning_rate": 0.0004896104941592945, + "loss": 0.2355, + "step": 66790 + }, + { + "epoch": 2.77, + "grad_norm": 0.43359375, + "learning_rate": 0.0004896073999349431, + "loss": 0.2075, + "step": 66800 + }, + { + "epoch": 2.77, + "grad_norm": 0.60546875, + "learning_rate": 0.0004896043052596765, + "loss": 0.241, + "step": 66810 + }, + { + "epoch": 2.77, + "grad_norm": 0.44140625, + "learning_rate": 0.0004896012101335004, + "loss": 0.1948, + "step": 66820 + }, + { + "epoch": 2.77, + "grad_norm": 0.9609375, + "learning_rate": 0.0004895981145564208, + "loss": 0.1956, + "step": 66830 + }, + { + "epoch": 2.77, + "grad_norm": 1.1171875, + "learning_rate": 0.0004895950185284434, + "loss": 0.2345, + "step": 66840 + }, + { + "epoch": 2.77, + "grad_norm": 0.703125, + "learning_rate": 0.000489591922049574, + "loss": 0.2196, + "step": 66850 + }, + { + "epoch": 2.77, + "grad_norm": 1.140625, + "learning_rate": 0.0004895888251198186, + "loss": 0.2562, + "step": 66860 + }, + { + "epoch": 2.77, + "grad_norm": 2.109375, + "learning_rate": 0.0004895857277391828, + "loss": 0.215, + "step": 66870 + }, + { + "epoch": 2.77, + "grad_norm": 0.2578125, + "learning_rate": 0.0004895826299076725, + "loss": 0.2026, + "step": 66880 + }, + { + "epoch": 2.77, + "grad_norm": 0.384765625, + "learning_rate": 0.0004895795316252937, + "loss": 0.2068, + "step": 66890 + }, + { + "epoch": 2.77, + "grad_norm": 0.279296875, + "learning_rate": 0.0004895764328920519, + "loss": 0.2188, + "step": 66900 + }, + { + "epoch": 2.77, + "grad_norm": 0.56640625, + "learning_rate": 0.0004895733337079532, + "loss": 0.2281, + "step": 66910 + }, + { + "epoch": 2.77, + "grad_norm": 0.94140625, + "learning_rate": 0.0004895702340730034, + "loss": 0.2336, + "step": 66920 + }, + { + "epoch": 2.77, + "grad_norm": 1.1171875, + "learning_rate": 0.0004895671339872083, + "loss": 0.2217, + "step": 66930 + }, + { + "epoch": 2.77, + "grad_norm": 0.0, + "learning_rate": 0.0004895640334505736, + "loss": 0.1799, + "step": 66940 + }, + { + "epoch": 2.77, + "grad_norm": 0.50390625, + "learning_rate": 0.0004895609324631054, + "loss": 0.1533, + "step": 66950 + }, + { + "epoch": 2.77, + "grad_norm": 0.6171875, + "learning_rate": 0.0004895578310248095, + "loss": 0.2478, + "step": 66960 + }, + { + "epoch": 2.77, + "grad_norm": 1.375, + "learning_rate": 0.0004895547291356914, + "loss": 0.1967, + "step": 66970 + }, + { + "epoch": 2.77, + "grad_norm": 2.09375, + "learning_rate": 0.0004895516267957573, + "loss": 0.1752, + "step": 66980 + }, + { + "epoch": 2.77, + "grad_norm": 0.6875, + "learning_rate": 0.0004895485240050128, + "loss": 0.1919, + "step": 66990 + }, + { + "epoch": 2.78, + "grad_norm": 1.1328125, + "learning_rate": 0.0004895454207634639, + "loss": 0.2074, + "step": 67000 + }, + { + "epoch": 2.78, + "grad_norm": 0.68359375, + "learning_rate": 0.0004895423170711164, + "loss": 0.2216, + "step": 67010 + }, + { + "epoch": 2.78, + "grad_norm": 1.6640625, + "learning_rate": 0.0004895392129279761, + "loss": 0.2734, + "step": 67020 + }, + { + "epoch": 2.78, + "grad_norm": 0.60546875, + "learning_rate": 0.0004895361083340489, + "loss": 0.2055, + "step": 67030 + }, + { + "epoch": 2.78, + "grad_norm": 0.43359375, + "learning_rate": 0.0004895330032893406, + "loss": 0.1994, + "step": 67040 + }, + { + "epoch": 2.78, + "grad_norm": 1.078125, + "learning_rate": 0.0004895298977938571, + "loss": 0.2051, + "step": 67050 + }, + { + "epoch": 2.78, + "grad_norm": 0.365234375, + "learning_rate": 0.0004895267918476042, + "loss": 0.2393, + "step": 67060 + }, + { + "epoch": 2.78, + "grad_norm": 1.0703125, + "learning_rate": 0.0004895236854505876, + "loss": 0.2517, + "step": 67070 + }, + { + "epoch": 2.78, + "grad_norm": 0.93359375, + "learning_rate": 0.0004895205786028135, + "loss": 0.2648, + "step": 67080 + }, + { + "epoch": 2.78, + "grad_norm": 0.66015625, + "learning_rate": 0.0004895174713042873, + "loss": 0.2076, + "step": 67090 + }, + { + "epoch": 2.78, + "grad_norm": 0.546875, + "learning_rate": 0.0004895143635550153, + "loss": 0.2602, + "step": 67100 + }, + { + "epoch": 2.78, + "grad_norm": 0.6875, + "learning_rate": 0.0004895112553550031, + "loss": 0.2172, + "step": 67110 + }, + { + "epoch": 2.78, + "grad_norm": 0.7890625, + "learning_rate": 0.0004895081467042564, + "loss": 0.2144, + "step": 67120 + }, + { + "epoch": 2.78, + "grad_norm": 0.66796875, + "learning_rate": 0.0004895050376027815, + "loss": 0.235, + "step": 67130 + }, + { + "epoch": 2.78, + "grad_norm": 0.80078125, + "learning_rate": 0.0004895019280505838, + "loss": 0.1556, + "step": 67140 + }, + { + "epoch": 2.78, + "grad_norm": 0.9765625, + "learning_rate": 0.0004894988180476694, + "loss": 0.2253, + "step": 67150 + }, + { + "epoch": 2.78, + "grad_norm": 0.462890625, + "learning_rate": 0.0004894957075940441, + "loss": 0.2965, + "step": 67160 + }, + { + "epoch": 2.78, + "grad_norm": 0.53125, + "learning_rate": 0.0004894925966897137, + "loss": 0.1994, + "step": 67170 + }, + { + "epoch": 2.78, + "grad_norm": 0.71484375, + "learning_rate": 0.0004894894853346841, + "loss": 0.2718, + "step": 67180 + }, + { + "epoch": 2.78, + "grad_norm": 2.0625, + "learning_rate": 0.0004894863735289611, + "loss": 0.2231, + "step": 67190 + }, + { + "epoch": 2.78, + "grad_norm": 1.0078125, + "learning_rate": 0.0004894832612725508, + "loss": 0.2546, + "step": 67200 + }, + { + "epoch": 2.78, + "grad_norm": 0.0, + "learning_rate": 0.0004894801485654587, + "loss": 0.2446, + "step": 67210 + }, + { + "epoch": 2.78, + "grad_norm": 1.5703125, + "learning_rate": 0.0004894770354076908, + "loss": 0.2493, + "step": 67220 + }, + { + "epoch": 2.78, + "grad_norm": 0.6875, + "learning_rate": 0.0004894739217992531, + "loss": 0.1698, + "step": 67230 + }, + { + "epoch": 2.79, + "grad_norm": 0.625, + "learning_rate": 0.0004894708077401512, + "loss": 0.2305, + "step": 67240 + }, + { + "epoch": 2.79, + "grad_norm": 0.88671875, + "learning_rate": 0.0004894676932303912, + "loss": 0.1799, + "step": 67250 + }, + { + "epoch": 2.79, + "grad_norm": 0.34765625, + "learning_rate": 0.0004894645782699788, + "loss": 0.275, + "step": 67260 + }, + { + "epoch": 2.79, + "grad_norm": 0.66796875, + "learning_rate": 0.00048946146285892, + "loss": 0.2098, + "step": 67270 + }, + { + "epoch": 2.79, + "grad_norm": 0.91015625, + "learning_rate": 0.0004894583469972206, + "loss": 0.1712, + "step": 67280 + }, + { + "epoch": 2.79, + "grad_norm": 0.44140625, + "learning_rate": 0.0004894552306848863, + "loss": 0.2385, + "step": 67290 + }, + { + "epoch": 2.79, + "grad_norm": 0.32421875, + "learning_rate": 0.0004894521139219232, + "loss": 0.2139, + "step": 67300 + }, + { + "epoch": 2.79, + "grad_norm": 0.703125, + "learning_rate": 0.0004894489967083371, + "loss": 0.2432, + "step": 67310 + }, + { + "epoch": 2.79, + "grad_norm": 0.9375, + "learning_rate": 0.0004894458790441338, + "loss": 0.2582, + "step": 67320 + }, + { + "epoch": 2.79, + "grad_norm": 0.28515625, + "learning_rate": 0.0004894427609293193, + "loss": 0.198, + "step": 67330 + }, + { + "epoch": 2.79, + "grad_norm": 0.2578125, + "learning_rate": 0.0004894396423638994, + "loss": 0.2052, + "step": 67340 + }, + { + "epoch": 2.79, + "grad_norm": 0.431640625, + "learning_rate": 0.0004894365233478798, + "loss": 0.27, + "step": 67350 + }, + { + "epoch": 2.79, + "grad_norm": 0.439453125, + "learning_rate": 0.0004894334038812667, + "loss": 0.2495, + "step": 67360 + }, + { + "epoch": 2.79, + "grad_norm": 0.38671875, + "learning_rate": 0.0004894302839640656, + "loss": 0.2903, + "step": 67370 + }, + { + "epoch": 2.79, + "grad_norm": 0.734375, + "learning_rate": 0.0004894271635962827, + "loss": 0.2246, + "step": 67380 + }, + { + "epoch": 2.79, + "grad_norm": 0.61328125, + "learning_rate": 0.0004894240427779237, + "loss": 0.1843, + "step": 67390 + }, + { + "epoch": 2.79, + "grad_norm": 0.96484375, + "learning_rate": 0.0004894209215089945, + "loss": 0.2121, + "step": 67400 + }, + { + "epoch": 2.79, + "grad_norm": 0.84375, + "learning_rate": 0.0004894177997895009, + "loss": 0.2021, + "step": 67410 + }, + { + "epoch": 2.79, + "grad_norm": 0.498046875, + "learning_rate": 0.000489414677619449, + "loss": 0.2598, + "step": 67420 + }, + { + "epoch": 2.79, + "grad_norm": 1.3046875, + "learning_rate": 0.0004894115549988444, + "loss": 0.2266, + "step": 67430 + }, + { + "epoch": 2.79, + "grad_norm": 0.7265625, + "learning_rate": 0.0004894084319276933, + "loss": 0.2198, + "step": 67440 + }, + { + "epoch": 2.79, + "grad_norm": 0.37109375, + "learning_rate": 0.0004894053084060012, + "loss": 0.2329, + "step": 67450 + }, + { + "epoch": 2.79, + "grad_norm": 0.484375, + "learning_rate": 0.0004894021844337742, + "loss": 0.1631, + "step": 67460 + }, + { + "epoch": 2.79, + "grad_norm": 1.1953125, + "learning_rate": 0.0004893990600110182, + "loss": 0.2588, + "step": 67470 + }, + { + "epoch": 2.8, + "grad_norm": 0.578125, + "learning_rate": 0.0004893959351377389, + "loss": 0.2114, + "step": 67480 + }, + { + "epoch": 2.8, + "grad_norm": 0.44921875, + "learning_rate": 0.0004893928098139424, + "loss": 0.1634, + "step": 67490 + }, + { + "epoch": 2.8, + "grad_norm": 0.392578125, + "learning_rate": 0.0004893896840396345, + "loss": 0.2434, + "step": 67500 + }, + { + "epoch": 2.8, + "grad_norm": 0.76171875, + "learning_rate": 0.0004893865578148211, + "loss": 0.2413, + "step": 67510 + }, + { + "epoch": 2.8, + "grad_norm": 1.3671875, + "learning_rate": 0.0004893834311395079, + "loss": 0.2334, + "step": 67520 + }, + { + "epoch": 2.8, + "grad_norm": 0.57421875, + "learning_rate": 0.000489380304013701, + "loss": 0.2389, + "step": 67530 + }, + { + "epoch": 2.8, + "grad_norm": 0.470703125, + "learning_rate": 0.0004893771764374063, + "loss": 0.221, + "step": 67540 + }, + { + "epoch": 2.8, + "grad_norm": 0.458984375, + "learning_rate": 0.0004893740484106296, + "loss": 0.2291, + "step": 67550 + }, + { + "epoch": 2.8, + "grad_norm": 0.6875, + "learning_rate": 0.0004893709199333767, + "loss": 0.2297, + "step": 67560 + }, + { + "epoch": 2.8, + "grad_norm": 2.5625, + "learning_rate": 0.0004893677910056535, + "loss": 0.2034, + "step": 67570 + }, + { + "epoch": 2.8, + "grad_norm": 2.125, + "learning_rate": 0.0004893646616274662, + "loss": 0.2321, + "step": 67580 + }, + { + "epoch": 2.8, + "grad_norm": 1.7734375, + "learning_rate": 0.0004893615317988203, + "loss": 0.2003, + "step": 67590 + }, + { + "epoch": 2.8, + "grad_norm": 0.5625, + "learning_rate": 0.0004893584015197218, + "loss": 0.1991, + "step": 67600 + }, + { + "epoch": 2.8, + "grad_norm": 0.5703125, + "learning_rate": 0.0004893552707901769, + "loss": 0.2361, + "step": 67610 + }, + { + "epoch": 2.8, + "grad_norm": 0.546875, + "learning_rate": 0.000489352139610191, + "loss": 0.2462, + "step": 67620 + }, + { + "epoch": 2.8, + "grad_norm": 1.6796875, + "learning_rate": 0.0004893490079797702, + "loss": 0.1764, + "step": 67630 + }, + { + "epoch": 2.8, + "grad_norm": 0.609375, + "learning_rate": 0.0004893458758989205, + "loss": 0.2177, + "step": 67640 + }, + { + "epoch": 2.8, + "grad_norm": 0.51171875, + "learning_rate": 0.0004893427433676477, + "loss": 0.2256, + "step": 67650 + }, + { + "epoch": 2.8, + "grad_norm": 0.357421875, + "learning_rate": 0.0004893396103859578, + "loss": 0.2207, + "step": 67660 + }, + { + "epoch": 2.8, + "grad_norm": 1.15625, + "learning_rate": 0.0004893364769538564, + "loss": 0.2522, + "step": 67670 + }, + { + "epoch": 2.8, + "grad_norm": 1.0, + "learning_rate": 0.0004893333430713496, + "loss": 0.2676, + "step": 67680 + }, + { + "epoch": 2.8, + "grad_norm": 0.546875, + "learning_rate": 0.0004893302087384435, + "loss": 0.2146, + "step": 67690 + }, + { + "epoch": 2.8, + "grad_norm": 0.2119140625, + "learning_rate": 0.0004893270739551437, + "loss": 0.1853, + "step": 67700 + }, + { + "epoch": 2.8, + "grad_norm": 0.30859375, + "learning_rate": 0.0004893239387214561, + "loss": 0.2197, + "step": 67710 + }, + { + "epoch": 2.8, + "grad_norm": 0.76171875, + "learning_rate": 0.0004893208030373868, + "loss": 0.2113, + "step": 67720 + }, + { + "epoch": 2.81, + "grad_norm": 0.6875, + "learning_rate": 0.0004893176669029416, + "loss": 0.2232, + "step": 67730 + }, + { + "epoch": 2.81, + "grad_norm": 0.65625, + "learning_rate": 0.0004893145303181264, + "loss": 0.2224, + "step": 67740 + }, + { + "epoch": 2.81, + "grad_norm": 1.1796875, + "learning_rate": 0.000489311393282947, + "loss": 0.2048, + "step": 67750 + }, + { + "epoch": 2.81, + "grad_norm": 1.3125, + "learning_rate": 0.0004893082557974094, + "loss": 0.2256, + "step": 67760 + }, + { + "epoch": 2.81, + "grad_norm": 1.171875, + "learning_rate": 0.0004893051178615196, + "loss": 0.1869, + "step": 67770 + }, + { + "epoch": 2.81, + "grad_norm": 0.5390625, + "learning_rate": 0.0004893019794752834, + "loss": 0.1727, + "step": 67780 + }, + { + "epoch": 2.81, + "grad_norm": 0.3671875, + "learning_rate": 0.0004892988406387066, + "loss": 0.2418, + "step": 67790 + }, + { + "epoch": 2.81, + "grad_norm": 1.34375, + "learning_rate": 0.0004892957013517954, + "loss": 0.2518, + "step": 67800 + }, + { + "epoch": 2.81, + "grad_norm": 0.38671875, + "learning_rate": 0.0004892925616145554, + "loss": 0.1807, + "step": 67810 + }, + { + "epoch": 2.81, + "grad_norm": 0.78125, + "learning_rate": 0.0004892894214269927, + "loss": 0.2147, + "step": 67820 + }, + { + "epoch": 2.81, + "grad_norm": 0.62109375, + "learning_rate": 0.0004892862807891131, + "loss": 0.208, + "step": 67830 + }, + { + "epoch": 2.81, + "grad_norm": 0.68359375, + "learning_rate": 0.0004892831397009226, + "loss": 0.2587, + "step": 67840 + }, + { + "epoch": 2.81, + "grad_norm": 0.71875, + "learning_rate": 0.000489279998162427, + "loss": 0.2025, + "step": 67850 + }, + { + "epoch": 2.81, + "grad_norm": 0.65625, + "learning_rate": 0.0004892768561736324, + "loss": 0.2146, + "step": 67860 + }, + { + "epoch": 2.81, + "grad_norm": 2.515625, + "learning_rate": 0.0004892737137345446, + "loss": 0.1868, + "step": 67870 + }, + { + "epoch": 2.81, + "grad_norm": 0.94921875, + "learning_rate": 0.0004892705708451694, + "loss": 0.2395, + "step": 67880 + }, + { + "epoch": 2.81, + "grad_norm": 0.96875, + "learning_rate": 0.0004892674275055128, + "loss": 0.2198, + "step": 67890 + }, + { + "epoch": 2.81, + "grad_norm": 0.47265625, + "learning_rate": 0.0004892642837155809, + "loss": 0.2247, + "step": 67900 + }, + { + "epoch": 2.81, + "grad_norm": 0.76953125, + "learning_rate": 0.0004892611394753793, + "loss": 0.2274, + "step": 67910 + }, + { + "epoch": 2.81, + "grad_norm": 0.5859375, + "learning_rate": 0.0004892579947849142, + "loss": 0.17, + "step": 67920 + }, + { + "epoch": 2.81, + "grad_norm": 0.5, + "learning_rate": 0.0004892548496441913, + "loss": 0.1769, + "step": 67930 + }, + { + "epoch": 2.81, + "grad_norm": 1.1328125, + "learning_rate": 0.0004892517040532167, + "loss": 0.1818, + "step": 67940 + }, + { + "epoch": 2.81, + "grad_norm": 0.515625, + "learning_rate": 0.0004892485580119962, + "loss": 0.2297, + "step": 67950 + }, + { + "epoch": 2.81, + "grad_norm": 0.5546875, + "learning_rate": 0.0004892454115205357, + "loss": 0.1929, + "step": 67960 + }, + { + "epoch": 2.82, + "grad_norm": 1.1875, + "learning_rate": 0.0004892422645788411, + "loss": 0.1904, + "step": 67970 + }, + { + "epoch": 2.82, + "grad_norm": 0.578125, + "learning_rate": 0.0004892391171869185, + "loss": 0.1924, + "step": 67980 + }, + { + "epoch": 2.82, + "grad_norm": 0.59375, + "learning_rate": 0.0004892359693447738, + "loss": 0.2324, + "step": 67990 + }, + { + "epoch": 2.82, + "grad_norm": 0.1904296875, + "learning_rate": 0.0004892328210524127, + "loss": 0.203, + "step": 68000 + }, + { + "epoch": 2.82, + "grad_norm": 0.5, + "learning_rate": 0.0004892296723098414, + "loss": 0.1906, + "step": 68010 + }, + { + "epoch": 2.82, + "grad_norm": 0.734375, + "learning_rate": 0.0004892265231170656, + "loss": 0.2553, + "step": 68020 + }, + { + "epoch": 2.82, + "grad_norm": 0.33203125, + "learning_rate": 0.0004892233734740913, + "loss": 0.214, + "step": 68030 + }, + { + "epoch": 2.82, + "grad_norm": 0.4140625, + "learning_rate": 0.0004892202233809244, + "loss": 0.1816, + "step": 68040 + }, + { + "epoch": 2.82, + "grad_norm": 0.48046875, + "learning_rate": 0.000489217072837571, + "loss": 0.1917, + "step": 68050 + }, + { + "epoch": 2.82, + "grad_norm": 1.1015625, + "learning_rate": 0.0004892139218440368, + "loss": 0.2178, + "step": 68060 + }, + { + "epoch": 2.82, + "grad_norm": 0.77734375, + "learning_rate": 0.0004892107704003279, + "loss": 0.2183, + "step": 68070 + }, + { + "epoch": 2.82, + "grad_norm": 0.52734375, + "learning_rate": 0.0004892076185064501, + "loss": 0.2621, + "step": 68080 + }, + { + "epoch": 2.82, + "grad_norm": 0.36328125, + "learning_rate": 0.0004892044661624095, + "loss": 0.214, + "step": 68090 + }, + { + "epoch": 2.82, + "grad_norm": 0.61328125, + "learning_rate": 0.0004892013133682118, + "loss": 0.1619, + "step": 68100 + }, + { + "epoch": 2.82, + "grad_norm": 0.85546875, + "learning_rate": 0.0004891981601238632, + "loss": 0.1992, + "step": 68110 + }, + { + "epoch": 2.82, + "grad_norm": 0.59765625, + "learning_rate": 0.0004891950064293694, + "loss": 0.2123, + "step": 68120 + }, + { + "epoch": 2.82, + "grad_norm": 0.84375, + "learning_rate": 0.0004891918522847364, + "loss": 0.2003, + "step": 68130 + }, + { + "epoch": 2.82, + "grad_norm": 0.400390625, + "learning_rate": 0.0004891886976899702, + "loss": 0.2831, + "step": 68140 + }, + { + "epoch": 2.82, + "grad_norm": 1.171875, + "learning_rate": 0.0004891855426450767, + "loss": 0.2239, + "step": 68150 + }, + { + "epoch": 2.82, + "grad_norm": 1.1328125, + "learning_rate": 0.0004891823871500619, + "loss": 0.1636, + "step": 68160 + }, + { + "epoch": 2.82, + "grad_norm": 1.8359375, + "learning_rate": 0.0004891792312049317, + "loss": 0.179, + "step": 68170 + }, + { + "epoch": 2.82, + "grad_norm": 0.8671875, + "learning_rate": 0.0004891760748096919, + "loss": 0.2148, + "step": 68180 + }, + { + "epoch": 2.82, + "grad_norm": 0.52734375, + "learning_rate": 0.0004891729179643486, + "loss": 0.1855, + "step": 68190 + }, + { + "epoch": 2.82, + "grad_norm": 0.42578125, + "learning_rate": 0.0004891697606689077, + "loss": 0.209, + "step": 68200 + }, + { + "epoch": 2.83, + "grad_norm": 0.48046875, + "learning_rate": 0.0004891666029233752, + "loss": 0.1878, + "step": 68210 + }, + { + "epoch": 2.83, + "grad_norm": 0.306640625, + "learning_rate": 0.0004891634447277568, + "loss": 0.1686, + "step": 68220 + }, + { + "epoch": 2.83, + "grad_norm": 1.34375, + "learning_rate": 0.0004891602860820587, + "loss": 0.2763, + "step": 68230 + }, + { + "epoch": 2.83, + "grad_norm": 1.046875, + "learning_rate": 0.0004891571269862869, + "loss": 0.2411, + "step": 68240 + }, + { + "epoch": 2.83, + "grad_norm": 0.466796875, + "learning_rate": 0.0004891539674404471, + "loss": 0.2543, + "step": 68250 + }, + { + "epoch": 2.83, + "grad_norm": 0.5859375, + "learning_rate": 0.0004891508074445453, + "loss": 0.2378, + "step": 68260 + }, + { + "epoch": 2.83, + "grad_norm": 0.87109375, + "learning_rate": 0.0004891476469985876, + "loss": 0.1626, + "step": 68270 + }, + { + "epoch": 2.83, + "grad_norm": 1.5, + "learning_rate": 0.0004891444861025799, + "loss": 0.2365, + "step": 68280 + }, + { + "epoch": 2.83, + "grad_norm": 1.1953125, + "learning_rate": 0.0004891413247565281, + "loss": 0.2761, + "step": 68290 + }, + { + "epoch": 2.83, + "grad_norm": 0.9296875, + "learning_rate": 0.000489138162960438, + "loss": 0.1765, + "step": 68300 + }, + { + "epoch": 2.83, + "grad_norm": 0.90234375, + "learning_rate": 0.0004891350007143158, + "loss": 0.2318, + "step": 68310 + }, + { + "epoch": 2.83, + "grad_norm": 1.4765625, + "learning_rate": 0.0004891318380181673, + "loss": 0.2124, + "step": 68320 + }, + { + "epoch": 2.83, + "grad_norm": 1.578125, + "learning_rate": 0.0004891286748719986, + "loss": 0.1802, + "step": 68330 + }, + { + "epoch": 2.83, + "grad_norm": 0.412109375, + "learning_rate": 0.0004891255112758155, + "loss": 0.1705, + "step": 68340 + }, + { + "epoch": 2.83, + "grad_norm": 0.6015625, + "learning_rate": 0.0004891223472296241, + "loss": 0.2042, + "step": 68350 + }, + { + "epoch": 2.83, + "grad_norm": 0.5546875, + "learning_rate": 0.0004891191827334302, + "loss": 0.182, + "step": 68360 + }, + { + "epoch": 2.83, + "grad_norm": 0.875, + "learning_rate": 0.0004891160177872398, + "loss": 0.227, + "step": 68370 + }, + { + "epoch": 2.83, + "grad_norm": 0.734375, + "learning_rate": 0.0004891128523910589, + "loss": 0.2051, + "step": 68380 + }, + { + "epoch": 2.83, + "grad_norm": 0.61328125, + "learning_rate": 0.0004891096865448934, + "loss": 0.2375, + "step": 68390 + }, + { + "epoch": 2.83, + "grad_norm": 0.84375, + "learning_rate": 0.0004891065202487492, + "loss": 0.2403, + "step": 68400 + }, + { + "epoch": 2.83, + "grad_norm": 0.66796875, + "learning_rate": 0.0004891033535026326, + "loss": 0.2545, + "step": 68410 + }, + { + "epoch": 2.83, + "grad_norm": 0.9765625, + "learning_rate": 0.0004891001863065491, + "loss": 0.2368, + "step": 68420 + }, + { + "epoch": 2.83, + "grad_norm": 1.640625, + "learning_rate": 0.000489097018660505, + "loss": 0.2179, + "step": 68430 + }, + { + "epoch": 2.83, + "grad_norm": 0.7109375, + "learning_rate": 0.0004890938505645061, + "loss": 0.194, + "step": 68440 + }, + { + "epoch": 2.84, + "grad_norm": 0.765625, + "learning_rate": 0.0004890906820185583, + "loss": 0.2157, + "step": 68450 + }, + { + "epoch": 2.84, + "grad_norm": 0.59375, + "learning_rate": 0.0004890875130226678, + "loss": 0.2058, + "step": 68460 + }, + { + "epoch": 2.84, + "grad_norm": 0.41796875, + "learning_rate": 0.0004890843435768402, + "loss": 0.1829, + "step": 68470 + }, + { + "epoch": 2.84, + "grad_norm": 0.71875, + "learning_rate": 0.0004890811736810818, + "loss": 0.228, + "step": 68480 + }, + { + "epoch": 2.84, + "grad_norm": 0.4765625, + "learning_rate": 0.0004890780033353986, + "loss": 0.2274, + "step": 68490 + }, + { + "epoch": 2.84, + "grad_norm": 0.2373046875, + "learning_rate": 0.0004890748325397963, + "loss": 0.2367, + "step": 68500 + }, + { + "epoch": 2.84, + "grad_norm": 0.52734375, + "learning_rate": 0.0004890716612942809, + "loss": 0.1595, + "step": 68510 + }, + { + "epoch": 2.84, + "grad_norm": 0.8984375, + "learning_rate": 0.0004890684895988585, + "loss": 0.2215, + "step": 68520 + }, + { + "epoch": 2.84, + "grad_norm": 0.734375, + "learning_rate": 0.0004890653174535351, + "loss": 0.2041, + "step": 68530 + }, + { + "epoch": 2.84, + "grad_norm": 1.515625, + "learning_rate": 0.0004890621448583165, + "loss": 0.2592, + "step": 68540 + }, + { + "epoch": 2.84, + "grad_norm": 0.95703125, + "learning_rate": 0.0004890589718132088, + "loss": 0.2288, + "step": 68550 + }, + { + "epoch": 2.84, + "grad_norm": 0.5546875, + "learning_rate": 0.0004890557983182179, + "loss": 0.216, + "step": 68560 + }, + { + "epoch": 2.84, + "grad_norm": 0.4921875, + "learning_rate": 0.0004890526243733498, + "loss": 0.227, + "step": 68570 + }, + { + "epoch": 2.84, + "grad_norm": 7.53125, + "learning_rate": 0.0004890494499786106, + "loss": 0.2312, + "step": 68580 + }, + { + "epoch": 2.84, + "grad_norm": 0.2353515625, + "learning_rate": 0.000489046275134006, + "loss": 0.2242, + "step": 68590 + }, + { + "epoch": 2.84, + "grad_norm": 0.447265625, + "learning_rate": 0.0004890430998395422, + "loss": 0.2177, + "step": 68600 + }, + { + "epoch": 2.84, + "grad_norm": 0.53125, + "learning_rate": 0.0004890399240952252, + "loss": 0.194, + "step": 68610 + }, + { + "epoch": 2.84, + "grad_norm": 0.3984375, + "learning_rate": 0.0004890367479010607, + "loss": 0.1968, + "step": 68620 + }, + { + "epoch": 2.84, + "grad_norm": 0.64453125, + "learning_rate": 0.0004890335712570548, + "loss": 0.1897, + "step": 68630 + }, + { + "epoch": 2.84, + "grad_norm": 0.5703125, + "learning_rate": 0.0004890303941632137, + "loss": 0.1784, + "step": 68640 + }, + { + "epoch": 2.84, + "grad_norm": 0.57421875, + "learning_rate": 0.0004890272166195432, + "loss": 0.1885, + "step": 68650 + }, + { + "epoch": 2.84, + "grad_norm": 0.63671875, + "learning_rate": 0.0004890240386260492, + "loss": 0.2017, + "step": 68660 + }, + { + "epoch": 2.84, + "grad_norm": 0.50390625, + "learning_rate": 0.0004890208601827378, + "loss": 0.2232, + "step": 68670 + }, + { + "epoch": 2.84, + "grad_norm": 0.314453125, + "learning_rate": 0.000489017681289615, + "loss": 0.2137, + "step": 68680 + }, + { + "epoch": 2.85, + "grad_norm": 0.796875, + "learning_rate": 0.0004890145019466868, + "loss": 0.2161, + "step": 68690 + }, + { + "epoch": 2.85, + "grad_norm": 0.310546875, + "learning_rate": 0.0004890113221539589, + "loss": 0.2025, + "step": 68700 + }, + { + "epoch": 2.85, + "grad_norm": 0.263671875, + "learning_rate": 0.0004890081419114377, + "loss": 0.2465, + "step": 68710 + }, + { + "epoch": 2.85, + "grad_norm": 0.318359375, + "learning_rate": 0.0004890049612191288, + "loss": 0.1884, + "step": 68720 + }, + { + "epoch": 2.85, + "grad_norm": 1.015625, + "learning_rate": 0.0004890017800770385, + "loss": 0.1905, + "step": 68730 + }, + { + "epoch": 2.85, + "grad_norm": 0.4140625, + "learning_rate": 0.0004889985984851727, + "loss": 0.2108, + "step": 68740 + }, + { + "epoch": 2.85, + "grad_norm": 1.4375, + "learning_rate": 0.0004889954164435373, + "loss": 0.2093, + "step": 68750 + }, + { + "epoch": 2.85, + "grad_norm": 0.87109375, + "learning_rate": 0.0004889922339521383, + "loss": 0.227, + "step": 68760 + }, + { + "epoch": 2.85, + "grad_norm": 2.9375, + "learning_rate": 0.0004889890510109818, + "loss": 0.2147, + "step": 68770 + }, + { + "epoch": 2.85, + "grad_norm": 0.859375, + "learning_rate": 0.0004889858676200737, + "loss": 0.2076, + "step": 68780 + }, + { + "epoch": 2.85, + "grad_norm": 0.90625, + "learning_rate": 0.0004889826837794199, + "loss": 0.2304, + "step": 68790 + }, + { + "epoch": 2.85, + "grad_norm": 0.435546875, + "learning_rate": 0.0004889794994890267, + "loss": 0.183, + "step": 68800 + }, + { + "epoch": 2.85, + "grad_norm": 0.416015625, + "learning_rate": 0.0004889763147488997, + "loss": 0.1957, + "step": 68810 + }, + { + "epoch": 2.85, + "grad_norm": 0.6171875, + "learning_rate": 0.0004889731295590452, + "loss": 0.219, + "step": 68820 + }, + { + "epoch": 2.85, + "grad_norm": 0.166015625, + "learning_rate": 0.000488969943919469, + "loss": 0.1681, + "step": 68830 + }, + { + "epoch": 2.85, + "grad_norm": 0.66796875, + "learning_rate": 0.0004889667578301772, + "loss": 0.2288, + "step": 68840 + }, + { + "epoch": 2.85, + "grad_norm": 0.3984375, + "learning_rate": 0.0004889635712911757, + "loss": 0.1875, + "step": 68850 + }, + { + "epoch": 2.85, + "grad_norm": 0.625, + "learning_rate": 0.0004889603843024707, + "loss": 0.1882, + "step": 68860 + }, + { + "epoch": 2.85, + "grad_norm": 0.5390625, + "learning_rate": 0.000488957196864068, + "loss": 0.1518, + "step": 68870 + }, + { + "epoch": 2.85, + "grad_norm": 0.0, + "learning_rate": 0.0004889540089759736, + "loss": 0.1911, + "step": 68880 + }, + { + "epoch": 2.85, + "grad_norm": 0.71484375, + "learning_rate": 0.0004889508206381936, + "loss": 0.1897, + "step": 68890 + }, + { + "epoch": 2.85, + "grad_norm": 0.43359375, + "learning_rate": 0.0004889476318507339, + "loss": 0.2294, + "step": 68900 + }, + { + "epoch": 2.85, + "grad_norm": 0.65234375, + "learning_rate": 0.0004889444426136007, + "loss": 0.2391, + "step": 68910 + }, + { + "epoch": 2.85, + "grad_norm": 0.244140625, + "learning_rate": 0.0004889412529267998, + "loss": 0.2365, + "step": 68920 + }, + { + "epoch": 2.86, + "grad_norm": 0.40234375, + "learning_rate": 0.0004889380627903372, + "loss": 0.204, + "step": 68930 + }, + { + "epoch": 2.86, + "grad_norm": 0.80078125, + "learning_rate": 0.000488934872204219, + "loss": 0.2408, + "step": 68940 + }, + { + "epoch": 2.86, + "grad_norm": 0.9609375, + "learning_rate": 0.0004889316811684512, + "loss": 0.2081, + "step": 68950 + }, + { + "epoch": 2.86, + "grad_norm": 0.5078125, + "learning_rate": 0.0004889284896830398, + "loss": 0.2437, + "step": 68960 + }, + { + "epoch": 2.86, + "grad_norm": 0.48046875, + "learning_rate": 0.0004889252977479908, + "loss": 0.1905, + "step": 68970 + }, + { + "epoch": 2.86, + "grad_norm": 0.337890625, + "learning_rate": 0.0004889221053633101, + "loss": 0.2215, + "step": 68980 + }, + { + "epoch": 2.86, + "grad_norm": 0.96484375, + "learning_rate": 0.0004889189125290039, + "loss": 0.2104, + "step": 68990 + }, + { + "epoch": 2.86, + "grad_norm": 0.359375, + "learning_rate": 0.0004889157192450781, + "loss": 0.2398, + "step": 69000 + }, + { + "epoch": 2.86, + "grad_norm": 0.65625, + "learning_rate": 0.0004889125255115387, + "loss": 0.2115, + "step": 69010 + }, + { + "epoch": 2.86, + "grad_norm": 0.451171875, + "learning_rate": 0.0004889093313283916, + "loss": 0.2283, + "step": 69020 + }, + { + "epoch": 2.86, + "grad_norm": 1.25, + "learning_rate": 0.0004889061366956431, + "loss": 0.2359, + "step": 69030 + }, + { + "epoch": 2.86, + "grad_norm": 0.5703125, + "learning_rate": 0.0004889029416132991, + "loss": 0.2518, + "step": 69040 + }, + { + "epoch": 2.86, + "grad_norm": 1.0859375, + "learning_rate": 0.0004888997460813654, + "loss": 0.1923, + "step": 69050 + }, + { + "epoch": 2.86, + "grad_norm": 0.6171875, + "learning_rate": 0.0004888965500998482, + "loss": 0.2106, + "step": 69060 + }, + { + "epoch": 2.86, + "grad_norm": 1.328125, + "learning_rate": 0.0004888933536687536, + "loss": 0.21, + "step": 69070 + }, + { + "epoch": 2.86, + "grad_norm": 0.97265625, + "learning_rate": 0.0004888901567880874, + "loss": 0.3109, + "step": 69080 + }, + { + "epoch": 2.86, + "grad_norm": 0.6171875, + "learning_rate": 0.0004888869594578557, + "loss": 0.1756, + "step": 69090 + }, + { + "epoch": 2.86, + "grad_norm": 0.6640625, + "learning_rate": 0.0004888837616780647, + "loss": 0.1933, + "step": 69100 + }, + { + "epoch": 2.86, + "grad_norm": 0.28515625, + "learning_rate": 0.0004888805634487201, + "loss": 0.2544, + "step": 69110 + }, + { + "epoch": 2.86, + "grad_norm": 0.40234375, + "learning_rate": 0.0004888773647698281, + "loss": 0.1659, + "step": 69120 + }, + { + "epoch": 2.86, + "grad_norm": 0.54296875, + "learning_rate": 0.0004888741656413947, + "loss": 0.258, + "step": 69130 + }, + { + "epoch": 2.86, + "grad_norm": 1.1328125, + "learning_rate": 0.000488870966063426, + "loss": 0.2102, + "step": 69140 + }, + { + "epoch": 2.86, + "grad_norm": 0.50390625, + "learning_rate": 0.0004888677660359279, + "loss": 0.1627, + "step": 69150 + }, + { + "epoch": 2.86, + "grad_norm": 0.703125, + "learning_rate": 0.0004888645655589064, + "loss": 0.2274, + "step": 69160 + }, + { + "epoch": 2.87, + "grad_norm": 0.482421875, + "learning_rate": 0.0004888613646323676, + "loss": 0.1628, + "step": 69170 + }, + { + "epoch": 2.87, + "grad_norm": 0.8359375, + "learning_rate": 0.0004888581632563176, + "loss": 0.2036, + "step": 69180 + }, + { + "epoch": 2.87, + "grad_norm": 0.77734375, + "learning_rate": 0.0004888549614307622, + "loss": 0.2366, + "step": 69190 + }, + { + "epoch": 2.87, + "grad_norm": 0.5546875, + "learning_rate": 0.0004888517591557077, + "loss": 0.1904, + "step": 69200 + }, + { + "epoch": 2.87, + "grad_norm": 0.57421875, + "learning_rate": 0.0004888485564311599, + "loss": 0.2575, + "step": 69210 + }, + { + "epoch": 2.87, + "grad_norm": 0.435546875, + "learning_rate": 0.0004888453532571248, + "loss": 0.208, + "step": 69220 + }, + { + "epoch": 2.87, + "grad_norm": 0.5234375, + "learning_rate": 0.0004888421496336087, + "loss": 0.2761, + "step": 69230 + }, + { + "epoch": 2.87, + "grad_norm": 0.56640625, + "learning_rate": 0.0004888389455606174, + "loss": 0.2117, + "step": 69240 + }, + { + "epoch": 2.87, + "grad_norm": 0.75390625, + "learning_rate": 0.000488835741038157, + "loss": 0.1361, + "step": 69250 + }, + { + "epoch": 2.87, + "grad_norm": 0.7734375, + "learning_rate": 0.0004888325360662335, + "loss": 0.2006, + "step": 69260 + }, + { + "epoch": 2.87, + "grad_norm": 0.36328125, + "learning_rate": 0.0004888293306448531, + "loss": 0.248, + "step": 69270 + }, + { + "epoch": 2.87, + "grad_norm": 0.8671875, + "learning_rate": 0.0004888261247740216, + "loss": 0.1909, + "step": 69280 + }, + { + "epoch": 2.87, + "grad_norm": 0.953125, + "learning_rate": 0.000488822918453745, + "loss": 0.2021, + "step": 69290 + }, + { + "epoch": 2.87, + "grad_norm": 0.4375, + "learning_rate": 0.0004888197116840297, + "loss": 0.134, + "step": 69300 + }, + { + "epoch": 2.87, + "grad_norm": 1.171875, + "learning_rate": 0.0004888165044648813, + "loss": 0.2082, + "step": 69310 + }, + { + "epoch": 2.87, + "grad_norm": 0.443359375, + "learning_rate": 0.0004888132967963061, + "loss": 0.2351, + "step": 69320 + }, + { + "epoch": 2.87, + "grad_norm": 1.0078125, + "learning_rate": 0.0004888100886783101, + "loss": 0.2087, + "step": 69330 + }, + { + "epoch": 2.87, + "grad_norm": 0.7734375, + "learning_rate": 0.0004888068801108992, + "loss": 0.2115, + "step": 69340 + }, + { + "epoch": 2.87, + "grad_norm": 0.0, + "learning_rate": 0.0004888036710940796, + "loss": 0.1439, + "step": 69350 + }, + { + "epoch": 2.87, + "grad_norm": 0.6015625, + "learning_rate": 0.0004888004616278573, + "loss": 0.1918, + "step": 69360 + }, + { + "epoch": 2.87, + "grad_norm": 0.703125, + "learning_rate": 0.0004887972517122383, + "loss": 0.2171, + "step": 69370 + }, + { + "epoch": 2.87, + "grad_norm": 0.58203125, + "learning_rate": 0.0004887940413472287, + "loss": 0.2009, + "step": 69380 + }, + { + "epoch": 2.87, + "grad_norm": 0.419921875, + "learning_rate": 0.0004887908305328345, + "loss": 0.2285, + "step": 69390 + }, + { + "epoch": 2.87, + "grad_norm": 0.41796875, + "learning_rate": 0.0004887876192690617, + "loss": 0.2352, + "step": 69400 + }, + { + "epoch": 2.87, + "grad_norm": 0.8046875, + "learning_rate": 0.0004887844075559163, + "loss": 0.2323, + "step": 69410 + }, + { + "epoch": 2.88, + "grad_norm": 0.423828125, + "learning_rate": 0.0004887811953934046, + "loss": 0.2372, + "step": 69420 + }, + { + "epoch": 2.88, + "grad_norm": 0.263671875, + "learning_rate": 0.0004887779827815324, + "loss": 0.1873, + "step": 69430 + }, + { + "epoch": 2.88, + "grad_norm": 0.57421875, + "learning_rate": 0.0004887747697203058, + "loss": 0.1851, + "step": 69440 + }, + { + "epoch": 2.88, + "grad_norm": 0.5703125, + "learning_rate": 0.0004887715562097309, + "loss": 0.2637, + "step": 69450 + }, + { + "epoch": 2.88, + "grad_norm": 0.275390625, + "learning_rate": 0.0004887683422498137, + "loss": 0.2426, + "step": 69460 + }, + { + "epoch": 2.88, + "grad_norm": 0.28515625, + "learning_rate": 0.0004887651278405602, + "loss": 0.1983, + "step": 69470 + }, + { + "epoch": 2.88, + "grad_norm": 0.8515625, + "learning_rate": 0.0004887619129819767, + "loss": 0.2012, + "step": 69480 + }, + { + "epoch": 2.88, + "grad_norm": 0.458984375, + "learning_rate": 0.0004887586976740689, + "loss": 0.2082, + "step": 69490 + }, + { + "epoch": 2.88, + "grad_norm": 0.412109375, + "learning_rate": 0.000488755481916843, + "loss": 0.278, + "step": 69500 + }, + { + "epoch": 2.88, + "grad_norm": 0.70703125, + "learning_rate": 0.0004887522657103053, + "loss": 0.2461, + "step": 69510 + }, + { + "epoch": 2.88, + "grad_norm": 1.28125, + "learning_rate": 0.0004887490490544614, + "loss": 0.1862, + "step": 69520 + }, + { + "epoch": 2.88, + "grad_norm": 1.7265625, + "learning_rate": 0.0004887458319493176, + "loss": 0.2396, + "step": 69530 + }, + { + "epoch": 2.88, + "grad_norm": 1.2109375, + "learning_rate": 0.0004887426143948799, + "loss": 0.2456, + "step": 69540 + }, + { + "epoch": 2.88, + "grad_norm": 0.6171875, + "learning_rate": 0.0004887393963911545, + "loss": 0.1944, + "step": 69550 + }, + { + "epoch": 2.88, + "grad_norm": 0.80078125, + "learning_rate": 0.0004887361779381473, + "loss": 0.2501, + "step": 69560 + }, + { + "epoch": 2.88, + "grad_norm": 0.474609375, + "learning_rate": 0.0004887329590358644, + "loss": 0.2288, + "step": 69570 + }, + { + "epoch": 2.88, + "grad_norm": 0.5625, + "learning_rate": 0.0004887297396843118, + "loss": 0.1828, + "step": 69580 + }, + { + "epoch": 2.88, + "grad_norm": 0.88671875, + "learning_rate": 0.0004887265198834956, + "loss": 0.1823, + "step": 69590 + }, + { + "epoch": 2.88, + "grad_norm": 1.265625, + "learning_rate": 0.0004887232996334219, + "loss": 0.2337, + "step": 69600 + }, + { + "epoch": 2.88, + "grad_norm": 0.70703125, + "learning_rate": 0.0004887200789340967, + "loss": 0.2317, + "step": 69610 + }, + { + "epoch": 2.88, + "grad_norm": 1.1484375, + "learning_rate": 0.0004887168577855261, + "loss": 0.1528, + "step": 69620 + }, + { + "epoch": 2.88, + "grad_norm": 0.30078125, + "learning_rate": 0.0004887136361877161, + "loss": 0.1878, + "step": 69630 + }, + { + "epoch": 2.88, + "grad_norm": 0.3671875, + "learning_rate": 0.000488710414140673, + "loss": 0.191, + "step": 69640 + }, + { + "epoch": 2.88, + "grad_norm": 0.68359375, + "learning_rate": 0.0004887071916444025, + "loss": 0.2041, + "step": 69650 + }, + { + "epoch": 2.89, + "grad_norm": 0.53515625, + "learning_rate": 0.000488703968698911, + "loss": 0.2112, + "step": 69660 + }, + { + "epoch": 2.89, + "grad_norm": 0.53515625, + "learning_rate": 0.0004887007453042043, + "loss": 0.2104, + "step": 69670 + }, + { + "epoch": 2.89, + "grad_norm": 0.62890625, + "learning_rate": 0.0004886975214602885, + "loss": 0.2015, + "step": 69680 + }, + { + "epoch": 2.89, + "grad_norm": 1.28125, + "learning_rate": 0.0004886942971671698, + "loss": 0.2037, + "step": 69690 + }, + { + "epoch": 2.89, + "grad_norm": 0.2734375, + "learning_rate": 0.0004886910724248543, + "loss": 0.189, + "step": 69700 + }, + { + "epoch": 2.89, + "grad_norm": 0.6875, + "learning_rate": 0.000488687847233348, + "loss": 0.1797, + "step": 69710 + }, + { + "epoch": 2.89, + "grad_norm": 0.98828125, + "learning_rate": 0.0004886846215926568, + "loss": 0.2894, + "step": 69720 + }, + { + "epoch": 2.89, + "grad_norm": 1.40625, + "learning_rate": 0.000488681395502787, + "loss": 0.201, + "step": 69730 + }, + { + "epoch": 2.89, + "grad_norm": 0.5078125, + "learning_rate": 0.0004886781689637446, + "loss": 0.178, + "step": 69740 + }, + { + "epoch": 2.89, + "grad_norm": 0.63671875, + "learning_rate": 0.0004886749419755357, + "loss": 0.1899, + "step": 69750 + }, + { + "epoch": 2.89, + "grad_norm": 0.703125, + "learning_rate": 0.0004886717145381661, + "loss": 0.2081, + "step": 69760 + }, + { + "epoch": 2.89, + "grad_norm": 0.5390625, + "learning_rate": 0.0004886684866516424, + "loss": 0.2034, + "step": 69770 + }, + { + "epoch": 2.89, + "grad_norm": 0.5390625, + "learning_rate": 0.0004886652583159702, + "loss": 0.2354, + "step": 69780 + }, + { + "epoch": 2.89, + "grad_norm": 0.5390625, + "learning_rate": 0.0004886620295311559, + "loss": 0.2974, + "step": 69790 + }, + { + "epoch": 2.89, + "grad_norm": 0.68359375, + "learning_rate": 0.0004886588002972052, + "loss": 0.2783, + "step": 69800 + }, + { + "epoch": 2.89, + "grad_norm": 0.435546875, + "learning_rate": 0.0004886555706141247, + "loss": 0.1979, + "step": 69810 + }, + { + "epoch": 2.89, + "grad_norm": 0.89453125, + "learning_rate": 0.00048865234048192, + "loss": 0.2396, + "step": 69820 + }, + { + "epoch": 2.89, + "grad_norm": 0.88671875, + "learning_rate": 0.0004886491099005974, + "loss": 0.2489, + "step": 69830 + }, + { + "epoch": 2.89, + "grad_norm": 0.50390625, + "learning_rate": 0.000488645878870163, + "loss": 0.2436, + "step": 69840 + }, + { + "epoch": 2.89, + "grad_norm": 0.734375, + "learning_rate": 0.0004886426473906228, + "loss": 0.2184, + "step": 69850 + }, + { + "epoch": 2.89, + "grad_norm": 0.7734375, + "learning_rate": 0.0004886394154619829, + "loss": 0.2036, + "step": 69860 + }, + { + "epoch": 2.89, + "grad_norm": 0.91796875, + "learning_rate": 0.0004886361830842495, + "loss": 0.1367, + "step": 69870 + }, + { + "epoch": 2.89, + "grad_norm": 0.65234375, + "learning_rate": 0.0004886329502574284, + "loss": 0.221, + "step": 69880 + }, + { + "epoch": 2.89, + "grad_norm": 0.4453125, + "learning_rate": 0.0004886297169815259, + "loss": 0.1638, + "step": 69890 + }, + { + "epoch": 2.9, + "grad_norm": 0.490234375, + "learning_rate": 0.000488626483256548, + "loss": 0.2141, + "step": 69900 + }, + { + "epoch": 2.9, + "grad_norm": 0.66015625, + "learning_rate": 0.0004886232490825009, + "loss": 0.2184, + "step": 69910 + }, + { + "epoch": 2.9, + "grad_norm": 0.79296875, + "learning_rate": 0.0004886200144593907, + "loss": 0.1929, + "step": 69920 + }, + { + "epoch": 2.9, + "grad_norm": 0.54296875, + "learning_rate": 0.0004886167793872233, + "loss": 0.1589, + "step": 69930 + }, + { + "epoch": 2.9, + "grad_norm": 0.8828125, + "learning_rate": 0.0004886135438660049, + "loss": 0.2278, + "step": 69940 + }, + { + "epoch": 2.9, + "grad_norm": 0.5, + "learning_rate": 0.0004886103078957416, + "loss": 0.1706, + "step": 69950 + }, + { + "epoch": 2.9, + "grad_norm": 1.1953125, + "learning_rate": 0.0004886070714764393, + "loss": 0.2465, + "step": 69960 + }, + { + "epoch": 2.9, + "grad_norm": 0.73828125, + "learning_rate": 0.0004886038346081045, + "loss": 0.2132, + "step": 69970 + }, + { + "epoch": 2.9, + "grad_norm": 0.54296875, + "learning_rate": 0.0004886005972907429, + "loss": 0.176, + "step": 69980 + }, + { + "epoch": 2.9, + "grad_norm": 0.828125, + "learning_rate": 0.0004885973595243608, + "loss": 0.1886, + "step": 69990 + }, + { + "epoch": 2.9, + "grad_norm": 0.70703125, + "learning_rate": 0.0004885941213089642, + "loss": 0.1764, + "step": 70000 + }, + { + "epoch": 2.9, + "grad_norm": 1.3515625, + "learning_rate": 0.0004885908826445593, + "loss": 0.2407, + "step": 70010 + }, + { + "epoch": 2.9, + "grad_norm": 0.609375, + "learning_rate": 0.0004885876435311522, + "loss": 0.1938, + "step": 70020 + }, + { + "epoch": 2.9, + "grad_norm": 0.89453125, + "learning_rate": 0.0004885844039687487, + "loss": 0.265, + "step": 70030 + }, + { + "epoch": 2.9, + "grad_norm": 0.69921875, + "learning_rate": 0.0004885811639573553, + "loss": 0.1964, + "step": 70040 + }, + { + "epoch": 2.9, + "grad_norm": 0.42578125, + "learning_rate": 0.0004885779234969778, + "loss": 0.3107, + "step": 70050 + }, + { + "epoch": 2.9, + "grad_norm": 3.21875, + "learning_rate": 0.0004885746825876225, + "loss": 0.2371, + "step": 70060 + }, + { + "epoch": 2.9, + "grad_norm": 1.5078125, + "learning_rate": 0.0004885714412292953, + "loss": 0.2369, + "step": 70070 + }, + { + "epoch": 2.9, + "grad_norm": 1.59375, + "learning_rate": 0.0004885681994220024, + "loss": 0.2101, + "step": 70080 + }, + { + "epoch": 2.9, + "grad_norm": 0.921875, + "learning_rate": 0.0004885649571657501, + "loss": 0.2055, + "step": 70090 + }, + { + "epoch": 2.9, + "grad_norm": 0.71484375, + "learning_rate": 0.0004885617144605442, + "loss": 0.265, + "step": 70100 + }, + { + "epoch": 2.9, + "grad_norm": 0.8671875, + "learning_rate": 0.0004885584713063909, + "loss": 0.1961, + "step": 70110 + }, + { + "epoch": 2.9, + "grad_norm": 0.2138671875, + "learning_rate": 0.0004885552277032963, + "loss": 0.2513, + "step": 70120 + }, + { + "epoch": 2.9, + "grad_norm": 0.6328125, + "learning_rate": 0.0004885519836512666, + "loss": 0.2379, + "step": 70130 + }, + { + "epoch": 2.91, + "grad_norm": 0.80078125, + "learning_rate": 0.0004885487391503077, + "loss": 0.2321, + "step": 70140 + }, + { + "epoch": 2.91, + "grad_norm": 0.55859375, + "learning_rate": 0.000488545494200426, + "loss": 0.1907, + "step": 70150 + }, + { + "epoch": 2.91, + "grad_norm": 0.640625, + "learning_rate": 0.0004885422488016273, + "loss": 0.2312, + "step": 70160 + }, + { + "epoch": 2.91, + "grad_norm": 0.63671875, + "learning_rate": 0.000488539002953918, + "loss": 0.1992, + "step": 70170 + }, + { + "epoch": 2.91, + "grad_norm": 0.64453125, + "learning_rate": 0.0004885357566573039, + "loss": 0.2353, + "step": 70180 + }, + { + "epoch": 2.91, + "grad_norm": 0.6875, + "learning_rate": 0.0004885325099117914, + "loss": 0.2122, + "step": 70190 + }, + { + "epoch": 2.91, + "grad_norm": 0.51171875, + "learning_rate": 0.0004885292627173864, + "loss": 0.261, + "step": 70200 + }, + { + "epoch": 2.91, + "grad_norm": 0.625, + "learning_rate": 0.000488526015074095, + "loss": 0.2239, + "step": 70210 + }, + { + "epoch": 2.91, + "grad_norm": 0.83203125, + "learning_rate": 0.0004885227669819235, + "loss": 0.2137, + "step": 70220 + }, + { + "epoch": 2.91, + "grad_norm": 0.671875, + "learning_rate": 0.000488519518440878, + "loss": 0.2621, + "step": 70230 + }, + { + "epoch": 2.91, + "grad_norm": 0.76171875, + "learning_rate": 0.0004885162694509644, + "loss": 0.239, + "step": 70240 + }, + { + "epoch": 2.91, + "grad_norm": 1.4140625, + "learning_rate": 0.0004885130200121889, + "loss": 0.1989, + "step": 70250 + }, + { + "epoch": 2.91, + "grad_norm": 0.486328125, + "learning_rate": 0.0004885097701245578, + "loss": 0.1843, + "step": 70260 + }, + { + "epoch": 2.91, + "grad_norm": 0.75390625, + "learning_rate": 0.0004885065197880769, + "loss": 0.1588, + "step": 70270 + }, + { + "epoch": 2.91, + "grad_norm": 1.1015625, + "learning_rate": 0.0004885032690027526, + "loss": 0.2245, + "step": 70280 + }, + { + "epoch": 2.91, + "grad_norm": 0.380859375, + "learning_rate": 0.0004885000177685909, + "loss": 0.2062, + "step": 70290 + }, + { + "epoch": 2.91, + "grad_norm": 0.333984375, + "learning_rate": 0.0004884967660855979, + "loss": 0.2119, + "step": 70300 + }, + { + "epoch": 2.91, + "grad_norm": 0.333984375, + "learning_rate": 0.0004884935139537797, + "loss": 0.1902, + "step": 70310 + }, + { + "epoch": 2.91, + "grad_norm": 3.203125, + "learning_rate": 0.0004884902613731425, + "loss": 0.2194, + "step": 70320 + }, + { + "epoch": 2.91, + "grad_norm": 0.83203125, + "learning_rate": 0.0004884870083436924, + "loss": 0.2284, + "step": 70330 + }, + { + "epoch": 2.91, + "grad_norm": 0.515625, + "learning_rate": 0.0004884837548654355, + "loss": 0.1885, + "step": 70340 + }, + { + "epoch": 2.91, + "grad_norm": 1.0859375, + "learning_rate": 0.0004884805009383779, + "loss": 0.1983, + "step": 70350 + }, + { + "epoch": 2.91, + "grad_norm": 0.84765625, + "learning_rate": 0.0004884772465625258, + "loss": 0.205, + "step": 70360 + }, + { + "epoch": 2.91, + "grad_norm": 0.59765625, + "learning_rate": 0.0004884739917378852, + "loss": 0.2053, + "step": 70370 + }, + { + "epoch": 2.92, + "grad_norm": 1.109375, + "learning_rate": 0.0004884707364644624, + "loss": 0.2269, + "step": 70380 + }, + { + "epoch": 2.92, + "grad_norm": 0.6171875, + "learning_rate": 0.0004884674807422633, + "loss": 0.1974, + "step": 70390 + }, + { + "epoch": 2.92, + "grad_norm": 0.16015625, + "learning_rate": 0.0004884642245712943, + "loss": 0.2495, + "step": 70400 + }, + { + "epoch": 2.92, + "grad_norm": 0.63671875, + "learning_rate": 0.0004884609679515613, + "loss": 0.1745, + "step": 70410 + }, + { + "epoch": 2.92, + "grad_norm": 0.6796875, + "learning_rate": 0.0004884577108830705, + "loss": 0.1893, + "step": 70420 + }, + { + "epoch": 2.92, + "grad_norm": 0.2431640625, + "learning_rate": 0.0004884544533658281, + "loss": 0.2923, + "step": 70430 + }, + { + "epoch": 2.92, + "grad_norm": 0.37890625, + "learning_rate": 0.0004884511953998401, + "loss": 0.1896, + "step": 70440 + }, + { + "epoch": 2.92, + "grad_norm": 0.6328125, + "learning_rate": 0.0004884479369851127, + "loss": 0.169, + "step": 70450 + }, + { + "epoch": 2.92, + "grad_norm": 0.33984375, + "learning_rate": 0.000488444678121652, + "loss": 0.1916, + "step": 70460 + }, + { + "epoch": 2.92, + "grad_norm": 1.5078125, + "learning_rate": 0.0004884414188094643, + "loss": 0.2033, + "step": 70470 + }, + { + "epoch": 2.92, + "grad_norm": 1.0859375, + "learning_rate": 0.0004884381590485555, + "loss": 0.2334, + "step": 70480 + }, + { + "epoch": 2.92, + "grad_norm": 0.447265625, + "learning_rate": 0.0004884348988389318, + "loss": 0.2361, + "step": 70490 + }, + { + "epoch": 2.92, + "grad_norm": 0.48046875, + "learning_rate": 0.0004884316381805994, + "loss": 0.2025, + "step": 70500 + }, + { + "epoch": 2.92, + "grad_norm": 1.328125, + "learning_rate": 0.0004884283770735645, + "loss": 0.2322, + "step": 70510 + }, + { + "epoch": 2.92, + "grad_norm": 2.140625, + "learning_rate": 0.000488425115517833, + "loss": 0.2072, + "step": 70520 + }, + { + "epoch": 2.92, + "grad_norm": 0.400390625, + "learning_rate": 0.0004884218535134113, + "loss": 0.1674, + "step": 70530 + }, + { + "epoch": 2.92, + "grad_norm": 0.87109375, + "learning_rate": 0.0004884185910603054, + "loss": 0.2209, + "step": 70540 + }, + { + "epoch": 2.92, + "grad_norm": 0.640625, + "learning_rate": 0.0004884153281585214, + "loss": 0.2502, + "step": 70550 + }, + { + "epoch": 2.92, + "grad_norm": 0.85546875, + "learning_rate": 0.0004884120648080655, + "loss": 0.2758, + "step": 70560 + }, + { + "epoch": 2.92, + "grad_norm": 0.63671875, + "learning_rate": 0.0004884088010089439, + "loss": 0.2113, + "step": 70570 + }, + { + "epoch": 2.92, + "grad_norm": 0.63671875, + "learning_rate": 0.0004884055367611627, + "loss": 0.2405, + "step": 70580 + }, + { + "epoch": 2.92, + "grad_norm": 1.1953125, + "learning_rate": 0.0004884022720647279, + "loss": 0.2057, + "step": 70590 + }, + { + "epoch": 2.92, + "grad_norm": 0.36328125, + "learning_rate": 0.0004883990069196459, + "loss": 0.1637, + "step": 70600 + }, + { + "epoch": 2.92, + "grad_norm": 0.5703125, + "learning_rate": 0.0004883957413259227, + "loss": 0.2837, + "step": 70610 + }, + { + "epoch": 2.93, + "grad_norm": 0.68359375, + "learning_rate": 0.0004883924752835644, + "loss": 0.2194, + "step": 70620 + }, + { + "epoch": 2.93, + "grad_norm": 1.140625, + "learning_rate": 0.0004883892087925772, + "loss": 0.253, + "step": 70630 + }, + { + "epoch": 2.93, + "grad_norm": 0.482421875, + "learning_rate": 0.0004883859418529673, + "loss": 0.2117, + "step": 70640 + }, + { + "epoch": 2.93, + "grad_norm": 1.046875, + "learning_rate": 0.0004883826744647408, + "loss": 0.2425, + "step": 70650 + }, + { + "epoch": 2.93, + "grad_norm": 0.734375, + "learning_rate": 0.0004883794066279038, + "loss": 0.2118, + "step": 70660 + }, + { + "epoch": 2.93, + "grad_norm": 0.69140625, + "learning_rate": 0.0004883761383424625, + "loss": 0.1871, + "step": 70670 + }, + { + "epoch": 2.93, + "grad_norm": 0.51953125, + "learning_rate": 0.0004883728696084231, + "loss": 0.2319, + "step": 70680 + }, + { + "epoch": 2.93, + "grad_norm": 1.0234375, + "learning_rate": 0.0004883696004257917, + "loss": 0.1944, + "step": 70690 + }, + { + "epoch": 2.93, + "grad_norm": 0.294921875, + "learning_rate": 0.0004883663307945744, + "loss": 0.1576, + "step": 70700 + }, + { + "epoch": 2.93, + "grad_norm": 0.67578125, + "learning_rate": 0.0004883630607147774, + "loss": 0.2082, + "step": 70710 + }, + { + "epoch": 2.93, + "grad_norm": 1.65625, + "learning_rate": 0.0004883597901864069, + "loss": 0.2155, + "step": 70720 + }, + { + "epoch": 2.93, + "grad_norm": 0.9375, + "learning_rate": 0.0004883565192094692, + "loss": 0.2488, + "step": 70730 + }, + { + "epoch": 2.93, + "grad_norm": 0.84765625, + "learning_rate": 0.00048835324778397, + "loss": 0.2035, + "step": 70740 + }, + { + "epoch": 2.93, + "grad_norm": 0.59765625, + "learning_rate": 0.0004883499759099158, + "loss": 0.1541, + "step": 70750 + }, + { + "epoch": 2.93, + "grad_norm": 0.369140625, + "learning_rate": 0.0004883467035873127, + "loss": 0.1718, + "step": 70760 + }, + { + "epoch": 2.93, + "grad_norm": 0.81640625, + "learning_rate": 0.0004883434308161668, + "loss": 0.2228, + "step": 70770 + }, + { + "epoch": 2.93, + "grad_norm": 0.0, + "learning_rate": 0.0004883401575964843, + "loss": 0.2292, + "step": 70780 + }, + { + "epoch": 2.93, + "grad_norm": 0.7265625, + "learning_rate": 0.0004883368839282714, + "loss": 0.1967, + "step": 70790 + }, + { + "epoch": 2.93, + "grad_norm": 0.5, + "learning_rate": 0.0004883336098115343, + "loss": 0.2399, + "step": 70800 + }, + { + "epoch": 2.93, + "grad_norm": 0.8828125, + "learning_rate": 0.000488330335246279, + "loss": 0.1903, + "step": 70810 + }, + { + "epoch": 2.93, + "grad_norm": 0.470703125, + "learning_rate": 0.0004883270602325118, + "loss": 0.2122, + "step": 70820 + }, + { + "epoch": 2.93, + "grad_norm": 0.69140625, + "learning_rate": 0.0004883237847702387, + "loss": 0.2595, + "step": 70830 + }, + { + "epoch": 2.93, + "grad_norm": 0.33203125, + "learning_rate": 0.000488320508859466, + "loss": 0.2184, + "step": 70840 + }, + { + "epoch": 2.93, + "grad_norm": 0.73828125, + "learning_rate": 0.0004883172325002, + "loss": 0.2323, + "step": 70850 + }, + { + "epoch": 2.94, + "grad_norm": 0.68359375, + "learning_rate": 0.0004883139556924466, + "loss": 0.2437, + "step": 70860 + }, + { + "epoch": 2.94, + "grad_norm": 0.62109375, + "learning_rate": 0.000488310678436212, + "loss": 0.263, + "step": 70870 + }, + { + "epoch": 2.94, + "grad_norm": 1.0546875, + "learning_rate": 0.0004883074007315026, + "loss": 0.1788, + "step": 70880 + }, + { + "epoch": 2.94, + "grad_norm": 1.2109375, + "learning_rate": 0.0004883041225783243, + "loss": 0.2186, + "step": 70890 + }, + { + "epoch": 2.94, + "grad_norm": 0.44140625, + "learning_rate": 0.0004883008439766832, + "loss": 0.2292, + "step": 70900 + }, + { + "epoch": 2.94, + "grad_norm": 1.234375, + "learning_rate": 0.0004882975649265859, + "loss": 0.1891, + "step": 70910 + }, + { + "epoch": 2.94, + "grad_norm": 0.67578125, + "learning_rate": 0.0004882942854280383, + "loss": 0.1882, + "step": 70920 + }, + { + "epoch": 2.94, + "grad_norm": 0.353515625, + "learning_rate": 0.0004882910054810466, + "loss": 0.197, + "step": 70930 + }, + { + "epoch": 2.94, + "grad_norm": 0.703125, + "learning_rate": 0.0004882877250856168, + "loss": 0.2776, + "step": 70940 + }, + { + "epoch": 2.94, + "grad_norm": 1.0625, + "learning_rate": 0.0004882844442417553, + "loss": 0.2025, + "step": 70950 + }, + { + "epoch": 2.94, + "grad_norm": 0.80078125, + "learning_rate": 0.0004882811629494683, + "loss": 0.236, + "step": 70960 + }, + { + "epoch": 2.94, + "grad_norm": 0.6875, + "learning_rate": 0.00048827788120876185, + "loss": 0.1906, + "step": 70970 + }, + { + "epoch": 2.94, + "grad_norm": 0.9375, + "learning_rate": 0.0004882745990196421, + "loss": 0.2398, + "step": 70980 + }, + { + "epoch": 2.94, + "grad_norm": 0.828125, + "learning_rate": 0.0004882713163821153, + "loss": 0.1863, + "step": 70990 + }, + { + "epoch": 2.94, + "grad_norm": 0.3828125, + "learning_rate": 0.0004882680332961876, + "loss": 0.1789, + "step": 71000 + }, + { + "epoch": 2.94, + "grad_norm": 1.296875, + "learning_rate": 0.0004882647497618652, + "loss": 0.2164, + "step": 71010 + }, + { + "epoch": 2.94, + "grad_norm": 1.7734375, + "learning_rate": 0.0004882614657791543, + "loss": 0.2398, + "step": 71020 + }, + { + "epoch": 2.94, + "grad_norm": 0.365234375, + "learning_rate": 0.0004882581813480611, + "loss": 0.2221, + "step": 71030 + }, + { + "epoch": 2.94, + "grad_norm": 0.1474609375, + "learning_rate": 0.00048825489646859167, + "loss": 0.2201, + "step": 71040 + }, + { + "epoch": 2.94, + "grad_norm": 0.65625, + "learning_rate": 0.00048825161114075225, + "loss": 0.2051, + "step": 71050 + }, + { + "epoch": 2.94, + "grad_norm": 0.42578125, + "learning_rate": 0.000488248325364549, + "loss": 0.2748, + "step": 71060 + }, + { + "epoch": 2.94, + "grad_norm": 0.88671875, + "learning_rate": 0.0004882450391399882, + "loss": 0.2223, + "step": 71070 + }, + { + "epoch": 2.94, + "grad_norm": 1.0390625, + "learning_rate": 0.00048824175246707594, + "loss": 0.2607, + "step": 71080 + }, + { + "epoch": 2.94, + "grad_norm": 1.0625, + "learning_rate": 0.0004882384653458184, + "loss": 0.2646, + "step": 71090 + }, + { + "epoch": 2.94, + "grad_norm": 1.21875, + "learning_rate": 0.00048823517777622186, + "loss": 0.2231, + "step": 71100 + }, + { + "epoch": 2.95, + "grad_norm": 1.0234375, + "learning_rate": 0.0004882318897582924, + "loss": 0.2075, + "step": 71110 + }, + { + "epoch": 2.95, + "grad_norm": 1.21875, + "learning_rate": 0.0004882286012920362, + "loss": 0.2393, + "step": 71120 + }, + { + "epoch": 2.95, + "grad_norm": 0.69921875, + "learning_rate": 0.0004882253123774595, + "loss": 0.2536, + "step": 71130 + }, + { + "epoch": 2.95, + "grad_norm": 1.953125, + "learning_rate": 0.00048822202301456857, + "loss": 0.234, + "step": 71140 + }, + { + "epoch": 2.95, + "grad_norm": 0.890625, + "learning_rate": 0.0004882187332033695, + "loss": 0.2563, + "step": 71150 + }, + { + "epoch": 2.95, + "grad_norm": 0.78515625, + "learning_rate": 0.00048821544294386845, + "loss": 0.2201, + "step": 71160 + }, + { + "epoch": 2.95, + "grad_norm": 0.6953125, + "learning_rate": 0.00048821215223607175, + "loss": 0.1954, + "step": 71170 + }, + { + "epoch": 2.95, + "grad_norm": 1.8203125, + "learning_rate": 0.00048820886107998543, + "loss": 0.2505, + "step": 71180 + }, + { + "epoch": 2.95, + "grad_norm": 0.0, + "learning_rate": 0.00048820556947561577, + "loss": 0.2222, + "step": 71190 + }, + { + "epoch": 2.95, + "grad_norm": 0.71875, + "learning_rate": 0.00048820227742296896, + "loss": 0.2207, + "step": 71200 + }, + { + "epoch": 2.95, + "grad_norm": 1.125, + "learning_rate": 0.00048819898492205116, + "loss": 0.2234, + "step": 71210 + }, + { + "epoch": 2.95, + "grad_norm": 0.65234375, + "learning_rate": 0.0004881956919728686, + "loss": 0.2822, + "step": 71220 + }, + { + "epoch": 2.95, + "grad_norm": 1.140625, + "learning_rate": 0.0004881923985754275, + "loss": 0.2253, + "step": 71230 + }, + { + "epoch": 2.95, + "grad_norm": 0.66015625, + "learning_rate": 0.00048818910472973404, + "loss": 0.2631, + "step": 71240 + }, + { + "epoch": 2.95, + "grad_norm": 0.6953125, + "learning_rate": 0.0004881858104357944, + "loss": 0.2246, + "step": 71250 + }, + { + "epoch": 2.95, + "grad_norm": 0.306640625, + "learning_rate": 0.00048818251569361475, + "loss": 0.1991, + "step": 71260 + }, + { + "epoch": 2.95, + "grad_norm": 0.30078125, + "learning_rate": 0.0004881792205032014, + "loss": 0.2254, + "step": 71270 + }, + { + "epoch": 2.95, + "grad_norm": 0.984375, + "learning_rate": 0.0004881759248645604, + "loss": 0.2035, + "step": 71280 + }, + { + "epoch": 2.95, + "grad_norm": 1.046875, + "learning_rate": 0.0004881726287776981, + "loss": 0.1759, + "step": 71290 + }, + { + "epoch": 2.95, + "grad_norm": 0.71875, + "learning_rate": 0.00048816933224262056, + "loss": 0.1803, + "step": 71300 + }, + { + "epoch": 2.95, + "grad_norm": 1.2578125, + "learning_rate": 0.0004881660352593341, + "loss": 0.206, + "step": 71310 + }, + { + "epoch": 2.95, + "grad_norm": 0.84375, + "learning_rate": 0.0004881627378278448, + "loss": 0.2679, + "step": 71320 + }, + { + "epoch": 2.95, + "grad_norm": 0.400390625, + "learning_rate": 0.000488159439948159, + "loss": 0.1987, + "step": 71330 + }, + { + "epoch": 2.95, + "grad_norm": 1.0859375, + "learning_rate": 0.00048815614162028294, + "loss": 0.2294, + "step": 71340 + }, + { + "epoch": 2.96, + "grad_norm": 0.40234375, + "learning_rate": 0.0004881528428442227, + "loss": 0.2184, + "step": 71350 + }, + { + "epoch": 2.96, + "grad_norm": 0.494140625, + "learning_rate": 0.00048814954361998443, + "loss": 0.1522, + "step": 71360 + }, + { + "epoch": 2.96, + "grad_norm": 0.828125, + "learning_rate": 0.00048814624394757455, + "loss": 0.1758, + "step": 71370 + }, + { + "epoch": 2.96, + "grad_norm": 0.640625, + "learning_rate": 0.0004881429438269991, + "loss": 0.2968, + "step": 71380 + }, + { + "epoch": 2.96, + "grad_norm": 0.193359375, + "learning_rate": 0.0004881396432582643, + "loss": 0.2377, + "step": 71390 + }, + { + "epoch": 2.96, + "grad_norm": 0.48828125, + "learning_rate": 0.0004881363422413765, + "loss": 0.2281, + "step": 71400 + }, + { + "epoch": 2.96, + "grad_norm": 0.427734375, + "learning_rate": 0.0004881330407763418, + "loss": 0.2063, + "step": 71410 + }, + { + "epoch": 2.96, + "grad_norm": 0.482421875, + "learning_rate": 0.0004881297388631664, + "loss": 0.1654, + "step": 71420 + }, + { + "epoch": 2.96, + "grad_norm": 0.765625, + "learning_rate": 0.0004881264365018566, + "loss": 0.2251, + "step": 71430 + }, + { + "epoch": 2.96, + "grad_norm": 0.62890625, + "learning_rate": 0.0004881231336924185, + "loss": 0.2569, + "step": 71440 + }, + { + "epoch": 2.96, + "grad_norm": 0.423828125, + "learning_rate": 0.00048811983043485843, + "loss": 0.1904, + "step": 71450 + }, + { + "epoch": 2.96, + "grad_norm": 0.67578125, + "learning_rate": 0.0004881165267291825, + "loss": 0.233, + "step": 71460 + }, + { + "epoch": 2.96, + "grad_norm": 0.74609375, + "learning_rate": 0.00048811322257539706, + "loss": 0.2037, + "step": 71470 + }, + { + "epoch": 2.96, + "grad_norm": 2.9375, + "learning_rate": 0.00048810991797350816, + "loss": 0.1803, + "step": 71480 + }, + { + "epoch": 2.96, + "grad_norm": 2.296875, + "learning_rate": 0.00048810661292352214, + "loss": 0.1891, + "step": 71490 + }, + { + "epoch": 2.96, + "grad_norm": 0.859375, + "learning_rate": 0.0004881033074254452, + "loss": 0.3033, + "step": 71500 + }, + { + "epoch": 2.96, + "grad_norm": 0.55078125, + "learning_rate": 0.0004881000014792836, + "loss": 0.2606, + "step": 71510 + }, + { + "epoch": 2.96, + "grad_norm": 0.67578125, + "learning_rate": 0.0004880966950850434, + "loss": 0.2012, + "step": 71520 + }, + { + "epoch": 2.96, + "grad_norm": 0.76953125, + "learning_rate": 0.000488093388242731, + "loss": 0.1895, + "step": 71530 + }, + { + "epoch": 2.96, + "grad_norm": 0.462890625, + "learning_rate": 0.0004880900809523525, + "loss": 0.1695, + "step": 71540 + }, + { + "epoch": 2.96, + "grad_norm": 0.90234375, + "learning_rate": 0.00048808677321391427, + "loss": 0.1795, + "step": 71550 + }, + { + "epoch": 2.96, + "grad_norm": 1.53125, + "learning_rate": 0.0004880834650274224, + "loss": 0.2306, + "step": 71560 + }, + { + "epoch": 2.96, + "grad_norm": 0.294921875, + "learning_rate": 0.0004880801563928832, + "loss": 0.2433, + "step": 71570 + }, + { + "epoch": 2.96, + "grad_norm": 1.0625, + "learning_rate": 0.00048807684731030275, + "loss": 0.2928, + "step": 71580 + }, + { + "epoch": 2.97, + "grad_norm": 0.46875, + "learning_rate": 0.00048807353777968743, + "loss": 0.2403, + "step": 71590 + }, + { + "epoch": 2.97, + "grad_norm": 0.3515625, + "learning_rate": 0.0004880702278010435, + "loss": 0.1902, + "step": 71600 + }, + { + "epoch": 2.97, + "grad_norm": 0.55078125, + "learning_rate": 0.00048806691737437704, + "loss": 0.2623, + "step": 71610 + }, + { + "epoch": 2.97, + "grad_norm": 1.4609375, + "learning_rate": 0.00048806360649969433, + "loss": 0.1771, + "step": 71620 + }, + { + "epoch": 2.97, + "grad_norm": 0.63671875, + "learning_rate": 0.00048806029517700167, + "loss": 0.2745, + "step": 71630 + }, + { + "epoch": 2.97, + "grad_norm": 0.6796875, + "learning_rate": 0.00048805698340630523, + "loss": 0.2442, + "step": 71640 + }, + { + "epoch": 2.97, + "grad_norm": 0.484375, + "learning_rate": 0.00048805367118761125, + "loss": 0.1739, + "step": 71650 + }, + { + "epoch": 2.97, + "grad_norm": 0.5859375, + "learning_rate": 0.00048805035852092596, + "loss": 0.2743, + "step": 71660 + }, + { + "epoch": 2.97, + "grad_norm": 0.5078125, + "learning_rate": 0.00048804704540625555, + "loss": 0.2061, + "step": 71670 + }, + { + "epoch": 2.97, + "grad_norm": 0.609375, + "learning_rate": 0.0004880437318436064, + "loss": 0.2508, + "step": 71680 + }, + { + "epoch": 2.97, + "grad_norm": 0.90625, + "learning_rate": 0.0004880404178329846, + "loss": 0.195, + "step": 71690 + }, + { + "epoch": 2.97, + "grad_norm": 0.52734375, + "learning_rate": 0.0004880371033743964, + "loss": 0.2486, + "step": 71700 + }, + { + "epoch": 2.97, + "grad_norm": 0.59375, + "learning_rate": 0.00048803378846784816, + "loss": 0.2084, + "step": 71710 + }, + { + "epoch": 2.97, + "grad_norm": 0.51953125, + "learning_rate": 0.00048803047311334605, + "loss": 0.1868, + "step": 71720 + }, + { + "epoch": 2.97, + "grad_norm": 0.640625, + "learning_rate": 0.0004880271573108963, + "loss": 0.2051, + "step": 71730 + }, + { + "epoch": 2.97, + "grad_norm": 0.43359375, + "learning_rate": 0.000488023841060505, + "loss": 0.1991, + "step": 71740 + }, + { + "epoch": 2.97, + "grad_norm": 1.5234375, + "learning_rate": 0.0004880205243621787, + "loss": 0.2575, + "step": 71750 + }, + { + "epoch": 2.97, + "grad_norm": 0.353515625, + "learning_rate": 0.00048801720721592333, + "loss": 0.2823, + "step": 71760 + }, + { + "epoch": 2.97, + "grad_norm": 1.0625, + "learning_rate": 0.00048801388962174544, + "loss": 0.2158, + "step": 71770 + }, + { + "epoch": 2.97, + "grad_norm": 1.03125, + "learning_rate": 0.000488010571579651, + "loss": 0.2444, + "step": 71780 + }, + { + "epoch": 2.97, + "grad_norm": 0.369140625, + "learning_rate": 0.00048800725308964645, + "loss": 0.1964, + "step": 71790 + }, + { + "epoch": 2.97, + "grad_norm": 0.478515625, + "learning_rate": 0.0004880039341517379, + "loss": 0.1923, + "step": 71800 + }, + { + "epoch": 2.97, + "grad_norm": 0.73828125, + "learning_rate": 0.0004880006147659316, + "loss": 0.1948, + "step": 71810 + }, + { + "epoch": 2.97, + "grad_norm": 0.62890625, + "learning_rate": 0.0004879972949322339, + "loss": 0.2379, + "step": 71820 + }, + { + "epoch": 2.98, + "grad_norm": 0.8125, + "learning_rate": 0.000487993974650651, + "loss": 0.192, + "step": 71830 + }, + { + "epoch": 2.98, + "grad_norm": 0.62890625, + "learning_rate": 0.0004879906539211891, + "loss": 0.2021, + "step": 71840 + }, + { + "epoch": 2.98, + "grad_norm": 0.92578125, + "learning_rate": 0.00048798733274385455, + "loss": 0.2294, + "step": 71850 + }, + { + "epoch": 2.98, + "grad_norm": 0.431640625, + "learning_rate": 0.00048798401111865353, + "loss": 0.2164, + "step": 71860 + }, + { + "epoch": 2.98, + "grad_norm": 1.2734375, + "learning_rate": 0.00048798068904559223, + "loss": 0.1919, + "step": 71870 + }, + { + "epoch": 2.98, + "grad_norm": 0.5625, + "learning_rate": 0.00048797736652467704, + "loss": 0.2509, + "step": 71880 + }, + { + "epoch": 2.98, + "grad_norm": 0.5703125, + "learning_rate": 0.00048797404355591415, + "loss": 0.2571, + "step": 71890 + }, + { + "epoch": 2.98, + "grad_norm": 1.015625, + "learning_rate": 0.0004879707201393098, + "loss": 0.2226, + "step": 71900 + }, + { + "epoch": 2.98, + "grad_norm": 0.0, + "learning_rate": 0.00048796739627487024, + "loss": 0.1937, + "step": 71910 + }, + { + "epoch": 2.98, + "grad_norm": 0.53125, + "learning_rate": 0.00048796407196260175, + "loss": 0.2193, + "step": 71920 + }, + { + "epoch": 2.98, + "grad_norm": 1.0625, + "learning_rate": 0.00048796074720251054, + "loss": 0.2114, + "step": 71930 + }, + { + "epoch": 2.98, + "grad_norm": 0.404296875, + "learning_rate": 0.0004879574219946029, + "loss": 0.2087, + "step": 71940 + }, + { + "epoch": 2.98, + "grad_norm": 0.640625, + "learning_rate": 0.00048795409633888514, + "loss": 0.2018, + "step": 71950 + }, + { + "epoch": 2.98, + "grad_norm": 0.65234375, + "learning_rate": 0.0004879507702353635, + "loss": 0.2279, + "step": 71960 + }, + { + "epoch": 2.98, + "grad_norm": 0.41796875, + "learning_rate": 0.00048794744368404406, + "loss": 0.2306, + "step": 71970 + }, + { + "epoch": 2.98, + "grad_norm": 0.56640625, + "learning_rate": 0.0004879441166849334, + "loss": 0.2042, + "step": 71980 + }, + { + "epoch": 2.98, + "grad_norm": 0.58984375, + "learning_rate": 0.0004879407892380375, + "loss": 0.1775, + "step": 71990 + }, + { + "epoch": 2.98, + "grad_norm": 0.6875, + "learning_rate": 0.00048793746134336274, + "loss": 0.2049, + "step": 72000 + }, + { + "epoch": 2.98, + "grad_norm": 0.98046875, + "learning_rate": 0.0004879341330009154, + "loss": 0.2592, + "step": 72010 + }, + { + "epoch": 2.98, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004879308042107017, + "loss": 0.2273, + "step": 72020 + }, + { + "epoch": 2.98, + "grad_norm": 0.1787109375, + "learning_rate": 0.00048792747497272796, + "loss": 0.2029, + "step": 72030 + }, + { + "epoch": 2.98, + "grad_norm": 0.423828125, + "learning_rate": 0.00048792414528700035, + "loss": 0.2187, + "step": 72040 + }, + { + "epoch": 2.98, + "grad_norm": 0.90625, + "learning_rate": 0.00048792081515352517, + "loss": 0.2199, + "step": 72050 + }, + { + "epoch": 2.98, + "grad_norm": 0.353515625, + "learning_rate": 0.0004879174845723088, + "loss": 0.2342, + "step": 72060 + }, + { + "epoch": 2.99, + "grad_norm": 1.03125, + "learning_rate": 0.00048791415354335733, + "loss": 0.2126, + "step": 72070 + }, + { + "epoch": 2.99, + "grad_norm": 1.6015625, + "learning_rate": 0.00048791082206667714, + "loss": 0.1571, + "step": 72080 + }, + { + "epoch": 2.99, + "grad_norm": 0.353515625, + "learning_rate": 0.0004879074901422745, + "loss": 0.2076, + "step": 72090 + }, + { + "epoch": 2.99, + "grad_norm": 0.84375, + "learning_rate": 0.00048790415777015566, + "loss": 0.2469, + "step": 72100 + }, + { + "epoch": 2.99, + "grad_norm": 0.361328125, + "learning_rate": 0.0004879008249503269, + "loss": 0.1783, + "step": 72110 + }, + { + "epoch": 2.99, + "grad_norm": 0.3671875, + "learning_rate": 0.0004878974916827944, + "loss": 0.2376, + "step": 72120 + }, + { + "epoch": 2.99, + "grad_norm": 0.546875, + "learning_rate": 0.0004878941579675646, + "loss": 0.2034, + "step": 72130 + }, + { + "epoch": 2.99, + "grad_norm": 0.61328125, + "learning_rate": 0.0004878908238046437, + "loss": 0.2649, + "step": 72140 + }, + { + "epoch": 2.99, + "grad_norm": 0.87890625, + "learning_rate": 0.0004878874891940378, + "loss": 0.1896, + "step": 72150 + }, + { + "epoch": 2.99, + "grad_norm": 0.484375, + "learning_rate": 0.0004878841541357535, + "loss": 0.2109, + "step": 72160 + }, + { + "epoch": 2.99, + "grad_norm": 0.94140625, + "learning_rate": 0.0004878808186297968, + "loss": 0.2062, + "step": 72170 + }, + { + "epoch": 2.99, + "grad_norm": 0.57421875, + "learning_rate": 0.00048787748267617415, + "loss": 0.1673, + "step": 72180 + }, + { + "epoch": 2.99, + "grad_norm": 0.68359375, + "learning_rate": 0.0004878741462748918, + "loss": 0.2011, + "step": 72190 + }, + { + "epoch": 2.99, + "grad_norm": 0.275390625, + "learning_rate": 0.00048787080942595595, + "loss": 0.2357, + "step": 72200 + }, + { + "epoch": 2.99, + "grad_norm": 0.75390625, + "learning_rate": 0.0004878674721293729, + "loss": 0.1926, + "step": 72210 + }, + { + "epoch": 2.99, + "grad_norm": 0.8046875, + "learning_rate": 0.00048786413438514897, + "loss": 0.2105, + "step": 72220 + }, + { + "epoch": 2.99, + "grad_norm": 0.51171875, + "learning_rate": 0.0004878607961932905, + "loss": 0.2121, + "step": 72230 + }, + { + "epoch": 2.99, + "grad_norm": 0.69140625, + "learning_rate": 0.0004878574575538036, + "loss": 0.2099, + "step": 72240 + }, + { + "epoch": 2.99, + "grad_norm": 0.478515625, + "learning_rate": 0.0004878541184666947, + "loss": 0.2236, + "step": 72250 + }, + { + "epoch": 2.99, + "grad_norm": 0.53515625, + "learning_rate": 0.00048785077893197, + "loss": 0.2327, + "step": 72260 + }, + { + "epoch": 2.99, + "grad_norm": 0.921875, + "learning_rate": 0.00048784743894963583, + "loss": 0.23, + "step": 72270 + }, + { + "epoch": 2.99, + "grad_norm": 0.6015625, + "learning_rate": 0.00048784409851969845, + "loss": 0.2363, + "step": 72280 + }, + { + "epoch": 2.99, + "grad_norm": 0.361328125, + "learning_rate": 0.00048784075764216425, + "loss": 0.2224, + "step": 72290 + }, + { + "epoch": 2.99, + "grad_norm": 0.51171875, + "learning_rate": 0.00048783741631703935, + "loss": 0.2215, + "step": 72300 + }, + { + "epoch": 3.0, + "grad_norm": 0.77734375, + "learning_rate": 0.00048783407454433016, + "loss": 0.268, + "step": 72310 + }, + { + "epoch": 3.0, + "grad_norm": 0.91796875, + "learning_rate": 0.00048783073232404285, + "loss": 0.2132, + "step": 72320 + }, + { + "epoch": 3.0, + "grad_norm": 0.5625, + "learning_rate": 0.00048782738965618387, + "loss": 0.2695, + "step": 72330 + }, + { + "epoch": 3.0, + "grad_norm": 0.5703125, + "learning_rate": 0.0004878240465407594, + "loss": 0.2792, + "step": 72340 + }, + { + "epoch": 3.0, + "grad_norm": 0.62890625, + "learning_rate": 0.0004878207029777757, + "loss": 0.2346, + "step": 72350 + }, + { + "epoch": 3.0, + "grad_norm": 0.55078125, + "learning_rate": 0.0004878173589672392, + "loss": 0.2275, + "step": 72360 + }, + { + "epoch": 3.0, + "grad_norm": 0.458984375, + "learning_rate": 0.00048781401450915607, + "loss": 0.2062, + "step": 72370 + }, + { + "epoch": 3.0, + "grad_norm": 0.7265625, + "learning_rate": 0.00048781066960353265, + "loss": 0.2178, + "step": 72380 + }, + { + "epoch": 3.0, + "grad_norm": 0.412109375, + "learning_rate": 0.00048780732425037523, + "loss": 0.1775, + "step": 72390 + }, + { + "epoch": 3.0, + "grad_norm": 0.8359375, + "learning_rate": 0.00048780397844969017, + "loss": 0.2349, + "step": 72400 + }, + { + "epoch": 3.0, + "grad_norm": 0.6015625, + "learning_rate": 0.00048780063220148364, + "loss": 0.1752, + "step": 72410 + }, + { + "epoch": 3.0, + "grad_norm": 0.53125, + "learning_rate": 0.000487797285505762, + "loss": 0.181, + "step": 72420 + }, + { + "epoch": 3.0, + "grad_norm": 0.73046875, + "learning_rate": 0.0004877939383625315, + "loss": 0.2107, + "step": 72430 + }, + { + "epoch": 3.0, + "grad_norm": 0.640625, + "learning_rate": 0.00048779059077179855, + "loss": 0.1439, + "step": 72440 + }, + { + "epoch": 3.0, + "grad_norm": 0.7578125, + "learning_rate": 0.0004877872427335694, + "loss": 0.2217, + "step": 72450 + }, + { + "epoch": 3.0, + "grad_norm": 0.486328125, + "learning_rate": 0.0004877838942478503, + "loss": 0.1862, + "step": 72460 + }, + { + "epoch": 3.0, + "grad_norm": 1.265625, + "learning_rate": 0.00048778054531464765, + "loss": 0.243, + "step": 72470 + }, + { + "epoch": 3.0, + "grad_norm": 0.423828125, + "learning_rate": 0.0004877771959339676, + "loss": 0.1798, + "step": 72480 + }, + { + "epoch": 3.0, + "grad_norm": 0.98046875, + "learning_rate": 0.00048777384610581664, + "loss": 0.217, + "step": 72490 + }, + { + "epoch": 3.0, + "grad_norm": 0.625, + "learning_rate": 0.00048777049583020095, + "loss": 0.2105, + "step": 72500 + }, + { + "epoch": 3.0, + "grad_norm": 0.60546875, + "learning_rate": 0.00048776714510712684, + "loss": 0.1668, + "step": 72510 + }, + { + "epoch": 3.0, + "grad_norm": 0.54296875, + "learning_rate": 0.00048776379393660065, + "loss": 0.1755, + "step": 72520 + }, + { + "epoch": 3.0, + "grad_norm": 0.43359375, + "learning_rate": 0.00048776044231862867, + "loss": 0.1925, + "step": 72530 + }, + { + "epoch": 3.0, + "grad_norm": 0.75390625, + "learning_rate": 0.0004877570902532172, + "loss": 0.1652, + "step": 72540 + }, + { + "epoch": 3.01, + "grad_norm": 0.416015625, + "learning_rate": 0.0004877537377403726, + "loss": 0.2013, + "step": 72550 + }, + { + "epoch": 3.01, + "grad_norm": 1.21875, + "learning_rate": 0.0004877503847801011, + "loss": 0.2054, + "step": 72560 + }, + { + "epoch": 3.01, + "grad_norm": 0.796875, + "learning_rate": 0.00048774703137240903, + "loss": 0.2, + "step": 72570 + }, + { + "epoch": 3.01, + "grad_norm": 1.5078125, + "learning_rate": 0.00048774367751730274, + "loss": 0.166, + "step": 72580 + }, + { + "epoch": 3.01, + "grad_norm": 0.41015625, + "learning_rate": 0.00048774032321478855, + "loss": 0.1647, + "step": 72590 + }, + { + "epoch": 3.01, + "grad_norm": 0.52734375, + "learning_rate": 0.0004877369684648728, + "loss": 0.2036, + "step": 72600 + }, + { + "epoch": 3.01, + "grad_norm": 0.84375, + "learning_rate": 0.00048773361326756164, + "loss": 0.1787, + "step": 72610 + }, + { + "epoch": 3.01, + "grad_norm": 0.50390625, + "learning_rate": 0.0004877302576228616, + "loss": 0.2388, + "step": 72620 + }, + { + "epoch": 3.01, + "grad_norm": 1.234375, + "learning_rate": 0.0004877269015307788, + "loss": 0.2288, + "step": 72630 + }, + { + "epoch": 3.01, + "grad_norm": 0.48046875, + "learning_rate": 0.00048772354499131967, + "loss": 0.2394, + "step": 72640 + }, + { + "epoch": 3.01, + "grad_norm": 0.67578125, + "learning_rate": 0.0004877201880044905, + "loss": 0.2308, + "step": 72650 + }, + { + "epoch": 3.01, + "grad_norm": 4.09375, + "learning_rate": 0.00048771683057029756, + "loss": 0.1905, + "step": 72660 + }, + { + "epoch": 3.01, + "grad_norm": 0.59765625, + "learning_rate": 0.0004877134726887473, + "loss": 0.2068, + "step": 72670 + }, + { + "epoch": 3.01, + "grad_norm": 0.55078125, + "learning_rate": 0.0004877101143598459, + "loss": 0.2087, + "step": 72680 + }, + { + "epoch": 3.01, + "grad_norm": 2.234375, + "learning_rate": 0.0004877067555835998, + "loss": 0.2214, + "step": 72690 + }, + { + "epoch": 3.01, + "grad_norm": 0.53125, + "learning_rate": 0.00048770339636001526, + "loss": 0.2059, + "step": 72700 + }, + { + "epoch": 3.01, + "grad_norm": 0.38671875, + "learning_rate": 0.0004877000366890985, + "loss": 0.2037, + "step": 72710 + }, + { + "epoch": 3.01, + "grad_norm": 0.76953125, + "learning_rate": 0.000487696676570856, + "loss": 0.194, + "step": 72720 + }, + { + "epoch": 3.01, + "grad_norm": 0.66015625, + "learning_rate": 0.00048769331600529396, + "loss": 0.2644, + "step": 72730 + }, + { + "epoch": 3.01, + "grad_norm": 0.75, + "learning_rate": 0.0004876899549924189, + "loss": 0.2014, + "step": 72740 + }, + { + "epoch": 3.01, + "grad_norm": 0.40625, + "learning_rate": 0.0004876865935322369, + "loss": 0.1983, + "step": 72750 + }, + { + "epoch": 3.01, + "grad_norm": 0.546875, + "learning_rate": 0.00048768323162475446, + "loss": 0.2287, + "step": 72760 + }, + { + "epoch": 3.01, + "grad_norm": 0.578125, + "learning_rate": 0.0004876798692699778, + "loss": 0.1679, + "step": 72770 + }, + { + "epoch": 3.01, + "grad_norm": 1.2890625, + "learning_rate": 0.00048767650646791333, + "loss": 0.2739, + "step": 72780 + }, + { + "epoch": 3.01, + "grad_norm": 0.486328125, + "learning_rate": 0.0004876731432185673, + "loss": 0.1546, + "step": 72790 + }, + { + "epoch": 3.02, + "grad_norm": 0.6640625, + "learning_rate": 0.0004876697795219461, + "loss": 0.2504, + "step": 72800 + }, + { + "epoch": 3.02, + "grad_norm": 0.486328125, + "learning_rate": 0.00048766641537805605, + "loss": 0.1978, + "step": 72810 + }, + { + "epoch": 3.02, + "grad_norm": 0.546875, + "learning_rate": 0.0004876630507869035, + "loss": 0.188, + "step": 72820 + }, + { + "epoch": 3.02, + "grad_norm": 1.1484375, + "learning_rate": 0.0004876596857484947, + "loss": 0.2741, + "step": 72830 + }, + { + "epoch": 3.02, + "grad_norm": 0.6015625, + "learning_rate": 0.00048765632026283605, + "loss": 0.1905, + "step": 72840 + }, + { + "epoch": 3.02, + "grad_norm": 0.203125, + "learning_rate": 0.0004876529543299339, + "loss": 0.2008, + "step": 72850 + }, + { + "epoch": 3.02, + "grad_norm": 0.58203125, + "learning_rate": 0.00048764958794979453, + "loss": 0.196, + "step": 72860 + }, + { + "epoch": 3.02, + "grad_norm": 0.369140625, + "learning_rate": 0.0004876462211224243, + "loss": 0.1902, + "step": 72870 + }, + { + "epoch": 3.02, + "grad_norm": 0.515625, + "learning_rate": 0.00048764285384782956, + "loss": 0.2203, + "step": 72880 + }, + { + "epoch": 3.02, + "grad_norm": 1.859375, + "learning_rate": 0.0004876394861260166, + "loss": 0.2204, + "step": 72890 + }, + { + "epoch": 3.02, + "grad_norm": 0.6171875, + "learning_rate": 0.00048763611795699184, + "loss": 0.2486, + "step": 72900 + }, + { + "epoch": 3.02, + "grad_norm": 0.625, + "learning_rate": 0.00048763274934076153, + "loss": 0.1858, + "step": 72910 + }, + { + "epoch": 3.02, + "grad_norm": 0.484375, + "learning_rate": 0.00048762938027733203, + "loss": 0.2007, + "step": 72920 + }, + { + "epoch": 3.02, + "grad_norm": 0.48046875, + "learning_rate": 0.00048762601076670975, + "loss": 0.1984, + "step": 72930 + }, + { + "epoch": 3.02, + "grad_norm": 0.6875, + "learning_rate": 0.00048762264080890095, + "loss": 0.2159, + "step": 72940 + }, + { + "epoch": 3.02, + "grad_norm": 0.3359375, + "learning_rate": 0.000487619270403912, + "loss": 0.1565, + "step": 72950 + }, + { + "epoch": 3.02, + "grad_norm": 0.6328125, + "learning_rate": 0.0004876158995517492, + "loss": 0.2507, + "step": 72960 + }, + { + "epoch": 3.02, + "grad_norm": 0.88671875, + "learning_rate": 0.0004876125282524191, + "loss": 0.2102, + "step": 72970 + }, + { + "epoch": 3.02, + "grad_norm": 0.95703125, + "learning_rate": 0.00048760915650592775, + "loss": 0.2121, + "step": 72980 + }, + { + "epoch": 3.02, + "grad_norm": 1.0, + "learning_rate": 0.00048760578431228163, + "loss": 0.2234, + "step": 72990 + }, + { + "epoch": 3.02, + "grad_norm": 1.0234375, + "learning_rate": 0.0004876024116714871, + "loss": 0.1907, + "step": 73000 + }, + { + "epoch": 3.02, + "grad_norm": 0.46875, + "learning_rate": 0.0004875990385835505, + "loss": 0.2353, + "step": 73010 + }, + { + "epoch": 3.02, + "grad_norm": 0.85546875, + "learning_rate": 0.00048759566504847816, + "loss": 0.1463, + "step": 73020 + }, + { + "epoch": 3.02, + "grad_norm": 1.1171875, + "learning_rate": 0.0004875922910662764, + "loss": 0.1872, + "step": 73030 + }, + { + "epoch": 3.03, + "grad_norm": 1.1953125, + "learning_rate": 0.0004875889166369516, + "loss": 0.1915, + "step": 73040 + }, + { + "epoch": 3.03, + "grad_norm": 1.0859375, + "learning_rate": 0.00048758554176051017, + "loss": 0.277, + "step": 73050 + }, + { + "epoch": 3.03, + "grad_norm": 1.6640625, + "learning_rate": 0.0004875821664369584, + "loss": 0.1807, + "step": 73060 + }, + { + "epoch": 3.03, + "grad_norm": 0.578125, + "learning_rate": 0.0004875787906663027, + "loss": 0.1922, + "step": 73070 + }, + { + "epoch": 3.03, + "grad_norm": 1.21875, + "learning_rate": 0.00048757541444854923, + "loss": 0.1925, + "step": 73080 + }, + { + "epoch": 3.03, + "grad_norm": 0.67578125, + "learning_rate": 0.00048757203778370463, + "loss": 0.1845, + "step": 73090 + }, + { + "epoch": 3.03, + "grad_norm": 0.86328125, + "learning_rate": 0.00048756866067177506, + "loss": 0.1858, + "step": 73100 + }, + { + "epoch": 3.03, + "grad_norm": 1.390625, + "learning_rate": 0.0004875652831127669, + "loss": 0.195, + "step": 73110 + }, + { + "epoch": 3.03, + "grad_norm": 0.98046875, + "learning_rate": 0.0004875619051066865, + "loss": 0.2507, + "step": 73120 + }, + { + "epoch": 3.03, + "grad_norm": 0.671875, + "learning_rate": 0.00048755852665354035, + "loss": 0.2051, + "step": 73130 + }, + { + "epoch": 3.03, + "grad_norm": 1.1953125, + "learning_rate": 0.00048755514775333466, + "loss": 0.2253, + "step": 73140 + }, + { + "epoch": 3.03, + "grad_norm": 0.859375, + "learning_rate": 0.0004875517684060758, + "loss": 0.2032, + "step": 73150 + }, + { + "epoch": 3.03, + "grad_norm": 0.828125, + "learning_rate": 0.0004875483886117702, + "loss": 0.2175, + "step": 73160 + }, + { + "epoch": 3.03, + "grad_norm": 0.94140625, + "learning_rate": 0.0004875450083704242, + "loss": 0.1815, + "step": 73170 + }, + { + "epoch": 3.03, + "grad_norm": 0.8984375, + "learning_rate": 0.0004875416276820441, + "loss": 0.1723, + "step": 73180 + }, + { + "epoch": 3.03, + "grad_norm": 0.390625, + "learning_rate": 0.0004875382465466364, + "loss": 0.2832, + "step": 73190 + }, + { + "epoch": 3.03, + "grad_norm": 0.8671875, + "learning_rate": 0.00048753486496420727, + "loss": 0.1818, + "step": 73200 + }, + { + "epoch": 3.03, + "grad_norm": 0.421875, + "learning_rate": 0.00048753148293476323, + "loss": 0.2538, + "step": 73210 + }, + { + "epoch": 3.03, + "grad_norm": 0.470703125, + "learning_rate": 0.00048752810045831055, + "loss": 0.2147, + "step": 73220 + }, + { + "epoch": 3.03, + "grad_norm": 0.76953125, + "learning_rate": 0.00048752471753485567, + "loss": 0.1986, + "step": 73230 + }, + { + "epoch": 3.03, + "grad_norm": 0.431640625, + "learning_rate": 0.0004875213341644049, + "loss": 0.1704, + "step": 73240 + }, + { + "epoch": 3.03, + "grad_norm": 0.9609375, + "learning_rate": 0.00048751795034696467, + "loss": 0.2443, + "step": 73250 + }, + { + "epoch": 3.03, + "grad_norm": 0.53125, + "learning_rate": 0.0004875145660825413, + "loss": 0.1706, + "step": 73260 + }, + { + "epoch": 3.03, + "grad_norm": 0.703125, + "learning_rate": 0.00048751118137114113, + "loss": 0.2349, + "step": 73270 + }, + { + "epoch": 3.04, + "grad_norm": 0.578125, + "learning_rate": 0.00048750779621277055, + "loss": 0.2239, + "step": 73280 + }, + { + "epoch": 3.04, + "grad_norm": 0.625, + "learning_rate": 0.000487504410607436, + "loss": 0.211, + "step": 73290 + }, + { + "epoch": 3.04, + "grad_norm": 1.28125, + "learning_rate": 0.0004875010245551438, + "loss": 0.2575, + "step": 73300 + }, + { + "epoch": 3.04, + "grad_norm": 0.2001953125, + "learning_rate": 0.00048749763805590025, + "loss": 0.1918, + "step": 73310 + }, + { + "epoch": 3.04, + "grad_norm": 1.3359375, + "learning_rate": 0.00048749425110971185, + "loss": 0.2217, + "step": 73320 + }, + { + "epoch": 3.04, + "grad_norm": 0.68359375, + "learning_rate": 0.0004874908637165849, + "loss": 0.1684, + "step": 73330 + }, + { + "epoch": 3.04, + "grad_norm": 0.396484375, + "learning_rate": 0.00048748747587652573, + "loss": 0.2268, + "step": 73340 + }, + { + "epoch": 3.04, + "grad_norm": 0.953125, + "learning_rate": 0.00048748408758954087, + "loss": 0.2751, + "step": 73350 + }, + { + "epoch": 3.04, + "grad_norm": 0.439453125, + "learning_rate": 0.0004874806988556366, + "loss": 0.2484, + "step": 73360 + }, + { + "epoch": 3.04, + "grad_norm": 0.48046875, + "learning_rate": 0.0004874773096748193, + "loss": 0.2622, + "step": 73370 + }, + { + "epoch": 3.04, + "grad_norm": 1.2890625, + "learning_rate": 0.00048747392004709535, + "loss": 0.1791, + "step": 73380 + }, + { + "epoch": 3.04, + "grad_norm": 0.84765625, + "learning_rate": 0.00048747052997247107, + "loss": 0.2235, + "step": 73390 + }, + { + "epoch": 3.04, + "grad_norm": 0.49609375, + "learning_rate": 0.0004874671394509529, + "loss": 0.2666, + "step": 73400 + }, + { + "epoch": 3.04, + "grad_norm": 0.55078125, + "learning_rate": 0.00048746374848254725, + "loss": 0.2475, + "step": 73410 + }, + { + "epoch": 3.04, + "grad_norm": 0.79296875, + "learning_rate": 0.0004874603570672604, + "loss": 0.1935, + "step": 73420 + }, + { + "epoch": 3.04, + "grad_norm": 0.51953125, + "learning_rate": 0.00048745696520509894, + "loss": 0.2355, + "step": 73430 + }, + { + "epoch": 3.04, + "grad_norm": 4.28125, + "learning_rate": 0.00048745357289606907, + "loss": 0.2292, + "step": 73440 + }, + { + "epoch": 3.04, + "grad_norm": 0.6953125, + "learning_rate": 0.0004874501801401772, + "loss": 0.2344, + "step": 73450 + }, + { + "epoch": 3.04, + "grad_norm": 0.73828125, + "learning_rate": 0.0004874467869374297, + "loss": 0.1944, + "step": 73460 + }, + { + "epoch": 3.04, + "grad_norm": 0.8203125, + "learning_rate": 0.00048744339328783297, + "loss": 0.2549, + "step": 73470 + }, + { + "epoch": 3.04, + "grad_norm": 0.431640625, + "learning_rate": 0.00048743999919139346, + "loss": 0.2009, + "step": 73480 + }, + { + "epoch": 3.04, + "grad_norm": 0.79296875, + "learning_rate": 0.00048743660464811754, + "loss": 0.2083, + "step": 73490 + }, + { + "epoch": 3.04, + "grad_norm": 1.1328125, + "learning_rate": 0.0004874332096580115, + "loss": 0.2185, + "step": 73500 + }, + { + "epoch": 3.04, + "grad_norm": 0.66796875, + "learning_rate": 0.0004874298142210819, + "loss": 0.2591, + "step": 73510 + }, + { + "epoch": 3.05, + "grad_norm": 0.625, + "learning_rate": 0.00048742641833733494, + "loss": 0.1996, + "step": 73520 + }, + { + "epoch": 3.05, + "grad_norm": 0.5546875, + "learning_rate": 0.0004874230220067771, + "loss": 0.2206, + "step": 73530 + }, + { + "epoch": 3.05, + "grad_norm": 0.2216796875, + "learning_rate": 0.00048741962522941485, + "loss": 0.2299, + "step": 73540 + }, + { + "epoch": 3.05, + "grad_norm": 0.6796875, + "learning_rate": 0.00048741622800525444, + "loss": 0.2073, + "step": 73550 + }, + { + "epoch": 3.05, + "grad_norm": 1.71875, + "learning_rate": 0.0004874128303343024, + "loss": 0.2152, + "step": 73560 + }, + { + "epoch": 3.05, + "grad_norm": 0.451171875, + "learning_rate": 0.00048740943221656496, + "loss": 0.2292, + "step": 73570 + }, + { + "epoch": 3.05, + "grad_norm": 1.1875, + "learning_rate": 0.0004874060336520486, + "loss": 0.2768, + "step": 73580 + }, + { + "epoch": 3.05, + "grad_norm": 0.80859375, + "learning_rate": 0.0004874026346407598, + "loss": 0.232, + "step": 73590 + }, + { + "epoch": 3.05, + "grad_norm": 0.458984375, + "learning_rate": 0.00048739923518270487, + "loss": 0.2236, + "step": 73600 + }, + { + "epoch": 3.05, + "grad_norm": 1.2265625, + "learning_rate": 0.0004873958352778902, + "loss": 0.2414, + "step": 73610 + }, + { + "epoch": 3.05, + "grad_norm": 1.3203125, + "learning_rate": 0.0004873924349263222, + "loss": 0.2556, + "step": 73620 + }, + { + "epoch": 3.05, + "grad_norm": 0.703125, + "learning_rate": 0.0004873890341280073, + "loss": 0.2733, + "step": 73630 + }, + { + "epoch": 3.05, + "grad_norm": 0.404296875, + "learning_rate": 0.0004873856328829518, + "loss": 0.2222, + "step": 73640 + }, + { + "epoch": 3.05, + "grad_norm": 1.015625, + "learning_rate": 0.0004873822311911622, + "loss": 0.1805, + "step": 73650 + }, + { + "epoch": 3.05, + "grad_norm": 0.76953125, + "learning_rate": 0.0004873788290526449, + "loss": 0.202, + "step": 73660 + }, + { + "epoch": 3.05, + "grad_norm": 1.015625, + "learning_rate": 0.00048737542646740627, + "loss": 0.2271, + "step": 73670 + }, + { + "epoch": 3.05, + "grad_norm": 0.46875, + "learning_rate": 0.0004873720234354527, + "loss": 0.2459, + "step": 73680 + }, + { + "epoch": 3.05, + "grad_norm": 0.87109375, + "learning_rate": 0.00048736861995679067, + "loss": 0.2104, + "step": 73690 + }, + { + "epoch": 3.05, + "grad_norm": 0.369140625, + "learning_rate": 0.0004873652160314265, + "loss": 0.1958, + "step": 73700 + }, + { + "epoch": 3.05, + "grad_norm": 0.416015625, + "learning_rate": 0.0004873618116593667, + "loss": 0.204, + "step": 73710 + }, + { + "epoch": 3.05, + "grad_norm": 0.279296875, + "learning_rate": 0.0004873584068406175, + "loss": 0.1555, + "step": 73720 + }, + { + "epoch": 3.05, + "grad_norm": 1.3203125, + "learning_rate": 0.00048735500157518543, + "loss": 0.2232, + "step": 73730 + }, + { + "epoch": 3.05, + "grad_norm": 1.2890625, + "learning_rate": 0.0004873515958630769, + "loss": 0.1992, + "step": 73740 + }, + { + "epoch": 3.05, + "grad_norm": 1.546875, + "learning_rate": 0.0004873481897042983, + "loss": 0.2826, + "step": 73750 + }, + { + "epoch": 3.06, + "grad_norm": 0.6796875, + "learning_rate": 0.000487344783098856, + "loss": 0.1707, + "step": 73760 + }, + { + "epoch": 3.06, + "grad_norm": 3.0625, + "learning_rate": 0.0004873413760467564, + "loss": 0.2739, + "step": 73770 + }, + { + "epoch": 3.06, + "grad_norm": 0.474609375, + "learning_rate": 0.0004873379685480061, + "loss": 0.2207, + "step": 73780 + }, + { + "epoch": 3.06, + "grad_norm": 0.212890625, + "learning_rate": 0.0004873345606026113, + "loss": 0.1909, + "step": 73790 + }, + { + "epoch": 3.06, + "grad_norm": 1.234375, + "learning_rate": 0.0004873311522105785, + "loss": 0.1933, + "step": 73800 + }, + { + "epoch": 3.06, + "grad_norm": 0.72265625, + "learning_rate": 0.0004873277433719141, + "loss": 0.1565, + "step": 73810 + }, + { + "epoch": 3.06, + "grad_norm": 0.62109375, + "learning_rate": 0.00048732433408662455, + "loss": 0.2122, + "step": 73820 + }, + { + "epoch": 3.06, + "grad_norm": 0.671875, + "learning_rate": 0.0004873209243547162, + "loss": 0.2397, + "step": 73830 + }, + { + "epoch": 3.06, + "grad_norm": 0.2421875, + "learning_rate": 0.00048731751417619546, + "loss": 0.2042, + "step": 73840 + }, + { + "epoch": 3.06, + "grad_norm": 0.81640625, + "learning_rate": 0.0004873141035510688, + "loss": 0.2229, + "step": 73850 + }, + { + "epoch": 3.06, + "grad_norm": 0.47265625, + "learning_rate": 0.00048731069247934266, + "loss": 0.2532, + "step": 73860 + }, + { + "epoch": 3.06, + "grad_norm": 1.1328125, + "learning_rate": 0.0004873072809610234, + "loss": 0.2278, + "step": 73870 + }, + { + "epoch": 3.06, + "grad_norm": 0.7578125, + "learning_rate": 0.0004873038689961175, + "loss": 0.2484, + "step": 73880 + }, + { + "epoch": 3.06, + "grad_norm": 0.6953125, + "learning_rate": 0.0004873004565846313, + "loss": 0.1948, + "step": 73890 + }, + { + "epoch": 3.06, + "grad_norm": 0.470703125, + "learning_rate": 0.0004872970437265713, + "loss": 0.2021, + "step": 73900 + }, + { + "epoch": 3.06, + "grad_norm": 0.703125, + "learning_rate": 0.00048729363042194384, + "loss": 0.237, + "step": 73910 + }, + { + "epoch": 3.06, + "grad_norm": 0.455078125, + "learning_rate": 0.0004872902166707555, + "loss": 0.2537, + "step": 73920 + }, + { + "epoch": 3.06, + "grad_norm": 0.267578125, + "learning_rate": 0.0004872868024730125, + "loss": 0.2726, + "step": 73930 + }, + { + "epoch": 3.06, + "grad_norm": 0.78125, + "learning_rate": 0.00048728338782872137, + "loss": 0.2311, + "step": 73940 + }, + { + "epoch": 3.06, + "grad_norm": 0.86328125, + "learning_rate": 0.0004872799727378886, + "loss": 0.1994, + "step": 73950 + }, + { + "epoch": 3.06, + "grad_norm": 0.765625, + "learning_rate": 0.00048727655720052045, + "loss": 0.2452, + "step": 73960 + }, + { + "epoch": 3.06, + "grad_norm": 0.6328125, + "learning_rate": 0.00048727314121662345, + "loss": 0.1759, + "step": 73970 + }, + { + "epoch": 3.06, + "grad_norm": 0.6328125, + "learning_rate": 0.0004872697247862041, + "loss": 0.2141, + "step": 73980 + }, + { + "epoch": 3.06, + "grad_norm": 0.1650390625, + "learning_rate": 0.00048726630790926875, + "loss": 0.1746, + "step": 73990 + }, + { + "epoch": 3.07, + "grad_norm": 0.310546875, + "learning_rate": 0.00048726289058582375, + "loss": 0.1923, + "step": 74000 + }, + { + "epoch": 3.07, + "grad_norm": 0.0022125244140625, + "learning_rate": 0.0004872594728158757, + "loss": 0.1961, + "step": 74010 + }, + { + "epoch": 3.07, + "grad_norm": 0.7109375, + "learning_rate": 0.00048725605459943086, + "loss": 0.2439, + "step": 74020 + }, + { + "epoch": 3.07, + "grad_norm": 0.76953125, + "learning_rate": 0.00048725263593649577, + "loss": 0.2458, + "step": 74030 + }, + { + "epoch": 3.07, + "grad_norm": 0.326171875, + "learning_rate": 0.00048724921682707684, + "loss": 0.2663, + "step": 74040 + }, + { + "epoch": 3.07, + "grad_norm": 0.79296875, + "learning_rate": 0.0004872457972711806, + "loss": 0.2116, + "step": 74050 + }, + { + "epoch": 3.07, + "grad_norm": 0.57421875, + "learning_rate": 0.0004872423772688133, + "loss": 0.198, + "step": 74060 + }, + { + "epoch": 3.07, + "grad_norm": 1.7890625, + "learning_rate": 0.00048723895681998145, + "loss": 0.1864, + "step": 74070 + }, + { + "epoch": 3.07, + "grad_norm": 0.80859375, + "learning_rate": 0.0004872355359246915, + "loss": 0.237, + "step": 74080 + }, + { + "epoch": 3.07, + "grad_norm": 0.6953125, + "learning_rate": 0.00048723211458294994, + "loss": 0.2472, + "step": 74090 + }, + { + "epoch": 3.07, + "grad_norm": 0.7734375, + "learning_rate": 0.0004872286927947631, + "loss": 0.2114, + "step": 74100 + }, + { + "epoch": 3.07, + "grad_norm": 0.2099609375, + "learning_rate": 0.00048722527056013754, + "loss": 0.2861, + "step": 74110 + }, + { + "epoch": 3.07, + "grad_norm": 0.921875, + "learning_rate": 0.00048722184787907964, + "loss": 0.1873, + "step": 74120 + }, + { + "epoch": 3.07, + "grad_norm": 1.1015625, + "learning_rate": 0.0004872184247515958, + "loss": 0.2359, + "step": 74130 + }, + { + "epoch": 3.07, + "grad_norm": 0.63671875, + "learning_rate": 0.0004872150011776925, + "loss": 0.2526, + "step": 74140 + }, + { + "epoch": 3.07, + "grad_norm": 1.140625, + "learning_rate": 0.0004872115771573762, + "loss": 0.2178, + "step": 74150 + }, + { + "epoch": 3.07, + "grad_norm": 0.82421875, + "learning_rate": 0.00048720815269065335, + "loss": 0.2769, + "step": 74160 + }, + { + "epoch": 3.07, + "grad_norm": 0.466796875, + "learning_rate": 0.00048720472777753033, + "loss": 0.2539, + "step": 74170 + }, + { + "epoch": 3.07, + "grad_norm": 0.640625, + "learning_rate": 0.00048720130241801363, + "loss": 0.2595, + "step": 74180 + }, + { + "epoch": 3.07, + "grad_norm": 0.435546875, + "learning_rate": 0.00048719787661210976, + "loss": 0.2066, + "step": 74190 + }, + { + "epoch": 3.07, + "grad_norm": 1.328125, + "learning_rate": 0.000487194450359825, + "loss": 0.1813, + "step": 74200 + }, + { + "epoch": 3.07, + "grad_norm": 0.345703125, + "learning_rate": 0.00048719102366116595, + "loss": 0.1825, + "step": 74210 + }, + { + "epoch": 3.07, + "grad_norm": 0.515625, + "learning_rate": 0.00048718759651613904, + "loss": 0.21, + "step": 74220 + }, + { + "epoch": 3.07, + "grad_norm": 0.546875, + "learning_rate": 0.00048718416892475064, + "loss": 0.2267, + "step": 74230 + }, + { + "epoch": 3.08, + "grad_norm": 0.953125, + "learning_rate": 0.00048718074088700727, + "loss": 0.1799, + "step": 74240 + }, + { + "epoch": 3.08, + "grad_norm": 2.0, + "learning_rate": 0.00048717731240291543, + "loss": 0.1784, + "step": 74250 + }, + { + "epoch": 3.08, + "grad_norm": 1.15625, + "learning_rate": 0.00048717388347248135, + "loss": 0.2123, + "step": 74260 + }, + { + "epoch": 3.08, + "grad_norm": 0.55078125, + "learning_rate": 0.0004871704540957117, + "loss": 0.2307, + "step": 74270 + }, + { + "epoch": 3.08, + "grad_norm": 0.65234375, + "learning_rate": 0.00048716702427261294, + "loss": 0.2161, + "step": 74280 + }, + { + "epoch": 3.08, + "grad_norm": 0.82421875, + "learning_rate": 0.0004871635940031914, + "loss": 0.2187, + "step": 74290 + }, + { + "epoch": 3.08, + "grad_norm": 1.109375, + "learning_rate": 0.00048716016328745354, + "loss": 0.2815, + "step": 74300 + }, + { + "epoch": 3.08, + "grad_norm": 0.9453125, + "learning_rate": 0.0004871567321254059, + "loss": 0.2447, + "step": 74310 + }, + { + "epoch": 3.08, + "grad_norm": 0.62890625, + "learning_rate": 0.0004871533005170549, + "loss": 0.1874, + "step": 74320 + }, + { + "epoch": 3.08, + "grad_norm": 0.73046875, + "learning_rate": 0.00048714986846240706, + "loss": 0.2047, + "step": 74330 + }, + { + "epoch": 3.08, + "grad_norm": 1.015625, + "learning_rate": 0.0004871464359614687, + "loss": 0.1969, + "step": 74340 + }, + { + "epoch": 3.08, + "grad_norm": 0.9140625, + "learning_rate": 0.00048714300301424634, + "loss": 0.2407, + "step": 74350 + }, + { + "epoch": 3.08, + "grad_norm": 0.7109375, + "learning_rate": 0.0004871395696207465, + "loss": 0.1862, + "step": 74360 + }, + { + "epoch": 3.08, + "grad_norm": 0.53515625, + "learning_rate": 0.0004871361357809756, + "loss": 0.2237, + "step": 74370 + }, + { + "epoch": 3.08, + "grad_norm": 1.015625, + "learning_rate": 0.0004871327014949401, + "loss": 0.1938, + "step": 74380 + }, + { + "epoch": 3.08, + "grad_norm": 0.67578125, + "learning_rate": 0.00048712926676264647, + "loss": 0.193, + "step": 74390 + }, + { + "epoch": 3.08, + "grad_norm": 0.91015625, + "learning_rate": 0.00048712583158410115, + "loss": 0.1844, + "step": 74400 + }, + { + "epoch": 3.08, + "grad_norm": 0.62109375, + "learning_rate": 0.00048712239595931063, + "loss": 0.2097, + "step": 74410 + }, + { + "epoch": 3.08, + "grad_norm": 0.78515625, + "learning_rate": 0.00048711895988828136, + "loss": 0.2252, + "step": 74420 + }, + { + "epoch": 3.08, + "grad_norm": 0.703125, + "learning_rate": 0.00048711552337101985, + "loss": 0.2165, + "step": 74430 + }, + { + "epoch": 3.08, + "grad_norm": 0.6796875, + "learning_rate": 0.00048711208640753244, + "loss": 0.2026, + "step": 74440 + }, + { + "epoch": 3.08, + "grad_norm": 0.296875, + "learning_rate": 0.0004871086489978258, + "loss": 0.1961, + "step": 74450 + }, + { + "epoch": 3.08, + "grad_norm": 1.21875, + "learning_rate": 0.0004871052111419062, + "loss": 0.205, + "step": 74460 + }, + { + "epoch": 3.08, + "grad_norm": 0.49609375, + "learning_rate": 0.0004871017728397803, + "loss": 0.221, + "step": 74470 + }, + { + "epoch": 3.08, + "grad_norm": 0.69140625, + "learning_rate": 0.00048709833409145436, + "loss": 0.198, + "step": 74480 + }, + { + "epoch": 3.09, + "grad_norm": 0.7421875, + "learning_rate": 0.000487094894896935, + "loss": 0.2249, + "step": 74490 + }, + { + "epoch": 3.09, + "grad_norm": 0.30859375, + "learning_rate": 0.00048709145525622865, + "loss": 0.2476, + "step": 74500 + }, + { + "epoch": 3.09, + "grad_norm": 0.26171875, + "learning_rate": 0.0004870880151693418, + "loss": 0.2306, + "step": 74510 + }, + { + "epoch": 3.09, + "grad_norm": 0.9765625, + "learning_rate": 0.00048708457463628093, + "loss": 0.2395, + "step": 74520 + }, + { + "epoch": 3.09, + "grad_norm": 0.67578125, + "learning_rate": 0.00048708113365705243, + "loss": 0.1723, + "step": 74530 + }, + { + "epoch": 3.09, + "grad_norm": 0.41015625, + "learning_rate": 0.00048707769223166287, + "loss": 0.2081, + "step": 74540 + }, + { + "epoch": 3.09, + "grad_norm": 0.77734375, + "learning_rate": 0.0004870742503601187, + "loss": 0.1828, + "step": 74550 + }, + { + "epoch": 3.09, + "grad_norm": 0.85546875, + "learning_rate": 0.0004870708080424264, + "loss": 0.2185, + "step": 74560 + }, + { + "epoch": 3.09, + "grad_norm": 1.4609375, + "learning_rate": 0.0004870673652785924, + "loss": 0.2814, + "step": 74570 + }, + { + "epoch": 3.09, + "grad_norm": 0.26171875, + "learning_rate": 0.00048706392206862326, + "loss": 0.2318, + "step": 74580 + }, + { + "epoch": 3.09, + "grad_norm": 1.046875, + "learning_rate": 0.00048706047841252546, + "loss": 0.1994, + "step": 74590 + }, + { + "epoch": 3.09, + "grad_norm": 0.56640625, + "learning_rate": 0.00048705703431030535, + "loss": 0.2512, + "step": 74600 + }, + { + "epoch": 3.09, + "grad_norm": 0.6171875, + "learning_rate": 0.00048705358976196955, + "loss": 0.2478, + "step": 74610 + }, + { + "epoch": 3.09, + "grad_norm": 0.6796875, + "learning_rate": 0.0004870501447675245, + "loss": 0.1924, + "step": 74620 + }, + { + "epoch": 3.09, + "grad_norm": 0.8046875, + "learning_rate": 0.00048704669932697663, + "loss": 0.2074, + "step": 74630 + }, + { + "epoch": 3.09, + "grad_norm": 0.7109375, + "learning_rate": 0.0004870432534403325, + "loss": 0.2241, + "step": 74640 + }, + { + "epoch": 3.09, + "grad_norm": 0.625, + "learning_rate": 0.0004870398071075986, + "loss": 0.1863, + "step": 74650 + }, + { + "epoch": 3.09, + "grad_norm": 0.49609375, + "learning_rate": 0.00048703636032878134, + "loss": 0.1563, + "step": 74660 + }, + { + "epoch": 3.09, + "grad_norm": 0.75, + "learning_rate": 0.00048703291310388724, + "loss": 0.3047, + "step": 74670 + }, + { + "epoch": 3.09, + "grad_norm": 0.75, + "learning_rate": 0.0004870294654329228, + "loss": 0.201, + "step": 74680 + }, + { + "epoch": 3.09, + "grad_norm": 0.6875, + "learning_rate": 0.0004870260173158946, + "loss": 0.193, + "step": 74690 + }, + { + "epoch": 3.09, + "grad_norm": 0.66015625, + "learning_rate": 0.00048702256875280894, + "loss": 0.1976, + "step": 74700 + }, + { + "epoch": 3.09, + "grad_norm": 0.486328125, + "learning_rate": 0.00048701911974367244, + "loss": 0.2685, + "step": 74710 + }, + { + "epoch": 3.09, + "grad_norm": 0.5390625, + "learning_rate": 0.00048701567028849147, + "loss": 0.2463, + "step": 74720 + }, + { + "epoch": 3.1, + "grad_norm": 0.68359375, + "learning_rate": 0.00048701222038727266, + "loss": 0.1852, + "step": 74730 + }, + { + "epoch": 3.1, + "grad_norm": 1.921875, + "learning_rate": 0.0004870087700400224, + "loss": 0.2467, + "step": 74740 + }, + { + "epoch": 3.1, + "grad_norm": 0.53125, + "learning_rate": 0.0004870053192467473, + "loss": 0.239, + "step": 74750 + }, + { + "epoch": 3.1, + "grad_norm": 0.80859375, + "learning_rate": 0.0004870018680074537, + "loss": 0.176, + "step": 74760 + }, + { + "epoch": 3.1, + "grad_norm": 0.53515625, + "learning_rate": 0.00048699841632214824, + "loss": 0.2479, + "step": 74770 + }, + { + "epoch": 3.1, + "grad_norm": 0.75, + "learning_rate": 0.0004869949641908374, + "loss": 0.2113, + "step": 74780 + }, + { + "epoch": 3.1, + "grad_norm": 0.3203125, + "learning_rate": 0.00048699151161352757, + "loss": 0.2088, + "step": 74790 + }, + { + "epoch": 3.1, + "grad_norm": 1.265625, + "learning_rate": 0.00048698805859022533, + "loss": 0.2537, + "step": 74800 + }, + { + "epoch": 3.1, + "grad_norm": 0.94921875, + "learning_rate": 0.0004869846051209371, + "loss": 0.2545, + "step": 74810 + }, + { + "epoch": 3.1, + "grad_norm": 0.84375, + "learning_rate": 0.0004869811512056695, + "loss": 0.1687, + "step": 74820 + }, + { + "epoch": 3.1, + "grad_norm": 0.71875, + "learning_rate": 0.00048697769684442893, + "loss": 0.2517, + "step": 74830 + }, + { + "epoch": 3.1, + "grad_norm": 0.8984375, + "learning_rate": 0.00048697424203722196, + "loss": 0.2569, + "step": 74840 + }, + { + "epoch": 3.1, + "grad_norm": 0.5234375, + "learning_rate": 0.00048697078678405504, + "loss": 0.2825, + "step": 74850 + }, + { + "epoch": 3.1, + "grad_norm": 0.8984375, + "learning_rate": 0.0004869673310849347, + "loss": 0.2246, + "step": 74860 + }, + { + "epoch": 3.1, + "grad_norm": 0.84765625, + "learning_rate": 0.0004869638749398674, + "loss": 0.2391, + "step": 74870 + }, + { + "epoch": 3.1, + "grad_norm": 0.53515625, + "learning_rate": 0.00048696041834885966, + "loss": 0.2607, + "step": 74880 + }, + { + "epoch": 3.1, + "grad_norm": 0.5703125, + "learning_rate": 0.00048695696131191807, + "loss": 0.2377, + "step": 74890 + }, + { + "epoch": 3.1, + "grad_norm": 0.80859375, + "learning_rate": 0.000486953503829049, + "loss": 0.1982, + "step": 74900 + }, + { + "epoch": 3.1, + "grad_norm": 0.5625, + "learning_rate": 0.0004869500459002591, + "loss": 0.1778, + "step": 74910 + }, + { + "epoch": 3.1, + "grad_norm": 0.203125, + "learning_rate": 0.0004869465875255548, + "loss": 0.2137, + "step": 74920 + }, + { + "epoch": 3.1, + "grad_norm": 0.48828125, + "learning_rate": 0.0004869431287049425, + "loss": 0.2208, + "step": 74930 + }, + { + "epoch": 3.1, + "grad_norm": 0.6953125, + "learning_rate": 0.00048693966943842893, + "loss": 0.225, + "step": 74940 + }, + { + "epoch": 3.1, + "grad_norm": 0.271484375, + "learning_rate": 0.0004869362097260205, + "loss": 0.2365, + "step": 74950 + }, + { + "epoch": 3.1, + "grad_norm": 0.259765625, + "learning_rate": 0.0004869327495677236, + "loss": 0.2478, + "step": 74960 + }, + { + "epoch": 3.11, + "grad_norm": 0.72265625, + "learning_rate": 0.0004869292889635449, + "loss": 0.2016, + "step": 74970 + }, + { + "epoch": 3.11, + "grad_norm": 1.0546875, + "learning_rate": 0.00048692582791349094, + "loss": 0.2145, + "step": 74980 + }, + { + "epoch": 3.11, + "grad_norm": 0.671875, + "learning_rate": 0.00048692236641756804, + "loss": 0.186, + "step": 74990 + }, + { + "epoch": 3.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00048691890447578293, + "loss": 0.2622, + "step": 75000 + }, + { + "epoch": 3.11, + "grad_norm": 0.93359375, + "learning_rate": 0.00048691544208814196, + "loss": 0.1908, + "step": 75010 + }, + { + "epoch": 3.11, + "grad_norm": 0.60546875, + "learning_rate": 0.00048691197925465173, + "loss": 0.1609, + "step": 75020 + }, + { + "epoch": 3.11, + "grad_norm": 0.72265625, + "learning_rate": 0.00048690851597531877, + "loss": 0.2121, + "step": 75030 + }, + { + "epoch": 3.11, + "grad_norm": 0.6015625, + "learning_rate": 0.0004869050522501495, + "loss": 0.2386, + "step": 75040 + }, + { + "epoch": 3.11, + "grad_norm": 0.86328125, + "learning_rate": 0.00048690158807915057, + "loss": 0.2522, + "step": 75050 + }, + { + "epoch": 3.11, + "grad_norm": 0.81640625, + "learning_rate": 0.0004868981234623284, + "loss": 0.2329, + "step": 75060 + }, + { + "epoch": 3.11, + "grad_norm": 0.6640625, + "learning_rate": 0.0004868946583996896, + "loss": 0.2143, + "step": 75070 + }, + { + "epoch": 3.11, + "grad_norm": 0.671875, + "learning_rate": 0.00048689119289124056, + "loss": 0.2056, + "step": 75080 + }, + { + "epoch": 3.11, + "grad_norm": 0.62890625, + "learning_rate": 0.0004868877269369879, + "loss": 0.2274, + "step": 75090 + }, + { + "epoch": 3.11, + "grad_norm": 1.1953125, + "learning_rate": 0.00048688426053693814, + "loss": 0.2926, + "step": 75100 + }, + { + "epoch": 3.11, + "grad_norm": 0.97265625, + "learning_rate": 0.0004868807936910978, + "loss": 0.2211, + "step": 75110 + }, + { + "epoch": 3.11, + "grad_norm": 1.1484375, + "learning_rate": 0.00048687732639947335, + "loss": 0.2214, + "step": 75120 + }, + { + "epoch": 3.11, + "grad_norm": 0.5, + "learning_rate": 0.0004868738586620713, + "loss": 0.1772, + "step": 75130 + }, + { + "epoch": 3.11, + "grad_norm": 0.9375, + "learning_rate": 0.0004868703904788984, + "loss": 0.2388, + "step": 75140 + }, + { + "epoch": 3.11, + "grad_norm": 0.25, + "learning_rate": 0.00048686692184996087, + "loss": 0.1368, + "step": 75150 + }, + { + "epoch": 3.11, + "grad_norm": 0.66015625, + "learning_rate": 0.00048686345277526534, + "loss": 0.1473, + "step": 75160 + }, + { + "epoch": 3.11, + "grad_norm": 1.34375, + "learning_rate": 0.00048685998325481846, + "loss": 0.2187, + "step": 75170 + }, + { + "epoch": 3.11, + "grad_norm": 0.69140625, + "learning_rate": 0.00048685651328862664, + "loss": 0.234, + "step": 75180 + }, + { + "epoch": 3.11, + "grad_norm": 0.376953125, + "learning_rate": 0.0004868530428766964, + "loss": 0.2204, + "step": 75190 + }, + { + "epoch": 3.11, + "grad_norm": 0.42578125, + "learning_rate": 0.0004868495720190343, + "loss": 0.2326, + "step": 75200 + }, + { + "epoch": 3.12, + "grad_norm": 0.90234375, + "learning_rate": 0.00048684610071564707, + "loss": 0.2412, + "step": 75210 + }, + { + "epoch": 3.12, + "grad_norm": 1.015625, + "learning_rate": 0.0004868426289665409, + "loss": 0.1761, + "step": 75220 + }, + { + "epoch": 3.12, + "grad_norm": 0.70703125, + "learning_rate": 0.0004868391567717225, + "loss": 0.2289, + "step": 75230 + }, + { + "epoch": 3.12, + "grad_norm": 0.80078125, + "learning_rate": 0.00048683568413119833, + "loss": 0.2197, + "step": 75240 + }, + { + "epoch": 3.12, + "grad_norm": 0.380859375, + "learning_rate": 0.00048683221104497504, + "loss": 0.1994, + "step": 75250 + }, + { + "epoch": 3.12, + "grad_norm": 0.002593994140625, + "learning_rate": 0.0004868287375130591, + "loss": 0.1754, + "step": 75260 + }, + { + "epoch": 3.12, + "grad_norm": 0.8046875, + "learning_rate": 0.00048682526353545705, + "loss": 0.2145, + "step": 75270 + }, + { + "epoch": 3.12, + "grad_norm": 0.298828125, + "learning_rate": 0.00048682178911217547, + "loss": 0.2454, + "step": 75280 + }, + { + "epoch": 3.12, + "grad_norm": 0.73046875, + "learning_rate": 0.0004868183142432208, + "loss": 0.2335, + "step": 75290 + }, + { + "epoch": 3.12, + "grad_norm": 0.97265625, + "learning_rate": 0.00048681483892859957, + "loss": 0.2151, + "step": 75300 + }, + { + "epoch": 3.12, + "grad_norm": 0.220703125, + "learning_rate": 0.0004868113631683185, + "loss": 0.1634, + "step": 75310 + }, + { + "epoch": 3.12, + "grad_norm": 0.75, + "learning_rate": 0.000486807886962384, + "loss": 0.2583, + "step": 75320 + }, + { + "epoch": 3.12, + "grad_norm": 0.61328125, + "learning_rate": 0.00048680441031080256, + "loss": 0.1783, + "step": 75330 + }, + { + "epoch": 3.12, + "grad_norm": 0.74609375, + "learning_rate": 0.00048680093321358086, + "loss": 0.2546, + "step": 75340 + }, + { + "epoch": 3.12, + "grad_norm": 0.412109375, + "learning_rate": 0.0004867974556707254, + "loss": 0.2167, + "step": 75350 + }, + { + "epoch": 3.12, + "grad_norm": 0.58203125, + "learning_rate": 0.00048679397768224263, + "loss": 0.1755, + "step": 75360 + }, + { + "epoch": 3.12, + "grad_norm": 0.419921875, + "learning_rate": 0.0004867904992481392, + "loss": 0.1679, + "step": 75370 + }, + { + "epoch": 3.12, + "grad_norm": 0.89453125, + "learning_rate": 0.0004867870203684216, + "loss": 0.2073, + "step": 75380 + }, + { + "epoch": 3.12, + "grad_norm": 0.67578125, + "learning_rate": 0.0004867835410430964, + "loss": 0.2396, + "step": 75390 + }, + { + "epoch": 3.12, + "grad_norm": 0.46875, + "learning_rate": 0.00048678006127217025, + "loss": 0.2361, + "step": 75400 + }, + { + "epoch": 3.12, + "grad_norm": 0.58203125, + "learning_rate": 0.00048677658105564947, + "loss": 0.2466, + "step": 75410 + }, + { + "epoch": 3.12, + "grad_norm": 0.6015625, + "learning_rate": 0.0004867731003935408, + "loss": 0.2032, + "step": 75420 + }, + { + "epoch": 3.12, + "grad_norm": 1.6328125, + "learning_rate": 0.0004867696192858507, + "loss": 0.2151, + "step": 75430 + }, + { + "epoch": 3.12, + "grad_norm": 0.9375, + "learning_rate": 0.0004867661377325857, + "loss": 0.2117, + "step": 75440 + }, + { + "epoch": 3.13, + "grad_norm": 0.609375, + "learning_rate": 0.00048676265573375247, + "loss": 0.168, + "step": 75450 + }, + { + "epoch": 3.13, + "grad_norm": 0.6640625, + "learning_rate": 0.0004867591732893575, + "loss": 0.2195, + "step": 75460 + }, + { + "epoch": 3.13, + "grad_norm": 0.96484375, + "learning_rate": 0.00048675569039940725, + "loss": 0.155, + "step": 75470 + }, + { + "epoch": 3.13, + "grad_norm": 0.88671875, + "learning_rate": 0.00048675220706390836, + "loss": 0.2309, + "step": 75480 + }, + { + "epoch": 3.13, + "grad_norm": 0.578125, + "learning_rate": 0.00048674872328286747, + "loss": 0.2205, + "step": 75490 + }, + { + "epoch": 3.13, + "grad_norm": 2.984375, + "learning_rate": 0.000486745239056291, + "loss": 0.2281, + "step": 75500 + }, + { + "epoch": 3.13, + "grad_norm": 0.92578125, + "learning_rate": 0.00048674175438418555, + "loss": 0.2466, + "step": 75510 + }, + { + "epoch": 3.13, + "grad_norm": 1.0703125, + "learning_rate": 0.00048673826926655774, + "loss": 0.2197, + "step": 75520 + }, + { + "epoch": 3.13, + "grad_norm": 0.88671875, + "learning_rate": 0.00048673478370341396, + "loss": 0.1929, + "step": 75530 + }, + { + "epoch": 3.13, + "grad_norm": 1.6640625, + "learning_rate": 0.000486731297694761, + "loss": 0.2061, + "step": 75540 + }, + { + "epoch": 3.13, + "grad_norm": 0.67578125, + "learning_rate": 0.00048672781124060524, + "loss": 0.2053, + "step": 75550 + }, + { + "epoch": 3.13, + "grad_norm": 0.29296875, + "learning_rate": 0.00048672432434095325, + "loss": 0.1714, + "step": 75560 + }, + { + "epoch": 3.13, + "grad_norm": 0.1923828125, + "learning_rate": 0.00048672083699581175, + "loss": 0.182, + "step": 75570 + }, + { + "epoch": 3.13, + "grad_norm": 0.455078125, + "learning_rate": 0.0004867173492051872, + "loss": 0.2254, + "step": 75580 + }, + { + "epoch": 3.13, + "grad_norm": 0.451171875, + "learning_rate": 0.0004867138609690861, + "loss": 0.1265, + "step": 75590 + }, + { + "epoch": 3.13, + "grad_norm": 0.412109375, + "learning_rate": 0.00048671037228751506, + "loss": 0.1455, + "step": 75600 + }, + { + "epoch": 3.13, + "grad_norm": 0.78125, + "learning_rate": 0.0004867068831604807, + "loss": 0.2166, + "step": 75610 + }, + { + "epoch": 3.13, + "grad_norm": 0.451171875, + "learning_rate": 0.0004867033935879895, + "loss": 0.1881, + "step": 75620 + }, + { + "epoch": 3.13, + "grad_norm": 0.47265625, + "learning_rate": 0.0004866999035700482, + "loss": 0.252, + "step": 75630 + }, + { + "epoch": 3.13, + "grad_norm": 0.0, + "learning_rate": 0.0004866964131066632, + "loss": 0.2433, + "step": 75640 + }, + { + "epoch": 3.13, + "grad_norm": 0.6640625, + "learning_rate": 0.00048669292219784104, + "loss": 0.169, + "step": 75650 + }, + { + "epoch": 3.13, + "grad_norm": 0.65234375, + "learning_rate": 0.0004866894308435884, + "loss": 0.2411, + "step": 75660 + }, + { + "epoch": 3.13, + "grad_norm": 0.55859375, + "learning_rate": 0.0004866859390439118, + "loss": 0.1801, + "step": 75670 + }, + { + "epoch": 3.13, + "grad_norm": 0.91796875, + "learning_rate": 0.00048668244679881783, + "loss": 0.2312, + "step": 75680 + }, + { + "epoch": 3.14, + "grad_norm": 1.328125, + "learning_rate": 0.00048667895410831307, + "loss": 0.2379, + "step": 75690 + }, + { + "epoch": 3.14, + "grad_norm": 0.4765625, + "learning_rate": 0.00048667546097240407, + "loss": 0.1836, + "step": 75700 + }, + { + "epoch": 3.14, + "grad_norm": 0.9765625, + "learning_rate": 0.00048667196739109745, + "loss": 0.1853, + "step": 75710 + }, + { + "epoch": 3.14, + "grad_norm": 0.671875, + "learning_rate": 0.0004866684733643997, + "loss": 0.1938, + "step": 75720 + }, + { + "epoch": 3.14, + "grad_norm": 0.177734375, + "learning_rate": 0.0004866649788923174, + "loss": 0.1804, + "step": 75730 + }, + { + "epoch": 3.14, + "grad_norm": 0.59765625, + "learning_rate": 0.0004866614839748572, + "loss": 0.242, + "step": 75740 + }, + { + "epoch": 3.14, + "grad_norm": 1.1796875, + "learning_rate": 0.0004866579886120257, + "loss": 0.2473, + "step": 75750 + }, + { + "epoch": 3.14, + "grad_norm": 0.609375, + "learning_rate": 0.00048665449280382937, + "loss": 0.2142, + "step": 75760 + }, + { + "epoch": 3.14, + "grad_norm": 0.48828125, + "learning_rate": 0.0004866509965502748, + "loss": 0.1966, + "step": 75770 + }, + { + "epoch": 3.14, + "grad_norm": 0.734375, + "learning_rate": 0.0004866474998513687, + "loss": 0.1611, + "step": 75780 + }, + { + "epoch": 3.14, + "grad_norm": 0.69921875, + "learning_rate": 0.0004866440027071175, + "loss": 0.2353, + "step": 75790 + }, + { + "epoch": 3.14, + "grad_norm": 0.294921875, + "learning_rate": 0.00048664050511752787, + "loss": 0.2218, + "step": 75800 + }, + { + "epoch": 3.14, + "grad_norm": 0.79296875, + "learning_rate": 0.00048663700708260636, + "loss": 0.2121, + "step": 75810 + }, + { + "epoch": 3.14, + "grad_norm": 0.51171875, + "learning_rate": 0.00048663350860235954, + "loss": 0.2174, + "step": 75820 + }, + { + "epoch": 3.14, + "grad_norm": 0.9609375, + "learning_rate": 0.00048663000967679403, + "loss": 0.2006, + "step": 75830 + }, + { + "epoch": 3.14, + "grad_norm": 0.6640625, + "learning_rate": 0.00048662651030591643, + "loss": 0.2396, + "step": 75840 + }, + { + "epoch": 3.14, + "grad_norm": 0.7734375, + "learning_rate": 0.0004866230104897332, + "loss": 0.2294, + "step": 75850 + }, + { + "epoch": 3.14, + "grad_norm": 0.435546875, + "learning_rate": 0.00048661951022825107, + "loss": 0.2376, + "step": 75860 + }, + { + "epoch": 3.14, + "grad_norm": 0.84375, + "learning_rate": 0.00048661600952147653, + "loss": 0.245, + "step": 75870 + }, + { + "epoch": 3.14, + "grad_norm": 0.33203125, + "learning_rate": 0.00048661250836941626, + "loss": 0.2192, + "step": 75880 + }, + { + "epoch": 3.14, + "grad_norm": 0.625, + "learning_rate": 0.00048660900677207687, + "loss": 0.2599, + "step": 75890 + }, + { + "epoch": 3.14, + "grad_norm": 0.58984375, + "learning_rate": 0.00048660550472946475, + "loss": 0.1729, + "step": 75900 + }, + { + "epoch": 3.14, + "grad_norm": 1.1484375, + "learning_rate": 0.0004866020022415867, + "loss": 0.2844, + "step": 75910 + }, + { + "epoch": 3.14, + "grad_norm": 0.78515625, + "learning_rate": 0.0004865984993084492, + "loss": 0.1972, + "step": 75920 + }, + { + "epoch": 3.15, + "grad_norm": 0.5234375, + "learning_rate": 0.0004865949959300589, + "loss": 0.1966, + "step": 75930 + }, + { + "epoch": 3.15, + "grad_norm": 0.671875, + "learning_rate": 0.0004865914921064224, + "loss": 0.1556, + "step": 75940 + }, + { + "epoch": 3.15, + "grad_norm": 0.8984375, + "learning_rate": 0.0004865879878375462, + "loss": 0.2096, + "step": 75950 + }, + { + "epoch": 3.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00048658448312343696, + "loss": 0.1934, + "step": 75960 + }, + { + "epoch": 3.15, + "grad_norm": 0.8203125, + "learning_rate": 0.00048658097796410127, + "loss": 0.2166, + "step": 75970 + }, + { + "epoch": 3.15, + "grad_norm": 0.6171875, + "learning_rate": 0.00048657747235954577, + "loss": 0.1357, + "step": 75980 + }, + { + "epoch": 3.15, + "grad_norm": 1.9765625, + "learning_rate": 0.00048657396630977695, + "loss": 0.198, + "step": 75990 + }, + { + "epoch": 3.15, + "grad_norm": 2.265625, + "learning_rate": 0.00048657045981480155, + "loss": 0.212, + "step": 76000 + }, + { + "epoch": 3.15, + "grad_norm": 0.515625, + "learning_rate": 0.00048656695287462606, + "loss": 0.2087, + "step": 76010 + }, + { + "epoch": 3.15, + "grad_norm": 0.578125, + "learning_rate": 0.000486563445489257, + "loss": 0.2687, + "step": 76020 + }, + { + "epoch": 3.15, + "grad_norm": 0.83984375, + "learning_rate": 0.0004865599376587012, + "loss": 0.2446, + "step": 76030 + }, + { + "epoch": 3.15, + "grad_norm": 1.1171875, + "learning_rate": 0.0004865564293829652, + "loss": 0.2476, + "step": 76040 + }, + { + "epoch": 3.15, + "grad_norm": 0.92578125, + "learning_rate": 0.00048655292066205545, + "loss": 0.1841, + "step": 76050 + }, + { + "epoch": 3.15, + "grad_norm": 0.3046875, + "learning_rate": 0.00048654941149597864, + "loss": 0.2098, + "step": 76060 + }, + { + "epoch": 3.15, + "grad_norm": 0.74609375, + "learning_rate": 0.00048654590188474143, + "loss": 0.2198, + "step": 76070 + }, + { + "epoch": 3.15, + "grad_norm": 0.91015625, + "learning_rate": 0.0004865423918283504, + "loss": 0.2651, + "step": 76080 + }, + { + "epoch": 3.15, + "grad_norm": 0.46484375, + "learning_rate": 0.00048653888132681205, + "loss": 0.2216, + "step": 76090 + }, + { + "epoch": 3.15, + "grad_norm": 0.80859375, + "learning_rate": 0.00048653537038013316, + "loss": 0.2415, + "step": 76100 + }, + { + "epoch": 3.15, + "grad_norm": 0.765625, + "learning_rate": 0.0004865318589883202, + "loss": 0.2485, + "step": 76110 + }, + { + "epoch": 3.15, + "grad_norm": 0.84765625, + "learning_rate": 0.0004865283471513798, + "loss": 0.1855, + "step": 76120 + }, + { + "epoch": 3.15, + "grad_norm": 0.53515625, + "learning_rate": 0.00048652483486931865, + "loss": 0.2288, + "step": 76130 + }, + { + "epoch": 3.15, + "grad_norm": 0.82421875, + "learning_rate": 0.00048652132214214327, + "loss": 0.1923, + "step": 76140 + }, + { + "epoch": 3.15, + "grad_norm": 0.70703125, + "learning_rate": 0.0004865178089698603, + "loss": 0.1532, + "step": 76150 + }, + { + "epoch": 3.15, + "grad_norm": 0.4453125, + "learning_rate": 0.0004865142953524764, + "loss": 0.2525, + "step": 76160 + }, + { + "epoch": 3.15, + "grad_norm": 0.158203125, + "learning_rate": 0.00048651078128999803, + "loss": 0.1827, + "step": 76170 + }, + { + "epoch": 3.16, + "grad_norm": 0.357421875, + "learning_rate": 0.00048650726678243204, + "loss": 0.1746, + "step": 76180 + }, + { + "epoch": 3.16, + "grad_norm": 0.640625, + "learning_rate": 0.0004865037518297848, + "loss": 0.2618, + "step": 76190 + }, + { + "epoch": 3.16, + "grad_norm": 0.7734375, + "learning_rate": 0.0004865002364320631, + "loss": 0.239, + "step": 76200 + }, + { + "epoch": 3.16, + "grad_norm": 0.6796875, + "learning_rate": 0.0004864967205892735, + "loss": 0.2202, + "step": 76210 + }, + { + "epoch": 3.16, + "grad_norm": 0.58984375, + "learning_rate": 0.00048649320430142263, + "loss": 0.2463, + "step": 76220 + }, + { + "epoch": 3.16, + "grad_norm": 0.875, + "learning_rate": 0.00048648968756851704, + "loss": 0.2251, + "step": 76230 + }, + { + "epoch": 3.16, + "grad_norm": 0.6171875, + "learning_rate": 0.0004864861703905634, + "loss": 0.1798, + "step": 76240 + }, + { + "epoch": 3.16, + "grad_norm": 0.703125, + "learning_rate": 0.00048648265276756835, + "loss": 0.1844, + "step": 76250 + }, + { + "epoch": 3.16, + "grad_norm": 0.8125, + "learning_rate": 0.00048647913469953847, + "loss": 0.251, + "step": 76260 + }, + { + "epoch": 3.16, + "grad_norm": 0.37109375, + "learning_rate": 0.0004864756161864804, + "loss": 0.2283, + "step": 76270 + }, + { + "epoch": 3.16, + "grad_norm": 0.5390625, + "learning_rate": 0.0004864720972284008, + "loss": 0.2399, + "step": 76280 + }, + { + "epoch": 3.16, + "grad_norm": 0.375, + "learning_rate": 0.0004864685778253062, + "loss": 0.2525, + "step": 76290 + }, + { + "epoch": 3.16, + "grad_norm": 0.5703125, + "learning_rate": 0.0004864650579772033, + "loss": 0.2411, + "step": 76300 + }, + { + "epoch": 3.16, + "grad_norm": 0.353515625, + "learning_rate": 0.0004864615376840986, + "loss": 0.1988, + "step": 76310 + }, + { + "epoch": 3.16, + "grad_norm": 0.53125, + "learning_rate": 0.00048645801694599896, + "loss": 0.2547, + "step": 76320 + }, + { + "epoch": 3.16, + "grad_norm": 0.703125, + "learning_rate": 0.0004864544957629108, + "loss": 0.238, + "step": 76330 + }, + { + "epoch": 3.16, + "grad_norm": 1.0234375, + "learning_rate": 0.00048645097413484086, + "loss": 0.2317, + "step": 76340 + }, + { + "epoch": 3.16, + "grad_norm": 1.0390625, + "learning_rate": 0.00048644745206179564, + "loss": 0.2216, + "step": 76350 + }, + { + "epoch": 3.16, + "grad_norm": 0.349609375, + "learning_rate": 0.00048644392954378193, + "loss": 0.218, + "step": 76360 + }, + { + "epoch": 3.16, + "grad_norm": 0.0, + "learning_rate": 0.0004864404065808062, + "loss": 0.1763, + "step": 76370 + }, + { + "epoch": 3.16, + "grad_norm": 0.33984375, + "learning_rate": 0.0004864368831728752, + "loss": 0.2447, + "step": 76380 + }, + { + "epoch": 3.16, + "grad_norm": 1.1484375, + "learning_rate": 0.00048643335931999556, + "loss": 0.1969, + "step": 76390 + }, + { + "epoch": 3.16, + "grad_norm": 0.4765625, + "learning_rate": 0.00048642983502217383, + "loss": 0.1879, + "step": 76400 + }, + { + "epoch": 3.16, + "grad_norm": 0.65625, + "learning_rate": 0.0004864263102794166, + "loss": 0.1922, + "step": 76410 + }, + { + "epoch": 3.17, + "grad_norm": 0.82421875, + "learning_rate": 0.0004864227850917307, + "loss": 0.2029, + "step": 76420 + }, + { + "epoch": 3.17, + "grad_norm": 1.03125, + "learning_rate": 0.0004864192594591226, + "loss": 0.2089, + "step": 76430 + }, + { + "epoch": 3.17, + "grad_norm": 0.5234375, + "learning_rate": 0.00048641573338159907, + "loss": 0.2173, + "step": 76440 + }, + { + "epoch": 3.17, + "grad_norm": 0.50390625, + "learning_rate": 0.00048641220685916655, + "loss": 0.256, + "step": 76450 + }, + { + "epoch": 3.17, + "grad_norm": 0.53125, + "learning_rate": 0.0004864086798918319, + "loss": 0.2652, + "step": 76460 + }, + { + "epoch": 3.17, + "grad_norm": 0.8203125, + "learning_rate": 0.0004864051524796015, + "loss": 0.1905, + "step": 76470 + }, + { + "epoch": 3.17, + "grad_norm": 0.5078125, + "learning_rate": 0.00048640162462248223, + "loss": 0.2436, + "step": 76480 + }, + { + "epoch": 3.17, + "grad_norm": 0.73828125, + "learning_rate": 0.0004863980963204806, + "loss": 0.2101, + "step": 76490 + }, + { + "epoch": 3.17, + "grad_norm": 0.361328125, + "learning_rate": 0.00048639456757360326, + "loss": 0.231, + "step": 76500 + }, + { + "epoch": 3.17, + "grad_norm": 0.77734375, + "learning_rate": 0.00048639103838185696, + "loss": 0.2016, + "step": 76510 + }, + { + "epoch": 3.17, + "grad_norm": 0.416015625, + "learning_rate": 0.00048638750874524815, + "loss": 0.2291, + "step": 76520 + }, + { + "epoch": 3.17, + "grad_norm": 0.62109375, + "learning_rate": 0.00048638397866378365, + "loss": 0.1993, + "step": 76530 + }, + { + "epoch": 3.17, + "grad_norm": 0.80078125, + "learning_rate": 0.00048638044813746997, + "loss": 0.2237, + "step": 76540 + }, + { + "epoch": 3.17, + "grad_norm": 0.30859375, + "learning_rate": 0.0004863769171663139, + "loss": 0.1843, + "step": 76550 + }, + { + "epoch": 3.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00048637338575032194, + "loss": 0.1589, + "step": 76560 + }, + { + "epoch": 3.17, + "grad_norm": 0.85546875, + "learning_rate": 0.0004863698538895008, + "loss": 0.1967, + "step": 76570 + }, + { + "epoch": 3.17, + "grad_norm": 0.70703125, + "learning_rate": 0.00048636632158385705, + "loss": 0.2564, + "step": 76580 + }, + { + "epoch": 3.17, + "grad_norm": 0.4765625, + "learning_rate": 0.00048636278883339753, + "loss": 0.2277, + "step": 76590 + }, + { + "epoch": 3.17, + "grad_norm": 0.74609375, + "learning_rate": 0.0004863592556381287, + "loss": 0.1709, + "step": 76600 + }, + { + "epoch": 3.17, + "grad_norm": 0.8046875, + "learning_rate": 0.00048635572199805726, + "loss": 0.21, + "step": 76610 + }, + { + "epoch": 3.17, + "grad_norm": 1.1953125, + "learning_rate": 0.0004863521879131899, + "loss": 0.1604, + "step": 76620 + }, + { + "epoch": 3.17, + "grad_norm": 0.283203125, + "learning_rate": 0.0004863486533835333, + "loss": 0.1937, + "step": 76630 + }, + { + "epoch": 3.17, + "grad_norm": 0.83203125, + "learning_rate": 0.000486345118409094, + "loss": 0.2028, + "step": 76640 + }, + { + "epoch": 3.17, + "grad_norm": 1.015625, + "learning_rate": 0.00048634158298987876, + "loss": 0.2475, + "step": 76650 + }, + { + "epoch": 3.18, + "grad_norm": 1.1953125, + "learning_rate": 0.00048633804712589407, + "loss": 0.1718, + "step": 76660 + }, + { + "epoch": 3.18, + "grad_norm": 1.078125, + "learning_rate": 0.0004863345108171468, + "loss": 0.1373, + "step": 76670 + }, + { + "epoch": 3.18, + "grad_norm": 0.71875, + "learning_rate": 0.0004863309740636435, + "loss": 0.2065, + "step": 76680 + }, + { + "epoch": 3.18, + "grad_norm": 0.76953125, + "learning_rate": 0.00048632743686539075, + "loss": 0.2192, + "step": 76690 + }, + { + "epoch": 3.18, + "grad_norm": 0.58203125, + "learning_rate": 0.0004863238992223953, + "loss": 0.2389, + "step": 76700 + }, + { + "epoch": 3.18, + "grad_norm": 1.328125, + "learning_rate": 0.0004863203611346639, + "loss": 0.2186, + "step": 76710 + }, + { + "epoch": 3.18, + "grad_norm": 0.88671875, + "learning_rate": 0.000486316822602203, + "loss": 0.2226, + "step": 76720 + }, + { + "epoch": 3.18, + "grad_norm": 1.3515625, + "learning_rate": 0.00048631328362501935, + "loss": 0.2062, + "step": 76730 + }, + { + "epoch": 3.18, + "grad_norm": 0.59765625, + "learning_rate": 0.0004863097442031197, + "loss": 0.2319, + "step": 76740 + }, + { + "epoch": 3.18, + "grad_norm": 0.55078125, + "learning_rate": 0.00048630620433651065, + "loss": 0.218, + "step": 76750 + }, + { + "epoch": 3.18, + "grad_norm": 0.76171875, + "learning_rate": 0.00048630266402519874, + "loss": 0.1936, + "step": 76760 + }, + { + "epoch": 3.18, + "grad_norm": 1.0546875, + "learning_rate": 0.0004862991232691908, + "loss": 0.2427, + "step": 76770 + }, + { + "epoch": 3.18, + "grad_norm": 0.796875, + "learning_rate": 0.0004862955820684933, + "loss": 0.1961, + "step": 76780 + }, + { + "epoch": 3.18, + "grad_norm": 1.0234375, + "learning_rate": 0.0004862920404231132, + "loss": 0.2183, + "step": 76790 + }, + { + "epoch": 3.18, + "grad_norm": 0.828125, + "learning_rate": 0.0004862884983330569, + "loss": 0.209, + "step": 76800 + }, + { + "epoch": 3.18, + "grad_norm": 1.2421875, + "learning_rate": 0.0004862849557983312, + "loss": 0.2853, + "step": 76810 + }, + { + "epoch": 3.18, + "grad_norm": 1.2890625, + "learning_rate": 0.0004862814128189428, + "loss": 0.2384, + "step": 76820 + }, + { + "epoch": 3.18, + "grad_norm": 0.484375, + "learning_rate": 0.00048627786939489816, + "loss": 0.2078, + "step": 76830 + }, + { + "epoch": 3.18, + "grad_norm": 0.490234375, + "learning_rate": 0.00048627432552620416, + "loss": 0.1949, + "step": 76840 + }, + { + "epoch": 3.18, + "grad_norm": 0.2236328125, + "learning_rate": 0.00048627078121286736, + "loss": 0.2333, + "step": 76850 + }, + { + "epoch": 3.18, + "grad_norm": 0.1962890625, + "learning_rate": 0.00048626723645489454, + "loss": 0.2505, + "step": 76860 + }, + { + "epoch": 3.18, + "grad_norm": 0.9375, + "learning_rate": 0.0004862636912522922, + "loss": 0.2375, + "step": 76870 + }, + { + "epoch": 3.18, + "grad_norm": 0.97265625, + "learning_rate": 0.00048626014560506714, + "loss": 0.2086, + "step": 76880 + }, + { + "epoch": 3.18, + "grad_norm": 0.416015625, + "learning_rate": 0.000486256599513226, + "loss": 0.2056, + "step": 76890 + }, + { + "epoch": 3.19, + "grad_norm": 0.474609375, + "learning_rate": 0.00048625305297677544, + "loss": 0.1589, + "step": 76900 + }, + { + "epoch": 3.19, + "grad_norm": 0.578125, + "learning_rate": 0.0004862495059957223, + "loss": 0.23, + "step": 76910 + }, + { + "epoch": 3.19, + "grad_norm": 1.46875, + "learning_rate": 0.00048624595857007293, + "loss": 0.2286, + "step": 76920 + }, + { + "epoch": 3.19, + "grad_norm": 0.58203125, + "learning_rate": 0.00048624241069983423, + "loss": 0.1811, + "step": 76930 + }, + { + "epoch": 3.19, + "grad_norm": 1.8984375, + "learning_rate": 0.0004862388623850128, + "loss": 0.2829, + "step": 76940 + }, + { + "epoch": 3.19, + "grad_norm": 0.5859375, + "learning_rate": 0.00048623531362561534, + "loss": 0.209, + "step": 76950 + }, + { + "epoch": 3.19, + "grad_norm": 1.078125, + "learning_rate": 0.0004862317644216486, + "loss": 0.2462, + "step": 76960 + }, + { + "epoch": 3.19, + "grad_norm": 0.458984375, + "learning_rate": 0.00048622821477311906, + "loss": 0.214, + "step": 76970 + }, + { + "epoch": 3.19, + "grad_norm": 0.765625, + "learning_rate": 0.00048622466468003367, + "loss": 0.223, + "step": 76980 + }, + { + "epoch": 3.19, + "grad_norm": 1.1171875, + "learning_rate": 0.0004862211141423989, + "loss": 0.19, + "step": 76990 + }, + { + "epoch": 3.19, + "grad_norm": 0.625, + "learning_rate": 0.00048621756316022147, + "loss": 0.2022, + "step": 77000 + }, + { + "epoch": 3.19, + "grad_norm": 0.318359375, + "learning_rate": 0.0004862140117335082, + "loss": 0.1568, + "step": 77010 + }, + { + "epoch": 3.19, + "grad_norm": 1.2421875, + "learning_rate": 0.0004862104598622656, + "loss": 0.2239, + "step": 77020 + }, + { + "epoch": 3.19, + "grad_norm": 0.5859375, + "learning_rate": 0.00048620690754650045, + "loss": 0.2055, + "step": 77030 + }, + { + "epoch": 3.19, + "grad_norm": 0.419921875, + "learning_rate": 0.00048620335478621933, + "loss": 0.2026, + "step": 77040 + }, + { + "epoch": 3.19, + "grad_norm": 0.70703125, + "learning_rate": 0.00048619980158142915, + "loss": 0.2013, + "step": 77050 + }, + { + "epoch": 3.19, + "grad_norm": 0.490234375, + "learning_rate": 0.00048619624793213635, + "loss": 0.2007, + "step": 77060 + }, + { + "epoch": 3.19, + "grad_norm": 0.6015625, + "learning_rate": 0.00048619269383834776, + "loss": 0.2575, + "step": 77070 + }, + { + "epoch": 3.19, + "grad_norm": 0.3203125, + "learning_rate": 0.0004861891393000699, + "loss": 0.2202, + "step": 77080 + }, + { + "epoch": 3.19, + "grad_norm": 0.2392578125, + "learning_rate": 0.00048618558431730974, + "loss": 0.1805, + "step": 77090 + }, + { + "epoch": 3.19, + "grad_norm": 1.828125, + "learning_rate": 0.0004861820288900738, + "loss": 0.1773, + "step": 77100 + }, + { + "epoch": 3.19, + "grad_norm": 0.5390625, + "learning_rate": 0.0004861784730183687, + "loss": 0.26, + "step": 77110 + }, + { + "epoch": 3.19, + "grad_norm": 0.5859375, + "learning_rate": 0.00048617491670220126, + "loss": 0.2319, + "step": 77120 + }, + { + "epoch": 3.19, + "grad_norm": 1.0, + "learning_rate": 0.00048617135994157815, + "loss": 0.2319, + "step": 77130 + }, + { + "epoch": 3.2, + "grad_norm": 0.62109375, + "learning_rate": 0.0004861678027365061, + "loss": 0.229, + "step": 77140 + }, + { + "epoch": 3.2, + "grad_norm": 3.671875, + "learning_rate": 0.00048616424508699164, + "loss": 0.2383, + "step": 77150 + }, + { + "epoch": 3.2, + "grad_norm": 0.63671875, + "learning_rate": 0.00048616068699304163, + "loss": 0.2235, + "step": 77160 + }, + { + "epoch": 3.2, + "grad_norm": 0.59765625, + "learning_rate": 0.0004861571284546626, + "loss": 0.2354, + "step": 77170 + }, + { + "epoch": 3.2, + "grad_norm": 0.83984375, + "learning_rate": 0.0004861535694718615, + "loss": 0.2078, + "step": 77180 + }, + { + "epoch": 3.2, + "grad_norm": 1.1875, + "learning_rate": 0.0004861500100446449, + "loss": 0.2276, + "step": 77190 + }, + { + "epoch": 3.2, + "grad_norm": 0.6171875, + "learning_rate": 0.0004861464501730194, + "loss": 0.1749, + "step": 77200 + }, + { + "epoch": 3.2, + "grad_norm": 0.5, + "learning_rate": 0.00048614288985699187, + "loss": 0.206, + "step": 77210 + }, + { + "epoch": 3.2, + "grad_norm": 0.2451171875, + "learning_rate": 0.00048613932909656875, + "loss": 0.2305, + "step": 77220 + }, + { + "epoch": 3.2, + "grad_norm": 1.09375, + "learning_rate": 0.0004861357678917571, + "loss": 0.2306, + "step": 77230 + }, + { + "epoch": 3.2, + "grad_norm": 0.412109375, + "learning_rate": 0.0004861322062425633, + "loss": 0.1811, + "step": 77240 + }, + { + "epoch": 3.2, + "grad_norm": 0.7265625, + "learning_rate": 0.0004861286441489943, + "loss": 0.1609, + "step": 77250 + }, + { + "epoch": 3.2, + "grad_norm": 0.7421875, + "learning_rate": 0.00048612508161105663, + "loss": 0.2148, + "step": 77260 + }, + { + "epoch": 3.2, + "grad_norm": 0.6015625, + "learning_rate": 0.0004861215186287571, + "loss": 0.2357, + "step": 77270 + }, + { + "epoch": 3.2, + "grad_norm": 0.76171875, + "learning_rate": 0.0004861179552021023, + "loss": 0.2614, + "step": 77280 + }, + { + "epoch": 3.2, + "grad_norm": 0.5625, + "learning_rate": 0.00048611439133109904, + "loss": 0.2342, + "step": 77290 + }, + { + "epoch": 3.2, + "grad_norm": 1.125, + "learning_rate": 0.000486110827015754, + "loss": 0.1917, + "step": 77300 + }, + { + "epoch": 3.2, + "grad_norm": 0.26953125, + "learning_rate": 0.00048610726225607384, + "loss": 0.249, + "step": 77310 + }, + { + "epoch": 3.2, + "grad_norm": 1.4921875, + "learning_rate": 0.00048610369705206536, + "loss": 0.1946, + "step": 77320 + }, + { + "epoch": 3.2, + "grad_norm": 0.7265625, + "learning_rate": 0.0004861001314037352, + "loss": 0.1932, + "step": 77330 + }, + { + "epoch": 3.2, + "grad_norm": 0.96875, + "learning_rate": 0.0004860965653110901, + "loss": 0.2193, + "step": 77340 + }, + { + "epoch": 3.2, + "grad_norm": 0.4140625, + "learning_rate": 0.0004860929987741368, + "loss": 0.1851, + "step": 77350 + }, + { + "epoch": 3.2, + "grad_norm": 0.6953125, + "learning_rate": 0.0004860894317928819, + "loss": 0.2093, + "step": 77360 + }, + { + "epoch": 3.2, + "grad_norm": 0.52734375, + "learning_rate": 0.00048608586436733214, + "loss": 0.2564, + "step": 77370 + }, + { + "epoch": 3.21, + "grad_norm": 0.6640625, + "learning_rate": 0.00048608229649749435, + "loss": 0.17, + "step": 77380 + }, + { + "epoch": 3.21, + "grad_norm": 0.39453125, + "learning_rate": 0.00048607872818337523, + "loss": 0.2455, + "step": 77390 + }, + { + "epoch": 3.21, + "grad_norm": 0.384765625, + "learning_rate": 0.0004860751594249814, + "loss": 0.1291, + "step": 77400 + }, + { + "epoch": 3.21, + "grad_norm": 0.435546875, + "learning_rate": 0.00048607159022231956, + "loss": 0.2527, + "step": 77410 + }, + { + "epoch": 3.21, + "grad_norm": 1.625, + "learning_rate": 0.0004860680205753965, + "loss": 0.2725, + "step": 77420 + }, + { + "epoch": 3.21, + "grad_norm": 0.279296875, + "learning_rate": 0.00048606445048421897, + "loss": 0.2224, + "step": 77430 + }, + { + "epoch": 3.21, + "grad_norm": 1.015625, + "learning_rate": 0.00048606087994879354, + "loss": 0.2558, + "step": 77440 + }, + { + "epoch": 3.21, + "grad_norm": 0.44140625, + "learning_rate": 0.00048605730896912716, + "loss": 0.2822, + "step": 77450 + }, + { + "epoch": 3.21, + "grad_norm": 0.279296875, + "learning_rate": 0.0004860537375452263, + "loss": 0.2012, + "step": 77460 + }, + { + "epoch": 3.21, + "grad_norm": 0.84375, + "learning_rate": 0.0004860501656770978, + "loss": 0.2143, + "step": 77470 + }, + { + "epoch": 3.21, + "grad_norm": 0.73046875, + "learning_rate": 0.0004860465933647484, + "loss": 0.2435, + "step": 77480 + }, + { + "epoch": 3.21, + "grad_norm": 0.66796875, + "learning_rate": 0.0004860430206081849, + "loss": 0.2006, + "step": 77490 + }, + { + "epoch": 3.21, + "grad_norm": 0.58984375, + "learning_rate": 0.00048603944740741386, + "loss": 0.1847, + "step": 77500 + }, + { + "epoch": 3.21, + "grad_norm": 0.46875, + "learning_rate": 0.00048603587376244207, + "loss": 0.1794, + "step": 77510 + }, + { + "epoch": 3.21, + "grad_norm": 0.62890625, + "learning_rate": 0.0004860322996732762, + "loss": 0.2273, + "step": 77520 + }, + { + "epoch": 3.21, + "grad_norm": 0.53515625, + "learning_rate": 0.0004860287251399231, + "loss": 0.2161, + "step": 77530 + }, + { + "epoch": 3.21, + "grad_norm": 1.234375, + "learning_rate": 0.00048602515016238937, + "loss": 0.1924, + "step": 77540 + }, + { + "epoch": 3.21, + "grad_norm": 0.88671875, + "learning_rate": 0.00048602157474068186, + "loss": 0.2439, + "step": 77550 + }, + { + "epoch": 3.21, + "grad_norm": 0.609375, + "learning_rate": 0.0004860179988748072, + "loss": 0.2406, + "step": 77560 + }, + { + "epoch": 3.21, + "grad_norm": 0.9296875, + "learning_rate": 0.0004860144225647721, + "loss": 0.1417, + "step": 77570 + }, + { + "epoch": 3.21, + "grad_norm": 0.4921875, + "learning_rate": 0.0004860108458105834, + "loss": 0.2011, + "step": 77580 + }, + { + "epoch": 3.21, + "grad_norm": 0.61328125, + "learning_rate": 0.0004860072686122478, + "loss": 0.2183, + "step": 77590 + }, + { + "epoch": 3.21, + "grad_norm": 1.25, + "learning_rate": 0.00048600369096977206, + "loss": 0.2161, + "step": 77600 + }, + { + "epoch": 3.21, + "grad_norm": 0.66796875, + "learning_rate": 0.0004860001128831627, + "loss": 0.2267, + "step": 77610 + }, + { + "epoch": 3.22, + "grad_norm": 0.431640625, + "learning_rate": 0.00048599653435242677, + "loss": 0.2215, + "step": 77620 + }, + { + "epoch": 3.22, + "grad_norm": 2.21875, + "learning_rate": 0.0004859929553775707, + "loss": 0.2882, + "step": 77630 + }, + { + "epoch": 3.22, + "grad_norm": 1.0234375, + "learning_rate": 0.0004859893759586015, + "loss": 0.2054, + "step": 77640 + }, + { + "epoch": 3.22, + "grad_norm": 1.2890625, + "learning_rate": 0.00048598579609552574, + "loss": 0.1765, + "step": 77650 + }, + { + "epoch": 3.22, + "grad_norm": 0.625, + "learning_rate": 0.0004859822157883501, + "loss": 0.2704, + "step": 77660 + }, + { + "epoch": 3.22, + "grad_norm": 1.1953125, + "learning_rate": 0.0004859786350370815, + "loss": 0.2432, + "step": 77670 + }, + { + "epoch": 3.22, + "grad_norm": 0.31640625, + "learning_rate": 0.00048597505384172663, + "loss": 0.212, + "step": 77680 + }, + { + "epoch": 3.22, + "grad_norm": 1.2421875, + "learning_rate": 0.00048597147220229216, + "loss": 0.1732, + "step": 77690 + }, + { + "epoch": 3.22, + "grad_norm": 0.2578125, + "learning_rate": 0.00048596789011878485, + "loss": 0.2366, + "step": 77700 + }, + { + "epoch": 3.22, + "grad_norm": 1.453125, + "learning_rate": 0.0004859643075912115, + "loss": 0.2072, + "step": 77710 + }, + { + "epoch": 3.22, + "grad_norm": 0.265625, + "learning_rate": 0.0004859607246195787, + "loss": 0.1939, + "step": 77720 + }, + { + "epoch": 3.22, + "grad_norm": 0.52734375, + "learning_rate": 0.00048595714120389333, + "loss": 0.2075, + "step": 77730 + }, + { + "epoch": 3.22, + "grad_norm": 1.3046875, + "learning_rate": 0.00048595355734416216, + "loss": 0.1787, + "step": 77740 + }, + { + "epoch": 3.22, + "grad_norm": 0.51953125, + "learning_rate": 0.00048594997304039186, + "loss": 0.2224, + "step": 77750 + }, + { + "epoch": 3.22, + "grad_norm": 0.625, + "learning_rate": 0.0004859463882925892, + "loss": 0.199, + "step": 77760 + }, + { + "epoch": 3.22, + "grad_norm": 0.96484375, + "learning_rate": 0.00048594280310076086, + "loss": 0.2356, + "step": 77770 + }, + { + "epoch": 3.22, + "grad_norm": 1.2734375, + "learning_rate": 0.00048593921746491365, + "loss": 0.238, + "step": 77780 + }, + { + "epoch": 3.22, + "grad_norm": 0.423828125, + "learning_rate": 0.0004859356313850544, + "loss": 0.2551, + "step": 77790 + }, + { + "epoch": 3.22, + "grad_norm": 1.15625, + "learning_rate": 0.0004859320448611897, + "loss": 0.246, + "step": 77800 + }, + { + "epoch": 3.22, + "grad_norm": 0.61328125, + "learning_rate": 0.00048592845789332634, + "loss": 0.2294, + "step": 77810 + }, + { + "epoch": 3.22, + "grad_norm": 0.65234375, + "learning_rate": 0.0004859248704814712, + "loss": 0.1923, + "step": 77820 + }, + { + "epoch": 3.22, + "grad_norm": 1.2109375, + "learning_rate": 0.00048592128262563085, + "loss": 0.2261, + "step": 77830 + }, + { + "epoch": 3.22, + "grad_norm": 0.84375, + "learning_rate": 0.0004859176943258121, + "loss": 0.2342, + "step": 77840 + }, + { + "epoch": 3.22, + "grad_norm": 0.361328125, + "learning_rate": 0.00048591410558202175, + "loss": 0.2477, + "step": 77850 + }, + { + "epoch": 3.22, + "grad_norm": 0.43359375, + "learning_rate": 0.0004859105163942666, + "loss": 0.2214, + "step": 77860 + }, + { + "epoch": 3.23, + "grad_norm": 0.3046875, + "learning_rate": 0.0004859069267625533, + "loss": 0.1856, + "step": 77870 + }, + { + "epoch": 3.23, + "grad_norm": 0.353515625, + "learning_rate": 0.0004859033366868886, + "loss": 0.1822, + "step": 77880 + }, + { + "epoch": 3.23, + "grad_norm": 1.890625, + "learning_rate": 0.0004858997461672793, + "loss": 0.2302, + "step": 77890 + }, + { + "epoch": 3.23, + "grad_norm": 0.390625, + "learning_rate": 0.00048589615520373223, + "loss": 0.2759, + "step": 77900 + }, + { + "epoch": 3.23, + "grad_norm": 0.63671875, + "learning_rate": 0.000485892563796254, + "loss": 0.2143, + "step": 77910 + }, + { + "epoch": 3.23, + "grad_norm": 0.8984375, + "learning_rate": 0.00048588897194485145, + "loss": 0.1389, + "step": 77920 + }, + { + "epoch": 3.23, + "grad_norm": 0.462890625, + "learning_rate": 0.0004858853796495313, + "loss": 0.1495, + "step": 77930 + }, + { + "epoch": 3.23, + "grad_norm": 0.36328125, + "learning_rate": 0.0004858817869103004, + "loss": 0.2156, + "step": 77940 + }, + { + "epoch": 3.23, + "grad_norm": 0.67578125, + "learning_rate": 0.0004858781937271654, + "loss": 0.2142, + "step": 77950 + }, + { + "epoch": 3.23, + "grad_norm": 0.62890625, + "learning_rate": 0.0004858746001001331, + "loss": 0.1995, + "step": 77960 + }, + { + "epoch": 3.23, + "grad_norm": 0.625, + "learning_rate": 0.0004858710060292103, + "loss": 0.2224, + "step": 77970 + }, + { + "epoch": 3.23, + "grad_norm": 1.7421875, + "learning_rate": 0.0004858674115144038, + "loss": 0.2189, + "step": 77980 + }, + { + "epoch": 3.23, + "grad_norm": 0.54296875, + "learning_rate": 0.0004858638165557202, + "loss": 0.1984, + "step": 77990 + }, + { + "epoch": 3.23, + "grad_norm": 0.427734375, + "learning_rate": 0.00048586022115316643, + "loss": 0.1877, + "step": 78000 + }, + { + "epoch": 3.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00048585662530674926, + "loss": 0.2569, + "step": 78010 + }, + { + "epoch": 3.23, + "grad_norm": 0.62109375, + "learning_rate": 0.00048585302901647527, + "loss": 0.2619, + "step": 78020 + }, + { + "epoch": 3.23, + "grad_norm": 0.62109375, + "learning_rate": 0.00048584943228235136, + "loss": 0.1908, + "step": 78030 + }, + { + "epoch": 3.23, + "grad_norm": 0.8828125, + "learning_rate": 0.00048584583510438435, + "loss": 0.2171, + "step": 78040 + }, + { + "epoch": 3.23, + "grad_norm": 0.2392578125, + "learning_rate": 0.00048584223748258096, + "loss": 0.2497, + "step": 78050 + }, + { + "epoch": 3.23, + "grad_norm": 1.4375, + "learning_rate": 0.0004858386394169478, + "loss": 0.1769, + "step": 78060 + }, + { + "epoch": 3.23, + "grad_norm": 0.85546875, + "learning_rate": 0.00048583504090749197, + "loss": 0.1748, + "step": 78070 + }, + { + "epoch": 3.23, + "grad_norm": 0.361328125, + "learning_rate": 0.00048583144195421996, + "loss": 0.2148, + "step": 78080 + }, + { + "epoch": 3.23, + "grad_norm": 0.5078125, + "learning_rate": 0.0004858278425571387, + "loss": 0.241, + "step": 78090 + }, + { + "epoch": 3.23, + "grad_norm": 0.9921875, + "learning_rate": 0.00048582424271625485, + "loss": 0.2428, + "step": 78100 + }, + { + "epoch": 3.24, + "grad_norm": 0.92578125, + "learning_rate": 0.0004858206424315753, + "loss": 0.223, + "step": 78110 + }, + { + "epoch": 3.24, + "grad_norm": 0.58984375, + "learning_rate": 0.0004858170417031067, + "loss": 0.2546, + "step": 78120 + }, + { + "epoch": 3.24, + "grad_norm": 1.6640625, + "learning_rate": 0.00048581344053085597, + "loss": 0.2122, + "step": 78130 + }, + { + "epoch": 3.24, + "grad_norm": 1.0234375, + "learning_rate": 0.00048580983891482977, + "loss": 0.2198, + "step": 78140 + }, + { + "epoch": 3.24, + "grad_norm": 0.6015625, + "learning_rate": 0.0004858062368550349, + "loss": 0.18, + "step": 78150 + }, + { + "epoch": 3.24, + "grad_norm": 0.984375, + "learning_rate": 0.0004858026343514782, + "loss": 0.1672, + "step": 78160 + }, + { + "epoch": 3.24, + "grad_norm": 0.39453125, + "learning_rate": 0.00048579903140416635, + "loss": 0.2259, + "step": 78170 + }, + { + "epoch": 3.24, + "grad_norm": 0.6875, + "learning_rate": 0.0004857954280131062, + "loss": 0.197, + "step": 78180 + }, + { + "epoch": 3.24, + "grad_norm": 0.6953125, + "learning_rate": 0.0004857918241783046, + "loss": 0.2422, + "step": 78190 + }, + { + "epoch": 3.24, + "grad_norm": 1.4765625, + "learning_rate": 0.00048578821989976817, + "loss": 0.2131, + "step": 78200 + }, + { + "epoch": 3.24, + "grad_norm": 0.96484375, + "learning_rate": 0.0004857846151775038, + "loss": 0.1934, + "step": 78210 + }, + { + "epoch": 3.24, + "grad_norm": 0.3125, + "learning_rate": 0.0004857810100115182, + "loss": 0.2155, + "step": 78220 + }, + { + "epoch": 3.24, + "grad_norm": 1.046875, + "learning_rate": 0.0004857774044018183, + "loss": 0.2569, + "step": 78230 + }, + { + "epoch": 3.24, + "grad_norm": 0.42578125, + "learning_rate": 0.00048577379834841065, + "loss": 0.1716, + "step": 78240 + }, + { + "epoch": 3.24, + "grad_norm": 0.365234375, + "learning_rate": 0.00048577019185130227, + "loss": 0.1956, + "step": 78250 + }, + { + "epoch": 3.24, + "grad_norm": 0.47265625, + "learning_rate": 0.00048576658491049986, + "loss": 0.2411, + "step": 78260 + }, + { + "epoch": 3.24, + "grad_norm": 0.62109375, + "learning_rate": 0.00048576297752601016, + "loss": 0.213, + "step": 78270 + }, + { + "epoch": 3.24, + "grad_norm": 0.1318359375, + "learning_rate": 0.00048575936969783994, + "loss": 0.1448, + "step": 78280 + }, + { + "epoch": 3.24, + "grad_norm": 0.27734375, + "learning_rate": 0.00048575576142599613, + "loss": 0.2381, + "step": 78290 + }, + { + "epoch": 3.24, + "grad_norm": 0.353515625, + "learning_rate": 0.00048575215271048547, + "loss": 0.203, + "step": 78300 + }, + { + "epoch": 3.24, + "grad_norm": 0.671875, + "learning_rate": 0.00048574854355131457, + "loss": 0.1983, + "step": 78310 + }, + { + "epoch": 3.24, + "grad_norm": 0.56640625, + "learning_rate": 0.0004857449339484905, + "loss": 0.2723, + "step": 78320 + }, + { + "epoch": 3.24, + "grad_norm": 0.6875, + "learning_rate": 0.00048574132390201993, + "loss": 0.2005, + "step": 78330 + }, + { + "epoch": 3.24, + "grad_norm": 0.65625, + "learning_rate": 0.0004857377134119096, + "loss": 0.1988, + "step": 78340 + }, + { + "epoch": 3.25, + "grad_norm": 0.3671875, + "learning_rate": 0.00048573410247816634, + "loss": 0.2034, + "step": 78350 + }, + { + "epoch": 3.25, + "grad_norm": 0.87109375, + "learning_rate": 0.000485730491100797, + "loss": 0.2306, + "step": 78360 + }, + { + "epoch": 3.25, + "grad_norm": 0.671875, + "learning_rate": 0.0004857268792798083, + "loss": 0.2202, + "step": 78370 + }, + { + "epoch": 3.25, + "grad_norm": 1.2265625, + "learning_rate": 0.000485723267015207, + "loss": 0.2628, + "step": 78380 + }, + { + "epoch": 3.25, + "grad_norm": 0.61328125, + "learning_rate": 0.00048571965430700004, + "loss": 0.26, + "step": 78390 + }, + { + "epoch": 3.25, + "grad_norm": 0.59765625, + "learning_rate": 0.0004857160411551942, + "loss": 0.1827, + "step": 78400 + }, + { + "epoch": 3.25, + "grad_norm": 0.828125, + "learning_rate": 0.0004857124275597962, + "loss": 0.1619, + "step": 78410 + }, + { + "epoch": 3.25, + "grad_norm": 0.9453125, + "learning_rate": 0.00048570881352081273, + "loss": 0.2316, + "step": 78420 + }, + { + "epoch": 3.25, + "grad_norm": 0.369140625, + "learning_rate": 0.0004857051990382509, + "loss": 0.2181, + "step": 78430 + }, + { + "epoch": 3.25, + "grad_norm": 0.69140625, + "learning_rate": 0.0004857015841121173, + "loss": 0.2141, + "step": 78440 + }, + { + "epoch": 3.25, + "grad_norm": 0.75, + "learning_rate": 0.0004856979687424187, + "loss": 0.2087, + "step": 78450 + }, + { + "epoch": 3.25, + "grad_norm": 0.21484375, + "learning_rate": 0.000485694352929162, + "loss": 0.2128, + "step": 78460 + }, + { + "epoch": 3.25, + "grad_norm": 0.8359375, + "learning_rate": 0.000485690736672354, + "loss": 0.2428, + "step": 78470 + }, + { + "epoch": 3.25, + "grad_norm": 0.365234375, + "learning_rate": 0.00048568711997200157, + "loss": 0.2542, + "step": 78480 + }, + { + "epoch": 3.25, + "grad_norm": 0.42578125, + "learning_rate": 0.0004856835028281113, + "loss": 0.209, + "step": 78490 + }, + { + "epoch": 3.25, + "grad_norm": 0.796875, + "learning_rate": 0.0004856798852406902, + "loss": 0.2242, + "step": 78500 + }, + { + "epoch": 3.25, + "grad_norm": 0.7265625, + "learning_rate": 0.000485676267209745, + "loss": 0.2272, + "step": 78510 + }, + { + "epoch": 3.25, + "grad_norm": 0.470703125, + "learning_rate": 0.0004856726487352825, + "loss": 0.2504, + "step": 78520 + }, + { + "epoch": 3.25, + "grad_norm": 0.890625, + "learning_rate": 0.0004856690298173095, + "loss": 0.1615, + "step": 78530 + }, + { + "epoch": 3.25, + "grad_norm": 0.640625, + "learning_rate": 0.0004856654104558328, + "loss": 0.1694, + "step": 78540 + }, + { + "epoch": 3.25, + "grad_norm": 0.640625, + "learning_rate": 0.00048566179065085937, + "loss": 0.1628, + "step": 78550 + }, + { + "epoch": 3.25, + "grad_norm": 0.48046875, + "learning_rate": 0.00048565817040239584, + "loss": 0.1901, + "step": 78560 + }, + { + "epoch": 3.25, + "grad_norm": 1.703125, + "learning_rate": 0.0004856545497104491, + "loss": 0.2717, + "step": 78570 + }, + { + "epoch": 3.25, + "grad_norm": 0.55859375, + "learning_rate": 0.0004856509285750259, + "loss": 0.2312, + "step": 78580 + }, + { + "epoch": 3.26, + "grad_norm": 0.353515625, + "learning_rate": 0.0004856473069961331, + "loss": 0.2135, + "step": 78590 + }, + { + "epoch": 3.26, + "grad_norm": 0.578125, + "learning_rate": 0.0004856436849737776, + "loss": 0.2436, + "step": 78600 + }, + { + "epoch": 3.26, + "grad_norm": 0.5234375, + "learning_rate": 0.0004856400625079661, + "loss": 0.2014, + "step": 78610 + }, + { + "epoch": 3.26, + "grad_norm": 0.7890625, + "learning_rate": 0.00048563643959870543, + "loss": 0.1885, + "step": 78620 + }, + { + "epoch": 3.26, + "grad_norm": 1.421875, + "learning_rate": 0.00048563281624600243, + "loss": 0.2315, + "step": 78630 + }, + { + "epoch": 3.26, + "grad_norm": 0.65234375, + "learning_rate": 0.0004856291924498639, + "loss": 0.2224, + "step": 78640 + }, + { + "epoch": 3.26, + "grad_norm": 0.578125, + "learning_rate": 0.0004856255682102967, + "loss": 0.16, + "step": 78650 + }, + { + "epoch": 3.26, + "grad_norm": 1.4609375, + "learning_rate": 0.0004856219435273076, + "loss": 0.1815, + "step": 78660 + }, + { + "epoch": 3.26, + "grad_norm": 1.1640625, + "learning_rate": 0.0004856183184009034, + "loss": 0.1733, + "step": 78670 + }, + { + "epoch": 3.26, + "grad_norm": 0.52734375, + "learning_rate": 0.00048561469283109104, + "loss": 0.2664, + "step": 78680 + }, + { + "epoch": 3.26, + "grad_norm": 1.125, + "learning_rate": 0.00048561106681787724, + "loss": 0.2349, + "step": 78690 + }, + { + "epoch": 3.26, + "grad_norm": 0.388671875, + "learning_rate": 0.0004856074403612689, + "loss": 0.2233, + "step": 78700 + }, + { + "epoch": 3.26, + "grad_norm": 0.56640625, + "learning_rate": 0.0004856038134612728, + "loss": 0.2047, + "step": 78710 + }, + { + "epoch": 3.26, + "grad_norm": 1.1953125, + "learning_rate": 0.0004856001861178957, + "loss": 0.2391, + "step": 78720 + }, + { + "epoch": 3.26, + "grad_norm": 0.84765625, + "learning_rate": 0.0004855965583311446, + "loss": 0.1945, + "step": 78730 + }, + { + "epoch": 3.26, + "grad_norm": 0.96484375, + "learning_rate": 0.0004855929301010261, + "loss": 0.1717, + "step": 78740 + }, + { + "epoch": 3.26, + "grad_norm": 0.47265625, + "learning_rate": 0.0004855893014275472, + "loss": 0.2346, + "step": 78750 + }, + { + "epoch": 3.26, + "grad_norm": 0.3828125, + "learning_rate": 0.0004855856723107147, + "loss": 0.1896, + "step": 78760 + }, + { + "epoch": 3.26, + "grad_norm": 0.423828125, + "learning_rate": 0.0004855820427505354, + "loss": 0.1525, + "step": 78770 + }, + { + "epoch": 3.26, + "grad_norm": 0.5625, + "learning_rate": 0.0004855784127470161, + "loss": 0.217, + "step": 78780 + }, + { + "epoch": 3.26, + "grad_norm": 0.392578125, + "learning_rate": 0.00048557478230016373, + "loss": 0.1683, + "step": 78790 + }, + { + "epoch": 3.26, + "grad_norm": 0.65625, + "learning_rate": 0.000485571151409985, + "loss": 0.1955, + "step": 78800 + }, + { + "epoch": 3.26, + "grad_norm": 0.306640625, + "learning_rate": 0.0004855675200764868, + "loss": 0.2077, + "step": 78810 + }, + { + "epoch": 3.26, + "grad_norm": 1.0546875, + "learning_rate": 0.000485563888299676, + "loss": 0.2571, + "step": 78820 + }, + { + "epoch": 3.27, + "grad_norm": 1.0546875, + "learning_rate": 0.0004855602560795593, + "loss": 0.1719, + "step": 78830 + }, + { + "epoch": 3.27, + "grad_norm": 0.671875, + "learning_rate": 0.0004855566234161438, + "loss": 0.2495, + "step": 78840 + }, + { + "epoch": 3.27, + "grad_norm": 0.94140625, + "learning_rate": 0.0004855529903094361, + "loss": 0.2241, + "step": 78850 + }, + { + "epoch": 3.27, + "grad_norm": 0.609375, + "learning_rate": 0.0004855493567594431, + "loss": 0.208, + "step": 78860 + }, + { + "epoch": 3.27, + "grad_norm": 0.34765625, + "learning_rate": 0.00048554572276617166, + "loss": 0.2114, + "step": 78870 + }, + { + "epoch": 3.27, + "grad_norm": 0.66796875, + "learning_rate": 0.00048554208832962854, + "loss": 0.2046, + "step": 78880 + }, + { + "epoch": 3.27, + "grad_norm": 0.921875, + "learning_rate": 0.0004855384534498207, + "loss": 0.2185, + "step": 78890 + }, + { + "epoch": 3.27, + "grad_norm": 0.44140625, + "learning_rate": 0.00048553481812675496, + "loss": 0.2581, + "step": 78900 + }, + { + "epoch": 3.27, + "grad_norm": 0.76953125, + "learning_rate": 0.00048553118236043815, + "loss": 0.2505, + "step": 78910 + }, + { + "epoch": 3.27, + "grad_norm": 0.59765625, + "learning_rate": 0.000485527546150877, + "loss": 0.1838, + "step": 78920 + }, + { + "epoch": 3.27, + "grad_norm": 0.5703125, + "learning_rate": 0.00048552390949807843, + "loss": 0.1977, + "step": 78930 + }, + { + "epoch": 3.27, + "grad_norm": 0.80078125, + "learning_rate": 0.00048552027240204946, + "loss": 0.2398, + "step": 78940 + }, + { + "epoch": 3.27, + "grad_norm": 0.2578125, + "learning_rate": 0.00048551663486279663, + "loss": 0.2645, + "step": 78950 + }, + { + "epoch": 3.27, + "grad_norm": 0.98828125, + "learning_rate": 0.0004855129968803269, + "loss": 0.2275, + "step": 78960 + }, + { + "epoch": 3.27, + "grad_norm": 0.65625, + "learning_rate": 0.0004855093584546473, + "loss": 0.2359, + "step": 78970 + }, + { + "epoch": 3.27, + "grad_norm": 0.89453125, + "learning_rate": 0.0004855057195857644, + "loss": 0.231, + "step": 78980 + }, + { + "epoch": 3.27, + "grad_norm": 0.447265625, + "learning_rate": 0.0004855020802736851, + "loss": 0.1818, + "step": 78990 + }, + { + "epoch": 3.27, + "grad_norm": 0.80078125, + "learning_rate": 0.00048549844051841646, + "loss": 0.223, + "step": 79000 + }, + { + "epoch": 3.27, + "grad_norm": 0.53515625, + "learning_rate": 0.00048549480031996514, + "loss": 0.2034, + "step": 79010 + }, + { + "epoch": 3.27, + "grad_norm": 0.640625, + "learning_rate": 0.000485491159678338, + "loss": 0.2266, + "step": 79020 + }, + { + "epoch": 3.27, + "grad_norm": 0.5703125, + "learning_rate": 0.000485487518593542, + "loss": 0.2516, + "step": 79030 + }, + { + "epoch": 3.27, + "grad_norm": 0.625, + "learning_rate": 0.00048548387706558383, + "loss": 0.2518, + "step": 79040 + }, + { + "epoch": 3.27, + "grad_norm": 0.53515625, + "learning_rate": 0.00048548023509447047, + "loss": 0.2241, + "step": 79050 + }, + { + "epoch": 3.27, + "grad_norm": 0.427734375, + "learning_rate": 0.0004854765926802088, + "loss": 0.213, + "step": 79060 + }, + { + "epoch": 3.28, + "grad_norm": 0.625, + "learning_rate": 0.00048547294982280556, + "loss": 0.1909, + "step": 79070 + }, + { + "epoch": 3.28, + "grad_norm": 0.65625, + "learning_rate": 0.0004854693065222676, + "loss": 0.2231, + "step": 79080 + }, + { + "epoch": 3.28, + "grad_norm": 0.796875, + "learning_rate": 0.00048546566277860194, + "loss": 0.2492, + "step": 79090 + }, + { + "epoch": 3.28, + "grad_norm": 0.80859375, + "learning_rate": 0.00048546201859181524, + "loss": 0.2229, + "step": 79100 + }, + { + "epoch": 3.28, + "grad_norm": 1.15625, + "learning_rate": 0.0004854583739619145, + "loss": 0.1583, + "step": 79110 + }, + { + "epoch": 3.28, + "grad_norm": 0.77734375, + "learning_rate": 0.0004854547288889065, + "loss": 0.261, + "step": 79120 + }, + { + "epoch": 3.28, + "grad_norm": 0.984375, + "learning_rate": 0.0004854510833727982, + "loss": 0.263, + "step": 79130 + }, + { + "epoch": 3.28, + "grad_norm": 0.263671875, + "learning_rate": 0.0004854474374135963, + "loss": 0.256, + "step": 79140 + }, + { + "epoch": 3.28, + "grad_norm": 0.310546875, + "learning_rate": 0.0004854437910113078, + "loss": 0.1909, + "step": 79150 + }, + { + "epoch": 3.28, + "grad_norm": 0.55859375, + "learning_rate": 0.0004854401441659395, + "loss": 0.2165, + "step": 79160 + }, + { + "epoch": 3.28, + "grad_norm": 0.609375, + "learning_rate": 0.00048543649687749824, + "loss": 0.1656, + "step": 79170 + }, + { + "epoch": 3.28, + "grad_norm": 1.4921875, + "learning_rate": 0.00048543284914599093, + "loss": 0.216, + "step": 79180 + }, + { + "epoch": 3.28, + "grad_norm": 0.74609375, + "learning_rate": 0.0004854292009714245, + "loss": 0.2089, + "step": 79190 + }, + { + "epoch": 3.28, + "grad_norm": 0.4296875, + "learning_rate": 0.0004854255523538057, + "loss": 0.2056, + "step": 79200 + }, + { + "epoch": 3.28, + "grad_norm": 0.2470703125, + "learning_rate": 0.0004854219032931414, + "loss": 0.1542, + "step": 79210 + }, + { + "epoch": 3.28, + "grad_norm": 0.44921875, + "learning_rate": 0.0004854182537894385, + "loss": 0.187, + "step": 79220 + }, + { + "epoch": 3.28, + "grad_norm": 0.90625, + "learning_rate": 0.00048541460384270385, + "loss": 0.1908, + "step": 79230 + }, + { + "epoch": 3.28, + "grad_norm": 0.640625, + "learning_rate": 0.0004854109534529444, + "loss": 0.203, + "step": 79240 + }, + { + "epoch": 3.28, + "grad_norm": 0.16015625, + "learning_rate": 0.0004854073026201669, + "loss": 0.1955, + "step": 79250 + }, + { + "epoch": 3.28, + "grad_norm": 0.494140625, + "learning_rate": 0.00048540365134437836, + "loss": 0.1377, + "step": 79260 + }, + { + "epoch": 3.28, + "grad_norm": 1.1171875, + "learning_rate": 0.00048539999962558545, + "loss": 0.2259, + "step": 79270 + }, + { + "epoch": 3.28, + "grad_norm": 0.70703125, + "learning_rate": 0.00048539634746379523, + "loss": 0.2108, + "step": 79280 + }, + { + "epoch": 3.28, + "grad_norm": 0.439453125, + "learning_rate": 0.0004853926948590145, + "loss": 0.2252, + "step": 79290 + }, + { + "epoch": 3.28, + "grad_norm": 0.61328125, + "learning_rate": 0.0004853890418112502, + "loss": 0.235, + "step": 79300 + }, + { + "epoch": 3.29, + "grad_norm": 0.7734375, + "learning_rate": 0.00048538538832050906, + "loss": 0.2411, + "step": 79310 + }, + { + "epoch": 3.29, + "grad_norm": 0.83984375, + "learning_rate": 0.00048538173438679807, + "loss": 0.1972, + "step": 79320 + }, + { + "epoch": 3.29, + "grad_norm": 0.87890625, + "learning_rate": 0.0004853780800101241, + "loss": 0.1916, + "step": 79330 + }, + { + "epoch": 3.29, + "grad_norm": 0.859375, + "learning_rate": 0.00048537442519049393, + "loss": 0.2145, + "step": 79340 + }, + { + "epoch": 3.29, + "grad_norm": 2.359375, + "learning_rate": 0.0004853707699279145, + "loss": 0.2438, + "step": 79350 + }, + { + "epoch": 3.29, + "grad_norm": 0.88671875, + "learning_rate": 0.0004853671142223928, + "loss": 0.2156, + "step": 79360 + }, + { + "epoch": 3.29, + "grad_norm": 1.453125, + "learning_rate": 0.00048536345807393555, + "loss": 0.2102, + "step": 79370 + }, + { + "epoch": 3.29, + "grad_norm": 0.94140625, + "learning_rate": 0.00048535980148254964, + "loss": 0.2161, + "step": 79380 + }, + { + "epoch": 3.29, + "grad_norm": 0.345703125, + "learning_rate": 0.0004853561444482421, + "loss": 0.2212, + "step": 79390 + }, + { + "epoch": 3.29, + "grad_norm": 0.6875, + "learning_rate": 0.00048535248697101964, + "loss": 0.1743, + "step": 79400 + }, + { + "epoch": 3.29, + "grad_norm": 0.87109375, + "learning_rate": 0.0004853488290508893, + "loss": 0.2544, + "step": 79410 + }, + { + "epoch": 3.29, + "grad_norm": 0.515625, + "learning_rate": 0.00048534517068785776, + "loss": 0.2123, + "step": 79420 + }, + { + "epoch": 3.29, + "grad_norm": 0.53515625, + "learning_rate": 0.0004853415118819321, + "loss": 0.2, + "step": 79430 + }, + { + "epoch": 3.29, + "grad_norm": 0.69921875, + "learning_rate": 0.00048533785263311913, + "loss": 0.2331, + "step": 79440 + }, + { + "epoch": 3.29, + "grad_norm": 0.59375, + "learning_rate": 0.0004853341929414257, + "loss": 0.1882, + "step": 79450 + }, + { + "epoch": 3.29, + "grad_norm": 0.73828125, + "learning_rate": 0.0004853305328068588, + "loss": 0.2046, + "step": 79460 + }, + { + "epoch": 3.29, + "grad_norm": 0.90625, + "learning_rate": 0.0004853268722294252, + "loss": 0.184, + "step": 79470 + }, + { + "epoch": 3.29, + "grad_norm": 0.6796875, + "learning_rate": 0.0004853232112091318, + "loss": 0.2169, + "step": 79480 + }, + { + "epoch": 3.29, + "grad_norm": 0.8359375, + "learning_rate": 0.00048531954974598556, + "loss": 0.2033, + "step": 79490 + }, + { + "epoch": 3.29, + "grad_norm": 0.52734375, + "learning_rate": 0.00048531588783999336, + "loss": 0.2355, + "step": 79500 + }, + { + "epoch": 3.29, + "grad_norm": 0.74609375, + "learning_rate": 0.00048531222549116204, + "loss": 0.2141, + "step": 79510 + }, + { + "epoch": 3.29, + "grad_norm": 1.0625, + "learning_rate": 0.00048530856269949865, + "loss": 0.2255, + "step": 79520 + }, + { + "epoch": 3.29, + "grad_norm": 0.80859375, + "learning_rate": 0.0004853048994650098, + "loss": 0.1707, + "step": 79530 + }, + { + "epoch": 3.29, + "grad_norm": 0.83203125, + "learning_rate": 0.00048530123578770256, + "loss": 0.1946, + "step": 79540 + }, + { + "epoch": 3.29, + "grad_norm": 1.21875, + "learning_rate": 0.00048529757166758386, + "loss": 0.2116, + "step": 79550 + }, + { + "epoch": 3.3, + "grad_norm": 1.5, + "learning_rate": 0.0004852939071046605, + "loss": 0.2593, + "step": 79560 + }, + { + "epoch": 3.3, + "grad_norm": 0.5546875, + "learning_rate": 0.00048529024209893946, + "loss": 0.2509, + "step": 79570 + }, + { + "epoch": 3.3, + "grad_norm": 0.67578125, + "learning_rate": 0.0004852865766504276, + "loss": 0.2207, + "step": 79580 + }, + { + "epoch": 3.3, + "grad_norm": 0.46484375, + "learning_rate": 0.00048528291075913176, + "loss": 0.194, + "step": 79590 + }, + { + "epoch": 3.3, + "grad_norm": 0.6953125, + "learning_rate": 0.0004852792444250589, + "loss": 0.1882, + "step": 79600 + }, + { + "epoch": 3.3, + "grad_norm": 0.373046875, + "learning_rate": 0.00048527557764821595, + "loss": 0.2375, + "step": 79610 + }, + { + "epoch": 3.3, + "grad_norm": 0.291015625, + "learning_rate": 0.00048527191042860974, + "loss": 0.1758, + "step": 79620 + }, + { + "epoch": 3.3, + "grad_norm": 0.59765625, + "learning_rate": 0.00048526824276624717, + "loss": 0.2226, + "step": 79630 + }, + { + "epoch": 3.3, + "grad_norm": 0.2373046875, + "learning_rate": 0.0004852645746611352, + "loss": 0.2274, + "step": 79640 + }, + { + "epoch": 3.3, + "grad_norm": 0.333984375, + "learning_rate": 0.0004852609061132807, + "loss": 0.1795, + "step": 79650 + }, + { + "epoch": 3.3, + "grad_norm": 0.83203125, + "learning_rate": 0.0004852572371226906, + "loss": 0.2136, + "step": 79660 + }, + { + "epoch": 3.3, + "grad_norm": 0.640625, + "learning_rate": 0.0004852535676893718, + "loss": 0.2388, + "step": 79670 + }, + { + "epoch": 3.3, + "grad_norm": 0.58203125, + "learning_rate": 0.00048524989781333116, + "loss": 0.2692, + "step": 79680 + }, + { + "epoch": 3.3, + "grad_norm": 0.322265625, + "learning_rate": 0.0004852462274945756, + "loss": 0.1745, + "step": 79690 + }, + { + "epoch": 3.3, + "grad_norm": 1.0859375, + "learning_rate": 0.0004852425567331121, + "loss": 0.2251, + "step": 79700 + }, + { + "epoch": 3.3, + "grad_norm": 0.875, + "learning_rate": 0.00048523888552894746, + "loss": 0.2251, + "step": 79710 + }, + { + "epoch": 3.3, + "grad_norm": 0.51953125, + "learning_rate": 0.0004852352138820887, + "loss": 0.2339, + "step": 79720 + }, + { + "epoch": 3.3, + "grad_norm": 0.396484375, + "learning_rate": 0.0004852315417925426, + "loss": 0.1958, + "step": 79730 + }, + { + "epoch": 3.3, + "grad_norm": 1.1484375, + "learning_rate": 0.0004852278692603162, + "loss": 0.2371, + "step": 79740 + }, + { + "epoch": 3.3, + "grad_norm": 0.359375, + "learning_rate": 0.0004852241962854164, + "loss": 0.1975, + "step": 79750 + }, + { + "epoch": 3.3, + "grad_norm": 0.34375, + "learning_rate": 0.0004852205228678499, + "loss": 0.2036, + "step": 79760 + }, + { + "epoch": 3.3, + "grad_norm": 1.4375, + "learning_rate": 0.00048521684900762387, + "loss": 0.1986, + "step": 79770 + }, + { + "epoch": 3.3, + "grad_norm": 0.5078125, + "learning_rate": 0.00048521317470474515, + "loss": 0.1944, + "step": 79780 + }, + { + "epoch": 3.3, + "grad_norm": 0.6015625, + "learning_rate": 0.00048520949995922057, + "loss": 0.227, + "step": 79790 + }, + { + "epoch": 3.31, + "grad_norm": 1.453125, + "learning_rate": 0.0004852058247710572, + "loss": 0.2412, + "step": 79800 + }, + { + "epoch": 3.31, + "grad_norm": 0.310546875, + "learning_rate": 0.00048520214914026183, + "loss": 0.1637, + "step": 79810 + }, + { + "epoch": 3.31, + "grad_norm": 0.158203125, + "learning_rate": 0.0004851984730668414, + "loss": 0.1812, + "step": 79820 + }, + { + "epoch": 3.31, + "grad_norm": 0.51171875, + "learning_rate": 0.0004851947965508029, + "loss": 0.1852, + "step": 79830 + }, + { + "epoch": 3.31, + "grad_norm": 0.96484375, + "learning_rate": 0.0004851911195921531, + "loss": 0.238, + "step": 79840 + }, + { + "epoch": 3.31, + "grad_norm": 0.71875, + "learning_rate": 0.00048518744219089907, + "loss": 0.2204, + "step": 79850 + }, + { + "epoch": 3.31, + "grad_norm": 0.6171875, + "learning_rate": 0.0004851837643470477, + "loss": 0.2068, + "step": 79860 + }, + { + "epoch": 3.31, + "grad_norm": 1.1796875, + "learning_rate": 0.00048518008606060584, + "loss": 0.2133, + "step": 79870 + }, + { + "epoch": 3.31, + "grad_norm": 0.25, + "learning_rate": 0.0004851764073315804, + "loss": 0.1949, + "step": 79880 + }, + { + "epoch": 3.31, + "grad_norm": 1.140625, + "learning_rate": 0.0004851727281599784, + "loss": 0.2277, + "step": 79890 + }, + { + "epoch": 3.31, + "grad_norm": 0.6953125, + "learning_rate": 0.0004851690485458068, + "loss": 0.2091, + "step": 79900 + }, + { + "epoch": 3.31, + "grad_norm": 0.66015625, + "learning_rate": 0.00048516536848907234, + "loss": 0.145, + "step": 79910 + }, + { + "epoch": 3.31, + "grad_norm": 0.7421875, + "learning_rate": 0.00048516168798978213, + "loss": 0.2468, + "step": 79920 + }, + { + "epoch": 3.31, + "grad_norm": 1.125, + "learning_rate": 0.000485158007047943, + "loss": 0.1951, + "step": 79930 + }, + { + "epoch": 3.31, + "grad_norm": 0.68359375, + "learning_rate": 0.0004851543256635619, + "loss": 0.1996, + "step": 79940 + }, + { + "epoch": 3.31, + "grad_norm": 0.70703125, + "learning_rate": 0.0004851506438366457, + "loss": 0.2609, + "step": 79950 + }, + { + "epoch": 3.31, + "grad_norm": 0.71875, + "learning_rate": 0.0004851469615672014, + "loss": 0.1971, + "step": 79960 + }, + { + "epoch": 3.31, + "grad_norm": 0.51953125, + "learning_rate": 0.0004851432788552359, + "loss": 0.1822, + "step": 79970 + }, + { + "epoch": 3.31, + "grad_norm": 0.8828125, + "learning_rate": 0.00048513959570075624, + "loss": 0.202, + "step": 79980 + }, + { + "epoch": 3.31, + "grad_norm": 0.419921875, + "learning_rate": 0.0004851359121037692, + "loss": 0.2025, + "step": 79990 + }, + { + "epoch": 3.31, + "grad_norm": 0.86328125, + "learning_rate": 0.0004851322280642817, + "loss": 0.2275, + "step": 80000 + }, + { + "epoch": 3.31, + "grad_norm": 0.5703125, + "learning_rate": 0.0004851285435823008, + "loss": 0.1865, + "step": 80010 + }, + { + "epoch": 3.31, + "grad_norm": 0.91015625, + "learning_rate": 0.00048512485865783334, + "loss": 0.2306, + "step": 80020 + }, + { + "epoch": 3.31, + "grad_norm": 0.6640625, + "learning_rate": 0.00048512117329088634, + "loss": 0.2225, + "step": 80030 + }, + { + "epoch": 3.32, + "grad_norm": 0.326171875, + "learning_rate": 0.0004851174874814667, + "loss": 0.1954, + "step": 80040 + }, + { + "epoch": 3.32, + "grad_norm": 0.734375, + "learning_rate": 0.0004851138012295813, + "loss": 0.2454, + "step": 80050 + }, + { + "epoch": 3.32, + "grad_norm": 1.0546875, + "learning_rate": 0.000485110114535237, + "loss": 0.1822, + "step": 80060 + }, + { + "epoch": 3.32, + "grad_norm": 0.51953125, + "learning_rate": 0.00048510642739844103, + "loss": 0.2153, + "step": 80070 + }, + { + "epoch": 3.32, + "grad_norm": 0.46484375, + "learning_rate": 0.00048510273981920005, + "loss": 0.2862, + "step": 80080 + }, + { + "epoch": 3.32, + "grad_norm": 0.66015625, + "learning_rate": 0.0004850990517975211, + "loss": 0.2013, + "step": 80090 + }, + { + "epoch": 3.32, + "grad_norm": 0.9296875, + "learning_rate": 0.0004850953633334112, + "loss": 0.1968, + "step": 80100 + }, + { + "epoch": 3.32, + "grad_norm": 1.0390625, + "learning_rate": 0.0004850916744268772, + "loss": 0.1972, + "step": 80110 + }, + { + "epoch": 3.32, + "grad_norm": 0.279296875, + "learning_rate": 0.000485087985077926, + "loss": 0.1901, + "step": 80120 + }, + { + "epoch": 3.32, + "grad_norm": 1.0859375, + "learning_rate": 0.0004850842952865646, + "loss": 0.233, + "step": 80130 + }, + { + "epoch": 3.32, + "grad_norm": 0.890625, + "learning_rate": 0.00048508060505280006, + "loss": 0.1886, + "step": 80140 + }, + { + "epoch": 3.32, + "grad_norm": 1.1875, + "learning_rate": 0.0004850769143766391, + "loss": 0.2491, + "step": 80150 + }, + { + "epoch": 3.32, + "grad_norm": 0.7578125, + "learning_rate": 0.00048507322325808876, + "loss": 0.2318, + "step": 80160 + }, + { + "epoch": 3.32, + "grad_norm": 0.81640625, + "learning_rate": 0.00048506953169715606, + "loss": 0.2077, + "step": 80170 + }, + { + "epoch": 3.32, + "grad_norm": 0.70703125, + "learning_rate": 0.0004850658396938479, + "loss": 0.2028, + "step": 80180 + }, + { + "epoch": 3.32, + "grad_norm": 0.263671875, + "learning_rate": 0.0004850621472481711, + "loss": 0.2148, + "step": 80190 + }, + { + "epoch": 3.32, + "grad_norm": 1.0, + "learning_rate": 0.00048505845436013287, + "loss": 0.1926, + "step": 80200 + }, + { + "epoch": 3.32, + "grad_norm": 0.291015625, + "learning_rate": 0.0004850547610297399, + "loss": 0.2083, + "step": 80210 + }, + { + "epoch": 3.32, + "grad_norm": 0.72265625, + "learning_rate": 0.0004850510672569993, + "loss": 0.1658, + "step": 80220 + }, + { + "epoch": 3.32, + "grad_norm": 1.734375, + "learning_rate": 0.00048504737304191795, + "loss": 0.199, + "step": 80230 + }, + { + "epoch": 3.32, + "grad_norm": 0.75, + "learning_rate": 0.00048504367838450287, + "loss": 0.2044, + "step": 80240 + }, + { + "epoch": 3.32, + "grad_norm": 0.90234375, + "learning_rate": 0.00048503998328476095, + "loss": 0.1678, + "step": 80250 + }, + { + "epoch": 3.32, + "grad_norm": 0.62890625, + "learning_rate": 0.00048503628774269913, + "loss": 0.2161, + "step": 80260 + }, + { + "epoch": 3.32, + "grad_norm": 0.73828125, + "learning_rate": 0.00048503259175832446, + "loss": 0.2066, + "step": 80270 + }, + { + "epoch": 3.33, + "grad_norm": 0.390625, + "learning_rate": 0.0004850288953316438, + "loss": 0.239, + "step": 80280 + }, + { + "epoch": 3.33, + "grad_norm": 0.6015625, + "learning_rate": 0.00048502519846266415, + "loss": 0.1934, + "step": 80290 + }, + { + "epoch": 3.33, + "grad_norm": 2.015625, + "learning_rate": 0.00048502150115139244, + "loss": 0.1643, + "step": 80300 + }, + { + "epoch": 3.33, + "grad_norm": 0.6640625, + "learning_rate": 0.00048501780339783564, + "loss": 0.1822, + "step": 80310 + }, + { + "epoch": 3.33, + "grad_norm": 2.0625, + "learning_rate": 0.00048501410520200073, + "loss": 0.1235, + "step": 80320 + }, + { + "epoch": 3.33, + "grad_norm": 0.76953125, + "learning_rate": 0.00048501040656389466, + "loss": 0.2259, + "step": 80330 + }, + { + "epoch": 3.33, + "grad_norm": 0.48828125, + "learning_rate": 0.00048500670748352436, + "loss": 0.2378, + "step": 80340 + }, + { + "epoch": 3.33, + "grad_norm": 0.828125, + "learning_rate": 0.0004850030079608969, + "loss": 0.2253, + "step": 80350 + }, + { + "epoch": 3.33, + "grad_norm": 0.42578125, + "learning_rate": 0.00048499930799601905, + "loss": 0.1994, + "step": 80360 + }, + { + "epoch": 3.33, + "grad_norm": 0.80078125, + "learning_rate": 0.0004849956075888979, + "loss": 0.2437, + "step": 80370 + }, + { + "epoch": 3.33, + "grad_norm": 0.498046875, + "learning_rate": 0.00048499190673954043, + "loss": 0.2531, + "step": 80380 + }, + { + "epoch": 3.33, + "grad_norm": 0.443359375, + "learning_rate": 0.0004849882054479535, + "loss": 0.2442, + "step": 80390 + }, + { + "epoch": 3.33, + "grad_norm": 0.76171875, + "learning_rate": 0.0004849845037141443, + "loss": 0.2175, + "step": 80400 + }, + { + "epoch": 3.33, + "grad_norm": 0.8046875, + "learning_rate": 0.00048498080153811944, + "loss": 0.1956, + "step": 80410 + }, + { + "epoch": 3.33, + "grad_norm": 0.609375, + "learning_rate": 0.00048497709891988616, + "loss": 0.2279, + "step": 80420 + }, + { + "epoch": 3.33, + "grad_norm": 0.54296875, + "learning_rate": 0.0004849733958594514, + "loss": 0.2383, + "step": 80430 + }, + { + "epoch": 3.33, + "grad_norm": 0.6484375, + "learning_rate": 0.00048496969235682207, + "loss": 0.1448, + "step": 80440 + }, + { + "epoch": 3.33, + "grad_norm": 0.466796875, + "learning_rate": 0.00048496598841200515, + "loss": 0.2412, + "step": 80450 + }, + { + "epoch": 3.33, + "grad_norm": 0.75, + "learning_rate": 0.00048496228402500764, + "loss": 0.1841, + "step": 80460 + }, + { + "epoch": 3.33, + "grad_norm": 0.8671875, + "learning_rate": 0.00048495857919583643, + "loss": 0.2608, + "step": 80470 + }, + { + "epoch": 3.33, + "grad_norm": 0.291015625, + "learning_rate": 0.00048495487392449853, + "loss": 0.2494, + "step": 80480 + }, + { + "epoch": 3.33, + "grad_norm": 0.60546875, + "learning_rate": 0.000484951168211001, + "loss": 0.1826, + "step": 80490 + }, + { + "epoch": 3.33, + "grad_norm": 0.68359375, + "learning_rate": 0.00048494746205535074, + "loss": 0.1965, + "step": 80500 + }, + { + "epoch": 3.33, + "grad_norm": 0.84375, + "learning_rate": 0.0004849437554575547, + "loss": 0.2384, + "step": 80510 + }, + { + "epoch": 3.34, + "grad_norm": 0.4140625, + "learning_rate": 0.00048494004841761985, + "loss": 0.2581, + "step": 80520 + }, + { + "epoch": 3.34, + "grad_norm": 0.890625, + "learning_rate": 0.0004849363409355533, + "loss": 0.2726, + "step": 80530 + }, + { + "epoch": 3.34, + "grad_norm": 0.5390625, + "learning_rate": 0.0004849326330113618, + "loss": 0.2447, + "step": 80540 + }, + { + "epoch": 3.34, + "grad_norm": 0.43359375, + "learning_rate": 0.00048492892464505256, + "loss": 0.1522, + "step": 80550 + }, + { + "epoch": 3.34, + "grad_norm": 0.2451171875, + "learning_rate": 0.0004849252158366324, + "loss": 0.1735, + "step": 80560 + }, + { + "epoch": 3.34, + "grad_norm": 1.875, + "learning_rate": 0.0004849215065861083, + "loss": 0.242, + "step": 80570 + }, + { + "epoch": 3.34, + "grad_norm": 1.1171875, + "learning_rate": 0.00048491779689348747, + "loss": 0.1861, + "step": 80580 + }, + { + "epoch": 3.34, + "grad_norm": 0.89453125, + "learning_rate": 0.00048491408675877657, + "loss": 0.1936, + "step": 80590 + }, + { + "epoch": 3.34, + "grad_norm": 0.47265625, + "learning_rate": 0.00048491037618198277, + "loss": 0.2546, + "step": 80600 + }, + { + "epoch": 3.34, + "grad_norm": 0.443359375, + "learning_rate": 0.00048490666516311297, + "loss": 0.2104, + "step": 80610 + }, + { + "epoch": 3.34, + "grad_norm": 0.404296875, + "learning_rate": 0.00048490295370217425, + "loss": 0.1747, + "step": 80620 + }, + { + "epoch": 3.34, + "grad_norm": 0.5546875, + "learning_rate": 0.00048489924179917357, + "loss": 0.2477, + "step": 80630 + }, + { + "epoch": 3.34, + "grad_norm": 1.3671875, + "learning_rate": 0.00048489552945411775, + "loss": 0.2145, + "step": 80640 + }, + { + "epoch": 3.34, + "grad_norm": 1.0234375, + "learning_rate": 0.000484891816667014, + "loss": 0.2651, + "step": 80650 + }, + { + "epoch": 3.34, + "grad_norm": 1.1015625, + "learning_rate": 0.0004848881034378692, + "loss": 0.2008, + "step": 80660 + }, + { + "epoch": 3.34, + "grad_norm": 0.80859375, + "learning_rate": 0.0004848843897666904, + "loss": 0.2624, + "step": 80670 + }, + { + "epoch": 3.34, + "grad_norm": 0.48046875, + "learning_rate": 0.0004848806756534845, + "loss": 0.2182, + "step": 80680 + }, + { + "epoch": 3.34, + "grad_norm": 0.72265625, + "learning_rate": 0.0004848769610982585, + "loss": 0.2114, + "step": 80690 + }, + { + "epoch": 3.34, + "grad_norm": 0.75390625, + "learning_rate": 0.0004848732461010195, + "loss": 0.2468, + "step": 80700 + }, + { + "epoch": 3.34, + "grad_norm": 0.69140625, + "learning_rate": 0.00048486953066177437, + "loss": 0.2043, + "step": 80710 + }, + { + "epoch": 3.34, + "grad_norm": 0.5390625, + "learning_rate": 0.00048486581478053017, + "loss": 0.1799, + "step": 80720 + }, + { + "epoch": 3.34, + "grad_norm": 1.25, + "learning_rate": 0.0004848620984572938, + "loss": 0.2587, + "step": 80730 + }, + { + "epoch": 3.34, + "grad_norm": 0.82421875, + "learning_rate": 0.00048485838169207244, + "loss": 0.1566, + "step": 80740 + }, + { + "epoch": 3.34, + "grad_norm": 0.609375, + "learning_rate": 0.00048485466448487294, + "loss": 0.2086, + "step": 80750 + }, + { + "epoch": 3.35, + "grad_norm": 0.64453125, + "learning_rate": 0.00048485094683570223, + "loss": 0.1585, + "step": 80760 + }, + { + "epoch": 3.35, + "grad_norm": 0.310546875, + "learning_rate": 0.00048484722874456745, + "loss": 0.2617, + "step": 80770 + }, + { + "epoch": 3.35, + "grad_norm": 0.65625, + "learning_rate": 0.00048484351021147555, + "loss": 0.1949, + "step": 80780 + }, + { + "epoch": 3.35, + "grad_norm": 0.48046875, + "learning_rate": 0.00048483979123643356, + "loss": 0.1921, + "step": 80790 + }, + { + "epoch": 3.35, + "grad_norm": 0.0, + "learning_rate": 0.00048483607181944844, + "loss": 0.2364, + "step": 80800 + }, + { + "epoch": 3.35, + "grad_norm": 0.5, + "learning_rate": 0.0004848323519605271, + "loss": 0.1975, + "step": 80810 + }, + { + "epoch": 3.35, + "grad_norm": 0.609375, + "learning_rate": 0.00048482863165967674, + "loss": 0.2009, + "step": 80820 + }, + { + "epoch": 3.35, + "grad_norm": 0.361328125, + "learning_rate": 0.0004848249109169042, + "loss": 0.2076, + "step": 80830 + }, + { + "epoch": 3.35, + "grad_norm": 0.64453125, + "learning_rate": 0.00048482118973221656, + "loss": 0.2289, + "step": 80840 + }, + { + "epoch": 3.35, + "grad_norm": 0.87890625, + "learning_rate": 0.0004848174681056208, + "loss": 0.2487, + "step": 80850 + }, + { + "epoch": 3.35, + "grad_norm": 1.453125, + "learning_rate": 0.0004848137460371239, + "loss": 0.1855, + "step": 80860 + }, + { + "epoch": 3.35, + "grad_norm": 0.63671875, + "learning_rate": 0.00048481002352673293, + "loss": 0.2189, + "step": 80870 + }, + { + "epoch": 3.35, + "grad_norm": 0.6875, + "learning_rate": 0.00048480630057445487, + "loss": 0.256, + "step": 80880 + }, + { + "epoch": 3.35, + "grad_norm": 0.73046875, + "learning_rate": 0.0004848025771802966, + "loss": 0.2304, + "step": 80890 + }, + { + "epoch": 3.35, + "grad_norm": 0.0, + "learning_rate": 0.0004847988533442653, + "loss": 0.1646, + "step": 80900 + }, + { + "epoch": 3.35, + "grad_norm": 0.34765625, + "learning_rate": 0.0004847951290663679, + "loss": 0.1998, + "step": 80910 + }, + { + "epoch": 3.35, + "grad_norm": 1.171875, + "learning_rate": 0.00048479140434661146, + "loss": 0.1997, + "step": 80920 + }, + { + "epoch": 3.35, + "grad_norm": 0.640625, + "learning_rate": 0.0004847876791850029, + "loss": 0.2622, + "step": 80930 + }, + { + "epoch": 3.35, + "grad_norm": 0.69140625, + "learning_rate": 0.00048478395358154936, + "loss": 0.2303, + "step": 80940 + }, + { + "epoch": 3.35, + "grad_norm": 0.48046875, + "learning_rate": 0.0004847802275362577, + "loss": 0.2048, + "step": 80950 + }, + { + "epoch": 3.35, + "grad_norm": 0.6875, + "learning_rate": 0.0004847765010491351, + "loss": 0.2127, + "step": 80960 + }, + { + "epoch": 3.35, + "grad_norm": 0.423828125, + "learning_rate": 0.0004847727741201884, + "loss": 0.1799, + "step": 80970 + }, + { + "epoch": 3.35, + "grad_norm": 0.8671875, + "learning_rate": 0.00048476904674942466, + "loss": 0.2009, + "step": 80980 + }, + { + "epoch": 3.35, + "grad_norm": 0.7421875, + "learning_rate": 0.00048476531893685095, + "loss": 0.2118, + "step": 80990 + }, + { + "epoch": 3.36, + "grad_norm": 0.64453125, + "learning_rate": 0.0004847615906824743, + "loss": 0.2476, + "step": 81000 + }, + { + "epoch": 3.36, + "grad_norm": 0.63671875, + "learning_rate": 0.00048475786198630167, + "loss": 0.208, + "step": 81010 + }, + { + "epoch": 3.36, + "grad_norm": 0.91015625, + "learning_rate": 0.00048475413284834013, + "loss": 0.2489, + "step": 81020 + }, + { + "epoch": 3.36, + "grad_norm": 0.64453125, + "learning_rate": 0.0004847504032685966, + "loss": 0.1978, + "step": 81030 + }, + { + "epoch": 3.36, + "grad_norm": 0.46875, + "learning_rate": 0.00048474667324707824, + "loss": 0.1991, + "step": 81040 + }, + { + "epoch": 3.36, + "grad_norm": 0.8671875, + "learning_rate": 0.0004847429427837919, + "loss": 0.1861, + "step": 81050 + }, + { + "epoch": 3.36, + "grad_norm": 2.375, + "learning_rate": 0.00048473921187874473, + "loss": 0.1891, + "step": 81060 + }, + { + "epoch": 3.36, + "grad_norm": 0.455078125, + "learning_rate": 0.00048473548053194374, + "loss": 0.1894, + "step": 81070 + }, + { + "epoch": 3.36, + "grad_norm": 0.52734375, + "learning_rate": 0.00048473174874339586, + "loss": 0.1529, + "step": 81080 + }, + { + "epoch": 3.36, + "grad_norm": 0.75390625, + "learning_rate": 0.0004847280165131083, + "loss": 0.2741, + "step": 81090 + }, + { + "epoch": 3.36, + "grad_norm": 0.8984375, + "learning_rate": 0.0004847242838410878, + "loss": 0.2159, + "step": 81100 + }, + { + "epoch": 3.36, + "grad_norm": 0.546875, + "learning_rate": 0.00048472055072734167, + "loss": 0.1978, + "step": 81110 + }, + { + "epoch": 3.36, + "grad_norm": 0.30078125, + "learning_rate": 0.00048471681717187677, + "loss": 0.1949, + "step": 81120 + }, + { + "epoch": 3.36, + "grad_norm": 0.490234375, + "learning_rate": 0.00048471308317470015, + "loss": 0.1898, + "step": 81130 + }, + { + "epoch": 3.36, + "grad_norm": 0.7734375, + "learning_rate": 0.00048470934873581887, + "loss": 0.2808, + "step": 81140 + }, + { + "epoch": 3.36, + "grad_norm": 0.470703125, + "learning_rate": 0.0004847056138552399, + "loss": 0.18, + "step": 81150 + }, + { + "epoch": 3.36, + "grad_norm": 0.66796875, + "learning_rate": 0.00048470187853297036, + "loss": 0.1881, + "step": 81160 + }, + { + "epoch": 3.36, + "grad_norm": 0.72265625, + "learning_rate": 0.00048469814276901724, + "loss": 0.2513, + "step": 81170 + }, + { + "epoch": 3.36, + "grad_norm": 0.37109375, + "learning_rate": 0.0004846944065633876, + "loss": 0.1765, + "step": 81180 + }, + { + "epoch": 3.36, + "grad_norm": 1.15625, + "learning_rate": 0.00048469066991608834, + "loss": 0.2562, + "step": 81190 + }, + { + "epoch": 3.36, + "grad_norm": 1.0390625, + "learning_rate": 0.0004846869328271266, + "loss": 0.1877, + "step": 81200 + }, + { + "epoch": 3.36, + "grad_norm": 0.359375, + "learning_rate": 0.0004846831952965094, + "loss": 0.2069, + "step": 81210 + }, + { + "epoch": 3.36, + "grad_norm": 0.7265625, + "learning_rate": 0.0004846794573242438, + "loss": 0.2093, + "step": 81220 + }, + { + "epoch": 3.36, + "grad_norm": 2.25, + "learning_rate": 0.0004846757189103368, + "loss": 0.1857, + "step": 81230 + }, + { + "epoch": 3.36, + "grad_norm": 0.73046875, + "learning_rate": 0.0004846719800547954, + "loss": 0.2508, + "step": 81240 + }, + { + "epoch": 3.37, + "grad_norm": 0.72265625, + "learning_rate": 0.00048466824075762673, + "loss": 0.2697, + "step": 81250 + }, + { + "epoch": 3.37, + "grad_norm": 0.443359375, + "learning_rate": 0.00048466450101883774, + "loss": 0.1808, + "step": 81260 + }, + { + "epoch": 3.37, + "grad_norm": 0.58984375, + "learning_rate": 0.00048466076083843553, + "loss": 0.1829, + "step": 81270 + }, + { + "epoch": 3.37, + "grad_norm": 0.69921875, + "learning_rate": 0.0004846570202164271, + "loss": 0.2062, + "step": 81280 + }, + { + "epoch": 3.37, + "grad_norm": 0.408203125, + "learning_rate": 0.00048465327915281944, + "loss": 0.2312, + "step": 81290 + }, + { + "epoch": 3.37, + "grad_norm": 0.6875, + "learning_rate": 0.0004846495376476197, + "loss": 0.2887, + "step": 81300 + }, + { + "epoch": 3.37, + "grad_norm": 0.439453125, + "learning_rate": 0.0004846457957008349, + "loss": 0.2267, + "step": 81310 + }, + { + "epoch": 3.37, + "grad_norm": 0.609375, + "learning_rate": 0.00048464205331247203, + "loss": 0.2548, + "step": 81320 + }, + { + "epoch": 3.37, + "grad_norm": 0.30859375, + "learning_rate": 0.00048463831048253813, + "loss": 0.2076, + "step": 81330 + }, + { + "epoch": 3.37, + "grad_norm": 0.20703125, + "learning_rate": 0.00048463456721104024, + "loss": 0.2183, + "step": 81340 + }, + { + "epoch": 3.37, + "grad_norm": 0.34765625, + "learning_rate": 0.00048463082349798546, + "loss": 0.2076, + "step": 81350 + }, + { + "epoch": 3.37, + "grad_norm": 0.5859375, + "learning_rate": 0.0004846270793433809, + "loss": 0.2243, + "step": 81360 + }, + { + "epoch": 3.37, + "grad_norm": 0.87890625, + "learning_rate": 0.0004846233347472334, + "loss": 0.1666, + "step": 81370 + }, + { + "epoch": 3.37, + "grad_norm": 0.490234375, + "learning_rate": 0.00048461958970955017, + "loss": 0.2185, + "step": 81380 + }, + { + "epoch": 3.37, + "grad_norm": 0.70703125, + "learning_rate": 0.00048461584423033823, + "loss": 0.2208, + "step": 81390 + }, + { + "epoch": 3.37, + "grad_norm": 1.625, + "learning_rate": 0.00048461209830960453, + "loss": 0.1902, + "step": 81400 + }, + { + "epoch": 3.37, + "grad_norm": 0.484375, + "learning_rate": 0.0004846083519473563, + "loss": 0.1952, + "step": 81410 + }, + { + "epoch": 3.37, + "grad_norm": 1.4296875, + "learning_rate": 0.0004846046051436004, + "loss": 0.1743, + "step": 81420 + }, + { + "epoch": 3.37, + "grad_norm": 0.80859375, + "learning_rate": 0.00048460085789834396, + "loss": 0.2405, + "step": 81430 + }, + { + "epoch": 3.37, + "grad_norm": 0.69921875, + "learning_rate": 0.0004845971102115941, + "loss": 0.2338, + "step": 81440 + }, + { + "epoch": 3.37, + "grad_norm": 0.0, + "learning_rate": 0.0004845933620833578, + "loss": 0.2281, + "step": 81450 + }, + { + "epoch": 3.37, + "grad_norm": 0.55859375, + "learning_rate": 0.0004845896135136421, + "loss": 0.2013, + "step": 81460 + }, + { + "epoch": 3.37, + "grad_norm": 0.68359375, + "learning_rate": 0.0004845858645024541, + "loss": 0.22, + "step": 81470 + }, + { + "epoch": 3.37, + "grad_norm": 0.1689453125, + "learning_rate": 0.0004845821150498008, + "loss": 0.2123, + "step": 81480 + }, + { + "epoch": 3.38, + "grad_norm": 0.40234375, + "learning_rate": 0.0004845783651556893, + "loss": 0.2117, + "step": 81490 + }, + { + "epoch": 3.38, + "grad_norm": 0.578125, + "learning_rate": 0.0004845746148201267, + "loss": 0.222, + "step": 81500 + }, + { + "epoch": 3.38, + "grad_norm": 0.330078125, + "learning_rate": 0.00048457086404312, + "loss": 0.2098, + "step": 81510 + }, + { + "epoch": 3.38, + "grad_norm": 1.109375, + "learning_rate": 0.00048456711282467623, + "loss": 0.1947, + "step": 81520 + }, + { + "epoch": 3.38, + "grad_norm": 0.58203125, + "learning_rate": 0.00048456336116480247, + "loss": 0.2399, + "step": 81530 + }, + { + "epoch": 3.38, + "grad_norm": 0.51953125, + "learning_rate": 0.00048455960906350587, + "loss": 0.2588, + "step": 81540 + }, + { + "epoch": 3.38, + "grad_norm": 0.6953125, + "learning_rate": 0.0004845558565207934, + "loss": 0.2047, + "step": 81550 + }, + { + "epoch": 3.38, + "grad_norm": 1.1640625, + "learning_rate": 0.0004845521035366721, + "loss": 0.2558, + "step": 81560 + }, + { + "epoch": 3.38, + "grad_norm": 0.478515625, + "learning_rate": 0.00048454835011114907, + "loss": 0.2019, + "step": 81570 + }, + { + "epoch": 3.38, + "grad_norm": 1.171875, + "learning_rate": 0.0004845445962442314, + "loss": 0.1764, + "step": 81580 + }, + { + "epoch": 3.38, + "grad_norm": 0.75, + "learning_rate": 0.0004845408419359261, + "loss": 0.2641, + "step": 81590 + }, + { + "epoch": 3.38, + "grad_norm": 1.1953125, + "learning_rate": 0.0004845370871862402, + "loss": 0.299, + "step": 81600 + }, + { + "epoch": 3.38, + "grad_norm": 0.38671875, + "learning_rate": 0.00048453333199518096, + "loss": 0.2141, + "step": 81610 + }, + { + "epoch": 3.38, + "grad_norm": 0.197265625, + "learning_rate": 0.0004845295763627553, + "loss": 0.2005, + "step": 81620 + }, + { + "epoch": 3.38, + "grad_norm": 0.451171875, + "learning_rate": 0.00048452582028897024, + "loss": 0.2278, + "step": 81630 + }, + { + "epoch": 3.38, + "grad_norm": 0.9609375, + "learning_rate": 0.00048452206377383293, + "loss": 0.1831, + "step": 81640 + }, + { + "epoch": 3.38, + "grad_norm": 0.5546875, + "learning_rate": 0.0004845183068173504, + "loss": 0.2113, + "step": 81650 + }, + { + "epoch": 3.38, + "grad_norm": 1.03125, + "learning_rate": 0.0004845145494195298, + "loss": 0.2772, + "step": 81660 + }, + { + "epoch": 3.38, + "grad_norm": 0.314453125, + "learning_rate": 0.00048451079158037813, + "loss": 0.2682, + "step": 81670 + }, + { + "epoch": 3.38, + "grad_norm": 0.31640625, + "learning_rate": 0.00048450703329990246, + "loss": 0.1855, + "step": 81680 + }, + { + "epoch": 3.38, + "grad_norm": 1.328125, + "learning_rate": 0.0004845032745781099, + "loss": 0.2667, + "step": 81690 + }, + { + "epoch": 3.38, + "grad_norm": 0.86328125, + "learning_rate": 0.0004844995154150075, + "loss": 0.2768, + "step": 81700 + }, + { + "epoch": 3.38, + "grad_norm": 0.71484375, + "learning_rate": 0.0004844957558106023, + "loss": 0.2328, + "step": 81710 + }, + { + "epoch": 3.38, + "grad_norm": 0.609375, + "learning_rate": 0.00048449199576490145, + "loss": 0.1991, + "step": 81720 + }, + { + "epoch": 3.39, + "grad_norm": 0.625, + "learning_rate": 0.000484488235277912, + "loss": 0.2357, + "step": 81730 + }, + { + "epoch": 3.39, + "grad_norm": 0.640625, + "learning_rate": 0.0004844844743496409, + "loss": 0.1832, + "step": 81740 + }, + { + "epoch": 3.39, + "grad_norm": 0.41015625, + "learning_rate": 0.00048448071298009545, + "loss": 0.2206, + "step": 81750 + }, + { + "epoch": 3.39, + "grad_norm": 1.78125, + "learning_rate": 0.00048447695116928257, + "loss": 0.1918, + "step": 81760 + }, + { + "epoch": 3.39, + "grad_norm": 0.890625, + "learning_rate": 0.00048447318891720937, + "loss": 0.2012, + "step": 81770 + }, + { + "epoch": 3.39, + "grad_norm": 5.875, + "learning_rate": 0.000484469426223883, + "loss": 0.2267, + "step": 81780 + }, + { + "epoch": 3.39, + "grad_norm": 1.09375, + "learning_rate": 0.0004844656630893105, + "loss": 0.2139, + "step": 81790 + }, + { + "epoch": 3.39, + "grad_norm": 0.5703125, + "learning_rate": 0.00048446189951349895, + "loss": 0.1962, + "step": 81800 + }, + { + "epoch": 3.39, + "grad_norm": 0.796875, + "learning_rate": 0.00048445813549645537, + "loss": 0.1989, + "step": 81810 + }, + { + "epoch": 3.39, + "grad_norm": 0.4140625, + "learning_rate": 0.0004844543710381869, + "loss": 0.2126, + "step": 81820 + }, + { + "epoch": 3.39, + "grad_norm": 0.6875, + "learning_rate": 0.0004844506061387007, + "loss": 0.2475, + "step": 81830 + }, + { + "epoch": 3.39, + "grad_norm": 0.5390625, + "learning_rate": 0.00048444684079800373, + "loss": 0.2323, + "step": 81840 + }, + { + "epoch": 3.39, + "grad_norm": 0.75390625, + "learning_rate": 0.00048444307501610307, + "loss": 0.2409, + "step": 81850 + }, + { + "epoch": 3.39, + "grad_norm": 0.3359375, + "learning_rate": 0.0004844393087930059, + "loss": 0.1598, + "step": 81860 + }, + { + "epoch": 3.39, + "grad_norm": 0.318359375, + "learning_rate": 0.0004844355421287193, + "loss": 0.2233, + "step": 81870 + }, + { + "epoch": 3.39, + "grad_norm": 0.75390625, + "learning_rate": 0.0004844317750232503, + "loss": 0.2118, + "step": 81880 + }, + { + "epoch": 3.39, + "grad_norm": 0.380859375, + "learning_rate": 0.00048442800747660605, + "loss": 0.2248, + "step": 81890 + }, + { + "epoch": 3.39, + "grad_norm": 0.36328125, + "learning_rate": 0.00048442423948879355, + "loss": 0.1889, + "step": 81900 + }, + { + "epoch": 3.39, + "grad_norm": 0.54296875, + "learning_rate": 0.00048442047105982, + "loss": 0.2174, + "step": 81910 + }, + { + "epoch": 3.39, + "grad_norm": 0.5078125, + "learning_rate": 0.0004844167021896924, + "loss": 0.1987, + "step": 81920 + }, + { + "epoch": 3.39, + "grad_norm": 1.453125, + "learning_rate": 0.00048441293287841794, + "loss": 0.1696, + "step": 81930 + }, + { + "epoch": 3.39, + "grad_norm": 0.73828125, + "learning_rate": 0.0004844091631260036, + "loss": 0.2093, + "step": 81940 + }, + { + "epoch": 3.39, + "grad_norm": 0.421875, + "learning_rate": 0.0004844053929324565, + "loss": 0.2151, + "step": 81950 + }, + { + "epoch": 3.39, + "grad_norm": 0.98828125, + "learning_rate": 0.00048440162229778386, + "loss": 0.2365, + "step": 81960 + }, + { + "epoch": 3.4, + "grad_norm": 0.71875, + "learning_rate": 0.00048439785122199254, + "loss": 0.2056, + "step": 81970 + }, + { + "epoch": 3.4, + "grad_norm": 0.51953125, + "learning_rate": 0.0004843940797050899, + "loss": 0.1591, + "step": 81980 + }, + { + "epoch": 3.4, + "grad_norm": 0.53515625, + "learning_rate": 0.0004843903077470828, + "loss": 0.1959, + "step": 81990 + }, + { + "epoch": 3.4, + "grad_norm": 0.62890625, + "learning_rate": 0.00048438653534797863, + "loss": 0.2348, + "step": 82000 + }, + { + "epoch": 3.4, + "grad_norm": 0.56640625, + "learning_rate": 0.0004843827625077841, + "loss": 0.1866, + "step": 82010 + }, + { + "epoch": 3.4, + "grad_norm": 0.353515625, + "learning_rate": 0.0004843789892265067, + "loss": 0.2013, + "step": 82020 + }, + { + "epoch": 3.4, + "grad_norm": 0.84765625, + "learning_rate": 0.00048437521550415326, + "loss": 0.2435, + "step": 82030 + }, + { + "epoch": 3.4, + "grad_norm": 0.90625, + "learning_rate": 0.000484371441340731, + "loss": 0.249, + "step": 82040 + }, + { + "epoch": 3.4, + "grad_norm": 0.95703125, + "learning_rate": 0.000484367666736247, + "loss": 0.2489, + "step": 82050 + }, + { + "epoch": 3.4, + "grad_norm": 0.73046875, + "learning_rate": 0.0004843638916907084, + "loss": 0.2121, + "step": 82060 + }, + { + "epoch": 3.4, + "grad_norm": 1.3671875, + "learning_rate": 0.0004843601162041222, + "loss": 0.2387, + "step": 82070 + }, + { + "epoch": 3.4, + "grad_norm": 0.52734375, + "learning_rate": 0.00048435634027649553, + "loss": 0.2548, + "step": 82080 + }, + { + "epoch": 3.4, + "grad_norm": 0.52734375, + "learning_rate": 0.00048435256390783564, + "loss": 0.2008, + "step": 82090 + }, + { + "epoch": 3.4, + "grad_norm": 0.71875, + "learning_rate": 0.00048434878709814947, + "loss": 0.2112, + "step": 82100 + }, + { + "epoch": 3.4, + "grad_norm": 1.6640625, + "learning_rate": 0.0004843450098474442, + "loss": 0.1585, + "step": 82110 + }, + { + "epoch": 3.4, + "grad_norm": 0.46875, + "learning_rate": 0.00048434123215572694, + "loss": 0.257, + "step": 82120 + }, + { + "epoch": 3.4, + "grad_norm": 0.435546875, + "learning_rate": 0.0004843374540230048, + "loss": 0.2353, + "step": 82130 + }, + { + "epoch": 3.4, + "grad_norm": 1.34375, + "learning_rate": 0.00048433367544928487, + "loss": 0.1812, + "step": 82140 + }, + { + "epoch": 3.4, + "grad_norm": 0.419921875, + "learning_rate": 0.0004843298964345743, + "loss": 0.1804, + "step": 82150 + }, + { + "epoch": 3.4, + "grad_norm": 0.1689453125, + "learning_rate": 0.00048432611697888007, + "loss": 0.1743, + "step": 82160 + }, + { + "epoch": 3.4, + "grad_norm": 0.470703125, + "learning_rate": 0.00048432233708220945, + "loss": 0.2361, + "step": 82170 + }, + { + "epoch": 3.4, + "grad_norm": 0.52734375, + "learning_rate": 0.00048431855674456947, + "loss": 0.2171, + "step": 82180 + }, + { + "epoch": 3.4, + "grad_norm": 0.44921875, + "learning_rate": 0.00048431477596596733, + "loss": 0.1734, + "step": 82190 + }, + { + "epoch": 3.4, + "grad_norm": 0.61328125, + "learning_rate": 0.00048431099474641003, + "loss": 0.1663, + "step": 82200 + }, + { + "epoch": 3.41, + "grad_norm": 1.8515625, + "learning_rate": 0.00048430721308590483, + "loss": 0.2035, + "step": 82210 + }, + { + "epoch": 3.41, + "grad_norm": 0.53125, + "learning_rate": 0.0004843034309844587, + "loss": 0.217, + "step": 82220 + }, + { + "epoch": 3.41, + "grad_norm": 1.7265625, + "learning_rate": 0.0004842996484420788, + "loss": 0.2112, + "step": 82230 + }, + { + "epoch": 3.41, + "grad_norm": 0.5703125, + "learning_rate": 0.00048429586545877223, + "loss": 0.2534, + "step": 82240 + }, + { + "epoch": 3.41, + "grad_norm": 0.83203125, + "learning_rate": 0.00048429208203454625, + "loss": 0.1979, + "step": 82250 + }, + { + "epoch": 3.41, + "grad_norm": 0.345703125, + "learning_rate": 0.00048428829816940777, + "loss": 0.2188, + "step": 82260 + }, + { + "epoch": 3.41, + "grad_norm": 0.53125, + "learning_rate": 0.00048428451386336405, + "loss": 0.2499, + "step": 82270 + }, + { + "epoch": 3.41, + "grad_norm": 1.890625, + "learning_rate": 0.0004842807291164222, + "loss": 0.2187, + "step": 82280 + }, + { + "epoch": 3.41, + "grad_norm": 0.73828125, + "learning_rate": 0.0004842769439285893, + "loss": 0.186, + "step": 82290 + }, + { + "epoch": 3.41, + "grad_norm": 0.65234375, + "learning_rate": 0.0004842731582998725, + "loss": 0.1938, + "step": 82300 + }, + { + "epoch": 3.41, + "grad_norm": 0.76171875, + "learning_rate": 0.0004842693722302789, + "loss": 0.225, + "step": 82310 + }, + { + "epoch": 3.41, + "grad_norm": 1.2265625, + "learning_rate": 0.0004842655857198156, + "loss": 0.1351, + "step": 82320 + }, + { + "epoch": 3.41, + "grad_norm": 0.5546875, + "learning_rate": 0.00048426179876848987, + "loss": 0.2329, + "step": 82330 + }, + { + "epoch": 3.41, + "grad_norm": 1.203125, + "learning_rate": 0.00048425801137630864, + "loss": 0.1734, + "step": 82340 + }, + { + "epoch": 3.41, + "grad_norm": 0.5625, + "learning_rate": 0.0004842542235432792, + "loss": 0.2323, + "step": 82350 + }, + { + "epoch": 3.41, + "grad_norm": 0.390625, + "learning_rate": 0.00048425043526940855, + "loss": 0.2118, + "step": 82360 + }, + { + "epoch": 3.41, + "grad_norm": 0.341796875, + "learning_rate": 0.0004842466465547039, + "loss": 0.1811, + "step": 82370 + }, + { + "epoch": 3.41, + "grad_norm": 0.000408172607421875, + "learning_rate": 0.00048424285739917235, + "loss": 0.153, + "step": 82380 + }, + { + "epoch": 3.41, + "grad_norm": 1.046875, + "learning_rate": 0.00048423906780282105, + "loss": 0.1814, + "step": 82390 + }, + { + "epoch": 3.41, + "grad_norm": 0.8203125, + "learning_rate": 0.0004842352777656571, + "loss": 0.195, + "step": 82400 + }, + { + "epoch": 3.41, + "grad_norm": 1.3671875, + "learning_rate": 0.0004842314872876877, + "loss": 0.2172, + "step": 82410 + }, + { + "epoch": 3.41, + "grad_norm": 0.83984375, + "learning_rate": 0.0004842276963689199, + "loss": 0.2266, + "step": 82420 + }, + { + "epoch": 3.41, + "grad_norm": 0.8359375, + "learning_rate": 0.00048422390500936084, + "loss": 0.2547, + "step": 82430 + }, + { + "epoch": 3.41, + "grad_norm": 0.3671875, + "learning_rate": 0.00048422011320901773, + "loss": 0.237, + "step": 82440 + }, + { + "epoch": 3.42, + "grad_norm": 1.75, + "learning_rate": 0.0004842163209678977, + "loss": 0.2039, + "step": 82450 + }, + { + "epoch": 3.42, + "grad_norm": 0.330078125, + "learning_rate": 0.00048421252828600774, + "loss": 0.2122, + "step": 82460 + }, + { + "epoch": 3.42, + "grad_norm": 0.58984375, + "learning_rate": 0.0004842087351633552, + "loss": 0.2418, + "step": 82470 + }, + { + "epoch": 3.42, + "grad_norm": 0.91015625, + "learning_rate": 0.00048420494159994706, + "loss": 0.1942, + "step": 82480 + }, + { + "epoch": 3.42, + "grad_norm": 0.6171875, + "learning_rate": 0.0004842011475957905, + "loss": 0.2176, + "step": 82490 + }, + { + "epoch": 3.42, + "grad_norm": 0.5546875, + "learning_rate": 0.0004841973531508927, + "loss": 0.2044, + "step": 82500 + }, + { + "epoch": 3.42, + "grad_norm": 1.0625, + "learning_rate": 0.00048419355826526074, + "loss": 0.2086, + "step": 82510 + }, + { + "epoch": 3.42, + "grad_norm": 0.44921875, + "learning_rate": 0.00048418976293890183, + "loss": 0.2055, + "step": 82520 + }, + { + "epoch": 3.42, + "grad_norm": 0.734375, + "learning_rate": 0.0004841859671718231, + "loss": 0.1984, + "step": 82530 + }, + { + "epoch": 3.42, + "grad_norm": 0.62890625, + "learning_rate": 0.0004841821709640316, + "loss": 0.2435, + "step": 82540 + }, + { + "epoch": 3.42, + "grad_norm": 0.341796875, + "learning_rate": 0.00048417837431553455, + "loss": 0.2455, + "step": 82550 + }, + { + "epoch": 3.42, + "grad_norm": 0.41015625, + "learning_rate": 0.00048417457722633913, + "loss": 0.2361, + "step": 82560 + }, + { + "epoch": 3.42, + "grad_norm": 1.4609375, + "learning_rate": 0.0004841707796964524, + "loss": 0.1743, + "step": 82570 + }, + { + "epoch": 3.42, + "grad_norm": 0.259765625, + "learning_rate": 0.0004841669817258816, + "loss": 0.1748, + "step": 82580 + }, + { + "epoch": 3.42, + "grad_norm": 0.87890625, + "learning_rate": 0.00048416318331463384, + "loss": 0.1942, + "step": 82590 + }, + { + "epoch": 3.42, + "grad_norm": 0.78515625, + "learning_rate": 0.0004841593844627162, + "loss": 0.2281, + "step": 82600 + }, + { + "epoch": 3.42, + "grad_norm": 0.455078125, + "learning_rate": 0.00048415558517013593, + "loss": 0.1645, + "step": 82610 + }, + { + "epoch": 3.42, + "grad_norm": 0.73828125, + "learning_rate": 0.0004841517854369001, + "loss": 0.1857, + "step": 82620 + }, + { + "epoch": 3.42, + "grad_norm": 0.5, + "learning_rate": 0.00048414798526301585, + "loss": 0.2487, + "step": 82630 + }, + { + "epoch": 3.42, + "grad_norm": 1.015625, + "learning_rate": 0.00048414418464849045, + "loss": 0.2251, + "step": 82640 + }, + { + "epoch": 3.42, + "grad_norm": 0.51953125, + "learning_rate": 0.000484140383593331, + "loss": 0.2285, + "step": 82650 + }, + { + "epoch": 3.42, + "grad_norm": 0.6640625, + "learning_rate": 0.0004841365820975446, + "loss": 0.2503, + "step": 82660 + }, + { + "epoch": 3.42, + "grad_norm": 0.259765625, + "learning_rate": 0.00048413278016113846, + "loss": 0.2404, + "step": 82670 + }, + { + "epoch": 3.42, + "grad_norm": 0.59765625, + "learning_rate": 0.0004841289777841197, + "loss": 0.258, + "step": 82680 + }, + { + "epoch": 3.43, + "grad_norm": 0.6640625, + "learning_rate": 0.00048412517496649547, + "loss": 0.2457, + "step": 82690 + }, + { + "epoch": 3.43, + "grad_norm": 0.55078125, + "learning_rate": 0.0004841213717082729, + "loss": 0.1853, + "step": 82700 + }, + { + "epoch": 3.43, + "grad_norm": 0.9453125, + "learning_rate": 0.0004841175680094593, + "loss": 0.2006, + "step": 82710 + }, + { + "epoch": 3.43, + "grad_norm": 1.9375, + "learning_rate": 0.0004841137638700616, + "loss": 0.2435, + "step": 82720 + }, + { + "epoch": 3.43, + "grad_norm": 0.6484375, + "learning_rate": 0.0004841099592900872, + "loss": 0.2535, + "step": 82730 + }, + { + "epoch": 3.43, + "grad_norm": 0.419921875, + "learning_rate": 0.0004841061542695431, + "loss": 0.2113, + "step": 82740 + }, + { + "epoch": 3.43, + "grad_norm": 0.80859375, + "learning_rate": 0.0004841023488084364, + "loss": 0.198, + "step": 82750 + }, + { + "epoch": 3.43, + "grad_norm": 0.5703125, + "learning_rate": 0.00048409854290677447, + "loss": 0.1848, + "step": 82760 + }, + { + "epoch": 3.43, + "grad_norm": 0.326171875, + "learning_rate": 0.0004840947365645643, + "loss": 0.2298, + "step": 82770 + }, + { + "epoch": 3.43, + "grad_norm": 0.0, + "learning_rate": 0.00048409092978181313, + "loss": 0.2277, + "step": 82780 + }, + { + "epoch": 3.43, + "grad_norm": 1.9921875, + "learning_rate": 0.0004840871225585282, + "loss": 0.2307, + "step": 82790 + }, + { + "epoch": 3.43, + "grad_norm": 0.73046875, + "learning_rate": 0.0004840833148947165, + "loss": 0.205, + "step": 82800 + }, + { + "epoch": 3.43, + "grad_norm": 0.953125, + "learning_rate": 0.00048407950679038526, + "loss": 0.1709, + "step": 82810 + }, + { + "epoch": 3.43, + "grad_norm": 0.76171875, + "learning_rate": 0.0004840756982455417, + "loss": 0.175, + "step": 82820 + }, + { + "epoch": 3.43, + "grad_norm": 0.51953125, + "learning_rate": 0.00048407188926019297, + "loss": 0.2222, + "step": 82830 + }, + { + "epoch": 3.43, + "grad_norm": 0.90625, + "learning_rate": 0.0004840680798343462, + "loss": 0.1879, + "step": 82840 + }, + { + "epoch": 3.43, + "grad_norm": 0.39453125, + "learning_rate": 0.00048406426996800854, + "loss": 0.17, + "step": 82850 + }, + { + "epoch": 3.43, + "grad_norm": 0.76171875, + "learning_rate": 0.0004840604596611873, + "loss": 0.1944, + "step": 82860 + }, + { + "epoch": 3.43, + "grad_norm": 0.494140625, + "learning_rate": 0.00048405664891388944, + "loss": 0.2199, + "step": 82870 + }, + { + "epoch": 3.43, + "grad_norm": 0.8203125, + "learning_rate": 0.0004840528377261223, + "loss": 0.2358, + "step": 82880 + }, + { + "epoch": 3.43, + "grad_norm": 0.291015625, + "learning_rate": 0.000484049026097893, + "loss": 0.22, + "step": 82890 + }, + { + "epoch": 3.43, + "grad_norm": 0.57421875, + "learning_rate": 0.00048404521402920875, + "loss": 0.1884, + "step": 82900 + }, + { + "epoch": 3.43, + "grad_norm": 0.408203125, + "learning_rate": 0.00048404140152007657, + "loss": 0.215, + "step": 82910 + }, + { + "epoch": 3.43, + "grad_norm": 0.6328125, + "learning_rate": 0.00048403758857050385, + "loss": 0.2117, + "step": 82920 + }, + { + "epoch": 3.43, + "grad_norm": 1.1171875, + "learning_rate": 0.0004840337751804976, + "loss": 0.2398, + "step": 82930 + }, + { + "epoch": 3.44, + "grad_norm": 0.53125, + "learning_rate": 0.0004840299613500651, + "loss": 0.221, + "step": 82940 + }, + { + "epoch": 3.44, + "grad_norm": 0.8359375, + "learning_rate": 0.0004840261470792134, + "loss": 0.2274, + "step": 82950 + }, + { + "epoch": 3.44, + "grad_norm": 0.6796875, + "learning_rate": 0.0004840223323679499, + "loss": 0.2014, + "step": 82960 + }, + { + "epoch": 3.44, + "grad_norm": 0.89453125, + "learning_rate": 0.00048401851721628154, + "loss": 0.2353, + "step": 82970 + }, + { + "epoch": 3.44, + "grad_norm": 0.54296875, + "learning_rate": 0.00048401470162421565, + "loss": 0.1948, + "step": 82980 + }, + { + "epoch": 3.44, + "grad_norm": 0.65234375, + "learning_rate": 0.0004840108855917593, + "loss": 0.1565, + "step": 82990 + }, + { + "epoch": 3.44, + "grad_norm": 0.52734375, + "learning_rate": 0.00048400706911891985, + "loss": 0.1756, + "step": 83000 + }, + { + "epoch": 3.44, + "grad_norm": 0.494140625, + "learning_rate": 0.0004840032522057043, + "loss": 0.2362, + "step": 83010 + }, + { + "epoch": 3.44, + "grad_norm": 0.79296875, + "learning_rate": 0.0004839994348521199, + "loss": 0.2488, + "step": 83020 + }, + { + "epoch": 3.44, + "grad_norm": 0.298828125, + "learning_rate": 0.00048399561705817385, + "loss": 0.2174, + "step": 83030 + }, + { + "epoch": 3.44, + "grad_norm": 0.625, + "learning_rate": 0.0004839917988238733, + "loss": 0.2075, + "step": 83040 + }, + { + "epoch": 3.44, + "grad_norm": 0.396484375, + "learning_rate": 0.0004839879801492254, + "loss": 0.2659, + "step": 83050 + }, + { + "epoch": 3.44, + "grad_norm": 0.74609375, + "learning_rate": 0.0004839841610342375, + "loss": 0.1958, + "step": 83060 + }, + { + "epoch": 3.44, + "grad_norm": 0.466796875, + "learning_rate": 0.00048398034147891655, + "loss": 0.2462, + "step": 83070 + }, + { + "epoch": 3.44, + "grad_norm": 3.453125, + "learning_rate": 0.00048397652148327, + "loss": 0.1773, + "step": 83080 + }, + { + "epoch": 3.44, + "grad_norm": 0.50390625, + "learning_rate": 0.00048397270104730484, + "loss": 0.1522, + "step": 83090 + }, + { + "epoch": 3.44, + "grad_norm": 4.125, + "learning_rate": 0.0004839688801710283, + "loss": 0.2622, + "step": 83100 + }, + { + "epoch": 3.44, + "grad_norm": 0.984375, + "learning_rate": 0.0004839650588544476, + "loss": 0.1975, + "step": 83110 + }, + { + "epoch": 3.44, + "grad_norm": 0.80078125, + "learning_rate": 0.00048396123709757, + "loss": 0.1896, + "step": 83120 + }, + { + "epoch": 3.44, + "grad_norm": 0.75, + "learning_rate": 0.00048395741490040253, + "loss": 0.1916, + "step": 83130 + }, + { + "epoch": 3.44, + "grad_norm": 1.0859375, + "learning_rate": 0.0004839535922629525, + "loss": 0.2439, + "step": 83140 + }, + { + "epoch": 3.44, + "grad_norm": 0.578125, + "learning_rate": 0.000483949769185227, + "loss": 0.2206, + "step": 83150 + }, + { + "epoch": 3.44, + "grad_norm": 0.45703125, + "learning_rate": 0.00048394594566723343, + "loss": 0.216, + "step": 83160 + }, + { + "epoch": 3.44, + "grad_norm": 0.7421875, + "learning_rate": 0.0004839421217089788, + "loss": 0.2498, + "step": 83170 + }, + { + "epoch": 3.45, + "grad_norm": 0.86328125, + "learning_rate": 0.00048393829731047035, + "loss": 0.2252, + "step": 83180 + }, + { + "epoch": 3.45, + "grad_norm": 0.7890625, + "learning_rate": 0.0004839344724717153, + "loss": 0.1922, + "step": 83190 + }, + { + "epoch": 3.45, + "grad_norm": 0.82421875, + "learning_rate": 0.00048393064719272084, + "loss": 0.207, + "step": 83200 + }, + { + "epoch": 3.45, + "grad_norm": 0.84765625, + "learning_rate": 0.00048392682147349413, + "loss": 0.2032, + "step": 83210 + }, + { + "epoch": 3.45, + "grad_norm": 0.33984375, + "learning_rate": 0.00048392299531404247, + "loss": 0.224, + "step": 83220 + }, + { + "epoch": 3.45, + "grad_norm": 0.2890625, + "learning_rate": 0.0004839191687143729, + "loss": 0.161, + "step": 83230 + }, + { + "epoch": 3.45, + "grad_norm": 0.34765625, + "learning_rate": 0.0004839153416744928, + "loss": 0.2719, + "step": 83240 + }, + { + "epoch": 3.45, + "grad_norm": 0.65234375, + "learning_rate": 0.0004839115141944093, + "loss": 0.2162, + "step": 83250 + }, + { + "epoch": 3.45, + "grad_norm": 0.78125, + "learning_rate": 0.0004839076862741295, + "loss": 0.2139, + "step": 83260 + }, + { + "epoch": 3.45, + "grad_norm": 0.61328125, + "learning_rate": 0.0004839038579136608, + "loss": 0.3009, + "step": 83270 + }, + { + "epoch": 3.45, + "grad_norm": 0.3359375, + "learning_rate": 0.00048390002911301023, + "loss": 0.1809, + "step": 83280 + }, + { + "epoch": 3.45, + "grad_norm": 0.404296875, + "learning_rate": 0.00048389619987218504, + "loss": 0.2269, + "step": 83290 + }, + { + "epoch": 3.45, + "grad_norm": 0.296875, + "learning_rate": 0.00048389237019119246, + "loss": 0.2221, + "step": 83300 + }, + { + "epoch": 3.45, + "grad_norm": 1.4296875, + "learning_rate": 0.00048388854007003977, + "loss": 0.17, + "step": 83310 + }, + { + "epoch": 3.45, + "grad_norm": 0.5234375, + "learning_rate": 0.00048388470950873406, + "loss": 0.2111, + "step": 83320 + }, + { + "epoch": 3.45, + "grad_norm": 0.6875, + "learning_rate": 0.00048388087850728255, + "loss": 0.2043, + "step": 83330 + }, + { + "epoch": 3.45, + "grad_norm": 0.365234375, + "learning_rate": 0.00048387704706569256, + "loss": 0.2179, + "step": 83340 + }, + { + "epoch": 3.45, + "grad_norm": 0.396484375, + "learning_rate": 0.0004838732151839712, + "loss": 0.2438, + "step": 83350 + }, + { + "epoch": 3.45, + "grad_norm": 0.4375, + "learning_rate": 0.0004838693828621257, + "loss": 0.2005, + "step": 83360 + }, + { + "epoch": 3.45, + "grad_norm": 1.296875, + "learning_rate": 0.00048386555010016333, + "loss": 0.2195, + "step": 83370 + }, + { + "epoch": 3.45, + "grad_norm": 0.38671875, + "learning_rate": 0.0004838617168980911, + "loss": 0.196, + "step": 83380 + }, + { + "epoch": 3.45, + "grad_norm": 0.2197265625, + "learning_rate": 0.0004838578832559165, + "loss": 0.1808, + "step": 83390 + }, + { + "epoch": 3.45, + "grad_norm": 0.4140625, + "learning_rate": 0.00048385404917364663, + "loss": 0.1655, + "step": 83400 + }, + { + "epoch": 3.45, + "grad_norm": 0.609375, + "learning_rate": 0.0004838502146512886, + "loss": 0.2114, + "step": 83410 + }, + { + "epoch": 3.46, + "grad_norm": 0.5546875, + "learning_rate": 0.0004838463796888498, + "loss": 0.2489, + "step": 83420 + }, + { + "epoch": 3.46, + "grad_norm": 0.59765625, + "learning_rate": 0.00048384254428633736, + "loss": 0.2313, + "step": 83430 + }, + { + "epoch": 3.46, + "grad_norm": 0.5546875, + "learning_rate": 0.00048383870844375846, + "loss": 0.1864, + "step": 83440 + }, + { + "epoch": 3.46, + "grad_norm": 0.578125, + "learning_rate": 0.00048383487216112033, + "loss": 0.2118, + "step": 83450 + }, + { + "epoch": 3.46, + "grad_norm": 0.7890625, + "learning_rate": 0.00048383103543843034, + "loss": 0.2444, + "step": 83460 + }, + { + "epoch": 3.46, + "grad_norm": 0.95703125, + "learning_rate": 0.00048382719827569544, + "loss": 0.2017, + "step": 83470 + }, + { + "epoch": 3.46, + "grad_norm": 1.59375, + "learning_rate": 0.0004838233606729231, + "loss": 0.2311, + "step": 83480 + }, + { + "epoch": 3.46, + "grad_norm": 0.83203125, + "learning_rate": 0.00048381952263012047, + "loss": 0.2236, + "step": 83490 + }, + { + "epoch": 3.46, + "grad_norm": 1.078125, + "learning_rate": 0.0004838156841472947, + "loss": 0.2446, + "step": 83500 + }, + { + "epoch": 3.46, + "grad_norm": 0.87109375, + "learning_rate": 0.0004838118452244531, + "loss": 0.3069, + "step": 83510 + }, + { + "epoch": 3.46, + "grad_norm": 0.1943359375, + "learning_rate": 0.0004838080058616028, + "loss": 0.1853, + "step": 83520 + }, + { + "epoch": 3.46, + "grad_norm": 1.7109375, + "learning_rate": 0.0004838041660587511, + "loss": 0.1547, + "step": 83530 + }, + { + "epoch": 3.46, + "grad_norm": 0.59765625, + "learning_rate": 0.0004838003258159053, + "loss": 0.1884, + "step": 83540 + }, + { + "epoch": 3.46, + "grad_norm": 0.412109375, + "learning_rate": 0.0004837964851330724, + "loss": 0.173, + "step": 83550 + }, + { + "epoch": 3.46, + "grad_norm": 1.2734375, + "learning_rate": 0.00048379264401025987, + "loss": 0.2672, + "step": 83560 + }, + { + "epoch": 3.46, + "grad_norm": 0.84765625, + "learning_rate": 0.00048378880244747476, + "loss": 0.2201, + "step": 83570 + }, + { + "epoch": 3.46, + "grad_norm": 0.77734375, + "learning_rate": 0.0004837849604447244, + "loss": 0.2021, + "step": 83580 + }, + { + "epoch": 3.46, + "grad_norm": 0.59765625, + "learning_rate": 0.00048378111800201595, + "loss": 0.2835, + "step": 83590 + }, + { + "epoch": 3.46, + "grad_norm": 0.91796875, + "learning_rate": 0.0004837772751193568, + "loss": 0.2353, + "step": 83600 + }, + { + "epoch": 3.46, + "grad_norm": 1.4375, + "learning_rate": 0.0004837734317967539, + "loss": 0.2099, + "step": 83610 + }, + { + "epoch": 3.46, + "grad_norm": 0.59375, + "learning_rate": 0.0004837695880342148, + "loss": 0.2187, + "step": 83620 + }, + { + "epoch": 3.46, + "grad_norm": 0.66015625, + "learning_rate": 0.00048376574383174646, + "loss": 0.2573, + "step": 83630 + }, + { + "epoch": 3.46, + "grad_norm": 0.578125, + "learning_rate": 0.0004837618991893563, + "loss": 0.2779, + "step": 83640 + }, + { + "epoch": 3.46, + "grad_norm": 0.248046875, + "learning_rate": 0.00048375805410705145, + "loss": 0.2057, + "step": 83650 + }, + { + "epoch": 3.47, + "grad_norm": 0.68359375, + "learning_rate": 0.0004837542085848392, + "loss": 0.1996, + "step": 83660 + }, + { + "epoch": 3.47, + "grad_norm": 0.3515625, + "learning_rate": 0.0004837503626227268, + "loss": 0.2053, + "step": 83670 + }, + { + "epoch": 3.47, + "grad_norm": 1.1953125, + "learning_rate": 0.0004837465162207214, + "loss": 0.192, + "step": 83680 + }, + { + "epoch": 3.47, + "grad_norm": 1.828125, + "learning_rate": 0.00048374266937883036, + "loss": 0.2185, + "step": 83690 + }, + { + "epoch": 3.47, + "grad_norm": 0.46484375, + "learning_rate": 0.0004837388220970609, + "loss": 0.2369, + "step": 83700 + }, + { + "epoch": 3.47, + "grad_norm": 0.58984375, + "learning_rate": 0.0004837349743754201, + "loss": 0.2424, + "step": 83710 + }, + { + "epoch": 3.47, + "grad_norm": 0.416015625, + "learning_rate": 0.0004837311262139154, + "loss": 0.2576, + "step": 83720 + }, + { + "epoch": 3.47, + "grad_norm": 1.046875, + "learning_rate": 0.00048372727761255397, + "loss": 0.2188, + "step": 83730 + }, + { + "epoch": 3.47, + "grad_norm": 0.7578125, + "learning_rate": 0.000483723428571343, + "loss": 0.241, + "step": 83740 + }, + { + "epoch": 3.47, + "grad_norm": 0.1474609375, + "learning_rate": 0.00048371957909028974, + "loss": 0.2342, + "step": 83750 + }, + { + "epoch": 3.47, + "grad_norm": 0.66796875, + "learning_rate": 0.0004837157291694015, + "loss": 0.2328, + "step": 83760 + }, + { + "epoch": 3.47, + "grad_norm": 0.78125, + "learning_rate": 0.0004837118788086856, + "loss": 0.1965, + "step": 83770 + }, + { + "epoch": 3.47, + "grad_norm": 1.171875, + "learning_rate": 0.00048370802800814903, + "loss": 0.2285, + "step": 83780 + }, + { + "epoch": 3.47, + "grad_norm": 1.6015625, + "learning_rate": 0.0004837041767677993, + "loss": 0.2253, + "step": 83790 + }, + { + "epoch": 3.47, + "grad_norm": 0.5078125, + "learning_rate": 0.0004837003250876435, + "loss": 0.1864, + "step": 83800 + }, + { + "epoch": 3.47, + "grad_norm": 0.64453125, + "learning_rate": 0.0004836964729676889, + "loss": 0.1916, + "step": 83810 + }, + { + "epoch": 3.47, + "grad_norm": 0.9765625, + "learning_rate": 0.0004836926204079428, + "loss": 0.2106, + "step": 83820 + }, + { + "epoch": 3.47, + "grad_norm": 0.640625, + "learning_rate": 0.0004836887674084125, + "loss": 0.2455, + "step": 83830 + }, + { + "epoch": 3.47, + "grad_norm": 0.63671875, + "learning_rate": 0.00048368491396910506, + "loss": 0.2554, + "step": 83840 + }, + { + "epoch": 3.47, + "grad_norm": 0.69140625, + "learning_rate": 0.0004836810600900279, + "loss": 0.1948, + "step": 83850 + }, + { + "epoch": 3.47, + "grad_norm": 0.734375, + "learning_rate": 0.00048367720577118824, + "loss": 0.2347, + "step": 83860 + }, + { + "epoch": 3.47, + "grad_norm": 0.51171875, + "learning_rate": 0.0004836733510125933, + "loss": 0.2236, + "step": 83870 + }, + { + "epoch": 3.47, + "grad_norm": 0.71484375, + "learning_rate": 0.0004836694958142503, + "loss": 0.1952, + "step": 83880 + }, + { + "epoch": 3.47, + "grad_norm": 0.88671875, + "learning_rate": 0.0004836656401761667, + "loss": 0.2218, + "step": 83890 + }, + { + "epoch": 3.48, + "grad_norm": 0.5625, + "learning_rate": 0.0004836617840983494, + "loss": 0.224, + "step": 83900 + }, + { + "epoch": 3.48, + "grad_norm": 1.0703125, + "learning_rate": 0.00048365792758080597, + "loss": 0.2163, + "step": 83910 + }, + { + "epoch": 3.48, + "grad_norm": 0.859375, + "learning_rate": 0.0004836540706235435, + "loss": 0.2142, + "step": 83920 + }, + { + "epoch": 3.48, + "grad_norm": 0.357421875, + "learning_rate": 0.00048365021322656934, + "loss": 0.1991, + "step": 83930 + }, + { + "epoch": 3.48, + "grad_norm": 0.4375, + "learning_rate": 0.00048364635538989065, + "loss": 0.2036, + "step": 83940 + }, + { + "epoch": 3.48, + "grad_norm": 0.3828125, + "learning_rate": 0.0004836424971135148, + "loss": 0.2664, + "step": 83950 + }, + { + "epoch": 3.48, + "grad_norm": 0.72265625, + "learning_rate": 0.00048363863839744905, + "loss": 0.2682, + "step": 83960 + }, + { + "epoch": 3.48, + "grad_norm": 0.85546875, + "learning_rate": 0.0004836347792417005, + "loss": 0.1756, + "step": 83970 + }, + { + "epoch": 3.48, + "grad_norm": 1.90625, + "learning_rate": 0.0004836309196462766, + "loss": 0.2557, + "step": 83980 + }, + { + "epoch": 3.48, + "grad_norm": 0.380859375, + "learning_rate": 0.0004836270596111845, + "loss": 0.1903, + "step": 83990 + }, + { + "epoch": 3.48, + "grad_norm": 2.265625, + "learning_rate": 0.00048362319913643155, + "loss": 0.2324, + "step": 84000 + }, + { + "epoch": 3.48, + "grad_norm": 0.5390625, + "learning_rate": 0.00048361933822202495, + "loss": 0.165, + "step": 84010 + }, + { + "epoch": 3.48, + "grad_norm": 0.84375, + "learning_rate": 0.000483615476867972, + "loss": 0.202, + "step": 84020 + }, + { + "epoch": 3.48, + "grad_norm": 0.330078125, + "learning_rate": 0.00048361161507427987, + "loss": 0.1942, + "step": 84030 + }, + { + "epoch": 3.48, + "grad_norm": 0.53125, + "learning_rate": 0.00048360775284095596, + "loss": 0.2152, + "step": 84040 + }, + { + "epoch": 3.48, + "grad_norm": 0.478515625, + "learning_rate": 0.00048360389016800754, + "loss": 0.2225, + "step": 84050 + }, + { + "epoch": 3.48, + "grad_norm": 0.90625, + "learning_rate": 0.00048360002705544175, + "loss": 0.2653, + "step": 84060 + }, + { + "epoch": 3.48, + "grad_norm": 0.7890625, + "learning_rate": 0.000483596163503266, + "loss": 0.2579, + "step": 84070 + }, + { + "epoch": 3.48, + "grad_norm": 0.7421875, + "learning_rate": 0.0004835922995114874, + "loss": 0.2012, + "step": 84080 + }, + { + "epoch": 3.48, + "grad_norm": 0.73046875, + "learning_rate": 0.0004835884350801134, + "loss": 0.1969, + "step": 84090 + }, + { + "epoch": 3.48, + "grad_norm": 1.171875, + "learning_rate": 0.0004835845702091512, + "loss": 0.1906, + "step": 84100 + }, + { + "epoch": 3.48, + "grad_norm": 1.1015625, + "learning_rate": 0.00048358070489860807, + "loss": 0.199, + "step": 84110 + }, + { + "epoch": 3.48, + "grad_norm": 0.84375, + "learning_rate": 0.0004835768391484912, + "loss": 0.2365, + "step": 84120 + }, + { + "epoch": 3.48, + "grad_norm": 0.72265625, + "learning_rate": 0.00048357297295880796, + "loss": 0.1946, + "step": 84130 + }, + { + "epoch": 3.49, + "grad_norm": 1.2265625, + "learning_rate": 0.00048356910632956564, + "loss": 0.2372, + "step": 84140 + }, + { + "epoch": 3.49, + "grad_norm": 0.76953125, + "learning_rate": 0.0004835652392607715, + "loss": 0.1938, + "step": 84150 + }, + { + "epoch": 3.49, + "grad_norm": 1.546875, + "learning_rate": 0.00048356137175243274, + "loss": 0.2487, + "step": 84160 + }, + { + "epoch": 3.49, + "grad_norm": 0.75, + "learning_rate": 0.00048355750380455674, + "loss": 0.2212, + "step": 84170 + }, + { + "epoch": 3.49, + "grad_norm": 0.87109375, + "learning_rate": 0.0004835536354171508, + "loss": 0.2403, + "step": 84180 + }, + { + "epoch": 3.49, + "grad_norm": 0.5625, + "learning_rate": 0.0004835497665902221, + "loss": 0.2392, + "step": 84190 + }, + { + "epoch": 3.49, + "grad_norm": 0.26171875, + "learning_rate": 0.0004835458973237779, + "loss": 0.1639, + "step": 84200 + }, + { + "epoch": 3.49, + "grad_norm": 0.58984375, + "learning_rate": 0.0004835420276178256, + "loss": 0.2565, + "step": 84210 + }, + { + "epoch": 3.49, + "grad_norm": 0.9375, + "learning_rate": 0.00048353815747237237, + "loss": 0.2501, + "step": 84220 + }, + { + "epoch": 3.49, + "grad_norm": 0.7578125, + "learning_rate": 0.00048353428688742564, + "loss": 0.2121, + "step": 84230 + }, + { + "epoch": 3.49, + "grad_norm": 0.400390625, + "learning_rate": 0.0004835304158629925, + "loss": 0.1596, + "step": 84240 + }, + { + "epoch": 3.49, + "grad_norm": 0.65234375, + "learning_rate": 0.0004835265443990804, + "loss": 0.2195, + "step": 84250 + }, + { + "epoch": 3.49, + "grad_norm": 1.015625, + "learning_rate": 0.0004835226724956966, + "loss": 0.1555, + "step": 84260 + }, + { + "epoch": 3.49, + "grad_norm": 0.6015625, + "learning_rate": 0.0004835188001528483, + "loss": 0.1846, + "step": 84270 + }, + { + "epoch": 3.49, + "grad_norm": 0.490234375, + "learning_rate": 0.0004835149273705428, + "loss": 0.2287, + "step": 84280 + }, + { + "epoch": 3.49, + "grad_norm": 0.76953125, + "learning_rate": 0.0004835110541487875, + "loss": 0.1753, + "step": 84290 + }, + { + "epoch": 3.49, + "grad_norm": 0.431640625, + "learning_rate": 0.0004835071804875895, + "loss": 0.2302, + "step": 84300 + }, + { + "epoch": 3.49, + "grad_norm": 0.77734375, + "learning_rate": 0.00048350330638695626, + "loss": 0.2028, + "step": 84310 + }, + { + "epoch": 3.49, + "grad_norm": 0.283203125, + "learning_rate": 0.000483499431846895, + "loss": 0.2857, + "step": 84320 + }, + { + "epoch": 3.49, + "grad_norm": 0.53125, + "learning_rate": 0.0004834955568674131, + "loss": 0.2383, + "step": 84330 + }, + { + "epoch": 3.49, + "grad_norm": 0.671875, + "learning_rate": 0.00048349168144851774, + "loss": 0.1913, + "step": 84340 + }, + { + "epoch": 3.49, + "grad_norm": 0.470703125, + "learning_rate": 0.0004834878055902162, + "loss": 0.1969, + "step": 84350 + }, + { + "epoch": 3.49, + "grad_norm": 0.490234375, + "learning_rate": 0.00048348392929251587, + "loss": 0.1781, + "step": 84360 + }, + { + "epoch": 3.49, + "grad_norm": 0.63671875, + "learning_rate": 0.00048348005255542404, + "loss": 0.1827, + "step": 84370 + }, + { + "epoch": 3.5, + "grad_norm": 0.7890625, + "learning_rate": 0.0004834761753789479, + "loss": 0.2775, + "step": 84380 + }, + { + "epoch": 3.5, + "grad_norm": 1.1875, + "learning_rate": 0.0004834722977630948, + "loss": 0.1558, + "step": 84390 + }, + { + "epoch": 3.5, + "grad_norm": 0.7578125, + "learning_rate": 0.0004834684197078721, + "loss": 0.1807, + "step": 84400 + }, + { + "epoch": 3.5, + "grad_norm": 0.9453125, + "learning_rate": 0.000483464541213287, + "loss": 0.1897, + "step": 84410 + }, + { + "epoch": 3.5, + "grad_norm": 0.609375, + "learning_rate": 0.0004834606622793469, + "loss": 0.2198, + "step": 84420 + }, + { + "epoch": 3.5, + "grad_norm": 0.6640625, + "learning_rate": 0.000483456782906059, + "loss": 0.1809, + "step": 84430 + }, + { + "epoch": 3.5, + "grad_norm": 0.52734375, + "learning_rate": 0.00048345290309343065, + "loss": 0.2299, + "step": 84440 + }, + { + "epoch": 3.5, + "grad_norm": 0.60546875, + "learning_rate": 0.00048344902284146917, + "loss": 0.234, + "step": 84450 + }, + { + "epoch": 3.5, + "grad_norm": 0.73828125, + "learning_rate": 0.0004834451421501818, + "loss": 0.2304, + "step": 84460 + }, + { + "epoch": 3.5, + "grad_norm": 0.61328125, + "learning_rate": 0.00048344126101957594, + "loss": 0.2153, + "step": 84470 + }, + { + "epoch": 3.5, + "grad_norm": 1.4140625, + "learning_rate": 0.00048343737944965884, + "loss": 0.2297, + "step": 84480 + }, + { + "epoch": 3.5, + "grad_norm": 0.546875, + "learning_rate": 0.00048343349744043776, + "loss": 0.2167, + "step": 84490 + }, + { + "epoch": 3.5, + "grad_norm": 0.81640625, + "learning_rate": 0.00048342961499192003, + "loss": 0.2522, + "step": 84500 + }, + { + "epoch": 3.5, + "grad_norm": 1.4609375, + "learning_rate": 0.000483425732104113, + "loss": 0.2318, + "step": 84510 + }, + { + "epoch": 3.5, + "grad_norm": 0.8125, + "learning_rate": 0.00048342184877702386, + "loss": 0.2427, + "step": 84520 + }, + { + "epoch": 3.5, + "grad_norm": 0.59765625, + "learning_rate": 0.0004834179650106602, + "loss": 0.2526, + "step": 84530 + }, + { + "epoch": 3.5, + "grad_norm": 0.1796875, + "learning_rate": 0.00048341408080502894, + "loss": 0.1821, + "step": 84540 + }, + { + "epoch": 3.5, + "grad_norm": 0.578125, + "learning_rate": 0.0004834101961601377, + "loss": 0.1957, + "step": 84550 + }, + { + "epoch": 3.5, + "grad_norm": 1.3515625, + "learning_rate": 0.00048340631107599366, + "loss": 0.2197, + "step": 84560 + }, + { + "epoch": 3.5, + "grad_norm": 0.55078125, + "learning_rate": 0.0004834024255526042, + "loss": 0.2128, + "step": 84570 + }, + { + "epoch": 3.5, + "grad_norm": 0.75390625, + "learning_rate": 0.0004833985395899765, + "loss": 0.2401, + "step": 84580 + }, + { + "epoch": 3.5, + "grad_norm": 1.0859375, + "learning_rate": 0.0004833946531881179, + "loss": 0.2082, + "step": 84590 + }, + { + "epoch": 3.5, + "grad_norm": 0.375, + "learning_rate": 0.0004833907663470359, + "loss": 0.2511, + "step": 84600 + }, + { + "epoch": 3.5, + "grad_norm": 0.48828125, + "learning_rate": 0.0004833868790667376, + "loss": 0.233, + "step": 84610 + }, + { + "epoch": 3.5, + "grad_norm": 0.8984375, + "learning_rate": 0.0004833829913472304, + "loss": 0.2383, + "step": 84620 + }, + { + "epoch": 3.51, + "grad_norm": 1.171875, + "learning_rate": 0.0004833791031885216, + "loss": 0.2466, + "step": 84630 + }, + { + "epoch": 3.51, + "grad_norm": 0.5546875, + "learning_rate": 0.00048337521459061863, + "loss": 0.1708, + "step": 84640 + }, + { + "epoch": 3.51, + "grad_norm": 0.65625, + "learning_rate": 0.00048337132555352865, + "loss": 0.2559, + "step": 84650 + }, + { + "epoch": 3.51, + "grad_norm": 0.87109375, + "learning_rate": 0.00048336743607725895, + "loss": 0.1535, + "step": 84660 + }, + { + "epoch": 3.51, + "grad_norm": 0.2119140625, + "learning_rate": 0.0004833635461618171, + "loss": 0.1927, + "step": 84670 + }, + { + "epoch": 3.51, + "grad_norm": 0.640625, + "learning_rate": 0.0004833596558072101, + "loss": 0.2276, + "step": 84680 + }, + { + "epoch": 3.51, + "grad_norm": 0.94140625, + "learning_rate": 0.0004833557650134456, + "loss": 0.1889, + "step": 84690 + }, + { + "epoch": 3.51, + "grad_norm": 0.50390625, + "learning_rate": 0.0004833518737805306, + "loss": 0.2006, + "step": 84700 + }, + { + "epoch": 3.51, + "grad_norm": 0.79296875, + "learning_rate": 0.00048334798210847264, + "loss": 0.2068, + "step": 84710 + }, + { + "epoch": 3.51, + "grad_norm": 0.859375, + "learning_rate": 0.00048334408999727897, + "loss": 0.1845, + "step": 84720 + }, + { + "epoch": 3.51, + "grad_norm": 1.1171875, + "learning_rate": 0.0004833401974469569, + "loss": 0.1661, + "step": 84730 + }, + { + "epoch": 3.51, + "grad_norm": 1.4375, + "learning_rate": 0.00048333630445751373, + "loss": 0.1964, + "step": 84740 + }, + { + "epoch": 3.51, + "grad_norm": 0.439453125, + "learning_rate": 0.00048333241102895696, + "loss": 0.2096, + "step": 84750 + }, + { + "epoch": 3.51, + "grad_norm": 0.9296875, + "learning_rate": 0.0004833285171612937, + "loss": 0.2859, + "step": 84760 + }, + { + "epoch": 3.51, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004833246228545314, + "loss": 0.2331, + "step": 84770 + }, + { + "epoch": 3.51, + "grad_norm": 0.5703125, + "learning_rate": 0.00048332072810867733, + "loss": 0.2552, + "step": 84780 + }, + { + "epoch": 3.51, + "grad_norm": 0.90234375, + "learning_rate": 0.0004833168329237389, + "loss": 0.2087, + "step": 84790 + }, + { + "epoch": 3.51, + "grad_norm": 1.203125, + "learning_rate": 0.00048331293729972336, + "loss": 0.2448, + "step": 84800 + }, + { + "epoch": 3.51, + "grad_norm": 0.51171875, + "learning_rate": 0.00048330904123663804, + "loss": 0.208, + "step": 84810 + }, + { + "epoch": 3.51, + "grad_norm": 0.625, + "learning_rate": 0.00048330514473449035, + "loss": 0.1949, + "step": 84820 + }, + { + "epoch": 3.51, + "grad_norm": 0.96875, + "learning_rate": 0.00048330124779328757, + "loss": 0.203, + "step": 84830 + }, + { + "epoch": 3.51, + "grad_norm": 0.546875, + "learning_rate": 0.00048329735041303696, + "loss": 0.2151, + "step": 84840 + }, + { + "epoch": 3.51, + "grad_norm": 0.80859375, + "learning_rate": 0.00048329345259374605, + "loss": 0.2019, + "step": 84850 + }, + { + "epoch": 3.51, + "grad_norm": 0.87109375, + "learning_rate": 0.0004832895543354219, + "loss": 0.2097, + "step": 84860 + }, + { + "epoch": 3.52, + "grad_norm": 0.60546875, + "learning_rate": 0.0004832856556380721, + "loss": 0.1954, + "step": 84870 + }, + { + "epoch": 3.52, + "grad_norm": 0.66015625, + "learning_rate": 0.0004832817565017039, + "loss": 0.1602, + "step": 84880 + }, + { + "epoch": 3.52, + "grad_norm": 1.0546875, + "learning_rate": 0.0004832778569263246, + "loss": 0.2157, + "step": 84890 + }, + { + "epoch": 3.52, + "grad_norm": 0.8984375, + "learning_rate": 0.00048327395691194155, + "loss": 0.227, + "step": 84900 + }, + { + "epoch": 3.52, + "grad_norm": 0.55078125, + "learning_rate": 0.00048327005645856216, + "loss": 0.1924, + "step": 84910 + }, + { + "epoch": 3.52, + "grad_norm": 0.5703125, + "learning_rate": 0.00048326615556619365, + "loss": 0.1731, + "step": 84920 + }, + { + "epoch": 3.52, + "grad_norm": 0.72265625, + "learning_rate": 0.00048326225423484345, + "loss": 0.2289, + "step": 84930 + }, + { + "epoch": 3.52, + "grad_norm": 0.546875, + "learning_rate": 0.00048325835246451886, + "loss": 0.1715, + "step": 84940 + }, + { + "epoch": 3.52, + "grad_norm": 0.5, + "learning_rate": 0.00048325445025522727, + "loss": 0.1766, + "step": 84950 + }, + { + "epoch": 3.52, + "grad_norm": 0.98046875, + "learning_rate": 0.00048325054760697594, + "loss": 0.2493, + "step": 84960 + }, + { + "epoch": 3.52, + "grad_norm": 0.75390625, + "learning_rate": 0.00048324664451977235, + "loss": 0.1902, + "step": 84970 + }, + { + "epoch": 3.52, + "grad_norm": 0.84765625, + "learning_rate": 0.00048324274099362377, + "loss": 0.173, + "step": 84980 + }, + { + "epoch": 3.52, + "grad_norm": 0.65625, + "learning_rate": 0.00048323883702853745, + "loss": 0.2154, + "step": 84990 + }, + { + "epoch": 3.52, + "grad_norm": 0.388671875, + "learning_rate": 0.0004832349326245209, + "loss": 0.2232, + "step": 85000 + }, + { + "epoch": 3.52, + "grad_norm": 0.90625, + "learning_rate": 0.0004832310277815813, + "loss": 0.2199, + "step": 85010 + }, + { + "epoch": 3.52, + "grad_norm": 1.2578125, + "learning_rate": 0.0004832271224997262, + "loss": 0.2364, + "step": 85020 + }, + { + "epoch": 3.52, + "grad_norm": 0.609375, + "learning_rate": 0.0004832232167789628, + "loss": 0.221, + "step": 85030 + }, + { + "epoch": 3.52, + "grad_norm": 0.65234375, + "learning_rate": 0.00048321931061929847, + "loss": 0.2217, + "step": 85040 + }, + { + "epoch": 3.52, + "grad_norm": 0.91015625, + "learning_rate": 0.0004832154040207406, + "loss": 0.2487, + "step": 85050 + }, + { + "epoch": 3.52, + "grad_norm": 0.734375, + "learning_rate": 0.0004832114969832966, + "loss": 0.2074, + "step": 85060 + }, + { + "epoch": 3.52, + "grad_norm": 1.2890625, + "learning_rate": 0.00048320758950697366, + "loss": 0.2174, + "step": 85070 + }, + { + "epoch": 3.52, + "grad_norm": 0.376953125, + "learning_rate": 0.0004832036815917792, + "loss": 0.1682, + "step": 85080 + }, + { + "epoch": 3.52, + "grad_norm": 0.69921875, + "learning_rate": 0.0004831997732377206, + "loss": 0.2285, + "step": 85090 + }, + { + "epoch": 3.52, + "grad_norm": 1.1328125, + "learning_rate": 0.00048319586444480525, + "loss": 0.2101, + "step": 85100 + }, + { + "epoch": 3.53, + "grad_norm": 0.29296875, + "learning_rate": 0.0004831919552130405, + "loss": 0.2403, + "step": 85110 + }, + { + "epoch": 3.53, + "grad_norm": 0.640625, + "learning_rate": 0.0004831880455424336, + "loss": 0.2085, + "step": 85120 + }, + { + "epoch": 3.53, + "grad_norm": 0.50390625, + "learning_rate": 0.000483184135432992, + "loss": 0.2112, + "step": 85130 + }, + { + "epoch": 3.53, + "grad_norm": 0.1748046875, + "learning_rate": 0.0004831802248847231, + "loss": 0.188, + "step": 85140 + }, + { + "epoch": 3.53, + "grad_norm": 0.46875, + "learning_rate": 0.0004831763138976341, + "loss": 0.2405, + "step": 85150 + }, + { + "epoch": 3.53, + "grad_norm": 0.271484375, + "learning_rate": 0.00048317240247173256, + "loss": 0.2155, + "step": 85160 + }, + { + "epoch": 3.53, + "grad_norm": 0.703125, + "learning_rate": 0.0004831684906070257, + "loss": 0.2167, + "step": 85170 + }, + { + "epoch": 3.53, + "grad_norm": 0.44140625, + "learning_rate": 0.0004831645783035209, + "loss": 0.2038, + "step": 85180 + }, + { + "epoch": 3.53, + "grad_norm": 0.36328125, + "learning_rate": 0.0004831606655612255, + "loss": 0.173, + "step": 85190 + }, + { + "epoch": 3.53, + "grad_norm": 1.1171875, + "learning_rate": 0.000483156752380147, + "loss": 0.2153, + "step": 85200 + }, + { + "epoch": 3.53, + "grad_norm": 0.93359375, + "learning_rate": 0.0004831528387602926, + "loss": 0.2266, + "step": 85210 + }, + { + "epoch": 3.53, + "grad_norm": 0.73046875, + "learning_rate": 0.00048314892470166976, + "loss": 0.1933, + "step": 85220 + }, + { + "epoch": 3.53, + "grad_norm": 0.87890625, + "learning_rate": 0.0004831450102042858, + "loss": 0.1813, + "step": 85230 + }, + { + "epoch": 3.53, + "grad_norm": 1.5234375, + "learning_rate": 0.00048314109526814816, + "loss": 0.2577, + "step": 85240 + }, + { + "epoch": 3.53, + "grad_norm": 1.0078125, + "learning_rate": 0.0004831371798932641, + "loss": 0.256, + "step": 85250 + }, + { + "epoch": 3.53, + "grad_norm": 0.71875, + "learning_rate": 0.0004831332640796411, + "loss": 0.2518, + "step": 85260 + }, + { + "epoch": 3.53, + "grad_norm": 0.98828125, + "learning_rate": 0.00048312934782728645, + "loss": 0.1895, + "step": 85270 + }, + { + "epoch": 3.53, + "grad_norm": 0.404296875, + "learning_rate": 0.00048312543113620757, + "loss": 0.246, + "step": 85280 + }, + { + "epoch": 3.53, + "grad_norm": 0.67578125, + "learning_rate": 0.00048312151400641175, + "loss": 0.1993, + "step": 85290 + }, + { + "epoch": 3.53, + "grad_norm": 0.4140625, + "learning_rate": 0.0004831175964379064, + "loss": 0.2289, + "step": 85300 + }, + { + "epoch": 3.53, + "grad_norm": 0.875, + "learning_rate": 0.00048311367843069896, + "loss": 0.1599, + "step": 85310 + }, + { + "epoch": 3.53, + "grad_norm": 0.6015625, + "learning_rate": 0.0004831097599847968, + "loss": 0.1975, + "step": 85320 + }, + { + "epoch": 3.53, + "grad_norm": 0.52734375, + "learning_rate": 0.0004831058411002072, + "loss": 0.2598, + "step": 85330 + }, + { + "epoch": 3.53, + "grad_norm": 0.890625, + "learning_rate": 0.00048310192177693743, + "loss": 0.1715, + "step": 85340 + }, + { + "epoch": 3.54, + "grad_norm": 0.8359375, + "learning_rate": 0.0004830980020149952, + "loss": 0.1941, + "step": 85350 + }, + { + "epoch": 3.54, + "grad_norm": 0.50390625, + "learning_rate": 0.0004830940818143876, + "loss": 0.1847, + "step": 85360 + }, + { + "epoch": 3.54, + "grad_norm": 0.765625, + "learning_rate": 0.0004830901611751222, + "loss": 0.2444, + "step": 85370 + }, + { + "epoch": 3.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00048308624009720625, + "loss": 0.2276, + "step": 85380 + }, + { + "epoch": 3.54, + "grad_norm": 0.37109375, + "learning_rate": 0.00048308231858064724, + "loss": 0.1728, + "step": 85390 + }, + { + "epoch": 3.54, + "grad_norm": 1.515625, + "learning_rate": 0.0004830783966254524, + "loss": 0.2081, + "step": 85400 + }, + { + "epoch": 3.54, + "grad_norm": 0.6875, + "learning_rate": 0.0004830744742316291, + "loss": 0.209, + "step": 85410 + }, + { + "epoch": 3.54, + "grad_norm": 1.1328125, + "learning_rate": 0.000483070551399185, + "loss": 0.219, + "step": 85420 + }, + { + "epoch": 3.54, + "grad_norm": 1.25, + "learning_rate": 0.0004830666281281272, + "loss": 0.241, + "step": 85430 + }, + { + "epoch": 3.54, + "grad_norm": 0.54296875, + "learning_rate": 0.0004830627044184631, + "loss": 0.2186, + "step": 85440 + }, + { + "epoch": 3.54, + "grad_norm": 1.4375, + "learning_rate": 0.00048305878027020025, + "loss": 0.239, + "step": 85450 + }, + { + "epoch": 3.54, + "grad_norm": 0.240234375, + "learning_rate": 0.00048305485568334597, + "loss": 0.2408, + "step": 85460 + }, + { + "epoch": 3.54, + "grad_norm": 0.76953125, + "learning_rate": 0.0004830509306579075, + "loss": 0.1667, + "step": 85470 + }, + { + "epoch": 3.54, + "grad_norm": 1.6328125, + "learning_rate": 0.0004830470051938924, + "loss": 0.2046, + "step": 85480 + }, + { + "epoch": 3.54, + "grad_norm": 0.83203125, + "learning_rate": 0.00048304307929130806, + "loss": 0.2299, + "step": 85490 + }, + { + "epoch": 3.54, + "grad_norm": 0.37890625, + "learning_rate": 0.00048303915295016175, + "loss": 0.2163, + "step": 85500 + }, + { + "epoch": 3.54, + "grad_norm": 0.40625, + "learning_rate": 0.0004830352261704609, + "loss": 0.1806, + "step": 85510 + }, + { + "epoch": 3.54, + "grad_norm": 0.22265625, + "learning_rate": 0.000483031298952213, + "loss": 0.1969, + "step": 85520 + }, + { + "epoch": 3.54, + "grad_norm": 0.341796875, + "learning_rate": 0.00048302737129542536, + "loss": 0.2095, + "step": 85530 + }, + { + "epoch": 3.54, + "grad_norm": 0.73046875, + "learning_rate": 0.00048302344320010526, + "loss": 0.2265, + "step": 85540 + }, + { + "epoch": 3.54, + "grad_norm": 0.9609375, + "learning_rate": 0.0004830195146662603, + "loss": 0.1746, + "step": 85550 + }, + { + "epoch": 3.54, + "grad_norm": 0.77734375, + "learning_rate": 0.0004830155856938978, + "loss": 0.1587, + "step": 85560 + }, + { + "epoch": 3.54, + "grad_norm": 1.0, + "learning_rate": 0.0004830116562830251, + "loss": 0.2723, + "step": 85570 + }, + { + "epoch": 3.54, + "grad_norm": 0.2001953125, + "learning_rate": 0.00048300772643364954, + "loss": 0.2006, + "step": 85580 + }, + { + "epoch": 3.55, + "grad_norm": 0.6875, + "learning_rate": 0.0004830037961457787, + "loss": 0.2248, + "step": 85590 + }, + { + "epoch": 3.55, + "grad_norm": 0.69140625, + "learning_rate": 0.00048299986541941984, + "loss": 0.2116, + "step": 85600 + }, + { + "epoch": 3.55, + "grad_norm": 0.73046875, + "learning_rate": 0.00048299593425458044, + "loss": 0.2102, + "step": 85610 + }, + { + "epoch": 3.55, + "grad_norm": 0.88671875, + "learning_rate": 0.0004829920026512678, + "loss": 0.2138, + "step": 85620 + }, + { + "epoch": 3.55, + "grad_norm": 1.1484375, + "learning_rate": 0.0004829880706094894, + "loss": 0.2041, + "step": 85630 + }, + { + "epoch": 3.55, + "grad_norm": 0.97265625, + "learning_rate": 0.00048298413812925255, + "loss": 0.2139, + "step": 85640 + }, + { + "epoch": 3.55, + "grad_norm": 0.73046875, + "learning_rate": 0.00048298020521056484, + "loss": 0.1757, + "step": 85650 + }, + { + "epoch": 3.55, + "grad_norm": 0.416015625, + "learning_rate": 0.00048297627185343346, + "loss": 0.1747, + "step": 85660 + }, + { + "epoch": 3.55, + "grad_norm": 0.64453125, + "learning_rate": 0.0004829723380578659, + "loss": 0.2227, + "step": 85670 + }, + { + "epoch": 3.55, + "grad_norm": 1.328125, + "learning_rate": 0.00048296840382386954, + "loss": 0.1827, + "step": 85680 + }, + { + "epoch": 3.55, + "grad_norm": 0.8203125, + "learning_rate": 0.00048296446915145185, + "loss": 0.1764, + "step": 85690 + }, + { + "epoch": 3.55, + "grad_norm": 1.046875, + "learning_rate": 0.0004829605340406201, + "loss": 0.2169, + "step": 85700 + }, + { + "epoch": 3.55, + "grad_norm": 0.6953125, + "learning_rate": 0.00048295659849138193, + "loss": 0.2197, + "step": 85710 + }, + { + "epoch": 3.55, + "grad_norm": 0.41796875, + "learning_rate": 0.0004829526625037445, + "loss": 0.2171, + "step": 85720 + }, + { + "epoch": 3.55, + "grad_norm": 0.7578125, + "learning_rate": 0.00048294872607771534, + "loss": 0.2082, + "step": 85730 + }, + { + "epoch": 3.55, + "grad_norm": 0.30078125, + "learning_rate": 0.00048294478921330177, + "loss": 0.1819, + "step": 85740 + }, + { + "epoch": 3.55, + "grad_norm": 0.8046875, + "learning_rate": 0.0004829408519105113, + "loss": 0.2408, + "step": 85750 + }, + { + "epoch": 3.55, + "grad_norm": 0.3671875, + "learning_rate": 0.0004829369141693514, + "loss": 0.2582, + "step": 85760 + }, + { + "epoch": 3.55, + "grad_norm": 1.0078125, + "learning_rate": 0.00048293297598982926, + "loss": 0.2329, + "step": 85770 + }, + { + "epoch": 3.55, + "grad_norm": 0.7265625, + "learning_rate": 0.0004829290373719525, + "loss": 0.1816, + "step": 85780 + }, + { + "epoch": 3.55, + "grad_norm": 0.6875, + "learning_rate": 0.0004829250983157284, + "loss": 0.2602, + "step": 85790 + }, + { + "epoch": 3.55, + "grad_norm": 0.61328125, + "learning_rate": 0.00048292115882116437, + "loss": 0.2224, + "step": 85800 + }, + { + "epoch": 3.55, + "grad_norm": 0.60546875, + "learning_rate": 0.0004829172188882679, + "loss": 0.1852, + "step": 85810 + }, + { + "epoch": 3.55, + "grad_norm": 0.57421875, + "learning_rate": 0.0004829132785170464, + "loss": 0.1925, + "step": 85820 + }, + { + "epoch": 3.56, + "grad_norm": 0.984375, + "learning_rate": 0.00048290933770750726, + "loss": 0.1955, + "step": 85830 + }, + { + "epoch": 3.56, + "grad_norm": 0.796875, + "learning_rate": 0.0004829053964596579, + "loss": 0.2804, + "step": 85840 + }, + { + "epoch": 3.56, + "grad_norm": 0.2177734375, + "learning_rate": 0.0004829014547735058, + "loss": 0.1906, + "step": 85850 + }, + { + "epoch": 3.56, + "grad_norm": 0.58984375, + "learning_rate": 0.0004828975126490582, + "loss": 0.1975, + "step": 85860 + }, + { + "epoch": 3.56, + "grad_norm": 0.78515625, + "learning_rate": 0.00048289357008632274, + "loss": 0.2239, + "step": 85870 + }, + { + "epoch": 3.56, + "grad_norm": 0.53125, + "learning_rate": 0.0004828896270853066, + "loss": 0.253, + "step": 85880 + }, + { + "epoch": 3.56, + "grad_norm": 1.0390625, + "learning_rate": 0.0004828856836460174, + "loss": 0.2135, + "step": 85890 + }, + { + "epoch": 3.56, + "grad_norm": 0.53125, + "learning_rate": 0.0004828817397684625, + "loss": 0.1934, + "step": 85900 + }, + { + "epoch": 3.56, + "grad_norm": 0.6015625, + "learning_rate": 0.0004828777954526493, + "loss": 0.2367, + "step": 85910 + }, + { + "epoch": 3.56, + "grad_norm": 0.404296875, + "learning_rate": 0.00048287385069858523, + "loss": 0.2328, + "step": 85920 + }, + { + "epoch": 3.56, + "grad_norm": 0.470703125, + "learning_rate": 0.0004828699055062778, + "loss": 0.169, + "step": 85930 + }, + { + "epoch": 3.56, + "grad_norm": 0.73828125, + "learning_rate": 0.0004828659598757343, + "loss": 0.2882, + "step": 85940 + }, + { + "epoch": 3.56, + "grad_norm": 0.62890625, + "learning_rate": 0.0004828620138069622, + "loss": 0.2222, + "step": 85950 + }, + { + "epoch": 3.56, + "grad_norm": 0.2451171875, + "learning_rate": 0.00048285806729996895, + "loss": 0.1987, + "step": 85960 + }, + { + "epoch": 3.56, + "grad_norm": 0.91015625, + "learning_rate": 0.00048285412035476197, + "loss": 0.15, + "step": 85970 + }, + { + "epoch": 3.56, + "grad_norm": 1.234375, + "learning_rate": 0.00048285017297134867, + "loss": 0.2291, + "step": 85980 + }, + { + "epoch": 3.56, + "grad_norm": 0.609375, + "learning_rate": 0.00048284622514973647, + "loss": 0.2431, + "step": 85990 + }, + { + "epoch": 3.56, + "grad_norm": 0.9140625, + "learning_rate": 0.0004828422768899329, + "loss": 0.175, + "step": 86000 + }, + { + "epoch": 3.56, + "grad_norm": 0.86328125, + "learning_rate": 0.00048283832819194517, + "loss": 0.2232, + "step": 86010 + }, + { + "epoch": 3.56, + "grad_norm": 0.640625, + "learning_rate": 0.0004828343790557809, + "loss": 0.2527, + "step": 86020 + }, + { + "epoch": 3.56, + "grad_norm": 0.2099609375, + "learning_rate": 0.0004828304294814475, + "loss": 0.1901, + "step": 86030 + }, + { + "epoch": 3.56, + "grad_norm": 0.69921875, + "learning_rate": 0.00048282647946895244, + "loss": 0.1694, + "step": 86040 + }, + { + "epoch": 3.56, + "grad_norm": 0.92578125, + "learning_rate": 0.00048282252901830306, + "loss": 0.215, + "step": 86050 + }, + { + "epoch": 3.56, + "grad_norm": 0.93359375, + "learning_rate": 0.0004828185781295067, + "loss": 0.2004, + "step": 86060 + }, + { + "epoch": 3.57, + "grad_norm": 0.396484375, + "learning_rate": 0.00048281462680257103, + "loss": 0.2133, + "step": 86070 + }, + { + "epoch": 3.57, + "grad_norm": 1.0234375, + "learning_rate": 0.00048281067503750335, + "loss": 0.272, + "step": 86080 + }, + { + "epoch": 3.57, + "grad_norm": 0.2734375, + "learning_rate": 0.00048280672283431114, + "loss": 0.221, + "step": 86090 + }, + { + "epoch": 3.57, + "grad_norm": 0.828125, + "learning_rate": 0.00048280277019300177, + "loss": 0.226, + "step": 86100 + }, + { + "epoch": 3.57, + "grad_norm": 0.81640625, + "learning_rate": 0.00048279881711358274, + "loss": 0.1692, + "step": 86110 + }, + { + "epoch": 3.57, + "grad_norm": 1.953125, + "learning_rate": 0.0004827948635960615, + "loss": 0.2323, + "step": 86120 + }, + { + "epoch": 3.57, + "grad_norm": 0.85546875, + "learning_rate": 0.00048279090964044546, + "loss": 0.1944, + "step": 86130 + }, + { + "epoch": 3.57, + "grad_norm": 1.5234375, + "learning_rate": 0.0004827869552467421, + "loss": 0.2287, + "step": 86140 + }, + { + "epoch": 3.57, + "grad_norm": 2.140625, + "learning_rate": 0.00048278300041495884, + "loss": 0.1631, + "step": 86150 + }, + { + "epoch": 3.57, + "grad_norm": 0.84375, + "learning_rate": 0.000482779045145103, + "loss": 0.2171, + "step": 86160 + }, + { + "epoch": 3.57, + "grad_norm": 0.984375, + "learning_rate": 0.00048277508943718223, + "loss": 0.1952, + "step": 86170 + }, + { + "epoch": 3.57, + "grad_norm": 0.1884765625, + "learning_rate": 0.0004827711332912038, + "loss": 0.221, + "step": 86180 + }, + { + "epoch": 3.57, + "grad_norm": 0.35546875, + "learning_rate": 0.0004827671767071753, + "loss": 0.2207, + "step": 86190 + }, + { + "epoch": 3.57, + "grad_norm": 0.5859375, + "learning_rate": 0.00048276321968510413, + "loss": 0.2336, + "step": 86200 + }, + { + "epoch": 3.57, + "grad_norm": 0.6015625, + "learning_rate": 0.00048275926222499766, + "loss": 0.1628, + "step": 86210 + }, + { + "epoch": 3.57, + "grad_norm": 0.69140625, + "learning_rate": 0.00048275530432686343, + "loss": 0.25, + "step": 86220 + }, + { + "epoch": 3.57, + "grad_norm": 0.67578125, + "learning_rate": 0.00048275134599070886, + "loss": 0.2002, + "step": 86230 + }, + { + "epoch": 3.57, + "grad_norm": 0.3984375, + "learning_rate": 0.00048274738721654133, + "loss": 0.1775, + "step": 86240 + }, + { + "epoch": 3.57, + "grad_norm": 0.9453125, + "learning_rate": 0.00048274342800436843, + "loss": 0.1621, + "step": 86250 + }, + { + "epoch": 3.57, + "grad_norm": 0.79296875, + "learning_rate": 0.0004827394683541975, + "loss": 0.1983, + "step": 86260 + }, + { + "epoch": 3.57, + "grad_norm": 0.84375, + "learning_rate": 0.000482735508266036, + "loss": 0.2411, + "step": 86270 + }, + { + "epoch": 3.57, + "grad_norm": 0.447265625, + "learning_rate": 0.00048273154773989144, + "loss": 0.2462, + "step": 86280 + }, + { + "epoch": 3.57, + "grad_norm": 0.3984375, + "learning_rate": 0.0004827275867757712, + "loss": 0.2015, + "step": 86290 + }, + { + "epoch": 3.57, + "grad_norm": 0.984375, + "learning_rate": 0.00048272362537368286, + "loss": 0.1808, + "step": 86300 + }, + { + "epoch": 3.57, + "grad_norm": 0.060302734375, + "learning_rate": 0.00048271966353363374, + "loss": 0.2049, + "step": 86310 + }, + { + "epoch": 3.58, + "grad_norm": 0.384765625, + "learning_rate": 0.00048271570125563124, + "loss": 0.2399, + "step": 86320 + }, + { + "epoch": 3.58, + "grad_norm": 0.859375, + "learning_rate": 0.0004827117385396831, + "loss": 0.2278, + "step": 86330 + }, + { + "epoch": 3.58, + "grad_norm": 0.5078125, + "learning_rate": 0.00048270777538579645, + "loss": 0.2371, + "step": 86340 + }, + { + "epoch": 3.58, + "grad_norm": 0.60546875, + "learning_rate": 0.000482703811793979, + "loss": 0.2318, + "step": 86350 + }, + { + "epoch": 3.58, + "grad_norm": 1.0390625, + "learning_rate": 0.00048269984776423805, + "loss": 0.2312, + "step": 86360 + }, + { + "epoch": 3.58, + "grad_norm": 0.2109375, + "learning_rate": 0.00048269588329658117, + "loss": 0.2178, + "step": 86370 + }, + { + "epoch": 3.58, + "grad_norm": 0.63671875, + "learning_rate": 0.0004826919183910157, + "loss": 0.2006, + "step": 86380 + }, + { + "epoch": 3.58, + "grad_norm": 0.703125, + "learning_rate": 0.0004826879530475492, + "loss": 0.1827, + "step": 86390 + }, + { + "epoch": 3.58, + "grad_norm": 2.21875, + "learning_rate": 0.00048268398726618913, + "loss": 0.1519, + "step": 86400 + }, + { + "epoch": 3.58, + "grad_norm": 0.1640625, + "learning_rate": 0.00048268002104694287, + "loss": 0.2491, + "step": 86410 + }, + { + "epoch": 3.58, + "grad_norm": 0.82421875, + "learning_rate": 0.000482676054389818, + "loss": 0.2409, + "step": 86420 + }, + { + "epoch": 3.58, + "grad_norm": 0.64453125, + "learning_rate": 0.00048267208729482183, + "loss": 0.2438, + "step": 86430 + }, + { + "epoch": 3.58, + "grad_norm": 1.234375, + "learning_rate": 0.000482668119761962, + "loss": 0.1968, + "step": 86440 + }, + { + "epoch": 3.58, + "grad_norm": 0.38671875, + "learning_rate": 0.00048266415179124586, + "loss": 0.2302, + "step": 86450 + }, + { + "epoch": 3.58, + "grad_norm": 1.1015625, + "learning_rate": 0.00048266018338268094, + "loss": 0.2073, + "step": 86460 + }, + { + "epoch": 3.58, + "grad_norm": 0.71875, + "learning_rate": 0.00048265621453627463, + "loss": 0.2428, + "step": 86470 + }, + { + "epoch": 3.58, + "grad_norm": 1.4921875, + "learning_rate": 0.0004826522452520344, + "loss": 0.1949, + "step": 86480 + }, + { + "epoch": 3.58, + "grad_norm": 0.5390625, + "learning_rate": 0.00048264827552996795, + "loss": 0.2472, + "step": 86490 + }, + { + "epoch": 3.58, + "grad_norm": 0.96875, + "learning_rate": 0.0004826443053700824, + "loss": 0.2854, + "step": 86500 + }, + { + "epoch": 3.58, + "grad_norm": 0.66015625, + "learning_rate": 0.00048264033477238543, + "loss": 0.2077, + "step": 86510 + }, + { + "epoch": 3.58, + "grad_norm": 0.5234375, + "learning_rate": 0.00048263636373688457, + "loss": 0.2255, + "step": 86520 + }, + { + "epoch": 3.58, + "grad_norm": 0.25390625, + "learning_rate": 0.0004826323922635871, + "loss": 0.2177, + "step": 86530 + }, + { + "epoch": 3.58, + "grad_norm": 0.69140625, + "learning_rate": 0.00048262842035250067, + "loss": 0.1978, + "step": 86540 + }, + { + "epoch": 3.58, + "grad_norm": 0.80078125, + "learning_rate": 0.0004826244480036326, + "loss": 0.1816, + "step": 86550 + }, + { + "epoch": 3.59, + "grad_norm": 0.97265625, + "learning_rate": 0.00048262047521699044, + "loss": 0.2177, + "step": 86560 + }, + { + "epoch": 3.59, + "grad_norm": 0.8125, + "learning_rate": 0.0004826165019925817, + "loss": 0.2508, + "step": 86570 + }, + { + "epoch": 3.59, + "grad_norm": 0.8046875, + "learning_rate": 0.0004826125283304138, + "loss": 0.2028, + "step": 86580 + }, + { + "epoch": 3.59, + "grad_norm": 0.61328125, + "learning_rate": 0.0004826085542304942, + "loss": 0.1791, + "step": 86590 + }, + { + "epoch": 3.59, + "grad_norm": 0.76171875, + "learning_rate": 0.0004826045796928305, + "loss": 0.2037, + "step": 86600 + }, + { + "epoch": 3.59, + "grad_norm": 0.494140625, + "learning_rate": 0.00048260060471743007, + "loss": 0.2281, + "step": 86610 + }, + { + "epoch": 3.59, + "grad_norm": 0.8515625, + "learning_rate": 0.0004825966293043004, + "loss": 0.1995, + "step": 86620 + }, + { + "epoch": 3.59, + "grad_norm": 0.59375, + "learning_rate": 0.00048259265345344907, + "loss": 0.2129, + "step": 86630 + }, + { + "epoch": 3.59, + "grad_norm": 0.63671875, + "learning_rate": 0.0004825886771648834, + "loss": 0.1946, + "step": 86640 + }, + { + "epoch": 3.59, + "grad_norm": 0.609375, + "learning_rate": 0.000482584700438611, + "loss": 0.2634, + "step": 86650 + }, + { + "epoch": 3.59, + "grad_norm": 0.40625, + "learning_rate": 0.0004825807232746393, + "loss": 0.2142, + "step": 86660 + }, + { + "epoch": 3.59, + "grad_norm": 0.408203125, + "learning_rate": 0.00048257674567297575, + "loss": 0.1609, + "step": 86670 + }, + { + "epoch": 3.59, + "grad_norm": 0.80859375, + "learning_rate": 0.00048257276763362795, + "loss": 0.1813, + "step": 86680 + }, + { + "epoch": 3.59, + "grad_norm": 0.7421875, + "learning_rate": 0.0004825687891566033, + "loss": 0.2108, + "step": 86690 + }, + { + "epoch": 3.59, + "grad_norm": 0.9296875, + "learning_rate": 0.00048256481024190926, + "loss": 0.2157, + "step": 86700 + }, + { + "epoch": 3.59, + "grad_norm": 3.140625, + "learning_rate": 0.0004825608308895534, + "loss": 0.2409, + "step": 86710 + }, + { + "epoch": 3.59, + "grad_norm": 0.671875, + "learning_rate": 0.0004825568510995432, + "loss": 0.2072, + "step": 86720 + }, + { + "epoch": 3.59, + "grad_norm": 1.40625, + "learning_rate": 0.000482552870871886, + "loss": 0.1971, + "step": 86730 + }, + { + "epoch": 3.59, + "grad_norm": 0.8359375, + "learning_rate": 0.00048254889020658956, + "loss": 0.2653, + "step": 86740 + }, + { + "epoch": 3.59, + "grad_norm": 0.9765625, + "learning_rate": 0.00048254490910366117, + "loss": 0.1656, + "step": 86750 + }, + { + "epoch": 3.59, + "grad_norm": 0.5703125, + "learning_rate": 0.00048254092756310835, + "loss": 0.2833, + "step": 86760 + }, + { + "epoch": 3.59, + "grad_norm": 0.87890625, + "learning_rate": 0.0004825369455849387, + "loss": 0.2193, + "step": 86770 + }, + { + "epoch": 3.59, + "grad_norm": 0.72265625, + "learning_rate": 0.00048253296316915954, + "loss": 0.2151, + "step": 86780 + }, + { + "epoch": 3.59, + "grad_norm": 1.6484375, + "learning_rate": 0.0004825289803157784, + "loss": 0.203, + "step": 86790 + }, + { + "epoch": 3.6, + "grad_norm": 1.078125, + "learning_rate": 0.0004825249970248029, + "loss": 0.213, + "step": 86800 + }, + { + "epoch": 3.6, + "grad_norm": 0.55078125, + "learning_rate": 0.0004825210132962405, + "loss": 0.2054, + "step": 86810 + }, + { + "epoch": 3.6, + "grad_norm": 0.67578125, + "learning_rate": 0.00048251702913009863, + "loss": 0.176, + "step": 86820 + }, + { + "epoch": 3.6, + "grad_norm": 0.83984375, + "learning_rate": 0.00048251304452638476, + "loss": 0.2075, + "step": 86830 + }, + { + "epoch": 3.6, + "grad_norm": 0.25, + "learning_rate": 0.0004825090594851066, + "loss": 0.2447, + "step": 86840 + }, + { + "epoch": 3.6, + "grad_norm": 0.92578125, + "learning_rate": 0.0004825050740062714, + "loss": 0.2222, + "step": 86850 + }, + { + "epoch": 3.6, + "grad_norm": 0.71875, + "learning_rate": 0.00048250108808988676, + "loss": 0.2231, + "step": 86860 + }, + { + "epoch": 3.6, + "grad_norm": 0.84375, + "learning_rate": 0.0004824971017359602, + "loss": 0.2118, + "step": 86870 + }, + { + "epoch": 3.6, + "grad_norm": 0.7265625, + "learning_rate": 0.00048249311494449914, + "loss": 0.1517, + "step": 86880 + }, + { + "epoch": 3.6, + "grad_norm": 0.67578125, + "learning_rate": 0.00048248912771551124, + "loss": 0.1902, + "step": 86890 + }, + { + "epoch": 3.6, + "grad_norm": 0.6640625, + "learning_rate": 0.00048248514004900386, + "loss": 0.1506, + "step": 86900 + }, + { + "epoch": 3.6, + "grad_norm": 0.5859375, + "learning_rate": 0.0004824811519449845, + "loss": 0.2241, + "step": 86910 + }, + { + "epoch": 3.6, + "grad_norm": 0.283203125, + "learning_rate": 0.00048247716340346083, + "loss": 0.2001, + "step": 86920 + }, + { + "epoch": 3.6, + "grad_norm": 1.0078125, + "learning_rate": 0.00048247317442444014, + "loss": 0.2389, + "step": 86930 + }, + { + "epoch": 3.6, + "grad_norm": 0.63671875, + "learning_rate": 0.0004824691850079301, + "loss": 0.2451, + "step": 86940 + }, + { + "epoch": 3.6, + "grad_norm": 0.65625, + "learning_rate": 0.00048246519515393816, + "loss": 0.1682, + "step": 86950 + }, + { + "epoch": 3.6, + "grad_norm": 0.8828125, + "learning_rate": 0.0004824612048624718, + "loss": 0.1975, + "step": 86960 + }, + { + "epoch": 3.6, + "grad_norm": 0.71484375, + "learning_rate": 0.0004824572141335386, + "loss": 0.1439, + "step": 86970 + }, + { + "epoch": 3.6, + "grad_norm": 0.0, + "learning_rate": 0.000482453222967146, + "loss": 0.2542, + "step": 86980 + }, + { + "epoch": 3.6, + "grad_norm": 1.4765625, + "learning_rate": 0.00048244923136330154, + "loss": 0.232, + "step": 86990 + }, + { + "epoch": 3.6, + "grad_norm": 0.734375, + "learning_rate": 0.00048244523932201275, + "loss": 0.2047, + "step": 87000 + }, + { + "epoch": 3.6, + "grad_norm": 0.439453125, + "learning_rate": 0.00048244124684328703, + "loss": 0.2678, + "step": 87010 + }, + { + "epoch": 3.6, + "grad_norm": 0.44140625, + "learning_rate": 0.0004824372539271321, + "loss": 0.1885, + "step": 87020 + }, + { + "epoch": 3.6, + "grad_norm": 0.8671875, + "learning_rate": 0.0004824332605735553, + "loss": 0.1718, + "step": 87030 + }, + { + "epoch": 3.61, + "grad_norm": 0.39453125, + "learning_rate": 0.00048242926678256423, + "loss": 0.2047, + "step": 87040 + }, + { + "epoch": 3.61, + "grad_norm": 0.765625, + "learning_rate": 0.00048242527255416633, + "loss": 0.1831, + "step": 87050 + }, + { + "epoch": 3.61, + "grad_norm": 0.79296875, + "learning_rate": 0.0004824212778883692, + "loss": 0.1539, + "step": 87060 + }, + { + "epoch": 3.61, + "grad_norm": 0.484375, + "learning_rate": 0.0004824172827851804, + "loss": 0.2059, + "step": 87070 + }, + { + "epoch": 3.61, + "grad_norm": 0.4296875, + "learning_rate": 0.0004824132872446073, + "loss": 0.1782, + "step": 87080 + }, + { + "epoch": 3.61, + "grad_norm": 0.177734375, + "learning_rate": 0.00048240929126665744, + "loss": 0.2059, + "step": 87090 + }, + { + "epoch": 3.61, + "grad_norm": 0.80078125, + "learning_rate": 0.0004824052948513384, + "loss": 0.2415, + "step": 87100 + }, + { + "epoch": 3.61, + "grad_norm": 0.88671875, + "learning_rate": 0.0004824012979986578, + "loss": 0.2559, + "step": 87110 + }, + { + "epoch": 3.61, + "grad_norm": 0.87890625, + "learning_rate": 0.00048239730070862296, + "loss": 0.2342, + "step": 87120 + }, + { + "epoch": 3.61, + "grad_norm": 0.56640625, + "learning_rate": 0.0004823933029812415, + "loss": 0.2548, + "step": 87130 + }, + { + "epoch": 3.61, + "grad_norm": 1.515625, + "learning_rate": 0.000482389304816521, + "loss": 0.2462, + "step": 87140 + }, + { + "epoch": 3.61, + "grad_norm": 0.65625, + "learning_rate": 0.00048238530621446887, + "loss": 0.1639, + "step": 87150 + }, + { + "epoch": 3.61, + "grad_norm": 1.1953125, + "learning_rate": 0.00048238130717509275, + "loss": 0.2376, + "step": 87160 + }, + { + "epoch": 3.61, + "grad_norm": 1.0078125, + "learning_rate": 0.00048237730769840005, + "loss": 0.1834, + "step": 87170 + }, + { + "epoch": 3.61, + "grad_norm": 1.0390625, + "learning_rate": 0.00048237330778439836, + "loss": 0.2583, + "step": 87180 + }, + { + "epoch": 3.61, + "grad_norm": 0.4765625, + "learning_rate": 0.0004823693074330952, + "loss": 0.1999, + "step": 87190 + }, + { + "epoch": 3.61, + "grad_norm": 0.84765625, + "learning_rate": 0.00048236530664449805, + "loss": 0.196, + "step": 87200 + }, + { + "epoch": 3.61, + "grad_norm": 1.8125, + "learning_rate": 0.00048236130541861455, + "loss": 0.2751, + "step": 87210 + }, + { + "epoch": 3.61, + "grad_norm": 0.59765625, + "learning_rate": 0.00048235730375545215, + "loss": 0.1888, + "step": 87220 + }, + { + "epoch": 3.61, + "grad_norm": 1.359375, + "learning_rate": 0.00048235330165501844, + "loss": 0.1903, + "step": 87230 + }, + { + "epoch": 3.61, + "grad_norm": 1.1171875, + "learning_rate": 0.00048234929911732084, + "loss": 0.2009, + "step": 87240 + }, + { + "epoch": 3.61, + "grad_norm": 1.15625, + "learning_rate": 0.000482345296142367, + "loss": 0.2521, + "step": 87250 + }, + { + "epoch": 3.61, + "grad_norm": 1.078125, + "learning_rate": 0.00048234129273016434, + "loss": 0.2399, + "step": 87260 + }, + { + "epoch": 3.61, + "grad_norm": 0.494140625, + "learning_rate": 0.0004823372888807206, + "loss": 0.2063, + "step": 87270 + }, + { + "epoch": 3.62, + "grad_norm": 1.09375, + "learning_rate": 0.00048233328459404304, + "loss": 0.2773, + "step": 87280 + }, + { + "epoch": 3.62, + "grad_norm": 0.318359375, + "learning_rate": 0.00048232927987013936, + "loss": 0.2159, + "step": 87290 + }, + { + "epoch": 3.62, + "grad_norm": 1.0703125, + "learning_rate": 0.00048232527470901706, + "loss": 0.2563, + "step": 87300 + }, + { + "epoch": 3.62, + "grad_norm": 0.62890625, + "learning_rate": 0.0004823212691106837, + "loss": 0.2508, + "step": 87310 + }, + { + "epoch": 3.62, + "grad_norm": 0.86328125, + "learning_rate": 0.0004823172630751468, + "loss": 0.2094, + "step": 87320 + }, + { + "epoch": 3.62, + "grad_norm": 0.376953125, + "learning_rate": 0.00048231325660241385, + "loss": 0.207, + "step": 87330 + }, + { + "epoch": 3.62, + "grad_norm": 0.494140625, + "learning_rate": 0.0004823092496924925, + "loss": 0.1754, + "step": 87340 + }, + { + "epoch": 3.62, + "grad_norm": 0.345703125, + "learning_rate": 0.0004823052423453902, + "loss": 0.2107, + "step": 87350 + }, + { + "epoch": 3.62, + "grad_norm": 0.3671875, + "learning_rate": 0.00048230123456111454, + "loss": 0.2118, + "step": 87360 + }, + { + "epoch": 3.62, + "grad_norm": 0.69140625, + "learning_rate": 0.000482297226339673, + "loss": 0.2112, + "step": 87370 + }, + { + "epoch": 3.62, + "grad_norm": 0.1806640625, + "learning_rate": 0.0004822932176810732, + "loss": 0.1905, + "step": 87380 + }, + { + "epoch": 3.62, + "grad_norm": 1.015625, + "learning_rate": 0.0004822892085853227, + "loss": 0.2427, + "step": 87390 + }, + { + "epoch": 3.62, + "grad_norm": 0.90234375, + "learning_rate": 0.0004822851990524289, + "loss": 0.1758, + "step": 87400 + }, + { + "epoch": 3.62, + "grad_norm": 0.6171875, + "learning_rate": 0.00048228118908239946, + "loss": 0.23, + "step": 87410 + }, + { + "epoch": 3.62, + "grad_norm": 0.4765625, + "learning_rate": 0.0004822771786752419, + "loss": 0.197, + "step": 87420 + }, + { + "epoch": 3.62, + "grad_norm": 0.671875, + "learning_rate": 0.0004822731678309639, + "loss": 0.1882, + "step": 87430 + }, + { + "epoch": 3.62, + "grad_norm": 0.8203125, + "learning_rate": 0.0004822691565495727, + "loss": 0.2678, + "step": 87440 + }, + { + "epoch": 3.62, + "grad_norm": 0.234375, + "learning_rate": 0.0004822651448310761, + "loss": 0.1988, + "step": 87450 + }, + { + "epoch": 3.62, + "grad_norm": 0.427734375, + "learning_rate": 0.0004822611326754816, + "loss": 0.2124, + "step": 87460 + }, + { + "epoch": 3.62, + "grad_norm": 0.451171875, + "learning_rate": 0.0004822571200827968, + "loss": 0.2231, + "step": 87470 + }, + { + "epoch": 3.62, + "grad_norm": 0.65234375, + "learning_rate": 0.00048225310705302907, + "loss": 0.1863, + "step": 87480 + }, + { + "epoch": 3.62, + "grad_norm": 0.87109375, + "learning_rate": 0.0004822490935861861, + "loss": 0.2481, + "step": 87490 + }, + { + "epoch": 3.62, + "grad_norm": 0.62109375, + "learning_rate": 0.0004822450796822754, + "loss": 0.204, + "step": 87500 + }, + { + "epoch": 3.62, + "grad_norm": 0.515625, + "learning_rate": 0.0004822410653413046, + "loss": 0.1982, + "step": 87510 + }, + { + "epoch": 3.63, + "grad_norm": 0.388671875, + "learning_rate": 0.00048223705056328115, + "loss": 0.2335, + "step": 87520 + }, + { + "epoch": 3.63, + "grad_norm": 0.578125, + "learning_rate": 0.00048223303534821263, + "loss": 0.219, + "step": 87530 + }, + { + "epoch": 3.63, + "grad_norm": 0.435546875, + "learning_rate": 0.00048222901969610665, + "loss": 0.2356, + "step": 87540 + }, + { + "epoch": 3.63, + "grad_norm": 2.21875, + "learning_rate": 0.00048222500360697076, + "loss": 0.2693, + "step": 87550 + }, + { + "epoch": 3.63, + "grad_norm": 0.9609375, + "learning_rate": 0.00048222098708081243, + "loss": 0.2584, + "step": 87560 + }, + { + "epoch": 3.63, + "grad_norm": 0.4375, + "learning_rate": 0.0004822169701176393, + "loss": 0.2608, + "step": 87570 + }, + { + "epoch": 3.63, + "grad_norm": 0.82421875, + "learning_rate": 0.0004822129527174589, + "loss": 0.2005, + "step": 87580 + }, + { + "epoch": 3.63, + "grad_norm": 1.1875, + "learning_rate": 0.00048220893488027885, + "loss": 0.2129, + "step": 87590 + }, + { + "epoch": 3.63, + "grad_norm": 0.7578125, + "learning_rate": 0.0004822049166061067, + "loss": 0.1807, + "step": 87600 + }, + { + "epoch": 3.63, + "grad_norm": 1.0, + "learning_rate": 0.0004822008978949498, + "loss": 0.1786, + "step": 87610 + }, + { + "epoch": 3.63, + "grad_norm": 0.796875, + "learning_rate": 0.000482196878746816, + "loss": 0.158, + "step": 87620 + }, + { + "epoch": 3.63, + "grad_norm": 0.68359375, + "learning_rate": 0.00048219285916171274, + "loss": 0.1514, + "step": 87630 + }, + { + "epoch": 3.63, + "grad_norm": 0.78515625, + "learning_rate": 0.0004821888391396476, + "loss": 0.1951, + "step": 87640 + }, + { + "epoch": 3.63, + "grad_norm": 1.1171875, + "learning_rate": 0.00048218481868062815, + "loss": 0.3203, + "step": 87650 + }, + { + "epoch": 3.63, + "grad_norm": 0.61328125, + "learning_rate": 0.000482180797784662, + "loss": 0.1779, + "step": 87660 + }, + { + "epoch": 3.63, + "grad_norm": 0.8828125, + "learning_rate": 0.00048217677645175653, + "loss": 0.2357, + "step": 87670 + }, + { + "epoch": 3.63, + "grad_norm": 0.490234375, + "learning_rate": 0.00048217275468191945, + "loss": 0.1891, + "step": 87680 + }, + { + "epoch": 3.63, + "grad_norm": 0.83984375, + "learning_rate": 0.0004821687324751584, + "loss": 0.2352, + "step": 87690 + }, + { + "epoch": 3.63, + "grad_norm": 3.890625, + "learning_rate": 0.0004821647098314808, + "loss": 0.2131, + "step": 87700 + }, + { + "epoch": 3.63, + "grad_norm": 0.625, + "learning_rate": 0.0004821606867508943, + "loss": 0.2104, + "step": 87710 + }, + { + "epoch": 3.63, + "grad_norm": 0.734375, + "learning_rate": 0.00048215666323340653, + "loss": 0.2278, + "step": 87720 + }, + { + "epoch": 3.63, + "grad_norm": 0.87890625, + "learning_rate": 0.0004821526392790249, + "loss": 0.1505, + "step": 87730 + }, + { + "epoch": 3.63, + "grad_norm": 0.75390625, + "learning_rate": 0.00048214861488775714, + "loss": 0.1629, + "step": 87740 + }, + { + "epoch": 3.63, + "grad_norm": 0.59765625, + "learning_rate": 0.00048214459005961076, + "loss": 0.1872, + "step": 87750 + }, + { + "epoch": 3.64, + "grad_norm": 0.9921875, + "learning_rate": 0.0004821405647945933, + "loss": 0.1887, + "step": 87760 + }, + { + "epoch": 3.64, + "grad_norm": 0.57421875, + "learning_rate": 0.0004821365390927124, + "loss": 0.1948, + "step": 87770 + }, + { + "epoch": 3.64, + "grad_norm": 1.6171875, + "learning_rate": 0.00048213251295397554, + "loss": 0.2742, + "step": 87780 + }, + { + "epoch": 3.64, + "grad_norm": 0.232421875, + "learning_rate": 0.0004821284863783904, + "loss": 0.1722, + "step": 87790 + }, + { + "epoch": 3.64, + "grad_norm": 1.5625, + "learning_rate": 0.0004821244593659645, + "loss": 0.2026, + "step": 87800 + }, + { + "epoch": 3.64, + "grad_norm": 0.95703125, + "learning_rate": 0.0004821204319167054, + "loss": 0.2027, + "step": 87810 + }, + { + "epoch": 3.64, + "grad_norm": 1.1640625, + "learning_rate": 0.00048211640403062074, + "loss": 0.1859, + "step": 87820 + }, + { + "epoch": 3.64, + "grad_norm": 0.69921875, + "learning_rate": 0.0004821123757077181, + "loss": 0.2399, + "step": 87830 + }, + { + "epoch": 3.64, + "grad_norm": 0.76953125, + "learning_rate": 0.00048210834694800497, + "loss": 0.1898, + "step": 87840 + }, + { + "epoch": 3.64, + "grad_norm": 0.478515625, + "learning_rate": 0.0004821043177514891, + "loss": 0.2066, + "step": 87850 + }, + { + "epoch": 3.64, + "grad_norm": 0.56640625, + "learning_rate": 0.0004821002881181779, + "loss": 0.2368, + "step": 87860 + }, + { + "epoch": 3.64, + "grad_norm": 0.96875, + "learning_rate": 0.00048209625804807905, + "loss": 0.2189, + "step": 87870 + }, + { + "epoch": 3.64, + "grad_norm": 0.4296875, + "learning_rate": 0.00048209222754120007, + "loss": 0.2525, + "step": 87880 + }, + { + "epoch": 3.64, + "grad_norm": 0.443359375, + "learning_rate": 0.00048208819659754866, + "loss": 0.2394, + "step": 87890 + }, + { + "epoch": 3.64, + "grad_norm": 1.078125, + "learning_rate": 0.00048208416521713227, + "loss": 0.21, + "step": 87900 + }, + { + "epoch": 3.64, + "grad_norm": 0.96484375, + "learning_rate": 0.0004820801333999585, + "loss": 0.196, + "step": 87910 + }, + { + "epoch": 3.64, + "grad_norm": 0.27734375, + "learning_rate": 0.0004820761011460351, + "loss": 0.1237, + "step": 87920 + }, + { + "epoch": 3.64, + "grad_norm": 0.71875, + "learning_rate": 0.0004820720684553694, + "loss": 0.2028, + "step": 87930 + }, + { + "epoch": 3.64, + "grad_norm": 0.0, + "learning_rate": 0.0004820680353279692, + "loss": 0.182, + "step": 87940 + }, + { + "epoch": 3.64, + "grad_norm": 2.0, + "learning_rate": 0.00048206400176384204, + "loss": 0.2195, + "step": 87950 + }, + { + "epoch": 3.64, + "grad_norm": 0.984375, + "learning_rate": 0.0004820599677629955, + "loss": 0.1942, + "step": 87960 + }, + { + "epoch": 3.64, + "grad_norm": 0.376953125, + "learning_rate": 0.0004820559333254371, + "loss": 0.1457, + "step": 87970 + }, + { + "epoch": 3.64, + "grad_norm": 1.1484375, + "learning_rate": 0.0004820518984511745, + "loss": 0.2357, + "step": 87980 + }, + { + "epoch": 3.64, + "grad_norm": 0.57421875, + "learning_rate": 0.0004820478631402153, + "loss": 0.2018, + "step": 87990 + }, + { + "epoch": 3.64, + "grad_norm": 0.9609375, + "learning_rate": 0.00048204382739256714, + "loss": 0.2368, + "step": 88000 + }, + { + "epoch": 3.65, + "grad_norm": 0.9375, + "learning_rate": 0.0004820397912082375, + "loss": 0.1984, + "step": 88010 + }, + { + "epoch": 3.65, + "grad_norm": 1.5, + "learning_rate": 0.000482035754587234, + "loss": 0.2281, + "step": 88020 + }, + { + "epoch": 3.65, + "grad_norm": 0.177734375, + "learning_rate": 0.0004820317175295643, + "loss": 0.1422, + "step": 88030 + }, + { + "epoch": 3.65, + "grad_norm": 0.609375, + "learning_rate": 0.000482027680035236, + "loss": 0.2482, + "step": 88040 + }, + { + "epoch": 3.65, + "grad_norm": 0.67578125, + "learning_rate": 0.00048202364210425664, + "loss": 0.255, + "step": 88050 + }, + { + "epoch": 3.65, + "grad_norm": 0.6640625, + "learning_rate": 0.0004820196037366338, + "loss": 0.2054, + "step": 88060 + }, + { + "epoch": 3.65, + "grad_norm": 1.1328125, + "learning_rate": 0.0004820155649323752, + "loss": 0.1982, + "step": 88070 + }, + { + "epoch": 3.65, + "grad_norm": 0.361328125, + "learning_rate": 0.0004820115256914883, + "loss": 0.1696, + "step": 88080 + }, + { + "epoch": 3.65, + "grad_norm": 0.45703125, + "learning_rate": 0.0004820074860139808, + "loss": 0.2124, + "step": 88090 + }, + { + "epoch": 3.65, + "grad_norm": 0.6484375, + "learning_rate": 0.0004820034458998602, + "loss": 0.2227, + "step": 88100 + }, + { + "epoch": 3.65, + "grad_norm": 0.5390625, + "learning_rate": 0.00048199940534913424, + "loss": 0.1908, + "step": 88110 + }, + { + "epoch": 3.65, + "grad_norm": 0.703125, + "learning_rate": 0.0004819953643618104, + "loss": 0.1846, + "step": 88120 + }, + { + "epoch": 3.65, + "grad_norm": 0.41015625, + "learning_rate": 0.0004819913229378964, + "loss": 0.2327, + "step": 88130 + }, + { + "epoch": 3.65, + "grad_norm": 0.375, + "learning_rate": 0.00048198728107739976, + "loss": 0.2155, + "step": 88140 + }, + { + "epoch": 3.65, + "grad_norm": 1.2109375, + "learning_rate": 0.0004819832387803281, + "loss": 0.2069, + "step": 88150 + }, + { + "epoch": 3.65, + "grad_norm": 1.09375, + "learning_rate": 0.00048197919604668903, + "loss": 0.1803, + "step": 88160 + }, + { + "epoch": 3.65, + "grad_norm": 0.98828125, + "learning_rate": 0.00048197515287649016, + "loss": 0.2307, + "step": 88170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7109375, + "learning_rate": 0.0004819711092697391, + "loss": 0.1751, + "step": 88180 + }, + { + "epoch": 3.65, + "grad_norm": 0.734375, + "learning_rate": 0.0004819670652264435, + "loss": 0.21, + "step": 88190 + }, + { + "epoch": 3.65, + "grad_norm": 1.03125, + "learning_rate": 0.00048196302074661094, + "loss": 0.2281, + "step": 88200 + }, + { + "epoch": 3.65, + "grad_norm": 0.474609375, + "learning_rate": 0.00048195897583024896, + "loss": 0.2834, + "step": 88210 + }, + { + "epoch": 3.65, + "grad_norm": 1.0703125, + "learning_rate": 0.0004819549304773653, + "loss": 0.2281, + "step": 88220 + }, + { + "epoch": 3.65, + "grad_norm": 0.54296875, + "learning_rate": 0.00048195088468796745, + "loss": 0.2281, + "step": 88230 + }, + { + "epoch": 3.65, + "grad_norm": 0.66015625, + "learning_rate": 0.0004819468384620631, + "loss": 0.2235, + "step": 88240 + }, + { + "epoch": 3.66, + "grad_norm": 1.4296875, + "learning_rate": 0.0004819427917996598, + "loss": 0.2461, + "step": 88250 + }, + { + "epoch": 3.66, + "grad_norm": 0.4296875, + "learning_rate": 0.00048193874470076536, + "loss": 0.1905, + "step": 88260 + }, + { + "epoch": 3.66, + "grad_norm": 0.419921875, + "learning_rate": 0.00048193469716538716, + "loss": 0.2439, + "step": 88270 + }, + { + "epoch": 3.66, + "grad_norm": 0.69140625, + "learning_rate": 0.0004819306491935329, + "loss": 0.1679, + "step": 88280 + }, + { + "epoch": 3.66, + "grad_norm": 0.9765625, + "learning_rate": 0.00048192660078521024, + "loss": 0.2448, + "step": 88290 + }, + { + "epoch": 3.66, + "grad_norm": 0.625, + "learning_rate": 0.0004819225519404267, + "loss": 0.1768, + "step": 88300 + }, + { + "epoch": 3.66, + "grad_norm": 0.75390625, + "learning_rate": 0.00048191850265919, + "loss": 0.2381, + "step": 88310 + }, + { + "epoch": 3.66, + "grad_norm": 0.5390625, + "learning_rate": 0.0004819144529415077, + "loss": 0.248, + "step": 88320 + }, + { + "epoch": 3.66, + "grad_norm": 0.34765625, + "learning_rate": 0.00048191040278738754, + "loss": 0.2321, + "step": 88330 + }, + { + "epoch": 3.66, + "grad_norm": 0.671875, + "learning_rate": 0.000481906352196837, + "loss": 0.1929, + "step": 88340 + }, + { + "epoch": 3.66, + "grad_norm": 0.6640625, + "learning_rate": 0.00048190230116986366, + "loss": 0.2212, + "step": 88350 + }, + { + "epoch": 3.66, + "grad_norm": 0.4453125, + "learning_rate": 0.00048189824970647536, + "loss": 0.1937, + "step": 88360 + }, + { + "epoch": 3.66, + "grad_norm": 0.4609375, + "learning_rate": 0.00048189419780667953, + "loss": 0.2441, + "step": 88370 + }, + { + "epoch": 3.66, + "grad_norm": 0.66015625, + "learning_rate": 0.0004818901454704838, + "loss": 0.2045, + "step": 88380 + }, + { + "epoch": 3.66, + "grad_norm": 0.3984375, + "learning_rate": 0.00048188609269789596, + "loss": 0.2579, + "step": 88390 + }, + { + "epoch": 3.66, + "grad_norm": 1.3671875, + "learning_rate": 0.0004818820394889235, + "loss": 0.1974, + "step": 88400 + }, + { + "epoch": 3.66, + "grad_norm": 0.68359375, + "learning_rate": 0.0004818779858435741, + "loss": 0.2561, + "step": 88410 + }, + { + "epoch": 3.66, + "grad_norm": 0.369140625, + "learning_rate": 0.00048187393176185535, + "loss": 0.1714, + "step": 88420 + }, + { + "epoch": 3.66, + "grad_norm": 1.1171875, + "learning_rate": 0.0004818698772437749, + "loss": 0.2172, + "step": 88430 + }, + { + "epoch": 3.66, + "grad_norm": 1.34375, + "learning_rate": 0.0004818658222893404, + "loss": 0.1839, + "step": 88440 + }, + { + "epoch": 3.66, + "grad_norm": 0.55078125, + "learning_rate": 0.00048186176689855945, + "loss": 0.226, + "step": 88450 + }, + { + "epoch": 3.66, + "grad_norm": 0.74609375, + "learning_rate": 0.00048185771107143966, + "loss": 0.2339, + "step": 88460 + }, + { + "epoch": 3.66, + "grad_norm": 1.328125, + "learning_rate": 0.0004818536548079887, + "loss": 0.1904, + "step": 88470 + }, + { + "epoch": 3.66, + "grad_norm": 1.15625, + "learning_rate": 0.0004818495981082143, + "loss": 0.2332, + "step": 88480 + }, + { + "epoch": 3.67, + "grad_norm": 0.734375, + "learning_rate": 0.0004818455409721239, + "loss": 0.249, + "step": 88490 + }, + { + "epoch": 3.67, + "grad_norm": 0.6875, + "learning_rate": 0.00048184148339972525, + "loss": 0.2028, + "step": 88500 + }, + { + "epoch": 3.67, + "grad_norm": 0.67578125, + "learning_rate": 0.00048183742539102594, + "loss": 0.1625, + "step": 88510 + }, + { + "epoch": 3.67, + "grad_norm": 0.91015625, + "learning_rate": 0.00048183336694603365, + "loss": 0.2142, + "step": 88520 + }, + { + "epoch": 3.67, + "grad_norm": 0.6640625, + "learning_rate": 0.000481829308064756, + "loss": 0.24, + "step": 88530 + }, + { + "epoch": 3.67, + "grad_norm": 0.9765625, + "learning_rate": 0.0004818252487472007, + "loss": 0.1782, + "step": 88540 + }, + { + "epoch": 3.67, + "grad_norm": 0.69140625, + "learning_rate": 0.0004818211889933752, + "loss": 0.2378, + "step": 88550 + }, + { + "epoch": 3.67, + "grad_norm": 0.546875, + "learning_rate": 0.0004818171288032873, + "loss": 0.2477, + "step": 88560 + }, + { + "epoch": 3.67, + "grad_norm": 0.5390625, + "learning_rate": 0.0004818130681769446, + "loss": 0.221, + "step": 88570 + }, + { + "epoch": 3.67, + "grad_norm": 0.28515625, + "learning_rate": 0.00048180900711435477, + "loss": 0.2664, + "step": 88580 + }, + { + "epoch": 3.67, + "grad_norm": 0.79296875, + "learning_rate": 0.00048180494561552535, + "loss": 0.1819, + "step": 88590 + }, + { + "epoch": 3.67, + "grad_norm": 0.69921875, + "learning_rate": 0.00048180088368046415, + "loss": 0.194, + "step": 88600 + }, + { + "epoch": 3.67, + "grad_norm": 0.6640625, + "learning_rate": 0.0004817968213091787, + "loss": 0.2313, + "step": 88610 + }, + { + "epoch": 3.67, + "grad_norm": 0.54296875, + "learning_rate": 0.0004817927585016766, + "loss": 0.206, + "step": 88620 + }, + { + "epoch": 3.67, + "grad_norm": 0.69921875, + "learning_rate": 0.0004817886952579656, + "loss": 0.2444, + "step": 88630 + }, + { + "epoch": 3.67, + "grad_norm": 0.6484375, + "learning_rate": 0.0004817846315780533, + "loss": 0.1916, + "step": 88640 + }, + { + "epoch": 3.67, + "grad_norm": 1.0546875, + "learning_rate": 0.0004817805674619473, + "loss": 0.1957, + "step": 88650 + }, + { + "epoch": 3.67, + "grad_norm": 0.427734375, + "learning_rate": 0.00048177650290965533, + "loss": 0.2372, + "step": 88660 + }, + { + "epoch": 3.67, + "grad_norm": 0.43359375, + "learning_rate": 0.0004817724379211851, + "loss": 0.1536, + "step": 88670 + }, + { + "epoch": 3.67, + "grad_norm": 0.58203125, + "learning_rate": 0.0004817683724965442, + "loss": 0.2428, + "step": 88680 + }, + { + "epoch": 3.67, + "grad_norm": 0.57421875, + "learning_rate": 0.0004817643066357401, + "loss": 0.1657, + "step": 88690 + }, + { + "epoch": 3.67, + "grad_norm": 0.875, + "learning_rate": 0.00048176024033878064, + "loss": 0.1538, + "step": 88700 + }, + { + "epoch": 3.67, + "grad_norm": 0.3125, + "learning_rate": 0.00048175617360567347, + "loss": 0.2284, + "step": 88710 + }, + { + "epoch": 3.67, + "grad_norm": 1.2109375, + "learning_rate": 0.0004817521064364262, + "loss": 0.2424, + "step": 88720 + }, + { + "epoch": 3.68, + "grad_norm": 1.0078125, + "learning_rate": 0.00048174803883104645, + "loss": 0.178, + "step": 88730 + }, + { + "epoch": 3.68, + "grad_norm": 1.1484375, + "learning_rate": 0.00048174397078954204, + "loss": 0.2524, + "step": 88740 + }, + { + "epoch": 3.68, + "grad_norm": 0.62890625, + "learning_rate": 0.0004817399023119204, + "loss": 0.2043, + "step": 88750 + }, + { + "epoch": 3.68, + "grad_norm": 0.3828125, + "learning_rate": 0.0004817358333981893, + "loss": 0.2001, + "step": 88760 + }, + { + "epoch": 3.68, + "grad_norm": 0.5546875, + "learning_rate": 0.0004817317640483564, + "loss": 0.1965, + "step": 88770 + }, + { + "epoch": 3.68, + "grad_norm": 0.66015625, + "learning_rate": 0.00048172769426242933, + "loss": 0.1786, + "step": 88780 + }, + { + "epoch": 3.68, + "grad_norm": 0.46484375, + "learning_rate": 0.0004817236240404158, + "loss": 0.2466, + "step": 88790 + }, + { + "epoch": 3.68, + "grad_norm": 1.046875, + "learning_rate": 0.0004817195533823234, + "loss": 0.2412, + "step": 88800 + }, + { + "epoch": 3.68, + "grad_norm": 0.62109375, + "learning_rate": 0.0004817154822881598, + "loss": 0.2501, + "step": 88810 + }, + { + "epoch": 3.68, + "grad_norm": 0.6953125, + "learning_rate": 0.0004817114107579327, + "loss": 0.1917, + "step": 88820 + }, + { + "epoch": 3.68, + "grad_norm": 0.62109375, + "learning_rate": 0.00048170733879164985, + "loss": 0.2456, + "step": 88830 + }, + { + "epoch": 3.68, + "grad_norm": 0.734375, + "learning_rate": 0.0004817032663893187, + "loss": 0.2186, + "step": 88840 + }, + { + "epoch": 3.68, + "grad_norm": 1.7578125, + "learning_rate": 0.00048169919355094706, + "loss": 0.2017, + "step": 88850 + }, + { + "epoch": 3.68, + "grad_norm": 0.859375, + "learning_rate": 0.0004816951202765425, + "loss": 0.2267, + "step": 88860 + }, + { + "epoch": 3.68, + "grad_norm": 1.6875, + "learning_rate": 0.00048169104656611286, + "loss": 0.2419, + "step": 88870 + }, + { + "epoch": 3.68, + "grad_norm": 0.462890625, + "learning_rate": 0.0004816869724196656, + "loss": 0.1949, + "step": 88880 + }, + { + "epoch": 3.68, + "grad_norm": 1.0078125, + "learning_rate": 0.00048168289783720854, + "loss": 0.1948, + "step": 88890 + }, + { + "epoch": 3.68, + "grad_norm": 0.345703125, + "learning_rate": 0.0004816788228187492, + "loss": 0.2037, + "step": 88900 + }, + { + "epoch": 3.68, + "grad_norm": 0.3515625, + "learning_rate": 0.0004816747473642954, + "loss": 0.1973, + "step": 88910 + }, + { + "epoch": 3.68, + "grad_norm": 0.3671875, + "learning_rate": 0.0004816706714738547, + "loss": 0.1965, + "step": 88920 + }, + { + "epoch": 3.68, + "grad_norm": 0.69140625, + "learning_rate": 0.00048166659514743487, + "loss": 0.2195, + "step": 88930 + }, + { + "epoch": 3.68, + "grad_norm": 0.345703125, + "learning_rate": 0.0004816625183850435, + "loss": 0.2393, + "step": 88940 + }, + { + "epoch": 3.68, + "grad_norm": 0.92578125, + "learning_rate": 0.00048165844118668835, + "loss": 0.2519, + "step": 88950 + }, + { + "epoch": 3.68, + "grad_norm": 0.51171875, + "learning_rate": 0.00048165436355237693, + "loss": 0.2468, + "step": 88960 + }, + { + "epoch": 3.69, + "grad_norm": 0.6171875, + "learning_rate": 0.00048165028548211703, + "loss": 0.1653, + "step": 88970 + }, + { + "epoch": 3.69, + "grad_norm": 1.5390625, + "learning_rate": 0.00048164620697591635, + "loss": 0.214, + "step": 88980 + }, + { + "epoch": 3.69, + "grad_norm": 1.25, + "learning_rate": 0.0004816421280337825, + "loss": 0.2439, + "step": 88990 + }, + { + "epoch": 3.69, + "grad_norm": 0.69921875, + "learning_rate": 0.00048163804865572317, + "loss": 0.2112, + "step": 89000 + }, + { + "epoch": 3.69, + "grad_norm": 0.30078125, + "learning_rate": 0.0004816339688417461, + "loss": 0.1893, + "step": 89010 + }, + { + "epoch": 3.69, + "grad_norm": 0.64453125, + "learning_rate": 0.00048162988859185886, + "loss": 0.2387, + "step": 89020 + }, + { + "epoch": 3.69, + "grad_norm": 1.0234375, + "learning_rate": 0.0004816258079060692, + "loss": 0.25, + "step": 89030 + }, + { + "epoch": 3.69, + "grad_norm": 0.73828125, + "learning_rate": 0.00048162172678438474, + "loss": 0.2639, + "step": 89040 + }, + { + "epoch": 3.69, + "grad_norm": 0.64453125, + "learning_rate": 0.00048161764522681327, + "loss": 0.222, + "step": 89050 + }, + { + "epoch": 3.69, + "grad_norm": 0.30078125, + "learning_rate": 0.00048161356323336244, + "loss": 0.2451, + "step": 89060 + }, + { + "epoch": 3.69, + "grad_norm": 0.984375, + "learning_rate": 0.00048160948080403984, + "loss": 0.1632, + "step": 89070 + }, + { + "epoch": 3.69, + "grad_norm": 0.93359375, + "learning_rate": 0.00048160539793885317, + "loss": 0.2195, + "step": 89080 + }, + { + "epoch": 3.69, + "grad_norm": 0.6953125, + "learning_rate": 0.00048160131463781023, + "loss": 0.2091, + "step": 89090 + }, + { + "epoch": 3.69, + "grad_norm": 0.3125, + "learning_rate": 0.00048159723090091856, + "loss": 0.2422, + "step": 89100 + }, + { + "epoch": 3.69, + "grad_norm": 0.94140625, + "learning_rate": 0.0004815931467281859, + "loss": 0.2721, + "step": 89110 + }, + { + "epoch": 3.69, + "grad_norm": 0.7109375, + "learning_rate": 0.00048158906211962004, + "loss": 0.2254, + "step": 89120 + }, + { + "epoch": 3.69, + "grad_norm": 0.435546875, + "learning_rate": 0.0004815849770752285, + "loss": 0.2498, + "step": 89130 + }, + { + "epoch": 3.69, + "grad_norm": 0.71484375, + "learning_rate": 0.00048158089159501907, + "loss": 0.1644, + "step": 89140 + }, + { + "epoch": 3.69, + "grad_norm": 0.63671875, + "learning_rate": 0.0004815768056789994, + "loss": 0.219, + "step": 89150 + }, + { + "epoch": 3.69, + "grad_norm": 0.81640625, + "learning_rate": 0.0004815727193271772, + "loss": 0.226, + "step": 89160 + }, + { + "epoch": 3.69, + "grad_norm": 0.83984375, + "learning_rate": 0.0004815686325395601, + "loss": 0.1933, + "step": 89170 + }, + { + "epoch": 3.69, + "grad_norm": 0.85546875, + "learning_rate": 0.0004815645453161559, + "loss": 0.2615, + "step": 89180 + }, + { + "epoch": 3.69, + "grad_norm": 0.6484375, + "learning_rate": 0.00048156045765697223, + "loss": 0.2319, + "step": 89190 + }, + { + "epoch": 3.69, + "grad_norm": 1.3671875, + "learning_rate": 0.00048155636956201674, + "loss": 0.2486, + "step": 89200 + }, + { + "epoch": 3.7, + "grad_norm": 1.0546875, + "learning_rate": 0.00048155228103129725, + "loss": 0.192, + "step": 89210 + }, + { + "epoch": 3.7, + "grad_norm": 0.21484375, + "learning_rate": 0.0004815481920648213, + "loss": 0.1901, + "step": 89220 + }, + { + "epoch": 3.7, + "grad_norm": 0.8359375, + "learning_rate": 0.00048154410266259667, + "loss": 0.2396, + "step": 89230 + }, + { + "epoch": 3.7, + "grad_norm": 0.0849609375, + "learning_rate": 0.00048154001282463103, + "loss": 0.1401, + "step": 89240 + }, + { + "epoch": 3.7, + "grad_norm": 0.310546875, + "learning_rate": 0.00048153592255093214, + "loss": 0.2011, + "step": 89250 + }, + { + "epoch": 3.7, + "grad_norm": 0.88671875, + "learning_rate": 0.0004815318318415076, + "loss": 0.2407, + "step": 89260 + }, + { + "epoch": 3.7, + "grad_norm": 0.96484375, + "learning_rate": 0.0004815277406963652, + "loss": 0.234, + "step": 89270 + }, + { + "epoch": 3.7, + "grad_norm": 0.5234375, + "learning_rate": 0.0004815236491155126, + "loss": 0.1769, + "step": 89280 + }, + { + "epoch": 3.7, + "grad_norm": 0.267578125, + "learning_rate": 0.00048151955709895743, + "loss": 0.1418, + "step": 89290 + }, + { + "epoch": 3.7, + "grad_norm": 0.67578125, + "learning_rate": 0.00048151546464670746, + "loss": 0.2431, + "step": 89300 + }, + { + "epoch": 3.7, + "grad_norm": 0.484375, + "learning_rate": 0.0004815113717587705, + "loss": 0.169, + "step": 89310 + }, + { + "epoch": 3.7, + "grad_norm": 0.796875, + "learning_rate": 0.000481507278435154, + "loss": 0.1969, + "step": 89320 + }, + { + "epoch": 3.7, + "grad_norm": 0.94140625, + "learning_rate": 0.00048150318467586584, + "loss": 0.1898, + "step": 89330 + }, + { + "epoch": 3.7, + "grad_norm": 0.60546875, + "learning_rate": 0.00048149909048091376, + "loss": 0.2131, + "step": 89340 + }, + { + "epoch": 3.7, + "grad_norm": 0.462890625, + "learning_rate": 0.00048149499585030533, + "loss": 0.2418, + "step": 89350 + }, + { + "epoch": 3.7, + "grad_norm": 0.35546875, + "learning_rate": 0.0004814909007840483, + "loss": 0.1763, + "step": 89360 + }, + { + "epoch": 3.7, + "grad_norm": 0.451171875, + "learning_rate": 0.0004814868052821504, + "loss": 0.2272, + "step": 89370 + }, + { + "epoch": 3.7, + "grad_norm": 0.515625, + "learning_rate": 0.00048148270934461943, + "loss": 0.1762, + "step": 89380 + }, + { + "epoch": 3.7, + "grad_norm": 0.390625, + "learning_rate": 0.0004814786129714629, + "loss": 0.1826, + "step": 89390 + }, + { + "epoch": 3.7, + "grad_norm": 0.0, + "learning_rate": 0.0004814745161626886, + "loss": 0.18, + "step": 89400 + }, + { + "epoch": 3.7, + "grad_norm": 0.318359375, + "learning_rate": 0.00048147041891830435, + "loss": 0.1924, + "step": 89410 + }, + { + "epoch": 3.7, + "grad_norm": 0.46875, + "learning_rate": 0.0004814663212383177, + "loss": 0.2742, + "step": 89420 + }, + { + "epoch": 3.7, + "grad_norm": 0.392578125, + "learning_rate": 0.0004814622231227365, + "loss": 0.1847, + "step": 89430 + }, + { + "epoch": 3.7, + "grad_norm": 1.1328125, + "learning_rate": 0.0004814581245715682, + "loss": 0.1781, + "step": 89440 + }, + { + "epoch": 3.71, + "grad_norm": 0.2001953125, + "learning_rate": 0.00048145402558482086, + "loss": 0.2031, + "step": 89450 + }, + { + "epoch": 3.71, + "grad_norm": 1.6875, + "learning_rate": 0.00048144992616250206, + "loss": 0.2572, + "step": 89460 + }, + { + "epoch": 3.71, + "grad_norm": 1.0390625, + "learning_rate": 0.0004814458263046194, + "loss": 0.1996, + "step": 89470 + }, + { + "epoch": 3.71, + "grad_norm": 0.64453125, + "learning_rate": 0.0004814417260111808, + "loss": 0.229, + "step": 89480 + }, + { + "epoch": 3.71, + "grad_norm": 0.6015625, + "learning_rate": 0.00048143762528219384, + "loss": 0.1933, + "step": 89490 + }, + { + "epoch": 3.71, + "grad_norm": 0.59765625, + "learning_rate": 0.0004814335241176662, + "loss": 0.2302, + "step": 89500 + }, + { + "epoch": 3.71, + "grad_norm": 1.0390625, + "learning_rate": 0.00048142942251760566, + "loss": 0.2201, + "step": 89510 + }, + { + "epoch": 3.71, + "grad_norm": 0.40625, + "learning_rate": 0.00048142532048202, + "loss": 0.2327, + "step": 89520 + }, + { + "epoch": 3.71, + "grad_norm": 0.8515625, + "learning_rate": 0.0004814212180109168, + "loss": 0.2241, + "step": 89530 + }, + { + "epoch": 3.71, + "grad_norm": 0.65625, + "learning_rate": 0.00048141711510430395, + "loss": 0.2359, + "step": 89540 + }, + { + "epoch": 3.71, + "grad_norm": 0.51953125, + "learning_rate": 0.00048141301176218897, + "loss": 0.2054, + "step": 89550 + }, + { + "epoch": 3.71, + "grad_norm": 1.2421875, + "learning_rate": 0.00048140890798457984, + "loss": 0.2075, + "step": 89560 + }, + { + "epoch": 3.71, + "grad_norm": 0.625, + "learning_rate": 0.000481404803771484, + "loss": 0.2241, + "step": 89570 + }, + { + "epoch": 3.71, + "grad_norm": 1.0390625, + "learning_rate": 0.0004814006991229094, + "loss": 0.2145, + "step": 89580 + }, + { + "epoch": 3.71, + "grad_norm": 0.478515625, + "learning_rate": 0.0004813965940388636, + "loss": 0.2329, + "step": 89590 + }, + { + "epoch": 3.71, + "grad_norm": 1.4609375, + "learning_rate": 0.00048139248851935445, + "loss": 0.2031, + "step": 89600 + }, + { + "epoch": 3.71, + "grad_norm": 0.5234375, + "learning_rate": 0.0004813883825643896, + "loss": 0.2679, + "step": 89610 + }, + { + "epoch": 3.71, + "grad_norm": 0.51953125, + "learning_rate": 0.0004813842761739768, + "loss": 0.2505, + "step": 89620 + }, + { + "epoch": 3.71, + "grad_norm": 0.6015625, + "learning_rate": 0.0004813801693481238, + "loss": 0.2245, + "step": 89630 + }, + { + "epoch": 3.71, + "grad_norm": 0.76953125, + "learning_rate": 0.00048137606208683827, + "loss": 0.2445, + "step": 89640 + }, + { + "epoch": 3.71, + "grad_norm": 0.95703125, + "learning_rate": 0.00048137195439012804, + "loss": 0.2453, + "step": 89650 + }, + { + "epoch": 3.71, + "grad_norm": 0.48828125, + "learning_rate": 0.00048136784625800076, + "loss": 0.2244, + "step": 89660 + }, + { + "epoch": 3.71, + "grad_norm": 0.7578125, + "learning_rate": 0.00048136373769046417, + "loss": 0.233, + "step": 89670 + }, + { + "epoch": 3.71, + "grad_norm": 0.55078125, + "learning_rate": 0.00048135962868752597, + "loss": 0.2189, + "step": 89680 + }, + { + "epoch": 3.71, + "grad_norm": 0.58984375, + "learning_rate": 0.000481355519249194, + "loss": 0.1979, + "step": 89690 + }, + { + "epoch": 3.72, + "grad_norm": 1.125, + "learning_rate": 0.0004813514093754758, + "loss": 0.1755, + "step": 89700 + }, + { + "epoch": 3.72, + "grad_norm": 0.5859375, + "learning_rate": 0.00048134729906637943, + "loss": 0.2439, + "step": 89710 + }, + { + "epoch": 3.72, + "grad_norm": 1.6640625, + "learning_rate": 0.0004813431883219123, + "loss": 0.203, + "step": 89720 + }, + { + "epoch": 3.72, + "grad_norm": 0.68359375, + "learning_rate": 0.0004813390771420822, + "loss": 0.1568, + "step": 89730 + }, + { + "epoch": 3.72, + "grad_norm": 0.5, + "learning_rate": 0.00048133496552689713, + "loss": 0.1614, + "step": 89740 + }, + { + "epoch": 3.72, + "grad_norm": 0.0, + "learning_rate": 0.00048133085347636453, + "loss": 0.1593, + "step": 89750 + }, + { + "epoch": 3.72, + "grad_norm": 0.734375, + "learning_rate": 0.00048132674099049224, + "loss": 0.3255, + "step": 89760 + }, + { + "epoch": 3.72, + "grad_norm": 0.765625, + "learning_rate": 0.00048132262806928805, + "loss": 0.2138, + "step": 89770 + }, + { + "epoch": 3.72, + "grad_norm": 0.451171875, + "learning_rate": 0.0004813185147127596, + "loss": 0.2178, + "step": 89780 + }, + { + "epoch": 3.72, + "grad_norm": 0.59765625, + "learning_rate": 0.00048131440092091473, + "loss": 0.2188, + "step": 89790 + }, + { + "epoch": 3.72, + "grad_norm": 0.5625, + "learning_rate": 0.0004813102866937611, + "loss": 0.2209, + "step": 89800 + }, + { + "epoch": 3.72, + "grad_norm": 1.3125, + "learning_rate": 0.0004813061720313065, + "loss": 0.2162, + "step": 89810 + }, + { + "epoch": 3.72, + "grad_norm": 0.478515625, + "learning_rate": 0.00048130205693355864, + "loss": 0.1811, + "step": 89820 + }, + { + "epoch": 3.72, + "grad_norm": 1.2578125, + "learning_rate": 0.00048129794140052535, + "loss": 0.2069, + "step": 89830 + }, + { + "epoch": 3.72, + "grad_norm": 1.1875, + "learning_rate": 0.00048129382543221427, + "loss": 0.2329, + "step": 89840 + }, + { + "epoch": 3.72, + "grad_norm": 0.47265625, + "learning_rate": 0.0004812897090286332, + "loss": 0.1451, + "step": 89850 + }, + { + "epoch": 3.72, + "grad_norm": 0.39453125, + "learning_rate": 0.0004812855921897899, + "loss": 0.2214, + "step": 89860 + }, + { + "epoch": 3.72, + "grad_norm": 0.69140625, + "learning_rate": 0.0004812814749156921, + "loss": 0.2135, + "step": 89870 + }, + { + "epoch": 3.72, + "grad_norm": 0.953125, + "learning_rate": 0.00048127735720634746, + "loss": 0.231, + "step": 89880 + }, + { + "epoch": 3.72, + "grad_norm": 0.421875, + "learning_rate": 0.00048127323906176387, + "loss": 0.1876, + "step": 89890 + }, + { + "epoch": 3.72, + "grad_norm": 1.03125, + "learning_rate": 0.000481269120481949, + "loss": 0.2076, + "step": 89900 + }, + { + "epoch": 3.72, + "grad_norm": 0.8359375, + "learning_rate": 0.00048126500146691067, + "loss": 0.2117, + "step": 89910 + }, + { + "epoch": 3.72, + "grad_norm": 1.1953125, + "learning_rate": 0.0004812608820166565, + "loss": 0.1968, + "step": 89920 + }, + { + "epoch": 3.72, + "grad_norm": 0.66796875, + "learning_rate": 0.00048125676213119436, + "loss": 0.2203, + "step": 89930 + }, + { + "epoch": 3.73, + "grad_norm": 0.46875, + "learning_rate": 0.00048125264181053196, + "loss": 0.1797, + "step": 89940 + }, + { + "epoch": 3.73, + "grad_norm": 0.828125, + "learning_rate": 0.0004812485210546771, + "loss": 0.2434, + "step": 89950 + }, + { + "epoch": 3.73, + "grad_norm": 0.5234375, + "learning_rate": 0.00048124439986363746, + "loss": 0.2284, + "step": 89960 + }, + { + "epoch": 3.73, + "grad_norm": 0.337890625, + "learning_rate": 0.00048124027823742075, + "loss": 0.2054, + "step": 89970 + }, + { + "epoch": 3.73, + "grad_norm": 0.3046875, + "learning_rate": 0.00048123615617603495, + "loss": 0.1896, + "step": 89980 + }, + { + "epoch": 3.73, + "grad_norm": 0.95703125, + "learning_rate": 0.00048123203367948764, + "loss": 0.2427, + "step": 89990 + }, + { + "epoch": 3.73, + "grad_norm": 1.09375, + "learning_rate": 0.0004812279107477866, + "loss": 0.1845, + "step": 90000 + }, + { + "epoch": 3.73, + "grad_norm": 0.796875, + "learning_rate": 0.0004812237873809396, + "loss": 0.2277, + "step": 90010 + }, + { + "epoch": 3.73, + "grad_norm": 0.3203125, + "learning_rate": 0.00048121966357895433, + "loss": 0.1677, + "step": 90020 + }, + { + "epoch": 3.73, + "grad_norm": 0.98046875, + "learning_rate": 0.0004812155393418387, + "loss": 0.2861, + "step": 90030 + }, + { + "epoch": 3.73, + "grad_norm": 0.484375, + "learning_rate": 0.0004812114146696004, + "loss": 0.1545, + "step": 90040 + }, + { + "epoch": 3.73, + "grad_norm": 0.68359375, + "learning_rate": 0.00048120728956224716, + "loss": 0.2431, + "step": 90050 + }, + { + "epoch": 3.73, + "grad_norm": 0.7421875, + "learning_rate": 0.00048120316401978683, + "loss": 0.2569, + "step": 90060 + }, + { + "epoch": 3.73, + "grad_norm": 0.23046875, + "learning_rate": 0.00048119903804222705, + "loss": 0.1716, + "step": 90070 + }, + { + "epoch": 3.73, + "grad_norm": 0.6171875, + "learning_rate": 0.00048119491162957565, + "loss": 0.2331, + "step": 90080 + }, + { + "epoch": 3.73, + "grad_norm": 0.56640625, + "learning_rate": 0.0004811907847818404, + "loss": 0.2147, + "step": 90090 + }, + { + "epoch": 3.73, + "grad_norm": 0.69921875, + "learning_rate": 0.00048118665749902906, + "loss": 0.2394, + "step": 90100 + }, + { + "epoch": 3.73, + "grad_norm": 1.4921875, + "learning_rate": 0.0004811825297811494, + "loss": 0.2376, + "step": 90110 + }, + { + "epoch": 3.73, + "grad_norm": 0.7578125, + "learning_rate": 0.00048117840162820917, + "loss": 0.189, + "step": 90120 + }, + { + "epoch": 3.73, + "grad_norm": 0.59765625, + "learning_rate": 0.0004811742730402162, + "loss": 0.2658, + "step": 90130 + }, + { + "epoch": 3.73, + "grad_norm": 0.625, + "learning_rate": 0.0004811701440171782, + "loss": 0.1851, + "step": 90140 + }, + { + "epoch": 3.73, + "grad_norm": 1.0625, + "learning_rate": 0.00048116601455910294, + "loss": 0.226, + "step": 90150 + }, + { + "epoch": 3.73, + "grad_norm": 0.361328125, + "learning_rate": 0.0004811618846659982, + "loss": 0.137, + "step": 90160 + }, + { + "epoch": 3.73, + "grad_norm": 0.93359375, + "learning_rate": 0.00048115775433787175, + "loss": 0.2018, + "step": 90170 + }, + { + "epoch": 3.74, + "grad_norm": 0.86328125, + "learning_rate": 0.00048115362357473135, + "loss": 0.2292, + "step": 90180 + }, + { + "epoch": 3.74, + "grad_norm": 0.1787109375, + "learning_rate": 0.00048114949237658476, + "loss": 0.1834, + "step": 90190 + }, + { + "epoch": 3.74, + "grad_norm": 0.76171875, + "learning_rate": 0.0004811453607434399, + "loss": 0.1983, + "step": 90200 + }, + { + "epoch": 3.74, + "grad_norm": 0.90625, + "learning_rate": 0.0004811412286753044, + "loss": 0.1853, + "step": 90210 + }, + { + "epoch": 3.74, + "grad_norm": 0.8828125, + "learning_rate": 0.000481137096172186, + "loss": 0.1692, + "step": 90220 + }, + { + "epoch": 3.74, + "grad_norm": 0.220703125, + "learning_rate": 0.0004811329632340926, + "loss": 0.2442, + "step": 90230 + }, + { + "epoch": 3.74, + "grad_norm": 0.65234375, + "learning_rate": 0.0004811288298610319, + "loss": 0.2206, + "step": 90240 + }, + { + "epoch": 3.74, + "grad_norm": 0.69140625, + "learning_rate": 0.00048112469605301176, + "loss": 0.2357, + "step": 90250 + }, + { + "epoch": 3.74, + "grad_norm": 0.57421875, + "learning_rate": 0.0004811205618100398, + "loss": 0.1822, + "step": 90260 + }, + { + "epoch": 3.74, + "grad_norm": 0.0, + "learning_rate": 0.000481116427132124, + "loss": 0.2024, + "step": 90270 + }, + { + "epoch": 3.74, + "grad_norm": 0.609375, + "learning_rate": 0.000481112292019272, + "loss": 0.2064, + "step": 90280 + }, + { + "epoch": 3.74, + "grad_norm": 0.47265625, + "learning_rate": 0.00048110815647149164, + "loss": 0.1945, + "step": 90290 + }, + { + "epoch": 3.74, + "grad_norm": 0.87890625, + "learning_rate": 0.00048110402048879067, + "loss": 0.2408, + "step": 90300 + }, + { + "epoch": 3.74, + "grad_norm": 0.0, + "learning_rate": 0.0004810998840711769, + "loss": 0.2036, + "step": 90310 + }, + { + "epoch": 3.74, + "grad_norm": 0.94921875, + "learning_rate": 0.0004810957472186581, + "loss": 0.2267, + "step": 90320 + }, + { + "epoch": 3.74, + "grad_norm": 0.51953125, + "learning_rate": 0.000481091609931242, + "loss": 0.228, + "step": 90330 + }, + { + "epoch": 3.74, + "grad_norm": 0.8046875, + "learning_rate": 0.00048108747220893655, + "loss": 0.2034, + "step": 90340 + }, + { + "epoch": 3.74, + "grad_norm": 0.400390625, + "learning_rate": 0.0004810833340517494, + "loss": 0.2681, + "step": 90350 + }, + { + "epoch": 3.74, + "grad_norm": 0.427734375, + "learning_rate": 0.00048107919545968834, + "loss": 0.2019, + "step": 90360 + }, + { + "epoch": 3.74, + "grad_norm": 0.2412109375, + "learning_rate": 0.0004810750564327612, + "loss": 0.2654, + "step": 90370 + }, + { + "epoch": 3.74, + "grad_norm": 1.046875, + "learning_rate": 0.0004810709169709758, + "loss": 0.1773, + "step": 90380 + }, + { + "epoch": 3.74, + "grad_norm": 1.0859375, + "learning_rate": 0.00048106677707433987, + "loss": 0.1941, + "step": 90390 + }, + { + "epoch": 3.74, + "grad_norm": 0.41015625, + "learning_rate": 0.00048106263674286117, + "loss": 0.2182, + "step": 90400 + }, + { + "epoch": 3.74, + "grad_norm": 0.70703125, + "learning_rate": 0.0004810584959765476, + "loss": 0.2038, + "step": 90410 + }, + { + "epoch": 3.75, + "grad_norm": 0.7421875, + "learning_rate": 0.0004810543547754068, + "loss": 0.2016, + "step": 90420 + }, + { + "epoch": 3.75, + "grad_norm": 0.404296875, + "learning_rate": 0.00048105021313944675, + "loss": 0.2098, + "step": 90430 + }, + { + "epoch": 3.75, + "grad_norm": 0.8828125, + "learning_rate": 0.00048104607106867514, + "loss": 0.181, + "step": 90440 + }, + { + "epoch": 3.75, + "grad_norm": 0.53515625, + "learning_rate": 0.0004810419285630997, + "loss": 0.181, + "step": 90450 + }, + { + "epoch": 3.75, + "grad_norm": 0.16796875, + "learning_rate": 0.00048103778562272835, + "loss": 0.2345, + "step": 90460 + }, + { + "epoch": 3.75, + "grad_norm": 0.453125, + "learning_rate": 0.00048103364224756883, + "loss": 0.1571, + "step": 90470 + }, + { + "epoch": 3.75, + "grad_norm": 0.625, + "learning_rate": 0.000481029498437629, + "loss": 0.1908, + "step": 90480 + }, + { + "epoch": 3.75, + "grad_norm": 0.326171875, + "learning_rate": 0.0004810253541929166, + "loss": 0.2216, + "step": 90490 + }, + { + "epoch": 3.75, + "grad_norm": 1.5078125, + "learning_rate": 0.00048102120951343935, + "loss": 0.1618, + "step": 90500 + }, + { + "epoch": 3.75, + "grad_norm": 0.439453125, + "learning_rate": 0.0004810170643992052, + "loss": 0.1545, + "step": 90510 + }, + { + "epoch": 3.75, + "grad_norm": 1.3828125, + "learning_rate": 0.0004810129188502219, + "loss": 0.1815, + "step": 90520 + }, + { + "epoch": 3.75, + "grad_norm": 0.68359375, + "learning_rate": 0.00048100877286649715, + "loss": 0.2265, + "step": 90530 + }, + { + "epoch": 3.75, + "grad_norm": 0.462890625, + "learning_rate": 0.0004810046264480389, + "loss": 0.2015, + "step": 90540 + }, + { + "epoch": 3.75, + "grad_norm": 1.1796875, + "learning_rate": 0.0004810004795948548, + "loss": 0.1892, + "step": 90550 + }, + { + "epoch": 3.75, + "grad_norm": 1.2109375, + "learning_rate": 0.0004809963323069528, + "loss": 0.2297, + "step": 90560 + }, + { + "epoch": 3.75, + "grad_norm": 1.1328125, + "learning_rate": 0.00048099218458434067, + "loss": 0.285, + "step": 90570 + }, + { + "epoch": 3.75, + "grad_norm": 0.828125, + "learning_rate": 0.00048098803642702623, + "loss": 0.2117, + "step": 90580 + }, + { + "epoch": 3.75, + "grad_norm": 1.09375, + "learning_rate": 0.0004809838878350172, + "loss": 0.1851, + "step": 90590 + }, + { + "epoch": 3.75, + "grad_norm": 0.2392578125, + "learning_rate": 0.0004809797388083214, + "loss": 0.25, + "step": 90600 + }, + { + "epoch": 3.75, + "grad_norm": 1.59375, + "learning_rate": 0.00048097558934694675, + "loss": 0.181, + "step": 90610 + }, + { + "epoch": 3.75, + "grad_norm": 0.7421875, + "learning_rate": 0.0004809714394509009, + "loss": 0.2002, + "step": 90620 + }, + { + "epoch": 3.75, + "grad_norm": 0.53515625, + "learning_rate": 0.0004809672891201918, + "loss": 0.2398, + "step": 90630 + }, + { + "epoch": 3.75, + "grad_norm": 0.45703125, + "learning_rate": 0.0004809631383548272, + "loss": 0.2145, + "step": 90640 + }, + { + "epoch": 3.75, + "grad_norm": 0.361328125, + "learning_rate": 0.0004809589871548149, + "loss": 0.2067, + "step": 90650 + }, + { + "epoch": 3.76, + "grad_norm": 0.5078125, + "learning_rate": 0.00048095483552016273, + "loss": 0.2345, + "step": 90660 + }, + { + "epoch": 3.76, + "grad_norm": 0.7265625, + "learning_rate": 0.0004809506834508786, + "loss": 0.2059, + "step": 90670 + }, + { + "epoch": 3.76, + "grad_norm": 2.359375, + "learning_rate": 0.00048094653094697004, + "loss": 0.2689, + "step": 90680 + }, + { + "epoch": 3.76, + "grad_norm": 1.2265625, + "learning_rate": 0.00048094237800844523, + "loss": 0.1852, + "step": 90690 + }, + { + "epoch": 3.76, + "grad_norm": 1.6328125, + "learning_rate": 0.0004809382246353117, + "loss": 0.215, + "step": 90700 + }, + { + "epoch": 3.76, + "grad_norm": 1.140625, + "learning_rate": 0.00048093407082757734, + "loss": 0.2212, + "step": 90710 + }, + { + "epoch": 3.76, + "grad_norm": 1.2265625, + "learning_rate": 0.0004809299165852501, + "loss": 0.2303, + "step": 90720 + }, + { + "epoch": 3.76, + "grad_norm": 0.388671875, + "learning_rate": 0.0004809257619083376, + "loss": 0.2383, + "step": 90730 + }, + { + "epoch": 3.76, + "grad_norm": 0.51171875, + "learning_rate": 0.00048092160679684783, + "loss": 0.1465, + "step": 90740 + }, + { + "epoch": 3.76, + "grad_norm": 1.640625, + "learning_rate": 0.00048091745125078845, + "loss": 0.2389, + "step": 90750 + }, + { + "epoch": 3.76, + "grad_norm": 1.25, + "learning_rate": 0.00048091329527016744, + "loss": 0.2372, + "step": 90760 + }, + { + "epoch": 3.76, + "grad_norm": 0.65625, + "learning_rate": 0.0004809091388549925, + "loss": 0.1936, + "step": 90770 + }, + { + "epoch": 3.76, + "grad_norm": 1.2890625, + "learning_rate": 0.0004809049820052716, + "loss": 0.245, + "step": 90780 + }, + { + "epoch": 3.76, + "grad_norm": 0.50390625, + "learning_rate": 0.0004809008247210123, + "loss": 0.229, + "step": 90790 + }, + { + "epoch": 3.76, + "grad_norm": 0.404296875, + "learning_rate": 0.0004808966670022227, + "loss": 0.1973, + "step": 90800 + }, + { + "epoch": 3.76, + "grad_norm": 0.201171875, + "learning_rate": 0.0004808925088489104, + "loss": 0.1995, + "step": 90810 + }, + { + "epoch": 3.76, + "grad_norm": 0.89453125, + "learning_rate": 0.0004808883502610834, + "loss": 0.2203, + "step": 90820 + }, + { + "epoch": 3.76, + "grad_norm": 0.70703125, + "learning_rate": 0.00048088419123874944, + "loss": 0.2208, + "step": 90830 + }, + { + "epoch": 3.76, + "grad_norm": 0.62109375, + "learning_rate": 0.0004808800317819164, + "loss": 0.1991, + "step": 90840 + }, + { + "epoch": 3.76, + "grad_norm": 1.1796875, + "learning_rate": 0.000480875871890592, + "loss": 0.233, + "step": 90850 + }, + { + "epoch": 3.76, + "grad_norm": 0.59375, + "learning_rate": 0.00048087171156478427, + "loss": 0.2794, + "step": 90860 + }, + { + "epoch": 3.76, + "grad_norm": 0.609375, + "learning_rate": 0.0004808675508045008, + "loss": 0.2344, + "step": 90870 + }, + { + "epoch": 3.76, + "grad_norm": 0.9375, + "learning_rate": 0.00048086338960974957, + "loss": 0.2399, + "step": 90880 + }, + { + "epoch": 3.76, + "grad_norm": 0.45703125, + "learning_rate": 0.0004808592279805384, + "loss": 0.2298, + "step": 90890 + }, + { + "epoch": 3.77, + "grad_norm": 0.1865234375, + "learning_rate": 0.000480855065916875, + "loss": 0.2039, + "step": 90900 + }, + { + "epoch": 3.77, + "grad_norm": 0.404296875, + "learning_rate": 0.00048085090341876737, + "loss": 0.2067, + "step": 90910 + }, + { + "epoch": 3.77, + "grad_norm": 1.0703125, + "learning_rate": 0.0004808467404862232, + "loss": 0.1964, + "step": 90920 + }, + { + "epoch": 3.77, + "grad_norm": 0.474609375, + "learning_rate": 0.0004808425771192505, + "loss": 0.2177, + "step": 90930 + }, + { + "epoch": 3.77, + "grad_norm": 0.66015625, + "learning_rate": 0.0004808384133178569, + "loss": 0.2023, + "step": 90940 + }, + { + "epoch": 3.77, + "grad_norm": 0.58984375, + "learning_rate": 0.0004808342490820504, + "loss": 0.2376, + "step": 90950 + }, + { + "epoch": 3.77, + "grad_norm": 0.77734375, + "learning_rate": 0.0004808300844118388, + "loss": 0.2055, + "step": 90960 + }, + { + "epoch": 3.77, + "grad_norm": 1.3203125, + "learning_rate": 0.00048082591930722983, + "loss": 0.1893, + "step": 90970 + }, + { + "epoch": 3.77, + "grad_norm": 0.7421875, + "learning_rate": 0.00048082175376823146, + "loss": 0.2799, + "step": 90980 + }, + { + "epoch": 3.77, + "grad_norm": 0.609375, + "learning_rate": 0.00048081758779485145, + "loss": 0.2576, + "step": 90990 + }, + { + "epoch": 3.77, + "grad_norm": 0.4140625, + "learning_rate": 0.0004808134213870977, + "loss": 0.1928, + "step": 91000 + }, + { + "epoch": 3.77, + "grad_norm": 0.90234375, + "learning_rate": 0.000480809254544978, + "loss": 0.2699, + "step": 91010 + }, + { + "epoch": 3.77, + "grad_norm": 0.58984375, + "learning_rate": 0.00048080508726850015, + "loss": 0.2487, + "step": 91020 + }, + { + "epoch": 3.77, + "grad_norm": 0.515625, + "learning_rate": 0.0004808009195576721, + "loss": 0.2355, + "step": 91030 + }, + { + "epoch": 3.77, + "grad_norm": 1.0, + "learning_rate": 0.0004807967514125017, + "loss": 0.2006, + "step": 91040 + }, + { + "epoch": 3.77, + "grad_norm": 0.162109375, + "learning_rate": 0.00048079258283299667, + "loss": 0.1931, + "step": 91050 + }, + { + "epoch": 3.77, + "grad_norm": 0.703125, + "learning_rate": 0.00048078841381916487, + "loss": 0.2018, + "step": 91060 + }, + { + "epoch": 3.77, + "grad_norm": 0.6484375, + "learning_rate": 0.0004807842443710143, + "loss": 0.1952, + "step": 91070 + }, + { + "epoch": 3.77, + "grad_norm": 0.33984375, + "learning_rate": 0.00048078007448855263, + "loss": 0.2079, + "step": 91080 + }, + { + "epoch": 3.77, + "grad_norm": 0.65234375, + "learning_rate": 0.00048077590417178786, + "loss": 0.2663, + "step": 91090 + }, + { + "epoch": 3.77, + "grad_norm": 1.171875, + "learning_rate": 0.00048077173342072764, + "loss": 0.2326, + "step": 91100 + }, + { + "epoch": 3.77, + "grad_norm": 0.68359375, + "learning_rate": 0.0004807675622353801, + "loss": 0.1673, + "step": 91110 + }, + { + "epoch": 3.77, + "grad_norm": 0.84375, + "learning_rate": 0.0004807633906157528, + "loss": 0.1556, + "step": 91120 + }, + { + "epoch": 3.77, + "grad_norm": 1.1953125, + "learning_rate": 0.00048075921856185376, + "loss": 0.2016, + "step": 91130 + }, + { + "epoch": 3.78, + "grad_norm": 0.50390625, + "learning_rate": 0.0004807550460736908, + "loss": 0.1799, + "step": 91140 + }, + { + "epoch": 3.78, + "grad_norm": 1.625, + "learning_rate": 0.0004807508731512718, + "loss": 0.1731, + "step": 91150 + }, + { + "epoch": 3.78, + "grad_norm": 0.8671875, + "learning_rate": 0.00048074669979460453, + "loss": 0.1698, + "step": 91160 + }, + { + "epoch": 3.78, + "grad_norm": 0.474609375, + "learning_rate": 0.00048074252600369686, + "loss": 0.176, + "step": 91170 + }, + { + "epoch": 3.78, + "grad_norm": 0.58984375, + "learning_rate": 0.00048073835177855673, + "loss": 0.1877, + "step": 91180 + }, + { + "epoch": 3.78, + "grad_norm": 0.8984375, + "learning_rate": 0.0004807341771191919, + "loss": 0.1914, + "step": 91190 + }, + { + "epoch": 3.78, + "grad_norm": 0.62890625, + "learning_rate": 0.0004807300020256102, + "loss": 0.2374, + "step": 91200 + }, + { + "epoch": 3.78, + "grad_norm": 1.078125, + "learning_rate": 0.0004807258264978197, + "loss": 0.225, + "step": 91210 + }, + { + "epoch": 3.78, + "grad_norm": 1.0625, + "learning_rate": 0.00048072165053582804, + "loss": 0.2214, + "step": 91220 + }, + { + "epoch": 3.78, + "grad_norm": 0.640625, + "learning_rate": 0.00048071747413964315, + "loss": 0.264, + "step": 91230 + }, + { + "epoch": 3.78, + "grad_norm": 0.388671875, + "learning_rate": 0.00048071329730927287, + "loss": 0.2253, + "step": 91240 + }, + { + "epoch": 3.78, + "grad_norm": 0.54296875, + "learning_rate": 0.0004807091200447251, + "loss": 0.1937, + "step": 91250 + }, + { + "epoch": 3.78, + "grad_norm": 0.671875, + "learning_rate": 0.0004807049423460077, + "loss": 0.2647, + "step": 91260 + }, + { + "epoch": 3.78, + "grad_norm": 0.404296875, + "learning_rate": 0.00048070076421312846, + "loss": 0.2325, + "step": 91270 + }, + { + "epoch": 3.78, + "grad_norm": 0.75, + "learning_rate": 0.0004806965856460954, + "loss": 0.2252, + "step": 91280 + }, + { + "epoch": 3.78, + "grad_norm": 0.7109375, + "learning_rate": 0.0004806924066449162, + "loss": 0.196, + "step": 91290 + }, + { + "epoch": 3.78, + "grad_norm": 0.3984375, + "learning_rate": 0.0004806882272095988, + "loss": 0.1979, + "step": 91300 + }, + { + "epoch": 3.78, + "grad_norm": 1.1640625, + "learning_rate": 0.00048068404734015105, + "loss": 0.1881, + "step": 91310 + }, + { + "epoch": 3.78, + "grad_norm": 0.828125, + "learning_rate": 0.0004806798670365809, + "loss": 0.2142, + "step": 91320 + }, + { + "epoch": 3.78, + "grad_norm": 0.484375, + "learning_rate": 0.00048067568629889615, + "loss": 0.2271, + "step": 91330 + }, + { + "epoch": 3.78, + "grad_norm": 0.640625, + "learning_rate": 0.00048067150512710454, + "loss": 0.197, + "step": 91340 + }, + { + "epoch": 3.78, + "grad_norm": 0.73046875, + "learning_rate": 0.0004806673235212142, + "loss": 0.2671, + "step": 91350 + }, + { + "epoch": 3.78, + "grad_norm": 0.546875, + "learning_rate": 0.00048066314148123286, + "loss": 0.243, + "step": 91360 + }, + { + "epoch": 3.78, + "grad_norm": 0.474609375, + "learning_rate": 0.0004806589590071684, + "loss": 0.2897, + "step": 91370 + }, + { + "epoch": 3.78, + "grad_norm": 0.671875, + "learning_rate": 0.00048065477609902864, + "loss": 0.2273, + "step": 91380 + }, + { + "epoch": 3.79, + "grad_norm": 0.6796875, + "learning_rate": 0.0004806505927568215, + "loss": 0.2425, + "step": 91390 + }, + { + "epoch": 3.79, + "grad_norm": 1.3203125, + "learning_rate": 0.00048064640898055487, + "loss": 0.2381, + "step": 91400 + }, + { + "epoch": 3.79, + "grad_norm": 0.88671875, + "learning_rate": 0.00048064222477023653, + "loss": 0.2375, + "step": 91410 + }, + { + "epoch": 3.79, + "grad_norm": 0.703125, + "learning_rate": 0.00048063804012587455, + "loss": 0.2383, + "step": 91420 + }, + { + "epoch": 3.79, + "grad_norm": 0.61328125, + "learning_rate": 0.0004806338550474766, + "loss": 0.2274, + "step": 91430 + }, + { + "epoch": 3.79, + "grad_norm": 0.65625, + "learning_rate": 0.0004806296695350507, + "loss": 0.239, + "step": 91440 + }, + { + "epoch": 3.79, + "grad_norm": 0.482421875, + "learning_rate": 0.0004806254835886047, + "loss": 0.1616, + "step": 91450 + }, + { + "epoch": 3.79, + "grad_norm": 0.8828125, + "learning_rate": 0.0004806212972081464, + "loss": 0.2353, + "step": 91460 + }, + { + "epoch": 3.79, + "grad_norm": 1.0703125, + "learning_rate": 0.00048061711039368375, + "loss": 0.1752, + "step": 91470 + }, + { + "epoch": 3.79, + "grad_norm": 0.65625, + "learning_rate": 0.00048061292314522454, + "loss": 0.2342, + "step": 91480 + }, + { + "epoch": 3.79, + "grad_norm": 0.4609375, + "learning_rate": 0.0004806087354627767, + "loss": 0.2441, + "step": 91490 + }, + { + "epoch": 3.79, + "grad_norm": 0.77734375, + "learning_rate": 0.0004806045473463482, + "loss": 0.2049, + "step": 91500 + }, + { + "epoch": 3.79, + "grad_norm": 0.55859375, + "learning_rate": 0.00048060035879594676, + "loss": 0.2009, + "step": 91510 + }, + { + "epoch": 3.79, + "grad_norm": 0.9375, + "learning_rate": 0.00048059616981158045, + "loss": 0.2022, + "step": 91520 + }, + { + "epoch": 3.79, + "grad_norm": 0.875, + "learning_rate": 0.000480591980393257, + "loss": 0.2449, + "step": 91530 + }, + { + "epoch": 3.79, + "grad_norm": 0.494140625, + "learning_rate": 0.00048058779054098433, + "loss": 0.1413, + "step": 91540 + }, + { + "epoch": 3.79, + "grad_norm": 0.51171875, + "learning_rate": 0.00048058360025477034, + "loss": 0.202, + "step": 91550 + }, + { + "epoch": 3.79, + "grad_norm": 1.9921875, + "learning_rate": 0.000480579409534623, + "loss": 0.2013, + "step": 91560 + }, + { + "epoch": 3.79, + "grad_norm": 0.34765625, + "learning_rate": 0.00048057521838055, + "loss": 0.1891, + "step": 91570 + }, + { + "epoch": 3.79, + "grad_norm": 0.671875, + "learning_rate": 0.0004805710267925594, + "loss": 0.2499, + "step": 91580 + }, + { + "epoch": 3.79, + "grad_norm": 0.99609375, + "learning_rate": 0.000480566834770659, + "loss": 0.2193, + "step": 91590 + }, + { + "epoch": 3.79, + "grad_norm": 0.77734375, + "learning_rate": 0.0004805626423148567, + "loss": 0.2063, + "step": 91600 + }, + { + "epoch": 3.79, + "grad_norm": 0.58203125, + "learning_rate": 0.0004805584494251604, + "loss": 0.1933, + "step": 91610 + }, + { + "epoch": 3.79, + "grad_norm": 0.5234375, + "learning_rate": 0.00048055425610157805, + "loss": 0.1794, + "step": 91620 + }, + { + "epoch": 3.8, + "grad_norm": 0.03515625, + "learning_rate": 0.00048055006234411744, + "loss": 0.2285, + "step": 91630 + }, + { + "epoch": 3.8, + "grad_norm": 1.03125, + "learning_rate": 0.00048054586815278656, + "loss": 0.2552, + "step": 91640 + }, + { + "epoch": 3.8, + "grad_norm": 0.0, + "learning_rate": 0.0004805416735275932, + "loss": 0.21, + "step": 91650 + }, + { + "epoch": 3.8, + "grad_norm": 0.52734375, + "learning_rate": 0.00048053747846854534, + "loss": 0.1555, + "step": 91660 + }, + { + "epoch": 3.8, + "grad_norm": 0.65625, + "learning_rate": 0.00048053328297565083, + "loss": 0.2524, + "step": 91670 + }, + { + "epoch": 3.8, + "grad_norm": 2.03125, + "learning_rate": 0.0004805290870489176, + "loss": 0.1937, + "step": 91680 + }, + { + "epoch": 3.8, + "grad_norm": 0.55859375, + "learning_rate": 0.0004805248906883535, + "loss": 0.1495, + "step": 91690 + }, + { + "epoch": 3.8, + "grad_norm": 1.2109375, + "learning_rate": 0.00048052069389396644, + "loss": 0.2001, + "step": 91700 + }, + { + "epoch": 3.8, + "grad_norm": 0.6796875, + "learning_rate": 0.00048051649666576435, + "loss": 0.1219, + "step": 91710 + }, + { + "epoch": 3.8, + "grad_norm": 0.546875, + "learning_rate": 0.00048051229900375513, + "loss": 0.2695, + "step": 91720 + }, + { + "epoch": 3.8, + "grad_norm": 1.3203125, + "learning_rate": 0.0004805081009079466, + "loss": 0.1929, + "step": 91730 + }, + { + "epoch": 3.8, + "grad_norm": 0.478515625, + "learning_rate": 0.00048050390237834676, + "loss": 0.2143, + "step": 91740 + }, + { + "epoch": 3.8, + "grad_norm": 0.59375, + "learning_rate": 0.0004804997034149634, + "loss": 0.2081, + "step": 91750 + }, + { + "epoch": 3.8, + "grad_norm": 1.1171875, + "learning_rate": 0.0004804955040178046, + "loss": 0.2531, + "step": 91760 + }, + { + "epoch": 3.8, + "grad_norm": 0.96484375, + "learning_rate": 0.00048049130418687804, + "loss": 0.2407, + "step": 91770 + }, + { + "epoch": 3.8, + "grad_norm": 1.5390625, + "learning_rate": 0.0004804871039221918, + "loss": 0.2512, + "step": 91780 + }, + { + "epoch": 3.8, + "grad_norm": 0.55078125, + "learning_rate": 0.0004804829032237537, + "loss": 0.1919, + "step": 91790 + }, + { + "epoch": 3.8, + "grad_norm": 0.578125, + "learning_rate": 0.00048047870209157173, + "loss": 0.2077, + "step": 91800 + }, + { + "epoch": 3.8, + "grad_norm": 1.0, + "learning_rate": 0.0004804745005256537, + "loss": 0.2533, + "step": 91810 + }, + { + "epoch": 3.8, + "grad_norm": 1.3984375, + "learning_rate": 0.0004804702985260075, + "loss": 0.2075, + "step": 91820 + }, + { + "epoch": 3.8, + "grad_norm": 0.62890625, + "learning_rate": 0.0004804660960926411, + "loss": 0.1778, + "step": 91830 + }, + { + "epoch": 3.8, + "grad_norm": 1.9609375, + "learning_rate": 0.0004804618932255624, + "loss": 0.2141, + "step": 91840 + }, + { + "epoch": 3.8, + "grad_norm": 0.3203125, + "learning_rate": 0.00048045768992477936, + "loss": 0.2026, + "step": 91850 + }, + { + "epoch": 3.8, + "grad_norm": 0.87109375, + "learning_rate": 0.0004804534861902997, + "loss": 0.27, + "step": 91860 + }, + { + "epoch": 3.81, + "grad_norm": 0.87890625, + "learning_rate": 0.00048044928202213154, + "loss": 0.2054, + "step": 91870 + }, + { + "epoch": 3.81, + "grad_norm": 0.578125, + "learning_rate": 0.00048044507742028283, + "loss": 0.2195, + "step": 91880 + }, + { + "epoch": 3.81, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004804408723847613, + "loss": 0.2179, + "step": 91890 + }, + { + "epoch": 3.81, + "grad_norm": 1.3828125, + "learning_rate": 0.00048043666691557484, + "loss": 0.2024, + "step": 91900 + }, + { + "epoch": 3.81, + "grad_norm": 0.69140625, + "learning_rate": 0.0004804324610127315, + "loss": 0.2222, + "step": 91910 + }, + { + "epoch": 3.81, + "grad_norm": 0.62890625, + "learning_rate": 0.00048042825467623917, + "loss": 0.1599, + "step": 91920 + }, + { + "epoch": 3.81, + "grad_norm": 0.61328125, + "learning_rate": 0.0004804240479061058, + "loss": 0.2068, + "step": 91930 + }, + { + "epoch": 3.81, + "grad_norm": 0.98828125, + "learning_rate": 0.00048041984070233923, + "loss": 0.1952, + "step": 91940 + }, + { + "epoch": 3.81, + "grad_norm": 0.64453125, + "learning_rate": 0.0004804156330649474, + "loss": 0.206, + "step": 91950 + }, + { + "epoch": 3.81, + "grad_norm": 0.7890625, + "learning_rate": 0.0004804114249939382, + "loss": 0.1522, + "step": 91960 + }, + { + "epoch": 3.81, + "grad_norm": 0.6015625, + "learning_rate": 0.0004804072164893196, + "loss": 0.1992, + "step": 91970 + }, + { + "epoch": 3.81, + "grad_norm": 0.7109375, + "learning_rate": 0.00048040300755109946, + "loss": 0.1977, + "step": 91980 + }, + { + "epoch": 3.81, + "grad_norm": 0.7421875, + "learning_rate": 0.0004803987981792858, + "loss": 0.2123, + "step": 91990 + }, + { + "epoch": 3.81, + "grad_norm": 0.58203125, + "learning_rate": 0.00048039458837388643, + "loss": 0.1885, + "step": 92000 + }, + { + "epoch": 3.81, + "grad_norm": 1.140625, + "learning_rate": 0.00048039037813490937, + "loss": 0.1966, + "step": 92010 + }, + { + "epoch": 3.81, + "grad_norm": 1.3984375, + "learning_rate": 0.0004803861674623625, + "loss": 0.2115, + "step": 92020 + }, + { + "epoch": 3.81, + "grad_norm": 0.34375, + "learning_rate": 0.0004803819563562537, + "loss": 0.2138, + "step": 92030 + }, + { + "epoch": 3.81, + "grad_norm": 0.51171875, + "learning_rate": 0.00048037774481659104, + "loss": 0.2004, + "step": 92040 + }, + { + "epoch": 3.81, + "grad_norm": 0.9453125, + "learning_rate": 0.00048037353284338224, + "loss": 0.2064, + "step": 92050 + }, + { + "epoch": 3.81, + "grad_norm": 0.26953125, + "learning_rate": 0.00048036932043663536, + "loss": 0.2317, + "step": 92060 + }, + { + "epoch": 3.81, + "grad_norm": 0.9765625, + "learning_rate": 0.00048036510759635827, + "loss": 0.2164, + "step": 92070 + }, + { + "epoch": 3.81, + "grad_norm": 0.8046875, + "learning_rate": 0.0004803608943225589, + "loss": 0.1406, + "step": 92080 + }, + { + "epoch": 3.81, + "grad_norm": 1.0390625, + "learning_rate": 0.0004803566806152453, + "loss": 0.2272, + "step": 92090 + }, + { + "epoch": 3.81, + "grad_norm": 0.52734375, + "learning_rate": 0.00048035246647442525, + "loss": 0.2313, + "step": 92100 + }, + { + "epoch": 3.82, + "grad_norm": 0.3515625, + "learning_rate": 0.00048034825190010675, + "loss": 0.2204, + "step": 92110 + }, + { + "epoch": 3.82, + "grad_norm": 1.015625, + "learning_rate": 0.00048034403689229766, + "loss": 0.2399, + "step": 92120 + }, + { + "epoch": 3.82, + "grad_norm": 0.8359375, + "learning_rate": 0.00048033982145100605, + "loss": 0.2131, + "step": 92130 + }, + { + "epoch": 3.82, + "grad_norm": 1.0078125, + "learning_rate": 0.00048033560557623974, + "loss": 0.1901, + "step": 92140 + }, + { + "epoch": 3.82, + "grad_norm": 0.734375, + "learning_rate": 0.0004803313892680067, + "loss": 0.2889, + "step": 92150 + }, + { + "epoch": 3.82, + "grad_norm": 0.390625, + "learning_rate": 0.00048032717252631486, + "loss": 0.2183, + "step": 92160 + }, + { + "epoch": 3.82, + "grad_norm": 0.66015625, + "learning_rate": 0.0004803229553511721, + "loss": 0.2715, + "step": 92170 + }, + { + "epoch": 3.82, + "grad_norm": 0.421875, + "learning_rate": 0.0004803187377425865, + "loss": 0.2135, + "step": 92180 + }, + { + "epoch": 3.82, + "grad_norm": 1.2265625, + "learning_rate": 0.0004803145197005658, + "loss": 0.2381, + "step": 92190 + }, + { + "epoch": 3.82, + "grad_norm": 1.390625, + "learning_rate": 0.0004803103012251181, + "loss": 0.2344, + "step": 92200 + }, + { + "epoch": 3.82, + "grad_norm": 0.51953125, + "learning_rate": 0.00048030608231625126, + "loss": 0.2201, + "step": 92210 + }, + { + "epoch": 3.82, + "grad_norm": 0.515625, + "learning_rate": 0.00048030186297397335, + "loss": 0.2249, + "step": 92220 + }, + { + "epoch": 3.82, + "grad_norm": 0.57421875, + "learning_rate": 0.00048029764319829207, + "loss": 0.226, + "step": 92230 + }, + { + "epoch": 3.82, + "grad_norm": 0.765625, + "learning_rate": 0.00048029342298921556, + "loss": 0.3081, + "step": 92240 + }, + { + "epoch": 3.82, + "grad_norm": 0.4609375, + "learning_rate": 0.00048028920234675167, + "loss": 0.1992, + "step": 92250 + }, + { + "epoch": 3.82, + "grad_norm": 0.765625, + "learning_rate": 0.0004802849812709085, + "loss": 0.1949, + "step": 92260 + }, + { + "epoch": 3.82, + "grad_norm": 0.62109375, + "learning_rate": 0.00048028075976169375, + "loss": 0.2358, + "step": 92270 + }, + { + "epoch": 3.82, + "grad_norm": 0.828125, + "learning_rate": 0.0004802765378191154, + "loss": 0.1759, + "step": 92280 + }, + { + "epoch": 3.82, + "grad_norm": 0.5859375, + "learning_rate": 0.00048027231544318157, + "loss": 0.2065, + "step": 92290 + }, + { + "epoch": 3.82, + "grad_norm": 2.734375, + "learning_rate": 0.0004802680926339001, + "loss": 0.1797, + "step": 92300 + }, + { + "epoch": 3.82, + "grad_norm": 0.88671875, + "learning_rate": 0.000480263869391279, + "loss": 0.1855, + "step": 92310 + }, + { + "epoch": 3.82, + "grad_norm": 0.376953125, + "learning_rate": 0.00048025964571532607, + "loss": 0.2055, + "step": 92320 + }, + { + "epoch": 3.82, + "grad_norm": 0.455078125, + "learning_rate": 0.0004802554216060494, + "loss": 0.2556, + "step": 92330 + }, + { + "epoch": 3.82, + "grad_norm": 0.609375, + "learning_rate": 0.00048025119706345687, + "loss": 0.1499, + "step": 92340 + }, + { + "epoch": 3.83, + "grad_norm": 0.8203125, + "learning_rate": 0.00048024697208755643, + "loss": 0.2268, + "step": 92350 + }, + { + "epoch": 3.83, + "grad_norm": 1.890625, + "learning_rate": 0.0004802427466783561, + "loss": 0.2023, + "step": 92360 + }, + { + "epoch": 3.83, + "grad_norm": 0.72265625, + "learning_rate": 0.00048023852083586377, + "loss": 0.2753, + "step": 92370 + }, + { + "epoch": 3.83, + "grad_norm": 0.89453125, + "learning_rate": 0.0004802342945600874, + "loss": 0.226, + "step": 92380 + }, + { + "epoch": 3.83, + "grad_norm": 0.80078125, + "learning_rate": 0.00048023006785103496, + "loss": 0.2508, + "step": 92390 + }, + { + "epoch": 3.83, + "grad_norm": 0.58984375, + "learning_rate": 0.00048022584070871443, + "loss": 0.2184, + "step": 92400 + }, + { + "epoch": 3.83, + "grad_norm": 0.0, + "learning_rate": 0.0004802216131331337, + "loss": 0.2519, + "step": 92410 + }, + { + "epoch": 3.83, + "grad_norm": 0.64453125, + "learning_rate": 0.0004802173851243007, + "loss": 0.2244, + "step": 92420 + }, + { + "epoch": 3.83, + "grad_norm": 0.427734375, + "learning_rate": 0.0004802131566822235, + "loss": 0.191, + "step": 92430 + }, + { + "epoch": 3.83, + "grad_norm": 0.44140625, + "learning_rate": 0.00048020892780691, + "loss": 0.2137, + "step": 92440 + }, + { + "epoch": 3.83, + "grad_norm": 0.75390625, + "learning_rate": 0.0004802046984983681, + "loss": 0.2166, + "step": 92450 + }, + { + "epoch": 3.83, + "grad_norm": 1.953125, + "learning_rate": 0.0004802004687566059, + "loss": 0.2379, + "step": 92460 + }, + { + "epoch": 3.83, + "grad_norm": 0.5625, + "learning_rate": 0.0004801962385816313, + "loss": 0.1984, + "step": 92470 + }, + { + "epoch": 3.83, + "grad_norm": 0.671875, + "learning_rate": 0.0004801920079734522, + "loss": 0.2794, + "step": 92480 + }, + { + "epoch": 3.83, + "grad_norm": 0.287109375, + "learning_rate": 0.00048018777693207654, + "loss": 0.2247, + "step": 92490 + }, + { + "epoch": 3.83, + "grad_norm": 1.0234375, + "learning_rate": 0.0004801835454575124, + "loss": 0.2452, + "step": 92500 + }, + { + "epoch": 3.83, + "grad_norm": 0.4921875, + "learning_rate": 0.00048017931354976765, + "loss": 0.2306, + "step": 92510 + }, + { + "epoch": 3.83, + "grad_norm": 1.53125, + "learning_rate": 0.00048017508120885036, + "loss": 0.2411, + "step": 92520 + }, + { + "epoch": 3.83, + "grad_norm": 0.369140625, + "learning_rate": 0.00048017084843476833, + "loss": 0.2657, + "step": 92530 + }, + { + "epoch": 3.83, + "grad_norm": 0.287109375, + "learning_rate": 0.0004801666152275297, + "loss": 0.2008, + "step": 92540 + }, + { + "epoch": 3.83, + "grad_norm": 0.87109375, + "learning_rate": 0.0004801623815871423, + "loss": 0.2029, + "step": 92550 + }, + { + "epoch": 3.83, + "grad_norm": 0.69921875, + "learning_rate": 0.0004801581475136142, + "loss": 0.1908, + "step": 92560 + }, + { + "epoch": 3.83, + "grad_norm": 1.5078125, + "learning_rate": 0.0004801539130069532, + "loss": 0.1476, + "step": 92570 + }, + { + "epoch": 3.83, + "grad_norm": 0.74609375, + "learning_rate": 0.00048014967806716755, + "loss": 0.2475, + "step": 92580 + }, + { + "epoch": 3.84, + "grad_norm": 0.1435546875, + "learning_rate": 0.00048014544269426494, + "loss": 0.1491, + "step": 92590 + }, + { + "epoch": 3.84, + "grad_norm": 0.5078125, + "learning_rate": 0.00048014120688825355, + "loss": 0.1877, + "step": 92600 + }, + { + "epoch": 3.84, + "grad_norm": 0.8984375, + "learning_rate": 0.0004801369706491412, + "loss": 0.1924, + "step": 92610 + }, + { + "epoch": 3.84, + "grad_norm": 0.73046875, + "learning_rate": 0.00048013273397693595, + "loss": 0.2351, + "step": 92620 + }, + { + "epoch": 3.84, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004801284968716458, + "loss": 0.2011, + "step": 92630 + }, + { + "epoch": 3.84, + "grad_norm": 0.36328125, + "learning_rate": 0.00048012425933327866, + "loss": 0.2398, + "step": 92640 + }, + { + "epoch": 3.84, + "grad_norm": 0.7109375, + "learning_rate": 0.0004801200213618424, + "loss": 0.2021, + "step": 92650 + }, + { + "epoch": 3.84, + "grad_norm": 0.70703125, + "learning_rate": 0.00048011578295734515, + "loss": 0.1872, + "step": 92660 + }, + { + "epoch": 3.84, + "grad_norm": 0.6640625, + "learning_rate": 0.00048011154411979493, + "loss": 0.2094, + "step": 92670 + }, + { + "epoch": 3.84, + "grad_norm": 2.03125, + "learning_rate": 0.00048010730484919956, + "loss": 0.2161, + "step": 92680 + }, + { + "epoch": 3.84, + "grad_norm": 0.455078125, + "learning_rate": 0.0004801030651455671, + "loss": 0.21, + "step": 92690 + }, + { + "epoch": 3.84, + "grad_norm": 0.47265625, + "learning_rate": 0.0004800988250089056, + "loss": 0.2317, + "step": 92700 + }, + { + "epoch": 3.84, + "grad_norm": 0.63671875, + "learning_rate": 0.00048009458443922285, + "loss": 0.2147, + "step": 92710 + }, + { + "epoch": 3.84, + "grad_norm": 0.7734375, + "learning_rate": 0.000480090343436527, + "loss": 0.2668, + "step": 92720 + }, + { + "epoch": 3.84, + "grad_norm": 0.7890625, + "learning_rate": 0.00048008610200082593, + "loss": 0.217, + "step": 92730 + }, + { + "epoch": 3.84, + "grad_norm": 0.294921875, + "learning_rate": 0.0004800818601321277, + "loss": 0.1825, + "step": 92740 + }, + { + "epoch": 3.84, + "grad_norm": 0.9765625, + "learning_rate": 0.0004800776178304402, + "loss": 0.1882, + "step": 92750 + }, + { + "epoch": 3.84, + "grad_norm": 0.49609375, + "learning_rate": 0.0004800733750957715, + "loss": 0.1986, + "step": 92760 + }, + { + "epoch": 3.84, + "grad_norm": 0.66015625, + "learning_rate": 0.0004800691319281296, + "loss": 0.1935, + "step": 92770 + }, + { + "epoch": 3.84, + "grad_norm": 0.671875, + "learning_rate": 0.0004800648883275225, + "loss": 0.1991, + "step": 92780 + }, + { + "epoch": 3.84, + "grad_norm": 0.73828125, + "learning_rate": 0.000480060644293958, + "loss": 0.2363, + "step": 92790 + }, + { + "epoch": 3.84, + "grad_norm": 1.5546875, + "learning_rate": 0.0004800563998274442, + "loss": 0.2162, + "step": 92800 + }, + { + "epoch": 3.84, + "grad_norm": 0.79296875, + "learning_rate": 0.00048005215492798913, + "loss": 0.1946, + "step": 92810 + }, + { + "epoch": 3.84, + "grad_norm": 1.078125, + "learning_rate": 0.0004800479095956009, + "loss": 0.2433, + "step": 92820 + }, + { + "epoch": 3.85, + "grad_norm": 1.0078125, + "learning_rate": 0.00048004366383028706, + "loss": 0.2078, + "step": 92830 + }, + { + "epoch": 3.85, + "grad_norm": 0.44921875, + "learning_rate": 0.0004800394176320561, + "loss": 0.2277, + "step": 92840 + }, + { + "epoch": 3.85, + "grad_norm": 0.296875, + "learning_rate": 0.00048003517100091576, + "loss": 0.2184, + "step": 92850 + }, + { + "epoch": 3.85, + "grad_norm": 0.32421875, + "learning_rate": 0.00048003092393687405, + "loss": 0.2352, + "step": 92860 + }, + { + "epoch": 3.85, + "grad_norm": 1.296875, + "learning_rate": 0.00048002667643993894, + "loss": 0.2233, + "step": 92870 + }, + { + "epoch": 3.85, + "grad_norm": 1.5234375, + "learning_rate": 0.00048002242851011854, + "loss": 0.2273, + "step": 92880 + }, + { + "epoch": 3.85, + "grad_norm": 1.015625, + "learning_rate": 0.0004800181801474207, + "loss": 0.2207, + "step": 92890 + }, + { + "epoch": 3.85, + "grad_norm": 0.279296875, + "learning_rate": 0.00048001393135185355, + "loss": 0.2661, + "step": 92900 + }, + { + "epoch": 3.85, + "grad_norm": 0.734375, + "learning_rate": 0.000480009682123425, + "loss": 0.2188, + "step": 92910 + }, + { + "epoch": 3.85, + "grad_norm": 0.7265625, + "learning_rate": 0.000480005432462143, + "loss": 0.2231, + "step": 92920 + }, + { + "epoch": 3.85, + "grad_norm": 0.1982421875, + "learning_rate": 0.00048000118236801574, + "loss": 0.1828, + "step": 92930 + }, + { + "epoch": 3.85, + "grad_norm": 1.9765625, + "learning_rate": 0.000479996931841051, + "loss": 0.2213, + "step": 92940 + }, + { + "epoch": 3.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00047999268088125686, + "loss": 0.2177, + "step": 92950 + }, + { + "epoch": 3.85, + "grad_norm": 0.20703125, + "learning_rate": 0.0004799884294886414, + "loss": 0.2073, + "step": 92960 + }, + { + "epoch": 3.85, + "grad_norm": 0.462890625, + "learning_rate": 0.00047998417766321246, + "loss": 0.205, + "step": 92970 + }, + { + "epoch": 3.85, + "grad_norm": 0.57421875, + "learning_rate": 0.00047997992540497824, + "loss": 0.2127, + "step": 92980 + }, + { + "epoch": 3.85, + "grad_norm": 0.50390625, + "learning_rate": 0.0004799756727139466, + "loss": 0.264, + "step": 92990 + }, + { + "epoch": 3.85, + "grad_norm": 1.0390625, + "learning_rate": 0.00047997141959012555, + "loss": 0.2217, + "step": 93000 + }, + { + "epoch": 3.85, + "grad_norm": 1.09375, + "learning_rate": 0.00047996716603352307, + "loss": 0.2117, + "step": 93010 + }, + { + "epoch": 3.85, + "grad_norm": 0.93359375, + "learning_rate": 0.0004799629120441473, + "loss": 0.2431, + "step": 93020 + }, + { + "epoch": 3.85, + "grad_norm": 0.73828125, + "learning_rate": 0.00047995865762200617, + "loss": 0.2372, + "step": 93030 + }, + { + "epoch": 3.85, + "grad_norm": 0.7890625, + "learning_rate": 0.0004799544027671077, + "loss": 0.2326, + "step": 93040 + }, + { + "epoch": 3.85, + "grad_norm": 0.1748046875, + "learning_rate": 0.00047995014747945983, + "loss": 0.2512, + "step": 93050 + }, + { + "epoch": 3.85, + "grad_norm": 0.58984375, + "learning_rate": 0.0004799458917590707, + "loss": 0.1837, + "step": 93060 + }, + { + "epoch": 3.85, + "grad_norm": 0.96484375, + "learning_rate": 0.00047994163560594803, + "loss": 0.1915, + "step": 93070 + }, + { + "epoch": 3.86, + "grad_norm": 0.51171875, + "learning_rate": 0.0004799373790201002, + "loss": 0.2089, + "step": 93080 + }, + { + "epoch": 3.86, + "grad_norm": 0.4609375, + "learning_rate": 0.00047993312200153506, + "loss": 0.2223, + "step": 93090 + }, + { + "epoch": 3.86, + "grad_norm": 0.48828125, + "learning_rate": 0.0004799288645502605, + "loss": 0.2428, + "step": 93100 + }, + { + "epoch": 3.86, + "grad_norm": 0.310546875, + "learning_rate": 0.0004799246066662848, + "loss": 0.2323, + "step": 93110 + }, + { + "epoch": 3.86, + "grad_norm": 0.259765625, + "learning_rate": 0.0004799203483496157, + "loss": 0.2356, + "step": 93120 + }, + { + "epoch": 3.86, + "grad_norm": 0.4609375, + "learning_rate": 0.00047991608960026134, + "loss": 0.2018, + "step": 93130 + }, + { + "epoch": 3.86, + "grad_norm": 0.6640625, + "learning_rate": 0.00047991183041822975, + "loss": 0.1885, + "step": 93140 + }, + { + "epoch": 3.86, + "grad_norm": 0.8515625, + "learning_rate": 0.00047990757080352897, + "loss": 0.1476, + "step": 93150 + }, + { + "epoch": 3.86, + "grad_norm": 0.8828125, + "learning_rate": 0.0004799033107561669, + "loss": 0.2335, + "step": 93160 + }, + { + "epoch": 3.86, + "grad_norm": 0.373046875, + "learning_rate": 0.00047989905027615167, + "loss": 0.2289, + "step": 93170 + }, + { + "epoch": 3.86, + "grad_norm": 0.6171875, + "learning_rate": 0.00047989478936349126, + "loss": 0.2142, + "step": 93180 + }, + { + "epoch": 3.86, + "grad_norm": 1.0234375, + "learning_rate": 0.00047989052801819364, + "loss": 0.2076, + "step": 93190 + }, + { + "epoch": 3.86, + "grad_norm": 0.671875, + "learning_rate": 0.00047988626624026687, + "loss": 0.2248, + "step": 93200 + }, + { + "epoch": 3.86, + "grad_norm": 0.66015625, + "learning_rate": 0.00047988200402971905, + "loss": 0.1858, + "step": 93210 + }, + { + "epoch": 3.86, + "grad_norm": 0.0, + "learning_rate": 0.000479877741386558, + "loss": 0.1655, + "step": 93220 + }, + { + "epoch": 3.86, + "grad_norm": 0.51171875, + "learning_rate": 0.000479873478310792, + "loss": 0.228, + "step": 93230 + }, + { + "epoch": 3.86, + "grad_norm": 0.6015625, + "learning_rate": 0.00047986921480242884, + "loss": 0.1817, + "step": 93240 + }, + { + "epoch": 3.86, + "grad_norm": 0.15625, + "learning_rate": 0.0004798649508614767, + "loss": 0.2103, + "step": 93250 + }, + { + "epoch": 3.86, + "grad_norm": 0.82421875, + "learning_rate": 0.00047986068648794356, + "loss": 0.1628, + "step": 93260 + }, + { + "epoch": 3.86, + "grad_norm": 0.310546875, + "learning_rate": 0.0004798564216818374, + "loss": 0.2155, + "step": 93270 + }, + { + "epoch": 3.86, + "grad_norm": 1.375, + "learning_rate": 0.00047985215644316627, + "loss": 0.2557, + "step": 93280 + }, + { + "epoch": 3.86, + "grad_norm": 0.94140625, + "learning_rate": 0.0004798478907719382, + "loss": 0.1992, + "step": 93290 + }, + { + "epoch": 3.86, + "grad_norm": 0.6953125, + "learning_rate": 0.00047984362466816125, + "loss": 0.1969, + "step": 93300 + }, + { + "epoch": 3.86, + "grad_norm": 0.435546875, + "learning_rate": 0.0004798393581318433, + "loss": 0.296, + "step": 93310 + }, + { + "epoch": 3.87, + "grad_norm": 1.8203125, + "learning_rate": 0.00047983509116299264, + "loss": 0.1984, + "step": 93320 + }, + { + "epoch": 3.87, + "grad_norm": 0.49609375, + "learning_rate": 0.0004798308237616171, + "loss": 0.1643, + "step": 93330 + }, + { + "epoch": 3.87, + "grad_norm": 0.298828125, + "learning_rate": 0.00047982655592772484, + "loss": 0.2222, + "step": 93340 + }, + { + "epoch": 3.87, + "grad_norm": 0.703125, + "learning_rate": 0.0004798222876613237, + "loss": 0.2176, + "step": 93350 + }, + { + "epoch": 3.87, + "grad_norm": 0.6015625, + "learning_rate": 0.00047981801896242193, + "loss": 0.2266, + "step": 93360 + }, + { + "epoch": 3.87, + "grad_norm": 0.365234375, + "learning_rate": 0.00047981374983102745, + "loss": 0.2057, + "step": 93370 + }, + { + "epoch": 3.87, + "grad_norm": 0.87890625, + "learning_rate": 0.00047980948026714825, + "loss": 0.2252, + "step": 93380 + }, + { + "epoch": 3.87, + "grad_norm": 0.486328125, + "learning_rate": 0.0004798052102707925, + "loss": 0.2391, + "step": 93390 + }, + { + "epoch": 3.87, + "grad_norm": 1.1953125, + "learning_rate": 0.00047980093984196815, + "loss": 0.2326, + "step": 93400 + }, + { + "epoch": 3.87, + "grad_norm": 0.6875, + "learning_rate": 0.0004797966689806832, + "loss": 0.2464, + "step": 93410 + }, + { + "epoch": 3.87, + "grad_norm": 0.86328125, + "learning_rate": 0.0004797923976869457, + "loss": 0.1907, + "step": 93420 + }, + { + "epoch": 3.87, + "grad_norm": 0.494140625, + "learning_rate": 0.00047978812596076383, + "loss": 0.1357, + "step": 93430 + }, + { + "epoch": 3.87, + "grad_norm": 1.8671875, + "learning_rate": 0.00047978385380214553, + "loss": 0.2976, + "step": 93440 + }, + { + "epoch": 3.87, + "grad_norm": 0.5546875, + "learning_rate": 0.0004797795812110988, + "loss": 0.1923, + "step": 93450 + }, + { + "epoch": 3.87, + "grad_norm": 0.73046875, + "learning_rate": 0.0004797753081876316, + "loss": 0.2255, + "step": 93460 + }, + { + "epoch": 3.87, + "grad_norm": 0.51171875, + "learning_rate": 0.00047977103473175224, + "loss": 0.186, + "step": 93470 + }, + { + "epoch": 3.87, + "grad_norm": 0.478515625, + "learning_rate": 0.0004797667608434685, + "loss": 0.1452, + "step": 93480 + }, + { + "epoch": 3.87, + "grad_norm": 0.93359375, + "learning_rate": 0.00047976248652278853, + "loss": 0.239, + "step": 93490 + }, + { + "epoch": 3.87, + "grad_norm": 0.88671875, + "learning_rate": 0.00047975821176972046, + "loss": 0.2424, + "step": 93500 + }, + { + "epoch": 3.87, + "grad_norm": 0.69140625, + "learning_rate": 0.0004797539365842722, + "loss": 0.1844, + "step": 93510 + }, + { + "epoch": 3.87, + "grad_norm": 0.5234375, + "learning_rate": 0.0004797496609664518, + "loss": 0.2083, + "step": 93520 + }, + { + "epoch": 3.87, + "grad_norm": 0.61328125, + "learning_rate": 0.0004797453849162674, + "loss": 0.2401, + "step": 93530 + }, + { + "epoch": 3.87, + "grad_norm": 0.69140625, + "learning_rate": 0.000479741108433727, + "loss": 0.2085, + "step": 93540 + }, + { + "epoch": 3.87, + "grad_norm": 0.56640625, + "learning_rate": 0.0004797368315188386, + "loss": 0.2105, + "step": 93550 + }, + { + "epoch": 3.88, + "grad_norm": 0.84765625, + "learning_rate": 0.0004797325541716103, + "loss": 0.1706, + "step": 93560 + }, + { + "epoch": 3.88, + "grad_norm": 0.2197265625, + "learning_rate": 0.0004797282763920502, + "loss": 0.1647, + "step": 93570 + }, + { + "epoch": 3.88, + "grad_norm": 1.3046875, + "learning_rate": 0.0004797239981801663, + "loss": 0.1923, + "step": 93580 + }, + { + "epoch": 3.88, + "grad_norm": 0.478515625, + "learning_rate": 0.0004797197195359666, + "loss": 0.2025, + "step": 93590 + }, + { + "epoch": 3.88, + "grad_norm": 1.8125, + "learning_rate": 0.00047971544045945913, + "loss": 0.195, + "step": 93600 + }, + { + "epoch": 3.88, + "grad_norm": 0.65234375, + "learning_rate": 0.0004797111609506521, + "loss": 0.2228, + "step": 93610 + }, + { + "epoch": 3.88, + "grad_norm": 0.82421875, + "learning_rate": 0.00047970688100955344, + "loss": 0.149, + "step": 93620 + }, + { + "epoch": 3.88, + "grad_norm": 0.345703125, + "learning_rate": 0.00047970260063617126, + "loss": 0.1841, + "step": 93630 + }, + { + "epoch": 3.88, + "grad_norm": 0.5234375, + "learning_rate": 0.0004796983198305136, + "loss": 0.2262, + "step": 93640 + }, + { + "epoch": 3.88, + "grad_norm": 1.09375, + "learning_rate": 0.0004796940385925884, + "loss": 0.1705, + "step": 93650 + }, + { + "epoch": 3.88, + "grad_norm": 0.7890625, + "learning_rate": 0.000479689756922404, + "loss": 0.1382, + "step": 93660 + }, + { + "epoch": 3.88, + "grad_norm": 1.65625, + "learning_rate": 0.00047968547481996816, + "loss": 0.2347, + "step": 93670 + }, + { + "epoch": 3.88, + "grad_norm": 1.03125, + "learning_rate": 0.00047968119228528907, + "loss": 0.2359, + "step": 93680 + }, + { + "epoch": 3.88, + "grad_norm": 0.1650390625, + "learning_rate": 0.00047967690931837484, + "loss": 0.252, + "step": 93690 + }, + { + "epoch": 3.88, + "grad_norm": 1.40625, + "learning_rate": 0.0004796726259192334, + "loss": 0.3156, + "step": 93700 + }, + { + "epoch": 3.88, + "grad_norm": 1.4765625, + "learning_rate": 0.00047966834208787294, + "loss": 0.2031, + "step": 93710 + }, + { + "epoch": 3.88, + "grad_norm": 0.88671875, + "learning_rate": 0.00047966405782430137, + "loss": 0.1755, + "step": 93720 + }, + { + "epoch": 3.88, + "grad_norm": 0.625, + "learning_rate": 0.0004796597731285269, + "loss": 0.1882, + "step": 93730 + }, + { + "epoch": 3.88, + "grad_norm": 0.65625, + "learning_rate": 0.0004796554880005576, + "loss": 0.2042, + "step": 93740 + }, + { + "epoch": 3.88, + "grad_norm": 0.1875, + "learning_rate": 0.0004796512024404014, + "loss": 0.2459, + "step": 93750 + }, + { + "epoch": 3.88, + "grad_norm": 0.515625, + "learning_rate": 0.0004796469164480664, + "loss": 0.1968, + "step": 93760 + }, + { + "epoch": 3.88, + "grad_norm": 0.83203125, + "learning_rate": 0.0004796426300235608, + "loss": 0.2006, + "step": 93770 + }, + { + "epoch": 3.88, + "grad_norm": 0.7734375, + "learning_rate": 0.0004796383431668925, + "loss": 0.225, + "step": 93780 + }, + { + "epoch": 3.88, + "grad_norm": 0.265625, + "learning_rate": 0.00047963405587806964, + "loss": 0.1946, + "step": 93790 + }, + { + "epoch": 3.89, + "grad_norm": 0.37890625, + "learning_rate": 0.00047962976815710035, + "loss": 0.1848, + "step": 93800 + }, + { + "epoch": 3.89, + "grad_norm": 0.279296875, + "learning_rate": 0.0004796254800039925, + "loss": 0.1854, + "step": 93810 + }, + { + "epoch": 3.89, + "grad_norm": 0.71875, + "learning_rate": 0.0004796211914187545, + "loss": 0.2126, + "step": 93820 + }, + { + "epoch": 3.89, + "grad_norm": 0.6796875, + "learning_rate": 0.00047961690240139404, + "loss": 0.2141, + "step": 93830 + }, + { + "epoch": 3.89, + "grad_norm": 0.796875, + "learning_rate": 0.00047961261295191945, + "loss": 0.1651, + "step": 93840 + }, + { + "epoch": 3.89, + "grad_norm": 0.63671875, + "learning_rate": 0.0004796083230703386, + "loss": 0.2445, + "step": 93850 + }, + { + "epoch": 3.89, + "grad_norm": 0.69140625, + "learning_rate": 0.00047960403275665986, + "loss": 0.212, + "step": 93860 + }, + { + "epoch": 3.89, + "grad_norm": 0.375, + "learning_rate": 0.00047959974201089103, + "loss": 0.2307, + "step": 93870 + }, + { + "epoch": 3.89, + "grad_norm": 0.9453125, + "learning_rate": 0.0004795954508330403, + "loss": 0.2421, + "step": 93880 + }, + { + "epoch": 3.89, + "grad_norm": 1.8046875, + "learning_rate": 0.00047959115922311567, + "loss": 0.1933, + "step": 93890 + }, + { + "epoch": 3.89, + "grad_norm": 0.0, + "learning_rate": 0.0004795868671811253, + "loss": 0.2009, + "step": 93900 + }, + { + "epoch": 3.89, + "grad_norm": 0.0, + "learning_rate": 0.00047958257470707733, + "loss": 0.1973, + "step": 93910 + }, + { + "epoch": 3.89, + "grad_norm": 0.54296875, + "learning_rate": 0.0004795782818009796, + "loss": 0.1957, + "step": 93920 + }, + { + "epoch": 3.89, + "grad_norm": 0.9765625, + "learning_rate": 0.00047957398846284045, + "loss": 0.2546, + "step": 93930 + }, + { + "epoch": 3.89, + "grad_norm": 0.349609375, + "learning_rate": 0.0004795696946926678, + "loss": 0.2413, + "step": 93940 + }, + { + "epoch": 3.89, + "grad_norm": 1.140625, + "learning_rate": 0.00047956540049046983, + "loss": 0.236, + "step": 93950 + }, + { + "epoch": 3.89, + "grad_norm": 0.48828125, + "learning_rate": 0.00047956110585625447, + "loss": 0.2271, + "step": 93960 + }, + { + "epoch": 3.89, + "grad_norm": 0.796875, + "learning_rate": 0.00047955681079003, + "loss": 0.1911, + "step": 93970 + }, + { + "epoch": 3.89, + "grad_norm": 0.0, + "learning_rate": 0.00047955251529180435, + "loss": 0.1778, + "step": 93980 + }, + { + "epoch": 3.89, + "grad_norm": 0.5625, + "learning_rate": 0.00047954821936158564, + "loss": 0.2207, + "step": 93990 + }, + { + "epoch": 3.89, + "grad_norm": 2.890625, + "learning_rate": 0.000479543922999382, + "loss": 0.1507, + "step": 94000 + }, + { + "epoch": 3.89, + "grad_norm": 0.63671875, + "learning_rate": 0.0004795396262052014, + "loss": 0.2273, + "step": 94010 + }, + { + "epoch": 3.89, + "grad_norm": 0.90625, + "learning_rate": 0.0004795353289790521, + "loss": 0.233, + "step": 94020 + }, + { + "epoch": 3.89, + "grad_norm": 0.921875, + "learning_rate": 0.0004795310313209421, + "loss": 0.2481, + "step": 94030 + }, + { + "epoch": 3.9, + "grad_norm": 0.73828125, + "learning_rate": 0.00047952673323087947, + "loss": 0.1862, + "step": 94040 + }, + { + "epoch": 3.9, + "grad_norm": 0.4453125, + "learning_rate": 0.00047952243470887233, + "loss": 0.1889, + "step": 94050 + }, + { + "epoch": 3.9, + "grad_norm": 0.8125, + "learning_rate": 0.00047951813575492874, + "loss": 0.2258, + "step": 94060 + }, + { + "epoch": 3.9, + "grad_norm": 0.671875, + "learning_rate": 0.0004795138363690568, + "loss": 0.2156, + "step": 94070 + }, + { + "epoch": 3.9, + "grad_norm": 0.7421875, + "learning_rate": 0.0004795095365512646, + "loss": 0.2218, + "step": 94080 + }, + { + "epoch": 3.9, + "grad_norm": 0.76171875, + "learning_rate": 0.0004795052363015602, + "loss": 0.2273, + "step": 94090 + }, + { + "epoch": 3.9, + "grad_norm": 0.6015625, + "learning_rate": 0.0004795009356199518, + "loss": 0.2562, + "step": 94100 + }, + { + "epoch": 3.9, + "grad_norm": 0.314453125, + "learning_rate": 0.00047949663450644743, + "loss": 0.208, + "step": 94110 + }, + { + "epoch": 3.9, + "grad_norm": 0.625, + "learning_rate": 0.0004794923329610551, + "loss": 0.2059, + "step": 94120 + }, + { + "epoch": 3.9, + "grad_norm": 0.5390625, + "learning_rate": 0.000479488030983783, + "loss": 0.1836, + "step": 94130 + }, + { + "epoch": 3.9, + "grad_norm": 0.80859375, + "learning_rate": 0.00047948372857463926, + "loss": 0.1775, + "step": 94140 + }, + { + "epoch": 3.9, + "grad_norm": 0.94140625, + "learning_rate": 0.0004794794257336319, + "loss": 0.2109, + "step": 94150 + }, + { + "epoch": 3.9, + "grad_norm": 0.6484375, + "learning_rate": 0.00047947512246076905, + "loss": 0.2463, + "step": 94160 + }, + { + "epoch": 3.9, + "grad_norm": 1.4765625, + "learning_rate": 0.0004794708187560588, + "loss": 0.2215, + "step": 94170 + }, + { + "epoch": 3.9, + "grad_norm": 0.4140625, + "learning_rate": 0.00047946651461950923, + "loss": 0.2, + "step": 94180 + }, + { + "epoch": 3.9, + "grad_norm": 0.546875, + "learning_rate": 0.00047946221005112845, + "loss": 0.2445, + "step": 94190 + }, + { + "epoch": 3.9, + "grad_norm": 0.62890625, + "learning_rate": 0.00047945790505092464, + "loss": 0.222, + "step": 94200 + }, + { + "epoch": 3.9, + "grad_norm": 0.625, + "learning_rate": 0.00047945359961890576, + "loss": 0.2055, + "step": 94210 + }, + { + "epoch": 3.9, + "grad_norm": 0.9453125, + "learning_rate": 0.00047944929375508, + "loss": 0.2369, + "step": 94220 + }, + { + "epoch": 3.9, + "grad_norm": 0.4140625, + "learning_rate": 0.0004794449874594555, + "loss": 0.2381, + "step": 94230 + }, + { + "epoch": 3.9, + "grad_norm": 0.376953125, + "learning_rate": 0.00047944068073204027, + "loss": 0.1836, + "step": 94240 + }, + { + "epoch": 3.9, + "grad_norm": 0.375, + "learning_rate": 0.00047943637357284244, + "loss": 0.2055, + "step": 94250 + }, + { + "epoch": 3.9, + "grad_norm": 1.4375, + "learning_rate": 0.00047943206598187015, + "loss": 0.2166, + "step": 94260 + }, + { + "epoch": 3.9, + "grad_norm": 1.109375, + "learning_rate": 0.0004794277579591315, + "loss": 0.2104, + "step": 94270 + }, + { + "epoch": 3.91, + "grad_norm": 1.234375, + "learning_rate": 0.00047942344950463456, + "loss": 0.2353, + "step": 94280 + }, + { + "epoch": 3.91, + "grad_norm": 0.69140625, + "learning_rate": 0.0004794191406183874, + "loss": 0.2344, + "step": 94290 + }, + { + "epoch": 3.91, + "grad_norm": 0.83203125, + "learning_rate": 0.0004794148313003983, + "loss": 0.2139, + "step": 94300 + }, + { + "epoch": 3.91, + "grad_norm": 0.53515625, + "learning_rate": 0.0004794105215506752, + "loss": 0.1673, + "step": 94310 + }, + { + "epoch": 3.91, + "grad_norm": 1.03125, + "learning_rate": 0.00047940621136922636, + "loss": 0.1918, + "step": 94320 + }, + { + "epoch": 3.91, + "grad_norm": 0.65625, + "learning_rate": 0.0004794019007560597, + "loss": 0.1871, + "step": 94330 + }, + { + "epoch": 3.91, + "grad_norm": 0.494140625, + "learning_rate": 0.00047939758971118354, + "loss": 0.2221, + "step": 94340 + }, + { + "epoch": 3.91, + "grad_norm": 0.875, + "learning_rate": 0.00047939327823460576, + "loss": 0.1668, + "step": 94350 + }, + { + "epoch": 3.91, + "grad_norm": 0.71484375, + "learning_rate": 0.00047938896632633473, + "loss": 0.2783, + "step": 94360 + }, + { + "epoch": 3.91, + "grad_norm": 0.3125, + "learning_rate": 0.00047938465398637836, + "loss": 0.2076, + "step": 94370 + }, + { + "epoch": 3.91, + "grad_norm": 0.6171875, + "learning_rate": 0.00047938034121474483, + "loss": 0.2169, + "step": 94380 + }, + { + "epoch": 3.91, + "grad_norm": 0.59765625, + "learning_rate": 0.0004793760280114423, + "loss": 0.1896, + "step": 94390 + }, + { + "epoch": 3.91, + "grad_norm": 0.74609375, + "learning_rate": 0.00047937171437647885, + "loss": 0.2121, + "step": 94400 + }, + { + "epoch": 3.91, + "grad_norm": 1.0546875, + "learning_rate": 0.0004793674003098627, + "loss": 0.2539, + "step": 94410 + }, + { + "epoch": 3.91, + "grad_norm": 0.3671875, + "learning_rate": 0.0004793630858116017, + "loss": 0.2197, + "step": 94420 + }, + { + "epoch": 3.91, + "grad_norm": 1.2578125, + "learning_rate": 0.00047935877088170427, + "loss": 0.1668, + "step": 94430 + }, + { + "epoch": 3.91, + "grad_norm": 0.412109375, + "learning_rate": 0.0004793544555201783, + "loss": 0.2257, + "step": 94440 + }, + { + "epoch": 3.91, + "grad_norm": 0.37890625, + "learning_rate": 0.0004793501397270321, + "loss": 0.2531, + "step": 94450 + }, + { + "epoch": 3.91, + "grad_norm": 0.47265625, + "learning_rate": 0.00047934582350227375, + "loss": 0.2353, + "step": 94460 + }, + { + "epoch": 3.91, + "grad_norm": 0.78515625, + "learning_rate": 0.00047934150684591115, + "loss": 0.205, + "step": 94470 + }, + { + "epoch": 3.91, + "grad_norm": 0.9140625, + "learning_rate": 0.0004793371897579527, + "loss": 0.2574, + "step": 94480 + }, + { + "epoch": 3.91, + "grad_norm": 0.74609375, + "learning_rate": 0.0004793328722384065, + "loss": 0.2102, + "step": 94490 + }, + { + "epoch": 3.91, + "grad_norm": 0.55859375, + "learning_rate": 0.0004793285542872805, + "loss": 0.2137, + "step": 94500 + }, + { + "epoch": 3.91, + "grad_norm": 0.6015625, + "learning_rate": 0.00047932423590458297, + "loss": 0.1959, + "step": 94510 + }, + { + "epoch": 3.92, + "grad_norm": 0.287109375, + "learning_rate": 0.00047931991709032195, + "loss": 0.2566, + "step": 94520 + }, + { + "epoch": 3.92, + "grad_norm": 1.0078125, + "learning_rate": 0.00047931559784450563, + "loss": 0.2623, + "step": 94530 + }, + { + "epoch": 3.92, + "grad_norm": 0.55859375, + "learning_rate": 0.00047931127816714216, + "loss": 0.2681, + "step": 94540 + }, + { + "epoch": 3.92, + "grad_norm": 0.6875, + "learning_rate": 0.00047930695805823955, + "loss": 0.2346, + "step": 94550 + }, + { + "epoch": 3.92, + "grad_norm": 0.7265625, + "learning_rate": 0.0004793026375178061, + "loss": 0.2204, + "step": 94560 + }, + { + "epoch": 3.92, + "grad_norm": 1.0234375, + "learning_rate": 0.00047929831654584977, + "loss": 0.2283, + "step": 94570 + }, + { + "epoch": 3.92, + "grad_norm": 0.455078125, + "learning_rate": 0.0004792939951423788, + "loss": 0.1558, + "step": 94580 + }, + { + "epoch": 3.92, + "grad_norm": 2.984375, + "learning_rate": 0.0004792896733074013, + "loss": 0.2243, + "step": 94590 + }, + { + "epoch": 3.92, + "grad_norm": 0.37890625, + "learning_rate": 0.0004792853510409254, + "loss": 0.1984, + "step": 94600 + }, + { + "epoch": 3.92, + "grad_norm": 1.25, + "learning_rate": 0.00047928102834295924, + "loss": 0.1868, + "step": 94610 + }, + { + "epoch": 3.92, + "grad_norm": 0.87109375, + "learning_rate": 0.0004792767052135109, + "loss": 0.1949, + "step": 94620 + }, + { + "epoch": 3.92, + "grad_norm": 0.8359375, + "learning_rate": 0.0004792723816525886, + "loss": 0.2499, + "step": 94630 + }, + { + "epoch": 3.92, + "grad_norm": 0.7265625, + "learning_rate": 0.00047926805766020043, + "loss": 0.215, + "step": 94640 + }, + { + "epoch": 3.92, + "grad_norm": 0.458984375, + "learning_rate": 0.0004792637332363545, + "loss": 0.1927, + "step": 94650 + }, + { + "epoch": 3.92, + "grad_norm": 0.416015625, + "learning_rate": 0.00047925940838105895, + "loss": 0.1876, + "step": 94660 + }, + { + "epoch": 3.92, + "grad_norm": 0.92578125, + "learning_rate": 0.0004792550830943221, + "loss": 0.2193, + "step": 94670 + }, + { + "epoch": 3.92, + "grad_norm": 0.41015625, + "learning_rate": 0.00047925075737615183, + "loss": 0.2123, + "step": 94680 + }, + { + "epoch": 3.92, + "grad_norm": 1.2734375, + "learning_rate": 0.00047924643122655643, + "loss": 0.1724, + "step": 94690 + }, + { + "epoch": 3.92, + "grad_norm": 0.45703125, + "learning_rate": 0.00047924210464554397, + "loss": 0.1955, + "step": 94700 + }, + { + "epoch": 3.92, + "grad_norm": 0.81640625, + "learning_rate": 0.0004792377776331226, + "loss": 0.2064, + "step": 94710 + }, + { + "epoch": 3.92, + "grad_norm": 0.7421875, + "learning_rate": 0.0004792334501893005, + "loss": 0.2299, + "step": 94720 + }, + { + "epoch": 3.92, + "grad_norm": 0.8125, + "learning_rate": 0.0004792291223140859, + "loss": 0.2001, + "step": 94730 + }, + { + "epoch": 3.92, + "grad_norm": 0.373046875, + "learning_rate": 0.00047922479400748676, + "loss": 0.2401, + "step": 94740 + }, + { + "epoch": 3.92, + "grad_norm": 0.6953125, + "learning_rate": 0.0004792204652695114, + "loss": 0.1966, + "step": 94750 + }, + { + "epoch": 3.92, + "grad_norm": 0.36328125, + "learning_rate": 0.00047921613610016773, + "loss": 0.2004, + "step": 94760 + }, + { + "epoch": 3.93, + "grad_norm": 0.5078125, + "learning_rate": 0.00047921180649946417, + "loss": 0.1987, + "step": 94770 + }, + { + "epoch": 3.93, + "grad_norm": 0.58984375, + "learning_rate": 0.00047920747646740865, + "loss": 0.2246, + "step": 94780 + }, + { + "epoch": 3.93, + "grad_norm": 1.1640625, + "learning_rate": 0.0004792031460040095, + "loss": 0.2764, + "step": 94790 + }, + { + "epoch": 3.93, + "grad_norm": 0.37109375, + "learning_rate": 0.00047919881510927463, + "loss": 0.2536, + "step": 94800 + }, + { + "epoch": 3.93, + "grad_norm": 0.9375, + "learning_rate": 0.0004791944837832125, + "loss": 0.2322, + "step": 94810 + }, + { + "epoch": 3.93, + "grad_norm": 1.078125, + "learning_rate": 0.00047919015202583105, + "loss": 0.2299, + "step": 94820 + }, + { + "epoch": 3.93, + "grad_norm": 0.408203125, + "learning_rate": 0.00047918581983713847, + "loss": 0.2057, + "step": 94830 + }, + { + "epoch": 3.93, + "grad_norm": 0.83984375, + "learning_rate": 0.00047918148721714295, + "loss": 0.207, + "step": 94840 + }, + { + "epoch": 3.93, + "grad_norm": 0.5234375, + "learning_rate": 0.0004791771541658526, + "loss": 0.2406, + "step": 94850 + }, + { + "epoch": 3.93, + "grad_norm": 0.75, + "learning_rate": 0.0004791728206832756, + "loss": 0.2186, + "step": 94860 + }, + { + "epoch": 3.93, + "grad_norm": 1.1171875, + "learning_rate": 0.00047916848676942016, + "loss": 0.2346, + "step": 94870 + }, + { + "epoch": 3.93, + "grad_norm": 1.1953125, + "learning_rate": 0.00047916415242429435, + "loss": 0.212, + "step": 94880 + }, + { + "epoch": 3.93, + "grad_norm": 1.0, + "learning_rate": 0.0004791598176479063, + "loss": 0.2217, + "step": 94890 + }, + { + "epoch": 3.93, + "grad_norm": 1.2734375, + "learning_rate": 0.0004791554824402642, + "loss": 0.2064, + "step": 94900 + }, + { + "epoch": 3.93, + "grad_norm": 0.38671875, + "learning_rate": 0.00047915114680137627, + "loss": 0.2713, + "step": 94910 + }, + { + "epoch": 3.93, + "grad_norm": 0.240234375, + "learning_rate": 0.00047914681073125064, + "loss": 0.2294, + "step": 94920 + }, + { + "epoch": 3.93, + "grad_norm": 1.1640625, + "learning_rate": 0.0004791424742298955, + "loss": 0.2238, + "step": 94930 + }, + { + "epoch": 3.93, + "grad_norm": 0.9921875, + "learning_rate": 0.0004791381372973189, + "loss": 0.2292, + "step": 94940 + }, + { + "epoch": 3.93, + "grad_norm": 1.4609375, + "learning_rate": 0.0004791337999335291, + "loss": 0.2823, + "step": 94950 + }, + { + "epoch": 3.93, + "grad_norm": 0.78515625, + "learning_rate": 0.00047912946213853427, + "loss": 0.1927, + "step": 94960 + }, + { + "epoch": 3.93, + "grad_norm": 0.5625, + "learning_rate": 0.0004791251239123424, + "loss": 0.2365, + "step": 94970 + }, + { + "epoch": 3.93, + "grad_norm": 0.6796875, + "learning_rate": 0.00047912078525496195, + "loss": 0.1591, + "step": 94980 + }, + { + "epoch": 3.93, + "grad_norm": 1.5546875, + "learning_rate": 0.00047911644616640083, + "loss": 0.2215, + "step": 94990 + }, + { + "epoch": 3.93, + "grad_norm": 0.859375, + "learning_rate": 0.0004791121066466673, + "loss": 0.1557, + "step": 95000 + }, + { + "epoch": 3.94, + "grad_norm": 0.453125, + "learning_rate": 0.0004791077666957695, + "loss": 0.1949, + "step": 95010 + }, + { + "epoch": 3.94, + "grad_norm": 0.8046875, + "learning_rate": 0.0004791034263137157, + "loss": 0.2064, + "step": 95020 + }, + { + "epoch": 3.94, + "grad_norm": 1.6796875, + "learning_rate": 0.00047909908550051403, + "loss": 0.1925, + "step": 95030 + }, + { + "epoch": 3.94, + "grad_norm": 1.0234375, + "learning_rate": 0.0004790947442561725, + "loss": 0.1848, + "step": 95040 + }, + { + "epoch": 3.94, + "grad_norm": 1.96875, + "learning_rate": 0.00047909040258069946, + "loss": 0.2188, + "step": 95050 + }, + { + "epoch": 3.94, + "grad_norm": 0.5859375, + "learning_rate": 0.0004790860604741031, + "loss": 0.2788, + "step": 95060 + }, + { + "epoch": 3.94, + "grad_norm": 0.6875, + "learning_rate": 0.00047908171793639145, + "loss": 0.2255, + "step": 95070 + }, + { + "epoch": 3.94, + "grad_norm": 1.2734375, + "learning_rate": 0.0004790773749675726, + "loss": 0.2542, + "step": 95080 + }, + { + "epoch": 3.94, + "grad_norm": 1.5390625, + "learning_rate": 0.00047907303156765506, + "loss": 0.1881, + "step": 95090 + }, + { + "epoch": 3.94, + "grad_norm": 0.77734375, + "learning_rate": 0.0004790686877366468, + "loss": 0.1773, + "step": 95100 + }, + { + "epoch": 3.94, + "grad_norm": 0.703125, + "learning_rate": 0.00047906434347455595, + "loss": 0.1991, + "step": 95110 + }, + { + "epoch": 3.94, + "grad_norm": 1.4140625, + "learning_rate": 0.0004790599987813907, + "loss": 0.2248, + "step": 95120 + }, + { + "epoch": 3.94, + "grad_norm": 1.296875, + "learning_rate": 0.00047905565365715936, + "loss": 0.2263, + "step": 95130 + }, + { + "epoch": 3.94, + "grad_norm": 0.5, + "learning_rate": 0.00047905130810186995, + "loss": 0.2168, + "step": 95140 + }, + { + "epoch": 3.94, + "grad_norm": 0.61328125, + "learning_rate": 0.0004790469621155308, + "loss": 0.2362, + "step": 95150 + }, + { + "epoch": 3.94, + "grad_norm": 0.625, + "learning_rate": 0.0004790426156981499, + "loss": 0.2153, + "step": 95160 + }, + { + "epoch": 3.94, + "grad_norm": 0.9375, + "learning_rate": 0.00047903826884973554, + "loss": 0.1941, + "step": 95170 + }, + { + "epoch": 3.94, + "grad_norm": 0.81640625, + "learning_rate": 0.000479033921570296, + "loss": 0.2466, + "step": 95180 + }, + { + "epoch": 3.94, + "grad_norm": 0.416015625, + "learning_rate": 0.00047902957385983927, + "loss": 0.2486, + "step": 95190 + }, + { + "epoch": 3.94, + "grad_norm": 0.53125, + "learning_rate": 0.0004790252257183736, + "loss": 0.2543, + "step": 95200 + }, + { + "epoch": 3.94, + "grad_norm": 0.7109375, + "learning_rate": 0.00047902087714590726, + "loss": 0.2503, + "step": 95210 + }, + { + "epoch": 3.94, + "grad_norm": 0.984375, + "learning_rate": 0.0004790165281424483, + "loss": 0.1779, + "step": 95220 + }, + { + "epoch": 3.94, + "grad_norm": 1.15625, + "learning_rate": 0.00047901217870800495, + "loss": 0.1904, + "step": 95230 + }, + { + "epoch": 3.94, + "grad_norm": 0.84375, + "learning_rate": 0.00047900782884258543, + "loss": 0.1349, + "step": 95240 + }, + { + "epoch": 3.95, + "grad_norm": 1.609375, + "learning_rate": 0.000479003478546198, + "loss": 0.3005, + "step": 95250 + }, + { + "epoch": 3.95, + "grad_norm": 1.0234375, + "learning_rate": 0.0004789991278188506, + "loss": 0.2296, + "step": 95260 + }, + { + "epoch": 3.95, + "grad_norm": 0.546875, + "learning_rate": 0.0004789947766605517, + "loss": 0.2726, + "step": 95270 + }, + { + "epoch": 3.95, + "grad_norm": 0.6796875, + "learning_rate": 0.0004789904250713093, + "loss": 0.2494, + "step": 95280 + }, + { + "epoch": 3.95, + "grad_norm": 0.291015625, + "learning_rate": 0.00047898607305113164, + "loss": 0.2182, + "step": 95290 + }, + { + "epoch": 3.95, + "grad_norm": 0.55859375, + "learning_rate": 0.0004789817206000269, + "loss": 0.1665, + "step": 95300 + }, + { + "epoch": 3.95, + "grad_norm": 0.4765625, + "learning_rate": 0.0004789773677180033, + "loss": 0.1808, + "step": 95310 + }, + { + "epoch": 3.95, + "grad_norm": 1.515625, + "learning_rate": 0.0004789730144050691, + "loss": 0.2249, + "step": 95320 + }, + { + "epoch": 3.95, + "grad_norm": 0.9296875, + "learning_rate": 0.00047896866066123234, + "loss": 0.2623, + "step": 95330 + }, + { + "epoch": 3.95, + "grad_norm": 0.55859375, + "learning_rate": 0.00047896430648650123, + "loss": 0.2298, + "step": 95340 + }, + { + "epoch": 3.95, + "grad_norm": 1.0703125, + "learning_rate": 0.00047895995188088417, + "loss": 0.1698, + "step": 95350 + }, + { + "epoch": 3.95, + "grad_norm": 0.6484375, + "learning_rate": 0.0004789555968443891, + "loss": 0.1783, + "step": 95360 + }, + { + "epoch": 3.95, + "grad_norm": 0.9140625, + "learning_rate": 0.0004789512413770244, + "loss": 0.2562, + "step": 95370 + }, + { + "epoch": 3.95, + "grad_norm": 0.77734375, + "learning_rate": 0.0004789468854787981, + "loss": 0.2128, + "step": 95380 + }, + { + "epoch": 3.95, + "grad_norm": 1.671875, + "learning_rate": 0.00047894252914971845, + "loss": 0.1672, + "step": 95390 + }, + { + "epoch": 3.95, + "grad_norm": 0.404296875, + "learning_rate": 0.00047893817238979383, + "loss": 0.2229, + "step": 95400 + }, + { + "epoch": 3.95, + "grad_norm": 0.546875, + "learning_rate": 0.0004789338151990322, + "loss": 0.1656, + "step": 95410 + }, + { + "epoch": 3.95, + "grad_norm": 0.6640625, + "learning_rate": 0.0004789294575774419, + "loss": 0.1984, + "step": 95420 + }, + { + "epoch": 3.95, + "grad_norm": 0.58984375, + "learning_rate": 0.00047892509952503107, + "loss": 0.2329, + "step": 95430 + }, + { + "epoch": 3.95, + "grad_norm": 0.5078125, + "learning_rate": 0.00047892074104180786, + "loss": 0.2381, + "step": 95440 + }, + { + "epoch": 3.95, + "grad_norm": 0.68359375, + "learning_rate": 0.00047891638212778066, + "loss": 0.227, + "step": 95450 + }, + { + "epoch": 3.95, + "grad_norm": 1.1875, + "learning_rate": 0.00047891202278295744, + "loss": 0.2382, + "step": 95460 + }, + { + "epoch": 3.95, + "grad_norm": 0.45703125, + "learning_rate": 0.00047890766300734655, + "loss": 0.2429, + "step": 95470 + }, + { + "epoch": 3.95, + "grad_norm": 0.83984375, + "learning_rate": 0.00047890330280095616, + "loss": 0.1745, + "step": 95480 + }, + { + "epoch": 3.96, + "grad_norm": 0.58984375, + "learning_rate": 0.00047889894216379447, + "loss": 0.1883, + "step": 95490 + }, + { + "epoch": 3.96, + "grad_norm": 0.40234375, + "learning_rate": 0.0004788945810958697, + "loss": 0.1921, + "step": 95500 + }, + { + "epoch": 3.96, + "grad_norm": 1.859375, + "learning_rate": 0.00047889021959719003, + "loss": 0.2077, + "step": 95510 + }, + { + "epoch": 3.96, + "grad_norm": 0.33984375, + "learning_rate": 0.0004788858576677637, + "loss": 0.2218, + "step": 95520 + }, + { + "epoch": 3.96, + "grad_norm": 1.5, + "learning_rate": 0.0004788814953075989, + "loss": 0.2093, + "step": 95530 + }, + { + "epoch": 3.96, + "grad_norm": 1.5625, + "learning_rate": 0.00047887713251670383, + "loss": 0.2197, + "step": 95540 + }, + { + "epoch": 3.96, + "grad_norm": 0.765625, + "learning_rate": 0.0004788727692950867, + "loss": 0.2021, + "step": 95550 + }, + { + "epoch": 3.96, + "grad_norm": 1.0234375, + "learning_rate": 0.0004788684056427558, + "loss": 0.2042, + "step": 95560 + }, + { + "epoch": 3.96, + "grad_norm": 0.35546875, + "learning_rate": 0.00047886404155971917, + "loss": 0.1604, + "step": 95570 + }, + { + "epoch": 3.96, + "grad_norm": 0.455078125, + "learning_rate": 0.0004788596770459852, + "loss": 0.1723, + "step": 95580 + }, + { + "epoch": 3.96, + "grad_norm": 0.61328125, + "learning_rate": 0.0004788553121015621, + "loss": 0.2323, + "step": 95590 + }, + { + "epoch": 3.96, + "grad_norm": 0.609375, + "learning_rate": 0.00047885094672645793, + "loss": 0.2336, + "step": 95600 + }, + { + "epoch": 3.96, + "grad_norm": 0.45703125, + "learning_rate": 0.00047884658092068103, + "loss": 0.2039, + "step": 95610 + }, + { + "epoch": 3.96, + "grad_norm": 0.515625, + "learning_rate": 0.0004788422146842395, + "loss": 0.1875, + "step": 95620 + }, + { + "epoch": 3.96, + "grad_norm": 1.40625, + "learning_rate": 0.0004788378480171417, + "loss": 0.1823, + "step": 95630 + }, + { + "epoch": 3.96, + "grad_norm": 0.2470703125, + "learning_rate": 0.0004788334809193958, + "loss": 0.2119, + "step": 95640 + }, + { + "epoch": 3.96, + "grad_norm": 0.81640625, + "learning_rate": 0.00047882911339100997, + "loss": 0.25, + "step": 95650 + }, + { + "epoch": 3.96, + "grad_norm": 0.380859375, + "learning_rate": 0.0004788247454319924, + "loss": 0.243, + "step": 95660 + }, + { + "epoch": 3.96, + "grad_norm": 0.1474609375, + "learning_rate": 0.00047882037704235147, + "loss": 0.2493, + "step": 95670 + }, + { + "epoch": 3.96, + "grad_norm": 0.84375, + "learning_rate": 0.0004788160082220953, + "loss": 0.1804, + "step": 95680 + }, + { + "epoch": 3.96, + "grad_norm": 0.484375, + "learning_rate": 0.00047881163897123204, + "loss": 0.2035, + "step": 95690 + }, + { + "epoch": 3.96, + "grad_norm": 0.400390625, + "learning_rate": 0.00047880726928977005, + "loss": 0.1895, + "step": 95700 + }, + { + "epoch": 3.96, + "grad_norm": 0.13671875, + "learning_rate": 0.00047880289917771743, + "loss": 0.1624, + "step": 95710 + }, + { + "epoch": 3.96, + "grad_norm": 0.70703125, + "learning_rate": 0.0004787985286350825, + "loss": 0.2362, + "step": 95720 + }, + { + "epoch": 3.97, + "grad_norm": 0.0, + "learning_rate": 0.0004787941576618734, + "loss": 0.2328, + "step": 95730 + }, + { + "epoch": 3.97, + "grad_norm": 0.7578125, + "learning_rate": 0.0004787897862580984, + "loss": 0.245, + "step": 95740 + }, + { + "epoch": 3.97, + "grad_norm": 0.54296875, + "learning_rate": 0.00047878541442376583, + "loss": 0.2419, + "step": 95750 + }, + { + "epoch": 3.97, + "grad_norm": 0.06640625, + "learning_rate": 0.00047878104215888376, + "loss": 0.1764, + "step": 95760 + }, + { + "epoch": 3.97, + "grad_norm": 1.734375, + "learning_rate": 0.00047877666946346046, + "loss": 0.2214, + "step": 95770 + }, + { + "epoch": 3.97, + "grad_norm": 1.375, + "learning_rate": 0.0004787722963375042, + "loss": 0.2177, + "step": 95780 + }, + { + "epoch": 3.97, + "grad_norm": 0.96484375, + "learning_rate": 0.00047876792278102315, + "loss": 0.2367, + "step": 95790 + }, + { + "epoch": 3.97, + "grad_norm": 0.94921875, + "learning_rate": 0.00047876354879402563, + "loss": 0.2142, + "step": 95800 + }, + { + "epoch": 3.97, + "grad_norm": 0.5703125, + "learning_rate": 0.0004787591743765197, + "loss": 0.2213, + "step": 95810 + }, + { + "epoch": 3.97, + "grad_norm": 1.71875, + "learning_rate": 0.0004787547995285139, + "loss": 0.2259, + "step": 95820 + }, + { + "epoch": 3.97, + "grad_norm": 0.6015625, + "learning_rate": 0.0004787504242500161, + "loss": 0.228, + "step": 95830 + }, + { + "epoch": 3.97, + "grad_norm": 0.44140625, + "learning_rate": 0.00047874604854103474, + "loss": 0.1561, + "step": 95840 + }, + { + "epoch": 3.97, + "grad_norm": 0.7890625, + "learning_rate": 0.0004787416724015781, + "loss": 0.1995, + "step": 95850 + }, + { + "epoch": 3.97, + "grad_norm": 0.8671875, + "learning_rate": 0.0004787372958316543, + "loss": 0.2268, + "step": 95860 + }, + { + "epoch": 3.97, + "grad_norm": 0.6953125, + "learning_rate": 0.00047873291883127154, + "loss": 0.2381, + "step": 95870 + }, + { + "epoch": 3.97, + "grad_norm": 0.455078125, + "learning_rate": 0.00047872854140043814, + "loss": 0.2435, + "step": 95880 + }, + { + "epoch": 3.97, + "grad_norm": 0.51953125, + "learning_rate": 0.0004787241635391624, + "loss": 0.1964, + "step": 95890 + }, + { + "epoch": 3.97, + "grad_norm": 0.51171875, + "learning_rate": 0.0004787197852474524, + "loss": 0.2132, + "step": 95900 + }, + { + "epoch": 3.97, + "grad_norm": 0.314453125, + "learning_rate": 0.00047871540652531656, + "loss": 0.2206, + "step": 95910 + }, + { + "epoch": 3.97, + "grad_norm": 0.7265625, + "learning_rate": 0.00047871102737276296, + "loss": 0.2072, + "step": 95920 + }, + { + "epoch": 3.97, + "grad_norm": 0.462890625, + "learning_rate": 0.0004787066477897999, + "loss": 0.1953, + "step": 95930 + }, + { + "epoch": 3.97, + "grad_norm": 0.400390625, + "learning_rate": 0.00047870226777643564, + "loss": 0.1754, + "step": 95940 + }, + { + "epoch": 3.97, + "grad_norm": 0.84375, + "learning_rate": 0.0004786978873326784, + "loss": 0.26, + "step": 95950 + }, + { + "epoch": 3.97, + "grad_norm": 0.19921875, + "learning_rate": 0.00047869350645853644, + "loss": 0.2152, + "step": 95960 + }, + { + "epoch": 3.98, + "grad_norm": 0.78515625, + "learning_rate": 0.000478689125154018, + "loss": 0.2301, + "step": 95970 + }, + { + "epoch": 3.98, + "grad_norm": 0.70703125, + "learning_rate": 0.0004786847434191314, + "loss": 0.2174, + "step": 95980 + }, + { + "epoch": 3.98, + "grad_norm": 0.70703125, + "learning_rate": 0.0004786803612538847, + "loss": 0.2033, + "step": 95990 + }, + { + "epoch": 3.98, + "grad_norm": 0.6484375, + "learning_rate": 0.00047867597865828627, + "loss": 0.284, + "step": 96000 + }, + { + "epoch": 3.98, + "grad_norm": 0.97265625, + "learning_rate": 0.00047867159563234435, + "loss": 0.2324, + "step": 96010 + }, + { + "epoch": 3.98, + "grad_norm": 1.609375, + "learning_rate": 0.00047866721217606725, + "loss": 0.2123, + "step": 96020 + }, + { + "epoch": 3.98, + "grad_norm": 1.3203125, + "learning_rate": 0.00047866282828946304, + "loss": 0.2034, + "step": 96030 + }, + { + "epoch": 3.98, + "grad_norm": 0.61328125, + "learning_rate": 0.0004786584439725401, + "loss": 0.1649, + "step": 96040 + }, + { + "epoch": 3.98, + "grad_norm": 0.61328125, + "learning_rate": 0.0004786540592253067, + "loss": 0.1973, + "step": 96050 + }, + { + "epoch": 3.98, + "grad_norm": 0.62109375, + "learning_rate": 0.0004786496740477711, + "loss": 0.1433, + "step": 96060 + }, + { + "epoch": 3.98, + "grad_norm": 3.078125, + "learning_rate": 0.0004786452884399415, + "loss": 0.1864, + "step": 96070 + }, + { + "epoch": 3.98, + "grad_norm": 0.4609375, + "learning_rate": 0.00047864090240182607, + "loss": 0.2031, + "step": 96080 + }, + { + "epoch": 3.98, + "grad_norm": 0.8828125, + "learning_rate": 0.00047863651593343313, + "loss": 0.1531, + "step": 96090 + }, + { + "epoch": 3.98, + "grad_norm": 0.83984375, + "learning_rate": 0.00047863212903477103, + "loss": 0.2267, + "step": 96100 + }, + { + "epoch": 3.98, + "grad_norm": 1.2421875, + "learning_rate": 0.000478627741705848, + "loss": 0.1987, + "step": 96110 + }, + { + "epoch": 3.98, + "grad_norm": 0.73046875, + "learning_rate": 0.0004786233539466722, + "loss": 0.2287, + "step": 96120 + }, + { + "epoch": 3.98, + "grad_norm": 0.68359375, + "learning_rate": 0.00047861896575725194, + "loss": 0.2399, + "step": 96130 + }, + { + "epoch": 3.98, + "grad_norm": 1.71875, + "learning_rate": 0.0004786145771375955, + "loss": 0.2105, + "step": 96140 + }, + { + "epoch": 3.98, + "grad_norm": 1.0859375, + "learning_rate": 0.00047861018808771107, + "loss": 0.2341, + "step": 96150 + }, + { + "epoch": 3.98, + "grad_norm": 0.78125, + "learning_rate": 0.00047860579860760697, + "loss": 0.1793, + "step": 96160 + }, + { + "epoch": 3.98, + "grad_norm": 0.83203125, + "learning_rate": 0.0004786014086972914, + "loss": 0.2035, + "step": 96170 + }, + { + "epoch": 3.98, + "grad_norm": 1.0, + "learning_rate": 0.00047859701835677274, + "loss": 0.2291, + "step": 96180 + }, + { + "epoch": 3.98, + "grad_norm": 0.63671875, + "learning_rate": 0.00047859262758605914, + "loss": 0.1686, + "step": 96190 + }, + { + "epoch": 3.98, + "grad_norm": 0.73046875, + "learning_rate": 0.0004785882363851589, + "loss": 0.2107, + "step": 96200 + }, + { + "epoch": 3.99, + "grad_norm": 0.95703125, + "learning_rate": 0.0004785838447540803, + "loss": 0.2253, + "step": 96210 + }, + { + "epoch": 3.99, + "grad_norm": 1.0859375, + "learning_rate": 0.00047857945269283154, + "loss": 0.2093, + "step": 96220 + }, + { + "epoch": 3.99, + "grad_norm": 0.59375, + "learning_rate": 0.000478575060201421, + "loss": 0.1932, + "step": 96230 + }, + { + "epoch": 3.99, + "grad_norm": 0.796875, + "learning_rate": 0.00047857066727985685, + "loss": 0.1899, + "step": 96240 + }, + { + "epoch": 3.99, + "grad_norm": 1.5546875, + "learning_rate": 0.00047856627392814746, + "loss": 0.2065, + "step": 96250 + }, + { + "epoch": 3.99, + "grad_norm": 0.8984375, + "learning_rate": 0.00047856188014630085, + "loss": 0.1961, + "step": 96260 + }, + { + "epoch": 3.99, + "grad_norm": 0.77734375, + "learning_rate": 0.0004785574859343256, + "loss": 0.2163, + "step": 96270 + }, + { + "epoch": 3.99, + "grad_norm": 1.1796875, + "learning_rate": 0.00047855309129222985, + "loss": 0.287, + "step": 96280 + }, + { + "epoch": 3.99, + "grad_norm": 0.51953125, + "learning_rate": 0.0004785486962200218, + "loss": 0.1968, + "step": 96290 + }, + { + "epoch": 3.99, + "grad_norm": 1.265625, + "learning_rate": 0.0004785443007177098, + "loss": 0.2523, + "step": 96300 + }, + { + "epoch": 3.99, + "grad_norm": 0.60546875, + "learning_rate": 0.0004785399047853022, + "loss": 0.2017, + "step": 96310 + }, + { + "epoch": 3.99, + "grad_norm": 0.15234375, + "learning_rate": 0.00047853550842280704, + "loss": 0.2056, + "step": 96320 + }, + { + "epoch": 3.99, + "grad_norm": 0.5078125, + "learning_rate": 0.00047853111163023276, + "loss": 0.2453, + "step": 96330 + }, + { + "epoch": 3.99, + "grad_norm": 0.93359375, + "learning_rate": 0.00047852671440758767, + "loss": 0.1945, + "step": 96340 + }, + { + "epoch": 3.99, + "grad_norm": 0.50390625, + "learning_rate": 0.00047852231675488, + "loss": 0.2028, + "step": 96350 + }, + { + "epoch": 3.99, + "grad_norm": 1.015625, + "learning_rate": 0.0004785179186721179, + "loss": 0.2169, + "step": 96360 + }, + { + "epoch": 3.99, + "grad_norm": 0.365234375, + "learning_rate": 0.00047851352015930985, + "loss": 0.2064, + "step": 96370 + }, + { + "epoch": 3.99, + "grad_norm": 1.0, + "learning_rate": 0.000478509121216464, + "loss": 0.2126, + "step": 96380 + }, + { + "epoch": 3.99, + "grad_norm": 0.63671875, + "learning_rate": 0.00047850472184358863, + "loss": 0.231, + "step": 96390 + }, + { + "epoch": 3.99, + "grad_norm": 0.98828125, + "learning_rate": 0.0004785003220406921, + "loss": 0.1834, + "step": 96400 + }, + { + "epoch": 3.99, + "grad_norm": 1.734375, + "learning_rate": 0.00047849592180778267, + "loss": 0.2247, + "step": 96410 + }, + { + "epoch": 3.99, + "grad_norm": 0.703125, + "learning_rate": 0.00047849152114486847, + "loss": 0.2629, + "step": 96420 + }, + { + "epoch": 3.99, + "grad_norm": 0.1796875, + "learning_rate": 0.000478487120051958, + "loss": 0.2012, + "step": 96430 + }, + { + "epoch": 3.99, + "grad_norm": 0.447265625, + "learning_rate": 0.00047848271852905946, + "loss": 0.2, + "step": 96440 + }, + { + "epoch": 3.99, + "grad_norm": 1.7265625, + "learning_rate": 0.000478478316576181, + "loss": 0.2459, + "step": 96450 + }, + { + "epoch": 4.0, + "grad_norm": 1.421875, + "learning_rate": 0.00047847391419333116, + "loss": 0.2222, + "step": 96460 + }, + { + "epoch": 4.0, + "grad_norm": 0.384765625, + "learning_rate": 0.000478469511380518, + "loss": 0.231, + "step": 96470 + }, + { + "epoch": 4.0, + "grad_norm": 0.453125, + "learning_rate": 0.0004784651081377499, + "loss": 0.2128, + "step": 96480 + }, + { + "epoch": 4.0, + "grad_norm": 1.453125, + "learning_rate": 0.0004784607044650352, + "loss": 0.2017, + "step": 96490 + }, + { + "epoch": 4.0, + "grad_norm": 0.419921875, + "learning_rate": 0.00047845630036238204, + "loss": 0.1802, + "step": 96500 + }, + { + "epoch": 4.0, + "grad_norm": 0.6640625, + "learning_rate": 0.00047845189582979887, + "loss": 0.1999, + "step": 96510 + }, + { + "epoch": 4.0, + "grad_norm": 1.046875, + "learning_rate": 0.0004784474908672938, + "loss": 0.1914, + "step": 96520 + }, + { + "epoch": 4.0, + "grad_norm": 0.9296875, + "learning_rate": 0.00047844308547487533, + "loss": 0.2273, + "step": 96530 + }, + { + "epoch": 4.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004784386796525516, + "loss": 0.2024, + "step": 96540 + }, + { + "epoch": 4.0, + "grad_norm": 0.53125, + "learning_rate": 0.000478434273400331, + "loss": 0.1751, + "step": 96550 + }, + { + "epoch": 4.0, + "grad_norm": 0.66015625, + "learning_rate": 0.00047842986671822166, + "loss": 0.2333, + "step": 96560 + }, + { + "epoch": 4.0, + "grad_norm": 1.078125, + "learning_rate": 0.000478425459606232, + "loss": 0.2161, + "step": 96570 + }, + { + "epoch": 4.0, + "grad_norm": 0.51171875, + "learning_rate": 0.00047842105206437037, + "loss": 0.1911, + "step": 96580 + }, + { + "epoch": 4.0, + "grad_norm": 0.33203125, + "learning_rate": 0.0004784166440926449, + "loss": 0.1908, + "step": 96590 + }, + { + "epoch": 4.0, + "grad_norm": 1.3828125, + "learning_rate": 0.000478412235691064, + "loss": 0.2142, + "step": 96600 + }, + { + "epoch": 4.0, + "grad_norm": 0.435546875, + "learning_rate": 0.00047840782685963597, + "loss": 0.1829, + "step": 96610 + }, + { + "epoch": 4.0, + "grad_norm": 0.37109375, + "learning_rate": 0.00047840341759836905, + "loss": 0.2797, + "step": 96620 + }, + { + "epoch": 4.0, + "grad_norm": 2.125, + "learning_rate": 0.0004783990079072716, + "loss": 0.2295, + "step": 96630 + }, + { + "epoch": 4.0, + "grad_norm": 0.89453125, + "learning_rate": 0.0004783945977863519, + "loss": 0.2062, + "step": 96640 + }, + { + "epoch": 4.0, + "grad_norm": 0.46875, + "learning_rate": 0.0004783901872356181, + "loss": 0.2207, + "step": 96650 + }, + { + "epoch": 4.0, + "grad_norm": 0.58984375, + "learning_rate": 0.0004783857762550787, + "loss": 0.1984, + "step": 96660 + }, + { + "epoch": 4.0, + "grad_norm": 1.0703125, + "learning_rate": 0.000478381364844742, + "loss": 0.2526, + "step": 96670 + }, + { + "epoch": 4.0, + "grad_norm": 1.234375, + "learning_rate": 0.00047837695300461617, + "loss": 0.2326, + "step": 96680 + }, + { + "epoch": 4.0, + "grad_norm": 0.56640625, + "learning_rate": 0.0004783725407347096, + "loss": 0.1943, + "step": 96690 + }, + { + "epoch": 4.01, + "grad_norm": 0.578125, + "learning_rate": 0.00047836812803503056, + "loss": 0.2207, + "step": 96700 + }, + { + "epoch": 4.01, + "grad_norm": 0.0, + "learning_rate": 0.0004783637149055874, + "loss": 0.2109, + "step": 96710 + }, + { + "epoch": 4.01, + "grad_norm": 0.490234375, + "learning_rate": 0.00047835930134638825, + "loss": 0.203, + "step": 96720 + }, + { + "epoch": 4.01, + "grad_norm": 0.40625, + "learning_rate": 0.0004783548873574417, + "loss": 0.2231, + "step": 96730 + }, + { + "epoch": 4.01, + "grad_norm": 0.65234375, + "learning_rate": 0.00047835047293875587, + "loss": 0.2538, + "step": 96740 + }, + { + "epoch": 4.01, + "grad_norm": 0.546875, + "learning_rate": 0.00047834605809033917, + "loss": 0.189, + "step": 96750 + }, + { + "epoch": 4.01, + "grad_norm": 0.515625, + "learning_rate": 0.0004783416428121997, + "loss": 0.2437, + "step": 96760 + }, + { + "epoch": 4.01, + "grad_norm": 0.91796875, + "learning_rate": 0.00047833722710434603, + "loss": 0.1711, + "step": 96770 + }, + { + "epoch": 4.01, + "grad_norm": 2.109375, + "learning_rate": 0.0004783328109667864, + "loss": 0.2331, + "step": 96780 + }, + { + "epoch": 4.01, + "grad_norm": 0.9296875, + "learning_rate": 0.000478328394399529, + "loss": 0.1889, + "step": 96790 + }, + { + "epoch": 4.01, + "grad_norm": 1.21875, + "learning_rate": 0.0004783239774025822, + "loss": 0.2284, + "step": 96800 + }, + { + "epoch": 4.01, + "grad_norm": 0.5859375, + "learning_rate": 0.00047831955997595433, + "loss": 0.2032, + "step": 96810 + }, + { + "epoch": 4.01, + "grad_norm": 0.52734375, + "learning_rate": 0.00047831514211965376, + "loss": 0.177, + "step": 96820 + }, + { + "epoch": 4.01, + "grad_norm": 0.609375, + "learning_rate": 0.0004783107238336888, + "loss": 0.1412, + "step": 96830 + }, + { + "epoch": 4.01, + "grad_norm": 1.5703125, + "learning_rate": 0.0004783063051180676, + "loss": 0.222, + "step": 96840 + }, + { + "epoch": 4.01, + "grad_norm": 0.9296875, + "learning_rate": 0.00047830188597279864, + "loss": 0.2145, + "step": 96850 + }, + { + "epoch": 4.01, + "grad_norm": 0.6015625, + "learning_rate": 0.0004782974663978902, + "loss": 0.2151, + "step": 96860 + }, + { + "epoch": 4.01, + "grad_norm": 0.74609375, + "learning_rate": 0.00047829304639335045, + "loss": 0.2325, + "step": 96870 + }, + { + "epoch": 4.01, + "grad_norm": 0.73828125, + "learning_rate": 0.00047828862595918796, + "loss": 0.2481, + "step": 96880 + }, + { + "epoch": 4.01, + "grad_norm": 1.046875, + "learning_rate": 0.0004782842050954109, + "loss": 0.192, + "step": 96890 + }, + { + "epoch": 4.01, + "grad_norm": 0.373046875, + "learning_rate": 0.0004782797838020277, + "loss": 0.2047, + "step": 96900 + }, + { + "epoch": 4.01, + "grad_norm": 1.0859375, + "learning_rate": 0.00047827536207904655, + "loss": 0.2416, + "step": 96910 + }, + { + "epoch": 4.01, + "grad_norm": 0.87890625, + "learning_rate": 0.0004782709399264757, + "loss": 0.1913, + "step": 96920 + }, + { + "epoch": 4.01, + "grad_norm": 0.60546875, + "learning_rate": 0.00047826651734432377, + "loss": 0.2253, + "step": 96930 + }, + { + "epoch": 4.02, + "grad_norm": 1.9765625, + "learning_rate": 0.00047826209433259883, + "loss": 0.1791, + "step": 96940 + }, + { + "epoch": 4.02, + "grad_norm": 0.8828125, + "learning_rate": 0.00047825767089130925, + "loss": 0.2279, + "step": 96950 + }, + { + "epoch": 4.02, + "grad_norm": 0.66015625, + "learning_rate": 0.00047825324702046343, + "loss": 0.236, + "step": 96960 + }, + { + "epoch": 4.02, + "grad_norm": 0.83203125, + "learning_rate": 0.00047824882272006966, + "loss": 0.2325, + "step": 96970 + }, + { + "epoch": 4.02, + "grad_norm": 0.244140625, + "learning_rate": 0.0004782443979901362, + "loss": 0.1491, + "step": 96980 + }, + { + "epoch": 4.02, + "grad_norm": 0.77734375, + "learning_rate": 0.0004782399728306714, + "loss": 0.1987, + "step": 96990 + }, + { + "epoch": 4.02, + "grad_norm": 0.89453125, + "learning_rate": 0.0004782355472416837, + "loss": 0.1927, + "step": 97000 + }, + { + "epoch": 4.02, + "grad_norm": 0.59375, + "learning_rate": 0.00047823112122318136, + "loss": 0.2509, + "step": 97010 + }, + { + "epoch": 4.02, + "grad_norm": 0.609375, + "learning_rate": 0.0004782266947751726, + "loss": 0.1943, + "step": 97020 + }, + { + "epoch": 4.02, + "grad_norm": 0.84375, + "learning_rate": 0.00047822226789766595, + "loss": 0.2296, + "step": 97030 + }, + { + "epoch": 4.02, + "grad_norm": 0.953125, + "learning_rate": 0.00047821784059066957, + "loss": 0.2173, + "step": 97040 + }, + { + "epoch": 4.02, + "grad_norm": 1.375, + "learning_rate": 0.0004782134128541919, + "loss": 0.2148, + "step": 97050 + }, + { + "epoch": 4.02, + "grad_norm": 1.9453125, + "learning_rate": 0.0004782089846882412, + "loss": 0.2037, + "step": 97060 + }, + { + "epoch": 4.02, + "grad_norm": 0.75390625, + "learning_rate": 0.00047820455609282584, + "loss": 0.2199, + "step": 97070 + }, + { + "epoch": 4.02, + "grad_norm": 1.015625, + "learning_rate": 0.0004782001270679541, + "loss": 0.2456, + "step": 97080 + }, + { + "epoch": 4.02, + "grad_norm": 0.6171875, + "learning_rate": 0.00047819569761363445, + "loss": 0.2024, + "step": 97090 + }, + { + "epoch": 4.02, + "grad_norm": 0.90234375, + "learning_rate": 0.00047819126772987515, + "loss": 0.199, + "step": 97100 + }, + { + "epoch": 4.02, + "grad_norm": 0.55078125, + "learning_rate": 0.00047818683741668446, + "loss": 0.2201, + "step": 97110 + }, + { + "epoch": 4.02, + "grad_norm": 0.474609375, + "learning_rate": 0.0004781824066740709, + "loss": 0.1958, + "step": 97120 + }, + { + "epoch": 4.02, + "grad_norm": 1.078125, + "learning_rate": 0.0004781779755020426, + "loss": 0.2176, + "step": 97130 + }, + { + "epoch": 4.02, + "grad_norm": 0.80078125, + "learning_rate": 0.00047817354390060796, + "loss": 0.2139, + "step": 97140 + }, + { + "epoch": 4.02, + "grad_norm": 0.703125, + "learning_rate": 0.00047816911186977535, + "loss": 0.2028, + "step": 97150 + }, + { + "epoch": 4.02, + "grad_norm": 0.44921875, + "learning_rate": 0.00047816467940955314, + "loss": 0.2074, + "step": 97160 + }, + { + "epoch": 4.02, + "grad_norm": 0.265625, + "learning_rate": 0.00047816024651994973, + "loss": 0.1942, + "step": 97170 + }, + { + "epoch": 4.03, + "grad_norm": 0.640625, + "learning_rate": 0.0004781558132009732, + "loss": 0.1953, + "step": 97180 + }, + { + "epoch": 4.03, + "grad_norm": 0.73828125, + "learning_rate": 0.0004781513794526322, + "loss": 0.2174, + "step": 97190 + }, + { + "epoch": 4.03, + "grad_norm": 0.40625, + "learning_rate": 0.0004781469452749349, + "loss": 0.2116, + "step": 97200 + }, + { + "epoch": 4.03, + "grad_norm": 0.546875, + "learning_rate": 0.00047814251066788973, + "loss": 0.237, + "step": 97210 + }, + { + "epoch": 4.03, + "grad_norm": 0.34765625, + "learning_rate": 0.00047813807563150493, + "loss": 0.202, + "step": 97220 + }, + { + "epoch": 4.03, + "grad_norm": 5.53125, + "learning_rate": 0.000478133640165789, + "loss": 0.2095, + "step": 97230 + }, + { + "epoch": 4.03, + "grad_norm": 0.76953125, + "learning_rate": 0.0004781292042707501, + "loss": 0.2215, + "step": 97240 + }, + { + "epoch": 4.03, + "grad_norm": 0.26953125, + "learning_rate": 0.0004781247679463967, + "loss": 0.2324, + "step": 97250 + }, + { + "epoch": 4.03, + "grad_norm": 0.69921875, + "learning_rate": 0.00047812033119273714, + "loss": 0.2055, + "step": 97260 + }, + { + "epoch": 4.03, + "grad_norm": 0.4140625, + "learning_rate": 0.0004781158940097797, + "loss": 0.2336, + "step": 97270 + }, + { + "epoch": 4.03, + "grad_norm": 0.80859375, + "learning_rate": 0.00047811145639753286, + "loss": 0.2487, + "step": 97280 + }, + { + "epoch": 4.03, + "grad_norm": 1.078125, + "learning_rate": 0.0004781070183560048, + "loss": 0.2077, + "step": 97290 + }, + { + "epoch": 4.03, + "grad_norm": 1.6640625, + "learning_rate": 0.000478102579885204, + "loss": 0.2353, + "step": 97300 + }, + { + "epoch": 4.03, + "grad_norm": 0.59765625, + "learning_rate": 0.0004780981409851388, + "loss": 0.161, + "step": 97310 + }, + { + "epoch": 4.03, + "grad_norm": 0.83203125, + "learning_rate": 0.00047809370165581756, + "loss": 0.1935, + "step": 97320 + }, + { + "epoch": 4.03, + "grad_norm": 0.5390625, + "learning_rate": 0.0004780892618972485, + "loss": 0.2185, + "step": 97330 + }, + { + "epoch": 4.03, + "grad_norm": 0.828125, + "learning_rate": 0.0004780848217094402, + "loss": 0.2318, + "step": 97340 + }, + { + "epoch": 4.03, + "grad_norm": 1.828125, + "learning_rate": 0.0004780803810924009, + "loss": 0.2243, + "step": 97350 + }, + { + "epoch": 4.03, + "grad_norm": 0.244140625, + "learning_rate": 0.00047807594004613886, + "loss": 0.1852, + "step": 97360 + }, + { + "epoch": 4.03, + "grad_norm": 0.70703125, + "learning_rate": 0.00047807149857066256, + "loss": 0.2181, + "step": 97370 + }, + { + "epoch": 4.03, + "grad_norm": 0.23828125, + "learning_rate": 0.0004780670566659804, + "loss": 0.1682, + "step": 97380 + }, + { + "epoch": 4.03, + "grad_norm": 1.0390625, + "learning_rate": 0.00047806261433210056, + "loss": 0.2574, + "step": 97390 + }, + { + "epoch": 4.03, + "grad_norm": 0.50390625, + "learning_rate": 0.0004780581715690315, + "loss": 0.2389, + "step": 97400 + }, + { + "epoch": 4.03, + "grad_norm": 0.470703125, + "learning_rate": 0.0004780537283767817, + "loss": 0.1883, + "step": 97410 + }, + { + "epoch": 4.04, + "grad_norm": 0.84765625, + "learning_rate": 0.00047804928475535935, + "loss": 0.2381, + "step": 97420 + }, + { + "epoch": 4.04, + "grad_norm": 0.859375, + "learning_rate": 0.00047804484070477295, + "loss": 0.1944, + "step": 97430 + }, + { + "epoch": 4.04, + "grad_norm": 0.357421875, + "learning_rate": 0.0004780403962250307, + "loss": 0.1935, + "step": 97440 + }, + { + "epoch": 4.04, + "grad_norm": 0.9609375, + "learning_rate": 0.00047803595131614107, + "loss": 0.1853, + "step": 97450 + }, + { + "epoch": 4.04, + "grad_norm": 0.5625, + "learning_rate": 0.0004780315059781124, + "loss": 0.2407, + "step": 97460 + }, + { + "epoch": 4.04, + "grad_norm": 1.0546875, + "learning_rate": 0.00047802706021095304, + "loss": 0.1905, + "step": 97470 + }, + { + "epoch": 4.04, + "grad_norm": 0.400390625, + "learning_rate": 0.0004780226140146714, + "loss": 0.2391, + "step": 97480 + }, + { + "epoch": 4.04, + "grad_norm": 0.435546875, + "learning_rate": 0.00047801816738927586, + "loss": 0.1623, + "step": 97490 + }, + { + "epoch": 4.04, + "grad_norm": 1.1796875, + "learning_rate": 0.0004780137203347747, + "loss": 0.1889, + "step": 97500 + }, + { + "epoch": 4.04, + "grad_norm": 0.68359375, + "learning_rate": 0.00047800927285117633, + "loss": 0.2438, + "step": 97510 + }, + { + "epoch": 4.04, + "grad_norm": 0.62109375, + "learning_rate": 0.0004780048249384892, + "loss": 0.2394, + "step": 97520 + }, + { + "epoch": 4.04, + "grad_norm": 1.0546875, + "learning_rate": 0.0004780003765967216, + "loss": 0.1959, + "step": 97530 + }, + { + "epoch": 4.04, + "grad_norm": 0.54296875, + "learning_rate": 0.0004779959278258819, + "loss": 0.2308, + "step": 97540 + }, + { + "epoch": 4.04, + "grad_norm": 0.71875, + "learning_rate": 0.00047799147862597846, + "loss": 0.2497, + "step": 97550 + }, + { + "epoch": 4.04, + "grad_norm": 0.56640625, + "learning_rate": 0.0004779870289970196, + "loss": 0.2, + "step": 97560 + }, + { + "epoch": 4.04, + "grad_norm": 1.0625, + "learning_rate": 0.0004779825789390139, + "loss": 0.2212, + "step": 97570 + }, + { + "epoch": 4.04, + "grad_norm": 1.09375, + "learning_rate": 0.00047797812845196965, + "loss": 0.2905, + "step": 97580 + }, + { + "epoch": 4.04, + "grad_norm": 0.671875, + "learning_rate": 0.00047797367753589504, + "loss": 0.2257, + "step": 97590 + }, + { + "epoch": 4.04, + "grad_norm": 0.734375, + "learning_rate": 0.00047796922619079864, + "loss": 0.221, + "step": 97600 + }, + { + "epoch": 4.04, + "grad_norm": 0.4453125, + "learning_rate": 0.00047796477441668886, + "loss": 0.1796, + "step": 97610 + }, + { + "epoch": 4.04, + "grad_norm": 0.3828125, + "learning_rate": 0.0004779603222135739, + "loss": 0.2128, + "step": 97620 + }, + { + "epoch": 4.04, + "grad_norm": 0.546875, + "learning_rate": 0.0004779558695814622, + "loss": 0.2278, + "step": 97630 + }, + { + "epoch": 4.04, + "grad_norm": 0.9453125, + "learning_rate": 0.0004779514165203622, + "loss": 0.1838, + "step": 97640 + }, + { + "epoch": 4.04, + "grad_norm": 0.33203125, + "learning_rate": 0.0004779469630302823, + "loss": 0.1977, + "step": 97650 + }, + { + "epoch": 4.05, + "grad_norm": 0.267578125, + "learning_rate": 0.00047794250911123085, + "loss": 0.1999, + "step": 97660 + }, + { + "epoch": 4.05, + "grad_norm": 0.62890625, + "learning_rate": 0.0004779380547632161, + "loss": 0.2702, + "step": 97670 + }, + { + "epoch": 4.05, + "grad_norm": 0.66015625, + "learning_rate": 0.00047793359998624666, + "loss": 0.1849, + "step": 97680 + }, + { + "epoch": 4.05, + "grad_norm": 0.400390625, + "learning_rate": 0.0004779291447803307, + "loss": 0.2196, + "step": 97690 + }, + { + "epoch": 4.05, + "grad_norm": 0.4375, + "learning_rate": 0.00047792468914547675, + "loss": 0.1774, + "step": 97700 + }, + { + "epoch": 4.05, + "grad_norm": 0.73828125, + "learning_rate": 0.00047792023308169317, + "loss": 0.2102, + "step": 97710 + }, + { + "epoch": 4.05, + "grad_norm": 0.81640625, + "learning_rate": 0.0004779157765889883, + "loss": 0.2061, + "step": 97720 + }, + { + "epoch": 4.05, + "grad_norm": 0.61328125, + "learning_rate": 0.00047791131966737056, + "loss": 0.2098, + "step": 97730 + }, + { + "epoch": 4.05, + "grad_norm": 0.44140625, + "learning_rate": 0.0004779068623168483, + "loss": 0.2172, + "step": 97740 + }, + { + "epoch": 4.05, + "grad_norm": 0.439453125, + "learning_rate": 0.00047790240453743, + "loss": 0.2342, + "step": 97750 + }, + { + "epoch": 4.05, + "grad_norm": 0.609375, + "learning_rate": 0.00047789794632912397, + "loss": 0.2332, + "step": 97760 + }, + { + "epoch": 4.05, + "grad_norm": 0.53125, + "learning_rate": 0.0004778934876919386, + "loss": 0.1971, + "step": 97770 + }, + { + "epoch": 4.05, + "grad_norm": 0.75390625, + "learning_rate": 0.0004778890286258822, + "loss": 0.2021, + "step": 97780 + }, + { + "epoch": 4.05, + "grad_norm": 0.44140625, + "learning_rate": 0.0004778845691309635, + "loss": 0.2143, + "step": 97790 + }, + { + "epoch": 4.05, + "grad_norm": 0.6953125, + "learning_rate": 0.00047788010920719046, + "loss": 0.2155, + "step": 97800 + }, + { + "epoch": 4.05, + "grad_norm": 0.703125, + "learning_rate": 0.0004778756488545717, + "loss": 0.1767, + "step": 97810 + }, + { + "epoch": 4.05, + "grad_norm": 0.8671875, + "learning_rate": 0.00047787118807311556, + "loss": 0.1833, + "step": 97820 + }, + { + "epoch": 4.05, + "grad_norm": 0.5625, + "learning_rate": 0.0004778667268628305, + "loss": 0.233, + "step": 97830 + }, + { + "epoch": 4.05, + "grad_norm": 0.7734375, + "learning_rate": 0.0004778622652237248, + "loss": 0.1706, + "step": 97840 + }, + { + "epoch": 4.05, + "grad_norm": 1.1875, + "learning_rate": 0.00047785780315580693, + "loss": 0.2249, + "step": 97850 + }, + { + "epoch": 4.05, + "grad_norm": 0.447265625, + "learning_rate": 0.0004778533406590854, + "loss": 0.2179, + "step": 97860 + }, + { + "epoch": 4.05, + "grad_norm": 0.365234375, + "learning_rate": 0.00047784887773356835, + "loss": 0.2135, + "step": 97870 + }, + { + "epoch": 4.05, + "grad_norm": 0.83203125, + "learning_rate": 0.0004778444143792644, + "loss": 0.2272, + "step": 97880 + }, + { + "epoch": 4.05, + "grad_norm": 0.77734375, + "learning_rate": 0.0004778399505961818, + "loss": 0.2175, + "step": 97890 + }, + { + "epoch": 4.06, + "grad_norm": 1.0703125, + "learning_rate": 0.00047783548638432905, + "loss": 0.2456, + "step": 97900 + }, + { + "epoch": 4.06, + "grad_norm": 0.62109375, + "learning_rate": 0.0004778310217437146, + "loss": 0.1552, + "step": 97910 + }, + { + "epoch": 4.06, + "grad_norm": 0.51953125, + "learning_rate": 0.0004778265566743467, + "loss": 0.2098, + "step": 97920 + }, + { + "epoch": 4.06, + "grad_norm": 0.0, + "learning_rate": 0.00047782209117623375, + "loss": 0.2298, + "step": 97930 + }, + { + "epoch": 4.06, + "grad_norm": 0.416015625, + "learning_rate": 0.00047781762524938434, + "loss": 0.1582, + "step": 97940 + }, + { + "epoch": 4.06, + "grad_norm": 1.0, + "learning_rate": 0.0004778131588938067, + "loss": 0.2153, + "step": 97950 + }, + { + "epoch": 4.06, + "grad_norm": 0.390625, + "learning_rate": 0.00047780869210950933, + "loss": 0.2007, + "step": 97960 + }, + { + "epoch": 4.06, + "grad_norm": 0.77734375, + "learning_rate": 0.0004778042248965006, + "loss": 0.2359, + "step": 97970 + }, + { + "epoch": 4.06, + "grad_norm": 0.734375, + "learning_rate": 0.00047779975725478893, + "loss": 0.1834, + "step": 97980 + }, + { + "epoch": 4.06, + "grad_norm": 0.6328125, + "learning_rate": 0.00047779528918438265, + "loss": 0.2508, + "step": 97990 + }, + { + "epoch": 4.06, + "grad_norm": 0.345703125, + "learning_rate": 0.00047779082068529036, + "loss": 0.2588, + "step": 98000 + }, + { + "epoch": 4.06, + "grad_norm": 0.90234375, + "learning_rate": 0.0004777863517575202, + "loss": 0.2167, + "step": 98010 + }, + { + "epoch": 4.06, + "grad_norm": 0.2890625, + "learning_rate": 0.0004777818824010808, + "loss": 0.2053, + "step": 98020 + }, + { + "epoch": 4.06, + "grad_norm": 0.2578125, + "learning_rate": 0.00047777741261598053, + "loss": 0.1797, + "step": 98030 + }, + { + "epoch": 4.06, + "grad_norm": 0.5078125, + "learning_rate": 0.00047777294240222766, + "loss": 0.1823, + "step": 98040 + }, + { + "epoch": 4.06, + "grad_norm": 0.87890625, + "learning_rate": 0.0004777684717598308, + "loss": 0.2277, + "step": 98050 + }, + { + "epoch": 4.06, + "grad_norm": 0.62890625, + "learning_rate": 0.00047776400068879824, + "loss": 0.2102, + "step": 98060 + }, + { + "epoch": 4.06, + "grad_norm": 0.734375, + "learning_rate": 0.00047775952918913844, + "loss": 0.202, + "step": 98070 + }, + { + "epoch": 4.06, + "grad_norm": 0.77734375, + "learning_rate": 0.00047775505726085975, + "loss": 0.1849, + "step": 98080 + }, + { + "epoch": 4.06, + "grad_norm": 0.341796875, + "learning_rate": 0.00047775058490397073, + "loss": 0.1554, + "step": 98090 + }, + { + "epoch": 4.06, + "grad_norm": 0.427734375, + "learning_rate": 0.0004777461121184796, + "loss": 0.2118, + "step": 98100 + }, + { + "epoch": 4.06, + "grad_norm": 0.7421875, + "learning_rate": 0.000477741638904395, + "loss": 0.1611, + "step": 98110 + }, + { + "epoch": 4.06, + "grad_norm": 0.76953125, + "learning_rate": 0.00047773716526172515, + "loss": 0.2492, + "step": 98120 + }, + { + "epoch": 4.06, + "grad_norm": 0.44140625, + "learning_rate": 0.0004777326911904786, + "loss": 0.1691, + "step": 98130 + }, + { + "epoch": 4.06, + "grad_norm": 0.296875, + "learning_rate": 0.0004777282166906637, + "loss": 0.1624, + "step": 98140 + }, + { + "epoch": 4.07, + "grad_norm": 0.515625, + "learning_rate": 0.00047772374176228885, + "loss": 0.1728, + "step": 98150 + }, + { + "epoch": 4.07, + "grad_norm": 1.515625, + "learning_rate": 0.0004777192664053625, + "loss": 0.1766, + "step": 98160 + }, + { + "epoch": 4.07, + "grad_norm": 1.09375, + "learning_rate": 0.0004777147906198931, + "loss": 0.205, + "step": 98170 + }, + { + "epoch": 4.07, + "grad_norm": 0.6171875, + "learning_rate": 0.0004777103144058891, + "loss": 0.1836, + "step": 98180 + }, + { + "epoch": 4.07, + "grad_norm": 0.734375, + "learning_rate": 0.00047770583776335883, + "loss": 0.2449, + "step": 98190 + }, + { + "epoch": 4.07, + "grad_norm": 0.875, + "learning_rate": 0.0004777013606923108, + "loss": 0.1952, + "step": 98200 + }, + { + "epoch": 4.07, + "grad_norm": 0.55859375, + "learning_rate": 0.0004776968831927534, + "loss": 0.2371, + "step": 98210 + }, + { + "epoch": 4.07, + "grad_norm": 1.71875, + "learning_rate": 0.00047769240526469506, + "loss": 0.2093, + "step": 98220 + }, + { + "epoch": 4.07, + "grad_norm": 1.0234375, + "learning_rate": 0.0004776879269081441, + "loss": 0.1852, + "step": 98230 + }, + { + "epoch": 4.07, + "grad_norm": 0.376953125, + "learning_rate": 0.00047768344812310916, + "loss": 0.2194, + "step": 98240 + }, + { + "epoch": 4.07, + "grad_norm": 0.70703125, + "learning_rate": 0.0004776789689095985, + "loss": 0.2102, + "step": 98250 + }, + { + "epoch": 4.07, + "grad_norm": 0.52734375, + "learning_rate": 0.0004776744892676207, + "loss": 0.2358, + "step": 98260 + }, + { + "epoch": 4.07, + "grad_norm": 0.68359375, + "learning_rate": 0.000477670009197184, + "loss": 0.1893, + "step": 98270 + }, + { + "epoch": 4.07, + "grad_norm": 0.85546875, + "learning_rate": 0.00047766552869829695, + "loss": 0.2384, + "step": 98280 + }, + { + "epoch": 4.07, + "grad_norm": 1.03125, + "learning_rate": 0.00047766104777096795, + "loss": 0.1855, + "step": 98290 + }, + { + "epoch": 4.07, + "grad_norm": 0.53515625, + "learning_rate": 0.0004776565664152055, + "loss": 0.1929, + "step": 98300 + }, + { + "epoch": 4.07, + "grad_norm": 1.3984375, + "learning_rate": 0.000477652084631018, + "loss": 0.2421, + "step": 98310 + }, + { + "epoch": 4.07, + "grad_norm": 1.1484375, + "learning_rate": 0.00047764760241841375, + "loss": 0.2324, + "step": 98320 + }, + { + "epoch": 4.07, + "grad_norm": 0.765625, + "learning_rate": 0.0004776431197774013, + "loss": 0.2049, + "step": 98330 + }, + { + "epoch": 4.07, + "grad_norm": 0.578125, + "learning_rate": 0.0004776386367079892, + "loss": 0.2218, + "step": 98340 + }, + { + "epoch": 4.07, + "grad_norm": 1.3359375, + "learning_rate": 0.00047763415321018564, + "loss": 0.2227, + "step": 98350 + }, + { + "epoch": 4.07, + "grad_norm": 0.3359375, + "learning_rate": 0.0004776296692839993, + "loss": 0.1827, + "step": 98360 + }, + { + "epoch": 4.07, + "grad_norm": 0.39453125, + "learning_rate": 0.00047762518492943843, + "loss": 0.1935, + "step": 98370 + }, + { + "epoch": 4.07, + "grad_norm": 0.7109375, + "learning_rate": 0.0004776207001465116, + "loss": 0.2001, + "step": 98380 + }, + { + "epoch": 4.08, + "grad_norm": 0.81640625, + "learning_rate": 0.0004776162149352271, + "loss": 0.2527, + "step": 98390 + }, + { + "epoch": 4.08, + "grad_norm": 0.4375, + "learning_rate": 0.00047761172929559355, + "loss": 0.2065, + "step": 98400 + }, + { + "epoch": 4.08, + "grad_norm": 0.62890625, + "learning_rate": 0.00047760724322761926, + "loss": 0.2767, + "step": 98410 + }, + { + "epoch": 4.08, + "grad_norm": 1.625, + "learning_rate": 0.0004776027567313127, + "loss": 0.2396, + "step": 98420 + }, + { + "epoch": 4.08, + "grad_norm": 0.44921875, + "learning_rate": 0.0004775982698066824, + "loss": 0.2404, + "step": 98430 + }, + { + "epoch": 4.08, + "grad_norm": 0.1943359375, + "learning_rate": 0.0004775937824537367, + "loss": 0.1997, + "step": 98440 + }, + { + "epoch": 4.08, + "grad_norm": 0.345703125, + "learning_rate": 0.00047758929467248406, + "loss": 0.2197, + "step": 98450 + }, + { + "epoch": 4.08, + "grad_norm": 0.431640625, + "learning_rate": 0.00047758480646293303, + "loss": 0.2125, + "step": 98460 + }, + { + "epoch": 4.08, + "grad_norm": 0.427734375, + "learning_rate": 0.0004775803178250919, + "loss": 0.2115, + "step": 98470 + }, + { + "epoch": 4.08, + "grad_norm": 0.984375, + "learning_rate": 0.00047757582875896916, + "loss": 0.1915, + "step": 98480 + }, + { + "epoch": 4.08, + "grad_norm": 0.75390625, + "learning_rate": 0.0004775713392645733, + "loss": 0.205, + "step": 98490 + }, + { + "epoch": 4.08, + "grad_norm": 0.79296875, + "learning_rate": 0.00047756684934191273, + "loss": 0.2076, + "step": 98500 + }, + { + "epoch": 4.08, + "grad_norm": 0.8828125, + "learning_rate": 0.00047756235899099606, + "loss": 0.2049, + "step": 98510 + }, + { + "epoch": 4.08, + "grad_norm": 0.361328125, + "learning_rate": 0.00047755786821183146, + "loss": 0.2497, + "step": 98520 + }, + { + "epoch": 4.08, + "grad_norm": 0.71875, + "learning_rate": 0.0004775533770044276, + "loss": 0.2053, + "step": 98530 + }, + { + "epoch": 4.08, + "grad_norm": 0.61328125, + "learning_rate": 0.0004775488853687928, + "loss": 0.2099, + "step": 98540 + }, + { + "epoch": 4.08, + "grad_norm": 0.412109375, + "learning_rate": 0.00047754439330493565, + "loss": 0.2134, + "step": 98550 + }, + { + "epoch": 4.08, + "grad_norm": 0.5546875, + "learning_rate": 0.0004775399008128645, + "loss": 0.23, + "step": 98560 + }, + { + "epoch": 4.08, + "grad_norm": 0.80859375, + "learning_rate": 0.0004775354078925878, + "loss": 0.2541, + "step": 98570 + }, + { + "epoch": 4.08, + "grad_norm": 1.7265625, + "learning_rate": 0.00047753091454411404, + "loss": 0.1736, + "step": 98580 + }, + { + "epoch": 4.08, + "grad_norm": 0.8359375, + "learning_rate": 0.0004775264207674517, + "loss": 0.1767, + "step": 98590 + }, + { + "epoch": 4.08, + "grad_norm": 0.83984375, + "learning_rate": 0.0004775219265626092, + "loss": 0.2089, + "step": 98600 + }, + { + "epoch": 4.08, + "grad_norm": 0.94921875, + "learning_rate": 0.00047751743192959496, + "loss": 0.2386, + "step": 98610 + }, + { + "epoch": 4.08, + "grad_norm": 0.35546875, + "learning_rate": 0.0004775129368684175, + "loss": 0.2132, + "step": 98620 + }, + { + "epoch": 4.09, + "grad_norm": 0.67578125, + "learning_rate": 0.00047750844137908527, + "loss": 0.2314, + "step": 98630 + }, + { + "epoch": 4.09, + "grad_norm": 0.5625, + "learning_rate": 0.0004775039454616067, + "loss": 0.2237, + "step": 98640 + }, + { + "epoch": 4.09, + "grad_norm": 1.3203125, + "learning_rate": 0.0004774994491159903, + "loss": 0.2225, + "step": 98650 + }, + { + "epoch": 4.09, + "grad_norm": 0.6796875, + "learning_rate": 0.00047749495234224447, + "loss": 0.2628, + "step": 98660 + }, + { + "epoch": 4.09, + "grad_norm": 0.546875, + "learning_rate": 0.0004774904551403778, + "loss": 0.2181, + "step": 98670 + }, + { + "epoch": 4.09, + "grad_norm": 0.65234375, + "learning_rate": 0.0004774859575103986, + "loss": 0.2358, + "step": 98680 + }, + { + "epoch": 4.09, + "grad_norm": 0.85546875, + "learning_rate": 0.00047748145945231536, + "loss": 0.2194, + "step": 98690 + }, + { + "epoch": 4.09, + "grad_norm": 0.8671875, + "learning_rate": 0.00047747696096613664, + "loss": 0.2412, + "step": 98700 + }, + { + "epoch": 4.09, + "grad_norm": 1.5234375, + "learning_rate": 0.0004774724620518708, + "loss": 0.2859, + "step": 98710 + }, + { + "epoch": 4.09, + "grad_norm": 0.53515625, + "learning_rate": 0.00047746796270952644, + "loss": 0.2264, + "step": 98720 + }, + { + "epoch": 4.09, + "grad_norm": 0.451171875, + "learning_rate": 0.0004774634629391118, + "loss": 0.175, + "step": 98730 + }, + { + "epoch": 4.09, + "grad_norm": 0.3125, + "learning_rate": 0.0004774589627406356, + "loss": 0.1645, + "step": 98740 + }, + { + "epoch": 4.09, + "grad_norm": 1.8046875, + "learning_rate": 0.00047745446211410614, + "loss": 0.1823, + "step": 98750 + }, + { + "epoch": 4.09, + "grad_norm": 0.6640625, + "learning_rate": 0.000477449961059532, + "loss": 0.1695, + "step": 98760 + }, + { + "epoch": 4.09, + "grad_norm": 0.58203125, + "learning_rate": 0.00047744545957692156, + "loss": 0.1804, + "step": 98770 + }, + { + "epoch": 4.09, + "grad_norm": 0.443359375, + "learning_rate": 0.0004774409576662833, + "loss": 0.23, + "step": 98780 + }, + { + "epoch": 4.09, + "grad_norm": 1.0859375, + "learning_rate": 0.0004774364553276257, + "loss": 0.1947, + "step": 98790 + }, + { + "epoch": 4.09, + "grad_norm": 0.671875, + "learning_rate": 0.0004774319525609573, + "loss": 0.1945, + "step": 98800 + }, + { + "epoch": 4.09, + "grad_norm": 0.69140625, + "learning_rate": 0.0004774274493662865, + "loss": 0.2628, + "step": 98810 + }, + { + "epoch": 4.09, + "grad_norm": 1.4140625, + "learning_rate": 0.0004774229457436218, + "loss": 0.1748, + "step": 98820 + }, + { + "epoch": 4.09, + "grad_norm": 0.6171875, + "learning_rate": 0.0004774184416929717, + "loss": 0.2233, + "step": 98830 + }, + { + "epoch": 4.09, + "grad_norm": 1.875, + "learning_rate": 0.0004774139372143447, + "loss": 0.1573, + "step": 98840 + }, + { + "epoch": 4.09, + "grad_norm": 1.125, + "learning_rate": 0.00047740943230774915, + "loss": 0.211, + "step": 98850 + }, + { + "epoch": 4.09, + "grad_norm": 0.59375, + "learning_rate": 0.0004774049269731936, + "loss": 0.1767, + "step": 98860 + }, + { + "epoch": 4.1, + "grad_norm": 0.416015625, + "learning_rate": 0.00047740042121068656, + "loss": 0.208, + "step": 98870 + }, + { + "epoch": 4.1, + "grad_norm": 0.27734375, + "learning_rate": 0.0004773959150202365, + "loss": 0.1998, + "step": 98880 + }, + { + "epoch": 4.1, + "grad_norm": 0.416015625, + "learning_rate": 0.00047739140840185186, + "loss": 0.2031, + "step": 98890 + }, + { + "epoch": 4.1, + "grad_norm": 0.62890625, + "learning_rate": 0.0004773869013555412, + "loss": 0.2301, + "step": 98900 + }, + { + "epoch": 4.1, + "grad_norm": 0.69921875, + "learning_rate": 0.00047738239388131286, + "loss": 0.1786, + "step": 98910 + }, + { + "epoch": 4.1, + "grad_norm": 0.9140625, + "learning_rate": 0.00047737788597917546, + "loss": 0.2367, + "step": 98920 + }, + { + "epoch": 4.1, + "grad_norm": 0.94921875, + "learning_rate": 0.00047737337764913745, + "loss": 0.1912, + "step": 98930 + }, + { + "epoch": 4.1, + "grad_norm": 1.203125, + "learning_rate": 0.00047736886889120724, + "loss": 0.2024, + "step": 98940 + }, + { + "epoch": 4.1, + "grad_norm": 0.80078125, + "learning_rate": 0.00047736435970539343, + "loss": 0.2342, + "step": 98950 + }, + { + "epoch": 4.1, + "grad_norm": 1.046875, + "learning_rate": 0.00047735985009170445, + "loss": 0.2302, + "step": 98960 + }, + { + "epoch": 4.1, + "grad_norm": 1.2265625, + "learning_rate": 0.00047735534005014874, + "loss": 0.1795, + "step": 98970 + }, + { + "epoch": 4.1, + "grad_norm": 0.62109375, + "learning_rate": 0.0004773508295807348, + "loss": 0.254, + "step": 98980 + }, + { + "epoch": 4.1, + "grad_norm": 0.9453125, + "learning_rate": 0.00047734631868347123, + "loss": 0.2042, + "step": 98990 + }, + { + "epoch": 4.1, + "grad_norm": 1.9140625, + "learning_rate": 0.0004773418073583664, + "loss": 0.2475, + "step": 99000 + }, + { + "epoch": 4.1, + "grad_norm": 0.7109375, + "learning_rate": 0.0004773372956054289, + "loss": 0.1631, + "step": 99010 + }, + { + "epoch": 4.1, + "grad_norm": 0.2041015625, + "learning_rate": 0.0004773327834246671, + "loss": 0.181, + "step": 99020 + }, + { + "epoch": 4.1, + "grad_norm": 0.85546875, + "learning_rate": 0.00047732827081608956, + "loss": 0.1943, + "step": 99030 + }, + { + "epoch": 4.1, + "grad_norm": 0.5859375, + "learning_rate": 0.00047732375777970475, + "loss": 0.2456, + "step": 99040 + }, + { + "epoch": 4.1, + "grad_norm": 0.4140625, + "learning_rate": 0.0004773192443155212, + "loss": 0.2101, + "step": 99050 + }, + { + "epoch": 4.1, + "grad_norm": 0.60546875, + "learning_rate": 0.0004773147304235474, + "loss": 0.2202, + "step": 99060 + }, + { + "epoch": 4.1, + "grad_norm": 0.7734375, + "learning_rate": 0.00047731021610379176, + "loss": 0.2145, + "step": 99070 + }, + { + "epoch": 4.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00047730570135626297, + "loss": 0.2208, + "step": 99080 + }, + { + "epoch": 4.1, + "grad_norm": 0.68359375, + "learning_rate": 0.0004773011861809693, + "loss": 0.1889, + "step": 99090 + }, + { + "epoch": 4.1, + "grad_norm": 0.50390625, + "learning_rate": 0.0004772966705779193, + "loss": 0.1673, + "step": 99100 + }, + { + "epoch": 4.11, + "grad_norm": 0.4296875, + "learning_rate": 0.0004772921545471216, + "loss": 0.2792, + "step": 99110 + }, + { + "epoch": 4.11, + "grad_norm": 0.3671875, + "learning_rate": 0.00047728763808858453, + "loss": 0.2353, + "step": 99120 + }, + { + "epoch": 4.11, + "grad_norm": 0.6796875, + "learning_rate": 0.00047728312120231674, + "loss": 0.2125, + "step": 99130 + }, + { + "epoch": 4.11, + "grad_norm": 0.79296875, + "learning_rate": 0.0004772786038883267, + "loss": 0.2239, + "step": 99140 + }, + { + "epoch": 4.11, + "grad_norm": 1.40625, + "learning_rate": 0.0004772740861466227, + "loss": 0.1912, + "step": 99150 + }, + { + "epoch": 4.11, + "grad_norm": 0.6015625, + "learning_rate": 0.00047726956797721355, + "loss": 0.1997, + "step": 99160 + }, + { + "epoch": 4.11, + "grad_norm": 0.50390625, + "learning_rate": 0.00047726504938010763, + "loss": 0.2067, + "step": 99170 + }, + { + "epoch": 4.11, + "grad_norm": 0.640625, + "learning_rate": 0.00047726053035531334, + "loss": 0.2374, + "step": 99180 + }, + { + "epoch": 4.11, + "grad_norm": 0.73828125, + "learning_rate": 0.0004772560109028393, + "loss": 0.2117, + "step": 99190 + }, + { + "epoch": 4.11, + "grad_norm": 0.150390625, + "learning_rate": 0.00047725149102269406, + "loss": 0.2054, + "step": 99200 + }, + { + "epoch": 4.11, + "grad_norm": 0.5625, + "learning_rate": 0.000477246970714886, + "loss": 0.1926, + "step": 99210 + }, + { + "epoch": 4.11, + "grad_norm": 0.62109375, + "learning_rate": 0.0004772424499794237, + "loss": 0.2491, + "step": 99220 + }, + { + "epoch": 4.11, + "grad_norm": 0.96484375, + "learning_rate": 0.0004772379288163156, + "loss": 0.2454, + "step": 99230 + }, + { + "epoch": 4.11, + "grad_norm": 0.357421875, + "learning_rate": 0.0004772334072255703, + "loss": 0.2089, + "step": 99240 + }, + { + "epoch": 4.11, + "grad_norm": 0.56640625, + "learning_rate": 0.0004772288852071963, + "loss": 0.216, + "step": 99250 + }, + { + "epoch": 4.11, + "grad_norm": 2.1875, + "learning_rate": 0.00047722436276120193, + "loss": 0.2161, + "step": 99260 + }, + { + "epoch": 4.11, + "grad_norm": 0.470703125, + "learning_rate": 0.00047721983988759597, + "loss": 0.2068, + "step": 99270 + }, + { + "epoch": 4.11, + "grad_norm": 0.54296875, + "learning_rate": 0.00047721531658638684, + "loss": 0.1914, + "step": 99280 + }, + { + "epoch": 4.11, + "grad_norm": 0.64453125, + "learning_rate": 0.00047721079285758297, + "loss": 0.218, + "step": 99290 + }, + { + "epoch": 4.11, + "grad_norm": 0.21484375, + "learning_rate": 0.00047720626870119295, + "loss": 0.2193, + "step": 99300 + }, + { + "epoch": 4.11, + "grad_norm": 0.875, + "learning_rate": 0.0004772017441172252, + "loss": 0.2121, + "step": 99310 + }, + { + "epoch": 4.11, + "grad_norm": 0.29296875, + "learning_rate": 0.00047719721910568835, + "loss": 0.2194, + "step": 99320 + }, + { + "epoch": 4.11, + "grad_norm": 0.29296875, + "learning_rate": 0.0004771926936665909, + "loss": 0.2284, + "step": 99330 + }, + { + "epoch": 4.11, + "grad_norm": 0.00131988525390625, + "learning_rate": 0.0004771881677999413, + "loss": 0.1545, + "step": 99340 + }, + { + "epoch": 4.12, + "grad_norm": 1.671875, + "learning_rate": 0.0004771836415057481, + "loss": 0.2294, + "step": 99350 + }, + { + "epoch": 4.12, + "grad_norm": 0.326171875, + "learning_rate": 0.0004771791147840198, + "loss": 0.1794, + "step": 99360 + }, + { + "epoch": 4.12, + "grad_norm": 1.5859375, + "learning_rate": 0.000477174587634765, + "loss": 0.2279, + "step": 99370 + }, + { + "epoch": 4.12, + "grad_norm": 1.3203125, + "learning_rate": 0.0004771700600579921, + "loss": 0.2204, + "step": 99380 + }, + { + "epoch": 4.12, + "grad_norm": 0.9609375, + "learning_rate": 0.0004771655320537097, + "loss": 0.2089, + "step": 99390 + }, + { + "epoch": 4.12, + "grad_norm": 0.921875, + "learning_rate": 0.0004771610036219263, + "loss": 0.196, + "step": 99400 + }, + { + "epoch": 4.12, + "grad_norm": 0.34375, + "learning_rate": 0.00047715647476265043, + "loss": 0.198, + "step": 99410 + }, + { + "epoch": 4.12, + "grad_norm": 0.71484375, + "learning_rate": 0.0004771519454758906, + "loss": 0.1949, + "step": 99420 + }, + { + "epoch": 4.12, + "grad_norm": 0.322265625, + "learning_rate": 0.00047714741576165533, + "loss": 0.2447, + "step": 99430 + }, + { + "epoch": 4.12, + "grad_norm": 0.474609375, + "learning_rate": 0.00047714288561995325, + "loss": 0.1816, + "step": 99440 + }, + { + "epoch": 4.12, + "grad_norm": 0.3359375, + "learning_rate": 0.00047713835505079264, + "loss": 0.1558, + "step": 99450 + }, + { + "epoch": 4.12, + "grad_norm": 0.94140625, + "learning_rate": 0.0004771338240541823, + "loss": 0.1883, + "step": 99460 + }, + { + "epoch": 4.12, + "grad_norm": 0.6484375, + "learning_rate": 0.0004771292926301305, + "loss": 0.1995, + "step": 99470 + }, + { + "epoch": 4.12, + "grad_norm": 0.77734375, + "learning_rate": 0.00047712476077864597, + "loss": 0.2154, + "step": 99480 + }, + { + "epoch": 4.12, + "grad_norm": 0.734375, + "learning_rate": 0.0004771202284997372, + "loss": 0.194, + "step": 99490 + }, + { + "epoch": 4.12, + "grad_norm": 1.4453125, + "learning_rate": 0.00047711569579341265, + "loss": 0.2442, + "step": 99500 + }, + { + "epoch": 4.12, + "grad_norm": 0.625, + "learning_rate": 0.00047711116265968087, + "loss": 0.182, + "step": 99510 + }, + { + "epoch": 4.12, + "grad_norm": 0.77734375, + "learning_rate": 0.0004771066290985504, + "loss": 0.1697, + "step": 99520 + }, + { + "epoch": 4.12, + "grad_norm": 4.125, + "learning_rate": 0.00047710209511002985, + "loss": 0.1968, + "step": 99530 + }, + { + "epoch": 4.12, + "grad_norm": 0.5, + "learning_rate": 0.0004770975606941276, + "loss": 0.1806, + "step": 99540 + }, + { + "epoch": 4.12, + "grad_norm": 0.85546875, + "learning_rate": 0.0004770930258508523, + "loss": 0.2216, + "step": 99550 + }, + { + "epoch": 4.12, + "grad_norm": 0.9296875, + "learning_rate": 0.0004770884905802125, + "loss": 0.2182, + "step": 99560 + }, + { + "epoch": 4.12, + "grad_norm": 1.5390625, + "learning_rate": 0.00047708395488221664, + "loss": 0.1752, + "step": 99570 + }, + { + "epoch": 4.12, + "grad_norm": 0.3671875, + "learning_rate": 0.0004770794187568733, + "loss": 0.2256, + "step": 99580 + }, + { + "epoch": 4.13, + "grad_norm": 0.98046875, + "learning_rate": 0.0004770748822041909, + "loss": 0.186, + "step": 99590 + }, + { + "epoch": 4.13, + "grad_norm": 0.74609375, + "learning_rate": 0.00047707034522417825, + "loss": 0.1676, + "step": 99600 + }, + { + "epoch": 4.13, + "grad_norm": 1.1875, + "learning_rate": 0.00047706580781684373, + "loss": 0.2254, + "step": 99610 + }, + { + "epoch": 4.13, + "grad_norm": 0.328125, + "learning_rate": 0.0004770612699821958, + "loss": 0.2371, + "step": 99620 + }, + { + "epoch": 4.13, + "grad_norm": 0.7265625, + "learning_rate": 0.0004770567317202431, + "loss": 0.2398, + "step": 99630 + }, + { + "epoch": 4.13, + "grad_norm": 2.109375, + "learning_rate": 0.00047705219303099424, + "loss": 0.2369, + "step": 99640 + }, + { + "epoch": 4.13, + "grad_norm": 0.7265625, + "learning_rate": 0.0004770476539144576, + "loss": 0.2037, + "step": 99650 + }, + { + "epoch": 4.13, + "grad_norm": 0.34375, + "learning_rate": 0.0004770431143706417, + "loss": 0.1788, + "step": 99660 + }, + { + "epoch": 4.13, + "grad_norm": 0.9765625, + "learning_rate": 0.0004770385743995553, + "loss": 0.2051, + "step": 99670 + }, + { + "epoch": 4.13, + "grad_norm": 1.4296875, + "learning_rate": 0.00047703403400120684, + "loss": 0.212, + "step": 99680 + }, + { + "epoch": 4.13, + "grad_norm": 0.333984375, + "learning_rate": 0.00047702949317560486, + "loss": 0.2033, + "step": 99690 + }, + { + "epoch": 4.13, + "grad_norm": 0.97265625, + "learning_rate": 0.00047702495192275773, + "loss": 0.2038, + "step": 99700 + }, + { + "epoch": 4.13, + "grad_norm": 0.578125, + "learning_rate": 0.0004770204102426743, + "loss": 0.2357, + "step": 99710 + }, + { + "epoch": 4.13, + "grad_norm": 1.375, + "learning_rate": 0.0004770158681353629, + "loss": 0.1652, + "step": 99720 + }, + { + "epoch": 4.13, + "grad_norm": 0.734375, + "learning_rate": 0.0004770113256008322, + "loss": 0.212, + "step": 99730 + }, + { + "epoch": 4.13, + "grad_norm": 1.390625, + "learning_rate": 0.0004770067826390907, + "loss": 0.278, + "step": 99740 + }, + { + "epoch": 4.13, + "grad_norm": 0.3515625, + "learning_rate": 0.00047700223925014695, + "loss": 0.1676, + "step": 99750 + }, + { + "epoch": 4.13, + "grad_norm": 0.5078125, + "learning_rate": 0.0004769976954340095, + "loss": 0.2102, + "step": 99760 + }, + { + "epoch": 4.13, + "grad_norm": 0.69140625, + "learning_rate": 0.00047699315119068686, + "loss": 0.2222, + "step": 99770 + }, + { + "epoch": 4.13, + "grad_norm": 0.53125, + "learning_rate": 0.0004769886065201876, + "loss": 0.1822, + "step": 99780 + }, + { + "epoch": 4.13, + "grad_norm": 0.921875, + "learning_rate": 0.0004769840614225204, + "loss": 0.205, + "step": 99790 + }, + { + "epoch": 4.13, + "grad_norm": 0.5078125, + "learning_rate": 0.00047697951589769364, + "loss": 0.2792, + "step": 99800 + }, + { + "epoch": 4.13, + "grad_norm": 0.78125, + "learning_rate": 0.000476974969945716, + "loss": 0.2261, + "step": 99810 + }, + { + "epoch": 4.13, + "grad_norm": 0.7265625, + "learning_rate": 0.0004769704235665959, + "loss": 0.2063, + "step": 99820 + }, + { + "epoch": 4.13, + "grad_norm": 0.62890625, + "learning_rate": 0.000476965876760342, + "loss": 0.1809, + "step": 99830 + }, + { + "epoch": 4.14, + "grad_norm": 0.93359375, + "learning_rate": 0.0004769613295269628, + "loss": 0.2275, + "step": 99840 + }, + { + "epoch": 4.14, + "grad_norm": 0.361328125, + "learning_rate": 0.000476956781866467, + "loss": 0.2138, + "step": 99850 + }, + { + "epoch": 4.14, + "grad_norm": 0.7734375, + "learning_rate": 0.0004769522337788629, + "loss": 0.18, + "step": 99860 + }, + { + "epoch": 4.14, + "grad_norm": 0.69140625, + "learning_rate": 0.0004769476852641593, + "loss": 0.2557, + "step": 99870 + }, + { + "epoch": 4.14, + "grad_norm": 0.796875, + "learning_rate": 0.00047694313632236466, + "loss": 0.2348, + "step": 99880 + }, + { + "epoch": 4.14, + "grad_norm": 0.7890625, + "learning_rate": 0.0004769385869534875, + "loss": 0.2076, + "step": 99890 + }, + { + "epoch": 4.14, + "grad_norm": 0.64453125, + "learning_rate": 0.00047693403715753647, + "loss": 0.2312, + "step": 99900 + }, + { + "epoch": 4.14, + "grad_norm": 0.494140625, + "learning_rate": 0.00047692948693452006, + "loss": 0.1606, + "step": 99910 + }, + { + "epoch": 4.14, + "grad_norm": 1.0703125, + "learning_rate": 0.0004769249362844469, + "loss": 0.2722, + "step": 99920 + }, + { + "epoch": 4.14, + "grad_norm": 0.5, + "learning_rate": 0.00047692038520732544, + "loss": 0.2333, + "step": 99930 + }, + { + "epoch": 4.14, + "grad_norm": 0.96875, + "learning_rate": 0.0004769158337031644, + "loss": 0.1651, + "step": 99940 + }, + { + "epoch": 4.14, + "grad_norm": 0.86328125, + "learning_rate": 0.0004769112817719722, + "loss": 0.2336, + "step": 99950 + }, + { + "epoch": 4.14, + "grad_norm": 0.5234375, + "learning_rate": 0.0004769067294137575, + "loss": 0.2397, + "step": 99960 + }, + { + "epoch": 4.14, + "grad_norm": 1.453125, + "learning_rate": 0.0004769021766285289, + "loss": 0.2469, + "step": 99970 + }, + { + "epoch": 4.14, + "grad_norm": 0.76953125, + "learning_rate": 0.00047689762341629477, + "loss": 0.2406, + "step": 99980 + }, + { + "epoch": 4.14, + "grad_norm": 0.75, + "learning_rate": 0.00047689306977706394, + "loss": 0.2232, + "step": 99990 + }, + { + "epoch": 4.14, + "grad_norm": 0.859375, + "learning_rate": 0.0004768885157108448, + "loss": 0.159, + "step": 100000 + }, + { + "epoch": 4.14, + "grad_norm": 0.57421875, + "learning_rate": 0.000476883961217646, + "loss": 0.2042, + "step": 100010 + }, + { + "epoch": 4.14, + "grad_norm": 0.49609375, + "learning_rate": 0.00047687940629747606, + "loss": 0.2118, + "step": 100020 + }, + { + "epoch": 4.14, + "grad_norm": 0.53515625, + "learning_rate": 0.00047687485095034357, + "loss": 0.2263, + "step": 100030 + }, + { + "epoch": 4.14, + "grad_norm": 0.83984375, + "learning_rate": 0.00047687029517625714, + "loss": 0.2486, + "step": 100040 + }, + { + "epoch": 4.14, + "grad_norm": 0.859375, + "learning_rate": 0.00047686573897522534, + "loss": 0.1759, + "step": 100050 + }, + { + "epoch": 4.14, + "grad_norm": 1.953125, + "learning_rate": 0.0004768611823472566, + "loss": 0.1896, + "step": 100060 + }, + { + "epoch": 4.14, + "grad_norm": 0.373046875, + "learning_rate": 0.0004768566252923597, + "loss": 0.2257, + "step": 100070 + }, + { + "epoch": 4.15, + "grad_norm": 0.33984375, + "learning_rate": 0.0004768520678105432, + "loss": 0.2127, + "step": 100080 + }, + { + "epoch": 4.15, + "grad_norm": 0.86328125, + "learning_rate": 0.0004768475099018155, + "loss": 0.1834, + "step": 100090 + }, + { + "epoch": 4.15, + "grad_norm": 0.59765625, + "learning_rate": 0.0004768429515661853, + "loss": 0.1845, + "step": 100100 + }, + { + "epoch": 4.15, + "grad_norm": 0.193359375, + "learning_rate": 0.0004768383928036612, + "loss": 0.1823, + "step": 100110 + }, + { + "epoch": 4.15, + "grad_norm": 1.9765625, + "learning_rate": 0.0004768338336142517, + "loss": 0.2616, + "step": 100120 + }, + { + "epoch": 4.15, + "grad_norm": 0.6484375, + "learning_rate": 0.0004768292739979655, + "loss": 0.2218, + "step": 100130 + }, + { + "epoch": 4.15, + "grad_norm": 0.640625, + "learning_rate": 0.000476824713954811, + "loss": 0.2634, + "step": 100140 + }, + { + "epoch": 4.15, + "grad_norm": 0.59375, + "learning_rate": 0.0004768201534847969, + "loss": 0.2311, + "step": 100150 + }, + { + "epoch": 4.15, + "grad_norm": 0.640625, + "learning_rate": 0.00047681559258793186, + "loss": 0.1867, + "step": 100160 + }, + { + "epoch": 4.15, + "grad_norm": 0.84375, + "learning_rate": 0.00047681103126422434, + "loss": 0.1654, + "step": 100170 + }, + { + "epoch": 4.15, + "grad_norm": 0.74609375, + "learning_rate": 0.0004768064695136829, + "loss": 0.1952, + "step": 100180 + }, + { + "epoch": 4.15, + "grad_norm": 0.71484375, + "learning_rate": 0.0004768019073363161, + "loss": 0.203, + "step": 100190 + }, + { + "epoch": 4.15, + "grad_norm": 1.09375, + "learning_rate": 0.00047679734473213276, + "loss": 0.2211, + "step": 100200 + }, + { + "epoch": 4.15, + "grad_norm": 0.7890625, + "learning_rate": 0.00047679278170114123, + "loss": 0.221, + "step": 100210 + }, + { + "epoch": 4.15, + "grad_norm": 0.78125, + "learning_rate": 0.00047678821824335017, + "loss": 0.169, + "step": 100220 + }, + { + "epoch": 4.15, + "grad_norm": 0.423828125, + "learning_rate": 0.00047678365435876817, + "loss": 0.1742, + "step": 100230 + }, + { + "epoch": 4.15, + "grad_norm": 0.498046875, + "learning_rate": 0.00047677909004740387, + "loss": 0.1746, + "step": 100240 + }, + { + "epoch": 4.15, + "grad_norm": 1.15625, + "learning_rate": 0.00047677452530926577, + "loss": 0.2117, + "step": 100250 + }, + { + "epoch": 4.15, + "grad_norm": 0.80859375, + "learning_rate": 0.0004767699601443626, + "loss": 0.2249, + "step": 100260 + }, + { + "epoch": 4.15, + "grad_norm": 0.515625, + "learning_rate": 0.0004767653945527027, + "loss": 0.2193, + "step": 100270 + }, + { + "epoch": 4.15, + "grad_norm": 0.466796875, + "learning_rate": 0.00047676082853429495, + "loss": 0.217, + "step": 100280 + }, + { + "epoch": 4.15, + "grad_norm": 0.361328125, + "learning_rate": 0.00047675626208914775, + "loss": 0.1734, + "step": 100290 + }, + { + "epoch": 4.15, + "grad_norm": 0.494140625, + "learning_rate": 0.00047675169521726974, + "loss": 0.1899, + "step": 100300 + }, + { + "epoch": 4.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00047674712791866955, + "loss": 0.2451, + "step": 100310 + }, + { + "epoch": 4.16, + "grad_norm": 0.75, + "learning_rate": 0.00047674256019335573, + "loss": 0.1837, + "step": 100320 + }, + { + "epoch": 4.16, + "grad_norm": 0.75390625, + "learning_rate": 0.00047673799204133696, + "loss": 0.2066, + "step": 100330 + }, + { + "epoch": 4.16, + "grad_norm": 1.328125, + "learning_rate": 0.00047673342346262165, + "loss": 0.2266, + "step": 100340 + }, + { + "epoch": 4.16, + "grad_norm": 0.0, + "learning_rate": 0.00047672885445721857, + "loss": 0.2616, + "step": 100350 + }, + { + "epoch": 4.16, + "grad_norm": 0.73828125, + "learning_rate": 0.00047672428502513634, + "loss": 0.1876, + "step": 100360 + }, + { + "epoch": 4.16, + "grad_norm": 0.52734375, + "learning_rate": 0.0004767197151663835, + "loss": 0.1781, + "step": 100370 + }, + { + "epoch": 4.16, + "grad_norm": 1.2734375, + "learning_rate": 0.00047671514488096856, + "loss": 0.1697, + "step": 100380 + }, + { + "epoch": 4.16, + "grad_norm": 0.474609375, + "learning_rate": 0.0004767105741689002, + "loss": 0.2026, + "step": 100390 + }, + { + "epoch": 4.16, + "grad_norm": 0.98046875, + "learning_rate": 0.0004767060030301871, + "loss": 0.2179, + "step": 100400 + }, + { + "epoch": 4.16, + "grad_norm": 0.45703125, + "learning_rate": 0.0004767014314648377, + "loss": 0.1945, + "step": 100410 + }, + { + "epoch": 4.16, + "grad_norm": 0.390625, + "learning_rate": 0.0004766968594728607, + "loss": 0.1963, + "step": 100420 + }, + { + "epoch": 4.16, + "grad_norm": 0.28515625, + "learning_rate": 0.0004766922870542647, + "loss": 0.1758, + "step": 100430 + }, + { + "epoch": 4.16, + "grad_norm": 0.53125, + "learning_rate": 0.0004766877142090583, + "loss": 0.1878, + "step": 100440 + }, + { + "epoch": 4.16, + "grad_norm": 0.765625, + "learning_rate": 0.0004766831409372502, + "loss": 0.2117, + "step": 100450 + }, + { + "epoch": 4.16, + "grad_norm": 2.96875, + "learning_rate": 0.00047667856723884874, + "loss": 0.2061, + "step": 100460 + }, + { + "epoch": 4.16, + "grad_norm": 0.51953125, + "learning_rate": 0.0004766739931138628, + "loss": 0.2237, + "step": 100470 + }, + { + "epoch": 4.16, + "grad_norm": 0.361328125, + "learning_rate": 0.00047666941856230085, + "loss": 0.2457, + "step": 100480 + }, + { + "epoch": 4.16, + "grad_norm": 0.96875, + "learning_rate": 0.00047666484358417155, + "loss": 0.2149, + "step": 100490 + }, + { + "epoch": 4.16, + "grad_norm": 0.81640625, + "learning_rate": 0.0004766602681794835, + "loss": 0.2365, + "step": 100500 + }, + { + "epoch": 4.16, + "grad_norm": 1.0859375, + "learning_rate": 0.00047665569234824523, + "loss": 0.2322, + "step": 100510 + }, + { + "epoch": 4.16, + "grad_norm": 0.455078125, + "learning_rate": 0.0004766511160904655, + "loss": 0.1812, + "step": 100520 + }, + { + "epoch": 4.16, + "grad_norm": 0.55859375, + "learning_rate": 0.00047664653940615283, + "loss": 0.2092, + "step": 100530 + }, + { + "epoch": 4.16, + "grad_norm": 0.515625, + "learning_rate": 0.0004766419622953159, + "loss": 0.2336, + "step": 100540 + }, + { + "epoch": 4.16, + "grad_norm": 0.87109375, + "learning_rate": 0.00047663738475796314, + "loss": 0.2457, + "step": 100550 + }, + { + "epoch": 4.17, + "grad_norm": 0.2138671875, + "learning_rate": 0.00047663280679410337, + "loss": 0.1825, + "step": 100560 + }, + { + "epoch": 4.17, + "grad_norm": 1.0703125, + "learning_rate": 0.0004766282284037452, + "loss": 0.2051, + "step": 100570 + }, + { + "epoch": 4.17, + "grad_norm": 0.79296875, + "learning_rate": 0.0004766236495868971, + "loss": 0.2273, + "step": 100580 + }, + { + "epoch": 4.17, + "grad_norm": 0.1806640625, + "learning_rate": 0.0004766190703435678, + "loss": 0.2022, + "step": 100590 + }, + { + "epoch": 4.17, + "grad_norm": 0.85546875, + "learning_rate": 0.0004766144906737658, + "loss": 0.1832, + "step": 100600 + }, + { + "epoch": 4.17, + "grad_norm": 0.322265625, + "learning_rate": 0.00047660991057749987, + "loss": 0.2026, + "step": 100610 + }, + { + "epoch": 4.17, + "grad_norm": 0.796875, + "learning_rate": 0.0004766053300547786, + "loss": 0.2452, + "step": 100620 + }, + { + "epoch": 4.17, + "grad_norm": 0.5546875, + "learning_rate": 0.0004766007491056105, + "loss": 0.1704, + "step": 100630 + }, + { + "epoch": 4.17, + "grad_norm": 0.73828125, + "learning_rate": 0.00047659616773000423, + "loss": 0.2128, + "step": 100640 + }, + { + "epoch": 4.17, + "grad_norm": 0.78125, + "learning_rate": 0.00047659158592796853, + "loss": 0.2242, + "step": 100650 + }, + { + "epoch": 4.17, + "grad_norm": 1.2578125, + "learning_rate": 0.00047658700369951194, + "loss": 0.2485, + "step": 100660 + }, + { + "epoch": 4.17, + "grad_norm": 2.3125, + "learning_rate": 0.000476582421044643, + "loss": 0.1861, + "step": 100670 + }, + { + "epoch": 4.17, + "grad_norm": 1.3359375, + "learning_rate": 0.0004765778379633704, + "loss": 0.1571, + "step": 100680 + }, + { + "epoch": 4.17, + "grad_norm": 0.77734375, + "learning_rate": 0.00047657325445570287, + "loss": 0.1607, + "step": 100690 + }, + { + "epoch": 4.17, + "grad_norm": 1.4453125, + "learning_rate": 0.00047656867052164886, + "loss": 0.1855, + "step": 100700 + }, + { + "epoch": 4.17, + "grad_norm": 0.76171875, + "learning_rate": 0.00047656408616121717, + "loss": 0.1897, + "step": 100710 + }, + { + "epoch": 4.17, + "grad_norm": 1.046875, + "learning_rate": 0.00047655950137441626, + "loss": 0.2328, + "step": 100720 + }, + { + "epoch": 4.17, + "grad_norm": 0.69140625, + "learning_rate": 0.0004765549161612549, + "loss": 0.242, + "step": 100730 + }, + { + "epoch": 4.17, + "grad_norm": 0.71875, + "learning_rate": 0.00047655033052174155, + "loss": 0.1689, + "step": 100740 + }, + { + "epoch": 4.17, + "grad_norm": 0.65234375, + "learning_rate": 0.00047654574445588505, + "loss": 0.1744, + "step": 100750 + }, + { + "epoch": 4.17, + "grad_norm": 0.71484375, + "learning_rate": 0.0004765411579636939, + "loss": 0.1991, + "step": 100760 + }, + { + "epoch": 4.17, + "grad_norm": 0.470703125, + "learning_rate": 0.00047653657104517665, + "loss": 0.2136, + "step": 100770 + }, + { + "epoch": 4.17, + "grad_norm": 0.40234375, + "learning_rate": 0.0004765319837003422, + "loss": 0.2195, + "step": 100780 + }, + { + "epoch": 4.17, + "grad_norm": 1.0703125, + "learning_rate": 0.0004765273959291989, + "loss": 0.2236, + "step": 100790 + }, + { + "epoch": 4.18, + "grad_norm": 1.1015625, + "learning_rate": 0.0004765228077317556, + "loss": 0.2047, + "step": 100800 + }, + { + "epoch": 4.18, + "grad_norm": 0.2490234375, + "learning_rate": 0.00047651821910802073, + "loss": 0.1997, + "step": 100810 + }, + { + "epoch": 4.18, + "grad_norm": 0.76171875, + "learning_rate": 0.0004765136300580031, + "loss": 0.2093, + "step": 100820 + }, + { + "epoch": 4.18, + "grad_norm": 0.6328125, + "learning_rate": 0.0004765090405817113, + "loss": 0.2265, + "step": 100830 + }, + { + "epoch": 4.18, + "grad_norm": 0.66015625, + "learning_rate": 0.0004765044506791539, + "loss": 0.2337, + "step": 100840 + }, + { + "epoch": 4.18, + "grad_norm": 0.55859375, + "learning_rate": 0.0004764998603503396, + "loss": 0.2214, + "step": 100850 + }, + { + "epoch": 4.18, + "grad_norm": 0.5703125, + "learning_rate": 0.000476495269595277, + "loss": 0.2135, + "step": 100860 + }, + { + "epoch": 4.18, + "grad_norm": 0.76171875, + "learning_rate": 0.00047649067841397483, + "loss": 0.2207, + "step": 100870 + }, + { + "epoch": 4.18, + "grad_norm": 1.5859375, + "learning_rate": 0.00047648608680644157, + "loss": 0.2299, + "step": 100880 + }, + { + "epoch": 4.18, + "grad_norm": 0.6796875, + "learning_rate": 0.000476481494772686, + "loss": 0.2143, + "step": 100890 + }, + { + "epoch": 4.18, + "grad_norm": 0.52734375, + "learning_rate": 0.0004764769023127167, + "loss": 0.2232, + "step": 100900 + }, + { + "epoch": 4.18, + "grad_norm": 1.078125, + "learning_rate": 0.0004764723094265424, + "loss": 0.2781, + "step": 100910 + }, + { + "epoch": 4.18, + "grad_norm": 0.2412109375, + "learning_rate": 0.0004764677161141716, + "loss": 0.2054, + "step": 100920 + }, + { + "epoch": 4.18, + "grad_norm": 1.1328125, + "learning_rate": 0.000476463122375613, + "loss": 0.2636, + "step": 100930 + }, + { + "epoch": 4.18, + "grad_norm": 0.6015625, + "learning_rate": 0.00047645852821087535, + "loss": 0.2493, + "step": 100940 + }, + { + "epoch": 4.18, + "grad_norm": 0.671875, + "learning_rate": 0.0004764539336199671, + "loss": 0.2273, + "step": 100950 + }, + { + "epoch": 4.18, + "grad_norm": 1.359375, + "learning_rate": 0.00047644933860289707, + "loss": 0.1895, + "step": 100960 + }, + { + "epoch": 4.18, + "grad_norm": 0.76953125, + "learning_rate": 0.0004764447431596738, + "loss": 0.2148, + "step": 100970 + }, + { + "epoch": 4.18, + "grad_norm": 1.328125, + "learning_rate": 0.00047644014729030594, + "loss": 0.1735, + "step": 100980 + }, + { + "epoch": 4.18, + "grad_norm": 0.6171875, + "learning_rate": 0.00047643555099480224, + "loss": 0.1748, + "step": 100990 + }, + { + "epoch": 4.18, + "grad_norm": 0.98046875, + "learning_rate": 0.0004764309542731713, + "loss": 0.1647, + "step": 101000 + }, + { + "epoch": 4.18, + "grad_norm": 1.203125, + "learning_rate": 0.0004764263571254217, + "loss": 0.1791, + "step": 101010 + }, + { + "epoch": 4.18, + "grad_norm": 1.84375, + "learning_rate": 0.0004764217595515622, + "loss": 0.2018, + "step": 101020 + }, + { + "epoch": 4.18, + "grad_norm": 0.6796875, + "learning_rate": 0.0004764171615516014, + "loss": 0.2031, + "step": 101030 + }, + { + "epoch": 4.19, + "grad_norm": 2.28125, + "learning_rate": 0.00047641256312554793, + "loss": 0.2054, + "step": 101040 + }, + { + "epoch": 4.19, + "grad_norm": 1.3828125, + "learning_rate": 0.00047640796427341045, + "loss": 0.1998, + "step": 101050 + }, + { + "epoch": 4.19, + "grad_norm": 0.86328125, + "learning_rate": 0.00047640336499519767, + "loss": 0.2065, + "step": 101060 + }, + { + "epoch": 4.19, + "grad_norm": 0.90234375, + "learning_rate": 0.00047639876529091815, + "loss": 0.204, + "step": 101070 + }, + { + "epoch": 4.19, + "grad_norm": 0.62109375, + "learning_rate": 0.0004763941651605806, + "loss": 0.1963, + "step": 101080 + }, + { + "epoch": 4.19, + "grad_norm": 0.74609375, + "learning_rate": 0.0004763895646041937, + "loss": 0.1978, + "step": 101090 + }, + { + "epoch": 4.19, + "grad_norm": 0.400390625, + "learning_rate": 0.00047638496362176615, + "loss": 0.2132, + "step": 101100 + }, + { + "epoch": 4.19, + "grad_norm": 0.73828125, + "learning_rate": 0.0004763803622133065, + "loss": 0.2475, + "step": 101110 + }, + { + "epoch": 4.19, + "grad_norm": 1.1796875, + "learning_rate": 0.00047637576037882346, + "loss": 0.1804, + "step": 101120 + }, + { + "epoch": 4.19, + "grad_norm": 0.5859375, + "learning_rate": 0.0004763711581183257, + "loss": 0.2459, + "step": 101130 + }, + { + "epoch": 4.19, + "grad_norm": 0.56640625, + "learning_rate": 0.00047636655543182183, + "loss": 0.238, + "step": 101140 + }, + { + "epoch": 4.19, + "grad_norm": 0.53515625, + "learning_rate": 0.00047636195231932053, + "loss": 0.1858, + "step": 101150 + }, + { + "epoch": 4.19, + "grad_norm": 0.59375, + "learning_rate": 0.0004763573487808305, + "loss": 0.2329, + "step": 101160 + }, + { + "epoch": 4.19, + "grad_norm": 0.78515625, + "learning_rate": 0.0004763527448163604, + "loss": 0.2178, + "step": 101170 + }, + { + "epoch": 4.19, + "grad_norm": 0.76171875, + "learning_rate": 0.0004763481404259189, + "loss": 0.2373, + "step": 101180 + }, + { + "epoch": 4.19, + "grad_norm": 0.859375, + "learning_rate": 0.0004763435356095146, + "loss": 0.2139, + "step": 101190 + }, + { + "epoch": 4.19, + "grad_norm": 0.6953125, + "learning_rate": 0.00047633893036715623, + "loss": 0.2269, + "step": 101200 + }, + { + "epoch": 4.19, + "grad_norm": 3.609375, + "learning_rate": 0.00047633432469885246, + "loss": 0.188, + "step": 101210 + }, + { + "epoch": 4.19, + "grad_norm": 0.828125, + "learning_rate": 0.0004763297186046119, + "loss": 0.1781, + "step": 101220 + }, + { + "epoch": 4.19, + "grad_norm": 0.53125, + "learning_rate": 0.0004763251120844433, + "loss": 0.2369, + "step": 101230 + }, + { + "epoch": 4.19, + "grad_norm": 0.640625, + "learning_rate": 0.00047632050513835516, + "loss": 0.2007, + "step": 101240 + }, + { + "epoch": 4.19, + "grad_norm": 1.015625, + "learning_rate": 0.0004763158977663564, + "loss": 0.2405, + "step": 101250 + }, + { + "epoch": 4.19, + "grad_norm": 0.1455078125, + "learning_rate": 0.00047631128996845554, + "loss": 0.1975, + "step": 101260 + }, + { + "epoch": 4.19, + "grad_norm": 1.1953125, + "learning_rate": 0.0004763066817446612, + "loss": 0.2078, + "step": 101270 + }, + { + "epoch": 4.2, + "grad_norm": 0.828125, + "learning_rate": 0.00047630207309498217, + "loss": 0.1325, + "step": 101280 + }, + { + "epoch": 4.2, + "grad_norm": 0.50390625, + "learning_rate": 0.0004762974640194271, + "loss": 0.2187, + "step": 101290 + }, + { + "epoch": 4.2, + "grad_norm": 0.70703125, + "learning_rate": 0.0004762928545180046, + "loss": 0.2215, + "step": 101300 + }, + { + "epoch": 4.2, + "grad_norm": 0.59765625, + "learning_rate": 0.0004762882445907234, + "loss": 0.2006, + "step": 101310 + }, + { + "epoch": 4.2, + "grad_norm": 1.53125, + "learning_rate": 0.0004762836342375921, + "loss": 0.1845, + "step": 101320 + }, + { + "epoch": 4.2, + "grad_norm": 0.34765625, + "learning_rate": 0.0004762790234586195, + "loss": 0.2021, + "step": 101330 + }, + { + "epoch": 4.2, + "grad_norm": 0.5234375, + "learning_rate": 0.00047627441225381426, + "loss": 0.2015, + "step": 101340 + }, + { + "epoch": 4.2, + "grad_norm": 1.4375, + "learning_rate": 0.0004762698006231849, + "loss": 0.1946, + "step": 101350 + }, + { + "epoch": 4.2, + "grad_norm": 0.74609375, + "learning_rate": 0.00047626518856674026, + "loss": 0.203, + "step": 101360 + }, + { + "epoch": 4.2, + "grad_norm": 0.43359375, + "learning_rate": 0.00047626057608448903, + "loss": 0.1718, + "step": 101370 + }, + { + "epoch": 4.2, + "grad_norm": 0.59375, + "learning_rate": 0.0004762559631764398, + "loss": 0.2212, + "step": 101380 + }, + { + "epoch": 4.2, + "grad_norm": 0.99609375, + "learning_rate": 0.0004762513498426012, + "loss": 0.1894, + "step": 101390 + }, + { + "epoch": 4.2, + "grad_norm": 0.875, + "learning_rate": 0.0004762467360829821, + "loss": 0.1702, + "step": 101400 + }, + { + "epoch": 4.2, + "grad_norm": 1.4453125, + "learning_rate": 0.00047624212189759096, + "loss": 0.2324, + "step": 101410 + }, + { + "epoch": 4.2, + "grad_norm": 0.427734375, + "learning_rate": 0.00047623750728643667, + "loss": 0.1478, + "step": 101420 + }, + { + "epoch": 4.2, + "grad_norm": 0.76953125, + "learning_rate": 0.00047623289224952783, + "loss": 0.2033, + "step": 101430 + }, + { + "epoch": 4.2, + "grad_norm": 0.166015625, + "learning_rate": 0.0004762282767868731, + "loss": 0.2358, + "step": 101440 + }, + { + "epoch": 4.2, + "grad_norm": 0.326171875, + "learning_rate": 0.00047622366089848114, + "loss": 0.2326, + "step": 101450 + }, + { + "epoch": 4.2, + "grad_norm": 0.53515625, + "learning_rate": 0.00047621904458436073, + "loss": 0.2342, + "step": 101460 + }, + { + "epoch": 4.2, + "grad_norm": 0.4375, + "learning_rate": 0.00047621442784452053, + "loss": 0.1813, + "step": 101470 + }, + { + "epoch": 4.2, + "grad_norm": 0.8515625, + "learning_rate": 0.0004762098106789691, + "loss": 0.2146, + "step": 101480 + }, + { + "epoch": 4.2, + "grad_norm": 0.81640625, + "learning_rate": 0.0004762051930877153, + "loss": 0.205, + "step": 101490 + }, + { + "epoch": 4.2, + "grad_norm": 0.55078125, + "learning_rate": 0.0004762005750707677, + "loss": 0.2142, + "step": 101500 + }, + { + "epoch": 4.2, + "grad_norm": 0.796875, + "learning_rate": 0.0004761959566281352, + "loss": 0.1688, + "step": 101510 + }, + { + "epoch": 4.2, + "grad_norm": 1.1640625, + "learning_rate": 0.00047619133775982623, + "loss": 0.1948, + "step": 101520 + }, + { + "epoch": 4.21, + "grad_norm": 1.21875, + "learning_rate": 0.0004761867184658496, + "loss": 0.2292, + "step": 101530 + }, + { + "epoch": 4.21, + "grad_norm": 0.5859375, + "learning_rate": 0.00047618209874621397, + "loss": 0.2271, + "step": 101540 + }, + { + "epoch": 4.21, + "grad_norm": 1.0234375, + "learning_rate": 0.0004761774786009281, + "loss": 0.2179, + "step": 101550 + }, + { + "epoch": 4.21, + "grad_norm": 1.1015625, + "learning_rate": 0.0004761728580300006, + "loss": 0.2363, + "step": 101560 + }, + { + "epoch": 4.21, + "grad_norm": 1.1484375, + "learning_rate": 0.0004761682370334402, + "loss": 0.2191, + "step": 101570 + }, + { + "epoch": 4.21, + "grad_norm": 0.83984375, + "learning_rate": 0.00047616361561125564, + "loss": 0.2696, + "step": 101580 + }, + { + "epoch": 4.21, + "grad_norm": 0.60546875, + "learning_rate": 0.00047615899376345555, + "loss": 0.2032, + "step": 101590 + }, + { + "epoch": 4.21, + "grad_norm": 0.87109375, + "learning_rate": 0.0004761543714900487, + "loss": 0.2165, + "step": 101600 + }, + { + "epoch": 4.21, + "grad_norm": 0.734375, + "learning_rate": 0.0004761497487910437, + "loss": 0.2025, + "step": 101610 + }, + { + "epoch": 4.21, + "grad_norm": 1.75, + "learning_rate": 0.0004761451256664493, + "loss": 0.2254, + "step": 101620 + }, + { + "epoch": 4.21, + "grad_norm": 0.6015625, + "learning_rate": 0.0004761405021162742, + "loss": 0.249, + "step": 101630 + }, + { + "epoch": 4.21, + "grad_norm": 0.45703125, + "learning_rate": 0.00047613587814052707, + "loss": 0.2421, + "step": 101640 + }, + { + "epoch": 4.21, + "grad_norm": 0.306640625, + "learning_rate": 0.00047613125373921673, + "loss": 0.1916, + "step": 101650 + }, + { + "epoch": 4.21, + "grad_norm": 0.76953125, + "learning_rate": 0.0004761266289123517, + "loss": 0.2566, + "step": 101660 + }, + { + "epoch": 4.21, + "grad_norm": 1.0703125, + "learning_rate": 0.0004761220036599408, + "loss": 0.1973, + "step": 101670 + }, + { + "epoch": 4.21, + "grad_norm": 0.765625, + "learning_rate": 0.00047611737798199273, + "loss": 0.2222, + "step": 101680 + }, + { + "epoch": 4.21, + "grad_norm": 0.7890625, + "learning_rate": 0.0004761127518785161, + "loss": 0.2419, + "step": 101690 + }, + { + "epoch": 4.21, + "grad_norm": 0.640625, + "learning_rate": 0.00047610812534951976, + "loss": 0.2222, + "step": 101700 + }, + { + "epoch": 4.21, + "grad_norm": 0.90625, + "learning_rate": 0.00047610349839501235, + "loss": 0.1908, + "step": 101710 + }, + { + "epoch": 4.21, + "grad_norm": 0.63671875, + "learning_rate": 0.0004760988710150025, + "loss": 0.2418, + "step": 101720 + }, + { + "epoch": 4.21, + "grad_norm": 0.69921875, + "learning_rate": 0.0004760942432094991, + "loss": 0.1588, + "step": 101730 + }, + { + "epoch": 4.21, + "grad_norm": 0.486328125, + "learning_rate": 0.0004760896149785107, + "loss": 0.1875, + "step": 101740 + }, + { + "epoch": 4.21, + "grad_norm": 0.94921875, + "learning_rate": 0.000476084986322046, + "loss": 0.1959, + "step": 101750 + }, + { + "epoch": 4.21, + "grad_norm": 0.490234375, + "learning_rate": 0.00047608035724011383, + "loss": 0.2313, + "step": 101760 + }, + { + "epoch": 4.22, + "grad_norm": 0.90234375, + "learning_rate": 0.00047607572773272284, + "loss": 0.1891, + "step": 101770 + }, + { + "epoch": 4.22, + "grad_norm": 1.015625, + "learning_rate": 0.0004760710977998818, + "loss": 0.2037, + "step": 101780 + }, + { + "epoch": 4.22, + "grad_norm": 0.4296875, + "learning_rate": 0.0004760664674415992, + "loss": 0.2503, + "step": 101790 + }, + { + "epoch": 4.22, + "grad_norm": 0.53515625, + "learning_rate": 0.00047606183665788404, + "loss": 0.217, + "step": 101800 + }, + { + "epoch": 4.22, + "grad_norm": 0.66015625, + "learning_rate": 0.000476057205448745, + "loss": 0.1631, + "step": 101810 + }, + { + "epoch": 4.22, + "grad_norm": 0.60546875, + "learning_rate": 0.00047605257381419053, + "loss": 0.2243, + "step": 101820 + }, + { + "epoch": 4.22, + "grad_norm": 0.93359375, + "learning_rate": 0.0004760479417542296, + "loss": 0.2798, + "step": 101830 + }, + { + "epoch": 4.22, + "grad_norm": 0.91015625, + "learning_rate": 0.0004760433092688709, + "loss": 0.2152, + "step": 101840 + }, + { + "epoch": 4.22, + "grad_norm": 0.90625, + "learning_rate": 0.00047603867635812307, + "loss": 0.1739, + "step": 101850 + }, + { + "epoch": 4.22, + "grad_norm": 1.0859375, + "learning_rate": 0.0004760340430219949, + "loss": 0.2143, + "step": 101860 + }, + { + "epoch": 4.22, + "grad_norm": 0.337890625, + "learning_rate": 0.000476029409260495, + "loss": 0.1476, + "step": 101870 + }, + { + "epoch": 4.22, + "grad_norm": 0.90234375, + "learning_rate": 0.0004760247750736322, + "loss": 0.2044, + "step": 101880 + }, + { + "epoch": 4.22, + "grad_norm": 0.267578125, + "learning_rate": 0.0004760201404614152, + "loss": 0.2041, + "step": 101890 + }, + { + "epoch": 4.22, + "grad_norm": 0.75390625, + "learning_rate": 0.00047601550542385266, + "loss": 0.1983, + "step": 101900 + }, + { + "epoch": 4.22, + "grad_norm": 0.97265625, + "learning_rate": 0.00047601086996095334, + "loss": 0.2209, + "step": 101910 + }, + { + "epoch": 4.22, + "grad_norm": 0.6484375, + "learning_rate": 0.00047600623407272606, + "loss": 0.1989, + "step": 101920 + }, + { + "epoch": 4.22, + "grad_norm": 0.578125, + "learning_rate": 0.0004760015977591793, + "loss": 0.1865, + "step": 101930 + }, + { + "epoch": 4.22, + "grad_norm": 0.2255859375, + "learning_rate": 0.0004759969610203221, + "loss": 0.2455, + "step": 101940 + }, + { + "epoch": 4.22, + "grad_norm": 0.8984375, + "learning_rate": 0.0004759923238561629, + "loss": 0.2188, + "step": 101950 + }, + { + "epoch": 4.22, + "grad_norm": 1.1484375, + "learning_rate": 0.00047598768626671065, + "loss": 0.2501, + "step": 101960 + }, + { + "epoch": 4.22, + "grad_norm": 0.68359375, + "learning_rate": 0.0004759830482519739, + "loss": 0.2726, + "step": 101970 + }, + { + "epoch": 4.22, + "grad_norm": 0.5703125, + "learning_rate": 0.0004759784098119615, + "loss": 0.1714, + "step": 101980 + }, + { + "epoch": 4.22, + "grad_norm": 0.2119140625, + "learning_rate": 0.0004759737709466822, + "loss": 0.2031, + "step": 101990 + }, + { + "epoch": 4.22, + "grad_norm": 0.3125, + "learning_rate": 0.0004759691316561445, + "loss": 0.1565, + "step": 102000 + }, + { + "epoch": 4.23, + "grad_norm": 0.6171875, + "learning_rate": 0.0004759644919403574, + "loss": 0.1759, + "step": 102010 + }, + { + "epoch": 4.23, + "grad_norm": 0.50390625, + "learning_rate": 0.00047595985179932956, + "loss": 0.178, + "step": 102020 + }, + { + "epoch": 4.23, + "grad_norm": 0.6484375, + "learning_rate": 0.00047595521123306963, + "loss": 0.2102, + "step": 102030 + }, + { + "epoch": 4.23, + "grad_norm": 0.8359375, + "learning_rate": 0.00047595057024158637, + "loss": 0.2199, + "step": 102040 + }, + { + "epoch": 4.23, + "grad_norm": 0.7734375, + "learning_rate": 0.00047594592882488855, + "loss": 0.2437, + "step": 102050 + }, + { + "epoch": 4.23, + "grad_norm": 0.34765625, + "learning_rate": 0.0004759412869829849, + "loss": 0.2094, + "step": 102060 + }, + { + "epoch": 4.23, + "grad_norm": 0.443359375, + "learning_rate": 0.00047593664471588424, + "loss": 0.2344, + "step": 102070 + }, + { + "epoch": 4.23, + "grad_norm": 0.9453125, + "learning_rate": 0.0004759320020235951, + "loss": 0.2154, + "step": 102080 + }, + { + "epoch": 4.23, + "grad_norm": 0.82421875, + "learning_rate": 0.00047592735890612635, + "loss": 0.1842, + "step": 102090 + }, + { + "epoch": 4.23, + "grad_norm": 0.92578125, + "learning_rate": 0.00047592271536348675, + "loss": 0.1807, + "step": 102100 + }, + { + "epoch": 4.23, + "grad_norm": 0.53125, + "learning_rate": 0.00047591807139568497, + "loss": 0.1932, + "step": 102110 + }, + { + "epoch": 4.23, + "grad_norm": 0.55078125, + "learning_rate": 0.0004759134270027298, + "loss": 0.2624, + "step": 102120 + }, + { + "epoch": 4.23, + "grad_norm": 0.51171875, + "learning_rate": 0.0004759087821846299, + "loss": 0.2219, + "step": 102130 + }, + { + "epoch": 4.23, + "grad_norm": 0.51953125, + "learning_rate": 0.0004759041369413941, + "loss": 0.2449, + "step": 102140 + }, + { + "epoch": 4.23, + "grad_norm": 0.5234375, + "learning_rate": 0.0004758994912730311, + "loss": 0.182, + "step": 102150 + }, + { + "epoch": 4.23, + "grad_norm": 0.546875, + "learning_rate": 0.0004758948451795497, + "loss": 0.2197, + "step": 102160 + }, + { + "epoch": 4.23, + "grad_norm": 0.4140625, + "learning_rate": 0.00047589019866095853, + "loss": 0.1975, + "step": 102170 + }, + { + "epoch": 4.23, + "grad_norm": 0.486328125, + "learning_rate": 0.00047588555171726644, + "loss": 0.2442, + "step": 102180 + }, + { + "epoch": 4.23, + "grad_norm": 1.3203125, + "learning_rate": 0.0004758809043484821, + "loss": 0.2844, + "step": 102190 + }, + { + "epoch": 4.23, + "grad_norm": 0.66015625, + "learning_rate": 0.0004758762565546143, + "loss": 0.2137, + "step": 102200 + }, + { + "epoch": 4.23, + "grad_norm": 0.322265625, + "learning_rate": 0.00047587160833567177, + "loss": 0.2235, + "step": 102210 + }, + { + "epoch": 4.23, + "grad_norm": 0.51171875, + "learning_rate": 0.0004758669596916634, + "loss": 0.2376, + "step": 102220 + }, + { + "epoch": 4.23, + "grad_norm": 0.486328125, + "learning_rate": 0.00047586231062259765, + "loss": 0.2003, + "step": 102230 + }, + { + "epoch": 4.23, + "grad_norm": 0.404296875, + "learning_rate": 0.00047585766112848344, + "loss": 0.1965, + "step": 102240 + }, + { + "epoch": 4.24, + "grad_norm": 0.8828125, + "learning_rate": 0.00047585301120932957, + "loss": 0.2144, + "step": 102250 + }, + { + "epoch": 4.24, + "grad_norm": 0.9453125, + "learning_rate": 0.0004758483608651446, + "loss": 0.2123, + "step": 102260 + }, + { + "epoch": 4.24, + "grad_norm": 0.2353515625, + "learning_rate": 0.0004758437100959375, + "loss": 0.2106, + "step": 102270 + }, + { + "epoch": 4.24, + "grad_norm": 1.34375, + "learning_rate": 0.000475839058901717, + "loss": 0.2223, + "step": 102280 + }, + { + "epoch": 4.24, + "grad_norm": 0.54296875, + "learning_rate": 0.0004758344072824916, + "loss": 0.2163, + "step": 102290 + }, + { + "epoch": 4.24, + "grad_norm": 1.265625, + "learning_rate": 0.00047582975523827035, + "loss": 0.1628, + "step": 102300 + }, + { + "epoch": 4.24, + "grad_norm": 1.2109375, + "learning_rate": 0.0004758251027690619, + "loss": 0.2257, + "step": 102310 + }, + { + "epoch": 4.24, + "grad_norm": 0.59375, + "learning_rate": 0.00047582044987487494, + "loss": 0.2391, + "step": 102320 + }, + { + "epoch": 4.24, + "grad_norm": 0.484375, + "learning_rate": 0.0004758157965557183, + "loss": 0.1954, + "step": 102330 + }, + { + "epoch": 4.24, + "grad_norm": 0.443359375, + "learning_rate": 0.00047581114281160063, + "loss": 0.1575, + "step": 102340 + }, + { + "epoch": 4.24, + "grad_norm": 0.94140625, + "learning_rate": 0.0004758064886425309, + "loss": 0.1835, + "step": 102350 + }, + { + "epoch": 4.24, + "grad_norm": 0.87890625, + "learning_rate": 0.00047580183404851773, + "loss": 0.2063, + "step": 102360 + }, + { + "epoch": 4.24, + "grad_norm": 0.59375, + "learning_rate": 0.0004757971790295699, + "loss": 0.2265, + "step": 102370 + }, + { + "epoch": 4.24, + "grad_norm": 1.3046875, + "learning_rate": 0.000475792523585696, + "loss": 0.1896, + "step": 102380 + }, + { + "epoch": 4.24, + "grad_norm": 0.345703125, + "learning_rate": 0.0004757878677169052, + "loss": 0.1877, + "step": 102390 + }, + { + "epoch": 4.24, + "grad_norm": 0.70703125, + "learning_rate": 0.00047578321142320584, + "loss": 0.1884, + "step": 102400 + }, + { + "epoch": 4.24, + "grad_norm": 0.28125, + "learning_rate": 0.0004757785547046069, + "loss": 0.1709, + "step": 102410 + }, + { + "epoch": 4.24, + "grad_norm": 1.0625, + "learning_rate": 0.0004757738975611171, + "loss": 0.1752, + "step": 102420 + }, + { + "epoch": 4.24, + "grad_norm": 0.5703125, + "learning_rate": 0.00047576923999274526, + "loss": 0.2032, + "step": 102430 + }, + { + "epoch": 4.24, + "grad_norm": 0.69140625, + "learning_rate": 0.0004757645819995, + "loss": 0.212, + "step": 102440 + }, + { + "epoch": 4.24, + "grad_norm": 0.890625, + "learning_rate": 0.0004757599235813903, + "loss": 0.2625, + "step": 102450 + }, + { + "epoch": 4.24, + "grad_norm": 1.03125, + "learning_rate": 0.00047575526473842466, + "loss": 0.236, + "step": 102460 + }, + { + "epoch": 4.24, + "grad_norm": 0.609375, + "learning_rate": 0.0004757506054706121, + "loss": 0.1896, + "step": 102470 + }, + { + "epoch": 4.24, + "grad_norm": 0.419921875, + "learning_rate": 0.0004757459457779613, + "loss": 0.2485, + "step": 102480 + }, + { + "epoch": 4.25, + "grad_norm": 0.4921875, + "learning_rate": 0.0004757412856604809, + "loss": 0.1597, + "step": 102490 + }, + { + "epoch": 4.25, + "grad_norm": 0.30859375, + "learning_rate": 0.0004757366251181798, + "loss": 0.188, + "step": 102500 + }, + { + "epoch": 4.25, + "grad_norm": 0.51953125, + "learning_rate": 0.00047573196415106684, + "loss": 0.1422, + "step": 102510 + }, + { + "epoch": 4.25, + "grad_norm": 0.7734375, + "learning_rate": 0.00047572730275915066, + "loss": 0.2401, + "step": 102520 + }, + { + "epoch": 4.25, + "grad_norm": 0.66015625, + "learning_rate": 0.00047572264094244, + "loss": 0.1946, + "step": 102530 + }, + { + "epoch": 4.25, + "grad_norm": 0.7578125, + "learning_rate": 0.0004757179787009438, + "loss": 0.159, + "step": 102540 + }, + { + "epoch": 4.25, + "grad_norm": 1.1953125, + "learning_rate": 0.0004757133160346707, + "loss": 0.2301, + "step": 102550 + }, + { + "epoch": 4.25, + "grad_norm": 0.61328125, + "learning_rate": 0.00047570865294362954, + "loss": 0.2527, + "step": 102560 + }, + { + "epoch": 4.25, + "grad_norm": 0.7109375, + "learning_rate": 0.000475703989427829, + "loss": 0.2863, + "step": 102570 + }, + { + "epoch": 4.25, + "grad_norm": 0.5625, + "learning_rate": 0.000475699325487278, + "loss": 0.2497, + "step": 102580 + }, + { + "epoch": 4.25, + "grad_norm": 0.380859375, + "learning_rate": 0.00047569466112198515, + "loss": 0.2275, + "step": 102590 + }, + { + "epoch": 4.25, + "grad_norm": 1.2265625, + "learning_rate": 0.00047568999633195943, + "loss": 0.2271, + "step": 102600 + }, + { + "epoch": 4.25, + "grad_norm": 0.9609375, + "learning_rate": 0.00047568533111720944, + "loss": 0.254, + "step": 102610 + }, + { + "epoch": 4.25, + "grad_norm": 0.275390625, + "learning_rate": 0.00047568066547774406, + "loss": 0.2085, + "step": 102620 + }, + { + "epoch": 4.25, + "grad_norm": 0.74609375, + "learning_rate": 0.00047567599941357203, + "loss": 0.2276, + "step": 102630 + }, + { + "epoch": 4.25, + "grad_norm": 0.94140625, + "learning_rate": 0.0004756713329247021, + "loss": 0.2037, + "step": 102640 + }, + { + "epoch": 4.25, + "grad_norm": 0.7265625, + "learning_rate": 0.0004756666660111432, + "loss": 0.2044, + "step": 102650 + }, + { + "epoch": 4.25, + "grad_norm": 1.1484375, + "learning_rate": 0.0004756619986729039, + "loss": 0.2574, + "step": 102660 + }, + { + "epoch": 4.25, + "grad_norm": 0.75, + "learning_rate": 0.00047565733090999314, + "loss": 0.1947, + "step": 102670 + }, + { + "epoch": 4.25, + "grad_norm": 2.296875, + "learning_rate": 0.00047565266272241966, + "loss": 0.2164, + "step": 102680 + }, + { + "epoch": 4.25, + "grad_norm": 0.9453125, + "learning_rate": 0.0004756479941101922, + "loss": 0.2502, + "step": 102690 + }, + { + "epoch": 4.25, + "grad_norm": 0.55078125, + "learning_rate": 0.0004756433250733196, + "loss": 0.2165, + "step": 102700 + }, + { + "epoch": 4.25, + "grad_norm": 0.984375, + "learning_rate": 0.0004756386556118106, + "loss": 0.1829, + "step": 102710 + }, + { + "epoch": 4.25, + "grad_norm": 0.84765625, + "learning_rate": 0.0004756339857256741, + "loss": 0.2738, + "step": 102720 + }, + { + "epoch": 4.26, + "grad_norm": 0.6015625, + "learning_rate": 0.0004756293154149187, + "loss": 0.1844, + "step": 102730 + }, + { + "epoch": 4.26, + "grad_norm": 0.83203125, + "learning_rate": 0.00047562464467955335, + "loss": 0.2229, + "step": 102740 + }, + { + "epoch": 4.26, + "grad_norm": 1.09375, + "learning_rate": 0.00047561997351958676, + "loss": 0.1745, + "step": 102750 + }, + { + "epoch": 4.26, + "grad_norm": 0.671875, + "learning_rate": 0.0004756153019350278, + "loss": 0.2415, + "step": 102760 + }, + { + "epoch": 4.26, + "grad_norm": 0.66796875, + "learning_rate": 0.0004756106299258851, + "loss": 0.2595, + "step": 102770 + }, + { + "epoch": 4.26, + "grad_norm": 0.126953125, + "learning_rate": 0.0004756059574921677, + "loss": 0.1906, + "step": 102780 + }, + { + "epoch": 4.26, + "grad_norm": 0.56640625, + "learning_rate": 0.00047560128463388415, + "loss": 0.191, + "step": 102790 + }, + { + "epoch": 4.26, + "grad_norm": 0.5625, + "learning_rate": 0.00047559661135104336, + "loss": 0.2405, + "step": 102800 + }, + { + "epoch": 4.26, + "grad_norm": 0.8359375, + "learning_rate": 0.00047559193764365416, + "loss": 0.272, + "step": 102810 + }, + { + "epoch": 4.26, + "grad_norm": 0.953125, + "learning_rate": 0.0004755872635117252, + "loss": 0.2145, + "step": 102820 + }, + { + "epoch": 4.26, + "grad_norm": 0.28125, + "learning_rate": 0.00047558258895526547, + "loss": 0.2583, + "step": 102830 + }, + { + "epoch": 4.26, + "grad_norm": 0.3359375, + "learning_rate": 0.0004755779139742836, + "loss": 0.1824, + "step": 102840 + }, + { + "epoch": 4.26, + "grad_norm": 0.96875, + "learning_rate": 0.0004755732385687885, + "loss": 0.199, + "step": 102850 + }, + { + "epoch": 4.26, + "grad_norm": 0.349609375, + "learning_rate": 0.0004755685627387889, + "loss": 0.1598, + "step": 102860 + }, + { + "epoch": 4.26, + "grad_norm": 0.7265625, + "learning_rate": 0.0004755638864842936, + "loss": 0.2152, + "step": 102870 + }, + { + "epoch": 4.26, + "grad_norm": 0.490234375, + "learning_rate": 0.0004755592098053115, + "loss": 0.2295, + "step": 102880 + }, + { + "epoch": 4.26, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004755545327018512, + "loss": 0.2453, + "step": 102890 + }, + { + "epoch": 4.26, + "grad_norm": 0.6640625, + "learning_rate": 0.00047554985517392177, + "loss": 0.1519, + "step": 102900 + }, + { + "epoch": 4.26, + "grad_norm": 0.45703125, + "learning_rate": 0.0004755451772215318, + "loss": 0.1679, + "step": 102910 + }, + { + "epoch": 4.26, + "grad_norm": 1.65625, + "learning_rate": 0.0004755404988446902, + "loss": 0.2194, + "step": 102920 + }, + { + "epoch": 4.26, + "grad_norm": 0.546875, + "learning_rate": 0.00047553582004340565, + "loss": 0.2258, + "step": 102930 + }, + { + "epoch": 4.26, + "grad_norm": 0.6640625, + "learning_rate": 0.00047553114081768717, + "loss": 0.176, + "step": 102940 + }, + { + "epoch": 4.26, + "grad_norm": 1.0078125, + "learning_rate": 0.0004755264611675434, + "loss": 0.2252, + "step": 102950 + }, + { + "epoch": 4.26, + "grad_norm": 0.765625, + "learning_rate": 0.0004755217810929831, + "loss": 0.2784, + "step": 102960 + }, + { + "epoch": 4.27, + "grad_norm": 0.7734375, + "learning_rate": 0.0004755171005940152, + "loss": 0.2309, + "step": 102970 + }, + { + "epoch": 4.27, + "grad_norm": 1.8203125, + "learning_rate": 0.00047551241967064853, + "loss": 0.2029, + "step": 102980 + }, + { + "epoch": 4.27, + "grad_norm": 0.625, + "learning_rate": 0.00047550773832289177, + "loss": 0.2865, + "step": 102990 + }, + { + "epoch": 4.27, + "grad_norm": 0.98828125, + "learning_rate": 0.0004755030565507539, + "loss": 0.2701, + "step": 103000 + }, + { + "epoch": 4.27, + "grad_norm": 0.94140625, + "learning_rate": 0.0004754983743542435, + "loss": 0.1983, + "step": 103010 + }, + { + "epoch": 4.27, + "grad_norm": 0.48828125, + "learning_rate": 0.00047549369173336954, + "loss": 0.2205, + "step": 103020 + }, + { + "epoch": 4.27, + "grad_norm": 0.4921875, + "learning_rate": 0.0004754890086881408, + "loss": 0.2251, + "step": 103030 + }, + { + "epoch": 4.27, + "grad_norm": 0.51953125, + "learning_rate": 0.0004754843252185662, + "loss": 0.2316, + "step": 103040 + }, + { + "epoch": 4.27, + "grad_norm": 0.361328125, + "learning_rate": 0.00047547964132465436, + "loss": 0.2308, + "step": 103050 + }, + { + "epoch": 4.27, + "grad_norm": 1.1015625, + "learning_rate": 0.0004754749570064142, + "loss": 0.218, + "step": 103060 + }, + { + "epoch": 4.27, + "grad_norm": 0.796875, + "learning_rate": 0.00047547027226385455, + "loss": 0.2619, + "step": 103070 + }, + { + "epoch": 4.27, + "grad_norm": 1.1796875, + "learning_rate": 0.00047546558709698415, + "loss": 0.2122, + "step": 103080 + }, + { + "epoch": 4.27, + "grad_norm": 1.140625, + "learning_rate": 0.0004754609015058119, + "loss": 0.2113, + "step": 103090 + }, + { + "epoch": 4.27, + "grad_norm": 0.412109375, + "learning_rate": 0.00047545621549034656, + "loss": 0.2347, + "step": 103100 + }, + { + "epoch": 4.27, + "grad_norm": 1.21875, + "learning_rate": 0.00047545152905059695, + "loss": 0.2078, + "step": 103110 + }, + { + "epoch": 4.27, + "grad_norm": 1.5, + "learning_rate": 0.0004754468421865719, + "loss": 0.2262, + "step": 103120 + }, + { + "epoch": 4.27, + "grad_norm": 0.9453125, + "learning_rate": 0.0004754421548982803, + "loss": 0.228, + "step": 103130 + }, + { + "epoch": 4.27, + "grad_norm": 0.69921875, + "learning_rate": 0.0004754374671857309, + "loss": 0.1812, + "step": 103140 + }, + { + "epoch": 4.27, + "grad_norm": 1.171875, + "learning_rate": 0.00047543277904893243, + "loss": 0.1583, + "step": 103150 + }, + { + "epoch": 4.27, + "grad_norm": 0.1591796875, + "learning_rate": 0.00047542809048789394, + "loss": 0.1877, + "step": 103160 + }, + { + "epoch": 4.27, + "grad_norm": 0.41796875, + "learning_rate": 0.0004754234015026241, + "loss": 0.185, + "step": 103170 + }, + { + "epoch": 4.27, + "grad_norm": 0.66796875, + "learning_rate": 0.00047541871209313177, + "loss": 0.1998, + "step": 103180 + }, + { + "epoch": 4.27, + "grad_norm": 0.51171875, + "learning_rate": 0.0004754140222594257, + "loss": 0.1987, + "step": 103190 + }, + { + "epoch": 4.27, + "grad_norm": 0.88671875, + "learning_rate": 0.0004754093320015148, + "loss": 0.165, + "step": 103200 + }, + { + "epoch": 4.27, + "grad_norm": 0.6484375, + "learning_rate": 0.00047540464131940797, + "loss": 0.2128, + "step": 103210 + }, + { + "epoch": 4.28, + "grad_norm": 0.431640625, + "learning_rate": 0.00047539995021311386, + "loss": 0.2325, + "step": 103220 + }, + { + "epoch": 4.28, + "grad_norm": 0.63671875, + "learning_rate": 0.0004753952586826414, + "loss": 0.2394, + "step": 103230 + }, + { + "epoch": 4.28, + "grad_norm": 1.109375, + "learning_rate": 0.0004753905667279994, + "loss": 0.2569, + "step": 103240 + }, + { + "epoch": 4.28, + "grad_norm": 0.671875, + "learning_rate": 0.0004753858743491967, + "loss": 0.1863, + "step": 103250 + }, + { + "epoch": 4.28, + "grad_norm": 0.59375, + "learning_rate": 0.00047538118154624216, + "loss": 0.2399, + "step": 103260 + }, + { + "epoch": 4.28, + "grad_norm": 0.375, + "learning_rate": 0.0004753764883191445, + "loss": 0.1791, + "step": 103270 + }, + { + "epoch": 4.28, + "grad_norm": 0.255859375, + "learning_rate": 0.0004753717946679127, + "loss": 0.2132, + "step": 103280 + }, + { + "epoch": 4.28, + "grad_norm": 0.62109375, + "learning_rate": 0.0004753671005925555, + "loss": 0.1949, + "step": 103290 + }, + { + "epoch": 4.28, + "grad_norm": 0.23046875, + "learning_rate": 0.00047536240609308175, + "loss": 0.2222, + "step": 103300 + }, + { + "epoch": 4.28, + "grad_norm": 0.337890625, + "learning_rate": 0.0004753577111695003, + "loss": 0.1819, + "step": 103310 + }, + { + "epoch": 4.28, + "grad_norm": 0.66015625, + "learning_rate": 0.00047535301582181996, + "loss": 0.1941, + "step": 103320 + }, + { + "epoch": 4.28, + "grad_norm": 1.0390625, + "learning_rate": 0.00047534832005004956, + "loss": 0.215, + "step": 103330 + }, + { + "epoch": 4.28, + "grad_norm": 1.578125, + "learning_rate": 0.00047534362385419793, + "loss": 0.2658, + "step": 103340 + }, + { + "epoch": 4.28, + "grad_norm": 0.76171875, + "learning_rate": 0.000475338927234274, + "loss": 0.2372, + "step": 103350 + }, + { + "epoch": 4.28, + "grad_norm": 0.67578125, + "learning_rate": 0.00047533423019028654, + "loss": 0.251, + "step": 103360 + }, + { + "epoch": 4.28, + "grad_norm": 0.875, + "learning_rate": 0.0004753295327222444, + "loss": 0.2191, + "step": 103370 + }, + { + "epoch": 4.28, + "grad_norm": 0.58984375, + "learning_rate": 0.0004753248348301564, + "loss": 0.1464, + "step": 103380 + }, + { + "epoch": 4.28, + "grad_norm": 0.29296875, + "learning_rate": 0.0004753201365140314, + "loss": 0.1775, + "step": 103390 + }, + { + "epoch": 4.28, + "grad_norm": 0.80078125, + "learning_rate": 0.0004753154377738782, + "loss": 0.2016, + "step": 103400 + }, + { + "epoch": 4.28, + "grad_norm": 0.53515625, + "learning_rate": 0.0004753107386097057, + "loss": 0.2666, + "step": 103410 + }, + { + "epoch": 4.28, + "grad_norm": 1.1328125, + "learning_rate": 0.0004753060390215227, + "loss": 0.2088, + "step": 103420 + }, + { + "epoch": 4.28, + "grad_norm": 0.349609375, + "learning_rate": 0.0004753013390093381, + "loss": 0.2652, + "step": 103430 + }, + { + "epoch": 4.28, + "grad_norm": 0.73046875, + "learning_rate": 0.0004752966385731607, + "loss": 0.2114, + "step": 103440 + }, + { + "epoch": 4.28, + "grad_norm": 0.06298828125, + "learning_rate": 0.00047529193771299934, + "loss": 0.2092, + "step": 103450 + }, + { + "epoch": 4.29, + "grad_norm": 0.90234375, + "learning_rate": 0.00047528723642886283, + "loss": 0.2883, + "step": 103460 + }, + { + "epoch": 4.29, + "grad_norm": 0.671875, + "learning_rate": 0.00047528253472076023, + "loss": 0.1615, + "step": 103470 + }, + { + "epoch": 4.29, + "grad_norm": 0.232421875, + "learning_rate": 0.00047527783258870005, + "loss": 0.197, + "step": 103480 + }, + { + "epoch": 4.29, + "grad_norm": 0.5390625, + "learning_rate": 0.00047527313003269144, + "loss": 0.2157, + "step": 103490 + }, + { + "epoch": 4.29, + "grad_norm": 0.8125, + "learning_rate": 0.0004752684270527431, + "loss": 0.2789, + "step": 103500 + }, + { + "epoch": 4.29, + "grad_norm": 0.59765625, + "learning_rate": 0.0004752637236488639, + "loss": 0.1741, + "step": 103510 + }, + { + "epoch": 4.29, + "grad_norm": 0.625, + "learning_rate": 0.00047525901982106266, + "loss": 0.1888, + "step": 103520 + }, + { + "epoch": 4.29, + "grad_norm": 0.953125, + "learning_rate": 0.0004752543155693483, + "loss": 0.2184, + "step": 103530 + }, + { + "epoch": 4.29, + "grad_norm": 0.55078125, + "learning_rate": 0.00047524961089372966, + "loss": 0.1973, + "step": 103540 + }, + { + "epoch": 4.29, + "grad_norm": 1.21875, + "learning_rate": 0.00047524490579421556, + "loss": 0.2688, + "step": 103550 + }, + { + "epoch": 4.29, + "grad_norm": 0.62109375, + "learning_rate": 0.0004752402002708149, + "loss": 0.2479, + "step": 103560 + }, + { + "epoch": 4.29, + "grad_norm": 0.37890625, + "learning_rate": 0.00047523549432353644, + "loss": 0.2482, + "step": 103570 + }, + { + "epoch": 4.29, + "grad_norm": 0.470703125, + "learning_rate": 0.00047523078795238914, + "loss": 0.2192, + "step": 103580 + }, + { + "epoch": 4.29, + "grad_norm": 0.60546875, + "learning_rate": 0.00047522608115738185, + "loss": 0.2379, + "step": 103590 + }, + { + "epoch": 4.29, + "grad_norm": 0.63671875, + "learning_rate": 0.0004752213739385234, + "loss": 0.2558, + "step": 103600 + }, + { + "epoch": 4.29, + "grad_norm": 0.6328125, + "learning_rate": 0.0004752166662958226, + "loss": 0.2558, + "step": 103610 + }, + { + "epoch": 4.29, + "grad_norm": 0.85546875, + "learning_rate": 0.00047521195822928836, + "loss": 0.1942, + "step": 103620 + }, + { + "epoch": 4.29, + "grad_norm": 1.1015625, + "learning_rate": 0.0004752072497389295, + "loss": 0.241, + "step": 103630 + }, + { + "epoch": 4.29, + "grad_norm": 0.9765625, + "learning_rate": 0.000475202540824755, + "loss": 0.2407, + "step": 103640 + }, + { + "epoch": 4.29, + "grad_norm": 1.15625, + "learning_rate": 0.00047519783148677365, + "loss": 0.2023, + "step": 103650 + }, + { + "epoch": 4.29, + "grad_norm": 1.5234375, + "learning_rate": 0.0004751931217249942, + "loss": 0.1778, + "step": 103660 + }, + { + "epoch": 4.29, + "grad_norm": 0.70703125, + "learning_rate": 0.0004751884115394257, + "loss": 0.1971, + "step": 103670 + }, + { + "epoch": 4.29, + "grad_norm": 0.671875, + "learning_rate": 0.0004751837009300769, + "loss": 0.2495, + "step": 103680 + }, + { + "epoch": 4.29, + "grad_norm": 0.65625, + "learning_rate": 0.00047517898989695665, + "loss": 0.2115, + "step": 103690 + }, + { + "epoch": 4.3, + "grad_norm": 0.75, + "learning_rate": 0.00047517427844007387, + "loss": 0.1718, + "step": 103700 + }, + { + "epoch": 4.3, + "grad_norm": 0.70703125, + "learning_rate": 0.00047516956655943744, + "loss": 0.2307, + "step": 103710 + }, + { + "epoch": 4.3, + "grad_norm": 0.359375, + "learning_rate": 0.0004751648542550562, + "loss": 0.1788, + "step": 103720 + }, + { + "epoch": 4.3, + "grad_norm": 1.21875, + "learning_rate": 0.0004751601415269391, + "loss": 0.2526, + "step": 103730 + }, + { + "epoch": 4.3, + "grad_norm": 0.70703125, + "learning_rate": 0.00047515542837509484, + "loss": 0.245, + "step": 103740 + }, + { + "epoch": 4.3, + "grad_norm": 0.5390625, + "learning_rate": 0.00047515071479953237, + "loss": 0.2438, + "step": 103750 + }, + { + "epoch": 4.3, + "grad_norm": 0.76171875, + "learning_rate": 0.0004751460008002606, + "loss": 0.184, + "step": 103760 + }, + { + "epoch": 4.3, + "grad_norm": 0.453125, + "learning_rate": 0.00047514128637728836, + "loss": 0.229, + "step": 103770 + }, + { + "epoch": 4.3, + "grad_norm": 0.44921875, + "learning_rate": 0.0004751365715306245, + "loss": 0.2405, + "step": 103780 + }, + { + "epoch": 4.3, + "grad_norm": 0.455078125, + "learning_rate": 0.000475131856260278, + "loss": 0.1876, + "step": 103790 + }, + { + "epoch": 4.3, + "grad_norm": 0.625, + "learning_rate": 0.00047512714056625763, + "loss": 0.1821, + "step": 103800 + }, + { + "epoch": 4.3, + "grad_norm": 0.828125, + "learning_rate": 0.00047512242444857223, + "loss": 0.1488, + "step": 103810 + }, + { + "epoch": 4.3, + "grad_norm": 0.640625, + "learning_rate": 0.00047511770790723077, + "loss": 0.2516, + "step": 103820 + }, + { + "epoch": 4.3, + "grad_norm": 0.67578125, + "learning_rate": 0.00047511299094224214, + "loss": 0.1969, + "step": 103830 + }, + { + "epoch": 4.3, + "grad_norm": 0.5, + "learning_rate": 0.00047510827355361505, + "loss": 0.2368, + "step": 103840 + }, + { + "epoch": 4.3, + "grad_norm": 0.62890625, + "learning_rate": 0.0004751035557413587, + "loss": 0.2319, + "step": 103850 + }, + { + "epoch": 4.3, + "grad_norm": 1.046875, + "learning_rate": 0.0004750988375054816, + "loss": 0.2076, + "step": 103860 + }, + { + "epoch": 4.3, + "grad_norm": 2.75, + "learning_rate": 0.00047509411884599285, + "loss": 0.2287, + "step": 103870 + }, + { + "epoch": 4.3, + "grad_norm": 0.330078125, + "learning_rate": 0.0004750893997629013, + "loss": 0.2088, + "step": 103880 + }, + { + "epoch": 4.3, + "grad_norm": 0.0, + "learning_rate": 0.00047508468025621574, + "loss": 0.1891, + "step": 103890 + }, + { + "epoch": 4.3, + "grad_norm": 0.7421875, + "learning_rate": 0.00047507996032594517, + "loss": 0.209, + "step": 103900 + }, + { + "epoch": 4.3, + "grad_norm": 1.0546875, + "learning_rate": 0.00047507523997209844, + "loss": 0.2057, + "step": 103910 + }, + { + "epoch": 4.3, + "grad_norm": 0.640625, + "learning_rate": 0.00047507051919468444, + "loss": 0.2498, + "step": 103920 + }, + { + "epoch": 4.3, + "grad_norm": 0.44921875, + "learning_rate": 0.00047506579799371195, + "loss": 0.2156, + "step": 103930 + }, + { + "epoch": 4.31, + "grad_norm": 1.0546875, + "learning_rate": 0.0004750610763691899, + "loss": 0.2732, + "step": 103940 + }, + { + "epoch": 4.31, + "grad_norm": 0.59375, + "learning_rate": 0.0004750563543211273, + "loss": 0.2446, + "step": 103950 + }, + { + "epoch": 4.31, + "grad_norm": 0.71875, + "learning_rate": 0.0004750516318495329, + "loss": 0.2171, + "step": 103960 + }, + { + "epoch": 4.31, + "grad_norm": 0.59765625, + "learning_rate": 0.0004750469089544157, + "loss": 0.1708, + "step": 103970 + }, + { + "epoch": 4.31, + "grad_norm": 0.73828125, + "learning_rate": 0.0004750421856357845, + "loss": 0.1995, + "step": 103980 + }, + { + "epoch": 4.31, + "grad_norm": 0.97265625, + "learning_rate": 0.00047503746189364815, + "loss": 0.2344, + "step": 103990 + }, + { + "epoch": 4.31, + "grad_norm": 0.671875, + "learning_rate": 0.00047503273772801557, + "loss": 0.2257, + "step": 104000 + }, + { + "epoch": 4.31, + "grad_norm": 1.578125, + "learning_rate": 0.00047502801313889574, + "loss": 0.2312, + "step": 104010 + }, + { + "epoch": 4.31, + "grad_norm": 0.490234375, + "learning_rate": 0.00047502328812629754, + "loss": 0.2203, + "step": 104020 + }, + { + "epoch": 4.31, + "grad_norm": 0.828125, + "learning_rate": 0.00047501856269022976, + "loss": 0.1992, + "step": 104030 + }, + { + "epoch": 4.31, + "grad_norm": 0.63671875, + "learning_rate": 0.0004750138368307013, + "loss": 0.1871, + "step": 104040 + }, + { + "epoch": 4.31, + "grad_norm": 0.55078125, + "learning_rate": 0.0004750091105477212, + "loss": 0.1839, + "step": 104050 + }, + { + "epoch": 4.31, + "grad_norm": 0.9609375, + "learning_rate": 0.00047500438384129816, + "loss": 0.2164, + "step": 104060 + }, + { + "epoch": 4.31, + "grad_norm": 0.265625, + "learning_rate": 0.0004749996567114412, + "loss": 0.2192, + "step": 104070 + }, + { + "epoch": 4.31, + "grad_norm": 0.6796875, + "learning_rate": 0.0004749949291581592, + "loss": 0.2017, + "step": 104080 + }, + { + "epoch": 4.31, + "grad_norm": 0.45703125, + "learning_rate": 0.00047499020118146103, + "loss": 0.2377, + "step": 104090 + }, + { + "epoch": 4.31, + "grad_norm": 0.99609375, + "learning_rate": 0.0004749854727813556, + "loss": 0.2618, + "step": 104100 + }, + { + "epoch": 4.31, + "grad_norm": 0.53515625, + "learning_rate": 0.00047498074395785177, + "loss": 0.2097, + "step": 104110 + }, + { + "epoch": 4.31, + "grad_norm": 0.7578125, + "learning_rate": 0.0004749760147109585, + "loss": 0.2104, + "step": 104120 + }, + { + "epoch": 4.31, + "grad_norm": 0.8828125, + "learning_rate": 0.0004749712850406847, + "loss": 0.2339, + "step": 104130 + }, + { + "epoch": 4.31, + "grad_norm": 0.75, + "learning_rate": 0.00047496655494703923, + "loss": 0.2316, + "step": 104140 + }, + { + "epoch": 4.31, + "grad_norm": 0.86328125, + "learning_rate": 0.000474961824430031, + "loss": 0.2183, + "step": 104150 + }, + { + "epoch": 4.31, + "grad_norm": 0.4765625, + "learning_rate": 0.0004749570934896689, + "loss": 0.1697, + "step": 104160 + }, + { + "epoch": 4.31, + "grad_norm": 0.474609375, + "learning_rate": 0.00047495236212596176, + "loss": 0.2147, + "step": 104170 + }, + { + "epoch": 4.32, + "grad_norm": 0.515625, + "learning_rate": 0.0004749476303389186, + "loss": 0.2478, + "step": 104180 + }, + { + "epoch": 4.32, + "grad_norm": 0.53515625, + "learning_rate": 0.00047494289812854843, + "loss": 0.2308, + "step": 104190 + }, + { + "epoch": 4.32, + "grad_norm": 0.953125, + "learning_rate": 0.0004749381654948599, + "loss": 0.1952, + "step": 104200 + }, + { + "epoch": 4.32, + "grad_norm": 0.291015625, + "learning_rate": 0.000474933432437862, + "loss": 0.2044, + "step": 104210 + }, + { + "epoch": 4.32, + "grad_norm": 1.1875, + "learning_rate": 0.00047492869895756376, + "loss": 0.21, + "step": 104220 + }, + { + "epoch": 4.32, + "grad_norm": 1.0859375, + "learning_rate": 0.00047492396505397394, + "loss": 0.1955, + "step": 104230 + }, + { + "epoch": 4.32, + "grad_norm": 0.7421875, + "learning_rate": 0.0004749192307271015, + "loss": 0.2248, + "step": 104240 + }, + { + "epoch": 4.32, + "grad_norm": 0.0, + "learning_rate": 0.00047491449597695547, + "loss": 0.2205, + "step": 104250 + }, + { + "epoch": 4.32, + "grad_norm": 0.50390625, + "learning_rate": 0.00047490976080354454, + "loss": 0.1559, + "step": 104260 + }, + { + "epoch": 4.32, + "grad_norm": 0.83984375, + "learning_rate": 0.00047490502520687773, + "loss": 0.2153, + "step": 104270 + }, + { + "epoch": 4.32, + "grad_norm": 0.62890625, + "learning_rate": 0.0004749002891869639, + "loss": 0.1944, + "step": 104280 + }, + { + "epoch": 4.32, + "grad_norm": 0.40625, + "learning_rate": 0.0004748955527438121, + "loss": 0.1711, + "step": 104290 + }, + { + "epoch": 4.32, + "grad_norm": 0.26171875, + "learning_rate": 0.0004748908158774312, + "loss": 0.1978, + "step": 104300 + }, + { + "epoch": 4.32, + "grad_norm": 0.263671875, + "learning_rate": 0.00047488607858783003, + "loss": 0.1914, + "step": 104310 + }, + { + "epoch": 4.32, + "grad_norm": 0.0, + "learning_rate": 0.00047488134087501747, + "loss": 0.2515, + "step": 104320 + }, + { + "epoch": 4.32, + "grad_norm": 0.43359375, + "learning_rate": 0.0004748766027390026, + "loss": 0.2446, + "step": 104330 + }, + { + "epoch": 4.32, + "grad_norm": 1.7578125, + "learning_rate": 0.0004748718641797942, + "loss": 0.2196, + "step": 104340 + }, + { + "epoch": 4.32, + "grad_norm": 1.765625, + "learning_rate": 0.0004748671251974013, + "loss": 0.1929, + "step": 104350 + }, + { + "epoch": 4.32, + "grad_norm": 0.72265625, + "learning_rate": 0.00047486238579183267, + "loss": 0.1462, + "step": 104360 + }, + { + "epoch": 4.32, + "grad_norm": 1.6484375, + "learning_rate": 0.0004748576459630973, + "loss": 0.244, + "step": 104370 + }, + { + "epoch": 4.32, + "grad_norm": 0.60546875, + "learning_rate": 0.0004748529057112042, + "loss": 0.2119, + "step": 104380 + }, + { + "epoch": 4.32, + "grad_norm": 0.9453125, + "learning_rate": 0.00047484816503616224, + "loss": 0.2017, + "step": 104390 + }, + { + "epoch": 4.32, + "grad_norm": 0.59375, + "learning_rate": 0.0004748434239379802, + "loss": 0.2216, + "step": 104400 + }, + { + "epoch": 4.32, + "grad_norm": 1.9140625, + "learning_rate": 0.00047483868241666717, + "loss": 0.2312, + "step": 104410 + }, + { + "epoch": 4.33, + "grad_norm": 0.86328125, + "learning_rate": 0.00047483394047223207, + "loss": 0.1796, + "step": 104420 + }, + { + "epoch": 4.33, + "grad_norm": 0.75390625, + "learning_rate": 0.0004748291981046837, + "loss": 0.2282, + "step": 104430 + }, + { + "epoch": 4.33, + "grad_norm": 0.61328125, + "learning_rate": 0.0004748244553140311, + "loss": 0.198, + "step": 104440 + }, + { + "epoch": 4.33, + "grad_norm": 1.3515625, + "learning_rate": 0.00047481971210028317, + "loss": 0.2258, + "step": 104450 + }, + { + "epoch": 4.33, + "grad_norm": 0.2890625, + "learning_rate": 0.0004748149684634488, + "loss": 0.1607, + "step": 104460 + }, + { + "epoch": 4.33, + "grad_norm": 0.45703125, + "learning_rate": 0.0004748102244035369, + "loss": 0.219, + "step": 104470 + }, + { + "epoch": 4.33, + "grad_norm": 1.6171875, + "learning_rate": 0.00047480547992055644, + "loss": 0.2229, + "step": 104480 + }, + { + "epoch": 4.33, + "grad_norm": 1.2578125, + "learning_rate": 0.0004748007350145164, + "loss": 0.2363, + "step": 104490 + }, + { + "epoch": 4.33, + "grad_norm": 1.0703125, + "learning_rate": 0.00047479598968542565, + "loss": 0.2356, + "step": 104500 + }, + { + "epoch": 4.33, + "grad_norm": 2.359375, + "learning_rate": 0.00047479124393329307, + "loss": 0.2168, + "step": 104510 + }, + { + "epoch": 4.33, + "grad_norm": 0.44140625, + "learning_rate": 0.0004747864977581277, + "loss": 0.1996, + "step": 104520 + }, + { + "epoch": 4.33, + "grad_norm": 0.59375, + "learning_rate": 0.00047478175115993836, + "loss": 0.2106, + "step": 104530 + }, + { + "epoch": 4.33, + "grad_norm": 0.75390625, + "learning_rate": 0.000474777004138734, + "loss": 0.2036, + "step": 104540 + }, + { + "epoch": 4.33, + "grad_norm": 0.6875, + "learning_rate": 0.00047477225669452375, + "loss": 0.2406, + "step": 104550 + }, + { + "epoch": 4.33, + "grad_norm": 1.1484375, + "learning_rate": 0.0004747675088273163, + "loss": 0.2026, + "step": 104560 + }, + { + "epoch": 4.33, + "grad_norm": 0.828125, + "learning_rate": 0.00047476276053712063, + "loss": 0.1517, + "step": 104570 + }, + { + "epoch": 4.33, + "grad_norm": 0.78125, + "learning_rate": 0.0004747580118239457, + "loss": 0.205, + "step": 104580 + }, + { + "epoch": 4.33, + "grad_norm": 0.376953125, + "learning_rate": 0.0004747532626878006, + "loss": 0.2147, + "step": 104590 + }, + { + "epoch": 4.33, + "grad_norm": 0.92578125, + "learning_rate": 0.00047474851312869404, + "loss": 0.1667, + "step": 104600 + }, + { + "epoch": 4.33, + "grad_norm": 0.45703125, + "learning_rate": 0.00047474376314663504, + "loss": 0.2254, + "step": 104610 + }, + { + "epoch": 4.33, + "grad_norm": 0.578125, + "learning_rate": 0.0004747390127416326, + "loss": 0.184, + "step": 104620 + }, + { + "epoch": 4.33, + "grad_norm": 0.86328125, + "learning_rate": 0.0004747342619136955, + "loss": 0.1654, + "step": 104630 + }, + { + "epoch": 4.33, + "grad_norm": 0.70703125, + "learning_rate": 0.0004747295106628329, + "loss": 0.1949, + "step": 104640 + }, + { + "epoch": 4.33, + "grad_norm": 0.63671875, + "learning_rate": 0.0004747247589890536, + "loss": 0.1756, + "step": 104650 + }, + { + "epoch": 4.34, + "grad_norm": 0.55078125, + "learning_rate": 0.00047472000689236654, + "loss": 0.1966, + "step": 104660 + }, + { + "epoch": 4.34, + "grad_norm": 0.6484375, + "learning_rate": 0.0004747152543727807, + "loss": 0.2338, + "step": 104670 + }, + { + "epoch": 4.34, + "grad_norm": 0.96875, + "learning_rate": 0.00047471050143030503, + "loss": 0.2417, + "step": 104680 + }, + { + "epoch": 4.34, + "grad_norm": 0.447265625, + "learning_rate": 0.0004747057480649485, + "loss": 0.1883, + "step": 104690 + }, + { + "epoch": 4.34, + "grad_norm": 0.5, + "learning_rate": 0.00047470099427672, + "loss": 0.2364, + "step": 104700 + }, + { + "epoch": 4.34, + "grad_norm": 0.4140625, + "learning_rate": 0.0004746962400656285, + "loss": 0.264, + "step": 104710 + }, + { + "epoch": 4.34, + "grad_norm": 1.3828125, + "learning_rate": 0.0004746914854316829, + "loss": 0.1832, + "step": 104720 + }, + { + "epoch": 4.34, + "grad_norm": 0.8125, + "learning_rate": 0.0004746867303748922, + "loss": 0.2352, + "step": 104730 + }, + { + "epoch": 4.34, + "grad_norm": 1.328125, + "learning_rate": 0.00047468197489526534, + "loss": 0.2213, + "step": 104740 + }, + { + "epoch": 4.34, + "grad_norm": 0.73046875, + "learning_rate": 0.00047467721899281125, + "loss": 0.2075, + "step": 104750 + }, + { + "epoch": 4.34, + "grad_norm": 0.5625, + "learning_rate": 0.0004746724626675389, + "loss": 0.2043, + "step": 104760 + }, + { + "epoch": 4.34, + "grad_norm": 0.271484375, + "learning_rate": 0.0004746677059194573, + "loss": 0.2027, + "step": 104770 + }, + { + "epoch": 4.34, + "grad_norm": 1.4375, + "learning_rate": 0.0004746629487485753, + "loss": 0.1453, + "step": 104780 + }, + { + "epoch": 4.34, + "grad_norm": 0.400390625, + "learning_rate": 0.0004746581911549019, + "loss": 0.271, + "step": 104790 + }, + { + "epoch": 4.34, + "grad_norm": 1.8046875, + "learning_rate": 0.000474653433138446, + "loss": 0.247, + "step": 104800 + }, + { + "epoch": 4.34, + "grad_norm": 0.328125, + "learning_rate": 0.0004746486746992167, + "loss": 0.1844, + "step": 104810 + }, + { + "epoch": 4.34, + "grad_norm": 0.6953125, + "learning_rate": 0.00047464391583722275, + "loss": 0.1968, + "step": 104820 + }, + { + "epoch": 4.34, + "grad_norm": 0.69921875, + "learning_rate": 0.00047463915655247325, + "loss": 0.1929, + "step": 104830 + }, + { + "epoch": 4.34, + "grad_norm": 0.65625, + "learning_rate": 0.0004746343968449771, + "loss": 0.2398, + "step": 104840 + }, + { + "epoch": 4.34, + "grad_norm": 0.412109375, + "learning_rate": 0.00047462963671474333, + "loss": 0.2661, + "step": 104850 + }, + { + "epoch": 4.34, + "grad_norm": 0.48828125, + "learning_rate": 0.0004746248761617808, + "loss": 0.2602, + "step": 104860 + }, + { + "epoch": 4.34, + "grad_norm": 0.671875, + "learning_rate": 0.0004746201151860985, + "loss": 0.2117, + "step": 104870 + }, + { + "epoch": 4.34, + "grad_norm": 0.384765625, + "learning_rate": 0.00047461535378770536, + "loss": 0.2132, + "step": 104880 + }, + { + "epoch": 4.34, + "grad_norm": 1.0234375, + "learning_rate": 0.0004746105919666105, + "loss": 0.1879, + "step": 104890 + }, + { + "epoch": 4.34, + "grad_norm": 0.6875, + "learning_rate": 0.0004746058297228227, + "loss": 0.244, + "step": 104900 + }, + { + "epoch": 4.35, + "grad_norm": 0.51171875, + "learning_rate": 0.000474601067056351, + "loss": 0.1979, + "step": 104910 + }, + { + "epoch": 4.35, + "grad_norm": 0.65625, + "learning_rate": 0.00047459630396720425, + "loss": 0.1698, + "step": 104920 + }, + { + "epoch": 4.35, + "grad_norm": 0.86328125, + "learning_rate": 0.0004745915404553916, + "loss": 0.1871, + "step": 104930 + }, + { + "epoch": 4.35, + "grad_norm": 0.45703125, + "learning_rate": 0.00047458677652092193, + "loss": 0.2055, + "step": 104940 + }, + { + "epoch": 4.35, + "grad_norm": 0.490234375, + "learning_rate": 0.0004745820121638041, + "loss": 0.1883, + "step": 104950 + }, + { + "epoch": 4.35, + "grad_norm": 1.3984375, + "learning_rate": 0.0004745772473840473, + "loss": 0.1706, + "step": 104960 + }, + { + "epoch": 4.35, + "grad_norm": 0.953125, + "learning_rate": 0.00047457248218166036, + "loss": 0.2203, + "step": 104970 + }, + { + "epoch": 4.35, + "grad_norm": 0.0, + "learning_rate": 0.0004745677165566522, + "loss": 0.2505, + "step": 104980 + }, + { + "epoch": 4.35, + "grad_norm": 0.423828125, + "learning_rate": 0.0004745629505090319, + "loss": 0.1988, + "step": 104990 + }, + { + "epoch": 4.35, + "grad_norm": 0.40625, + "learning_rate": 0.00047455818403880837, + "loss": 0.175, + "step": 105000 + }, + { + "epoch": 4.35, + "grad_norm": 0.92578125, + "learning_rate": 0.00047455341714599056, + "loss": 0.2253, + "step": 105010 + }, + { + "epoch": 4.35, + "grad_norm": 0.275390625, + "learning_rate": 0.00047454864983058744, + "loss": 0.187, + "step": 105020 + }, + { + "epoch": 4.35, + "grad_norm": 0.73046875, + "learning_rate": 0.00047454388209260806, + "loss": 0.196, + "step": 105030 + }, + { + "epoch": 4.35, + "grad_norm": 1.046875, + "learning_rate": 0.0004745391139320613, + "loss": 0.1801, + "step": 105040 + }, + { + "epoch": 4.35, + "grad_norm": 0.8125, + "learning_rate": 0.0004745343453489562, + "loss": 0.255, + "step": 105050 + }, + { + "epoch": 4.35, + "grad_norm": 0.3671875, + "learning_rate": 0.00047452957634330176, + "loss": 0.1998, + "step": 105060 + }, + { + "epoch": 4.35, + "grad_norm": 0.73046875, + "learning_rate": 0.00047452480691510693, + "loss": 0.2142, + "step": 105070 + }, + { + "epoch": 4.35, + "grad_norm": 0.65234375, + "learning_rate": 0.0004745200370643805, + "loss": 0.2451, + "step": 105080 + }, + { + "epoch": 4.35, + "grad_norm": 0.90625, + "learning_rate": 0.0004745152667911318, + "loss": 0.2275, + "step": 105090 + }, + { + "epoch": 4.35, + "grad_norm": 0.6796875, + "learning_rate": 0.00047451049609536946, + "loss": 0.1888, + "step": 105100 + }, + { + "epoch": 4.35, + "grad_norm": 1.1171875, + "learning_rate": 0.00047450572497710267, + "loss": 0.2329, + "step": 105110 + }, + { + "epoch": 4.35, + "grad_norm": 0.4140625, + "learning_rate": 0.0004745009534363404, + "loss": 0.2164, + "step": 105120 + }, + { + "epoch": 4.35, + "grad_norm": 0.193359375, + "learning_rate": 0.00047449618147309147, + "loss": 0.1994, + "step": 105130 + }, + { + "epoch": 4.35, + "grad_norm": 1.078125, + "learning_rate": 0.000474491409087365, + "loss": 0.2498, + "step": 105140 + }, + { + "epoch": 4.36, + "grad_norm": 0.68359375, + "learning_rate": 0.00047448663627917, + "loss": 0.2478, + "step": 105150 + }, + { + "epoch": 4.36, + "grad_norm": 0.921875, + "learning_rate": 0.0004744818630485153, + "loss": 0.1854, + "step": 105160 + }, + { + "epoch": 4.36, + "grad_norm": 0.765625, + "learning_rate": 0.0004744770893954101, + "loss": 0.2629, + "step": 105170 + }, + { + "epoch": 4.36, + "grad_norm": 0.94921875, + "learning_rate": 0.00047447231531986314, + "loss": 0.2027, + "step": 105180 + }, + { + "epoch": 4.36, + "grad_norm": 0.55078125, + "learning_rate": 0.0004744675408218836, + "loss": 0.1902, + "step": 105190 + }, + { + "epoch": 4.36, + "grad_norm": 1.140625, + "learning_rate": 0.00047446276590148035, + "loss": 0.1768, + "step": 105200 + }, + { + "epoch": 4.36, + "grad_norm": 0.4453125, + "learning_rate": 0.00047445799055866245, + "loss": 0.2226, + "step": 105210 + }, + { + "epoch": 4.36, + "grad_norm": 0.443359375, + "learning_rate": 0.00047445321479343885, + "loss": 0.2527, + "step": 105220 + }, + { + "epoch": 4.36, + "grad_norm": 0.41015625, + "learning_rate": 0.0004744484386058185, + "loss": 0.2212, + "step": 105230 + }, + { + "epoch": 4.36, + "grad_norm": 0.5078125, + "learning_rate": 0.00047444366199581045, + "loss": 0.2089, + "step": 105240 + }, + { + "epoch": 4.36, + "grad_norm": 0.890625, + "learning_rate": 0.00047443888496342365, + "loss": 0.2569, + "step": 105250 + }, + { + "epoch": 4.36, + "grad_norm": 1.3046875, + "learning_rate": 0.0004744341075086671, + "loss": 0.213, + "step": 105260 + }, + { + "epoch": 4.36, + "grad_norm": 0.9296875, + "learning_rate": 0.0004744293296315498, + "loss": 0.1995, + "step": 105270 + }, + { + "epoch": 4.36, + "grad_norm": 0.546875, + "learning_rate": 0.0004744245513320807, + "loss": 0.1829, + "step": 105280 + }, + { + "epoch": 4.36, + "grad_norm": 0.3359375, + "learning_rate": 0.0004744197726102689, + "loss": 0.1605, + "step": 105290 + }, + { + "epoch": 4.36, + "grad_norm": 0.77734375, + "learning_rate": 0.0004744149934661234, + "loss": 0.2085, + "step": 105300 + }, + { + "epoch": 4.36, + "grad_norm": 0.703125, + "learning_rate": 0.000474410213899653, + "loss": 0.1985, + "step": 105310 + }, + { + "epoch": 4.36, + "grad_norm": 0.2197265625, + "learning_rate": 0.00047440543391086683, + "loss": 0.1915, + "step": 105320 + }, + { + "epoch": 4.36, + "grad_norm": 1.1875, + "learning_rate": 0.00047440065349977386, + "loss": 0.2257, + "step": 105330 + }, + { + "epoch": 4.36, + "grad_norm": 0.76171875, + "learning_rate": 0.00047439587266638307, + "loss": 0.2236, + "step": 105340 + }, + { + "epoch": 4.36, + "grad_norm": 0.81640625, + "learning_rate": 0.00047439109141070356, + "loss": 0.1814, + "step": 105350 + }, + { + "epoch": 4.36, + "grad_norm": 0.98828125, + "learning_rate": 0.0004743863097327442, + "loss": 0.179, + "step": 105360 + }, + { + "epoch": 4.36, + "grad_norm": 0.462890625, + "learning_rate": 0.000474381527632514, + "loss": 0.181, + "step": 105370 + }, + { + "epoch": 4.36, + "grad_norm": 0.453125, + "learning_rate": 0.00047437674511002206, + "loss": 0.1906, + "step": 105380 + }, + { + "epoch": 4.37, + "grad_norm": 1.125, + "learning_rate": 0.00047437196216527726, + "loss": 0.2522, + "step": 105390 + }, + { + "epoch": 4.37, + "grad_norm": 0.466796875, + "learning_rate": 0.00047436717879828874, + "loss": 0.2143, + "step": 105400 + }, + { + "epoch": 4.37, + "grad_norm": 0.578125, + "learning_rate": 0.00047436239500906537, + "loss": 0.1683, + "step": 105410 + }, + { + "epoch": 4.37, + "grad_norm": 0.8046875, + "learning_rate": 0.00047435761079761616, + "loss": 0.2136, + "step": 105420 + }, + { + "epoch": 4.37, + "grad_norm": 0.75, + "learning_rate": 0.00047435282616395023, + "loss": 0.1957, + "step": 105430 + }, + { + "epoch": 4.37, + "grad_norm": 0.9140625, + "learning_rate": 0.0004743480411080765, + "loss": 0.2102, + "step": 105440 + }, + { + "epoch": 4.37, + "grad_norm": 0.60546875, + "learning_rate": 0.00047434325563000394, + "loss": 0.2476, + "step": 105450 + }, + { + "epoch": 4.37, + "grad_norm": 0.66796875, + "learning_rate": 0.0004743384697297416, + "loss": 0.1829, + "step": 105460 + }, + { + "epoch": 4.37, + "grad_norm": 0.54296875, + "learning_rate": 0.0004743336834072985, + "loss": 0.2192, + "step": 105470 + }, + { + "epoch": 4.37, + "grad_norm": 1.328125, + "learning_rate": 0.00047432889666268365, + "loss": 0.1741, + "step": 105480 + }, + { + "epoch": 4.37, + "grad_norm": 0.55859375, + "learning_rate": 0.000474324109495906, + "loss": 0.1587, + "step": 105490 + }, + { + "epoch": 4.37, + "grad_norm": 0.49609375, + "learning_rate": 0.0004743193219069746, + "loss": 0.2415, + "step": 105500 + }, + { + "epoch": 4.37, + "grad_norm": 0.455078125, + "learning_rate": 0.0004743145338958985, + "loss": 0.2352, + "step": 105510 + }, + { + "epoch": 4.37, + "grad_norm": 1.1953125, + "learning_rate": 0.00047430974546268666, + "loss": 0.1747, + "step": 105520 + }, + { + "epoch": 4.37, + "grad_norm": 0.890625, + "learning_rate": 0.0004743049566073482, + "loss": 0.2103, + "step": 105530 + }, + { + "epoch": 4.37, + "grad_norm": 0.8046875, + "learning_rate": 0.0004743001673298919, + "loss": 0.2008, + "step": 105540 + }, + { + "epoch": 4.37, + "grad_norm": 1.125, + "learning_rate": 0.0004742953776303269, + "loss": 0.1679, + "step": 105550 + }, + { + "epoch": 4.37, + "grad_norm": 1.390625, + "learning_rate": 0.00047429058750866226, + "loss": 0.2435, + "step": 105560 + }, + { + "epoch": 4.37, + "grad_norm": 0.8671875, + "learning_rate": 0.000474285796964907, + "loss": 0.1683, + "step": 105570 + }, + { + "epoch": 4.37, + "grad_norm": 0.78515625, + "learning_rate": 0.0004742810059990701, + "loss": 0.2306, + "step": 105580 + }, + { + "epoch": 4.37, + "grad_norm": 0.65625, + "learning_rate": 0.0004742762146111604, + "loss": 0.2008, + "step": 105590 + }, + { + "epoch": 4.37, + "grad_norm": 0.60546875, + "learning_rate": 0.0004742714228011873, + "loss": 0.2029, + "step": 105600 + }, + { + "epoch": 4.37, + "grad_norm": 0.51171875, + "learning_rate": 0.00047426663056915954, + "loss": 0.2036, + "step": 105610 + }, + { + "epoch": 4.37, + "grad_norm": 0.85546875, + "learning_rate": 0.00047426183791508613, + "loss": 0.1575, + "step": 105620 + }, + { + "epoch": 4.38, + "grad_norm": 1.359375, + "learning_rate": 0.0004742570448389762, + "loss": 0.1993, + "step": 105630 + }, + { + "epoch": 4.38, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004742522513408387, + "loss": 0.215, + "step": 105640 + }, + { + "epoch": 4.38, + "grad_norm": 0.74609375, + "learning_rate": 0.0004742474574206827, + "loss": 0.1861, + "step": 105650 + }, + { + "epoch": 4.38, + "grad_norm": 2.796875, + "learning_rate": 0.0004742426630785172, + "loss": 0.2094, + "step": 105660 + }, + { + "epoch": 4.38, + "grad_norm": 1.3359375, + "learning_rate": 0.0004742378683143512, + "loss": 0.1938, + "step": 105670 + }, + { + "epoch": 4.38, + "grad_norm": 0.3125, + "learning_rate": 0.00047423307312819383, + "loss": 0.1957, + "step": 105680 + }, + { + "epoch": 4.38, + "grad_norm": 0.0, + "learning_rate": 0.00047422827752005396, + "loss": 0.156, + "step": 105690 + }, + { + "epoch": 4.38, + "grad_norm": 0.65625, + "learning_rate": 0.0004742234814899407, + "loss": 0.2224, + "step": 105700 + }, + { + "epoch": 4.38, + "grad_norm": 1.5234375, + "learning_rate": 0.00047421868503786307, + "loss": 0.2444, + "step": 105710 + }, + { + "epoch": 4.38, + "grad_norm": 1.265625, + "learning_rate": 0.0004742138881638301, + "loss": 0.1802, + "step": 105720 + }, + { + "epoch": 4.38, + "grad_norm": 0.55859375, + "learning_rate": 0.0004742090908678508, + "loss": 0.2608, + "step": 105730 + }, + { + "epoch": 4.38, + "grad_norm": 0.67578125, + "learning_rate": 0.00047420429314993417, + "loss": 0.1886, + "step": 105740 + }, + { + "epoch": 4.38, + "grad_norm": 0.41796875, + "learning_rate": 0.0004741994950100893, + "loss": 0.2283, + "step": 105750 + }, + { + "epoch": 4.38, + "grad_norm": 0.37890625, + "learning_rate": 0.0004741946964483253, + "loss": 0.2044, + "step": 105760 + }, + { + "epoch": 4.38, + "grad_norm": 1.3828125, + "learning_rate": 0.0004741898974646509, + "loss": 0.247, + "step": 105770 + }, + { + "epoch": 4.38, + "grad_norm": 0.421875, + "learning_rate": 0.0004741850980590754, + "loss": 0.2794, + "step": 105780 + }, + { + "epoch": 4.38, + "grad_norm": 0.62890625, + "learning_rate": 0.0004741802982316078, + "loss": 0.2363, + "step": 105790 + }, + { + "epoch": 4.38, + "grad_norm": 0.625, + "learning_rate": 0.000474175497982257, + "loss": 0.1893, + "step": 105800 + }, + { + "epoch": 4.38, + "grad_norm": 1.2890625, + "learning_rate": 0.0004741706973110322, + "loss": 0.2235, + "step": 105810 + }, + { + "epoch": 4.38, + "grad_norm": 1.0, + "learning_rate": 0.00047416589621794233, + "loss": 0.2008, + "step": 105820 + }, + { + "epoch": 4.38, + "grad_norm": 0.330078125, + "learning_rate": 0.0004741610947029964, + "loss": 0.2194, + "step": 105830 + }, + { + "epoch": 4.38, + "grad_norm": 0.7890625, + "learning_rate": 0.0004741562927662035, + "loss": 0.1852, + "step": 105840 + }, + { + "epoch": 4.38, + "grad_norm": 1.4296875, + "learning_rate": 0.00047415149040757275, + "loss": 0.1357, + "step": 105850 + }, + { + "epoch": 4.38, + "grad_norm": 3.75, + "learning_rate": 0.0004741466876271131, + "loss": 0.2078, + "step": 105860 + }, + { + "epoch": 4.39, + "grad_norm": 0.7109375, + "learning_rate": 0.0004741418844248335, + "loss": 0.2035, + "step": 105870 + }, + { + "epoch": 4.39, + "grad_norm": 0.57421875, + "learning_rate": 0.0004741370808007431, + "loss": 0.2136, + "step": 105880 + }, + { + "epoch": 4.39, + "grad_norm": 0.357421875, + "learning_rate": 0.000474132276754851, + "loss": 0.2437, + "step": 105890 + }, + { + "epoch": 4.39, + "grad_norm": 0.52734375, + "learning_rate": 0.0004741274722871661, + "loss": 0.2156, + "step": 105900 + }, + { + "epoch": 4.39, + "grad_norm": 0.5390625, + "learning_rate": 0.0004741226673976975, + "loss": 0.2043, + "step": 105910 + }, + { + "epoch": 4.39, + "grad_norm": 0.390625, + "learning_rate": 0.0004741178620864542, + "loss": 0.1882, + "step": 105920 + }, + { + "epoch": 4.39, + "grad_norm": 0.74609375, + "learning_rate": 0.00047411305635344537, + "loss": 0.1609, + "step": 105930 + }, + { + "epoch": 4.39, + "grad_norm": 1.0234375, + "learning_rate": 0.00047410825019867997, + "loss": 0.2455, + "step": 105940 + }, + { + "epoch": 4.39, + "grad_norm": 0.5546875, + "learning_rate": 0.000474103443622167, + "loss": 0.1891, + "step": 105950 + }, + { + "epoch": 4.39, + "grad_norm": 0.52734375, + "learning_rate": 0.00047409863662391564, + "loss": 0.1841, + "step": 105960 + }, + { + "epoch": 4.39, + "grad_norm": 1.234375, + "learning_rate": 0.0004740938292039347, + "loss": 0.1985, + "step": 105970 + }, + { + "epoch": 4.39, + "grad_norm": 0.609375, + "learning_rate": 0.0004740890213622334, + "loss": 0.2205, + "step": 105980 + }, + { + "epoch": 4.39, + "grad_norm": 1.296875, + "learning_rate": 0.00047408421309882087, + "loss": 0.2107, + "step": 105990 + }, + { + "epoch": 4.39, + "grad_norm": 2.5, + "learning_rate": 0.00047407940441370595, + "loss": 0.2247, + "step": 106000 + }, + { + "epoch": 4.39, + "grad_norm": 0.59375, + "learning_rate": 0.00047407459530689785, + "loss": 0.1863, + "step": 106010 + }, + { + "epoch": 4.39, + "grad_norm": 0.318359375, + "learning_rate": 0.00047406978577840554, + "loss": 0.2319, + "step": 106020 + }, + { + "epoch": 4.39, + "grad_norm": 0.66015625, + "learning_rate": 0.00047406497582823805, + "loss": 0.231, + "step": 106030 + }, + { + "epoch": 4.39, + "grad_norm": 0.796875, + "learning_rate": 0.0004740601654564045, + "loss": 0.2539, + "step": 106040 + }, + { + "epoch": 4.39, + "grad_norm": 0.361328125, + "learning_rate": 0.0004740553546629139, + "loss": 0.2312, + "step": 106050 + }, + { + "epoch": 4.39, + "grad_norm": 0.62890625, + "learning_rate": 0.00047405054344777534, + "loss": 0.2625, + "step": 106060 + }, + { + "epoch": 4.39, + "grad_norm": 0.45703125, + "learning_rate": 0.00047404573181099785, + "loss": 0.1775, + "step": 106070 + }, + { + "epoch": 4.39, + "grad_norm": 1.0390625, + "learning_rate": 0.0004740409197525905, + "loss": 0.1408, + "step": 106080 + }, + { + "epoch": 4.39, + "grad_norm": 0.0, + "learning_rate": 0.0004740361072725623, + "loss": 0.2081, + "step": 106090 + }, + { + "epoch": 4.39, + "grad_norm": 0.33203125, + "learning_rate": 0.0004740312943709224, + "loss": 0.1605, + "step": 106100 + }, + { + "epoch": 4.4, + "grad_norm": 0.75390625, + "learning_rate": 0.0004740264810476798, + "loss": 0.1909, + "step": 106110 + }, + { + "epoch": 4.4, + "grad_norm": 0.828125, + "learning_rate": 0.00047402166730284344, + "loss": 0.2368, + "step": 106120 + }, + { + "epoch": 4.4, + "grad_norm": 0.66796875, + "learning_rate": 0.00047401685313642263, + "loss": 0.1929, + "step": 106130 + }, + { + "epoch": 4.4, + "grad_norm": 0.5, + "learning_rate": 0.0004740120385484262, + "loss": 0.1928, + "step": 106140 + }, + { + "epoch": 4.4, + "grad_norm": 0.96484375, + "learning_rate": 0.00047400722353886334, + "loss": 0.2245, + "step": 106150 + }, + { + "epoch": 4.4, + "grad_norm": 0.5703125, + "learning_rate": 0.0004740024081077431, + "loss": 0.1644, + "step": 106160 + }, + { + "epoch": 4.4, + "grad_norm": 0.828125, + "learning_rate": 0.00047399759225507445, + "loss": 0.198, + "step": 106170 + }, + { + "epoch": 4.4, + "grad_norm": 0.46875, + "learning_rate": 0.00047399277598086654, + "loss": 0.2131, + "step": 106180 + }, + { + "epoch": 4.4, + "grad_norm": 0.75390625, + "learning_rate": 0.0004739879592851285, + "loss": 0.2018, + "step": 106190 + }, + { + "epoch": 4.4, + "grad_norm": 0.625, + "learning_rate": 0.0004739831421678692, + "loss": 0.2099, + "step": 106200 + }, + { + "epoch": 4.4, + "grad_norm": 2.453125, + "learning_rate": 0.00047397832462909786, + "loss": 0.2156, + "step": 106210 + }, + { + "epoch": 4.4, + "grad_norm": 0.3203125, + "learning_rate": 0.00047397350666882347, + "loss": 0.2654, + "step": 106220 + }, + { + "epoch": 4.4, + "grad_norm": 1.3828125, + "learning_rate": 0.0004739686882870552, + "loss": 0.2207, + "step": 106230 + }, + { + "epoch": 4.4, + "grad_norm": 0.427734375, + "learning_rate": 0.000473963869483802, + "loss": 0.2476, + "step": 106240 + }, + { + "epoch": 4.4, + "grad_norm": 1.2109375, + "learning_rate": 0.000473959050259073, + "loss": 0.2922, + "step": 106250 + }, + { + "epoch": 4.4, + "grad_norm": 0.70703125, + "learning_rate": 0.0004739542306128772, + "loss": 0.2517, + "step": 106260 + }, + { + "epoch": 4.4, + "grad_norm": 1.4921875, + "learning_rate": 0.0004739494105452238, + "loss": 0.1587, + "step": 106270 + }, + { + "epoch": 4.4, + "grad_norm": 1.453125, + "learning_rate": 0.00047394459005612176, + "loss": 0.1764, + "step": 106280 + }, + { + "epoch": 4.4, + "grad_norm": 0.302734375, + "learning_rate": 0.00047393976914558017, + "loss": 0.1751, + "step": 106290 + }, + { + "epoch": 4.4, + "grad_norm": 0.48046875, + "learning_rate": 0.0004739349478136081, + "loss": 0.2027, + "step": 106300 + }, + { + "epoch": 4.4, + "grad_norm": 0.85546875, + "learning_rate": 0.0004739301260602147, + "loss": 0.2041, + "step": 106310 + }, + { + "epoch": 4.4, + "grad_norm": 0.56640625, + "learning_rate": 0.000473925303885409, + "loss": 0.2407, + "step": 106320 + }, + { + "epoch": 4.4, + "grad_norm": 1.7265625, + "learning_rate": 0.00047392048128919997, + "loss": 0.2196, + "step": 106330 + }, + { + "epoch": 4.4, + "grad_norm": 0.72265625, + "learning_rate": 0.00047391565827159684, + "loss": 0.2324, + "step": 106340 + }, + { + "epoch": 4.41, + "grad_norm": 0.6875, + "learning_rate": 0.0004739108348326086, + "loss": 0.2242, + "step": 106350 + }, + { + "epoch": 4.41, + "grad_norm": 0.921875, + "learning_rate": 0.0004739060109722444, + "loss": 0.1981, + "step": 106360 + }, + { + "epoch": 4.41, + "grad_norm": 0.69921875, + "learning_rate": 0.0004739011866905132, + "loss": 0.1879, + "step": 106370 + }, + { + "epoch": 4.41, + "grad_norm": 1.609375, + "learning_rate": 0.00047389636198742416, + "loss": 0.2335, + "step": 106380 + }, + { + "epoch": 4.41, + "grad_norm": 1.6875, + "learning_rate": 0.00047389153686298635, + "loss": 0.2135, + "step": 106390 + }, + { + "epoch": 4.41, + "grad_norm": 0.6015625, + "learning_rate": 0.0004738867113172088, + "loss": 0.1922, + "step": 106400 + }, + { + "epoch": 4.41, + "grad_norm": 0.59765625, + "learning_rate": 0.0004738818853501007, + "loss": 0.1899, + "step": 106410 + }, + { + "epoch": 4.41, + "grad_norm": 0.478515625, + "learning_rate": 0.00047387705896167104, + "loss": 0.2123, + "step": 106420 + }, + { + "epoch": 4.41, + "grad_norm": 0.3359375, + "learning_rate": 0.000473872232151929, + "loss": 0.232, + "step": 106430 + }, + { + "epoch": 4.41, + "grad_norm": 0.404296875, + "learning_rate": 0.00047386740492088355, + "loss": 0.22, + "step": 106440 + }, + { + "epoch": 4.41, + "grad_norm": 0.328125, + "learning_rate": 0.00047386257726854376, + "loss": 0.2208, + "step": 106450 + }, + { + "epoch": 4.41, + "grad_norm": 0.50390625, + "learning_rate": 0.0004738577491949189, + "loss": 0.2214, + "step": 106460 + }, + { + "epoch": 4.41, + "grad_norm": 0.55078125, + "learning_rate": 0.0004738529207000178, + "loss": 0.1986, + "step": 106470 + }, + { + "epoch": 4.41, + "grad_norm": 0.376953125, + "learning_rate": 0.0004738480917838497, + "loss": 0.182, + "step": 106480 + }, + { + "epoch": 4.41, + "grad_norm": 1.9140625, + "learning_rate": 0.00047384326244642375, + "loss": 0.1924, + "step": 106490 + }, + { + "epoch": 4.41, + "grad_norm": 0.33203125, + "learning_rate": 0.00047383843268774883, + "loss": 0.1686, + "step": 106500 + }, + { + "epoch": 4.41, + "grad_norm": 1.03125, + "learning_rate": 0.00047383360250783426, + "loss": 0.1785, + "step": 106510 + }, + { + "epoch": 4.41, + "grad_norm": 0.0, + "learning_rate": 0.00047382877190668894, + "loss": 0.2151, + "step": 106520 + }, + { + "epoch": 4.41, + "grad_norm": 0.60546875, + "learning_rate": 0.0004738239408843221, + "loss": 0.2538, + "step": 106530 + }, + { + "epoch": 4.41, + "grad_norm": 0.201171875, + "learning_rate": 0.0004738191094407428, + "loss": 0.2114, + "step": 106540 + }, + { + "epoch": 4.41, + "grad_norm": 1.0390625, + "learning_rate": 0.0004738142775759601, + "loss": 0.1847, + "step": 106550 + }, + { + "epoch": 4.41, + "grad_norm": 0.419921875, + "learning_rate": 0.00047380944528998306, + "loss": 0.2388, + "step": 106560 + }, + { + "epoch": 4.41, + "grad_norm": 0.6328125, + "learning_rate": 0.0004738046125828208, + "loss": 0.1763, + "step": 106570 + }, + { + "epoch": 4.41, + "grad_norm": 0.46875, + "learning_rate": 0.0004737997794544824, + "loss": 0.1727, + "step": 106580 + }, + { + "epoch": 4.41, + "grad_norm": 0.65234375, + "learning_rate": 0.00047379494590497705, + "loss": 0.2172, + "step": 106590 + }, + { + "epoch": 4.42, + "grad_norm": 0.373046875, + "learning_rate": 0.00047379011193431374, + "loss": 0.2337, + "step": 106600 + }, + { + "epoch": 4.42, + "grad_norm": 0.408203125, + "learning_rate": 0.0004737852775425017, + "loss": 0.1998, + "step": 106610 + }, + { + "epoch": 4.42, + "grad_norm": 0.8828125, + "learning_rate": 0.0004737804427295498, + "loss": 0.2529, + "step": 106620 + }, + { + "epoch": 4.42, + "grad_norm": 0.59765625, + "learning_rate": 0.00047377560749546735, + "loss": 0.2144, + "step": 106630 + }, + { + "epoch": 4.42, + "grad_norm": 0.421875, + "learning_rate": 0.0004737707718402634, + "loss": 0.2163, + "step": 106640 + }, + { + "epoch": 4.42, + "grad_norm": 0.36328125, + "learning_rate": 0.0004737659357639469, + "loss": 0.2144, + "step": 106650 + }, + { + "epoch": 4.42, + "grad_norm": 0.5859375, + "learning_rate": 0.00047376109926652724, + "loss": 0.2092, + "step": 106660 + }, + { + "epoch": 4.42, + "grad_norm": 1.3984375, + "learning_rate": 0.00047375626234801325, + "loss": 0.1582, + "step": 106670 + }, + { + "epoch": 4.42, + "grad_norm": 0.7734375, + "learning_rate": 0.00047375142500841416, + "loss": 0.2499, + "step": 106680 + }, + { + "epoch": 4.42, + "grad_norm": 1.0078125, + "learning_rate": 0.000473746587247739, + "loss": 0.1956, + "step": 106690 + }, + { + "epoch": 4.42, + "grad_norm": 1.625, + "learning_rate": 0.00047374174906599696, + "loss": 0.2242, + "step": 106700 + }, + { + "epoch": 4.42, + "grad_norm": 0.6015625, + "learning_rate": 0.0004737369104631972, + "loss": 0.2595, + "step": 106710 + }, + { + "epoch": 4.42, + "grad_norm": 1.1328125, + "learning_rate": 0.0004737320714393486, + "loss": 0.1665, + "step": 106720 + }, + { + "epoch": 4.42, + "grad_norm": 0.51171875, + "learning_rate": 0.00047372723199446044, + "loss": 0.2044, + "step": 106730 + }, + { + "epoch": 4.42, + "grad_norm": 1.0703125, + "learning_rate": 0.00047372239212854187, + "loss": 0.1812, + "step": 106740 + }, + { + "epoch": 4.42, + "grad_norm": 1.359375, + "learning_rate": 0.00047371755184160184, + "loss": 0.2581, + "step": 106750 + }, + { + "epoch": 4.42, + "grad_norm": 0.6796875, + "learning_rate": 0.0004737127111336496, + "loss": 0.2131, + "step": 106760 + }, + { + "epoch": 4.42, + "grad_norm": 0.44140625, + "learning_rate": 0.0004737078700046941, + "loss": 0.2609, + "step": 106770 + }, + { + "epoch": 4.42, + "grad_norm": 1.15625, + "learning_rate": 0.0004737030284547446, + "loss": 0.186, + "step": 106780 + }, + { + "epoch": 4.42, + "grad_norm": 1.6484375, + "learning_rate": 0.00047369818648381015, + "loss": 0.2069, + "step": 106790 + }, + { + "epoch": 4.42, + "grad_norm": 1.671875, + "learning_rate": 0.0004736933440918999, + "loss": 0.1582, + "step": 106800 + }, + { + "epoch": 4.42, + "grad_norm": 0.94140625, + "learning_rate": 0.0004736885012790229, + "loss": 0.1705, + "step": 106810 + }, + { + "epoch": 4.42, + "grad_norm": 0.54296875, + "learning_rate": 0.0004736836580451883, + "loss": 0.2199, + "step": 106820 + }, + { + "epoch": 4.42, + "grad_norm": 0.50390625, + "learning_rate": 0.00047367881439040527, + "loss": 0.2343, + "step": 106830 + }, + { + "epoch": 4.43, + "grad_norm": 0.5625, + "learning_rate": 0.0004736739703146828, + "loss": 0.2109, + "step": 106840 + }, + { + "epoch": 4.43, + "grad_norm": 1.359375, + "learning_rate": 0.00047366912581803006, + "loss": 0.2389, + "step": 106850 + }, + { + "epoch": 4.43, + "grad_norm": 0.890625, + "learning_rate": 0.0004736642809004562, + "loss": 0.2075, + "step": 106860 + }, + { + "epoch": 4.43, + "grad_norm": 1.1171875, + "learning_rate": 0.0004736594355619703, + "loss": 0.2366, + "step": 106870 + }, + { + "epoch": 4.43, + "grad_norm": 0.546875, + "learning_rate": 0.00047365458980258156, + "loss": 0.219, + "step": 106880 + }, + { + "epoch": 4.43, + "grad_norm": 0.369140625, + "learning_rate": 0.000473649743622299, + "loss": 0.2576, + "step": 106890 + }, + { + "epoch": 4.43, + "grad_norm": 0.828125, + "learning_rate": 0.0004736448970211318, + "loss": 0.241, + "step": 106900 + }, + { + "epoch": 4.43, + "grad_norm": 1.0703125, + "learning_rate": 0.000473640049999089, + "loss": 0.2072, + "step": 106910 + }, + { + "epoch": 4.43, + "grad_norm": 0.828125, + "learning_rate": 0.0004736352025561798, + "loss": 0.1734, + "step": 106920 + }, + { + "epoch": 4.43, + "grad_norm": 0.27734375, + "learning_rate": 0.00047363035469241336, + "loss": 0.2159, + "step": 106930 + }, + { + "epoch": 4.43, + "grad_norm": 1.0625, + "learning_rate": 0.0004736255064077986, + "loss": 0.1744, + "step": 106940 + }, + { + "epoch": 4.43, + "grad_norm": 0.70703125, + "learning_rate": 0.0004736206577023449, + "loss": 0.191, + "step": 106950 + }, + { + "epoch": 4.43, + "grad_norm": 1.1484375, + "learning_rate": 0.00047361580857606123, + "loss": 0.1961, + "step": 106960 + }, + { + "epoch": 4.43, + "grad_norm": 0.6015625, + "learning_rate": 0.0004736109590289568, + "loss": 0.2205, + "step": 106970 + }, + { + "epoch": 4.43, + "grad_norm": 0.462890625, + "learning_rate": 0.00047360610906104064, + "loss": 0.1741, + "step": 106980 + }, + { + "epoch": 4.43, + "grad_norm": 0.49609375, + "learning_rate": 0.00047360125867232193, + "loss": 0.2354, + "step": 106990 + }, + { + "epoch": 4.43, + "grad_norm": 0.1728515625, + "learning_rate": 0.00047359640786280983, + "loss": 0.2015, + "step": 107000 + }, + { + "epoch": 4.43, + "grad_norm": 0.70703125, + "learning_rate": 0.0004735915566325134, + "loss": 0.2745, + "step": 107010 + }, + { + "epoch": 4.43, + "grad_norm": 0.5234375, + "learning_rate": 0.00047358670498144184, + "loss": 0.2272, + "step": 107020 + }, + { + "epoch": 4.43, + "grad_norm": 0.57421875, + "learning_rate": 0.00047358185290960426, + "loss": 0.1691, + "step": 107030 + }, + { + "epoch": 4.43, + "grad_norm": 0.39453125, + "learning_rate": 0.0004735770004170098, + "loss": 0.193, + "step": 107040 + }, + { + "epoch": 4.43, + "grad_norm": 0.625, + "learning_rate": 0.0004735721475036675, + "loss": 0.1823, + "step": 107050 + }, + { + "epoch": 4.43, + "grad_norm": 0.4609375, + "learning_rate": 0.00047356729416958657, + "loss": 0.1352, + "step": 107060 + }, + { + "epoch": 4.43, + "grad_norm": 1.1171875, + "learning_rate": 0.0004735624404147761, + "loss": 0.2249, + "step": 107070 + }, + { + "epoch": 4.44, + "grad_norm": 0.5546875, + "learning_rate": 0.0004735575862392454, + "loss": 0.1949, + "step": 107080 + }, + { + "epoch": 4.44, + "grad_norm": 0.87109375, + "learning_rate": 0.00047355273164300337, + "loss": 0.2218, + "step": 107090 + }, + { + "epoch": 4.44, + "grad_norm": 0.1845703125, + "learning_rate": 0.00047354787662605923, + "loss": 0.1845, + "step": 107100 + }, + { + "epoch": 4.44, + "grad_norm": 0.7578125, + "learning_rate": 0.0004735430211884222, + "loss": 0.2001, + "step": 107110 + }, + { + "epoch": 4.44, + "grad_norm": 0.5859375, + "learning_rate": 0.00047353816533010124, + "loss": 0.2347, + "step": 107120 + }, + { + "epoch": 4.44, + "grad_norm": 0.84765625, + "learning_rate": 0.00047353330905110566, + "loss": 0.2034, + "step": 107130 + }, + { + "epoch": 4.44, + "grad_norm": 0.5546875, + "learning_rate": 0.00047352845235144447, + "loss": 0.1758, + "step": 107140 + }, + { + "epoch": 4.44, + "grad_norm": 0.9453125, + "learning_rate": 0.000473523595231127, + "loss": 0.2183, + "step": 107150 + }, + { + "epoch": 4.44, + "grad_norm": 0.6796875, + "learning_rate": 0.00047351873769016213, + "loss": 0.2315, + "step": 107160 + }, + { + "epoch": 4.44, + "grad_norm": 0.52734375, + "learning_rate": 0.0004735138797285592, + "loss": 0.189, + "step": 107170 + }, + { + "epoch": 4.44, + "grad_norm": 1.0234375, + "learning_rate": 0.0004735090213463272, + "loss": 0.2183, + "step": 107180 + }, + { + "epoch": 4.44, + "grad_norm": 1.484375, + "learning_rate": 0.0004735041625434755, + "loss": 0.2688, + "step": 107190 + }, + { + "epoch": 4.44, + "grad_norm": 0.62109375, + "learning_rate": 0.000473499303320013, + "loss": 0.2353, + "step": 107200 + }, + { + "epoch": 4.44, + "grad_norm": 0.57421875, + "learning_rate": 0.000473494443675949, + "loss": 0.1869, + "step": 107210 + }, + { + "epoch": 4.44, + "grad_norm": 0.58984375, + "learning_rate": 0.0004734895836112926, + "loss": 0.2329, + "step": 107220 + }, + { + "epoch": 4.44, + "grad_norm": 0.734375, + "learning_rate": 0.00047348472312605283, + "loss": 0.1824, + "step": 107230 + }, + { + "epoch": 4.44, + "grad_norm": 0.6328125, + "learning_rate": 0.00047347986222023907, + "loss": 0.1927, + "step": 107240 + }, + { + "epoch": 4.44, + "grad_norm": 0.97265625, + "learning_rate": 0.0004734750008938603, + "loss": 0.2258, + "step": 107250 + }, + { + "epoch": 4.44, + "grad_norm": 0.71484375, + "learning_rate": 0.00047347013914692573, + "loss": 0.2204, + "step": 107260 + }, + { + "epoch": 4.44, + "grad_norm": 0.77734375, + "learning_rate": 0.00047346527697944443, + "loss": 0.2125, + "step": 107270 + }, + { + "epoch": 4.44, + "grad_norm": 0.373046875, + "learning_rate": 0.0004734604143914256, + "loss": 0.1841, + "step": 107280 + }, + { + "epoch": 4.44, + "grad_norm": 0.71875, + "learning_rate": 0.00047345555138287855, + "loss": 0.2349, + "step": 107290 + }, + { + "epoch": 4.44, + "grad_norm": 0.51171875, + "learning_rate": 0.0004734506879538121, + "loss": 0.2136, + "step": 107300 + }, + { + "epoch": 4.44, + "grad_norm": 0.419921875, + "learning_rate": 0.0004734458241042357, + "loss": 0.1802, + "step": 107310 + }, + { + "epoch": 4.45, + "grad_norm": 0.75, + "learning_rate": 0.0004734409598341584, + "loss": 0.1747, + "step": 107320 + }, + { + "epoch": 4.45, + "grad_norm": 1.0078125, + "learning_rate": 0.0004734360951435893, + "loss": 0.2047, + "step": 107330 + }, + { + "epoch": 4.45, + "grad_norm": 0.6953125, + "learning_rate": 0.0004734312300325376, + "loss": 0.1771, + "step": 107340 + }, + { + "epoch": 4.45, + "grad_norm": 1.21875, + "learning_rate": 0.00047342636450101237, + "loss": 0.2011, + "step": 107350 + }, + { + "epoch": 4.45, + "grad_norm": 0.57421875, + "learning_rate": 0.00047342149854902294, + "loss": 0.1947, + "step": 107360 + }, + { + "epoch": 4.45, + "grad_norm": 0.73828125, + "learning_rate": 0.0004734166321765784, + "loss": 0.2321, + "step": 107370 + }, + { + "epoch": 4.45, + "grad_norm": 3.078125, + "learning_rate": 0.00047341176538368793, + "loss": 0.2156, + "step": 107380 + }, + { + "epoch": 4.45, + "grad_norm": 0.6015625, + "learning_rate": 0.0004734068981703605, + "loss": 0.1932, + "step": 107390 + }, + { + "epoch": 4.45, + "grad_norm": 0.79296875, + "learning_rate": 0.00047340203053660546, + "loss": 0.196, + "step": 107400 + }, + { + "epoch": 4.45, + "grad_norm": 0.875, + "learning_rate": 0.000473397162482432, + "loss": 0.2012, + "step": 107410 + }, + { + "epoch": 4.45, + "grad_norm": 0.7265625, + "learning_rate": 0.0004733922940078491, + "loss": 0.2013, + "step": 107420 + }, + { + "epoch": 4.45, + "grad_norm": 0.41796875, + "learning_rate": 0.00047338742511286616, + "loss": 0.1913, + "step": 107430 + }, + { + "epoch": 4.45, + "grad_norm": 0.71875, + "learning_rate": 0.0004733825557974921, + "loss": 0.2169, + "step": 107440 + }, + { + "epoch": 4.45, + "grad_norm": 1.5625, + "learning_rate": 0.0004733776860617362, + "loss": 0.2045, + "step": 107450 + }, + { + "epoch": 4.45, + "grad_norm": 0.38671875, + "learning_rate": 0.0004733728159056077, + "loss": 0.1779, + "step": 107460 + }, + { + "epoch": 4.45, + "grad_norm": 0.46484375, + "learning_rate": 0.0004733679453291156, + "loss": 0.1712, + "step": 107470 + }, + { + "epoch": 4.45, + "grad_norm": 0.92578125, + "learning_rate": 0.0004733630743322691, + "loss": 0.1719, + "step": 107480 + }, + { + "epoch": 4.45, + "grad_norm": 0.69140625, + "learning_rate": 0.00047335820291507754, + "loss": 0.1588, + "step": 107490 + }, + { + "epoch": 4.45, + "grad_norm": 0.7109375, + "learning_rate": 0.0004733533310775499, + "loss": 0.2357, + "step": 107500 + }, + { + "epoch": 4.45, + "grad_norm": 0.7265625, + "learning_rate": 0.00047334845881969546, + "loss": 0.2472, + "step": 107510 + }, + { + "epoch": 4.45, + "grad_norm": 0.5703125, + "learning_rate": 0.00047334358614152327, + "loss": 0.1836, + "step": 107520 + }, + { + "epoch": 4.45, + "grad_norm": 0.201171875, + "learning_rate": 0.0004733387130430426, + "loss": 0.1837, + "step": 107530 + }, + { + "epoch": 4.45, + "grad_norm": 0.515625, + "learning_rate": 0.00047333383952426254, + "loss": 0.2272, + "step": 107540 + }, + { + "epoch": 4.45, + "grad_norm": 0.8828125, + "learning_rate": 0.00047332896558519244, + "loss": 0.2055, + "step": 107550 + }, + { + "epoch": 4.46, + "grad_norm": 0.6484375, + "learning_rate": 0.0004733240912258412, + "loss": 0.1774, + "step": 107560 + }, + { + "epoch": 4.46, + "grad_norm": 1.140625, + "learning_rate": 0.00047331921644621825, + "loss": 0.2429, + "step": 107570 + }, + { + "epoch": 4.46, + "grad_norm": 0.6875, + "learning_rate": 0.0004733143412463326, + "loss": 0.194, + "step": 107580 + }, + { + "epoch": 4.46, + "grad_norm": 0.42578125, + "learning_rate": 0.00047330946562619346, + "loss": 0.1905, + "step": 107590 + }, + { + "epoch": 4.46, + "grad_norm": 0.54296875, + "learning_rate": 0.00047330458958581004, + "loss": 0.1302, + "step": 107600 + }, + { + "epoch": 4.46, + "grad_norm": 1.0859375, + "learning_rate": 0.0004732997131251915, + "loss": 0.2132, + "step": 107610 + }, + { + "epoch": 4.46, + "grad_norm": 0.63671875, + "learning_rate": 0.000473294836244347, + "loss": 0.2605, + "step": 107620 + }, + { + "epoch": 4.46, + "grad_norm": 0.3359375, + "learning_rate": 0.00047328995894328573, + "loss": 0.1868, + "step": 107630 + }, + { + "epoch": 4.46, + "grad_norm": 0.68359375, + "learning_rate": 0.00047328508122201695, + "loss": 0.2395, + "step": 107640 + }, + { + "epoch": 4.46, + "grad_norm": 0.60546875, + "learning_rate": 0.00047328020308054963, + "loss": 0.24, + "step": 107650 + }, + { + "epoch": 4.46, + "grad_norm": 0.474609375, + "learning_rate": 0.0004732753245188932, + "loss": 0.2209, + "step": 107660 + }, + { + "epoch": 4.46, + "grad_norm": 0.671875, + "learning_rate": 0.00047327044553705666, + "loss": 0.2133, + "step": 107670 + }, + { + "epoch": 4.46, + "grad_norm": 1.0078125, + "learning_rate": 0.0004732655661350492, + "loss": 0.1885, + "step": 107680 + }, + { + "epoch": 4.46, + "grad_norm": 1.34375, + "learning_rate": 0.00047326068631288015, + "loss": 0.1837, + "step": 107690 + }, + { + "epoch": 4.46, + "grad_norm": 0.546875, + "learning_rate": 0.00047325580607055856, + "loss": 0.2316, + "step": 107700 + }, + { + "epoch": 4.46, + "grad_norm": 0.640625, + "learning_rate": 0.00047325092540809367, + "loss": 0.2419, + "step": 107710 + }, + { + "epoch": 4.46, + "grad_norm": 0.435546875, + "learning_rate": 0.0004732460443254947, + "loss": 0.2152, + "step": 107720 + }, + { + "epoch": 4.46, + "grad_norm": 0.78125, + "learning_rate": 0.0004732411628227706, + "loss": 0.2156, + "step": 107730 + }, + { + "epoch": 4.46, + "grad_norm": 1.046875, + "learning_rate": 0.00047323628089993085, + "loss": 0.1927, + "step": 107740 + }, + { + "epoch": 4.46, + "grad_norm": 0.7578125, + "learning_rate": 0.0004732313985569846, + "loss": 0.2099, + "step": 107750 + }, + { + "epoch": 4.46, + "grad_norm": 0.5859375, + "learning_rate": 0.00047322651579394086, + "loss": 0.2506, + "step": 107760 + }, + { + "epoch": 4.46, + "grad_norm": 1.484375, + "learning_rate": 0.00047322163261080897, + "loss": 0.2162, + "step": 107770 + }, + { + "epoch": 4.46, + "grad_norm": 0.640625, + "learning_rate": 0.00047321674900759807, + "loss": 0.2356, + "step": 107780 + }, + { + "epoch": 4.46, + "grad_norm": 0.703125, + "learning_rate": 0.0004732118649843173, + "loss": 0.2291, + "step": 107790 + }, + { + "epoch": 4.47, + "grad_norm": 0.416015625, + "learning_rate": 0.000473206980540976, + "loss": 0.2057, + "step": 107800 + }, + { + "epoch": 4.47, + "grad_norm": 0.49609375, + "learning_rate": 0.0004732020956775832, + "loss": 0.198, + "step": 107810 + }, + { + "epoch": 4.47, + "grad_norm": 1.1640625, + "learning_rate": 0.0004731972103941482, + "loss": 0.2067, + "step": 107820 + }, + { + "epoch": 4.47, + "grad_norm": 1.5625, + "learning_rate": 0.00047319232469068015, + "loss": 0.2017, + "step": 107830 + }, + { + "epoch": 4.47, + "grad_norm": 0.56640625, + "learning_rate": 0.0004731874385671883, + "loss": 0.1574, + "step": 107840 + }, + { + "epoch": 4.47, + "grad_norm": 0.8984375, + "learning_rate": 0.00047318255202368165, + "loss": 0.2279, + "step": 107850 + }, + { + "epoch": 4.47, + "grad_norm": 1.0546875, + "learning_rate": 0.00047317766506016956, + "loss": 0.1787, + "step": 107860 + }, + { + "epoch": 4.47, + "grad_norm": 0.81640625, + "learning_rate": 0.00047317277767666135, + "loss": 0.2409, + "step": 107870 + }, + { + "epoch": 4.47, + "grad_norm": 1.2890625, + "learning_rate": 0.00047316788987316596, + "loss": 0.2276, + "step": 107880 + }, + { + "epoch": 4.47, + "grad_norm": 1.09375, + "learning_rate": 0.0004731630016496927, + "loss": 0.1549, + "step": 107890 + }, + { + "epoch": 4.47, + "grad_norm": 0.83984375, + "learning_rate": 0.0004731581130062508, + "loss": 0.2165, + "step": 107900 + }, + { + "epoch": 4.47, + "grad_norm": 0.40625, + "learning_rate": 0.00047315322394284943, + "loss": 0.1882, + "step": 107910 + }, + { + "epoch": 4.47, + "grad_norm": 0.625, + "learning_rate": 0.0004731483344594978, + "loss": 0.2847, + "step": 107920 + }, + { + "epoch": 4.47, + "grad_norm": 0.478515625, + "learning_rate": 0.0004731434445562051, + "loss": 0.2304, + "step": 107930 + }, + { + "epoch": 4.47, + "grad_norm": 1.734375, + "learning_rate": 0.0004731385542329805, + "loss": 0.2184, + "step": 107940 + }, + { + "epoch": 4.47, + "grad_norm": 0.78125, + "learning_rate": 0.0004731336634898332, + "loss": 0.2016, + "step": 107950 + }, + { + "epoch": 4.47, + "grad_norm": 1.3515625, + "learning_rate": 0.00047312877232677254, + "loss": 0.2405, + "step": 107960 + }, + { + "epoch": 4.47, + "grad_norm": 1.421875, + "learning_rate": 0.0004731238807438076, + "loss": 0.2222, + "step": 107970 + }, + { + "epoch": 4.47, + "grad_norm": 0.5703125, + "learning_rate": 0.00047311898874094753, + "loss": 0.2079, + "step": 107980 + }, + { + "epoch": 4.47, + "grad_norm": 1.125, + "learning_rate": 0.0004731140963182017, + "loss": 0.2114, + "step": 107990 + }, + { + "epoch": 4.47, + "grad_norm": 1.1328125, + "learning_rate": 0.00047310920347557925, + "loss": 0.2106, + "step": 108000 + }, + { + "epoch": 4.47, + "grad_norm": 0.61328125, + "learning_rate": 0.0004731043102130893, + "loss": 0.2293, + "step": 108010 + }, + { + "epoch": 4.47, + "grad_norm": 0.51171875, + "learning_rate": 0.0004730994165307412, + "loss": 0.1822, + "step": 108020 + }, + { + "epoch": 4.47, + "grad_norm": 0.59375, + "learning_rate": 0.00047309452242854413, + "loss": 0.2317, + "step": 108030 + }, + { + "epoch": 4.48, + "grad_norm": 1.1015625, + "learning_rate": 0.0004730896279065071, + "loss": 0.2541, + "step": 108040 + }, + { + "epoch": 4.48, + "grad_norm": 0.58203125, + "learning_rate": 0.0004730847329646396, + "loss": 0.1732, + "step": 108050 + }, + { + "epoch": 4.48, + "grad_norm": 0.61328125, + "learning_rate": 0.0004730798376029507, + "loss": 0.2166, + "step": 108060 + }, + { + "epoch": 4.48, + "grad_norm": 0.8046875, + "learning_rate": 0.0004730749418214497, + "loss": 0.2283, + "step": 108070 + }, + { + "epoch": 4.48, + "grad_norm": 0.8671875, + "learning_rate": 0.00047307004562014565, + "loss": 0.2492, + "step": 108080 + }, + { + "epoch": 4.48, + "grad_norm": 0.40234375, + "learning_rate": 0.0004730651489990479, + "loss": 0.2405, + "step": 108090 + }, + { + "epoch": 4.48, + "grad_norm": 0.291015625, + "learning_rate": 0.0004730602519581656, + "loss": 0.2101, + "step": 108100 + }, + { + "epoch": 4.48, + "grad_norm": 0.74609375, + "learning_rate": 0.000473055354497508, + "loss": 0.2314, + "step": 108110 + }, + { + "epoch": 4.48, + "grad_norm": 1.015625, + "learning_rate": 0.00047305045661708435, + "loss": 0.218, + "step": 108120 + }, + { + "epoch": 4.48, + "grad_norm": 0.58984375, + "learning_rate": 0.0004730455583169039, + "loss": 0.1981, + "step": 108130 + }, + { + "epoch": 4.48, + "grad_norm": 0.279296875, + "learning_rate": 0.0004730406595969756, + "loss": 0.2003, + "step": 108140 + }, + { + "epoch": 4.48, + "grad_norm": 0.71875, + "learning_rate": 0.000473035760457309, + "loss": 0.1793, + "step": 108150 + }, + { + "epoch": 4.48, + "grad_norm": 1.015625, + "learning_rate": 0.0004730308608979132, + "loss": 0.1818, + "step": 108160 + }, + { + "epoch": 4.48, + "grad_norm": 0.82421875, + "learning_rate": 0.00047302596091879735, + "loss": 0.1967, + "step": 108170 + }, + { + "epoch": 4.48, + "grad_norm": 0.64453125, + "learning_rate": 0.0004730210605199707, + "loss": 0.1821, + "step": 108180 + }, + { + "epoch": 4.48, + "grad_norm": 0.84765625, + "learning_rate": 0.0004730161597014426, + "loss": 0.1469, + "step": 108190 + }, + { + "epoch": 4.48, + "grad_norm": 0.88671875, + "learning_rate": 0.0004730112584632221, + "loss": 0.2338, + "step": 108200 + }, + { + "epoch": 4.48, + "grad_norm": 0.640625, + "learning_rate": 0.00047300635680531846, + "loss": 0.2188, + "step": 108210 + }, + { + "epoch": 4.48, + "grad_norm": 1.0234375, + "learning_rate": 0.00047300145472774105, + "loss": 0.2224, + "step": 108220 + }, + { + "epoch": 4.48, + "grad_norm": 0.82421875, + "learning_rate": 0.00047299655223049885, + "loss": 0.2241, + "step": 108230 + }, + { + "epoch": 4.48, + "grad_norm": 0.59375, + "learning_rate": 0.0004729916493136013, + "loss": 0.1873, + "step": 108240 + }, + { + "epoch": 4.48, + "grad_norm": 0.59765625, + "learning_rate": 0.0004729867459770575, + "loss": 0.1908, + "step": 108250 + }, + { + "epoch": 4.48, + "grad_norm": 1.5625, + "learning_rate": 0.00047298184222087684, + "loss": 0.2052, + "step": 108260 + }, + { + "epoch": 4.48, + "grad_norm": 0.8984375, + "learning_rate": 0.0004729769380450684, + "loss": 0.2095, + "step": 108270 + }, + { + "epoch": 4.48, + "grad_norm": 0.60546875, + "learning_rate": 0.00047297203344964133, + "loss": 0.2114, + "step": 108280 + }, + { + "epoch": 4.49, + "grad_norm": 2.09375, + "learning_rate": 0.000472967128434605, + "loss": 0.2648, + "step": 108290 + }, + { + "epoch": 4.49, + "grad_norm": 0.390625, + "learning_rate": 0.00047296222299996865, + "loss": 0.2266, + "step": 108300 + }, + { + "epoch": 4.49, + "grad_norm": 0.1962890625, + "learning_rate": 0.0004729573171457415, + "loss": 0.2331, + "step": 108310 + }, + { + "epoch": 4.49, + "grad_norm": 0.26171875, + "learning_rate": 0.00047295241087193273, + "loss": 0.2133, + "step": 108320 + }, + { + "epoch": 4.49, + "grad_norm": 0.609375, + "learning_rate": 0.0004729475041785516, + "loss": 0.2877, + "step": 108330 + }, + { + "epoch": 4.49, + "grad_norm": 0.57421875, + "learning_rate": 0.0004729425970656073, + "loss": 0.1948, + "step": 108340 + }, + { + "epoch": 4.49, + "grad_norm": 0.392578125, + "learning_rate": 0.00047293768953310915, + "loss": 0.1632, + "step": 108350 + }, + { + "epoch": 4.49, + "grad_norm": 0.55078125, + "learning_rate": 0.0004729327815810663, + "loss": 0.1796, + "step": 108360 + }, + { + "epoch": 4.49, + "grad_norm": 0.8671875, + "learning_rate": 0.00047292787320948816, + "loss": 0.1858, + "step": 108370 + }, + { + "epoch": 4.49, + "grad_norm": 0.53515625, + "learning_rate": 0.0004729229644183837, + "loss": 0.2235, + "step": 108380 + }, + { + "epoch": 4.49, + "grad_norm": 1.4375, + "learning_rate": 0.00047291805520776237, + "loss": 0.1861, + "step": 108390 + }, + { + "epoch": 4.49, + "grad_norm": 0.64453125, + "learning_rate": 0.0004729131455776333, + "loss": 0.2295, + "step": 108400 + }, + { + "epoch": 4.49, + "grad_norm": 1.078125, + "learning_rate": 0.00047290823552800576, + "loss": 0.1975, + "step": 108410 + }, + { + "epoch": 4.49, + "grad_norm": 0.71484375, + "learning_rate": 0.000472903325058889, + "loss": 0.224, + "step": 108420 + }, + { + "epoch": 4.49, + "grad_norm": 0.65625, + "learning_rate": 0.0004728984141702923, + "loss": 0.2104, + "step": 108430 + }, + { + "epoch": 4.49, + "grad_norm": 0.8359375, + "learning_rate": 0.0004728935028622248, + "loss": 0.2152, + "step": 108440 + }, + { + "epoch": 4.49, + "grad_norm": 0.9453125, + "learning_rate": 0.00047288859113469586, + "loss": 0.2334, + "step": 108450 + }, + { + "epoch": 4.49, + "grad_norm": 1.5390625, + "learning_rate": 0.0004728836789877146, + "loss": 0.2335, + "step": 108460 + }, + { + "epoch": 4.49, + "grad_norm": 0.431640625, + "learning_rate": 0.0004728787664212903, + "loss": 0.1998, + "step": 108470 + }, + { + "epoch": 4.49, + "grad_norm": 0.75390625, + "learning_rate": 0.00047287385343543225, + "loss": 0.1759, + "step": 108480 + }, + { + "epoch": 4.49, + "grad_norm": 0.60546875, + "learning_rate": 0.0004728689400301497, + "loss": 0.2061, + "step": 108490 + }, + { + "epoch": 4.49, + "grad_norm": 0.65234375, + "learning_rate": 0.0004728640262054519, + "loss": 0.2269, + "step": 108500 + }, + { + "epoch": 4.49, + "grad_norm": 1.0859375, + "learning_rate": 0.000472859111961348, + "loss": 0.2444, + "step": 108510 + }, + { + "epoch": 4.49, + "grad_norm": 1.2265625, + "learning_rate": 0.0004728541972978474, + "loss": 0.214, + "step": 108520 + }, + { + "epoch": 4.5, + "grad_norm": 0.99609375, + "learning_rate": 0.00047284928221495915, + "loss": 0.1958, + "step": 108530 + }, + { + "epoch": 4.5, + "grad_norm": 0.5703125, + "learning_rate": 0.0004728443667126927, + "loss": 0.2258, + "step": 108540 + }, + { + "epoch": 4.5, + "grad_norm": 0.765625, + "learning_rate": 0.0004728394507910572, + "loss": 0.1945, + "step": 108550 + }, + { + "epoch": 4.5, + "grad_norm": 0.87109375, + "learning_rate": 0.00047283453445006194, + "loss": 0.1994, + "step": 108560 + }, + { + "epoch": 4.5, + "grad_norm": 0.90234375, + "learning_rate": 0.00047282961768971613, + "loss": 0.2127, + "step": 108570 + }, + { + "epoch": 4.5, + "grad_norm": 0.5078125, + "learning_rate": 0.00047282470051002905, + "loss": 0.2602, + "step": 108580 + }, + { + "epoch": 4.5, + "grad_norm": 0.6875, + "learning_rate": 0.0004728197829110099, + "loss": 0.1977, + "step": 108590 + }, + { + "epoch": 4.5, + "grad_norm": 1.0546875, + "learning_rate": 0.0004728148648926681, + "loss": 0.2521, + "step": 108600 + }, + { + "epoch": 4.5, + "grad_norm": 0.5859375, + "learning_rate": 0.0004728099464550127, + "loss": 0.225, + "step": 108610 + }, + { + "epoch": 4.5, + "grad_norm": 1.0078125, + "learning_rate": 0.000472805027598053, + "loss": 0.1738, + "step": 108620 + }, + { + "epoch": 4.5, + "grad_norm": 0.24609375, + "learning_rate": 0.00047280010832179836, + "loss": 0.2801, + "step": 108630 + }, + { + "epoch": 4.5, + "grad_norm": 0.224609375, + "learning_rate": 0.00047279518862625803, + "loss": 0.252, + "step": 108640 + }, + { + "epoch": 4.5, + "grad_norm": 0.4765625, + "learning_rate": 0.0004727902685114411, + "loss": 0.2397, + "step": 108650 + }, + { + "epoch": 4.5, + "grad_norm": 0.439453125, + "learning_rate": 0.000472785347977357, + "loss": 0.1686, + "step": 108660 + }, + { + "epoch": 4.5, + "grad_norm": 0.78125, + "learning_rate": 0.0004727804270240149, + "loss": 0.2474, + "step": 108670 + }, + { + "epoch": 4.5, + "grad_norm": 0.4765625, + "learning_rate": 0.00047277550565142415, + "loss": 0.1862, + "step": 108680 + }, + { + "epoch": 4.5, + "grad_norm": 0.0, + "learning_rate": 0.00047277058385959393, + "loss": 0.229, + "step": 108690 + }, + { + "epoch": 4.5, + "grad_norm": 0.38671875, + "learning_rate": 0.00047276566164853353, + "loss": 0.1664, + "step": 108700 + }, + { + "epoch": 4.5, + "grad_norm": 0.3671875, + "learning_rate": 0.0004727607390182522, + "loss": 0.1752, + "step": 108710 + }, + { + "epoch": 4.5, + "grad_norm": 0.5, + "learning_rate": 0.0004727558159687593, + "loss": 0.2063, + "step": 108720 + }, + { + "epoch": 4.5, + "grad_norm": 0.455078125, + "learning_rate": 0.00047275089250006384, + "loss": 0.2783, + "step": 108730 + }, + { + "epoch": 4.5, + "grad_norm": 0.48046875, + "learning_rate": 0.0004727459686121754, + "loss": 0.2614, + "step": 108740 + }, + { + "epoch": 4.5, + "grad_norm": 0.72265625, + "learning_rate": 0.000472741044305103, + "loss": 0.2427, + "step": 108750 + }, + { + "epoch": 4.5, + "grad_norm": 0.76171875, + "learning_rate": 0.0004727361195788561, + "loss": 0.1623, + "step": 108760 + }, + { + "epoch": 4.51, + "grad_norm": 0.53125, + "learning_rate": 0.0004727311944334438, + "loss": 0.1889, + "step": 108770 + }, + { + "epoch": 4.51, + "grad_norm": 1.109375, + "learning_rate": 0.0004727262688688755, + "loss": 0.206, + "step": 108780 + }, + { + "epoch": 4.51, + "grad_norm": 0.6015625, + "learning_rate": 0.0004727213428851604, + "loss": 0.1447, + "step": 108790 + }, + { + "epoch": 4.51, + "grad_norm": 0.68359375, + "learning_rate": 0.00047271641648230777, + "loss": 0.2053, + "step": 108800 + }, + { + "epoch": 4.51, + "grad_norm": 0.44921875, + "learning_rate": 0.00047271148966032694, + "loss": 0.2264, + "step": 108810 + }, + { + "epoch": 4.51, + "grad_norm": 1.703125, + "learning_rate": 0.0004727065624192271, + "loss": 0.2033, + "step": 108820 + }, + { + "epoch": 4.51, + "grad_norm": 0.62109375, + "learning_rate": 0.00047270163475901755, + "loss": 0.2053, + "step": 108830 + }, + { + "epoch": 4.51, + "grad_norm": 0.6640625, + "learning_rate": 0.0004726967066797076, + "loss": 0.2523, + "step": 108840 + }, + { + "epoch": 4.51, + "grad_norm": 0.54296875, + "learning_rate": 0.00047269177818130647, + "loss": 0.256, + "step": 108850 + }, + { + "epoch": 4.51, + "grad_norm": 0.68359375, + "learning_rate": 0.0004726868492638235, + "loss": 0.227, + "step": 108860 + }, + { + "epoch": 4.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0004726819199272679, + "loss": 0.1991, + "step": 108870 + }, + { + "epoch": 4.51, + "grad_norm": 0.44140625, + "learning_rate": 0.000472676990171649, + "loss": 0.1966, + "step": 108880 + }, + { + "epoch": 4.51, + "grad_norm": 0.84375, + "learning_rate": 0.00047267205999697596, + "loss": 0.2116, + "step": 108890 + }, + { + "epoch": 4.51, + "grad_norm": 0.0, + "learning_rate": 0.0004726671294032583, + "loss": 0.2464, + "step": 108900 + }, + { + "epoch": 4.51, + "grad_norm": 0.390625, + "learning_rate": 0.0004726621983905051, + "loss": 0.1914, + "step": 108910 + }, + { + "epoch": 4.51, + "grad_norm": 1.0078125, + "learning_rate": 0.00047265726695872555, + "loss": 0.2211, + "step": 108920 + }, + { + "epoch": 4.51, + "grad_norm": 0.6640625, + "learning_rate": 0.0004726523351079292, + "loss": 0.2108, + "step": 108930 + }, + { + "epoch": 4.51, + "grad_norm": 0.5546875, + "learning_rate": 0.0004726474028381252, + "loss": 0.2028, + "step": 108940 + }, + { + "epoch": 4.51, + "grad_norm": 2.6875, + "learning_rate": 0.0004726424701493228, + "loss": 0.1716, + "step": 108950 + }, + { + "epoch": 4.51, + "grad_norm": 0.5625, + "learning_rate": 0.00047263753704153134, + "loss": 0.17, + "step": 108960 + }, + { + "epoch": 4.51, + "grad_norm": 1.0078125, + "learning_rate": 0.00047263260351476, + "loss": 0.1926, + "step": 108970 + }, + { + "epoch": 4.51, + "grad_norm": 1.0703125, + "learning_rate": 0.00047262766956901827, + "loss": 0.1914, + "step": 108980 + }, + { + "epoch": 4.51, + "grad_norm": 1.109375, + "learning_rate": 0.0004726227352043152, + "loss": 0.2159, + "step": 108990 + }, + { + "epoch": 4.51, + "grad_norm": 0.5625, + "learning_rate": 0.0004726178004206602, + "loss": 0.2369, + "step": 109000 + }, + { + "epoch": 4.52, + "grad_norm": 0.4921875, + "learning_rate": 0.0004726128652180626, + "loss": 0.1798, + "step": 109010 + }, + { + "epoch": 4.52, + "grad_norm": 1.1171875, + "learning_rate": 0.00047260792959653153, + "loss": 0.2284, + "step": 109020 + }, + { + "epoch": 4.52, + "grad_norm": 0.46484375, + "learning_rate": 0.0004726029935560765, + "loss": 0.2432, + "step": 109030 + }, + { + "epoch": 4.52, + "grad_norm": 0.6484375, + "learning_rate": 0.00047259805709670656, + "loss": 0.2294, + "step": 109040 + }, + { + "epoch": 4.52, + "grad_norm": 0.70703125, + "learning_rate": 0.0004725931202184311, + "loss": 0.2023, + "step": 109050 + }, + { + "epoch": 4.52, + "grad_norm": 0.609375, + "learning_rate": 0.0004725881829212596, + "loss": 0.1994, + "step": 109060 + }, + { + "epoch": 4.52, + "grad_norm": 0.59765625, + "learning_rate": 0.00047258324520520103, + "loss": 0.2379, + "step": 109070 + }, + { + "epoch": 4.52, + "grad_norm": 1.109375, + "learning_rate": 0.0004725783070702649, + "loss": 0.2197, + "step": 109080 + }, + { + "epoch": 4.52, + "grad_norm": 1.71875, + "learning_rate": 0.00047257336851646035, + "loss": 0.2809, + "step": 109090 + }, + { + "epoch": 4.52, + "grad_norm": 0.8125, + "learning_rate": 0.0004725684295437969, + "loss": 0.1942, + "step": 109100 + }, + { + "epoch": 4.52, + "grad_norm": 0.734375, + "learning_rate": 0.00047256349015228357, + "loss": 0.2468, + "step": 109110 + }, + { + "epoch": 4.52, + "grad_norm": 0.439453125, + "learning_rate": 0.0004725585503419298, + "loss": 0.2409, + "step": 109120 + }, + { + "epoch": 4.52, + "grad_norm": 0.60546875, + "learning_rate": 0.0004725536101127449, + "loss": 0.209, + "step": 109130 + }, + { + "epoch": 4.52, + "grad_norm": 1.0703125, + "learning_rate": 0.00047254866946473814, + "loss": 0.2001, + "step": 109140 + }, + { + "epoch": 4.52, + "grad_norm": 0.62109375, + "learning_rate": 0.0004725437283979188, + "loss": 0.2227, + "step": 109150 + }, + { + "epoch": 4.52, + "grad_norm": 1.0078125, + "learning_rate": 0.0004725387869122962, + "loss": 0.2335, + "step": 109160 + }, + { + "epoch": 4.52, + "grad_norm": 0.91015625, + "learning_rate": 0.00047253384500787966, + "loss": 0.1793, + "step": 109170 + }, + { + "epoch": 4.52, + "grad_norm": 1.2265625, + "learning_rate": 0.0004725289026846784, + "loss": 0.242, + "step": 109180 + }, + { + "epoch": 4.52, + "grad_norm": 0.53125, + "learning_rate": 0.00047252395994270185, + "loss": 0.2468, + "step": 109190 + }, + { + "epoch": 4.52, + "grad_norm": 0.99609375, + "learning_rate": 0.0004725190167819592, + "loss": 0.1848, + "step": 109200 + }, + { + "epoch": 4.52, + "grad_norm": 0.703125, + "learning_rate": 0.00047251407320245977, + "loss": 0.1741, + "step": 109210 + }, + { + "epoch": 4.52, + "grad_norm": 0.54296875, + "learning_rate": 0.00047250912920421295, + "loss": 0.2635, + "step": 109220 + }, + { + "epoch": 4.52, + "grad_norm": 0.388671875, + "learning_rate": 0.0004725041847872279, + "loss": 0.2147, + "step": 109230 + }, + { + "epoch": 4.52, + "grad_norm": 0.6484375, + "learning_rate": 0.00047249923995151404, + "loss": 0.2054, + "step": 109240 + }, + { + "epoch": 4.53, + "grad_norm": 1.2734375, + "learning_rate": 0.00047249429469708066, + "loss": 0.1754, + "step": 109250 + }, + { + "epoch": 4.53, + "grad_norm": 0.326171875, + "learning_rate": 0.000472489349023937, + "loss": 0.2136, + "step": 109260 + }, + { + "epoch": 4.53, + "grad_norm": 0.578125, + "learning_rate": 0.00047248440293209243, + "loss": 0.1829, + "step": 109270 + }, + { + "epoch": 4.53, + "grad_norm": 1.09375, + "learning_rate": 0.00047247945642155624, + "loss": 0.2104, + "step": 109280 + }, + { + "epoch": 4.53, + "grad_norm": 0.62109375, + "learning_rate": 0.00047247450949233777, + "loss": 0.2025, + "step": 109290 + }, + { + "epoch": 4.53, + "grad_norm": 0.26171875, + "learning_rate": 0.0004724695621444463, + "loss": 0.2244, + "step": 109300 + }, + { + "epoch": 4.53, + "grad_norm": 0.17578125, + "learning_rate": 0.0004724646143778911, + "loss": 0.2319, + "step": 109310 + }, + { + "epoch": 4.53, + "grad_norm": 1.734375, + "learning_rate": 0.0004724596661926815, + "loss": 0.2609, + "step": 109320 + }, + { + "epoch": 4.53, + "grad_norm": 0.66015625, + "learning_rate": 0.0004724547175888269, + "loss": 0.2106, + "step": 109330 + }, + { + "epoch": 4.53, + "grad_norm": 0.376953125, + "learning_rate": 0.0004724497685663365, + "loss": 0.2535, + "step": 109340 + }, + { + "epoch": 4.53, + "grad_norm": 0.984375, + "learning_rate": 0.0004724448191252196, + "loss": 0.2313, + "step": 109350 + }, + { + "epoch": 4.53, + "grad_norm": 1.3125, + "learning_rate": 0.00047243986926548566, + "loss": 0.1889, + "step": 109360 + }, + { + "epoch": 4.53, + "grad_norm": 2.5, + "learning_rate": 0.00047243491898714385, + "loss": 0.1809, + "step": 109370 + }, + { + "epoch": 4.53, + "grad_norm": 0.84765625, + "learning_rate": 0.00047242996829020356, + "loss": 0.2009, + "step": 109380 + }, + { + "epoch": 4.53, + "grad_norm": 0.8046875, + "learning_rate": 0.0004724250171746741, + "loss": 0.2065, + "step": 109390 + }, + { + "epoch": 4.53, + "grad_norm": 0.11572265625, + "learning_rate": 0.00047242006564056485, + "loss": 0.135, + "step": 109400 + }, + { + "epoch": 4.53, + "grad_norm": 0.671875, + "learning_rate": 0.00047241511368788496, + "loss": 0.2257, + "step": 109410 + }, + { + "epoch": 4.53, + "grad_norm": 1.078125, + "learning_rate": 0.0004724101613166439, + "loss": 0.2391, + "step": 109420 + }, + { + "epoch": 4.53, + "grad_norm": 0.54296875, + "learning_rate": 0.00047240520852685086, + "loss": 0.2204, + "step": 109430 + }, + { + "epoch": 4.53, + "grad_norm": 0.35546875, + "learning_rate": 0.00047240025531851534, + "loss": 0.1876, + "step": 109440 + }, + { + "epoch": 4.53, + "grad_norm": 0.70703125, + "learning_rate": 0.00047239530169164646, + "loss": 0.1782, + "step": 109450 + }, + { + "epoch": 4.53, + "grad_norm": 0.9453125, + "learning_rate": 0.00047239034764625374, + "loss": 0.2711, + "step": 109460 + }, + { + "epoch": 4.53, + "grad_norm": 0.6953125, + "learning_rate": 0.0004723853931823463, + "loss": 0.2296, + "step": 109470 + }, + { + "epoch": 4.53, + "grad_norm": 1.09375, + "learning_rate": 0.0004723804382999336, + "loss": 0.1887, + "step": 109480 + }, + { + "epoch": 4.54, + "grad_norm": 0.515625, + "learning_rate": 0.0004723754829990249, + "loss": 0.1661, + "step": 109490 + }, + { + "epoch": 4.54, + "grad_norm": 0.87890625, + "learning_rate": 0.00047237052727962963, + "loss": 0.218, + "step": 109500 + }, + { + "epoch": 4.54, + "grad_norm": 0.3828125, + "learning_rate": 0.000472365571141757, + "loss": 0.2138, + "step": 109510 + }, + { + "epoch": 4.54, + "grad_norm": 0.40625, + "learning_rate": 0.0004723606145854164, + "loss": 0.1638, + "step": 109520 + }, + { + "epoch": 4.54, + "grad_norm": 0.443359375, + "learning_rate": 0.0004723556576106171, + "loss": 0.1989, + "step": 109530 + }, + { + "epoch": 4.54, + "grad_norm": 0.2158203125, + "learning_rate": 0.0004723507002173685, + "loss": 0.2244, + "step": 109540 + }, + { + "epoch": 4.54, + "grad_norm": 0.65625, + "learning_rate": 0.00047234574240567984, + "loss": 0.2182, + "step": 109550 + }, + { + "epoch": 4.54, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004723407841755606, + "loss": 0.2095, + "step": 109560 + }, + { + "epoch": 4.54, + "grad_norm": 0.8203125, + "learning_rate": 0.00047233582552701995, + "loss": 0.2103, + "step": 109570 + }, + { + "epoch": 4.54, + "grad_norm": 1.7890625, + "learning_rate": 0.0004723308664600673, + "loss": 0.1819, + "step": 109580 + }, + { + "epoch": 4.54, + "grad_norm": 0.72265625, + "learning_rate": 0.0004723259069747119, + "loss": 0.1933, + "step": 109590 + }, + { + "epoch": 4.54, + "grad_norm": 0.3125, + "learning_rate": 0.00047232094707096324, + "loss": 0.2407, + "step": 109600 + }, + { + "epoch": 4.54, + "grad_norm": 0.515625, + "learning_rate": 0.0004723159867488306, + "loss": 0.2792, + "step": 109610 + }, + { + "epoch": 4.54, + "grad_norm": 0.37109375, + "learning_rate": 0.0004723110260083232, + "loss": 0.1967, + "step": 109620 + }, + { + "epoch": 4.54, + "grad_norm": 0.96875, + "learning_rate": 0.0004723060648494505, + "loss": 0.2227, + "step": 109630 + }, + { + "epoch": 4.54, + "grad_norm": 0.6953125, + "learning_rate": 0.0004723011032722218, + "loss": 0.1713, + "step": 109640 + }, + { + "epoch": 4.54, + "grad_norm": 1.046875, + "learning_rate": 0.00047229614127664634, + "loss": 0.2369, + "step": 109650 + }, + { + "epoch": 4.54, + "grad_norm": 0.404296875, + "learning_rate": 0.0004722911788627336, + "loss": 0.1765, + "step": 109660 + }, + { + "epoch": 4.54, + "grad_norm": 0.40234375, + "learning_rate": 0.0004722862160304929, + "loss": 0.2618, + "step": 109670 + }, + { + "epoch": 4.54, + "grad_norm": 1.2734375, + "learning_rate": 0.0004722812527799335, + "loss": 0.2504, + "step": 109680 + }, + { + "epoch": 4.54, + "grad_norm": 0.60546875, + "learning_rate": 0.0004722762891110648, + "loss": 0.2028, + "step": 109690 + }, + { + "epoch": 4.54, + "grad_norm": 0.47265625, + "learning_rate": 0.00047227132502389616, + "loss": 0.237, + "step": 109700 + }, + { + "epoch": 4.54, + "grad_norm": 1.1640625, + "learning_rate": 0.0004722663605184369, + "loss": 0.2654, + "step": 109710 + }, + { + "epoch": 4.54, + "grad_norm": 0.8203125, + "learning_rate": 0.0004722613955946963, + "loss": 0.2235, + "step": 109720 + }, + { + "epoch": 4.55, + "grad_norm": 0.9375, + "learning_rate": 0.00047225643025268374, + "loss": 0.2176, + "step": 109730 + }, + { + "epoch": 4.55, + "grad_norm": 0.271484375, + "learning_rate": 0.00047225146449240865, + "loss": 0.1674, + "step": 109740 + }, + { + "epoch": 4.55, + "grad_norm": 2.15625, + "learning_rate": 0.00047224649831388024, + "loss": 0.2403, + "step": 109750 + }, + { + "epoch": 4.55, + "grad_norm": 0.46875, + "learning_rate": 0.00047224153171710793, + "loss": 0.1807, + "step": 109760 + }, + { + "epoch": 4.55, + "grad_norm": 0.62890625, + "learning_rate": 0.00047223656470210106, + "loss": 0.2302, + "step": 109770 + }, + { + "epoch": 4.55, + "grad_norm": 2.4375, + "learning_rate": 0.000472231597268869, + "loss": 0.2094, + "step": 109780 + }, + { + "epoch": 4.55, + "grad_norm": 0.69921875, + "learning_rate": 0.0004722266294174211, + "loss": 0.1867, + "step": 109790 + }, + { + "epoch": 4.55, + "grad_norm": 0.462890625, + "learning_rate": 0.0004722216611477666, + "loss": 0.2077, + "step": 109800 + }, + { + "epoch": 4.55, + "grad_norm": 0.61328125, + "learning_rate": 0.000472216692459915, + "loss": 0.1811, + "step": 109810 + }, + { + "epoch": 4.55, + "grad_norm": 0.69921875, + "learning_rate": 0.00047221172335387555, + "loss": 0.2194, + "step": 109820 + }, + { + "epoch": 4.55, + "grad_norm": 0.3828125, + "learning_rate": 0.00047220675382965763, + "loss": 0.2166, + "step": 109830 + }, + { + "epoch": 4.55, + "grad_norm": 0.43359375, + "learning_rate": 0.00047220178388727063, + "loss": 0.2259, + "step": 109840 + }, + { + "epoch": 4.55, + "grad_norm": 0.447265625, + "learning_rate": 0.00047219681352672383, + "loss": 0.2215, + "step": 109850 + }, + { + "epoch": 4.55, + "grad_norm": 0.515625, + "learning_rate": 0.00047219184274802664, + "loss": 0.1699, + "step": 109860 + }, + { + "epoch": 4.55, + "grad_norm": 1.0625, + "learning_rate": 0.0004721868715511884, + "loss": 0.1803, + "step": 109870 + }, + { + "epoch": 4.55, + "grad_norm": 1.3828125, + "learning_rate": 0.00047218189993621844, + "loss": 0.2353, + "step": 109880 + }, + { + "epoch": 4.55, + "grad_norm": 0.73046875, + "learning_rate": 0.0004721769279031262, + "loss": 0.221, + "step": 109890 + }, + { + "epoch": 4.55, + "grad_norm": 0.80078125, + "learning_rate": 0.00047217195545192094, + "loss": 0.2077, + "step": 109900 + }, + { + "epoch": 4.55, + "grad_norm": 1.6015625, + "learning_rate": 0.00047216698258261205, + "loss": 0.1969, + "step": 109910 + }, + { + "epoch": 4.55, + "grad_norm": 0.5234375, + "learning_rate": 0.0004721620092952088, + "loss": 0.148, + "step": 109920 + }, + { + "epoch": 4.55, + "grad_norm": 0.6015625, + "learning_rate": 0.0004721570355897208, + "loss": 0.1822, + "step": 109930 + }, + { + "epoch": 4.55, + "grad_norm": 1.1796875, + "learning_rate": 0.00047215206146615713, + "loss": 0.2594, + "step": 109940 + }, + { + "epoch": 4.55, + "grad_norm": 0.42578125, + "learning_rate": 0.00047214708692452733, + "loss": 0.2044, + "step": 109950 + }, + { + "epoch": 4.55, + "grad_norm": 0.96484375, + "learning_rate": 0.0004721421119648407, + "loss": 0.2594, + "step": 109960 + }, + { + "epoch": 4.55, + "grad_norm": 0.546875, + "learning_rate": 0.00047213713658710656, + "loss": 0.1864, + "step": 109970 + }, + { + "epoch": 4.56, + "grad_norm": 1.3671875, + "learning_rate": 0.00047213216079133435, + "loss": 0.1633, + "step": 109980 + }, + { + "epoch": 4.56, + "grad_norm": 1.2109375, + "learning_rate": 0.0004721271845775334, + "loss": 0.1945, + "step": 109990 + }, + { + "epoch": 4.56, + "grad_norm": 0.99609375, + "learning_rate": 0.0004721222079457131, + "loss": 0.2264, + "step": 110000 + }, + { + "epoch": 4.56, + "grad_norm": 0.91796875, + "learning_rate": 0.00047211723089588274, + "loss": 0.2041, + "step": 110010 + }, + { + "epoch": 4.56, + "grad_norm": 0.2041015625, + "learning_rate": 0.0004721122534280518, + "loss": 0.243, + "step": 110020 + }, + { + "epoch": 4.56, + "grad_norm": 0.33203125, + "learning_rate": 0.00047210727554222953, + "loss": 0.2168, + "step": 110030 + }, + { + "epoch": 4.56, + "grad_norm": 0.7890625, + "learning_rate": 0.00047210229723842535, + "loss": 0.1822, + "step": 110040 + }, + { + "epoch": 4.56, + "grad_norm": 0.353515625, + "learning_rate": 0.00047209731851664865, + "loss": 0.2156, + "step": 110050 + }, + { + "epoch": 4.56, + "grad_norm": 0.2001953125, + "learning_rate": 0.00047209233937690876, + "loss": 0.1587, + "step": 110060 + }, + { + "epoch": 4.56, + "grad_norm": 0.578125, + "learning_rate": 0.0004720873598192151, + "loss": 0.2284, + "step": 110070 + }, + { + "epoch": 4.56, + "grad_norm": 0.69921875, + "learning_rate": 0.000472082379843577, + "loss": 0.2086, + "step": 110080 + }, + { + "epoch": 4.56, + "grad_norm": 0.99609375, + "learning_rate": 0.00047207739945000394, + "loss": 0.2577, + "step": 110090 + }, + { + "epoch": 4.56, + "grad_norm": 0.89453125, + "learning_rate": 0.0004720724186385051, + "loss": 0.2625, + "step": 110100 + }, + { + "epoch": 4.56, + "grad_norm": 0.53125, + "learning_rate": 0.0004720674374090899, + "loss": 0.2929, + "step": 110110 + }, + { + "epoch": 4.56, + "grad_norm": 0.91015625, + "learning_rate": 0.0004720624557617679, + "loss": 0.1859, + "step": 110120 + }, + { + "epoch": 4.56, + "grad_norm": 0.58984375, + "learning_rate": 0.0004720574736965482, + "loss": 0.2123, + "step": 110130 + }, + { + "epoch": 4.56, + "grad_norm": 0.5078125, + "learning_rate": 0.0004720524912134404, + "loss": 0.2034, + "step": 110140 + }, + { + "epoch": 4.56, + "grad_norm": 1.015625, + "learning_rate": 0.0004720475083124538, + "loss": 0.2637, + "step": 110150 + }, + { + "epoch": 4.56, + "grad_norm": 0.6953125, + "learning_rate": 0.0004720425249935977, + "loss": 0.272, + "step": 110160 + }, + { + "epoch": 4.56, + "grad_norm": 0.51171875, + "learning_rate": 0.00047203754125688157, + "loss": 0.1859, + "step": 110170 + }, + { + "epoch": 4.56, + "grad_norm": 0.79296875, + "learning_rate": 0.0004720325571023148, + "loss": 0.2315, + "step": 110180 + }, + { + "epoch": 4.56, + "grad_norm": 0.490234375, + "learning_rate": 0.0004720275725299067, + "loss": 0.1986, + "step": 110190 + }, + { + "epoch": 4.56, + "grad_norm": 1.1640625, + "learning_rate": 0.0004720225875396667, + "loss": 0.1776, + "step": 110200 + }, + { + "epoch": 4.56, + "grad_norm": 0.52734375, + "learning_rate": 0.00047201760213160416, + "loss": 0.2277, + "step": 110210 + }, + { + "epoch": 4.57, + "grad_norm": 0.69140625, + "learning_rate": 0.00047201261630572846, + "loss": 0.1763, + "step": 110220 + }, + { + "epoch": 4.57, + "grad_norm": 0.82421875, + "learning_rate": 0.00047200763006204907, + "loss": 0.1643, + "step": 110230 + }, + { + "epoch": 4.57, + "grad_norm": 0.57421875, + "learning_rate": 0.0004720026434005752, + "loss": 0.2142, + "step": 110240 + }, + { + "epoch": 4.57, + "grad_norm": 0.6953125, + "learning_rate": 0.00047199765632131635, + "loss": 0.1759, + "step": 110250 + }, + { + "epoch": 4.57, + "grad_norm": 0.515625, + "learning_rate": 0.00047199266882428194, + "loss": 0.2167, + "step": 110260 + }, + { + "epoch": 4.57, + "grad_norm": 0.80078125, + "learning_rate": 0.0004719876809094812, + "loss": 0.2393, + "step": 110270 + }, + { + "epoch": 4.57, + "grad_norm": 1.3125, + "learning_rate": 0.00047198269257692373, + "loss": 0.2073, + "step": 110280 + }, + { + "epoch": 4.57, + "grad_norm": 0.7421875, + "learning_rate": 0.0004719777038266187, + "loss": 0.2292, + "step": 110290 + }, + { + "epoch": 4.57, + "grad_norm": 0.59375, + "learning_rate": 0.00047197271465857567, + "loss": 0.2123, + "step": 110300 + }, + { + "epoch": 4.57, + "grad_norm": 0.60546875, + "learning_rate": 0.00047196772507280394, + "loss": 0.2088, + "step": 110310 + }, + { + "epoch": 4.57, + "grad_norm": 2.296875, + "learning_rate": 0.00047196273506931285, + "loss": 0.2251, + "step": 110320 + }, + { + "epoch": 4.57, + "grad_norm": 0.474609375, + "learning_rate": 0.000471957744648112, + "loss": 0.2075, + "step": 110330 + }, + { + "epoch": 4.57, + "grad_norm": 0.439453125, + "learning_rate": 0.00047195275380921056, + "loss": 0.2385, + "step": 110340 + }, + { + "epoch": 4.57, + "grad_norm": 0.6484375, + "learning_rate": 0.00047194776255261807, + "loss": 0.2252, + "step": 110350 + }, + { + "epoch": 4.57, + "grad_norm": 0.0, + "learning_rate": 0.0004719427708783438, + "loss": 0.1941, + "step": 110360 + }, + { + "epoch": 4.57, + "grad_norm": 0.74609375, + "learning_rate": 0.0004719377787863972, + "loss": 0.2306, + "step": 110370 + }, + { + "epoch": 4.57, + "grad_norm": 1.3203125, + "learning_rate": 0.0004719327862767877, + "loss": 0.2236, + "step": 110380 + }, + { + "epoch": 4.57, + "grad_norm": 0.80078125, + "learning_rate": 0.0004719277933495246, + "loss": 0.2835, + "step": 110390 + }, + { + "epoch": 4.57, + "grad_norm": 0.5234375, + "learning_rate": 0.0004719228000046174, + "loss": 0.2369, + "step": 110400 + }, + { + "epoch": 4.57, + "grad_norm": 1.1328125, + "learning_rate": 0.00047191780624207546, + "loss": 0.2199, + "step": 110410 + }, + { + "epoch": 4.57, + "grad_norm": 1.3125, + "learning_rate": 0.00047191281206190823, + "loss": 0.2367, + "step": 110420 + }, + { + "epoch": 4.57, + "grad_norm": 0.68359375, + "learning_rate": 0.000471907817464125, + "loss": 0.2129, + "step": 110430 + }, + { + "epoch": 4.57, + "grad_norm": 0.6875, + "learning_rate": 0.0004719028224487352, + "loss": 0.2022, + "step": 110440 + }, + { + "epoch": 4.57, + "grad_norm": 0.337890625, + "learning_rate": 0.0004718978270157483, + "loss": 0.2693, + "step": 110450 + }, + { + "epoch": 4.58, + "grad_norm": 1.359375, + "learning_rate": 0.0004718928311651736, + "loss": 0.1961, + "step": 110460 + }, + { + "epoch": 4.58, + "grad_norm": 0.91796875, + "learning_rate": 0.0004718878348970206, + "loss": 0.1779, + "step": 110470 + }, + { + "epoch": 4.58, + "grad_norm": 0.62890625, + "learning_rate": 0.0004718828382112986, + "loss": 0.2201, + "step": 110480 + }, + { + "epoch": 4.58, + "grad_norm": 0.4296875, + "learning_rate": 0.0004718778411080171, + "loss": 0.2102, + "step": 110490 + }, + { + "epoch": 4.58, + "grad_norm": 0.68359375, + "learning_rate": 0.00047187284358718554, + "loss": 0.2147, + "step": 110500 + }, + { + "epoch": 4.58, + "grad_norm": 0.55078125, + "learning_rate": 0.00047186784564881313, + "loss": 0.1896, + "step": 110510 + }, + { + "epoch": 4.58, + "grad_norm": 0.6796875, + "learning_rate": 0.0004718628472929094, + "loss": 0.197, + "step": 110520 + }, + { + "epoch": 4.58, + "grad_norm": 0.416015625, + "learning_rate": 0.0004718578485194838, + "loss": 0.2194, + "step": 110530 + }, + { + "epoch": 4.58, + "grad_norm": 0.55078125, + "learning_rate": 0.0004718528493285457, + "loss": 0.1882, + "step": 110540 + }, + { + "epoch": 4.58, + "grad_norm": 0.412109375, + "learning_rate": 0.0004718478497201044, + "loss": 0.171, + "step": 110550 + }, + { + "epoch": 4.58, + "grad_norm": 0.5546875, + "learning_rate": 0.00047184284969416945, + "loss": 0.2106, + "step": 110560 + }, + { + "epoch": 4.58, + "grad_norm": 0.53125, + "learning_rate": 0.00047183784925075025, + "loss": 0.1931, + "step": 110570 + }, + { + "epoch": 4.58, + "grad_norm": 0.4921875, + "learning_rate": 0.0004718328483898562, + "loss": 0.2442, + "step": 110580 + }, + { + "epoch": 4.58, + "grad_norm": 0.2099609375, + "learning_rate": 0.00047182784711149664, + "loss": 0.1797, + "step": 110590 + }, + { + "epoch": 4.58, + "grad_norm": 0.421875, + "learning_rate": 0.000471822845415681, + "loss": 0.2036, + "step": 110600 + }, + { + "epoch": 4.58, + "grad_norm": 0.66015625, + "learning_rate": 0.0004718178433024188, + "loss": 0.1908, + "step": 110610 + }, + { + "epoch": 4.58, + "grad_norm": 0.31640625, + "learning_rate": 0.0004718128407717193, + "loss": 0.2801, + "step": 110620 + }, + { + "epoch": 4.58, + "grad_norm": 0.5546875, + "learning_rate": 0.00047180783782359206, + "loss": 0.2051, + "step": 110630 + }, + { + "epoch": 4.58, + "grad_norm": 0.55078125, + "learning_rate": 0.00047180283445804634, + "loss": 0.1964, + "step": 110640 + }, + { + "epoch": 4.58, + "grad_norm": 1.5546875, + "learning_rate": 0.00047179783067509163, + "loss": 0.2239, + "step": 110650 + }, + { + "epoch": 4.58, + "grad_norm": 0.6484375, + "learning_rate": 0.0004717928264747374, + "loss": 0.1902, + "step": 110660 + }, + { + "epoch": 4.58, + "grad_norm": 0.251953125, + "learning_rate": 0.00047178782185699304, + "loss": 0.191, + "step": 110670 + }, + { + "epoch": 4.58, + "grad_norm": 0.96875, + "learning_rate": 0.000471782816821868, + "loss": 0.2228, + "step": 110680 + }, + { + "epoch": 4.58, + "grad_norm": 0.86328125, + "learning_rate": 0.0004717778113693715, + "loss": 0.196, + "step": 110690 + }, + { + "epoch": 4.59, + "grad_norm": 0.283203125, + "learning_rate": 0.0004717728054995132, + "loss": 0.2765, + "step": 110700 + }, + { + "epoch": 4.59, + "grad_norm": 0.62109375, + "learning_rate": 0.0004717677992123024, + "loss": 0.2214, + "step": 110710 + }, + { + "epoch": 4.59, + "grad_norm": 0.76171875, + "learning_rate": 0.0004717627925077486, + "loss": 0.241, + "step": 110720 + }, + { + "epoch": 4.59, + "grad_norm": 0.451171875, + "learning_rate": 0.0004717577853858611, + "loss": 0.2168, + "step": 110730 + }, + { + "epoch": 4.59, + "grad_norm": 1.65625, + "learning_rate": 0.00047175277784664943, + "loss": 0.2259, + "step": 110740 + }, + { + "epoch": 4.59, + "grad_norm": 0.75, + "learning_rate": 0.000471747769890123, + "loss": 0.2045, + "step": 110750 + }, + { + "epoch": 4.59, + "grad_norm": 1.03125, + "learning_rate": 0.0004717427615162912, + "loss": 0.1971, + "step": 110760 + }, + { + "epoch": 4.59, + "grad_norm": 0.6640625, + "learning_rate": 0.0004717377527251635, + "loss": 0.2423, + "step": 110770 + }, + { + "epoch": 4.59, + "grad_norm": 0.6875, + "learning_rate": 0.00047173274351674924, + "loss": 0.2139, + "step": 110780 + }, + { + "epoch": 4.59, + "grad_norm": 0.2490234375, + "learning_rate": 0.0004717277338910579, + "loss": 0.2675, + "step": 110790 + }, + { + "epoch": 4.59, + "grad_norm": 0.84375, + "learning_rate": 0.00047172272384809897, + "loss": 0.2239, + "step": 110800 + }, + { + "epoch": 4.59, + "grad_norm": 0.921875, + "learning_rate": 0.0004717177133878817, + "loss": 0.2317, + "step": 110810 + }, + { + "epoch": 4.59, + "grad_norm": 0.75, + "learning_rate": 0.0004717127025104158, + "loss": 0.1854, + "step": 110820 + }, + { + "epoch": 4.59, + "grad_norm": 1.2578125, + "learning_rate": 0.00047170769121571046, + "loss": 0.2202, + "step": 110830 + }, + { + "epoch": 4.59, + "grad_norm": 1.078125, + "learning_rate": 0.00047170267950377514, + "loss": 0.2084, + "step": 110840 + }, + { + "epoch": 4.59, + "grad_norm": 0.84765625, + "learning_rate": 0.0004716976673746194, + "loss": 0.2259, + "step": 110850 + }, + { + "epoch": 4.59, + "grad_norm": 0.265625, + "learning_rate": 0.0004716926548282525, + "loss": 0.2022, + "step": 110860 + }, + { + "epoch": 4.59, + "grad_norm": 0.404296875, + "learning_rate": 0.00047168764186468406, + "loss": 0.2544, + "step": 110870 + }, + { + "epoch": 4.59, + "grad_norm": 0.369140625, + "learning_rate": 0.00047168262848392334, + "loss": 0.1731, + "step": 110880 + }, + { + "epoch": 4.59, + "grad_norm": 0.470703125, + "learning_rate": 0.0004716776146859799, + "loss": 0.2402, + "step": 110890 + }, + { + "epoch": 4.59, + "grad_norm": 1.140625, + "learning_rate": 0.00047167260047086316, + "loss": 0.2097, + "step": 110900 + }, + { + "epoch": 4.59, + "grad_norm": 0.60546875, + "learning_rate": 0.00047166758583858245, + "loss": 0.2144, + "step": 110910 + }, + { + "epoch": 4.59, + "grad_norm": 0.921875, + "learning_rate": 0.0004716625707891473, + "loss": 0.2086, + "step": 110920 + }, + { + "epoch": 4.59, + "grad_norm": 0.54296875, + "learning_rate": 0.0004716575553225672, + "loss": 0.1736, + "step": 110930 + }, + { + "epoch": 4.6, + "grad_norm": 0.6328125, + "learning_rate": 0.00047165253943885137, + "loss": 0.2194, + "step": 110940 + }, + { + "epoch": 4.6, + "grad_norm": 2.21875, + "learning_rate": 0.00047164752313800953, + "loss": 0.2146, + "step": 110950 + }, + { + "epoch": 4.6, + "grad_norm": 0.609375, + "learning_rate": 0.0004716425064200509, + "loss": 0.2162, + "step": 110960 + }, + { + "epoch": 4.6, + "grad_norm": 0.388671875, + "learning_rate": 0.00047163748928498504, + "loss": 0.1819, + "step": 110970 + }, + { + "epoch": 4.6, + "grad_norm": 0.7109375, + "learning_rate": 0.0004716324717328214, + "loss": 0.236, + "step": 110980 + }, + { + "epoch": 4.6, + "grad_norm": 0.5234375, + "learning_rate": 0.0004716274537635694, + "loss": 0.1911, + "step": 110990 + }, + { + "epoch": 4.6, + "grad_norm": 0.54296875, + "learning_rate": 0.0004716224353772384, + "loss": 0.2181, + "step": 111000 + }, + { + "epoch": 4.6, + "grad_norm": 1.703125, + "learning_rate": 0.00047161741657383795, + "loss": 0.2184, + "step": 111010 + }, + { + "epoch": 4.6, + "grad_norm": 0.58984375, + "learning_rate": 0.0004716123973533774, + "loss": 0.194, + "step": 111020 + }, + { + "epoch": 4.6, + "grad_norm": 0.640625, + "learning_rate": 0.00047160737771586626, + "loss": 0.2246, + "step": 111030 + }, + { + "epoch": 4.6, + "grad_norm": 0.9765625, + "learning_rate": 0.000471602357661314, + "loss": 0.2448, + "step": 111040 + }, + { + "epoch": 4.6, + "grad_norm": 0.94140625, + "learning_rate": 0.00047159733718973005, + "loss": 0.2038, + "step": 111050 + }, + { + "epoch": 4.6, + "grad_norm": 1.078125, + "learning_rate": 0.0004715923163011238, + "loss": 0.2517, + "step": 111060 + }, + { + "epoch": 4.6, + "grad_norm": 1.1640625, + "learning_rate": 0.0004715872949955048, + "loss": 0.2263, + "step": 111070 + }, + { + "epoch": 4.6, + "grad_norm": 0.62890625, + "learning_rate": 0.00047158227327288236, + "loss": 0.2035, + "step": 111080 + }, + { + "epoch": 4.6, + "grad_norm": 0.68359375, + "learning_rate": 0.000471577251133266, + "loss": 0.1628, + "step": 111090 + }, + { + "epoch": 4.6, + "grad_norm": 0.80859375, + "learning_rate": 0.00047157222857666526, + "loss": 0.2466, + "step": 111100 + }, + { + "epoch": 4.6, + "grad_norm": 0.3359375, + "learning_rate": 0.0004715672056030895, + "loss": 0.2377, + "step": 111110 + }, + { + "epoch": 4.6, + "grad_norm": 0.53515625, + "learning_rate": 0.0004715621822125482, + "loss": 0.2069, + "step": 111120 + }, + { + "epoch": 4.6, + "grad_norm": 0.462890625, + "learning_rate": 0.00047155715840505074, + "loss": 0.1996, + "step": 111130 + }, + { + "epoch": 4.6, + "grad_norm": 0.61328125, + "learning_rate": 0.00047155213418060665, + "loss": 0.2178, + "step": 111140 + }, + { + "epoch": 4.6, + "grad_norm": 0.5078125, + "learning_rate": 0.0004715471095392254, + "loss": 0.2034, + "step": 111150 + }, + { + "epoch": 4.6, + "grad_norm": 0.2041015625, + "learning_rate": 0.0004715420844809164, + "loss": 0.2194, + "step": 111160 + }, + { + "epoch": 4.6, + "grad_norm": 1.8125, + "learning_rate": 0.0004715370590056891, + "loss": 0.2158, + "step": 111170 + }, + { + "epoch": 4.61, + "grad_norm": 2.125, + "learning_rate": 0.000471532033113553, + "loss": 0.2207, + "step": 111180 + }, + { + "epoch": 4.61, + "grad_norm": 0.52734375, + "learning_rate": 0.0004715270068045175, + "loss": 0.2248, + "step": 111190 + }, + { + "epoch": 4.61, + "grad_norm": 0.67578125, + "learning_rate": 0.00047152198007859215, + "loss": 0.25, + "step": 111200 + }, + { + "epoch": 4.61, + "grad_norm": 1.2109375, + "learning_rate": 0.00047151695293578633, + "loss": 0.1763, + "step": 111210 + }, + { + "epoch": 4.61, + "grad_norm": 0.765625, + "learning_rate": 0.00047151192537610954, + "loss": 0.2299, + "step": 111220 + }, + { + "epoch": 4.61, + "grad_norm": 0.921875, + "learning_rate": 0.0004715068973995713, + "loss": 0.2257, + "step": 111230 + }, + { + "epoch": 4.61, + "grad_norm": 0.81640625, + "learning_rate": 0.00047150186900618085, + "loss": 0.2144, + "step": 111240 + }, + { + "epoch": 4.61, + "grad_norm": 0.921875, + "learning_rate": 0.0004714968401959479, + "loss": 0.2565, + "step": 111250 + }, + { + "epoch": 4.61, + "grad_norm": 0.28515625, + "learning_rate": 0.0004714918109688818, + "loss": 0.189, + "step": 111260 + }, + { + "epoch": 4.61, + "grad_norm": 1.1015625, + "learning_rate": 0.00047148678132499203, + "loss": 0.1999, + "step": 111270 + }, + { + "epoch": 4.61, + "grad_norm": 0.56640625, + "learning_rate": 0.000471481751264288, + "loss": 0.18, + "step": 111280 + }, + { + "epoch": 4.61, + "grad_norm": 0.52734375, + "learning_rate": 0.0004714767207867793, + "loss": 0.1659, + "step": 111290 + }, + { + "epoch": 4.61, + "grad_norm": 0.5078125, + "learning_rate": 0.00047147168989247534, + "loss": 0.2416, + "step": 111300 + }, + { + "epoch": 4.61, + "grad_norm": 0.2373046875, + "learning_rate": 0.0004714666585813855, + "loss": 0.2517, + "step": 111310 + }, + { + "epoch": 4.61, + "grad_norm": 0.75, + "learning_rate": 0.00047146162685351935, + "loss": 0.2459, + "step": 111320 + }, + { + "epoch": 4.61, + "grad_norm": 0.92578125, + "learning_rate": 0.00047145659470888634, + "loss": 0.2339, + "step": 111330 + }, + { + "epoch": 4.61, + "grad_norm": 1.5078125, + "learning_rate": 0.00047145156214749593, + "loss": 0.1852, + "step": 111340 + }, + { + "epoch": 4.61, + "grad_norm": 0.2265625, + "learning_rate": 0.00047144652916935773, + "loss": 0.1569, + "step": 111350 + }, + { + "epoch": 4.61, + "grad_norm": 0.7109375, + "learning_rate": 0.00047144149577448095, + "loss": 0.2391, + "step": 111360 + }, + { + "epoch": 4.61, + "grad_norm": 0.443359375, + "learning_rate": 0.00047143646196287517, + "loss": 0.208, + "step": 111370 + }, + { + "epoch": 4.61, + "grad_norm": 0.98046875, + "learning_rate": 0.00047143142773454984, + "loss": 0.211, + "step": 111380 + }, + { + "epoch": 4.61, + "grad_norm": 0.72265625, + "learning_rate": 0.00047142639308951455, + "loss": 0.1953, + "step": 111390 + }, + { + "epoch": 4.61, + "grad_norm": 0.90625, + "learning_rate": 0.00047142135802777873, + "loss": 0.1728, + "step": 111400 + }, + { + "epoch": 4.61, + "grad_norm": 0.72265625, + "learning_rate": 0.00047141632254935176, + "loss": 0.2452, + "step": 111410 + }, + { + "epoch": 4.62, + "grad_norm": 0.44140625, + "learning_rate": 0.0004714112866542433, + "loss": 0.2181, + "step": 111420 + }, + { + "epoch": 4.62, + "grad_norm": 0.76953125, + "learning_rate": 0.00047140625034246253, + "loss": 0.1924, + "step": 111430 + }, + { + "epoch": 4.62, + "grad_norm": 0.93359375, + "learning_rate": 0.0004714012136140192, + "loss": 0.2094, + "step": 111440 + }, + { + "epoch": 4.62, + "grad_norm": 0.8984375, + "learning_rate": 0.00047139617646892276, + "loss": 0.159, + "step": 111450 + }, + { + "epoch": 4.62, + "grad_norm": 0.42578125, + "learning_rate": 0.0004713911389071825, + "loss": 0.2012, + "step": 111460 + }, + { + "epoch": 4.62, + "grad_norm": 0.7890625, + "learning_rate": 0.0004713861009288081, + "loss": 0.2006, + "step": 111470 + }, + { + "epoch": 4.62, + "grad_norm": 0.5546875, + "learning_rate": 0.00047138106253380896, + "loss": 0.2572, + "step": 111480 + }, + { + "epoch": 4.62, + "grad_norm": 0.64453125, + "learning_rate": 0.0004713760237221946, + "loss": 0.1927, + "step": 111490 + }, + { + "epoch": 4.62, + "grad_norm": 1.5546875, + "learning_rate": 0.0004713709844939744, + "loss": 0.1561, + "step": 111500 + }, + { + "epoch": 4.62, + "grad_norm": 0.77734375, + "learning_rate": 0.0004713659448491579, + "loss": 0.2314, + "step": 111510 + }, + { + "epoch": 4.62, + "grad_norm": 0.66015625, + "learning_rate": 0.00047136090478775463, + "loss": 0.2222, + "step": 111520 + }, + { + "epoch": 4.62, + "grad_norm": 0.7578125, + "learning_rate": 0.00047135586430977405, + "loss": 0.188, + "step": 111530 + }, + { + "epoch": 4.62, + "grad_norm": 0.828125, + "learning_rate": 0.0004713508234152256, + "loss": 0.2223, + "step": 111540 + }, + { + "epoch": 4.62, + "grad_norm": 0.4375, + "learning_rate": 0.00047134578210411885, + "loss": 0.1907, + "step": 111550 + }, + { + "epoch": 4.62, + "grad_norm": 0.79296875, + "learning_rate": 0.00047134074037646326, + "loss": 0.2349, + "step": 111560 + }, + { + "epoch": 4.62, + "grad_norm": 0.69140625, + "learning_rate": 0.0004713356982322683, + "loss": 0.2221, + "step": 111570 + }, + { + "epoch": 4.62, + "grad_norm": 1.0, + "learning_rate": 0.0004713306556715434, + "loss": 0.2033, + "step": 111580 + }, + { + "epoch": 4.62, + "grad_norm": 0.765625, + "learning_rate": 0.00047132561269429805, + "loss": 0.2515, + "step": 111590 + }, + { + "epoch": 4.62, + "grad_norm": 0.91796875, + "learning_rate": 0.00047132056930054194, + "loss": 0.1909, + "step": 111600 + }, + { + "epoch": 4.62, + "grad_norm": 0.640625, + "learning_rate": 0.00047131552549028435, + "loss": 0.2457, + "step": 111610 + }, + { + "epoch": 4.62, + "grad_norm": 0.58203125, + "learning_rate": 0.0004713104812635349, + "loss": 0.2166, + "step": 111620 + }, + { + "epoch": 4.62, + "grad_norm": 0.90625, + "learning_rate": 0.00047130543662030294, + "loss": 0.1786, + "step": 111630 + }, + { + "epoch": 4.62, + "grad_norm": 0.61328125, + "learning_rate": 0.00047130039156059814, + "loss": 0.2577, + "step": 111640 + }, + { + "epoch": 4.62, + "grad_norm": 0.33203125, + "learning_rate": 0.0004712953460844298, + "loss": 0.185, + "step": 111650 + }, + { + "epoch": 4.62, + "grad_norm": 0.6640625, + "learning_rate": 0.0004712903001918076, + "loss": 0.2459, + "step": 111660 + }, + { + "epoch": 4.63, + "grad_norm": 1.015625, + "learning_rate": 0.0004712852538827409, + "loss": 0.2334, + "step": 111670 + }, + { + "epoch": 4.63, + "grad_norm": 1.4453125, + "learning_rate": 0.00047128020715723925, + "loss": 0.1821, + "step": 111680 + }, + { + "epoch": 4.63, + "grad_norm": 0.380859375, + "learning_rate": 0.00047127516001531215, + "loss": 0.2158, + "step": 111690 + }, + { + "epoch": 4.63, + "grad_norm": 0.97265625, + "learning_rate": 0.0004712701124569691, + "loss": 0.2092, + "step": 111700 + }, + { + "epoch": 4.63, + "grad_norm": 0.6875, + "learning_rate": 0.0004712650644822196, + "loss": 0.2528, + "step": 111710 + }, + { + "epoch": 4.63, + "grad_norm": 0.578125, + "learning_rate": 0.0004712600160910732, + "loss": 0.242, + "step": 111720 + }, + { + "epoch": 4.63, + "grad_norm": 0.48828125, + "learning_rate": 0.00047125496728353923, + "loss": 0.1966, + "step": 111730 + }, + { + "epoch": 4.63, + "grad_norm": 0.921875, + "learning_rate": 0.00047124991805962737, + "loss": 0.1776, + "step": 111740 + }, + { + "epoch": 4.63, + "grad_norm": 0.640625, + "learning_rate": 0.000471244868419347, + "loss": 0.2356, + "step": 111750 + }, + { + "epoch": 4.63, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004712398183627078, + "loss": 0.1873, + "step": 111760 + }, + { + "epoch": 4.63, + "grad_norm": 0.9921875, + "learning_rate": 0.0004712347678897191, + "loss": 0.1855, + "step": 111770 + }, + { + "epoch": 4.63, + "grad_norm": 0.5234375, + "learning_rate": 0.0004712297170003904, + "loss": 0.2388, + "step": 111780 + }, + { + "epoch": 4.63, + "grad_norm": 0.7109375, + "learning_rate": 0.0004712246656947313, + "loss": 0.193, + "step": 111790 + }, + { + "epoch": 4.63, + "grad_norm": 0.49609375, + "learning_rate": 0.00047121961397275126, + "loss": 0.2153, + "step": 111800 + }, + { + "epoch": 4.63, + "grad_norm": 0.859375, + "learning_rate": 0.0004712145618344598, + "loss": 0.1474, + "step": 111810 + }, + { + "epoch": 4.63, + "grad_norm": 1.1484375, + "learning_rate": 0.00047120950927986643, + "loss": 0.2067, + "step": 111820 + }, + { + "epoch": 4.63, + "grad_norm": 2.3125, + "learning_rate": 0.0004712044563089807, + "loss": 0.2603, + "step": 111830 + }, + { + "epoch": 4.63, + "grad_norm": 0.31640625, + "learning_rate": 0.00047119940292181196, + "loss": 0.2096, + "step": 111840 + }, + { + "epoch": 4.63, + "grad_norm": 0.50390625, + "learning_rate": 0.0004711943491183699, + "loss": 0.1749, + "step": 111850 + }, + { + "epoch": 4.63, + "grad_norm": 1.8359375, + "learning_rate": 0.000471189294898664, + "loss": 0.1953, + "step": 111860 + }, + { + "epoch": 4.63, + "grad_norm": 1.390625, + "learning_rate": 0.0004711842402627037, + "loss": 0.1814, + "step": 111870 + }, + { + "epoch": 4.63, + "grad_norm": 1.375, + "learning_rate": 0.00047117918521049853, + "loss": 0.1927, + "step": 111880 + }, + { + "epoch": 4.63, + "grad_norm": 0.34375, + "learning_rate": 0.00047117412974205796, + "loss": 0.2689, + "step": 111890 + }, + { + "epoch": 4.63, + "grad_norm": 0.609375, + "learning_rate": 0.0004711690738573917, + "loss": 0.2219, + "step": 111900 + }, + { + "epoch": 4.64, + "grad_norm": 0.5859375, + "learning_rate": 0.000471164017556509, + "loss": 0.2403, + "step": 111910 + }, + { + "epoch": 4.64, + "grad_norm": 0.65625, + "learning_rate": 0.00047115896083941953, + "loss": 0.1827, + "step": 111920 + }, + { + "epoch": 4.64, + "grad_norm": 0.5390625, + "learning_rate": 0.00047115390370613286, + "loss": 0.215, + "step": 111930 + }, + { + "epoch": 4.64, + "grad_norm": 1.53125, + "learning_rate": 0.00047114884615665837, + "loss": 0.2025, + "step": 111940 + }, + { + "epoch": 4.64, + "grad_norm": 0.5625, + "learning_rate": 0.0004711437881910056, + "loss": 0.1813, + "step": 111950 + }, + { + "epoch": 4.64, + "grad_norm": 1.6171875, + "learning_rate": 0.00047113872980918413, + "loss": 0.1859, + "step": 111960 + }, + { + "epoch": 4.64, + "grad_norm": 0.4296875, + "learning_rate": 0.0004711336710112035, + "loss": 0.2472, + "step": 111970 + }, + { + "epoch": 4.64, + "grad_norm": 1.0, + "learning_rate": 0.0004711286117970731, + "loss": 0.2048, + "step": 111980 + }, + { + "epoch": 4.64, + "grad_norm": 0.458984375, + "learning_rate": 0.00047112355216680256, + "loss": 0.2224, + "step": 111990 + }, + { + "epoch": 4.64, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004711184921204015, + "loss": 0.1497, + "step": 112000 + }, + { + "epoch": 4.64, + "grad_norm": 0.71484375, + "learning_rate": 0.00047111343165787915, + "loss": 0.2517, + "step": 112010 + }, + { + "epoch": 4.64, + "grad_norm": 0.375, + "learning_rate": 0.0004711083707792453, + "loss": 0.1945, + "step": 112020 + }, + { + "epoch": 4.64, + "grad_norm": 0.73828125, + "learning_rate": 0.0004711033094845093, + "loss": 0.2381, + "step": 112030 + }, + { + "epoch": 4.64, + "grad_norm": 0.59765625, + "learning_rate": 0.00047109824777368073, + "loss": 0.213, + "step": 112040 + }, + { + "epoch": 4.64, + "grad_norm": 0.97265625, + "learning_rate": 0.0004710931856467692, + "loss": 0.2261, + "step": 112050 + }, + { + "epoch": 4.64, + "grad_norm": 1.8125, + "learning_rate": 0.00047108812310378415, + "loss": 0.2077, + "step": 112060 + }, + { + "epoch": 4.64, + "grad_norm": 0.357421875, + "learning_rate": 0.00047108306014473513, + "loss": 0.1764, + "step": 112070 + }, + { + "epoch": 4.64, + "grad_norm": 0.984375, + "learning_rate": 0.0004710779967696317, + "loss": 0.2366, + "step": 112080 + }, + { + "epoch": 4.64, + "grad_norm": 0.5859375, + "learning_rate": 0.0004710729329784833, + "loss": 0.1648, + "step": 112090 + }, + { + "epoch": 4.64, + "grad_norm": 0.6328125, + "learning_rate": 0.00047106786877129946, + "loss": 0.1952, + "step": 112100 + }, + { + "epoch": 4.64, + "grad_norm": 1.2578125, + "learning_rate": 0.00047106280414808987, + "loss": 0.2549, + "step": 112110 + }, + { + "epoch": 4.64, + "grad_norm": 0.34765625, + "learning_rate": 0.00047105773910886394, + "loss": 0.1733, + "step": 112120 + }, + { + "epoch": 4.64, + "grad_norm": 0.63671875, + "learning_rate": 0.0004710526736536312, + "loss": 0.2503, + "step": 112130 + }, + { + "epoch": 4.64, + "grad_norm": 0.609375, + "learning_rate": 0.00047104760778240117, + "loss": 0.2056, + "step": 112140 + }, + { + "epoch": 4.65, + "grad_norm": 0.46484375, + "learning_rate": 0.00047104254149518346, + "loss": 0.2183, + "step": 112150 + }, + { + "epoch": 4.65, + "grad_norm": 0.54296875, + "learning_rate": 0.00047103747479198757, + "loss": 0.1946, + "step": 112160 + }, + { + "epoch": 4.65, + "grad_norm": 0.7578125, + "learning_rate": 0.00047103240767282293, + "loss": 0.2303, + "step": 112170 + }, + { + "epoch": 4.65, + "grad_norm": 1.0546875, + "learning_rate": 0.00047102734013769926, + "loss": 0.231, + "step": 112180 + }, + { + "epoch": 4.65, + "grad_norm": 0.953125, + "learning_rate": 0.00047102227218662597, + "loss": 0.242, + "step": 112190 + }, + { + "epoch": 4.65, + "grad_norm": 0.96484375, + "learning_rate": 0.00047101720381961267, + "loss": 0.2311, + "step": 112200 + }, + { + "epoch": 4.65, + "grad_norm": 0.0, + "learning_rate": 0.00047101213503666884, + "loss": 0.1934, + "step": 112210 + }, + { + "epoch": 4.65, + "grad_norm": 0.453125, + "learning_rate": 0.000471007065837804, + "loss": 0.1399, + "step": 112220 + }, + { + "epoch": 4.65, + "grad_norm": 0.271484375, + "learning_rate": 0.00047100199622302776, + "loss": 0.2219, + "step": 112230 + }, + { + "epoch": 4.65, + "grad_norm": 1.828125, + "learning_rate": 0.00047099692619234965, + "loss": 0.2044, + "step": 112240 + }, + { + "epoch": 4.65, + "grad_norm": 1.859375, + "learning_rate": 0.0004709918557457791, + "loss": 0.2547, + "step": 112250 + }, + { + "epoch": 4.65, + "grad_norm": 0.2255859375, + "learning_rate": 0.0004709867848833259, + "loss": 0.1515, + "step": 112260 + }, + { + "epoch": 4.65, + "grad_norm": 0.181640625, + "learning_rate": 0.0004709817136049993, + "loss": 0.221, + "step": 112270 + }, + { + "epoch": 4.65, + "grad_norm": 0.8125, + "learning_rate": 0.000470976641910809, + "loss": 0.1873, + "step": 112280 + }, + { + "epoch": 4.65, + "grad_norm": 1.078125, + "learning_rate": 0.00047097156980076456, + "loss": 0.2386, + "step": 112290 + }, + { + "epoch": 4.65, + "grad_norm": 0.34765625, + "learning_rate": 0.0004709664972748755, + "loss": 0.1824, + "step": 112300 + }, + { + "epoch": 4.65, + "grad_norm": 0.625, + "learning_rate": 0.0004709614243331513, + "loss": 0.2313, + "step": 112310 + }, + { + "epoch": 4.65, + "grad_norm": 0.451171875, + "learning_rate": 0.00047095635097560163, + "loss": 0.2095, + "step": 112320 + }, + { + "epoch": 4.65, + "grad_norm": 0.796875, + "learning_rate": 0.0004709512772022358, + "loss": 0.1904, + "step": 112330 + }, + { + "epoch": 4.65, + "grad_norm": 0.59375, + "learning_rate": 0.00047094620301306374, + "loss": 0.2494, + "step": 112340 + }, + { + "epoch": 4.65, + "grad_norm": 0.7734375, + "learning_rate": 0.0004709411284080947, + "loss": 0.184, + "step": 112350 + }, + { + "epoch": 4.65, + "grad_norm": 0.64453125, + "learning_rate": 0.00047093605338733837, + "loss": 0.2315, + "step": 112360 + }, + { + "epoch": 4.65, + "grad_norm": 0.5859375, + "learning_rate": 0.00047093097795080415, + "loss": 0.1988, + "step": 112370 + }, + { + "epoch": 4.65, + "grad_norm": 0.87890625, + "learning_rate": 0.0004709259020985017, + "loss": 0.2253, + "step": 112380 + }, + { + "epoch": 4.66, + "grad_norm": 0.77734375, + "learning_rate": 0.0004709208258304406, + "loss": 0.2073, + "step": 112390 + }, + { + "epoch": 4.66, + "grad_norm": 0.62109375, + "learning_rate": 0.0004709157491466304, + "loss": 0.2892, + "step": 112400 + }, + { + "epoch": 4.66, + "grad_norm": 0.2333984375, + "learning_rate": 0.00047091067204708053, + "loss": 0.1679, + "step": 112410 + }, + { + "epoch": 4.66, + "grad_norm": 0.96875, + "learning_rate": 0.0004709055945318007, + "loss": 0.2, + "step": 112420 + }, + { + "epoch": 4.66, + "grad_norm": 0.70703125, + "learning_rate": 0.00047090051660080034, + "loss": 0.1809, + "step": 112430 + }, + { + "epoch": 4.66, + "grad_norm": 1.1796875, + "learning_rate": 0.0004708954382540891, + "loss": 0.1924, + "step": 112440 + }, + { + "epoch": 4.66, + "grad_norm": 0.765625, + "learning_rate": 0.0004708903594916765, + "loss": 0.2077, + "step": 112450 + }, + { + "epoch": 4.66, + "grad_norm": 0.57421875, + "learning_rate": 0.00047088528031357215, + "loss": 0.2037, + "step": 112460 + }, + { + "epoch": 4.66, + "grad_norm": 1.2578125, + "learning_rate": 0.00047088020071978543, + "loss": 0.2103, + "step": 112470 + }, + { + "epoch": 4.66, + "grad_norm": 0.384765625, + "learning_rate": 0.0004708751207103261, + "loss": 0.1823, + "step": 112480 + }, + { + "epoch": 4.66, + "grad_norm": 1.1953125, + "learning_rate": 0.00047087004028520364, + "loss": 0.1869, + "step": 112490 + }, + { + "epoch": 4.66, + "grad_norm": 0.9140625, + "learning_rate": 0.00047086495944442764, + "loss": 0.2712, + "step": 112500 + }, + { + "epoch": 4.66, + "grad_norm": 0.6875, + "learning_rate": 0.00047085987818800757, + "loss": 0.1694, + "step": 112510 + }, + { + "epoch": 4.66, + "grad_norm": 0.9375, + "learning_rate": 0.00047085479651595306, + "loss": 0.2001, + "step": 112520 + }, + { + "epoch": 4.66, + "grad_norm": 0.703125, + "learning_rate": 0.0004708497144282738, + "loss": 0.242, + "step": 112530 + }, + { + "epoch": 4.66, + "grad_norm": 0.6015625, + "learning_rate": 0.00047084463192497913, + "loss": 0.1745, + "step": 112540 + }, + { + "epoch": 4.66, + "grad_norm": 0.5546875, + "learning_rate": 0.00047083954900607874, + "loss": 0.2353, + "step": 112550 + }, + { + "epoch": 4.66, + "grad_norm": 1.171875, + "learning_rate": 0.00047083446567158213, + "loss": 0.1929, + "step": 112560 + }, + { + "epoch": 4.66, + "grad_norm": 0.77734375, + "learning_rate": 0.00047082938192149896, + "loss": 0.2199, + "step": 112570 + }, + { + "epoch": 4.66, + "grad_norm": 0.7421875, + "learning_rate": 0.0004708242977558387, + "loss": 0.2635, + "step": 112580 + }, + { + "epoch": 4.66, + "grad_norm": 0.67578125, + "learning_rate": 0.00047081921317461097, + "loss": 0.2314, + "step": 112590 + }, + { + "epoch": 4.66, + "grad_norm": 0.8671875, + "learning_rate": 0.0004708141281778253, + "loss": 0.1647, + "step": 112600 + }, + { + "epoch": 4.66, + "grad_norm": 0.53515625, + "learning_rate": 0.00047080904276549133, + "loss": 0.2254, + "step": 112610 + }, + { + "epoch": 4.66, + "grad_norm": 1.8984375, + "learning_rate": 0.0004708039569376185, + "loss": 0.2495, + "step": 112620 + }, + { + "epoch": 4.67, + "grad_norm": 0.51953125, + "learning_rate": 0.00047079887069421657, + "loss": 0.2092, + "step": 112630 + }, + { + "epoch": 4.67, + "grad_norm": 0.71484375, + "learning_rate": 0.0004707937840352949, + "loss": 0.1979, + "step": 112640 + }, + { + "epoch": 4.67, + "grad_norm": 0.84375, + "learning_rate": 0.0004707886969608633, + "loss": 0.1916, + "step": 112650 + }, + { + "epoch": 4.67, + "grad_norm": 1.0546875, + "learning_rate": 0.00047078360947093114, + "loss": 0.1927, + "step": 112660 + }, + { + "epoch": 4.67, + "grad_norm": 0.87890625, + "learning_rate": 0.0004707785215655081, + "loss": 0.2109, + "step": 112670 + }, + { + "epoch": 4.67, + "grad_norm": 0.890625, + "learning_rate": 0.0004707734332446038, + "loss": 0.2014, + "step": 112680 + }, + { + "epoch": 4.67, + "grad_norm": 0.5390625, + "learning_rate": 0.0004707683445082276, + "loss": 0.2402, + "step": 112690 + }, + { + "epoch": 4.67, + "grad_norm": 0.80078125, + "learning_rate": 0.00047076325535638926, + "loss": 0.2507, + "step": 112700 + }, + { + "epoch": 4.67, + "grad_norm": 0.69140625, + "learning_rate": 0.0004707581657890983, + "loss": 0.2436, + "step": 112710 + }, + { + "epoch": 4.67, + "grad_norm": 1.75, + "learning_rate": 0.00047075307580636436, + "loss": 0.2066, + "step": 112720 + }, + { + "epoch": 4.67, + "grad_norm": 0.703125, + "learning_rate": 0.000470747985408197, + "loss": 0.2332, + "step": 112730 + }, + { + "epoch": 4.67, + "grad_norm": 0.65625, + "learning_rate": 0.0004707428945946056, + "loss": 0.1986, + "step": 112740 + }, + { + "epoch": 4.67, + "grad_norm": 0.72265625, + "learning_rate": 0.00047073780336560005, + "loss": 0.2471, + "step": 112750 + }, + { + "epoch": 4.67, + "grad_norm": 1.5625, + "learning_rate": 0.0004707327117211898, + "loss": 0.2757, + "step": 112760 + }, + { + "epoch": 4.67, + "grad_norm": 0.78515625, + "learning_rate": 0.0004707276196613844, + "loss": 0.1852, + "step": 112770 + }, + { + "epoch": 4.67, + "grad_norm": 0.59375, + "learning_rate": 0.0004707225271861935, + "loss": 0.2102, + "step": 112780 + }, + { + "epoch": 4.67, + "grad_norm": 0.5703125, + "learning_rate": 0.00047071743429562653, + "loss": 0.2286, + "step": 112790 + }, + { + "epoch": 4.67, + "grad_norm": 1.0703125, + "learning_rate": 0.0004707123409896932, + "loss": 0.1926, + "step": 112800 + }, + { + "epoch": 4.67, + "grad_norm": 0.8203125, + "learning_rate": 0.0004707072472684032, + "loss": 0.1979, + "step": 112810 + }, + { + "epoch": 4.67, + "grad_norm": 0.337890625, + "learning_rate": 0.0004707021531317659, + "loss": 0.1895, + "step": 112820 + }, + { + "epoch": 4.67, + "grad_norm": 0.40625, + "learning_rate": 0.00047069705857979093, + "loss": 0.2358, + "step": 112830 + }, + { + "epoch": 4.67, + "grad_norm": 0.82421875, + "learning_rate": 0.000470691963612488, + "loss": 0.197, + "step": 112840 + }, + { + "epoch": 4.67, + "grad_norm": 0.88671875, + "learning_rate": 0.0004706868682298667, + "loss": 0.2463, + "step": 112850 + }, + { + "epoch": 4.67, + "grad_norm": 0.69140625, + "learning_rate": 0.0004706817724319364, + "loss": 0.2505, + "step": 112860 + }, + { + "epoch": 4.68, + "grad_norm": 0.81640625, + "learning_rate": 0.0004706766762187069, + "loss": 0.2264, + "step": 112870 + }, + { + "epoch": 4.68, + "grad_norm": 0.62890625, + "learning_rate": 0.0004706715795901878, + "loss": 0.2084, + "step": 112880 + }, + { + "epoch": 4.68, + "grad_norm": 0.51171875, + "learning_rate": 0.00047066648254638854, + "loss": 0.2518, + "step": 112890 + }, + { + "epoch": 4.68, + "grad_norm": 0.41015625, + "learning_rate": 0.00047066138508731884, + "loss": 0.1396, + "step": 112900 + }, + { + "epoch": 4.68, + "grad_norm": 0.90625, + "learning_rate": 0.00047065628721298816, + "loss": 0.2601, + "step": 112910 + }, + { + "epoch": 4.68, + "grad_norm": 0.404296875, + "learning_rate": 0.0004706511889234062, + "loss": 0.169, + "step": 112920 + }, + { + "epoch": 4.68, + "grad_norm": 0.6875, + "learning_rate": 0.0004706460902185825, + "loss": 0.1772, + "step": 112930 + }, + { + "epoch": 4.68, + "grad_norm": 0.92578125, + "learning_rate": 0.00047064099109852674, + "loss": 0.2158, + "step": 112940 + }, + { + "epoch": 4.68, + "grad_norm": 1.8203125, + "learning_rate": 0.0004706358915632485, + "loss": 0.1815, + "step": 112950 + }, + { + "epoch": 4.68, + "grad_norm": 0.765625, + "learning_rate": 0.0004706307916127573, + "loss": 0.2114, + "step": 112960 + }, + { + "epoch": 4.68, + "grad_norm": 0.62890625, + "learning_rate": 0.0004706256912470628, + "loss": 0.2683, + "step": 112970 + }, + { + "epoch": 4.68, + "grad_norm": 1.3359375, + "learning_rate": 0.0004706205904661745, + "loss": 0.1864, + "step": 112980 + }, + { + "epoch": 4.68, + "grad_norm": 1.421875, + "learning_rate": 0.0004706154892701021, + "loss": 0.2409, + "step": 112990 + }, + { + "epoch": 4.68, + "grad_norm": 0.80078125, + "learning_rate": 0.0004706103876588552, + "loss": 0.237, + "step": 113000 + }, + { + "epoch": 4.68, + "grad_norm": 1.1171875, + "learning_rate": 0.00047060528563244345, + "loss": 0.2494, + "step": 113010 + }, + { + "epoch": 4.68, + "grad_norm": 0.8125, + "learning_rate": 0.00047060018319087626, + "loss": 0.1467, + "step": 113020 + }, + { + "epoch": 4.68, + "grad_norm": 0.71875, + "learning_rate": 0.0004705950803341634, + "loss": 0.2251, + "step": 113030 + }, + { + "epoch": 4.68, + "grad_norm": 0.376953125, + "learning_rate": 0.0004705899770623144, + "loss": 0.2614, + "step": 113040 + }, + { + "epoch": 4.68, + "grad_norm": 1.140625, + "learning_rate": 0.00047058487337533887, + "loss": 0.2373, + "step": 113050 + }, + { + "epoch": 4.68, + "grad_norm": 0.4609375, + "learning_rate": 0.0004705797692732464, + "loss": 0.1954, + "step": 113060 + }, + { + "epoch": 4.68, + "grad_norm": 0.90234375, + "learning_rate": 0.00047057466475604673, + "loss": 0.2323, + "step": 113070 + }, + { + "epoch": 4.68, + "grad_norm": 0.2431640625, + "learning_rate": 0.00047056955982374926, + "loss": 0.2281, + "step": 113080 + }, + { + "epoch": 4.68, + "grad_norm": 0.7421875, + "learning_rate": 0.00047056445447636374, + "loss": 0.2493, + "step": 113090 + }, + { + "epoch": 4.68, + "grad_norm": 0.546875, + "learning_rate": 0.0004705593487138998, + "loss": 0.2345, + "step": 113100 + }, + { + "epoch": 4.69, + "grad_norm": 0.50390625, + "learning_rate": 0.0004705542425363669, + "loss": 0.2203, + "step": 113110 + }, + { + "epoch": 4.69, + "grad_norm": 1.453125, + "learning_rate": 0.00047054913594377475, + "loss": 0.2022, + "step": 113120 + }, + { + "epoch": 4.69, + "grad_norm": 0.85546875, + "learning_rate": 0.0004705440289361329, + "loss": 0.2295, + "step": 113130 + }, + { + "epoch": 4.69, + "grad_norm": 0.376953125, + "learning_rate": 0.0004705389215134511, + "loss": 0.2043, + "step": 113140 + }, + { + "epoch": 4.69, + "grad_norm": 0.71484375, + "learning_rate": 0.0004705338136757388, + "loss": 0.2068, + "step": 113150 + }, + { + "epoch": 4.69, + "grad_norm": 0.6015625, + "learning_rate": 0.00047052870542300577, + "loss": 0.2779, + "step": 113160 + }, + { + "epoch": 4.69, + "grad_norm": 0.51171875, + "learning_rate": 0.00047052359675526134, + "loss": 0.2252, + "step": 113170 + }, + { + "epoch": 4.69, + "grad_norm": 0.59375, + "learning_rate": 0.0004705184876725155, + "loss": 0.2107, + "step": 113180 + }, + { + "epoch": 4.69, + "grad_norm": 0.5390625, + "learning_rate": 0.00047051337817477756, + "loss": 0.2366, + "step": 113190 + }, + { + "epoch": 4.69, + "grad_norm": 0.34765625, + "learning_rate": 0.00047050826826205733, + "loss": 0.1925, + "step": 113200 + }, + { + "epoch": 4.69, + "grad_norm": 0.2490234375, + "learning_rate": 0.00047050315793436436, + "loss": 0.1769, + "step": 113210 + }, + { + "epoch": 4.69, + "grad_norm": 0.83203125, + "learning_rate": 0.0004704980471917082, + "loss": 0.218, + "step": 113220 + }, + { + "epoch": 4.69, + "grad_norm": 0.45703125, + "learning_rate": 0.0004704929360340986, + "loss": 0.1971, + "step": 113230 + }, + { + "epoch": 4.69, + "grad_norm": 3.234375, + "learning_rate": 0.00047048782446154505, + "loss": 0.2147, + "step": 113240 + }, + { + "epoch": 4.69, + "grad_norm": 0.6953125, + "learning_rate": 0.0004704827124740572, + "loss": 0.2237, + "step": 113250 + }, + { + "epoch": 4.69, + "grad_norm": 0.353515625, + "learning_rate": 0.00047047760007164473, + "loss": 0.1795, + "step": 113260 + }, + { + "epoch": 4.69, + "grad_norm": 1.1796875, + "learning_rate": 0.00047047248725431723, + "loss": 0.241, + "step": 113270 + }, + { + "epoch": 4.69, + "grad_norm": 0.9375, + "learning_rate": 0.0004704673740220843, + "loss": 0.2238, + "step": 113280 + }, + { + "epoch": 4.69, + "grad_norm": 0.703125, + "learning_rate": 0.00047046226037495564, + "loss": 0.2072, + "step": 113290 + }, + { + "epoch": 4.69, + "grad_norm": 0.318359375, + "learning_rate": 0.0004704571463129408, + "loss": 0.2089, + "step": 113300 + }, + { + "epoch": 4.69, + "grad_norm": 1.59375, + "learning_rate": 0.00047045203183604937, + "loss": 0.2248, + "step": 113310 + }, + { + "epoch": 4.69, + "grad_norm": 0.333984375, + "learning_rate": 0.00047044691694429097, + "loss": 0.1833, + "step": 113320 + }, + { + "epoch": 4.69, + "grad_norm": 0.56640625, + "learning_rate": 0.0004704418016376754, + "loss": 0.198, + "step": 113330 + }, + { + "epoch": 4.69, + "grad_norm": 0.59375, + "learning_rate": 0.00047043668591621214, + "loss": 0.2345, + "step": 113340 + }, + { + "epoch": 4.69, + "grad_norm": 0.3125, + "learning_rate": 0.0004704315697799108, + "loss": 0.1937, + "step": 113350 + }, + { + "epoch": 4.7, + "grad_norm": 0.0, + "learning_rate": 0.0004704264532287811, + "loss": 0.2046, + "step": 113360 + }, + { + "epoch": 4.7, + "grad_norm": 0.6875, + "learning_rate": 0.0004704213362628326, + "loss": 0.2045, + "step": 113370 + }, + { + "epoch": 4.7, + "grad_norm": 0.94140625, + "learning_rate": 0.0004704162188820749, + "loss": 0.2056, + "step": 113380 + }, + { + "epoch": 4.7, + "grad_norm": 0.921875, + "learning_rate": 0.0004704111010865177, + "loss": 0.1544, + "step": 113390 + }, + { + "epoch": 4.7, + "grad_norm": 0.37890625, + "learning_rate": 0.00047040598287617066, + "loss": 0.1823, + "step": 113400 + }, + { + "epoch": 4.7, + "grad_norm": 0.490234375, + "learning_rate": 0.0004704008642510433, + "loss": 0.1757, + "step": 113410 + }, + { + "epoch": 4.7, + "grad_norm": 0.52734375, + "learning_rate": 0.00047039574521114537, + "loss": 0.1845, + "step": 113420 + }, + { + "epoch": 4.7, + "grad_norm": 0.91015625, + "learning_rate": 0.0004703906257564864, + "loss": 0.2077, + "step": 113430 + }, + { + "epoch": 4.7, + "grad_norm": 0.390625, + "learning_rate": 0.0004703855058870761, + "loss": 0.1519, + "step": 113440 + }, + { + "epoch": 4.7, + "grad_norm": 0.2734375, + "learning_rate": 0.00047038038560292405, + "loss": 0.1883, + "step": 113450 + }, + { + "epoch": 4.7, + "grad_norm": 0.70703125, + "learning_rate": 0.00047037526490403993, + "loss": 0.2, + "step": 113460 + }, + { + "epoch": 4.7, + "grad_norm": 0.5625, + "learning_rate": 0.00047037014379043335, + "loss": 0.2291, + "step": 113470 + }, + { + "epoch": 4.7, + "grad_norm": 0.41015625, + "learning_rate": 0.00047036502226211396, + "loss": 0.2189, + "step": 113480 + }, + { + "epoch": 4.7, + "grad_norm": 0.515625, + "learning_rate": 0.0004703599003190914, + "loss": 0.2217, + "step": 113490 + }, + { + "epoch": 4.7, + "grad_norm": 0.91015625, + "learning_rate": 0.0004703547779613753, + "loss": 0.2021, + "step": 113500 + }, + { + "epoch": 4.7, + "grad_norm": 0.35546875, + "learning_rate": 0.00047034965518897535, + "loss": 0.1875, + "step": 113510 + }, + { + "epoch": 4.7, + "grad_norm": 0.40625, + "learning_rate": 0.00047034453200190106, + "loss": 0.227, + "step": 113520 + }, + { + "epoch": 4.7, + "grad_norm": 0.80859375, + "learning_rate": 0.0004703394084001622, + "loss": 0.2387, + "step": 113530 + }, + { + "epoch": 4.7, + "grad_norm": 0.96484375, + "learning_rate": 0.0004703342843837684, + "loss": 0.2203, + "step": 113540 + }, + { + "epoch": 4.7, + "grad_norm": 1.15625, + "learning_rate": 0.0004703291599527293, + "loss": 0.1803, + "step": 113550 + }, + { + "epoch": 4.7, + "grad_norm": 1.0234375, + "learning_rate": 0.0004703240351070544, + "loss": 0.1642, + "step": 113560 + }, + { + "epoch": 4.7, + "grad_norm": 0.5859375, + "learning_rate": 0.0004703189098467535, + "loss": 0.1696, + "step": 113570 + }, + { + "epoch": 4.7, + "grad_norm": 1.203125, + "learning_rate": 0.0004703137841718362, + "loss": 0.2215, + "step": 113580 + }, + { + "epoch": 4.7, + "grad_norm": 0.78125, + "learning_rate": 0.00047030865808231223, + "loss": 0.2555, + "step": 113590 + }, + { + "epoch": 4.71, + "grad_norm": 1.5859375, + "learning_rate": 0.00047030353157819107, + "loss": 0.2781, + "step": 113600 + }, + { + "epoch": 4.71, + "grad_norm": 0.66015625, + "learning_rate": 0.0004702984046594825, + "loss": 0.2434, + "step": 113610 + }, + { + "epoch": 4.71, + "grad_norm": 0.44921875, + "learning_rate": 0.00047029327732619607, + "loss": 0.1755, + "step": 113620 + }, + { + "epoch": 4.71, + "grad_norm": 0.310546875, + "learning_rate": 0.00047028814957834154, + "loss": 0.1881, + "step": 113630 + }, + { + "epoch": 4.71, + "grad_norm": 0.56640625, + "learning_rate": 0.0004702830214159285, + "loss": 0.2047, + "step": 113640 + }, + { + "epoch": 4.71, + "grad_norm": 0.388671875, + "learning_rate": 0.0004702778928389666, + "loss": 0.2168, + "step": 113650 + }, + { + "epoch": 4.71, + "grad_norm": 0.486328125, + "learning_rate": 0.0004702727638474654, + "loss": 0.2041, + "step": 113660 + }, + { + "epoch": 4.71, + "grad_norm": 0.6328125, + "learning_rate": 0.00047026763444143473, + "loss": 0.2, + "step": 113670 + }, + { + "epoch": 4.71, + "grad_norm": 1.1640625, + "learning_rate": 0.00047026250462088413, + "loss": 0.2583, + "step": 113680 + }, + { + "epoch": 4.71, + "grad_norm": 0.412109375, + "learning_rate": 0.00047025737438582336, + "loss": 0.1978, + "step": 113690 + }, + { + "epoch": 4.71, + "grad_norm": 0.6484375, + "learning_rate": 0.0004702522437362619, + "loss": 0.2277, + "step": 113700 + }, + { + "epoch": 4.71, + "grad_norm": 0.62890625, + "learning_rate": 0.00047024711267220953, + "loss": 0.1936, + "step": 113710 + }, + { + "epoch": 4.71, + "grad_norm": 0.5546875, + "learning_rate": 0.0004702419811936759, + "loss": 0.288, + "step": 113720 + }, + { + "epoch": 4.71, + "grad_norm": 0.57421875, + "learning_rate": 0.00047023684930067066, + "loss": 0.1741, + "step": 113730 + }, + { + "epoch": 4.71, + "grad_norm": 0.6328125, + "learning_rate": 0.00047023171699320346, + "loss": 0.1738, + "step": 113740 + }, + { + "epoch": 4.71, + "grad_norm": 0.2099609375, + "learning_rate": 0.0004702265842712839, + "loss": 0.1381, + "step": 113750 + }, + { + "epoch": 4.71, + "grad_norm": 0.333984375, + "learning_rate": 0.00047022145113492173, + "loss": 0.2252, + "step": 113760 + }, + { + "epoch": 4.71, + "grad_norm": 0.73828125, + "learning_rate": 0.0004702163175841265, + "loss": 0.198, + "step": 113770 + }, + { + "epoch": 4.71, + "grad_norm": 0.83203125, + "learning_rate": 0.000470211183618908, + "loss": 0.2186, + "step": 113780 + }, + { + "epoch": 4.71, + "grad_norm": 1.3359375, + "learning_rate": 0.00047020604923927583, + "loss": 0.2053, + "step": 113790 + }, + { + "epoch": 4.71, + "grad_norm": 1.03125, + "learning_rate": 0.0004702009144452397, + "loss": 0.2491, + "step": 113800 + }, + { + "epoch": 4.71, + "grad_norm": 0.59765625, + "learning_rate": 0.0004701957792368091, + "loss": 0.2092, + "step": 113810 + }, + { + "epoch": 4.71, + "grad_norm": 0.60546875, + "learning_rate": 0.00047019064361399396, + "loss": 0.2609, + "step": 113820 + }, + { + "epoch": 4.71, + "grad_norm": 0.98046875, + "learning_rate": 0.00047018550757680375, + "loss": 0.1833, + "step": 113830 + }, + { + "epoch": 4.72, + "grad_norm": 0.578125, + "learning_rate": 0.00047018037112524816, + "loss": 0.2394, + "step": 113840 + }, + { + "epoch": 4.72, + "grad_norm": 0.431640625, + "learning_rate": 0.00047017523425933695, + "loss": 0.2077, + "step": 113850 + }, + { + "epoch": 4.72, + "grad_norm": 0.5703125, + "learning_rate": 0.00047017009697907967, + "loss": 0.1968, + "step": 113860 + }, + { + "epoch": 4.72, + "grad_norm": 0.59765625, + "learning_rate": 0.0004701649592844861, + "loss": 0.1914, + "step": 113870 + }, + { + "epoch": 4.72, + "grad_norm": 0.439453125, + "learning_rate": 0.00047015982117556575, + "loss": 0.1844, + "step": 113880 + }, + { + "epoch": 4.72, + "grad_norm": 1.3125, + "learning_rate": 0.0004701546826523285, + "loss": 0.2172, + "step": 113890 + }, + { + "epoch": 4.72, + "grad_norm": 0.8359375, + "learning_rate": 0.0004701495437147839, + "loss": 0.2176, + "step": 113900 + }, + { + "epoch": 4.72, + "grad_norm": 0.345703125, + "learning_rate": 0.0004701444043629416, + "loss": 0.1878, + "step": 113910 + }, + { + "epoch": 4.72, + "grad_norm": 1.0546875, + "learning_rate": 0.0004701392645968113, + "loss": 0.2098, + "step": 113920 + }, + { + "epoch": 4.72, + "grad_norm": 0.7265625, + "learning_rate": 0.00047013412441640267, + "loss": 0.1959, + "step": 113930 + }, + { + "epoch": 4.72, + "grad_norm": 1.0546875, + "learning_rate": 0.0004701289838217254, + "loss": 0.2141, + "step": 113940 + }, + { + "epoch": 4.72, + "grad_norm": 0.71875, + "learning_rate": 0.0004701238428127892, + "loss": 0.1864, + "step": 113950 + }, + { + "epoch": 4.72, + "grad_norm": 0.53515625, + "learning_rate": 0.0004701187013896037, + "loss": 0.1879, + "step": 113960 + }, + { + "epoch": 4.72, + "grad_norm": 0.3671875, + "learning_rate": 0.0004701135595521785, + "loss": 0.1545, + "step": 113970 + }, + { + "epoch": 4.72, + "grad_norm": 0.9921875, + "learning_rate": 0.0004701084173005234, + "loss": 0.2791, + "step": 113980 + }, + { + "epoch": 4.72, + "grad_norm": 0.71484375, + "learning_rate": 0.000470103274634648, + "loss": 0.2172, + "step": 113990 + }, + { + "epoch": 4.72, + "grad_norm": 0.71484375, + "learning_rate": 0.00047009813155456207, + "loss": 0.1984, + "step": 114000 + }, + { + "epoch": 4.72, + "grad_norm": 0.58203125, + "learning_rate": 0.0004700929880602751, + "loss": 0.1827, + "step": 114010 + }, + { + "epoch": 4.72, + "grad_norm": 0.6875, + "learning_rate": 0.0004700878441517971, + "loss": 0.217, + "step": 114020 + }, + { + "epoch": 4.72, + "grad_norm": 0.86328125, + "learning_rate": 0.0004700826998291373, + "loss": 0.1885, + "step": 114030 + }, + { + "epoch": 4.72, + "grad_norm": 0.625, + "learning_rate": 0.0004700775550923058, + "loss": 0.1611, + "step": 114040 + }, + { + "epoch": 4.72, + "grad_norm": 0.2294921875, + "learning_rate": 0.00047007240994131205, + "loss": 0.2048, + "step": 114050 + }, + { + "epoch": 4.72, + "grad_norm": 0.60546875, + "learning_rate": 0.00047006726437616577, + "loss": 0.2616, + "step": 114060 + }, + { + "epoch": 4.72, + "grad_norm": 0.466796875, + "learning_rate": 0.00047006211839687676, + "loss": 0.2202, + "step": 114070 + }, + { + "epoch": 4.73, + "grad_norm": 0.671875, + "learning_rate": 0.0004700569720034545, + "loss": 0.1675, + "step": 114080 + }, + { + "epoch": 4.73, + "grad_norm": 0.330078125, + "learning_rate": 0.0004700518251959088, + "loss": 0.2042, + "step": 114090 + }, + { + "epoch": 4.73, + "grad_norm": 1.0390625, + "learning_rate": 0.0004700466779742494, + "loss": 0.2011, + "step": 114100 + }, + { + "epoch": 4.73, + "grad_norm": 0.6640625, + "learning_rate": 0.00047004153033848584, + "loss": 0.1648, + "step": 114110 + }, + { + "epoch": 4.73, + "grad_norm": 1.2109375, + "learning_rate": 0.0004700363822886279, + "loss": 0.1881, + "step": 114120 + }, + { + "epoch": 4.73, + "grad_norm": 2.671875, + "learning_rate": 0.0004700312338246852, + "loss": 0.2267, + "step": 114130 + }, + { + "epoch": 4.73, + "grad_norm": 0.466796875, + "learning_rate": 0.00047002608494666755, + "loss": 0.2426, + "step": 114140 + }, + { + "epoch": 4.73, + "grad_norm": 0.640625, + "learning_rate": 0.0004700209356545846, + "loss": 0.2339, + "step": 114150 + }, + { + "epoch": 4.73, + "grad_norm": 0.41796875, + "learning_rate": 0.0004700157859484459, + "loss": 0.197, + "step": 114160 + }, + { + "epoch": 4.73, + "grad_norm": 0.56640625, + "learning_rate": 0.00047001063582826133, + "loss": 0.2303, + "step": 114170 + }, + { + "epoch": 4.73, + "grad_norm": 0.259765625, + "learning_rate": 0.00047000548529404053, + "loss": 0.2006, + "step": 114180 + }, + { + "epoch": 4.73, + "grad_norm": 1.0625, + "learning_rate": 0.000470000334345793, + "loss": 0.2575, + "step": 114190 + }, + { + "epoch": 4.73, + "grad_norm": 1.390625, + "learning_rate": 0.0004699951829835288, + "loss": 0.2292, + "step": 114200 + }, + { + "epoch": 4.73, + "grad_norm": 0.7734375, + "learning_rate": 0.00046999003120725736, + "loss": 0.2396, + "step": 114210 + }, + { + "epoch": 4.73, + "grad_norm": 1.171875, + "learning_rate": 0.0004699848790169884, + "loss": 0.2143, + "step": 114220 + }, + { + "epoch": 4.73, + "grad_norm": 0.41796875, + "learning_rate": 0.0004699797264127317, + "loss": 0.2293, + "step": 114230 + }, + { + "epoch": 4.73, + "grad_norm": 0.96484375, + "learning_rate": 0.0004699745733944968, + "loss": 0.234, + "step": 114240 + }, + { + "epoch": 4.73, + "grad_norm": 1.53125, + "learning_rate": 0.00046996941996229366, + "loss": 0.2432, + "step": 114250 + }, + { + "epoch": 4.73, + "grad_norm": 0.6484375, + "learning_rate": 0.00046996426611613175, + "loss": 0.2016, + "step": 114260 + }, + { + "epoch": 4.73, + "grad_norm": 0.359375, + "learning_rate": 0.0004699591118560208, + "loss": 0.1594, + "step": 114270 + }, + { + "epoch": 4.73, + "grad_norm": 0.5234375, + "learning_rate": 0.0004699539571819706, + "loss": 0.2395, + "step": 114280 + }, + { + "epoch": 4.73, + "grad_norm": 1.203125, + "learning_rate": 0.0004699488020939908, + "loss": 0.1551, + "step": 114290 + }, + { + "epoch": 4.73, + "grad_norm": 1.046875, + "learning_rate": 0.0004699436465920912, + "loss": 0.2611, + "step": 114300 + }, + { + "epoch": 4.73, + "grad_norm": 0.68359375, + "learning_rate": 0.00046993849067628133, + "loss": 0.2615, + "step": 114310 + }, + { + "epoch": 4.74, + "grad_norm": 0.470703125, + "learning_rate": 0.00046993333434657095, + "loss": 0.1794, + "step": 114320 + }, + { + "epoch": 4.74, + "grad_norm": 0.18359375, + "learning_rate": 0.0004699281776029698, + "loss": 0.1922, + "step": 114330 + }, + { + "epoch": 4.74, + "grad_norm": 0.77734375, + "learning_rate": 0.0004699230204454876, + "loss": 0.2075, + "step": 114340 + }, + { + "epoch": 4.74, + "grad_norm": 0.54296875, + "learning_rate": 0.00046991786287413394, + "loss": 0.1887, + "step": 114350 + }, + { + "epoch": 4.74, + "grad_norm": 1.8515625, + "learning_rate": 0.0004699127048889187, + "loss": 0.2443, + "step": 114360 + }, + { + "epoch": 4.74, + "grad_norm": 0.671875, + "learning_rate": 0.00046990754648985146, + "loss": 0.2185, + "step": 114370 + }, + { + "epoch": 4.74, + "grad_norm": 0.3515625, + "learning_rate": 0.00046990238767694205, + "loss": 0.2061, + "step": 114380 + }, + { + "epoch": 4.74, + "grad_norm": 1.59375, + "learning_rate": 0.0004698972284502, + "loss": 0.2163, + "step": 114390 + }, + { + "epoch": 4.74, + "grad_norm": 0.9453125, + "learning_rate": 0.0004698920688096351, + "loss": 0.1291, + "step": 114400 + }, + { + "epoch": 4.74, + "grad_norm": 1.0, + "learning_rate": 0.0004698869087552571, + "loss": 0.222, + "step": 114410 + }, + { + "epoch": 4.74, + "grad_norm": 0.72265625, + "learning_rate": 0.0004698817482870757, + "loss": 0.2046, + "step": 114420 + }, + { + "epoch": 4.74, + "grad_norm": 1.09375, + "learning_rate": 0.00046987658740510057, + "loss": 0.2007, + "step": 114430 + }, + { + "epoch": 4.74, + "grad_norm": 0.37109375, + "learning_rate": 0.0004698714261093415, + "loss": 0.2678, + "step": 114440 + }, + { + "epoch": 4.74, + "grad_norm": 1.0859375, + "learning_rate": 0.0004698662643998081, + "loss": 0.2102, + "step": 114450 + }, + { + "epoch": 4.74, + "grad_norm": 1.1640625, + "learning_rate": 0.00046986110227651014, + "loss": 0.1866, + "step": 114460 + }, + { + "epoch": 4.74, + "grad_norm": 1.0, + "learning_rate": 0.00046985593973945735, + "loss": 0.1831, + "step": 114470 + }, + { + "epoch": 4.74, + "grad_norm": 0.75, + "learning_rate": 0.0004698507767886594, + "loss": 0.2123, + "step": 114480 + }, + { + "epoch": 4.74, + "grad_norm": 0.5859375, + "learning_rate": 0.00046984561342412604, + "loss": 0.19, + "step": 114490 + }, + { + "epoch": 4.74, + "grad_norm": 0.64453125, + "learning_rate": 0.000469840449645867, + "loss": 0.187, + "step": 114500 + }, + { + "epoch": 4.74, + "grad_norm": 0.3984375, + "learning_rate": 0.0004698352854538919, + "loss": 0.2209, + "step": 114510 + }, + { + "epoch": 4.74, + "grad_norm": 1.203125, + "learning_rate": 0.00046983012084821064, + "loss": 0.1937, + "step": 114520 + }, + { + "epoch": 4.74, + "grad_norm": 0.3984375, + "learning_rate": 0.0004698249558288328, + "loss": 0.2259, + "step": 114530 + }, + { + "epoch": 4.74, + "grad_norm": 1.1328125, + "learning_rate": 0.00046981979039576805, + "loss": 0.1887, + "step": 114540 + }, + { + "epoch": 4.74, + "grad_norm": 0.6171875, + "learning_rate": 0.00046981462454902625, + "loss": 0.1885, + "step": 114550 + }, + { + "epoch": 4.75, + "grad_norm": 0.68359375, + "learning_rate": 0.0004698094582886171, + "loss": 0.2073, + "step": 114560 + }, + { + "epoch": 4.75, + "grad_norm": 0.5859375, + "learning_rate": 0.0004698042916145502, + "loss": 0.2232, + "step": 114570 + }, + { + "epoch": 4.75, + "grad_norm": 0.498046875, + "learning_rate": 0.00046979912452683537, + "loss": 0.2193, + "step": 114580 + }, + { + "epoch": 4.75, + "grad_norm": 1.453125, + "learning_rate": 0.00046979395702548244, + "loss": 0.224, + "step": 114590 + }, + { + "epoch": 4.75, + "grad_norm": 0.79296875, + "learning_rate": 0.0004697887891105009, + "loss": 0.2045, + "step": 114600 + }, + { + "epoch": 4.75, + "grad_norm": 1.015625, + "learning_rate": 0.00046978362078190064, + "loss": 0.177, + "step": 114610 + }, + { + "epoch": 4.75, + "grad_norm": 0.59765625, + "learning_rate": 0.0004697784520396914, + "loss": 0.2473, + "step": 114620 + }, + { + "epoch": 4.75, + "grad_norm": 0.42578125, + "learning_rate": 0.00046977328288388276, + "loss": 0.1825, + "step": 114630 + }, + { + "epoch": 4.75, + "grad_norm": 0.49609375, + "learning_rate": 0.00046976811331448455, + "loss": 0.2774, + "step": 114640 + }, + { + "epoch": 4.75, + "grad_norm": 0.74609375, + "learning_rate": 0.0004697629433315065, + "loss": 0.2189, + "step": 114650 + }, + { + "epoch": 4.75, + "grad_norm": 0.96484375, + "learning_rate": 0.0004697577729349583, + "loss": 0.2008, + "step": 114660 + }, + { + "epoch": 4.75, + "grad_norm": 0.70703125, + "learning_rate": 0.0004697526021248497, + "loss": 0.1857, + "step": 114670 + }, + { + "epoch": 4.75, + "grad_norm": 0.7890625, + "learning_rate": 0.0004697474309011905, + "loss": 0.2087, + "step": 114680 + }, + { + "epoch": 4.75, + "grad_norm": 0.765625, + "learning_rate": 0.00046974225926399026, + "loss": 0.2112, + "step": 114690 + }, + { + "epoch": 4.75, + "grad_norm": 0.5625, + "learning_rate": 0.00046973708721325893, + "loss": 0.2237, + "step": 114700 + }, + { + "epoch": 4.75, + "grad_norm": 0.57421875, + "learning_rate": 0.00046973191474900607, + "loss": 0.1584, + "step": 114710 + }, + { + "epoch": 4.75, + "grad_norm": 1.375, + "learning_rate": 0.0004697267418712415, + "loss": 0.2884, + "step": 114720 + }, + { + "epoch": 4.75, + "grad_norm": 0.78515625, + "learning_rate": 0.0004697215685799749, + "loss": 0.2236, + "step": 114730 + }, + { + "epoch": 4.75, + "grad_norm": 1.6875, + "learning_rate": 0.000469716394875216, + "loss": 0.2398, + "step": 114740 + }, + { + "epoch": 4.75, + "grad_norm": 0.73046875, + "learning_rate": 0.0004697112207569747, + "loss": 0.1867, + "step": 114750 + }, + { + "epoch": 4.75, + "grad_norm": 0.421875, + "learning_rate": 0.00046970604622526045, + "loss": 0.2372, + "step": 114760 + }, + { + "epoch": 4.75, + "grad_norm": 1.171875, + "learning_rate": 0.00046970087128008327, + "loss": 0.2246, + "step": 114770 + }, + { + "epoch": 4.75, + "grad_norm": 2.609375, + "learning_rate": 0.00046969569592145274, + "loss": 0.2561, + "step": 114780 + }, + { + "epoch": 4.75, + "grad_norm": 1.2734375, + "learning_rate": 0.00046969052014937863, + "loss": 0.2061, + "step": 114790 + }, + { + "epoch": 4.76, + "grad_norm": 0.302734375, + "learning_rate": 0.00046968534396387064, + "loss": 0.1936, + "step": 114800 + }, + { + "epoch": 4.76, + "grad_norm": 0.65234375, + "learning_rate": 0.0004696801673649387, + "loss": 0.2023, + "step": 114810 + }, + { + "epoch": 4.76, + "grad_norm": 0.5546875, + "learning_rate": 0.00046967499035259225, + "loss": 0.2266, + "step": 114820 + }, + { + "epoch": 4.76, + "grad_norm": 0.263671875, + "learning_rate": 0.0004696698129268413, + "loss": 0.1984, + "step": 114830 + }, + { + "epoch": 4.76, + "grad_norm": 0.95703125, + "learning_rate": 0.00046966463508769544, + "loss": 0.2341, + "step": 114840 + }, + { + "epoch": 4.76, + "grad_norm": 0.953125, + "learning_rate": 0.0004696594568351644, + "loss": 0.1965, + "step": 114850 + }, + { + "epoch": 4.76, + "grad_norm": 1.2890625, + "learning_rate": 0.00046965427816925804, + "loss": 0.1745, + "step": 114860 + }, + { + "epoch": 4.76, + "grad_norm": 0.7578125, + "learning_rate": 0.0004696490990899861, + "loss": 0.198, + "step": 114870 + }, + { + "epoch": 4.76, + "grad_norm": 0.52734375, + "learning_rate": 0.00046964391959735817, + "loss": 0.1929, + "step": 114880 + }, + { + "epoch": 4.76, + "grad_norm": 1.234375, + "learning_rate": 0.00046963873969138413, + "loss": 0.1902, + "step": 114890 + }, + { + "epoch": 4.76, + "grad_norm": 1.4375, + "learning_rate": 0.00046963355937207373, + "loss": 0.2282, + "step": 114900 + }, + { + "epoch": 4.76, + "grad_norm": 0.6640625, + "learning_rate": 0.00046962837863943674, + "loss": 0.155, + "step": 114910 + }, + { + "epoch": 4.76, + "grad_norm": 0.7265625, + "learning_rate": 0.0004696231974934828, + "loss": 0.1862, + "step": 114920 + }, + { + "epoch": 4.76, + "grad_norm": 0.58984375, + "learning_rate": 0.0004696180159342217, + "loss": 0.2122, + "step": 114930 + }, + { + "epoch": 4.76, + "grad_norm": 0.83984375, + "learning_rate": 0.0004696128339616632, + "loss": 0.1779, + "step": 114940 + }, + { + "epoch": 4.76, + "grad_norm": 1.1015625, + "learning_rate": 0.00046960765157581715, + "loss": 0.2215, + "step": 114950 + }, + { + "epoch": 4.76, + "grad_norm": 0.89453125, + "learning_rate": 0.00046960246877669314, + "loss": 0.1492, + "step": 114960 + }, + { + "epoch": 4.76, + "grad_norm": 0.65625, + "learning_rate": 0.00046959728556430103, + "loss": 0.1968, + "step": 114970 + }, + { + "epoch": 4.76, + "grad_norm": 0.69140625, + "learning_rate": 0.0004695921019386505, + "loss": 0.2087, + "step": 114980 + }, + { + "epoch": 4.76, + "grad_norm": 0.703125, + "learning_rate": 0.00046958691789975146, + "loss": 0.1977, + "step": 114990 + }, + { + "epoch": 4.76, + "grad_norm": 0.96484375, + "learning_rate": 0.00046958173344761346, + "loss": 0.1996, + "step": 115000 + }, + { + "epoch": 4.76, + "grad_norm": 1.6328125, + "learning_rate": 0.00046957654858224634, + "loss": 0.2285, + "step": 115010 + }, + { + "epoch": 4.76, + "grad_norm": 0.34765625, + "learning_rate": 0.0004695713633036599, + "loss": 0.2158, + "step": 115020 + }, + { + "epoch": 4.76, + "grad_norm": 0.5546875, + "learning_rate": 0.0004695661776118639, + "loss": 0.1649, + "step": 115030 + }, + { + "epoch": 4.76, + "grad_norm": 1.4765625, + "learning_rate": 0.000469560991506868, + "loss": 0.228, + "step": 115040 + }, + { + "epoch": 4.77, + "grad_norm": 1.2421875, + "learning_rate": 0.0004695558049886821, + "loss": 0.229, + "step": 115050 + }, + { + "epoch": 4.77, + "grad_norm": 1.1328125, + "learning_rate": 0.0004695506180573158, + "loss": 0.187, + "step": 115060 + }, + { + "epoch": 4.77, + "grad_norm": 0.71484375, + "learning_rate": 0.000469545430712779, + "loss": 0.2496, + "step": 115070 + }, + { + "epoch": 4.77, + "grad_norm": 1.09375, + "learning_rate": 0.00046954024295508135, + "loss": 0.2229, + "step": 115080 + }, + { + "epoch": 4.77, + "grad_norm": 0.6796875, + "learning_rate": 0.00046953505478423274, + "loss": 0.1689, + "step": 115090 + }, + { + "epoch": 4.77, + "grad_norm": 0.65234375, + "learning_rate": 0.0004695298662002429, + "loss": 0.2516, + "step": 115100 + }, + { + "epoch": 4.77, + "grad_norm": 0.62109375, + "learning_rate": 0.0004695246772031214, + "loss": 0.2247, + "step": 115110 + }, + { + "epoch": 4.77, + "grad_norm": 0.64453125, + "learning_rate": 0.00046951948779287825, + "loss": 0.1792, + "step": 115120 + }, + { + "epoch": 4.77, + "grad_norm": 0.306640625, + "learning_rate": 0.00046951429796952316, + "loss": 0.203, + "step": 115130 + }, + { + "epoch": 4.77, + "grad_norm": 0.48828125, + "learning_rate": 0.00046950910773306586, + "loss": 0.2406, + "step": 115140 + }, + { + "epoch": 4.77, + "grad_norm": 0.162109375, + "learning_rate": 0.00046950391708351614, + "loss": 0.2776, + "step": 115150 + }, + { + "epoch": 4.77, + "grad_norm": 1.7734375, + "learning_rate": 0.00046949872602088365, + "loss": 0.2641, + "step": 115160 + }, + { + "epoch": 4.77, + "grad_norm": 0.3984375, + "learning_rate": 0.00046949353454517833, + "loss": 0.1943, + "step": 115170 + }, + { + "epoch": 4.77, + "grad_norm": 0.765625, + "learning_rate": 0.0004694883426564099, + "loss": 0.2224, + "step": 115180 + }, + { + "epoch": 4.77, + "grad_norm": 0.46484375, + "learning_rate": 0.00046948315035458813, + "loss": 0.2548, + "step": 115190 + }, + { + "epoch": 4.77, + "grad_norm": 0.435546875, + "learning_rate": 0.0004694779576397227, + "loss": 0.231, + "step": 115200 + }, + { + "epoch": 4.77, + "grad_norm": 0.81640625, + "learning_rate": 0.00046947276451182355, + "loss": 0.255, + "step": 115210 + }, + { + "epoch": 4.77, + "grad_norm": 0.5390625, + "learning_rate": 0.0004694675709709003, + "loss": 0.2264, + "step": 115220 + }, + { + "epoch": 4.77, + "grad_norm": 0.248046875, + "learning_rate": 0.00046946237701696276, + "loss": 0.1444, + "step": 115230 + }, + { + "epoch": 4.77, + "grad_norm": 0.70703125, + "learning_rate": 0.00046945718265002076, + "loss": 0.2224, + "step": 115240 + }, + { + "epoch": 4.77, + "grad_norm": 0.59765625, + "learning_rate": 0.00046945198787008404, + "loss": 0.1625, + "step": 115250 + }, + { + "epoch": 4.77, + "grad_norm": 0.625, + "learning_rate": 0.00046944679267716234, + "loss": 0.224, + "step": 115260 + }, + { + "epoch": 4.77, + "grad_norm": 2.078125, + "learning_rate": 0.00046944159707126555, + "loss": 0.1537, + "step": 115270 + }, + { + "epoch": 4.77, + "grad_norm": 0.45703125, + "learning_rate": 0.00046943640105240325, + "loss": 0.2133, + "step": 115280 + }, + { + "epoch": 4.78, + "grad_norm": 0.88671875, + "learning_rate": 0.0004694312046205855, + "loss": 0.2442, + "step": 115290 + }, + { + "epoch": 4.78, + "grad_norm": 0.37890625, + "learning_rate": 0.00046942600777582176, + "loss": 0.2682, + "step": 115300 + }, + { + "epoch": 4.78, + "grad_norm": 0.53125, + "learning_rate": 0.0004694208105181221, + "loss": 0.2113, + "step": 115310 + }, + { + "epoch": 4.78, + "grad_norm": 0.703125, + "learning_rate": 0.0004694156128474961, + "loss": 0.1845, + "step": 115320 + }, + { + "epoch": 4.78, + "grad_norm": 0.33203125, + "learning_rate": 0.0004694104147639536, + "loss": 0.2009, + "step": 115330 + }, + { + "epoch": 4.78, + "grad_norm": 0.68359375, + "learning_rate": 0.00046940521626750444, + "loss": 0.1794, + "step": 115340 + }, + { + "epoch": 4.78, + "grad_norm": 0.0, + "learning_rate": 0.00046940001735815834, + "loss": 0.1429, + "step": 115350 + }, + { + "epoch": 4.78, + "grad_norm": 0.67578125, + "learning_rate": 0.0004693948180359251, + "loss": 0.155, + "step": 115360 + }, + { + "epoch": 4.78, + "grad_norm": 0.55078125, + "learning_rate": 0.0004693896183008145, + "loss": 0.1807, + "step": 115370 + }, + { + "epoch": 4.78, + "grad_norm": 0.51171875, + "learning_rate": 0.00046938441815283635, + "loss": 0.2319, + "step": 115380 + }, + { + "epoch": 4.78, + "grad_norm": 0.5390625, + "learning_rate": 0.00046937921759200043, + "loss": 0.2317, + "step": 115390 + }, + { + "epoch": 4.78, + "grad_norm": 0.6171875, + "learning_rate": 0.0004693740166183165, + "loss": 0.1932, + "step": 115400 + }, + { + "epoch": 4.78, + "grad_norm": 1.4453125, + "learning_rate": 0.0004693688152317943, + "loss": 0.1443, + "step": 115410 + }, + { + "epoch": 4.78, + "grad_norm": 0.7578125, + "learning_rate": 0.0004693636134324437, + "loss": 0.2436, + "step": 115420 + }, + { + "epoch": 4.78, + "grad_norm": 0.61328125, + "learning_rate": 0.0004693584112202745, + "loss": 0.262, + "step": 115430 + }, + { + "epoch": 4.78, + "grad_norm": 0.73046875, + "learning_rate": 0.0004693532085952965, + "loss": 0.2206, + "step": 115440 + }, + { + "epoch": 4.78, + "grad_norm": 0.5625, + "learning_rate": 0.00046934800555751936, + "loss": 0.2043, + "step": 115450 + }, + { + "epoch": 4.78, + "grad_norm": 0.703125, + "learning_rate": 0.000469342802106953, + "loss": 0.1839, + "step": 115460 + }, + { + "epoch": 4.78, + "grad_norm": 1.53125, + "learning_rate": 0.0004693375982436072, + "loss": 0.1163, + "step": 115470 + }, + { + "epoch": 4.78, + "grad_norm": 0.9921875, + "learning_rate": 0.0004693323939674917, + "loss": 0.2352, + "step": 115480 + }, + { + "epoch": 4.78, + "grad_norm": 0.41015625, + "learning_rate": 0.0004693271892786163, + "loss": 0.1939, + "step": 115490 + }, + { + "epoch": 4.78, + "grad_norm": 0.8125, + "learning_rate": 0.00046932198417699085, + "loss": 0.1986, + "step": 115500 + }, + { + "epoch": 4.78, + "grad_norm": 1.4765625, + "learning_rate": 0.0004693167786626251, + "loss": 0.2708, + "step": 115510 + }, + { + "epoch": 4.78, + "grad_norm": 0.65234375, + "learning_rate": 0.00046931157273552885, + "loss": 0.2458, + "step": 115520 + }, + { + "epoch": 4.79, + "grad_norm": 1.1171875, + "learning_rate": 0.0004693063663957119, + "loss": 0.1943, + "step": 115530 + }, + { + "epoch": 4.79, + "grad_norm": 1.0703125, + "learning_rate": 0.0004693011596431841, + "loss": 0.1812, + "step": 115540 + }, + { + "epoch": 4.79, + "grad_norm": 0.98046875, + "learning_rate": 0.00046929595247795507, + "loss": 0.1864, + "step": 115550 + }, + { + "epoch": 4.79, + "grad_norm": 1.09375, + "learning_rate": 0.00046929074490003486, + "loss": 0.2393, + "step": 115560 + }, + { + "epoch": 4.79, + "grad_norm": 0.8515625, + "learning_rate": 0.00046928553690943315, + "loss": 0.2235, + "step": 115570 + }, + { + "epoch": 4.79, + "grad_norm": 0.80078125, + "learning_rate": 0.0004692803285061597, + "loss": 0.2188, + "step": 115580 + }, + { + "epoch": 4.79, + "grad_norm": 0.6015625, + "learning_rate": 0.00046927511969022443, + "loss": 0.1561, + "step": 115590 + }, + { + "epoch": 4.79, + "grad_norm": 0.62890625, + "learning_rate": 0.00046926991046163693, + "loss": 0.2063, + "step": 115600 + }, + { + "epoch": 4.79, + "grad_norm": 1.25, + "learning_rate": 0.00046926470082040724, + "loss": 0.2052, + "step": 115610 + }, + { + "epoch": 4.79, + "grad_norm": 0.0, + "learning_rate": 0.000469259490766545, + "loss": 0.224, + "step": 115620 + }, + { + "epoch": 4.79, + "grad_norm": 0.8671875, + "learning_rate": 0.00046925428030006013, + "loss": 0.1249, + "step": 115630 + }, + { + "epoch": 4.79, + "grad_norm": 0.94921875, + "learning_rate": 0.00046924906942096234, + "loss": 0.2412, + "step": 115640 + }, + { + "epoch": 4.79, + "grad_norm": 0.4765625, + "learning_rate": 0.0004692438581292615, + "loss": 0.2189, + "step": 115650 + }, + { + "epoch": 4.79, + "grad_norm": 0.359375, + "learning_rate": 0.0004692386464249674, + "loss": 0.1329, + "step": 115660 + }, + { + "epoch": 4.79, + "grad_norm": 0.77734375, + "learning_rate": 0.0004692334343080898, + "loss": 0.233, + "step": 115670 + }, + { + "epoch": 4.79, + "grad_norm": 1.8046875, + "learning_rate": 0.00046922822177863856, + "loss": 0.21, + "step": 115680 + }, + { + "epoch": 4.79, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004692230088366235, + "loss": 0.2583, + "step": 115690 + }, + { + "epoch": 4.79, + "grad_norm": 0.6171875, + "learning_rate": 0.00046921779548205443, + "loss": 0.2189, + "step": 115700 + }, + { + "epoch": 4.79, + "grad_norm": 0.7890625, + "learning_rate": 0.00046921258171494113, + "loss": 0.2177, + "step": 115710 + }, + { + "epoch": 4.79, + "grad_norm": 1.015625, + "learning_rate": 0.0004692073675352934, + "loss": 0.1756, + "step": 115720 + }, + { + "epoch": 4.79, + "grad_norm": 1.0390625, + "learning_rate": 0.0004692021529431211, + "loss": 0.1555, + "step": 115730 + }, + { + "epoch": 4.79, + "grad_norm": 0.45703125, + "learning_rate": 0.000469196937938434, + "loss": 0.1829, + "step": 115740 + }, + { + "epoch": 4.79, + "grad_norm": 0.57421875, + "learning_rate": 0.000469191722521242, + "loss": 0.1905, + "step": 115750 + }, + { + "epoch": 4.79, + "grad_norm": 0.8203125, + "learning_rate": 0.00046918650669155483, + "loss": 0.2362, + "step": 115760 + }, + { + "epoch": 4.8, + "grad_norm": 0.20703125, + "learning_rate": 0.0004691812904493823, + "loss": 0.1662, + "step": 115770 + }, + { + "epoch": 4.8, + "grad_norm": 0.9375, + "learning_rate": 0.0004691760737947342, + "loss": 0.2397, + "step": 115780 + }, + { + "epoch": 4.8, + "grad_norm": 0.6640625, + "learning_rate": 0.00046917085672762047, + "loss": 0.1814, + "step": 115790 + }, + { + "epoch": 4.8, + "grad_norm": 0.9921875, + "learning_rate": 0.00046916563924805077, + "loss": 0.1837, + "step": 115800 + }, + { + "epoch": 4.8, + "grad_norm": 0.80078125, + "learning_rate": 0.0004691604213560351, + "loss": 0.1655, + "step": 115810 + }, + { + "epoch": 4.8, + "grad_norm": 0.365234375, + "learning_rate": 0.00046915520305158316, + "loss": 0.2239, + "step": 115820 + }, + { + "epoch": 4.8, + "grad_norm": 0.390625, + "learning_rate": 0.0004691499843347048, + "loss": 0.2216, + "step": 115830 + }, + { + "epoch": 4.8, + "grad_norm": 0.7265625, + "learning_rate": 0.00046914476520540984, + "loss": 0.2367, + "step": 115840 + }, + { + "epoch": 4.8, + "grad_norm": 1.6171875, + "learning_rate": 0.00046913954566370805, + "loss": 0.1601, + "step": 115850 + }, + { + "epoch": 4.8, + "grad_norm": 1.8359375, + "learning_rate": 0.00046913432570960936, + "loss": 0.2453, + "step": 115860 + }, + { + "epoch": 4.8, + "grad_norm": 0.93359375, + "learning_rate": 0.0004691291053431235, + "loss": 0.2624, + "step": 115870 + }, + { + "epoch": 4.8, + "grad_norm": 1.4765625, + "learning_rate": 0.0004691238845642603, + "loss": 0.209, + "step": 115880 + }, + { + "epoch": 4.8, + "grad_norm": 1.375, + "learning_rate": 0.0004691186633730296, + "loss": 0.2132, + "step": 115890 + }, + { + "epoch": 4.8, + "grad_norm": 0.90625, + "learning_rate": 0.00046911344176944124, + "loss": 0.2192, + "step": 115900 + }, + { + "epoch": 4.8, + "grad_norm": 0.50390625, + "learning_rate": 0.00046910821975350514, + "loss": 0.188, + "step": 115910 + }, + { + "epoch": 4.8, + "grad_norm": 0.44140625, + "learning_rate": 0.00046910299732523097, + "loss": 0.1965, + "step": 115920 + }, + { + "epoch": 4.8, + "grad_norm": 1.890625, + "learning_rate": 0.00046909777448462864, + "loss": 0.2326, + "step": 115930 + }, + { + "epoch": 4.8, + "grad_norm": 0.5546875, + "learning_rate": 0.0004690925512317079, + "loss": 0.1944, + "step": 115940 + }, + { + "epoch": 4.8, + "grad_norm": 0.890625, + "learning_rate": 0.00046908732756647875, + "loss": 0.2214, + "step": 115950 + }, + { + "epoch": 4.8, + "grad_norm": 1.078125, + "learning_rate": 0.00046908210348895087, + "loss": 0.2127, + "step": 115960 + }, + { + "epoch": 4.8, + "grad_norm": 0.76953125, + "learning_rate": 0.000469076878999134, + "loss": 0.1791, + "step": 115970 + }, + { + "epoch": 4.8, + "grad_norm": 0.478515625, + "learning_rate": 0.00046907165409703825, + "loss": 0.173, + "step": 115980 + }, + { + "epoch": 4.8, + "grad_norm": 0.78515625, + "learning_rate": 0.00046906642878267324, + "loss": 0.2065, + "step": 115990 + }, + { + "epoch": 4.8, + "grad_norm": 1.203125, + "learning_rate": 0.0004690612030560489, + "loss": 0.2454, + "step": 116000 + }, + { + "epoch": 4.81, + "grad_norm": 1.0234375, + "learning_rate": 0.00046905597691717505, + "loss": 0.2645, + "step": 116010 + }, + { + "epoch": 4.81, + "grad_norm": 0.96875, + "learning_rate": 0.0004690507503660615, + "loss": 0.1623, + "step": 116020 + }, + { + "epoch": 4.81, + "grad_norm": 0.83984375, + "learning_rate": 0.000469045523402718, + "loss": 0.1835, + "step": 116030 + }, + { + "epoch": 4.81, + "grad_norm": 0.72265625, + "learning_rate": 0.0004690402960271546, + "loss": 0.1913, + "step": 116040 + }, + { + "epoch": 4.81, + "grad_norm": 1.1640625, + "learning_rate": 0.0004690350682393809, + "loss": 0.2363, + "step": 116050 + }, + { + "epoch": 4.81, + "grad_norm": 0.55859375, + "learning_rate": 0.00046902984003940694, + "loss": 0.227, + "step": 116060 + }, + { + "epoch": 4.81, + "grad_norm": 0.74609375, + "learning_rate": 0.0004690246114272425, + "loss": 0.2401, + "step": 116070 + }, + { + "epoch": 4.81, + "grad_norm": 1.1484375, + "learning_rate": 0.0004690193824028973, + "loss": 0.2062, + "step": 116080 + }, + { + "epoch": 4.81, + "grad_norm": 0.9296875, + "learning_rate": 0.0004690141529663814, + "loss": 0.1959, + "step": 116090 + }, + { + "epoch": 4.81, + "grad_norm": 0.84765625, + "learning_rate": 0.0004690089231177044, + "loss": 0.1976, + "step": 116100 + }, + { + "epoch": 4.81, + "grad_norm": 0.5859375, + "learning_rate": 0.00046900369285687626, + "loss": 0.2462, + "step": 116110 + }, + { + "epoch": 4.81, + "grad_norm": 0.578125, + "learning_rate": 0.0004689984621839069, + "loss": 0.183, + "step": 116120 + }, + { + "epoch": 4.81, + "grad_norm": 0.34375, + "learning_rate": 0.000468993231098806, + "loss": 0.1916, + "step": 116130 + }, + { + "epoch": 4.81, + "grad_norm": 0.462890625, + "learning_rate": 0.00046898799960158356, + "loss": 0.2361, + "step": 116140 + }, + { + "epoch": 4.81, + "grad_norm": 0.73046875, + "learning_rate": 0.00046898276769224935, + "loss": 0.2086, + "step": 116150 + }, + { + "epoch": 4.81, + "grad_norm": 0.5078125, + "learning_rate": 0.0004689775353708132, + "loss": 0.2315, + "step": 116160 + }, + { + "epoch": 4.81, + "grad_norm": 0.443359375, + "learning_rate": 0.000468972302637285, + "loss": 0.1945, + "step": 116170 + }, + { + "epoch": 4.81, + "grad_norm": 0.515625, + "learning_rate": 0.00046896706949167444, + "loss": 0.2103, + "step": 116180 + }, + { + "epoch": 4.81, + "grad_norm": 0.546875, + "learning_rate": 0.0004689618359339917, + "loss": 0.1855, + "step": 116190 + }, + { + "epoch": 4.81, + "grad_norm": 0.7578125, + "learning_rate": 0.00046895660196424627, + "loss": 0.1907, + "step": 116200 + }, + { + "epoch": 4.81, + "grad_norm": 0.25, + "learning_rate": 0.00046895136758244826, + "loss": 0.1999, + "step": 116210 + }, + { + "epoch": 4.81, + "grad_norm": 0.50390625, + "learning_rate": 0.00046894613278860735, + "loss": 0.193, + "step": 116220 + }, + { + "epoch": 4.81, + "grad_norm": 1.0859375, + "learning_rate": 0.0004689408975827335, + "loss": 0.1651, + "step": 116230 + }, + { + "epoch": 4.81, + "grad_norm": 0.6796875, + "learning_rate": 0.00046893566196483654, + "loss": 0.225, + "step": 116240 + }, + { + "epoch": 4.82, + "grad_norm": 0.419921875, + "learning_rate": 0.00046893042593492627, + "loss": 0.2231, + "step": 116250 + }, + { + "epoch": 4.82, + "grad_norm": 2.390625, + "learning_rate": 0.00046892518949301267, + "loss": 0.2198, + "step": 116260 + }, + { + "epoch": 4.82, + "grad_norm": 0.65234375, + "learning_rate": 0.00046891995263910534, + "loss": 0.2214, + "step": 116270 + }, + { + "epoch": 4.82, + "grad_norm": 0.9921875, + "learning_rate": 0.00046891471537321446, + "loss": 0.2545, + "step": 116280 + }, + { + "epoch": 4.82, + "grad_norm": 0.1435546875, + "learning_rate": 0.00046890947769534963, + "loss": 0.1759, + "step": 116290 + }, + { + "epoch": 4.82, + "grad_norm": 0.99609375, + "learning_rate": 0.0004689042396055209, + "loss": 0.2123, + "step": 116300 + }, + { + "epoch": 4.82, + "grad_norm": 0.44140625, + "learning_rate": 0.000468899001103738, + "loss": 0.1534, + "step": 116310 + }, + { + "epoch": 4.82, + "grad_norm": 0.50390625, + "learning_rate": 0.00046889376219001066, + "loss": 0.2753, + "step": 116320 + }, + { + "epoch": 4.82, + "grad_norm": 0.70703125, + "learning_rate": 0.0004688885228643491, + "loss": 0.1849, + "step": 116330 + }, + { + "epoch": 4.82, + "grad_norm": 0.66015625, + "learning_rate": 0.0004688832831267629, + "loss": 0.2125, + "step": 116340 + }, + { + "epoch": 4.82, + "grad_norm": 0.546875, + "learning_rate": 0.000468878042977262, + "loss": 0.1967, + "step": 116350 + }, + { + "epoch": 4.82, + "grad_norm": 0.69140625, + "learning_rate": 0.00046887280241585626, + "loss": 0.1915, + "step": 116360 + }, + { + "epoch": 4.82, + "grad_norm": 0.26171875, + "learning_rate": 0.0004688675614425555, + "loss": 0.1498, + "step": 116370 + }, + { + "epoch": 4.82, + "grad_norm": 1.1875, + "learning_rate": 0.0004688623200573697, + "loss": 0.2082, + "step": 116380 + }, + { + "epoch": 4.82, + "grad_norm": 0.87109375, + "learning_rate": 0.0004688570782603086, + "loss": 0.2029, + "step": 116390 + }, + { + "epoch": 4.82, + "grad_norm": 0.7734375, + "learning_rate": 0.00046885183605138215, + "loss": 0.1935, + "step": 116400 + }, + { + "epoch": 4.82, + "grad_norm": 0.37109375, + "learning_rate": 0.00046884659343060015, + "loss": 0.2514, + "step": 116410 + }, + { + "epoch": 4.82, + "grad_norm": 0.609375, + "learning_rate": 0.00046884135039797247, + "loss": 0.2272, + "step": 116420 + }, + { + "epoch": 4.82, + "grad_norm": 1.3125, + "learning_rate": 0.0004688361069535091, + "loss": 0.2003, + "step": 116430 + }, + { + "epoch": 4.82, + "grad_norm": 1.03125, + "learning_rate": 0.00046883086309721967, + "loss": 0.2486, + "step": 116440 + }, + { + "epoch": 4.82, + "grad_norm": 0.2294921875, + "learning_rate": 0.00046882561882911433, + "loss": 0.2606, + "step": 116450 + }, + { + "epoch": 4.82, + "grad_norm": 0.8125, + "learning_rate": 0.0004688203741492027, + "loss": 0.1689, + "step": 116460 + }, + { + "epoch": 4.82, + "grad_norm": 0.1748046875, + "learning_rate": 0.0004688151290574948, + "loss": 0.2801, + "step": 116470 + }, + { + "epoch": 4.82, + "grad_norm": 0.3359375, + "learning_rate": 0.00046880988355400046, + "loss": 0.2497, + "step": 116480 + }, + { + "epoch": 4.83, + "grad_norm": 0.8125, + "learning_rate": 0.0004688046376387295, + "loss": 0.2225, + "step": 116490 + }, + { + "epoch": 4.83, + "grad_norm": 0.6328125, + "learning_rate": 0.0004687993913116919, + "loss": 0.1898, + "step": 116500 + }, + { + "epoch": 4.83, + "grad_norm": 1.4453125, + "learning_rate": 0.00046879414457289746, + "loss": 0.2062, + "step": 116510 + }, + { + "epoch": 4.83, + "grad_norm": 1.53125, + "learning_rate": 0.0004687888974223561, + "loss": 0.1822, + "step": 116520 + }, + { + "epoch": 4.83, + "grad_norm": 0.8125, + "learning_rate": 0.0004687836498600776, + "loss": 0.2096, + "step": 116530 + }, + { + "epoch": 4.83, + "grad_norm": 0.396484375, + "learning_rate": 0.0004687784018860719, + "loss": 0.2588, + "step": 116540 + }, + { + "epoch": 4.83, + "grad_norm": 0.83984375, + "learning_rate": 0.0004687731535003489, + "loss": 0.1945, + "step": 116550 + }, + { + "epoch": 4.83, + "grad_norm": 0.70703125, + "learning_rate": 0.0004687679047029184, + "loss": 0.1858, + "step": 116560 + }, + { + "epoch": 4.83, + "grad_norm": 0.4296875, + "learning_rate": 0.0004687626554937904, + "loss": 0.2588, + "step": 116570 + }, + { + "epoch": 4.83, + "grad_norm": 0.6171875, + "learning_rate": 0.00046875740587297465, + "loss": 0.2253, + "step": 116580 + }, + { + "epoch": 4.83, + "grad_norm": 0.3046875, + "learning_rate": 0.00046875215584048116, + "loss": 0.2079, + "step": 116590 + }, + { + "epoch": 4.83, + "grad_norm": 0.43359375, + "learning_rate": 0.0004687469053963197, + "loss": 0.2125, + "step": 116600 + }, + { + "epoch": 4.83, + "grad_norm": 0.59375, + "learning_rate": 0.00046874165454050015, + "loss": 0.2045, + "step": 116610 + }, + { + "epoch": 4.83, + "grad_norm": 1.4140625, + "learning_rate": 0.00046873640327303246, + "loss": 0.2327, + "step": 116620 + }, + { + "epoch": 4.83, + "grad_norm": 0.000232696533203125, + "learning_rate": 0.0004687311515939265, + "loss": 0.1602, + "step": 116630 + }, + { + "epoch": 4.83, + "grad_norm": 0.6015625, + "learning_rate": 0.0004687258995031921, + "loss": 0.2185, + "step": 116640 + }, + { + "epoch": 4.83, + "grad_norm": 0.796875, + "learning_rate": 0.0004687206470008392, + "loss": 0.2137, + "step": 116650 + }, + { + "epoch": 4.83, + "grad_norm": 0.328125, + "learning_rate": 0.00046871539408687763, + "loss": 0.189, + "step": 116660 + }, + { + "epoch": 4.83, + "grad_norm": 1.171875, + "learning_rate": 0.0004687101407613174, + "loss": 0.2726, + "step": 116670 + }, + { + "epoch": 4.83, + "grad_norm": 0.734375, + "learning_rate": 0.0004687048870241681, + "loss": 0.2035, + "step": 116680 + }, + { + "epoch": 4.83, + "grad_norm": 0.64453125, + "learning_rate": 0.00046869963287544004, + "loss": 0.2324, + "step": 116690 + }, + { + "epoch": 4.83, + "grad_norm": 0.5546875, + "learning_rate": 0.0004686943783151428, + "loss": 0.2003, + "step": 116700 + }, + { + "epoch": 4.83, + "grad_norm": 0.375, + "learning_rate": 0.0004686891233432863, + "loss": 0.2573, + "step": 116710 + }, + { + "epoch": 4.83, + "grad_norm": 0.60546875, + "learning_rate": 0.00046868386795988063, + "loss": 0.1533, + "step": 116720 + }, + { + "epoch": 4.83, + "grad_norm": 1.2109375, + "learning_rate": 0.00046867861216493543, + "loss": 0.2371, + "step": 116730 + }, + { + "epoch": 4.84, + "grad_norm": 0.54296875, + "learning_rate": 0.00046867335595846075, + "loss": 0.1879, + "step": 116740 + }, + { + "epoch": 4.84, + "grad_norm": 0.7734375, + "learning_rate": 0.00046866809934046635, + "loss": 0.1605, + "step": 116750 + }, + { + "epoch": 4.84, + "grad_norm": 1.03125, + "learning_rate": 0.00046866284231096225, + "loss": 0.2216, + "step": 116760 + }, + { + "epoch": 4.84, + "grad_norm": 0.283203125, + "learning_rate": 0.0004686575848699582, + "loss": 0.2114, + "step": 116770 + }, + { + "epoch": 4.84, + "grad_norm": 1.015625, + "learning_rate": 0.00046865232701746433, + "loss": 0.2416, + "step": 116780 + }, + { + "epoch": 4.84, + "grad_norm": 1.5390625, + "learning_rate": 0.0004686470687534904, + "loss": 0.1996, + "step": 116790 + }, + { + "epoch": 4.84, + "grad_norm": 0.52734375, + "learning_rate": 0.0004686418100780462, + "loss": 0.2099, + "step": 116800 + }, + { + "epoch": 4.84, + "grad_norm": 1.0859375, + "learning_rate": 0.0004686365509911418, + "loss": 0.1615, + "step": 116810 + }, + { + "epoch": 4.84, + "grad_norm": 1.0703125, + "learning_rate": 0.00046863129149278695, + "loss": 0.267, + "step": 116820 + }, + { + "epoch": 4.84, + "grad_norm": 2.125, + "learning_rate": 0.0004686260315829917, + "loss": 0.205, + "step": 116830 + }, + { + "epoch": 4.84, + "grad_norm": 0.408203125, + "learning_rate": 0.0004686207712617658, + "loss": 0.208, + "step": 116840 + }, + { + "epoch": 4.84, + "grad_norm": 0.52734375, + "learning_rate": 0.0004686155105291192, + "loss": 0.2321, + "step": 116850 + }, + { + "epoch": 4.84, + "grad_norm": 0.515625, + "learning_rate": 0.0004686102493850619, + "loss": 0.1836, + "step": 116860 + }, + { + "epoch": 4.84, + "grad_norm": 0.48046875, + "learning_rate": 0.0004686049878296037, + "loss": 0.1349, + "step": 116870 + }, + { + "epoch": 4.84, + "grad_norm": 0.75390625, + "learning_rate": 0.00046859972586275444, + "loss": 0.2654, + "step": 116880 + }, + { + "epoch": 4.84, + "grad_norm": 0.67578125, + "learning_rate": 0.00046859446348452416, + "loss": 0.1644, + "step": 116890 + }, + { + "epoch": 4.84, + "grad_norm": 0.61328125, + "learning_rate": 0.0004685892006949227, + "loss": 0.2292, + "step": 116900 + }, + { + "epoch": 4.84, + "grad_norm": 1.5546875, + "learning_rate": 0.00046858393749396, + "loss": 0.1962, + "step": 116910 + }, + { + "epoch": 4.84, + "grad_norm": 0.5625, + "learning_rate": 0.000468578673881646, + "loss": 0.2006, + "step": 116920 + }, + { + "epoch": 4.84, + "grad_norm": 0.44921875, + "learning_rate": 0.0004685734098579904, + "loss": 0.1993, + "step": 116930 + }, + { + "epoch": 4.84, + "grad_norm": 1.875, + "learning_rate": 0.0004685681454230033, + "loss": 0.2047, + "step": 116940 + }, + { + "epoch": 4.84, + "grad_norm": 0.83203125, + "learning_rate": 0.0004685628805766946, + "loss": 0.2406, + "step": 116950 + }, + { + "epoch": 4.84, + "grad_norm": 0.890625, + "learning_rate": 0.00046855761531907405, + "loss": 0.2007, + "step": 116960 + }, + { + "epoch": 4.84, + "grad_norm": 0.30859375, + "learning_rate": 0.0004685523496501518, + "loss": 0.2058, + "step": 116970 + }, + { + "epoch": 4.85, + "grad_norm": 0.88671875, + "learning_rate": 0.00046854708356993747, + "loss": 0.173, + "step": 116980 + }, + { + "epoch": 4.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00046854181707844125, + "loss": 0.161, + "step": 116990 + }, + { + "epoch": 4.85, + "grad_norm": 0.62109375, + "learning_rate": 0.00046853655017567295, + "loss": 0.2251, + "step": 117000 + }, + { + "epoch": 4.85, + "grad_norm": 0.73046875, + "learning_rate": 0.00046853128286164237, + "loss": 0.197, + "step": 117010 + }, + { + "epoch": 4.85, + "grad_norm": 1.9765625, + "learning_rate": 0.0004685260151363596, + "loss": 0.2201, + "step": 117020 + }, + { + "epoch": 4.85, + "grad_norm": 1.3515625, + "learning_rate": 0.0004685207469998344, + "loss": 0.2307, + "step": 117030 + }, + { + "epoch": 4.85, + "grad_norm": 0.9296875, + "learning_rate": 0.00046851547845207675, + "loss": 0.2162, + "step": 117040 + }, + { + "epoch": 4.85, + "grad_norm": 0.25, + "learning_rate": 0.00046851020949309664, + "loss": 0.2169, + "step": 117050 + }, + { + "epoch": 4.85, + "grad_norm": 0.55859375, + "learning_rate": 0.0004685049401229039, + "loss": 0.1927, + "step": 117060 + }, + { + "epoch": 4.85, + "grad_norm": 0.462890625, + "learning_rate": 0.0004684996703415084, + "loss": 0.1674, + "step": 117070 + }, + { + "epoch": 4.85, + "grad_norm": 0.6953125, + "learning_rate": 0.00046849440014892015, + "loss": 0.2462, + "step": 117080 + }, + { + "epoch": 4.85, + "grad_norm": 0.9296875, + "learning_rate": 0.00046848912954514907, + "loss": 0.2213, + "step": 117090 + }, + { + "epoch": 4.85, + "grad_norm": 0.240234375, + "learning_rate": 0.000468483858530205, + "loss": 0.2058, + "step": 117100 + }, + { + "epoch": 4.85, + "grad_norm": 0.318359375, + "learning_rate": 0.00046847858710409785, + "loss": 0.2181, + "step": 117110 + }, + { + "epoch": 4.85, + "grad_norm": 0.48828125, + "learning_rate": 0.00046847331526683766, + "loss": 0.2409, + "step": 117120 + }, + { + "epoch": 4.85, + "grad_norm": 1.0859375, + "learning_rate": 0.0004684680430184343, + "loss": 0.2381, + "step": 117130 + }, + { + "epoch": 4.85, + "grad_norm": 0.1884765625, + "learning_rate": 0.0004684627703588976, + "loss": 0.2051, + "step": 117140 + }, + { + "epoch": 4.85, + "grad_norm": 0.86328125, + "learning_rate": 0.0004684574972882376, + "loss": 0.2341, + "step": 117150 + }, + { + "epoch": 4.85, + "grad_norm": 0.984375, + "learning_rate": 0.00046845222380646425, + "loss": 0.2506, + "step": 117160 + }, + { + "epoch": 4.85, + "grad_norm": 0.5546875, + "learning_rate": 0.00046844694991358737, + "loss": 0.2456, + "step": 117170 + }, + { + "epoch": 4.85, + "grad_norm": 1.078125, + "learning_rate": 0.00046844167560961686, + "loss": 0.2451, + "step": 117180 + }, + { + "epoch": 4.85, + "grad_norm": 0.59765625, + "learning_rate": 0.00046843640089456274, + "loss": 0.2124, + "step": 117190 + }, + { + "epoch": 4.85, + "grad_norm": 0.60546875, + "learning_rate": 0.00046843112576843495, + "loss": 0.2103, + "step": 117200 + }, + { + "epoch": 4.85, + "grad_norm": 0.474609375, + "learning_rate": 0.0004684258502312433, + "loss": 0.273, + "step": 117210 + }, + { + "epoch": 4.86, + "grad_norm": 0.76171875, + "learning_rate": 0.0004684205742829978, + "loss": 0.2175, + "step": 117220 + }, + { + "epoch": 4.86, + "grad_norm": 0.6796875, + "learning_rate": 0.0004684152979237084, + "loss": 0.2492, + "step": 117230 + }, + { + "epoch": 4.86, + "grad_norm": 1.28125, + "learning_rate": 0.0004684100211533849, + "loss": 0.2224, + "step": 117240 + }, + { + "epoch": 4.86, + "grad_norm": 0.53515625, + "learning_rate": 0.00046840474397203747, + "loss": 0.1494, + "step": 117250 + }, + { + "epoch": 4.86, + "grad_norm": 0.67578125, + "learning_rate": 0.0004683994663796758, + "loss": 0.1984, + "step": 117260 + }, + { + "epoch": 4.86, + "grad_norm": 0.451171875, + "learning_rate": 0.00046839418837631, + "loss": 0.2502, + "step": 117270 + }, + { + "epoch": 4.86, + "grad_norm": 0.6796875, + "learning_rate": 0.00046838890996194984, + "loss": 0.2161, + "step": 117280 + }, + { + "epoch": 4.86, + "grad_norm": 0.384765625, + "learning_rate": 0.00046838363113660533, + "loss": 0.209, + "step": 117290 + }, + { + "epoch": 4.86, + "grad_norm": 0.365234375, + "learning_rate": 0.00046837835190028645, + "loss": 0.2185, + "step": 117300 + }, + { + "epoch": 4.86, + "grad_norm": 0.478515625, + "learning_rate": 0.00046837307225300307, + "loss": 0.1542, + "step": 117310 + }, + { + "epoch": 4.86, + "grad_norm": 0.68359375, + "learning_rate": 0.0004683677921947652, + "loss": 0.2102, + "step": 117320 + }, + { + "epoch": 4.86, + "grad_norm": 0.50390625, + "learning_rate": 0.0004683625117255827, + "loss": 0.2259, + "step": 117330 + }, + { + "epoch": 4.86, + "grad_norm": 0.703125, + "learning_rate": 0.00046835723084546555, + "loss": 0.1975, + "step": 117340 + }, + { + "epoch": 4.86, + "grad_norm": 0.828125, + "learning_rate": 0.00046835194955442363, + "loss": 0.2656, + "step": 117350 + }, + { + "epoch": 4.86, + "grad_norm": 0.8125, + "learning_rate": 0.0004683466678524669, + "loss": 0.2312, + "step": 117360 + }, + { + "epoch": 4.86, + "grad_norm": 0.5703125, + "learning_rate": 0.0004683413857396054, + "loss": 0.2416, + "step": 117370 + }, + { + "epoch": 4.86, + "grad_norm": 0.61328125, + "learning_rate": 0.000468336103215849, + "loss": 0.2242, + "step": 117380 + }, + { + "epoch": 4.86, + "grad_norm": 0.62109375, + "learning_rate": 0.0004683308202812075, + "loss": 0.1873, + "step": 117390 + }, + { + "epoch": 4.86, + "grad_norm": 0.173828125, + "learning_rate": 0.0004683255369356911, + "loss": 0.2315, + "step": 117400 + }, + { + "epoch": 4.86, + "grad_norm": 2.9375, + "learning_rate": 0.0004683202531793096, + "loss": 0.1931, + "step": 117410 + }, + { + "epoch": 4.86, + "grad_norm": 1.2890625, + "learning_rate": 0.00046831496901207295, + "loss": 0.2293, + "step": 117420 + }, + { + "epoch": 4.86, + "grad_norm": 0.8515625, + "learning_rate": 0.00046830968443399107, + "loss": 0.1655, + "step": 117430 + }, + { + "epoch": 4.86, + "grad_norm": 1.15625, + "learning_rate": 0.000468304399445074, + "loss": 0.2289, + "step": 117440 + }, + { + "epoch": 4.86, + "grad_norm": 0.39453125, + "learning_rate": 0.00046829911404533154, + "loss": 0.2183, + "step": 117450 + }, + { + "epoch": 4.87, + "grad_norm": 0.69921875, + "learning_rate": 0.00046829382823477373, + "loss": 0.2047, + "step": 117460 + }, + { + "epoch": 4.87, + "grad_norm": 1.921875, + "learning_rate": 0.0004682885420134106, + "loss": 0.2516, + "step": 117470 + }, + { + "epoch": 4.87, + "grad_norm": 0.85546875, + "learning_rate": 0.00046828325538125196, + "loss": 0.1872, + "step": 117480 + }, + { + "epoch": 4.87, + "grad_norm": 2.203125, + "learning_rate": 0.00046827796833830783, + "loss": 0.2676, + "step": 117490 + }, + { + "epoch": 4.87, + "grad_norm": 0.52734375, + "learning_rate": 0.00046827268088458807, + "loss": 0.2004, + "step": 117500 + }, + { + "epoch": 4.87, + "grad_norm": 0.7734375, + "learning_rate": 0.00046826739302010276, + "loss": 0.2045, + "step": 117510 + }, + { + "epoch": 4.87, + "grad_norm": 1.390625, + "learning_rate": 0.00046826210474486176, + "loss": 0.2462, + "step": 117520 + }, + { + "epoch": 4.87, + "grad_norm": 0.59375, + "learning_rate": 0.00046825681605887505, + "loss": 0.1647, + "step": 117530 + }, + { + "epoch": 4.87, + "grad_norm": 0.5625, + "learning_rate": 0.00046825152696215256, + "loss": 0.2254, + "step": 117540 + }, + { + "epoch": 4.87, + "grad_norm": 0.9375, + "learning_rate": 0.0004682462374547043, + "loss": 0.1497, + "step": 117550 + }, + { + "epoch": 4.87, + "grad_norm": 0.73046875, + "learning_rate": 0.0004682409475365401, + "loss": 0.2386, + "step": 117560 + }, + { + "epoch": 4.87, + "grad_norm": 0.341796875, + "learning_rate": 0.0004682356572076701, + "loss": 0.1929, + "step": 117570 + }, + { + "epoch": 4.87, + "grad_norm": 0.43359375, + "learning_rate": 0.0004682303664681041, + "loss": 0.2043, + "step": 117580 + }, + { + "epoch": 4.87, + "grad_norm": 1.5859375, + "learning_rate": 0.00046822507531785217, + "loss": 0.1322, + "step": 117590 + }, + { + "epoch": 4.87, + "grad_norm": 1.46875, + "learning_rate": 0.00046821978375692424, + "loss": 0.2661, + "step": 117600 + }, + { + "epoch": 4.87, + "grad_norm": 0.64453125, + "learning_rate": 0.00046821449178533023, + "loss": 0.2101, + "step": 117610 + }, + { + "epoch": 4.87, + "grad_norm": 0.58984375, + "learning_rate": 0.00046820919940308007, + "loss": 0.2028, + "step": 117620 + }, + { + "epoch": 4.87, + "grad_norm": 0.890625, + "learning_rate": 0.0004682039066101837, + "loss": 0.2004, + "step": 117630 + }, + { + "epoch": 4.87, + "grad_norm": 1.328125, + "learning_rate": 0.00046819861340665134, + "loss": 0.1523, + "step": 117640 + }, + { + "epoch": 4.87, + "grad_norm": 0.96484375, + "learning_rate": 0.00046819331979249256, + "loss": 0.1796, + "step": 117650 + }, + { + "epoch": 4.87, + "grad_norm": 0.515625, + "learning_rate": 0.0004681880257677176, + "loss": 0.2223, + "step": 117660 + }, + { + "epoch": 4.87, + "grad_norm": 0.484375, + "learning_rate": 0.00046818273133233636, + "loss": 0.2455, + "step": 117670 + }, + { + "epoch": 4.87, + "grad_norm": 0.4296875, + "learning_rate": 0.00046817743648635874, + "loss": 0.2107, + "step": 117680 + }, + { + "epoch": 4.87, + "grad_norm": 0.8671875, + "learning_rate": 0.0004681721412297947, + "loss": 0.1937, + "step": 117690 + }, + { + "epoch": 4.88, + "grad_norm": 0.48828125, + "learning_rate": 0.0004681668455626543, + "loss": 0.1997, + "step": 117700 + }, + { + "epoch": 4.88, + "grad_norm": 0.490234375, + "learning_rate": 0.0004681615494849475, + "loss": 0.2086, + "step": 117710 + }, + { + "epoch": 4.88, + "grad_norm": 0.65234375, + "learning_rate": 0.00046815625299668416, + "loss": 0.1819, + "step": 117720 + }, + { + "epoch": 4.88, + "grad_norm": 1.4921875, + "learning_rate": 0.0004681509560978744, + "loss": 0.1542, + "step": 117730 + }, + { + "epoch": 4.88, + "grad_norm": 1.890625, + "learning_rate": 0.000468145658788528, + "loss": 0.2055, + "step": 117740 + }, + { + "epoch": 4.88, + "grad_norm": 1.3359375, + "learning_rate": 0.00046814036106865504, + "loss": 0.204, + "step": 117750 + }, + { + "epoch": 4.88, + "grad_norm": 1.140625, + "learning_rate": 0.0004681350629382655, + "loss": 0.1739, + "step": 117760 + }, + { + "epoch": 4.88, + "grad_norm": 0.478515625, + "learning_rate": 0.0004681297643973693, + "loss": 0.221, + "step": 117770 + }, + { + "epoch": 4.88, + "grad_norm": 0.58203125, + "learning_rate": 0.0004681244654459765, + "loss": 0.2028, + "step": 117780 + }, + { + "epoch": 4.88, + "grad_norm": 0.306640625, + "learning_rate": 0.000468119166084097, + "loss": 0.2248, + "step": 117790 + }, + { + "epoch": 4.88, + "grad_norm": 0.51171875, + "learning_rate": 0.0004681138663117407, + "loss": 0.2231, + "step": 117800 + }, + { + "epoch": 4.88, + "grad_norm": 0.58203125, + "learning_rate": 0.00046810856612891774, + "loss": 0.1721, + "step": 117810 + }, + { + "epoch": 4.88, + "grad_norm": 0.3828125, + "learning_rate": 0.00046810326553563795, + "loss": 0.201, + "step": 117820 + }, + { + "epoch": 4.88, + "grad_norm": 2.0, + "learning_rate": 0.0004680979645319114, + "loss": 0.223, + "step": 117830 + }, + { + "epoch": 4.88, + "grad_norm": 0.765625, + "learning_rate": 0.00046809266311774813, + "loss": 0.2353, + "step": 117840 + }, + { + "epoch": 4.88, + "grad_norm": 1.328125, + "learning_rate": 0.0004680873612931579, + "loss": 0.2413, + "step": 117850 + }, + { + "epoch": 4.88, + "grad_norm": 0.921875, + "learning_rate": 0.0004680820590581508, + "loss": 0.1733, + "step": 117860 + }, + { + "epoch": 4.88, + "grad_norm": 0.63671875, + "learning_rate": 0.0004680767564127369, + "loss": 0.1926, + "step": 117870 + }, + { + "epoch": 4.88, + "grad_norm": 0.474609375, + "learning_rate": 0.000468071453356926, + "loss": 0.1948, + "step": 117880 + }, + { + "epoch": 4.88, + "grad_norm": 1.4140625, + "learning_rate": 0.00046806614989072825, + "loss": 0.2007, + "step": 117890 + }, + { + "epoch": 4.88, + "grad_norm": 0.345703125, + "learning_rate": 0.00046806084601415355, + "loss": 0.237, + "step": 117900 + }, + { + "epoch": 4.88, + "grad_norm": 0.90625, + "learning_rate": 0.00046805554172721185, + "loss": 0.1789, + "step": 117910 + }, + { + "epoch": 4.88, + "grad_norm": 1.109375, + "learning_rate": 0.00046805023702991324, + "loss": 0.2168, + "step": 117920 + }, + { + "epoch": 4.88, + "grad_norm": 1.1171875, + "learning_rate": 0.0004680449319222675, + "loss": 0.2196, + "step": 117930 + }, + { + "epoch": 4.89, + "grad_norm": 0.298828125, + "learning_rate": 0.0004680396264042849, + "loss": 0.2354, + "step": 117940 + }, + { + "epoch": 4.89, + "grad_norm": 0.1767578125, + "learning_rate": 0.00046803432047597517, + "loss": 0.2284, + "step": 117950 + }, + { + "epoch": 4.89, + "grad_norm": 0.95703125, + "learning_rate": 0.00046802901413734837, + "loss": 0.1902, + "step": 117960 + }, + { + "epoch": 4.89, + "grad_norm": 0.66796875, + "learning_rate": 0.0004680237073884146, + "loss": 0.1959, + "step": 117970 + }, + { + "epoch": 4.89, + "grad_norm": 0.7109375, + "learning_rate": 0.0004680184002291837, + "loss": 0.2419, + "step": 117980 + }, + { + "epoch": 4.89, + "grad_norm": 0.310546875, + "learning_rate": 0.00046801309265966567, + "loss": 0.2059, + "step": 117990 + }, + { + "epoch": 4.89, + "grad_norm": 0.90234375, + "learning_rate": 0.0004680077846798706, + "loss": 0.1863, + "step": 118000 + }, + { + "epoch": 4.89, + "grad_norm": 1.7734375, + "learning_rate": 0.0004680024762898084, + "loss": 0.2375, + "step": 118010 + }, + { + "epoch": 4.89, + "grad_norm": 1.1640625, + "learning_rate": 0.0004679971674894892, + "loss": 0.2118, + "step": 118020 + }, + { + "epoch": 4.89, + "grad_norm": 0.373046875, + "learning_rate": 0.0004679918582789227, + "loss": 0.2161, + "step": 118030 + }, + { + "epoch": 4.89, + "grad_norm": 0.490234375, + "learning_rate": 0.0004679865486581192, + "loss": 0.2314, + "step": 118040 + }, + { + "epoch": 4.89, + "grad_norm": 0.75, + "learning_rate": 0.0004679812386270885, + "loss": 0.1827, + "step": 118050 + }, + { + "epoch": 4.89, + "grad_norm": 0.4296875, + "learning_rate": 0.0004679759281858406, + "loss": 0.2624, + "step": 118060 + }, + { + "epoch": 4.89, + "grad_norm": 0.83203125, + "learning_rate": 0.0004679706173343856, + "loss": 0.2045, + "step": 118070 + }, + { + "epoch": 4.89, + "grad_norm": 0.99609375, + "learning_rate": 0.0004679653060727334, + "loss": 0.1384, + "step": 118080 + }, + { + "epoch": 4.89, + "grad_norm": 0.33984375, + "learning_rate": 0.0004679599944008941, + "loss": 0.1723, + "step": 118090 + }, + { + "epoch": 4.89, + "grad_norm": 0.73828125, + "learning_rate": 0.00046795468231887753, + "loss": 0.1888, + "step": 118100 + }, + { + "epoch": 4.89, + "grad_norm": 1.4921875, + "learning_rate": 0.0004679493698266939, + "loss": 0.2013, + "step": 118110 + }, + { + "epoch": 4.89, + "grad_norm": 0.57421875, + "learning_rate": 0.000467944056924353, + "loss": 0.201, + "step": 118120 + }, + { + "epoch": 4.89, + "grad_norm": 0.7734375, + "learning_rate": 0.00046793874361186495, + "loss": 0.2444, + "step": 118130 + }, + { + "epoch": 4.89, + "grad_norm": 0.671875, + "learning_rate": 0.0004679334298892397, + "loss": 0.2172, + "step": 118140 + }, + { + "epoch": 4.89, + "grad_norm": 0.61328125, + "learning_rate": 0.0004679281157564873, + "loss": 0.2192, + "step": 118150 + }, + { + "epoch": 4.89, + "grad_norm": 0.388671875, + "learning_rate": 0.00046792280121361776, + "loss": 0.1676, + "step": 118160 + }, + { + "epoch": 4.89, + "grad_norm": 0.67578125, + "learning_rate": 0.00046791748626064095, + "loss": 0.2006, + "step": 118170 + }, + { + "epoch": 4.9, + "grad_norm": 0.197265625, + "learning_rate": 0.000467912170897567, + "loss": 0.1647, + "step": 118180 + }, + { + "epoch": 4.9, + "grad_norm": 0.96875, + "learning_rate": 0.0004679068551244059, + "loss": 0.2298, + "step": 118190 + }, + { + "epoch": 4.9, + "grad_norm": 0.98046875, + "learning_rate": 0.0004679015389411676, + "loss": 0.2065, + "step": 118200 + }, + { + "epoch": 4.9, + "grad_norm": 0.69921875, + "learning_rate": 0.00046789622234786216, + "loss": 0.2267, + "step": 118210 + }, + { + "epoch": 4.9, + "grad_norm": 0.625, + "learning_rate": 0.0004678909053444995, + "loss": 0.2177, + "step": 118220 + }, + { + "epoch": 4.9, + "grad_norm": 0.5859375, + "learning_rate": 0.0004678855879310898, + "loss": 0.1908, + "step": 118230 + }, + { + "epoch": 4.9, + "grad_norm": 0.56640625, + "learning_rate": 0.0004678802701076429, + "loss": 0.2157, + "step": 118240 + }, + { + "epoch": 4.9, + "grad_norm": 0.57421875, + "learning_rate": 0.00046787495187416886, + "loss": 0.194, + "step": 118250 + }, + { + "epoch": 4.9, + "grad_norm": 0.443359375, + "learning_rate": 0.0004678696332306777, + "loss": 0.2441, + "step": 118260 + }, + { + "epoch": 4.9, + "grad_norm": 0.8359375, + "learning_rate": 0.00046786431417717935, + "loss": 0.1798, + "step": 118270 + }, + { + "epoch": 4.9, + "grad_norm": 0.59765625, + "learning_rate": 0.0004678589947136839, + "loss": 0.2426, + "step": 118280 + }, + { + "epoch": 4.9, + "grad_norm": 0.75390625, + "learning_rate": 0.0004678536748402014, + "loss": 0.2134, + "step": 118290 + }, + { + "epoch": 4.9, + "grad_norm": 0.40234375, + "learning_rate": 0.0004678483545567418, + "loss": 0.1732, + "step": 118300 + }, + { + "epoch": 4.9, + "grad_norm": 0.8828125, + "learning_rate": 0.0004678430338633151, + "loss": 0.2085, + "step": 118310 + }, + { + "epoch": 4.9, + "grad_norm": 0.423828125, + "learning_rate": 0.00046783771275993134, + "loss": 0.1907, + "step": 118320 + }, + { + "epoch": 4.9, + "grad_norm": 0.609375, + "learning_rate": 0.0004678323912466006, + "loss": 0.2145, + "step": 118330 + }, + { + "epoch": 4.9, + "grad_norm": 1.140625, + "learning_rate": 0.00046782706932333275, + "loss": 0.2145, + "step": 118340 + }, + { + "epoch": 4.9, + "grad_norm": 0.79296875, + "learning_rate": 0.0004678217469901379, + "loss": 0.2388, + "step": 118350 + }, + { + "epoch": 4.9, + "grad_norm": 0.75390625, + "learning_rate": 0.00046781642424702604, + "loss": 0.2236, + "step": 118360 + }, + { + "epoch": 4.9, + "grad_norm": 1.25, + "learning_rate": 0.00046781110109400713, + "loss": 0.2517, + "step": 118370 + }, + { + "epoch": 4.9, + "grad_norm": 0.8671875, + "learning_rate": 0.00046780577753109133, + "loss": 0.2055, + "step": 118380 + }, + { + "epoch": 4.9, + "grad_norm": 1.5234375, + "learning_rate": 0.00046780045355828857, + "loss": 0.2114, + "step": 118390 + }, + { + "epoch": 4.9, + "grad_norm": 0.65625, + "learning_rate": 0.00046779512917560887, + "loss": 0.2159, + "step": 118400 + }, + { + "epoch": 4.9, + "grad_norm": 1.0390625, + "learning_rate": 0.0004677898043830623, + "loss": 0.1995, + "step": 118410 + }, + { + "epoch": 4.9, + "grad_norm": 0.486328125, + "learning_rate": 0.0004677844791806587, + "loss": 0.1779, + "step": 118420 + }, + { + "epoch": 4.91, + "grad_norm": 0.25390625, + "learning_rate": 0.0004677791535684084, + "loss": 0.1878, + "step": 118430 + }, + { + "epoch": 4.91, + "grad_norm": 0.46484375, + "learning_rate": 0.0004677738275463211, + "loss": 0.2354, + "step": 118440 + }, + { + "epoch": 4.91, + "grad_norm": 1.265625, + "learning_rate": 0.00046776850111440705, + "loss": 0.1638, + "step": 118450 + }, + { + "epoch": 4.91, + "grad_norm": 1.3359375, + "learning_rate": 0.0004677631742726762, + "loss": 0.2656, + "step": 118460 + }, + { + "epoch": 4.91, + "grad_norm": 0.193359375, + "learning_rate": 0.00046775784702113857, + "loss": 0.2102, + "step": 118470 + }, + { + "epoch": 4.91, + "grad_norm": 1.4453125, + "learning_rate": 0.00046775251935980423, + "loss": 0.1707, + "step": 118480 + }, + { + "epoch": 4.91, + "grad_norm": 0.73046875, + "learning_rate": 0.00046774719128868305, + "loss": 0.1993, + "step": 118490 + }, + { + "epoch": 4.91, + "grad_norm": 0.56640625, + "learning_rate": 0.00046774186280778527, + "loss": 0.2344, + "step": 118500 + }, + { + "epoch": 4.91, + "grad_norm": 0.74609375, + "learning_rate": 0.0004677365339171208, + "loss": 0.1963, + "step": 118510 + }, + { + "epoch": 4.91, + "grad_norm": 0.6875, + "learning_rate": 0.0004677312046166996, + "loss": 0.2048, + "step": 118520 + }, + { + "epoch": 4.91, + "grad_norm": 0.85546875, + "learning_rate": 0.0004677258749065319, + "loss": 0.233, + "step": 118530 + }, + { + "epoch": 4.91, + "grad_norm": 1.0, + "learning_rate": 0.0004677205447866275, + "loss": 0.2159, + "step": 118540 + }, + { + "epoch": 4.91, + "grad_norm": 0.71484375, + "learning_rate": 0.0004677152142569967, + "loss": 0.193, + "step": 118550 + }, + { + "epoch": 4.91, + "grad_norm": 0.90234375, + "learning_rate": 0.0004677098833176493, + "loss": 0.1871, + "step": 118560 + }, + { + "epoch": 4.91, + "grad_norm": 0.443359375, + "learning_rate": 0.0004677045519685954, + "loss": 0.1479, + "step": 118570 + }, + { + "epoch": 4.91, + "grad_norm": 0.66015625, + "learning_rate": 0.000467699220209845, + "loss": 0.2362, + "step": 118580 + }, + { + "epoch": 4.91, + "grad_norm": 0.515625, + "learning_rate": 0.00046769388804140824, + "loss": 0.1941, + "step": 118590 + }, + { + "epoch": 4.91, + "grad_norm": 0.6328125, + "learning_rate": 0.0004676885554632951, + "loss": 0.2182, + "step": 118600 + }, + { + "epoch": 4.91, + "grad_norm": 1.1640625, + "learning_rate": 0.00046768322247551554, + "loss": 0.2287, + "step": 118610 + }, + { + "epoch": 4.91, + "grad_norm": 0.56640625, + "learning_rate": 0.0004676778890780797, + "loss": 0.2282, + "step": 118620 + }, + { + "epoch": 4.91, + "grad_norm": 0.70703125, + "learning_rate": 0.0004676725552709976, + "loss": 0.2381, + "step": 118630 + }, + { + "epoch": 4.91, + "grad_norm": 1.015625, + "learning_rate": 0.00046766722105427927, + "loss": 0.1782, + "step": 118640 + }, + { + "epoch": 4.91, + "grad_norm": 0.69921875, + "learning_rate": 0.0004676618864279347, + "loss": 0.1758, + "step": 118650 + }, + { + "epoch": 4.91, + "grad_norm": 0.6171875, + "learning_rate": 0.000467656551391974, + "loss": 0.2189, + "step": 118660 + }, + { + "epoch": 4.92, + "grad_norm": 0.2578125, + "learning_rate": 0.00046765121594640716, + "loss": 0.1629, + "step": 118670 + }, + { + "epoch": 4.92, + "grad_norm": 1.1640625, + "learning_rate": 0.0004676458800912442, + "loss": 0.185, + "step": 118680 + }, + { + "epoch": 4.92, + "grad_norm": 0.66796875, + "learning_rate": 0.00046764054382649524, + "loss": 0.217, + "step": 118690 + }, + { + "epoch": 4.92, + "grad_norm": 0.470703125, + "learning_rate": 0.00046763520715217025, + "loss": 0.2438, + "step": 118700 + }, + { + "epoch": 4.92, + "grad_norm": 1.0, + "learning_rate": 0.00046762987006827936, + "loss": 0.1968, + "step": 118710 + }, + { + "epoch": 4.92, + "grad_norm": 0.99609375, + "learning_rate": 0.0004676245325748325, + "loss": 0.1314, + "step": 118720 + }, + { + "epoch": 4.92, + "grad_norm": 0.62890625, + "learning_rate": 0.0004676191946718398, + "loss": 0.2413, + "step": 118730 + }, + { + "epoch": 4.92, + "grad_norm": 0.98046875, + "learning_rate": 0.0004676138563593112, + "loss": 0.2127, + "step": 118740 + }, + { + "epoch": 4.92, + "grad_norm": 0.71484375, + "learning_rate": 0.00046760851763725695, + "loss": 0.2022, + "step": 118750 + }, + { + "epoch": 4.92, + "grad_norm": 1.421875, + "learning_rate": 0.0004676031785056869, + "loss": 0.2112, + "step": 118760 + }, + { + "epoch": 4.92, + "grad_norm": 0.59375, + "learning_rate": 0.0004675978389646111, + "loss": 0.1543, + "step": 118770 + }, + { + "epoch": 4.92, + "grad_norm": 0.5703125, + "learning_rate": 0.0004675924990140398, + "loss": 0.1924, + "step": 118780 + }, + { + "epoch": 4.92, + "grad_norm": 0.388671875, + "learning_rate": 0.00046758715865398285, + "loss": 0.1987, + "step": 118790 + }, + { + "epoch": 4.92, + "grad_norm": 0.7890625, + "learning_rate": 0.00046758181788445033, + "loss": 0.2298, + "step": 118800 + }, + { + "epoch": 4.92, + "grad_norm": 0.78515625, + "learning_rate": 0.00046757647670545233, + "loss": 0.2497, + "step": 118810 + }, + { + "epoch": 4.92, + "grad_norm": 1.125, + "learning_rate": 0.000467571135116999, + "loss": 0.2272, + "step": 118820 + }, + { + "epoch": 4.92, + "grad_norm": 0.484375, + "learning_rate": 0.0004675657931191002, + "loss": 0.206, + "step": 118830 + }, + { + "epoch": 4.92, + "grad_norm": 1.3984375, + "learning_rate": 0.00046756045071176603, + "loss": 0.1798, + "step": 118840 + }, + { + "epoch": 4.92, + "grad_norm": 0.6171875, + "learning_rate": 0.00046755510789500665, + "loss": 0.2114, + "step": 118850 + }, + { + "epoch": 4.92, + "grad_norm": 0.69140625, + "learning_rate": 0.0004675497646688321, + "loss": 0.1963, + "step": 118860 + }, + { + "epoch": 4.92, + "grad_norm": 0.8515625, + "learning_rate": 0.00046754442103325234, + "loss": 0.2213, + "step": 118870 + }, + { + "epoch": 4.92, + "grad_norm": 1.0390625, + "learning_rate": 0.00046753907698827746, + "loss": 0.2604, + "step": 118880 + }, + { + "epoch": 4.92, + "grad_norm": 0.65625, + "learning_rate": 0.0004675337325339175, + "loss": 0.1917, + "step": 118890 + }, + { + "epoch": 4.92, + "grad_norm": 0.5546875, + "learning_rate": 0.00046752838767018256, + "loss": 0.1847, + "step": 118900 + }, + { + "epoch": 4.93, + "grad_norm": 0.57421875, + "learning_rate": 0.00046752304239708277, + "loss": 0.2427, + "step": 118910 + }, + { + "epoch": 4.93, + "grad_norm": 0.498046875, + "learning_rate": 0.000467517696714628, + "loss": 0.2255, + "step": 118920 + }, + { + "epoch": 4.93, + "grad_norm": 1.1484375, + "learning_rate": 0.00046751235062282847, + "loss": 0.1692, + "step": 118930 + }, + { + "epoch": 4.93, + "grad_norm": 0.6328125, + "learning_rate": 0.0004675070041216942, + "loss": 0.249, + "step": 118940 + }, + { + "epoch": 4.93, + "grad_norm": 0.51953125, + "learning_rate": 0.0004675016572112352, + "loss": 0.2362, + "step": 118950 + }, + { + "epoch": 4.93, + "grad_norm": 0.0, + "learning_rate": 0.0004674963098914616, + "loss": 0.188, + "step": 118960 + }, + { + "epoch": 4.93, + "grad_norm": 0.72265625, + "learning_rate": 0.0004674909621623834, + "loss": 0.1553, + "step": 118970 + }, + { + "epoch": 4.93, + "grad_norm": 0.6015625, + "learning_rate": 0.0004674856140240107, + "loss": 0.2011, + "step": 118980 + }, + { + "epoch": 4.93, + "grad_norm": 2.390625, + "learning_rate": 0.00046748026547635356, + "loss": 0.2105, + "step": 118990 + }, + { + "epoch": 4.93, + "grad_norm": 4.875, + "learning_rate": 0.0004674749165194221, + "loss": 0.2692, + "step": 119000 + }, + { + "epoch": 4.93, + "grad_norm": 0.7421875, + "learning_rate": 0.0004674695671532262, + "loss": 0.2017, + "step": 119010 + }, + { + "epoch": 4.93, + "grad_norm": 1.0625, + "learning_rate": 0.00046746421737777616, + "loss": 0.2078, + "step": 119020 + }, + { + "epoch": 4.93, + "grad_norm": 0.234375, + "learning_rate": 0.00046745886719308196, + "loss": 0.1501, + "step": 119030 + }, + { + "epoch": 4.93, + "grad_norm": 0.578125, + "learning_rate": 0.0004674535165991536, + "loss": 0.1742, + "step": 119040 + }, + { + "epoch": 4.93, + "grad_norm": 0.7265625, + "learning_rate": 0.00046744816559600124, + "loss": 0.2141, + "step": 119050 + }, + { + "epoch": 4.93, + "grad_norm": 0.490234375, + "learning_rate": 0.0004674428141836349, + "loss": 0.236, + "step": 119060 + }, + { + "epoch": 4.93, + "grad_norm": 0.546875, + "learning_rate": 0.00046743746236206467, + "loss": 0.1868, + "step": 119070 + }, + { + "epoch": 4.93, + "grad_norm": 0.515625, + "learning_rate": 0.00046743211013130066, + "loss": 0.1917, + "step": 119080 + }, + { + "epoch": 4.93, + "grad_norm": 1.140625, + "learning_rate": 0.0004674267574913529, + "loss": 0.2062, + "step": 119090 + }, + { + "epoch": 4.93, + "grad_norm": 0.73828125, + "learning_rate": 0.00046742140444223137, + "loss": 0.2037, + "step": 119100 + }, + { + "epoch": 4.93, + "grad_norm": 0.7890625, + "learning_rate": 0.0004674160509839463, + "loss": 0.1871, + "step": 119110 + }, + { + "epoch": 4.93, + "grad_norm": 0.33984375, + "learning_rate": 0.00046741069711650767, + "loss": 0.2315, + "step": 119120 + }, + { + "epoch": 4.93, + "grad_norm": 0.47265625, + "learning_rate": 0.00046740534283992567, + "loss": 0.2087, + "step": 119130 + }, + { + "epoch": 4.93, + "grad_norm": 0.765625, + "learning_rate": 0.00046739998815421017, + "loss": 0.1726, + "step": 119140 + }, + { + "epoch": 4.94, + "grad_norm": 0.5703125, + "learning_rate": 0.00046739463305937146, + "loss": 0.1926, + "step": 119150 + }, + { + "epoch": 4.94, + "grad_norm": 0.91796875, + "learning_rate": 0.0004673892775554195, + "loss": 0.2438, + "step": 119160 + }, + { + "epoch": 4.94, + "grad_norm": 1.125, + "learning_rate": 0.0004673839216423643, + "loss": 0.2302, + "step": 119170 + }, + { + "epoch": 4.94, + "grad_norm": 0.0004634857177734375, + "learning_rate": 0.00046737856532021616, + "loss": 0.148, + "step": 119180 + }, + { + "epoch": 4.94, + "grad_norm": 0.60546875, + "learning_rate": 0.00046737320858898505, + "loss": 0.1991, + "step": 119190 + }, + { + "epoch": 4.94, + "grad_norm": 0.27734375, + "learning_rate": 0.000467367851448681, + "loss": 0.1914, + "step": 119200 + }, + { + "epoch": 4.94, + "grad_norm": 0.5625, + "learning_rate": 0.00046736249389931405, + "loss": 0.2032, + "step": 119210 + }, + { + "epoch": 4.94, + "grad_norm": 0.76953125, + "learning_rate": 0.00046735713594089445, + "loss": 0.2052, + "step": 119220 + }, + { + "epoch": 4.94, + "grad_norm": 0.625, + "learning_rate": 0.00046735177757343215, + "loss": 0.2335, + "step": 119230 + }, + { + "epoch": 4.94, + "grad_norm": 0.6953125, + "learning_rate": 0.0004673464187969373, + "loss": 0.1777, + "step": 119240 + }, + { + "epoch": 4.94, + "grad_norm": 0.671875, + "learning_rate": 0.00046734105961141994, + "loss": 0.2043, + "step": 119250 + }, + { + "epoch": 4.94, + "grad_norm": 0.4453125, + "learning_rate": 0.0004673357000168902, + "loss": 0.2313, + "step": 119260 + }, + { + "epoch": 4.94, + "grad_norm": 0.84375, + "learning_rate": 0.0004673303400133582, + "loss": 0.2295, + "step": 119270 + }, + { + "epoch": 4.94, + "grad_norm": 1.03125, + "learning_rate": 0.00046732497960083385, + "loss": 0.2293, + "step": 119280 + }, + { + "epoch": 4.94, + "grad_norm": 0.515625, + "learning_rate": 0.00046731961877932737, + "loss": 0.19, + "step": 119290 + }, + { + "epoch": 4.94, + "grad_norm": 0.71875, + "learning_rate": 0.0004673142575488489, + "loss": 0.231, + "step": 119300 + }, + { + "epoch": 4.94, + "grad_norm": 0.703125, + "learning_rate": 0.00046730889590940845, + "loss": 0.235, + "step": 119310 + }, + { + "epoch": 4.94, + "grad_norm": 1.15625, + "learning_rate": 0.00046730353386101607, + "loss": 0.21, + "step": 119320 + }, + { + "epoch": 4.94, + "grad_norm": 0.470703125, + "learning_rate": 0.00046729817140368204, + "loss": 0.1776, + "step": 119330 + }, + { + "epoch": 4.94, + "grad_norm": 0.87890625, + "learning_rate": 0.0004672928085374162, + "loss": 0.2692, + "step": 119340 + }, + { + "epoch": 4.94, + "grad_norm": 0.458984375, + "learning_rate": 0.00046728744526222877, + "loss": 0.19, + "step": 119350 + }, + { + "epoch": 4.94, + "grad_norm": 0.40625, + "learning_rate": 0.00046728208157812985, + "loss": 0.1623, + "step": 119360 + }, + { + "epoch": 4.94, + "grad_norm": 0.6953125, + "learning_rate": 0.00046727671748512955, + "loss": 0.2465, + "step": 119370 + }, + { + "epoch": 4.94, + "grad_norm": 1.2421875, + "learning_rate": 0.00046727135298323786, + "loss": 0.2212, + "step": 119380 + }, + { + "epoch": 4.95, + "grad_norm": 0.78515625, + "learning_rate": 0.000467265988072465, + "loss": 0.1768, + "step": 119390 + }, + { + "epoch": 4.95, + "grad_norm": 0.232421875, + "learning_rate": 0.000467260622752821, + "loss": 0.1905, + "step": 119400 + }, + { + "epoch": 4.95, + "grad_norm": 0.67578125, + "learning_rate": 0.00046725525702431597, + "loss": 0.2511, + "step": 119410 + }, + { + "epoch": 4.95, + "grad_norm": 0.7109375, + "learning_rate": 0.00046724989088696, + "loss": 0.2194, + "step": 119420 + }, + { + "epoch": 4.95, + "grad_norm": 0.60546875, + "learning_rate": 0.00046724452434076326, + "loss": 0.2171, + "step": 119430 + }, + { + "epoch": 4.95, + "grad_norm": 0.90625, + "learning_rate": 0.0004672391573857357, + "loss": 0.2031, + "step": 119440 + }, + { + "epoch": 4.95, + "grad_norm": 0.8203125, + "learning_rate": 0.00046723379002188754, + "loss": 0.1259, + "step": 119450 + }, + { + "epoch": 4.95, + "grad_norm": 0.65234375, + "learning_rate": 0.0004672284222492288, + "loss": 0.2505, + "step": 119460 + }, + { + "epoch": 4.95, + "grad_norm": 0.83203125, + "learning_rate": 0.0004672230540677697, + "loss": 0.2099, + "step": 119470 + }, + { + "epoch": 4.95, + "grad_norm": 0.640625, + "learning_rate": 0.00046721768547752025, + "loss": 0.1788, + "step": 119480 + }, + { + "epoch": 4.95, + "grad_norm": 0.625, + "learning_rate": 0.00046721231647849055, + "loss": 0.2045, + "step": 119490 + }, + { + "epoch": 4.95, + "grad_norm": 0.66796875, + "learning_rate": 0.00046720694707069076, + "loss": 0.1804, + "step": 119500 + }, + { + "epoch": 4.95, + "grad_norm": 1.046875, + "learning_rate": 0.00046720157725413093, + "loss": 0.2526, + "step": 119510 + }, + { + "epoch": 4.95, + "grad_norm": 1.3359375, + "learning_rate": 0.00046719620702882113, + "loss": 0.2729, + "step": 119520 + }, + { + "epoch": 4.95, + "grad_norm": 0.6875, + "learning_rate": 0.00046719083639477167, + "loss": 0.2464, + "step": 119530 + }, + { + "epoch": 4.95, + "grad_norm": 0.7734375, + "learning_rate": 0.00046718546535199235, + "loss": 0.217, + "step": 119540 + }, + { + "epoch": 4.95, + "grad_norm": 0.5859375, + "learning_rate": 0.00046718009390049354, + "loss": 0.1941, + "step": 119550 + }, + { + "epoch": 4.95, + "grad_norm": 0.6015625, + "learning_rate": 0.0004671747220402852, + "loss": 0.2238, + "step": 119560 + }, + { + "epoch": 4.95, + "grad_norm": 0.36328125, + "learning_rate": 0.0004671693497713775, + "loss": 0.2069, + "step": 119570 + }, + { + "epoch": 4.95, + "grad_norm": 0.74609375, + "learning_rate": 0.00046716397709378055, + "loss": 0.2041, + "step": 119580 + }, + { + "epoch": 4.95, + "grad_norm": 0.73046875, + "learning_rate": 0.0004671586040075045, + "loss": 0.1852, + "step": 119590 + }, + { + "epoch": 4.95, + "grad_norm": 0.59765625, + "learning_rate": 0.00046715323051255934, + "loss": 0.2246, + "step": 119600 + }, + { + "epoch": 4.95, + "grad_norm": 0.37890625, + "learning_rate": 0.0004671478566089552, + "loss": 0.2382, + "step": 119610 + }, + { + "epoch": 4.95, + "grad_norm": 1.3671875, + "learning_rate": 0.0004671424822967023, + "loss": 0.2053, + "step": 119620 + }, + { + "epoch": 4.96, + "grad_norm": 0.5546875, + "learning_rate": 0.00046713710757581074, + "loss": 0.2478, + "step": 119630 + }, + { + "epoch": 4.96, + "grad_norm": 0.435546875, + "learning_rate": 0.0004671317324462906, + "loss": 0.1985, + "step": 119640 + }, + { + "epoch": 4.96, + "grad_norm": 0.94140625, + "learning_rate": 0.00046712635690815185, + "loss": 0.2049, + "step": 119650 + }, + { + "epoch": 4.96, + "grad_norm": 0.81640625, + "learning_rate": 0.0004671209809614049, + "loss": 0.2086, + "step": 119660 + }, + { + "epoch": 4.96, + "grad_norm": 1.03125, + "learning_rate": 0.0004671156046060596, + "loss": 0.2089, + "step": 119670 + }, + { + "epoch": 4.96, + "grad_norm": 1.0, + "learning_rate": 0.00046711022784212626, + "loss": 0.2136, + "step": 119680 + }, + { + "epoch": 4.96, + "grad_norm": 0.66015625, + "learning_rate": 0.00046710485066961486, + "loss": 0.2383, + "step": 119690 + }, + { + "epoch": 4.96, + "grad_norm": 0.69140625, + "learning_rate": 0.00046709947308853564, + "loss": 0.1375, + "step": 119700 + }, + { + "epoch": 4.96, + "grad_norm": 0.31640625, + "learning_rate": 0.0004670940950988986, + "loss": 0.1746, + "step": 119710 + }, + { + "epoch": 4.96, + "grad_norm": 0.640625, + "learning_rate": 0.0004670887167007139, + "loss": 0.1984, + "step": 119720 + }, + { + "epoch": 4.96, + "grad_norm": 0.6640625, + "learning_rate": 0.00046708333789399173, + "loss": 0.1965, + "step": 119730 + }, + { + "epoch": 4.96, + "grad_norm": 1.3046875, + "learning_rate": 0.0004670779586787421, + "loss": 0.2305, + "step": 119740 + }, + { + "epoch": 4.96, + "grad_norm": 0.279296875, + "learning_rate": 0.00046707257905497526, + "loss": 0.2104, + "step": 119750 + }, + { + "epoch": 4.96, + "grad_norm": 0.55078125, + "learning_rate": 0.00046706719902270125, + "loss": 0.1979, + "step": 119760 + }, + { + "epoch": 4.96, + "grad_norm": 0.306640625, + "learning_rate": 0.0004670618185819302, + "loss": 0.1989, + "step": 119770 + }, + { + "epoch": 4.96, + "grad_norm": 1.0234375, + "learning_rate": 0.00046705643773267225, + "loss": 0.2098, + "step": 119780 + }, + { + "epoch": 4.96, + "grad_norm": 0.81640625, + "learning_rate": 0.00046705105647493753, + "loss": 0.1724, + "step": 119790 + }, + { + "epoch": 4.96, + "grad_norm": 0.6953125, + "learning_rate": 0.00046704567480873615, + "loss": 0.1859, + "step": 119800 + }, + { + "epoch": 4.96, + "grad_norm": 1.28125, + "learning_rate": 0.0004670402927340783, + "loss": 0.2459, + "step": 119810 + }, + { + "epoch": 4.96, + "grad_norm": 0.81640625, + "learning_rate": 0.000467034910250974, + "loss": 0.1854, + "step": 119820 + }, + { + "epoch": 4.96, + "grad_norm": 0.181640625, + "learning_rate": 0.00046702952735943346, + "loss": 0.1654, + "step": 119830 + }, + { + "epoch": 4.96, + "grad_norm": 1.71875, + "learning_rate": 0.00046702414405946674, + "loss": 0.2268, + "step": 119840 + }, + { + "epoch": 4.96, + "grad_norm": 0.89453125, + "learning_rate": 0.00046701876035108405, + "loss": 0.169, + "step": 119850 + }, + { + "epoch": 4.96, + "grad_norm": 1.078125, + "learning_rate": 0.0004670133762342955, + "loss": 0.2051, + "step": 119860 + }, + { + "epoch": 4.97, + "grad_norm": 0.423828125, + "learning_rate": 0.00046700799170911123, + "loss": 0.202, + "step": 119870 + }, + { + "epoch": 4.97, + "grad_norm": 0.6015625, + "learning_rate": 0.00046700260677554127, + "loss": 0.2125, + "step": 119880 + }, + { + "epoch": 4.97, + "grad_norm": 0.75, + "learning_rate": 0.00046699722143359593, + "loss": 0.1807, + "step": 119890 + }, + { + "epoch": 4.97, + "grad_norm": 0.83984375, + "learning_rate": 0.00046699183568328526, + "loss": 0.2171, + "step": 119900 + }, + { + "epoch": 4.97, + "grad_norm": 0.88671875, + "learning_rate": 0.0004669864495246193, + "loss": 0.2318, + "step": 119910 + }, + { + "epoch": 4.97, + "grad_norm": 0.49609375, + "learning_rate": 0.00046698106295760836, + "loss": 0.1742, + "step": 119920 + }, + { + "epoch": 4.97, + "grad_norm": 0.6328125, + "learning_rate": 0.0004669756759822624, + "loss": 0.1854, + "step": 119930 + }, + { + "epoch": 4.97, + "grad_norm": 1.2109375, + "learning_rate": 0.00046697028859859173, + "loss": 0.2247, + "step": 119940 + }, + { + "epoch": 4.97, + "grad_norm": 0.3671875, + "learning_rate": 0.0004669649008066064, + "loss": 0.2185, + "step": 119950 + }, + { + "epoch": 4.97, + "grad_norm": 0.431640625, + "learning_rate": 0.0004669595126063165, + "loss": 0.1947, + "step": 119960 + }, + { + "epoch": 4.97, + "grad_norm": 1.28125, + "learning_rate": 0.0004669541239977323, + "loss": 0.1764, + "step": 119970 + }, + { + "epoch": 4.97, + "grad_norm": 0.51953125, + "learning_rate": 0.00046694873498086384, + "loss": 0.228, + "step": 119980 + }, + { + "epoch": 4.97, + "grad_norm": 0.435546875, + "learning_rate": 0.0004669433455557213, + "loss": 0.2079, + "step": 119990 + }, + { + "epoch": 4.97, + "grad_norm": 0.90234375, + "learning_rate": 0.00046693795572231477, + "loss": 0.1636, + "step": 120000 + }, + { + "epoch": 4.97, + "grad_norm": 0.32421875, + "learning_rate": 0.0004669325654806544, + "loss": 0.1951, + "step": 120010 + }, + { + "epoch": 4.97, + "grad_norm": 0.73828125, + "learning_rate": 0.0004669271748307505, + "loss": 0.2226, + "step": 120020 + }, + { + "epoch": 4.97, + "grad_norm": 1.03125, + "learning_rate": 0.000466921783772613, + "loss": 0.2044, + "step": 120030 + }, + { + "epoch": 4.97, + "grad_norm": 0.50390625, + "learning_rate": 0.00046691639230625215, + "loss": 0.2333, + "step": 120040 + }, + { + "epoch": 4.97, + "grad_norm": 2.265625, + "learning_rate": 0.00046691100043167805, + "loss": 0.2084, + "step": 120050 + }, + { + "epoch": 4.97, + "grad_norm": 1.1953125, + "learning_rate": 0.0004669056081489009, + "loss": 0.2377, + "step": 120060 + }, + { + "epoch": 4.97, + "grad_norm": 0.59765625, + "learning_rate": 0.0004669002154579308, + "loss": 0.29, + "step": 120070 + }, + { + "epoch": 4.97, + "grad_norm": 0.392578125, + "learning_rate": 0.0004668948223587779, + "loss": 0.1755, + "step": 120080 + }, + { + "epoch": 4.97, + "grad_norm": 0.26953125, + "learning_rate": 0.0004668894288514524, + "loss": 0.2263, + "step": 120090 + }, + { + "epoch": 4.97, + "grad_norm": 0.435546875, + "learning_rate": 0.0004668840349359644, + "loss": 0.2052, + "step": 120100 + }, + { + "epoch": 4.97, + "grad_norm": 0.6796875, + "learning_rate": 0.0004668786406123241, + "loss": 0.2532, + "step": 120110 + }, + { + "epoch": 4.98, + "grad_norm": 0.9140625, + "learning_rate": 0.0004668732458805416, + "loss": 0.2061, + "step": 120120 + }, + { + "epoch": 4.98, + "grad_norm": 0.87109375, + "learning_rate": 0.000466867850740627, + "loss": 0.1965, + "step": 120130 + }, + { + "epoch": 4.98, + "grad_norm": 0.69140625, + "learning_rate": 0.0004668624551925906, + "loss": 0.2004, + "step": 120140 + }, + { + "epoch": 4.98, + "grad_norm": 0.205078125, + "learning_rate": 0.0004668570592364425, + "loss": 0.2417, + "step": 120150 + }, + { + "epoch": 4.98, + "grad_norm": 0.31640625, + "learning_rate": 0.00046685166287219285, + "loss": 0.1935, + "step": 120160 + }, + { + "epoch": 4.98, + "grad_norm": 0.2080078125, + "learning_rate": 0.0004668462660998517, + "loss": 0.1748, + "step": 120170 + }, + { + "epoch": 4.98, + "grad_norm": 0.828125, + "learning_rate": 0.00046684086891942935, + "loss": 0.2191, + "step": 120180 + }, + { + "epoch": 4.98, + "grad_norm": 0.470703125, + "learning_rate": 0.00046683547133093584, + "loss": 0.2575, + "step": 120190 + }, + { + "epoch": 4.98, + "grad_norm": 1.0078125, + "learning_rate": 0.00046683007333438143, + "loss": 0.2082, + "step": 120200 + }, + { + "epoch": 4.98, + "grad_norm": 0.458984375, + "learning_rate": 0.00046682467492977625, + "loss": 0.2376, + "step": 120210 + }, + { + "epoch": 4.98, + "grad_norm": 1.4375, + "learning_rate": 0.0004668192761171305, + "loss": 0.2537, + "step": 120220 + }, + { + "epoch": 4.98, + "grad_norm": 0.63671875, + "learning_rate": 0.00046681387689645416, + "loss": 0.2194, + "step": 120230 + }, + { + "epoch": 4.98, + "grad_norm": 0.78125, + "learning_rate": 0.00046680847726775763, + "loss": 0.2292, + "step": 120240 + }, + { + "epoch": 4.98, + "grad_norm": 0.47265625, + "learning_rate": 0.0004668030772310509, + "loss": 0.1662, + "step": 120250 + }, + { + "epoch": 4.98, + "grad_norm": 0.66796875, + "learning_rate": 0.0004667976767863442, + "loss": 0.1897, + "step": 120260 + }, + { + "epoch": 4.98, + "grad_norm": 0.2060546875, + "learning_rate": 0.00046679227593364773, + "loss": 0.2263, + "step": 120270 + }, + { + "epoch": 4.98, + "grad_norm": 0.0, + "learning_rate": 0.00046678687467297154, + "loss": 0.2444, + "step": 120280 + }, + { + "epoch": 4.98, + "grad_norm": 1.0, + "learning_rate": 0.00046678147300432585, + "loss": 0.2121, + "step": 120290 + }, + { + "epoch": 4.98, + "grad_norm": 0.86328125, + "learning_rate": 0.0004667760709277209, + "loss": 0.2257, + "step": 120300 + }, + { + "epoch": 4.98, + "grad_norm": 1.21875, + "learning_rate": 0.00046677066844316674, + "loss": 0.2265, + "step": 120310 + }, + { + "epoch": 4.98, + "grad_norm": 0.64453125, + "learning_rate": 0.0004667652655506737, + "loss": 0.2269, + "step": 120320 + }, + { + "epoch": 4.98, + "grad_norm": 0.515625, + "learning_rate": 0.0004667598622502517, + "loss": 0.2166, + "step": 120330 + }, + { + "epoch": 4.98, + "grad_norm": 0.68359375, + "learning_rate": 0.00046675445854191114, + "loss": 0.209, + "step": 120340 + }, + { + "epoch": 4.98, + "grad_norm": 0.72265625, + "learning_rate": 0.0004667490544256621, + "loss": 0.2366, + "step": 120350 + }, + { + "epoch": 4.99, + "grad_norm": 0.53515625, + "learning_rate": 0.00046674364990151464, + "loss": 0.2299, + "step": 120360 + }, + { + "epoch": 4.99, + "grad_norm": 0.4609375, + "learning_rate": 0.0004667382449694791, + "loss": 0.1972, + "step": 120370 + }, + { + "epoch": 4.99, + "grad_norm": 4.125, + "learning_rate": 0.0004667328396295656, + "loss": 0.2156, + "step": 120380 + }, + { + "epoch": 4.99, + "grad_norm": 0.4453125, + "learning_rate": 0.0004667274338817843, + "loss": 0.1833, + "step": 120390 + }, + { + "epoch": 4.99, + "grad_norm": 0.71484375, + "learning_rate": 0.00046672202772614536, + "loss": 0.1728, + "step": 120400 + }, + { + "epoch": 4.99, + "grad_norm": 2.078125, + "learning_rate": 0.000466716621162659, + "loss": 0.2233, + "step": 120410 + }, + { + "epoch": 4.99, + "grad_norm": 1.125, + "learning_rate": 0.0004667112141913353, + "loss": 0.2408, + "step": 120420 + }, + { + "epoch": 4.99, + "grad_norm": 0.400390625, + "learning_rate": 0.0004667058068121846, + "loss": 0.2064, + "step": 120430 + }, + { + "epoch": 4.99, + "grad_norm": 0.486328125, + "learning_rate": 0.0004667003990252169, + "loss": 0.1877, + "step": 120440 + }, + { + "epoch": 4.99, + "grad_norm": 1.6484375, + "learning_rate": 0.00046669499083044246, + "loss": 0.2018, + "step": 120450 + }, + { + "epoch": 4.99, + "grad_norm": 0.59765625, + "learning_rate": 0.00046668958222787144, + "loss": 0.1872, + "step": 120460 + }, + { + "epoch": 4.99, + "grad_norm": 1.2265625, + "learning_rate": 0.0004666841732175141, + "loss": 0.2439, + "step": 120470 + }, + { + "epoch": 4.99, + "grad_norm": 0.44921875, + "learning_rate": 0.00046667876379938045, + "loss": 0.2429, + "step": 120480 + }, + { + "epoch": 4.99, + "grad_norm": 0.703125, + "learning_rate": 0.00046667335397348076, + "loss": 0.2325, + "step": 120490 + }, + { + "epoch": 4.99, + "grad_norm": 0.25, + "learning_rate": 0.0004666679437398253, + "loss": 0.1812, + "step": 120500 + }, + { + "epoch": 4.99, + "grad_norm": 0.462890625, + "learning_rate": 0.0004666625330984241, + "loss": 0.1836, + "step": 120510 + }, + { + "epoch": 4.99, + "grad_norm": 2.25, + "learning_rate": 0.00046665712204928744, + "loss": 0.2656, + "step": 120520 + }, + { + "epoch": 4.99, + "grad_norm": 1.28125, + "learning_rate": 0.0004666517105924255, + "loss": 0.2353, + "step": 120530 + }, + { + "epoch": 4.99, + "grad_norm": 0.59375, + "learning_rate": 0.00046664629872784837, + "loss": 0.215, + "step": 120540 + }, + { + "epoch": 4.99, + "grad_norm": 0.72265625, + "learning_rate": 0.00046664088645556633, + "loss": 0.1669, + "step": 120550 + }, + { + "epoch": 4.99, + "grad_norm": 1.1015625, + "learning_rate": 0.0004666354737755896, + "loss": 0.1966, + "step": 120560 + }, + { + "epoch": 4.99, + "grad_norm": 0.55859375, + "learning_rate": 0.00046663006068792826, + "loss": 0.2193, + "step": 120570 + }, + { + "epoch": 4.99, + "grad_norm": 0.859375, + "learning_rate": 0.00046662464719259244, + "loss": 0.1603, + "step": 120580 + }, + { + "epoch": 4.99, + "grad_norm": 0.4453125, + "learning_rate": 0.00046661923328959254, + "loss": 0.1765, + "step": 120590 + }, + { + "epoch": 5.0, + "grad_norm": 2.390625, + "learning_rate": 0.0004666138189789386, + "loss": 0.2556, + "step": 120600 + }, + { + "epoch": 5.0, + "grad_norm": 0.5546875, + "learning_rate": 0.0004666084042606409, + "loss": 0.1317, + "step": 120610 + }, + { + "epoch": 5.0, + "grad_norm": 0.2451171875, + "learning_rate": 0.00046660298913470957, + "loss": 0.1899, + "step": 120620 + }, + { + "epoch": 5.0, + "grad_norm": 0.55078125, + "learning_rate": 0.0004665975736011547, + "loss": 0.1869, + "step": 120630 + }, + { + "epoch": 5.0, + "grad_norm": 0.76171875, + "learning_rate": 0.0004665921576599867, + "loss": 0.1985, + "step": 120640 + }, + { + "epoch": 5.0, + "grad_norm": 1.125, + "learning_rate": 0.0004665867413112156, + "loss": 0.2366, + "step": 120650 + }, + { + "epoch": 5.0, + "grad_norm": 0.703125, + "learning_rate": 0.00046658132455485156, + "loss": 0.1787, + "step": 120660 + }, + { + "epoch": 5.0, + "grad_norm": 0.49609375, + "learning_rate": 0.00046657590739090503, + "loss": 0.1781, + "step": 120670 + }, + { + "epoch": 5.0, + "grad_norm": 0.5625, + "learning_rate": 0.0004665704898193859, + "loss": 0.1872, + "step": 120680 + }, + { + "epoch": 5.0, + "grad_norm": 0.45703125, + "learning_rate": 0.00046656507184030454, + "loss": 0.2261, + "step": 120690 + }, + { + "epoch": 5.0, + "grad_norm": 0.625, + "learning_rate": 0.0004665596534536711, + "loss": 0.1959, + "step": 120700 + }, + { + "epoch": 5.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.00046655423465949577, + "loss": 0.188, + "step": 120710 + }, + { + "epoch": 5.0, + "grad_norm": 0.65625, + "learning_rate": 0.0004665488154577888, + "loss": 0.2252, + "step": 120720 + }, + { + "epoch": 5.0, + "grad_norm": 0.53515625, + "learning_rate": 0.0004665433958485603, + "loss": 0.1994, + "step": 120730 + }, + { + "epoch": 5.0, + "grad_norm": 0.80078125, + "learning_rate": 0.0004665379758318205, + "loss": 0.1825, + "step": 120740 + }, + { + "epoch": 5.0, + "grad_norm": 1.2734375, + "learning_rate": 0.00046653255540757965, + "loss": 0.1938, + "step": 120750 + }, + { + "epoch": 5.0, + "grad_norm": 1.3515625, + "learning_rate": 0.00046652713457584797, + "loss": 0.1626, + "step": 120760 + }, + { + "epoch": 5.0, + "grad_norm": 0.984375, + "learning_rate": 0.00046652171333663544, + "loss": 0.1977, + "step": 120770 + }, + { + "epoch": 5.0, + "grad_norm": 0.5234375, + "learning_rate": 0.00046651629168995256, + "loss": 0.1632, + "step": 120780 + }, + { + "epoch": 5.0, + "grad_norm": 0.400390625, + "learning_rate": 0.00046651086963580933, + "loss": 0.1447, + "step": 120790 + }, + { + "epoch": 5.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.00046650544717421604, + "loss": 0.1765, + "step": 120800 + }, + { + "epoch": 5.0, + "grad_norm": 1.0859375, + "learning_rate": 0.0004665000243051829, + "loss": 0.2307, + "step": 120810 + }, + { + "epoch": 5.0, + "grad_norm": 0.76171875, + "learning_rate": 0.0004664946010287201, + "loss": 0.2311, + "step": 120820 + }, + { + "epoch": 5.0, + "grad_norm": 0.50390625, + "learning_rate": 0.00046648917734483787, + "loss": 0.1729, + "step": 120830 + }, + { + "epoch": 5.01, + "grad_norm": 0.82421875, + "learning_rate": 0.0004664837532535463, + "loss": 0.2345, + "step": 120840 + }, + { + "epoch": 5.01, + "grad_norm": 0.85546875, + "learning_rate": 0.00046647832875485574, + "loss": 0.224, + "step": 120850 + }, + { + "epoch": 5.01, + "grad_norm": 0.85546875, + "learning_rate": 0.00046647290384877637, + "loss": 0.2309, + "step": 120860 + }, + { + "epoch": 5.01, + "grad_norm": 0.57421875, + "learning_rate": 0.00046646747853531834, + "loss": 0.2506, + "step": 120870 + }, + { + "epoch": 5.01, + "grad_norm": 0.419921875, + "learning_rate": 0.0004664620528144919, + "loss": 0.1849, + "step": 120880 + }, + { + "epoch": 5.01, + "grad_norm": 1.0078125, + "learning_rate": 0.00046645662668630725, + "loss": 0.1615, + "step": 120890 + }, + { + "epoch": 5.01, + "grad_norm": 0.51953125, + "learning_rate": 0.0004664512001507746, + "loss": 0.206, + "step": 120900 + }, + { + "epoch": 5.01, + "grad_norm": 0.41015625, + "learning_rate": 0.0004664457732079042, + "loss": 0.2335, + "step": 120910 + }, + { + "epoch": 5.01, + "grad_norm": 0.486328125, + "learning_rate": 0.0004664403458577061, + "loss": 0.212, + "step": 120920 + }, + { + "epoch": 5.01, + "grad_norm": 0.86328125, + "learning_rate": 0.00046643491810019076, + "loss": 0.2294, + "step": 120930 + }, + { + "epoch": 5.01, + "grad_norm": 1.2421875, + "learning_rate": 0.0004664294899353683, + "loss": 0.1667, + "step": 120940 + }, + { + "epoch": 5.01, + "grad_norm": 0.490234375, + "learning_rate": 0.00046642406136324885, + "loss": 0.2102, + "step": 120950 + }, + { + "epoch": 5.01, + "grad_norm": 0.72265625, + "learning_rate": 0.00046641863238384263, + "loss": 0.2137, + "step": 120960 + }, + { + "epoch": 5.01, + "grad_norm": 0.7578125, + "learning_rate": 0.00046641320299716004, + "loss": 0.1813, + "step": 120970 + }, + { + "epoch": 5.01, + "grad_norm": 0.482421875, + "learning_rate": 0.00046640777320321116, + "loss": 0.1878, + "step": 120980 + }, + { + "epoch": 5.01, + "grad_norm": 0.5625, + "learning_rate": 0.0004664023430020061, + "loss": 0.182, + "step": 120990 + }, + { + "epoch": 5.01, + "grad_norm": 0.9765625, + "learning_rate": 0.0004663969123935553, + "loss": 0.1721, + "step": 121000 + }, + { + "epoch": 5.01, + "grad_norm": 1.1328125, + "learning_rate": 0.0004663914813778689, + "loss": 0.1487, + "step": 121010 + }, + { + "epoch": 5.01, + "grad_norm": 1.140625, + "learning_rate": 0.00046638604995495706, + "loss": 0.2151, + "step": 121020 + }, + { + "epoch": 5.01, + "grad_norm": 0.67578125, + "learning_rate": 0.00046638061812483004, + "loss": 0.2017, + "step": 121030 + }, + { + "epoch": 5.01, + "grad_norm": 0.58203125, + "learning_rate": 0.00046637518588749804, + "loss": 0.2139, + "step": 121040 + }, + { + "epoch": 5.01, + "grad_norm": 0.515625, + "learning_rate": 0.0004663697532429713, + "loss": 0.1804, + "step": 121050 + }, + { + "epoch": 5.01, + "grad_norm": 1.0234375, + "learning_rate": 0.00046636432019126015, + "loss": 0.14, + "step": 121060 + }, + { + "epoch": 5.01, + "grad_norm": 0.88671875, + "learning_rate": 0.0004663588867323746, + "loss": 0.1531, + "step": 121070 + }, + { + "epoch": 5.02, + "grad_norm": 0.78125, + "learning_rate": 0.00046635345286632503, + "loss": 0.2093, + "step": 121080 + }, + { + "epoch": 5.02, + "grad_norm": 0.671875, + "learning_rate": 0.00046634801859312167, + "loss": 0.264, + "step": 121090 + }, + { + "epoch": 5.02, + "grad_norm": 1.15625, + "learning_rate": 0.00046634258391277465, + "loss": 0.2324, + "step": 121100 + }, + { + "epoch": 5.02, + "grad_norm": 0.72265625, + "learning_rate": 0.00046633714882529426, + "loss": 0.154, + "step": 121110 + }, + { + "epoch": 5.02, + "grad_norm": 0.6640625, + "learning_rate": 0.0004663317133306907, + "loss": 0.1995, + "step": 121120 + }, + { + "epoch": 5.02, + "grad_norm": 0.40625, + "learning_rate": 0.00046632627742897425, + "loss": 0.2082, + "step": 121130 + }, + { + "epoch": 5.02, + "grad_norm": 0.9765625, + "learning_rate": 0.0004663208411201552, + "loss": 0.1924, + "step": 121140 + }, + { + "epoch": 5.02, + "grad_norm": 0.45703125, + "learning_rate": 0.00046631540440424346, + "loss": 0.2032, + "step": 121150 + }, + { + "epoch": 5.02, + "grad_norm": 0.53515625, + "learning_rate": 0.00046630996728124966, + "loss": 0.1955, + "step": 121160 + }, + { + "epoch": 5.02, + "grad_norm": 1.1015625, + "learning_rate": 0.0004663045297511838, + "loss": 0.1894, + "step": 121170 + }, + { + "epoch": 5.02, + "grad_norm": 0.51953125, + "learning_rate": 0.00046629909181405624, + "loss": 0.2341, + "step": 121180 + }, + { + "epoch": 5.02, + "grad_norm": 0.90234375, + "learning_rate": 0.00046629365346987706, + "loss": 0.1922, + "step": 121190 + }, + { + "epoch": 5.02, + "grad_norm": 0.56640625, + "learning_rate": 0.0004662882147186566, + "loss": 0.2318, + "step": 121200 + }, + { + "epoch": 5.02, + "grad_norm": 1.171875, + "learning_rate": 0.0004662827755604051, + "loss": 0.2451, + "step": 121210 + }, + { + "epoch": 5.02, + "grad_norm": 0.88671875, + "learning_rate": 0.0004662773359951328, + "loss": 0.1811, + "step": 121220 + }, + { + "epoch": 5.02, + "grad_norm": 0.73828125, + "learning_rate": 0.00046627189602284987, + "loss": 0.2059, + "step": 121230 + }, + { + "epoch": 5.02, + "grad_norm": 0.640625, + "learning_rate": 0.0004662664556435666, + "loss": 0.2177, + "step": 121240 + }, + { + "epoch": 5.02, + "grad_norm": 2.0625, + "learning_rate": 0.0004662610148572932, + "loss": 0.2587, + "step": 121250 + }, + { + "epoch": 5.02, + "grad_norm": 0.91796875, + "learning_rate": 0.00046625557366403994, + "loss": 0.1885, + "step": 121260 + }, + { + "epoch": 5.02, + "grad_norm": 1.9296875, + "learning_rate": 0.00046625013206381705, + "loss": 0.1966, + "step": 121270 + }, + { + "epoch": 5.02, + "grad_norm": 1.5078125, + "learning_rate": 0.0004662446900566347, + "loss": 0.2202, + "step": 121280 + }, + { + "epoch": 5.02, + "grad_norm": 0.5234375, + "learning_rate": 0.00046623924764250326, + "loss": 0.216, + "step": 121290 + }, + { + "epoch": 5.02, + "grad_norm": 0.6484375, + "learning_rate": 0.0004662338048214329, + "loss": 0.2189, + "step": 121300 + }, + { + "epoch": 5.02, + "grad_norm": 0.375, + "learning_rate": 0.0004662283615934339, + "loss": 0.217, + "step": 121310 + }, + { + "epoch": 5.03, + "grad_norm": 1.09375, + "learning_rate": 0.00046622291795851646, + "loss": 0.2265, + "step": 121320 + }, + { + "epoch": 5.03, + "grad_norm": 0.76171875, + "learning_rate": 0.0004662174739166908, + "loss": 0.245, + "step": 121330 + }, + { + "epoch": 5.03, + "grad_norm": 0.455078125, + "learning_rate": 0.0004662120294679672, + "loss": 0.2073, + "step": 121340 + }, + { + "epoch": 5.03, + "grad_norm": 0.29296875, + "learning_rate": 0.00046620658461235596, + "loss": 0.1524, + "step": 121350 + }, + { + "epoch": 5.03, + "grad_norm": 0.314453125, + "learning_rate": 0.0004662011393498672, + "loss": 0.206, + "step": 121360 + }, + { + "epoch": 5.03, + "grad_norm": 0.5546875, + "learning_rate": 0.00046619569368051134, + "loss": 0.2014, + "step": 121370 + }, + { + "epoch": 5.03, + "grad_norm": 0.5234375, + "learning_rate": 0.0004661902476042985, + "loss": 0.2266, + "step": 121380 + }, + { + "epoch": 5.03, + "grad_norm": 0.84375, + "learning_rate": 0.000466184801121239, + "loss": 0.2302, + "step": 121390 + }, + { + "epoch": 5.03, + "grad_norm": 0.5234375, + "learning_rate": 0.00046617935423134293, + "loss": 0.2336, + "step": 121400 + }, + { + "epoch": 5.03, + "grad_norm": 0.546875, + "learning_rate": 0.0004661739069346207, + "loss": 0.2105, + "step": 121410 + }, + { + "epoch": 5.03, + "grad_norm": 0.65625, + "learning_rate": 0.0004661684592310825, + "loss": 0.1569, + "step": 121420 + }, + { + "epoch": 5.03, + "grad_norm": 0.8125, + "learning_rate": 0.00046616301112073873, + "loss": 0.1696, + "step": 121430 + }, + { + "epoch": 5.03, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004661575626035994, + "loss": 0.1872, + "step": 121440 + }, + { + "epoch": 5.03, + "grad_norm": 0.2490234375, + "learning_rate": 0.0004661521136796749, + "loss": 0.2681, + "step": 121450 + }, + { + "epoch": 5.03, + "grad_norm": 0.326171875, + "learning_rate": 0.00046614666434897554, + "loss": 0.2162, + "step": 121460 + }, + { + "epoch": 5.03, + "grad_norm": 0.98828125, + "learning_rate": 0.00046614121461151145, + "loss": 0.202, + "step": 121470 + }, + { + "epoch": 5.03, + "grad_norm": 0.6640625, + "learning_rate": 0.00046613576446729286, + "loss": 0.1707, + "step": 121480 + }, + { + "epoch": 5.03, + "grad_norm": 0.6875, + "learning_rate": 0.0004661303139163302, + "loss": 0.1897, + "step": 121490 + }, + { + "epoch": 5.03, + "grad_norm": 1.40625, + "learning_rate": 0.0004661248629586336, + "loss": 0.1558, + "step": 121500 + }, + { + "epoch": 5.03, + "grad_norm": 1.125, + "learning_rate": 0.00046611941159421333, + "loss": 0.2191, + "step": 121510 + }, + { + "epoch": 5.03, + "grad_norm": 0.349609375, + "learning_rate": 0.0004661139598230797, + "loss": 0.1856, + "step": 121520 + }, + { + "epoch": 5.03, + "grad_norm": 1.734375, + "learning_rate": 0.0004661085076452429, + "loss": 0.2339, + "step": 121530 + }, + { + "epoch": 5.03, + "grad_norm": 0.427734375, + "learning_rate": 0.00046610305506071325, + "loss": 0.2039, + "step": 121540 + }, + { + "epoch": 5.03, + "grad_norm": 0.341796875, + "learning_rate": 0.00046609760206950103, + "loss": 0.1532, + "step": 121550 + }, + { + "epoch": 5.03, + "grad_norm": 0.50390625, + "learning_rate": 0.00046609214867161645, + "loss": 0.22, + "step": 121560 + }, + { + "epoch": 5.04, + "grad_norm": 0.79296875, + "learning_rate": 0.0004660866948670697, + "loss": 0.1524, + "step": 121570 + }, + { + "epoch": 5.04, + "grad_norm": 1.0625, + "learning_rate": 0.0004660812406558712, + "loss": 0.2568, + "step": 121580 + }, + { + "epoch": 5.04, + "grad_norm": 0.66796875, + "learning_rate": 0.00046607578603803104, + "loss": 0.1989, + "step": 121590 + }, + { + "epoch": 5.04, + "grad_norm": 0.6328125, + "learning_rate": 0.0004660703310135597, + "loss": 0.205, + "step": 121600 + }, + { + "epoch": 5.04, + "grad_norm": 0.71875, + "learning_rate": 0.00046606487558246724, + "loss": 0.1838, + "step": 121610 + }, + { + "epoch": 5.04, + "grad_norm": 0.53125, + "learning_rate": 0.00046605941974476407, + "loss": 0.2328, + "step": 121620 + }, + { + "epoch": 5.04, + "grad_norm": 0.65234375, + "learning_rate": 0.0004660539635004604, + "loss": 0.2074, + "step": 121630 + }, + { + "epoch": 5.04, + "grad_norm": 0.4765625, + "learning_rate": 0.0004660485068495665, + "loss": 0.2476, + "step": 121640 + }, + { + "epoch": 5.04, + "grad_norm": 0.423828125, + "learning_rate": 0.0004660430497920927, + "loss": 0.158, + "step": 121650 + }, + { + "epoch": 5.04, + "grad_norm": 0.349609375, + "learning_rate": 0.0004660375923280491, + "loss": 0.1619, + "step": 121660 + }, + { + "epoch": 5.04, + "grad_norm": 0.68359375, + "learning_rate": 0.0004660321344574461, + "loss": 0.1779, + "step": 121670 + }, + { + "epoch": 5.04, + "grad_norm": 1.6875, + "learning_rate": 0.000466026676180294, + "loss": 0.2354, + "step": 121680 + }, + { + "epoch": 5.04, + "grad_norm": 0.3203125, + "learning_rate": 0.00046602121749660303, + "loss": 0.1931, + "step": 121690 + }, + { + "epoch": 5.04, + "grad_norm": 0.369140625, + "learning_rate": 0.00046601575840638345, + "loss": 0.218, + "step": 121700 + }, + { + "epoch": 5.04, + "grad_norm": 1.203125, + "learning_rate": 0.0004660102989096455, + "loss": 0.2223, + "step": 121710 + }, + { + "epoch": 5.04, + "grad_norm": 0.6328125, + "learning_rate": 0.00046600483900639957, + "loss": 0.2138, + "step": 121720 + }, + { + "epoch": 5.04, + "grad_norm": 0.66015625, + "learning_rate": 0.0004659993786966558, + "loss": 0.1769, + "step": 121730 + }, + { + "epoch": 5.04, + "grad_norm": 0.50390625, + "learning_rate": 0.0004659939179804245, + "loss": 0.167, + "step": 121740 + }, + { + "epoch": 5.04, + "grad_norm": 1.0, + "learning_rate": 0.000465988456857716, + "loss": 0.2725, + "step": 121750 + }, + { + "epoch": 5.04, + "grad_norm": 1.5859375, + "learning_rate": 0.0004659829953285406, + "loss": 0.1993, + "step": 121760 + }, + { + "epoch": 5.04, + "grad_norm": 1.3984375, + "learning_rate": 0.00046597753339290847, + "loss": 0.2104, + "step": 121770 + }, + { + "epoch": 5.04, + "grad_norm": 0.8984375, + "learning_rate": 0.0004659720710508299, + "loss": 0.1964, + "step": 121780 + }, + { + "epoch": 5.04, + "grad_norm": 0.68359375, + "learning_rate": 0.0004659666083023153, + "loss": 0.2153, + "step": 121790 + }, + { + "epoch": 5.04, + "grad_norm": 0.7578125, + "learning_rate": 0.00046596114514737484, + "loss": 0.1868, + "step": 121800 + }, + { + "epoch": 5.05, + "grad_norm": 0.80078125, + "learning_rate": 0.00046595568158601885, + "loss": 0.2031, + "step": 121810 + }, + { + "epoch": 5.05, + "grad_norm": 0.6796875, + "learning_rate": 0.0004659502176182576, + "loss": 0.1936, + "step": 121820 + }, + { + "epoch": 5.05, + "grad_norm": 0.06884765625, + "learning_rate": 0.0004659447532441013, + "loss": 0.1748, + "step": 121830 + }, + { + "epoch": 5.05, + "grad_norm": 0.54296875, + "learning_rate": 0.0004659392884635603, + "loss": 0.2049, + "step": 121840 + }, + { + "epoch": 5.05, + "grad_norm": 0.65234375, + "learning_rate": 0.0004659338232766449, + "loss": 0.1741, + "step": 121850 + }, + { + "epoch": 5.05, + "grad_norm": 0.65625, + "learning_rate": 0.0004659283576833654, + "loss": 0.1955, + "step": 121860 + }, + { + "epoch": 5.05, + "grad_norm": 0.8046875, + "learning_rate": 0.00046592289168373197, + "loss": 0.2223, + "step": 121870 + }, + { + "epoch": 5.05, + "grad_norm": 1.3828125, + "learning_rate": 0.00046591742527775506, + "loss": 0.1882, + "step": 121880 + }, + { + "epoch": 5.05, + "grad_norm": 1.046875, + "learning_rate": 0.00046591195846544477, + "loss": 0.1993, + "step": 121890 + }, + { + "epoch": 5.05, + "grad_norm": 0.8046875, + "learning_rate": 0.00046590649124681155, + "loss": 0.2295, + "step": 121900 + }, + { + "epoch": 5.05, + "grad_norm": 0.392578125, + "learning_rate": 0.00046590102362186565, + "loss": 0.2263, + "step": 121910 + }, + { + "epoch": 5.05, + "grad_norm": 0.40234375, + "learning_rate": 0.0004658955555906174, + "loss": 0.2003, + "step": 121920 + }, + { + "epoch": 5.05, + "grad_norm": 0.484375, + "learning_rate": 0.0004658900871530769, + "loss": 0.2198, + "step": 121930 + }, + { + "epoch": 5.05, + "grad_norm": 1.3828125, + "learning_rate": 0.00046588461830925456, + "loss": 0.1799, + "step": 121940 + }, + { + "epoch": 5.05, + "grad_norm": 0.98828125, + "learning_rate": 0.00046587914905916073, + "loss": 0.2693, + "step": 121950 + }, + { + "epoch": 5.05, + "grad_norm": 0.81640625, + "learning_rate": 0.00046587367940280566, + "loss": 0.2159, + "step": 121960 + }, + { + "epoch": 5.05, + "grad_norm": 0.6484375, + "learning_rate": 0.00046586820934019967, + "loss": 0.2307, + "step": 121970 + }, + { + "epoch": 5.05, + "grad_norm": 0.404296875, + "learning_rate": 0.000465862738871353, + "loss": 0.1757, + "step": 121980 + }, + { + "epoch": 5.05, + "grad_norm": 0.466796875, + "learning_rate": 0.00046585726799627596, + "loss": 0.2398, + "step": 121990 + }, + { + "epoch": 5.05, + "grad_norm": 0.73828125, + "learning_rate": 0.00046585179671497886, + "loss": 0.2568, + "step": 122000 + }, + { + "epoch": 5.05, + "grad_norm": 1.796875, + "learning_rate": 0.00046584632502747196, + "loss": 0.2081, + "step": 122010 + }, + { + "epoch": 5.05, + "grad_norm": 1.9375, + "learning_rate": 0.00046584085293376563, + "loss": 0.2265, + "step": 122020 + }, + { + "epoch": 5.05, + "grad_norm": 0.84765625, + "learning_rate": 0.00046583538043387006, + "loss": 0.212, + "step": 122030 + }, + { + "epoch": 5.05, + "grad_norm": 0.88671875, + "learning_rate": 0.0004658299075277957, + "loss": 0.2394, + "step": 122040 + }, + { + "epoch": 5.06, + "grad_norm": 1.1484375, + "learning_rate": 0.0004658244342155527, + "loss": 0.2233, + "step": 122050 + }, + { + "epoch": 5.06, + "grad_norm": 0.271484375, + "learning_rate": 0.00046581896049715144, + "loss": 0.1909, + "step": 122060 + }, + { + "epoch": 5.06, + "grad_norm": 0.83203125, + "learning_rate": 0.0004658134863726022, + "loss": 0.2325, + "step": 122070 + }, + { + "epoch": 5.06, + "grad_norm": 0.41015625, + "learning_rate": 0.0004658080118419153, + "loss": 0.2293, + "step": 122080 + }, + { + "epoch": 5.06, + "grad_norm": 0.75390625, + "learning_rate": 0.00046580253690510097, + "loss": 0.1997, + "step": 122090 + }, + { + "epoch": 5.06, + "grad_norm": 0.83984375, + "learning_rate": 0.0004657970615621696, + "loss": 0.2328, + "step": 122100 + }, + { + "epoch": 5.06, + "grad_norm": 0.423828125, + "learning_rate": 0.0004657915858131315, + "loss": 0.2137, + "step": 122110 + }, + { + "epoch": 5.06, + "grad_norm": 0.306640625, + "learning_rate": 0.00046578610965799697, + "loss": 0.1958, + "step": 122120 + }, + { + "epoch": 5.06, + "grad_norm": 0.734375, + "learning_rate": 0.00046578063309677623, + "loss": 0.1884, + "step": 122130 + }, + { + "epoch": 5.06, + "grad_norm": 0.84765625, + "learning_rate": 0.00046577515612947955, + "loss": 0.1846, + "step": 122140 + }, + { + "epoch": 5.06, + "grad_norm": 0.5625, + "learning_rate": 0.00046576967875611744, + "loss": 0.191, + "step": 122150 + }, + { + "epoch": 5.06, + "grad_norm": 1.453125, + "learning_rate": 0.0004657642009767001, + "loss": 0.2601, + "step": 122160 + }, + { + "epoch": 5.06, + "grad_norm": 2.09375, + "learning_rate": 0.00046575872279123776, + "loss": 0.2215, + "step": 122170 + }, + { + "epoch": 5.06, + "grad_norm": 0.0, + "learning_rate": 0.00046575324419974084, + "loss": 0.2201, + "step": 122180 + }, + { + "epoch": 5.06, + "grad_norm": 0.8984375, + "learning_rate": 0.0004657477652022196, + "loss": 0.1897, + "step": 122190 + }, + { + "epoch": 5.06, + "grad_norm": 0.34375, + "learning_rate": 0.0004657422857986844, + "loss": 0.2109, + "step": 122200 + }, + { + "epoch": 5.06, + "grad_norm": 0.890625, + "learning_rate": 0.0004657368059891456, + "loss": 0.2662, + "step": 122210 + }, + { + "epoch": 5.06, + "grad_norm": 1.15625, + "learning_rate": 0.00046573132577361326, + "loss": 0.1819, + "step": 122220 + }, + { + "epoch": 5.06, + "grad_norm": 0.396484375, + "learning_rate": 0.00046572584515209794, + "loss": 0.1925, + "step": 122230 + }, + { + "epoch": 5.06, + "grad_norm": 0.80859375, + "learning_rate": 0.0004657203641246099, + "loss": 0.2046, + "step": 122240 + }, + { + "epoch": 5.06, + "grad_norm": 0.703125, + "learning_rate": 0.00046571488269115936, + "loss": 0.1895, + "step": 122250 + }, + { + "epoch": 5.06, + "grad_norm": 0.279296875, + "learning_rate": 0.00046570940085175676, + "loss": 0.2285, + "step": 122260 + }, + { + "epoch": 5.06, + "grad_norm": 0.482421875, + "learning_rate": 0.0004657039186064124, + "loss": 0.2557, + "step": 122270 + }, + { + "epoch": 5.06, + "grad_norm": 1.0625, + "learning_rate": 0.0004656984359551365, + "loss": 0.24, + "step": 122280 + }, + { + "epoch": 5.07, + "grad_norm": 0.63671875, + "learning_rate": 0.00046569295289793945, + "loss": 0.1798, + "step": 122290 + }, + { + "epoch": 5.07, + "grad_norm": 1.2734375, + "learning_rate": 0.00046568746943483156, + "loss": 0.213, + "step": 122300 + }, + { + "epoch": 5.07, + "grad_norm": 1.234375, + "learning_rate": 0.0004656819855658231, + "loss": 0.241, + "step": 122310 + }, + { + "epoch": 5.07, + "grad_norm": 0.58984375, + "learning_rate": 0.0004656765012909245, + "loss": 0.2394, + "step": 122320 + }, + { + "epoch": 5.07, + "grad_norm": 0.7890625, + "learning_rate": 0.000465671016610146, + "loss": 0.1733, + "step": 122330 + }, + { + "epoch": 5.07, + "grad_norm": 1.1171875, + "learning_rate": 0.0004656655315234979, + "loss": 0.2192, + "step": 122340 + }, + { + "epoch": 5.07, + "grad_norm": 1.9453125, + "learning_rate": 0.00046566004603099055, + "loss": 0.2123, + "step": 122350 + }, + { + "epoch": 5.07, + "grad_norm": 0.451171875, + "learning_rate": 0.0004656545601326343, + "loss": 0.2011, + "step": 122360 + }, + { + "epoch": 5.07, + "grad_norm": 0.70703125, + "learning_rate": 0.00046564907382843946, + "loss": 0.2215, + "step": 122370 + }, + { + "epoch": 5.07, + "grad_norm": 0.765625, + "learning_rate": 0.00046564358711841634, + "loss": 0.2106, + "step": 122380 + }, + { + "epoch": 5.07, + "grad_norm": 0.4921875, + "learning_rate": 0.00046563810000257533, + "loss": 0.2267, + "step": 122390 + }, + { + "epoch": 5.07, + "grad_norm": 0.94921875, + "learning_rate": 0.0004656326124809267, + "loss": 0.2131, + "step": 122400 + }, + { + "epoch": 5.07, + "grad_norm": 0.76953125, + "learning_rate": 0.0004656271245534808, + "loss": 0.2102, + "step": 122410 + }, + { + "epoch": 5.07, + "grad_norm": 0.6953125, + "learning_rate": 0.0004656216362202478, + "loss": 0.229, + "step": 122420 + }, + { + "epoch": 5.07, + "grad_norm": 0.47265625, + "learning_rate": 0.0004656161474812383, + "loss": 0.1925, + "step": 122430 + }, + { + "epoch": 5.07, + "grad_norm": 0.44921875, + "learning_rate": 0.0004656106583364624, + "loss": 0.182, + "step": 122440 + }, + { + "epoch": 5.07, + "grad_norm": 0.47265625, + "learning_rate": 0.0004656051687859306, + "loss": 0.2461, + "step": 122450 + }, + { + "epoch": 5.07, + "grad_norm": 0.6328125, + "learning_rate": 0.00046559967882965316, + "loss": 0.1862, + "step": 122460 + }, + { + "epoch": 5.07, + "grad_norm": 1.1796875, + "learning_rate": 0.00046559418846764037, + "loss": 0.2143, + "step": 122470 + }, + { + "epoch": 5.07, + "grad_norm": 0.96484375, + "learning_rate": 0.00046558869769990264, + "loss": 0.1619, + "step": 122480 + }, + { + "epoch": 5.07, + "grad_norm": 0.73828125, + "learning_rate": 0.0004655832065264502, + "loss": 0.2357, + "step": 122490 + }, + { + "epoch": 5.07, + "grad_norm": 0.7734375, + "learning_rate": 0.0004655777149472935, + "loss": 0.2115, + "step": 122500 + }, + { + "epoch": 5.07, + "grad_norm": 0.24609375, + "learning_rate": 0.00046557222296244276, + "loss": 0.2042, + "step": 122510 + }, + { + "epoch": 5.07, + "grad_norm": 1.1484375, + "learning_rate": 0.00046556673057190846, + "loss": 0.1711, + "step": 122520 + }, + { + "epoch": 5.08, + "grad_norm": 0.55859375, + "learning_rate": 0.0004655612377757008, + "loss": 0.2142, + "step": 122530 + }, + { + "epoch": 5.08, + "grad_norm": 0.625, + "learning_rate": 0.00046555574457383015, + "loss": 0.1867, + "step": 122540 + }, + { + "epoch": 5.08, + "grad_norm": 0.54296875, + "learning_rate": 0.0004655502509663069, + "loss": 0.1989, + "step": 122550 + }, + { + "epoch": 5.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00046554475695314136, + "loss": 0.2054, + "step": 122560 + }, + { + "epoch": 5.08, + "grad_norm": 1.4609375, + "learning_rate": 0.0004655392625343439, + "loss": 0.226, + "step": 122570 + }, + { + "epoch": 5.08, + "grad_norm": 0.875, + "learning_rate": 0.00046553376770992473, + "loss": 0.1325, + "step": 122580 + }, + { + "epoch": 5.08, + "grad_norm": 0.66796875, + "learning_rate": 0.0004655282724798943, + "loss": 0.2088, + "step": 122590 + }, + { + "epoch": 5.08, + "grad_norm": 0.73046875, + "learning_rate": 0.00046552277684426297, + "loss": 0.2073, + "step": 122600 + }, + { + "epoch": 5.08, + "grad_norm": 0.26953125, + "learning_rate": 0.000465517280803041, + "loss": 0.2378, + "step": 122610 + }, + { + "epoch": 5.08, + "grad_norm": 0.40234375, + "learning_rate": 0.00046551178435623886, + "loss": 0.2089, + "step": 122620 + }, + { + "epoch": 5.08, + "grad_norm": 0.55859375, + "learning_rate": 0.0004655062875038668, + "loss": 0.2625, + "step": 122630 + }, + { + "epoch": 5.08, + "grad_norm": 0.5546875, + "learning_rate": 0.0004655007902459352, + "loss": 0.2126, + "step": 122640 + }, + { + "epoch": 5.08, + "grad_norm": 0.5234375, + "learning_rate": 0.00046549529258245426, + "loss": 0.2407, + "step": 122650 + }, + { + "epoch": 5.08, + "grad_norm": 0.9921875, + "learning_rate": 0.00046548979451343456, + "loss": 0.1837, + "step": 122660 + }, + { + "epoch": 5.08, + "grad_norm": 0.71875, + "learning_rate": 0.00046548429603888633, + "loss": 0.2154, + "step": 122670 + }, + { + "epoch": 5.08, + "grad_norm": 0.86328125, + "learning_rate": 0.00046547879715881993, + "loss": 0.1902, + "step": 122680 + }, + { + "epoch": 5.08, + "grad_norm": 0.4140625, + "learning_rate": 0.00046547329787324565, + "loss": 0.1598, + "step": 122690 + }, + { + "epoch": 5.08, + "grad_norm": 0.84375, + "learning_rate": 0.00046546779818217395, + "loss": 0.2369, + "step": 122700 + }, + { + "epoch": 5.08, + "grad_norm": 0.75, + "learning_rate": 0.00046546229808561515, + "loss": 0.2212, + "step": 122710 + }, + { + "epoch": 5.08, + "grad_norm": 0.80078125, + "learning_rate": 0.0004654567975835795, + "loss": 0.2113, + "step": 122720 + }, + { + "epoch": 5.08, + "grad_norm": 0.1796875, + "learning_rate": 0.0004654512966760774, + "loss": 0.2098, + "step": 122730 + }, + { + "epoch": 5.08, + "grad_norm": 0.64453125, + "learning_rate": 0.00046544579536311926, + "loss": 0.1881, + "step": 122740 + }, + { + "epoch": 5.08, + "grad_norm": 0.224609375, + "learning_rate": 0.0004654402936447154, + "loss": 0.1782, + "step": 122750 + }, + { + "epoch": 5.08, + "grad_norm": 0.5078125, + "learning_rate": 0.0004654347915208762, + "loss": 0.2336, + "step": 122760 + }, + { + "epoch": 5.09, + "grad_norm": 0.466796875, + "learning_rate": 0.00046542928899161195, + "loss": 0.1638, + "step": 122770 + }, + { + "epoch": 5.09, + "grad_norm": 0.70703125, + "learning_rate": 0.0004654237860569331, + "loss": 0.2127, + "step": 122780 + }, + { + "epoch": 5.09, + "grad_norm": 0.466796875, + "learning_rate": 0.0004654182827168499, + "loss": 0.2531, + "step": 122790 + }, + { + "epoch": 5.09, + "grad_norm": 0.2734375, + "learning_rate": 0.0004654127789713728, + "loss": 0.1886, + "step": 122800 + }, + { + "epoch": 5.09, + "grad_norm": 0.671875, + "learning_rate": 0.00046540727482051203, + "loss": 0.1458, + "step": 122810 + }, + { + "epoch": 5.09, + "grad_norm": 0.5234375, + "learning_rate": 0.0004654017702642781, + "loss": 0.1909, + "step": 122820 + }, + { + "epoch": 5.09, + "grad_norm": 1.234375, + "learning_rate": 0.00046539626530268133, + "loss": 0.1794, + "step": 122830 + }, + { + "epoch": 5.09, + "grad_norm": 0.85546875, + "learning_rate": 0.00046539075993573195, + "loss": 0.1976, + "step": 122840 + }, + { + "epoch": 5.09, + "grad_norm": 0.8125, + "learning_rate": 0.0004653852541634405, + "loss": 0.2112, + "step": 122850 + }, + { + "epoch": 5.09, + "grad_norm": 1.078125, + "learning_rate": 0.0004653797479858172, + "loss": 0.1868, + "step": 122860 + }, + { + "epoch": 5.09, + "grad_norm": 0.72265625, + "learning_rate": 0.00046537424140287247, + "loss": 0.2186, + "step": 122870 + }, + { + "epoch": 5.09, + "grad_norm": 0.56640625, + "learning_rate": 0.00046536873441461676, + "loss": 0.1761, + "step": 122880 + }, + { + "epoch": 5.09, + "grad_norm": 0.66796875, + "learning_rate": 0.00046536322702106026, + "loss": 0.1785, + "step": 122890 + }, + { + "epoch": 5.09, + "grad_norm": 0.56640625, + "learning_rate": 0.0004653577192222135, + "loss": 0.1697, + "step": 122900 + }, + { + "epoch": 5.09, + "grad_norm": 0.94140625, + "learning_rate": 0.0004653522110180867, + "loss": 0.2322, + "step": 122910 + }, + { + "epoch": 5.09, + "grad_norm": 1.328125, + "learning_rate": 0.00046534670240869035, + "loss": 0.2449, + "step": 122920 + }, + { + "epoch": 5.09, + "grad_norm": 0.359375, + "learning_rate": 0.0004653411933940347, + "loss": 0.1869, + "step": 122930 + }, + { + "epoch": 5.09, + "grad_norm": 0.318359375, + "learning_rate": 0.00046533568397413025, + "loss": 0.2086, + "step": 122940 + }, + { + "epoch": 5.09, + "grad_norm": 0.7265625, + "learning_rate": 0.00046533017414898724, + "loss": 0.2052, + "step": 122950 + }, + { + "epoch": 5.09, + "grad_norm": 0.921875, + "learning_rate": 0.0004653246639186162, + "loss": 0.2259, + "step": 122960 + }, + { + "epoch": 5.09, + "grad_norm": 0.6796875, + "learning_rate": 0.00046531915328302723, + "loss": 0.2178, + "step": 122970 + }, + { + "epoch": 5.09, + "grad_norm": 0.333984375, + "learning_rate": 0.00046531364224223094, + "loss": 0.2167, + "step": 122980 + }, + { + "epoch": 5.09, + "grad_norm": 0.5, + "learning_rate": 0.00046530813079623766, + "loss": 0.1659, + "step": 122990 + }, + { + "epoch": 5.09, + "grad_norm": 0.322265625, + "learning_rate": 0.0004653026189450577, + "loss": 0.2093, + "step": 123000 + }, + { + "epoch": 5.1, + "grad_norm": 0.8203125, + "learning_rate": 0.00046529710668870153, + "loss": 0.1655, + "step": 123010 + }, + { + "epoch": 5.1, + "grad_norm": 0.4609375, + "learning_rate": 0.00046529159402717933, + "loss": 0.1694, + "step": 123020 + }, + { + "epoch": 5.1, + "grad_norm": 0.4609375, + "learning_rate": 0.0004652860809605017, + "loss": 0.1936, + "step": 123030 + }, + { + "epoch": 5.1, + "grad_norm": 1.59375, + "learning_rate": 0.00046528056748867887, + "loss": 0.1704, + "step": 123040 + }, + { + "epoch": 5.1, + "grad_norm": 2.25, + "learning_rate": 0.0004652750536117213, + "loss": 0.2272, + "step": 123050 + }, + { + "epoch": 5.1, + "grad_norm": 0.484375, + "learning_rate": 0.0004652695393296393, + "loss": 0.2575, + "step": 123060 + }, + { + "epoch": 5.1, + "grad_norm": 0.52734375, + "learning_rate": 0.0004652640246424432, + "loss": 0.1952, + "step": 123070 + }, + { + "epoch": 5.1, + "grad_norm": 0.78125, + "learning_rate": 0.0004652585095501436, + "loss": 0.2625, + "step": 123080 + }, + { + "epoch": 5.1, + "grad_norm": 1.0234375, + "learning_rate": 0.0004652529940527506, + "loss": 0.2118, + "step": 123090 + }, + { + "epoch": 5.1, + "grad_norm": 0.73046875, + "learning_rate": 0.0004652474781502748, + "loss": 0.1834, + "step": 123100 + }, + { + "epoch": 5.1, + "grad_norm": 5.1021575927734375e-05, + "learning_rate": 0.0004652419618427264, + "loss": 0.1581, + "step": 123110 + }, + { + "epoch": 5.1, + "grad_norm": 0.36328125, + "learning_rate": 0.000465236445130116, + "loss": 0.2264, + "step": 123120 + }, + { + "epoch": 5.1, + "grad_norm": 0.306640625, + "learning_rate": 0.0004652309280124537, + "loss": 0.2266, + "step": 123130 + }, + { + "epoch": 5.1, + "grad_norm": 0.142578125, + "learning_rate": 0.00046522541048975015, + "loss": 0.1394, + "step": 123140 + }, + { + "epoch": 5.1, + "grad_norm": 0.82421875, + "learning_rate": 0.00046521989256201557, + "loss": 0.1989, + "step": 123150 + }, + { + "epoch": 5.1, + "grad_norm": 1.15625, + "learning_rate": 0.0004652143742292604, + "loss": 0.206, + "step": 123160 + }, + { + "epoch": 5.1, + "grad_norm": 0.95703125, + "learning_rate": 0.0004652088554914951, + "loss": 0.1635, + "step": 123170 + }, + { + "epoch": 5.1, + "grad_norm": 0.5234375, + "learning_rate": 0.00046520333634872987, + "loss": 0.2132, + "step": 123180 + }, + { + "epoch": 5.1, + "grad_norm": 0.6640625, + "learning_rate": 0.0004651978168009752, + "loss": 0.1437, + "step": 123190 + }, + { + "epoch": 5.1, + "grad_norm": 0.546875, + "learning_rate": 0.0004651922968482415, + "loss": 0.1886, + "step": 123200 + }, + { + "epoch": 5.1, + "grad_norm": 1.5703125, + "learning_rate": 0.0004651867764905392, + "loss": 0.2065, + "step": 123210 + }, + { + "epoch": 5.1, + "grad_norm": 0.375, + "learning_rate": 0.00046518125572787855, + "loss": 0.1612, + "step": 123220 + }, + { + "epoch": 5.1, + "grad_norm": 0.71484375, + "learning_rate": 0.00046517573456027005, + "loss": 0.1685, + "step": 123230 + }, + { + "epoch": 5.1, + "grad_norm": 1.0078125, + "learning_rate": 0.0004651702129877241, + "loss": 0.2463, + "step": 123240 + }, + { + "epoch": 5.1, + "grad_norm": 0.8515625, + "learning_rate": 0.0004651646910102509, + "loss": 0.2555, + "step": 123250 + }, + { + "epoch": 5.11, + "grad_norm": 0.58203125, + "learning_rate": 0.0004651591686278611, + "loss": 0.2205, + "step": 123260 + }, + { + "epoch": 5.11, + "grad_norm": 0.34375, + "learning_rate": 0.00046515364584056494, + "loss": 0.2217, + "step": 123270 + }, + { + "epoch": 5.11, + "grad_norm": 0.765625, + "learning_rate": 0.00046514812264837293, + "loss": 0.2214, + "step": 123280 + }, + { + "epoch": 5.11, + "grad_norm": 0.7890625, + "learning_rate": 0.0004651425990512953, + "loss": 0.2048, + "step": 123290 + }, + { + "epoch": 5.11, + "grad_norm": 0.6875, + "learning_rate": 0.00046513707504934257, + "loss": 0.1342, + "step": 123300 + }, + { + "epoch": 5.11, + "grad_norm": 0.80859375, + "learning_rate": 0.0004651315506425251, + "loss": 0.2148, + "step": 123310 + }, + { + "epoch": 5.11, + "grad_norm": 0.58203125, + "learning_rate": 0.00046512602583085325, + "loss": 0.1785, + "step": 123320 + }, + { + "epoch": 5.11, + "grad_norm": 1.0234375, + "learning_rate": 0.00046512050061433744, + "loss": 0.1873, + "step": 123330 + }, + { + "epoch": 5.11, + "grad_norm": 0.435546875, + "learning_rate": 0.0004651149749929881, + "loss": 0.2373, + "step": 123340 + }, + { + "epoch": 5.11, + "grad_norm": 1.765625, + "learning_rate": 0.0004651094489668156, + "loss": 0.2116, + "step": 123350 + }, + { + "epoch": 5.11, + "grad_norm": 0.2578125, + "learning_rate": 0.0004651039225358304, + "loss": 0.2176, + "step": 123360 + }, + { + "epoch": 5.11, + "grad_norm": 0.8203125, + "learning_rate": 0.0004650983957000428, + "loss": 0.1899, + "step": 123370 + }, + { + "epoch": 5.11, + "grad_norm": 0.59765625, + "learning_rate": 0.0004650928684594633, + "loss": 0.2412, + "step": 123380 + }, + { + "epoch": 5.11, + "grad_norm": 0.6328125, + "learning_rate": 0.0004650873408141022, + "loss": 0.1831, + "step": 123390 + }, + { + "epoch": 5.11, + "grad_norm": 1.7734375, + "learning_rate": 0.00046508181276396995, + "loss": 0.2454, + "step": 123400 + }, + { + "epoch": 5.11, + "grad_norm": 0.71875, + "learning_rate": 0.00046507628430907696, + "loss": 0.2467, + "step": 123410 + }, + { + "epoch": 5.11, + "grad_norm": 0.35546875, + "learning_rate": 0.00046507075544943363, + "loss": 0.2254, + "step": 123420 + }, + { + "epoch": 5.11, + "grad_norm": 0.244140625, + "learning_rate": 0.00046506522618505036, + "loss": 0.2111, + "step": 123430 + }, + { + "epoch": 5.11, + "grad_norm": 1.1953125, + "learning_rate": 0.0004650596965159376, + "loss": 0.2252, + "step": 123440 + }, + { + "epoch": 5.11, + "grad_norm": 1.15625, + "learning_rate": 0.0004650541664421056, + "loss": 0.2073, + "step": 123450 + }, + { + "epoch": 5.11, + "grad_norm": 0.8671875, + "learning_rate": 0.000465048635963565, + "loss": 0.2032, + "step": 123460 + }, + { + "epoch": 5.11, + "grad_norm": 0.4765625, + "learning_rate": 0.0004650431050803261, + "loss": 0.2419, + "step": 123470 + }, + { + "epoch": 5.11, + "grad_norm": 0.345703125, + "learning_rate": 0.0004650375737923992, + "loss": 0.1989, + "step": 123480 + }, + { + "epoch": 5.11, + "grad_norm": 0.62109375, + "learning_rate": 0.0004650320420997949, + "loss": 0.2135, + "step": 123490 + }, + { + "epoch": 5.12, + "grad_norm": 0.44921875, + "learning_rate": 0.0004650265100025235, + "loss": 0.2135, + "step": 123500 + }, + { + "epoch": 5.12, + "grad_norm": 0.828125, + "learning_rate": 0.00046502097750059535, + "loss": 0.2459, + "step": 123510 + }, + { + "epoch": 5.12, + "grad_norm": 0.765625, + "learning_rate": 0.000465015444594021, + "loss": 0.2066, + "step": 123520 + }, + { + "epoch": 5.12, + "grad_norm": 0.73046875, + "learning_rate": 0.00046500991128281083, + "loss": 0.1596, + "step": 123530 + }, + { + "epoch": 5.12, + "grad_norm": 0.6796875, + "learning_rate": 0.0004650043775669752, + "loss": 0.1804, + "step": 123540 + }, + { + "epoch": 5.12, + "grad_norm": 0.435546875, + "learning_rate": 0.00046499884344652453, + "loss": 0.192, + "step": 123550 + }, + { + "epoch": 5.12, + "grad_norm": 0.84765625, + "learning_rate": 0.00046499330892146927, + "loss": 0.2101, + "step": 123560 + }, + { + "epoch": 5.12, + "grad_norm": 0.7265625, + "learning_rate": 0.00046498777399181984, + "loss": 0.1981, + "step": 123570 + }, + { + "epoch": 5.12, + "grad_norm": 0.3671875, + "learning_rate": 0.0004649822386575866, + "loss": 0.1577, + "step": 123580 + }, + { + "epoch": 5.12, + "grad_norm": 0.578125, + "learning_rate": 0.00046497670291878, + "loss": 0.2481, + "step": 123590 + }, + { + "epoch": 5.12, + "grad_norm": 0.6953125, + "learning_rate": 0.0004649711667754105, + "loss": 0.2234, + "step": 123600 + }, + { + "epoch": 5.12, + "grad_norm": 0.296875, + "learning_rate": 0.0004649656302274885, + "loss": 0.2033, + "step": 123610 + }, + { + "epoch": 5.12, + "grad_norm": 0.337890625, + "learning_rate": 0.00046496009327502436, + "loss": 0.1991, + "step": 123620 + }, + { + "epoch": 5.12, + "grad_norm": 0.484375, + "learning_rate": 0.0004649545559180285, + "loss": 0.2265, + "step": 123630 + }, + { + "epoch": 5.12, + "grad_norm": 0.53125, + "learning_rate": 0.0004649490181565115, + "loss": 0.1844, + "step": 123640 + }, + { + "epoch": 5.12, + "grad_norm": 0.48046875, + "learning_rate": 0.00046494347999048355, + "loss": 0.1866, + "step": 123650 + }, + { + "epoch": 5.12, + "grad_norm": 0.7578125, + "learning_rate": 0.00046493794141995513, + "loss": 0.2209, + "step": 123660 + }, + { + "epoch": 5.12, + "grad_norm": 0.40625, + "learning_rate": 0.00046493240244493685, + "loss": 0.2508, + "step": 123670 + }, + { + "epoch": 5.12, + "grad_norm": 0.58984375, + "learning_rate": 0.00046492686306543894, + "loss": 0.2047, + "step": 123680 + }, + { + "epoch": 5.12, + "grad_norm": 0.40625, + "learning_rate": 0.00046492132328147193, + "loss": 0.189, + "step": 123690 + }, + { + "epoch": 5.12, + "grad_norm": 0.63671875, + "learning_rate": 0.0004649157830930462, + "loss": 0.2211, + "step": 123700 + }, + { + "epoch": 5.12, + "grad_norm": 0.3984375, + "learning_rate": 0.00046491024250017203, + "loss": 0.2057, + "step": 123710 + }, + { + "epoch": 5.12, + "grad_norm": 0.765625, + "learning_rate": 0.0004649047015028601, + "loss": 0.1935, + "step": 123720 + }, + { + "epoch": 5.12, + "grad_norm": 0.416015625, + "learning_rate": 0.0004648991601011208, + "loss": 0.1472, + "step": 123730 + }, + { + "epoch": 5.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00046489361829496435, + "loss": 0.1673, + "step": 123740 + }, + { + "epoch": 5.13, + "grad_norm": 0.68359375, + "learning_rate": 0.0004648880760844014, + "loss": 0.2347, + "step": 123750 + }, + { + "epoch": 5.13, + "grad_norm": 2.4375, + "learning_rate": 0.0004648825334694423, + "loss": 0.2272, + "step": 123760 + }, + { + "epoch": 5.13, + "grad_norm": 0.97265625, + "learning_rate": 0.00046487699045009745, + "loss": 0.2045, + "step": 123770 + }, + { + "epoch": 5.13, + "grad_norm": 0.57421875, + "learning_rate": 0.0004648714470263773, + "loss": 0.1817, + "step": 123780 + }, + { + "epoch": 5.13, + "grad_norm": 0.65234375, + "learning_rate": 0.0004648659031982923, + "loss": 0.1733, + "step": 123790 + }, + { + "epoch": 5.13, + "grad_norm": 0.34375, + "learning_rate": 0.00046486035896585286, + "loss": 0.1638, + "step": 123800 + }, + { + "epoch": 5.13, + "grad_norm": 0.69921875, + "learning_rate": 0.00046485481432906946, + "loss": 0.1894, + "step": 123810 + }, + { + "epoch": 5.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00046484926928795243, + "loss": 0.2234, + "step": 123820 + }, + { + "epoch": 5.13, + "grad_norm": 0.87109375, + "learning_rate": 0.0004648437238425124, + "loss": 0.1439, + "step": 123830 + }, + { + "epoch": 5.13, + "grad_norm": 1.3125, + "learning_rate": 0.0004648381779927596, + "loss": 0.1869, + "step": 123840 + }, + { + "epoch": 5.13, + "grad_norm": 0.03564453125, + "learning_rate": 0.0004648326317387046, + "loss": 0.2323, + "step": 123850 + }, + { + "epoch": 5.13, + "grad_norm": 0.6484375, + "learning_rate": 0.00046482708508035776, + "loss": 0.232, + "step": 123860 + }, + { + "epoch": 5.13, + "grad_norm": 0.4296875, + "learning_rate": 0.0004648215380177295, + "loss": 0.1931, + "step": 123870 + }, + { + "epoch": 5.13, + "grad_norm": 0.5703125, + "learning_rate": 0.00046481599055083037, + "loss": 0.1878, + "step": 123880 + }, + { + "epoch": 5.13, + "grad_norm": 0.42578125, + "learning_rate": 0.00046481044267967066, + "loss": 0.2251, + "step": 123890 + }, + { + "epoch": 5.13, + "grad_norm": 0.248046875, + "learning_rate": 0.00046480489440426097, + "loss": 0.2218, + "step": 123900 + }, + { + "epoch": 5.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00046479934572461165, + "loss": 0.2011, + "step": 123910 + }, + { + "epoch": 5.13, + "grad_norm": 0.26171875, + "learning_rate": 0.00046479379664073316, + "loss": 0.2426, + "step": 123920 + }, + { + "epoch": 5.13, + "grad_norm": 0.7578125, + "learning_rate": 0.0004647882471526359, + "loss": 0.2488, + "step": 123930 + }, + { + "epoch": 5.13, + "grad_norm": 0.58203125, + "learning_rate": 0.0004647826972603304, + "loss": 0.2826, + "step": 123940 + }, + { + "epoch": 5.13, + "grad_norm": 0.359375, + "learning_rate": 0.00046477714696382705, + "loss": 0.222, + "step": 123950 + }, + { + "epoch": 5.13, + "grad_norm": 0.83203125, + "learning_rate": 0.0004647715962631363, + "loss": 0.2644, + "step": 123960 + }, + { + "epoch": 5.13, + "grad_norm": 0.83984375, + "learning_rate": 0.0004647660451582686, + "loss": 0.2799, + "step": 123970 + }, + { + "epoch": 5.14, + "grad_norm": 0.81640625, + "learning_rate": 0.0004647604936492344, + "loss": 0.2339, + "step": 123980 + }, + { + "epoch": 5.14, + "grad_norm": 0.625, + "learning_rate": 0.00046475494173604413, + "loss": 0.2231, + "step": 123990 + }, + { + "epoch": 5.14, + "grad_norm": 0.7734375, + "learning_rate": 0.00046474938941870825, + "loss": 0.2393, + "step": 124000 + }, + { + "epoch": 5.14, + "grad_norm": 0.220703125, + "learning_rate": 0.00046474383669723726, + "loss": 0.228, + "step": 124010 + }, + { + "epoch": 5.14, + "grad_norm": 0.2421875, + "learning_rate": 0.0004647382835716415, + "loss": 0.1849, + "step": 124020 + }, + { + "epoch": 5.14, + "grad_norm": 0.89453125, + "learning_rate": 0.0004647327300419315, + "loss": 0.2207, + "step": 124030 + }, + { + "epoch": 5.14, + "grad_norm": 0.8125, + "learning_rate": 0.00046472717610811763, + "loss": 0.2194, + "step": 124040 + }, + { + "epoch": 5.14, + "grad_norm": 0.54296875, + "learning_rate": 0.0004647216217702105, + "loss": 0.2174, + "step": 124050 + }, + { + "epoch": 5.14, + "grad_norm": 0.60546875, + "learning_rate": 0.00046471606702822043, + "loss": 0.2003, + "step": 124060 + }, + { + "epoch": 5.14, + "grad_norm": 1.03125, + "learning_rate": 0.0004647105118821579, + "loss": 0.2089, + "step": 124070 + }, + { + "epoch": 5.14, + "grad_norm": 0.546875, + "learning_rate": 0.00046470495633203334, + "loss": 0.2247, + "step": 124080 + }, + { + "epoch": 5.14, + "grad_norm": 0.82421875, + "learning_rate": 0.0004646994003778573, + "loss": 0.1773, + "step": 124090 + }, + { + "epoch": 5.14, + "grad_norm": 0.640625, + "learning_rate": 0.0004646938440196402, + "loss": 0.1836, + "step": 124100 + }, + { + "epoch": 5.14, + "grad_norm": 0.6328125, + "learning_rate": 0.00046468828725739245, + "loss": 0.1718, + "step": 124110 + }, + { + "epoch": 5.14, + "grad_norm": 1.484375, + "learning_rate": 0.0004646827300911245, + "loss": 0.1979, + "step": 124120 + }, + { + "epoch": 5.14, + "grad_norm": 1.125, + "learning_rate": 0.00046467717252084677, + "loss": 0.2208, + "step": 124130 + }, + { + "epoch": 5.14, + "grad_norm": 0.92578125, + "learning_rate": 0.0004646716145465699, + "loss": 0.2401, + "step": 124140 + }, + { + "epoch": 5.14, + "grad_norm": 0.9296875, + "learning_rate": 0.00046466605616830414, + "loss": 0.2232, + "step": 124150 + }, + { + "epoch": 5.14, + "grad_norm": 0.55078125, + "learning_rate": 0.0004646604973860601, + "loss": 0.2243, + "step": 124160 + }, + { + "epoch": 5.14, + "grad_norm": 0.31640625, + "learning_rate": 0.00046465493819984816, + "loss": 0.1424, + "step": 124170 + }, + { + "epoch": 5.14, + "grad_norm": 0.66015625, + "learning_rate": 0.0004646493786096788, + "loss": 0.249, + "step": 124180 + }, + { + "epoch": 5.14, + "grad_norm": 1.859375, + "learning_rate": 0.0004646438186155626, + "loss": 0.2303, + "step": 124190 + }, + { + "epoch": 5.14, + "grad_norm": 1.0703125, + "learning_rate": 0.0004646382582175098, + "loss": 0.2053, + "step": 124200 + }, + { + "epoch": 5.14, + "grad_norm": 0.953125, + "learning_rate": 0.00046463269741553096, + "loss": 0.2759, + "step": 124210 + }, + { + "epoch": 5.15, + "grad_norm": 0.765625, + "learning_rate": 0.00046462713620963657, + "loss": 0.2506, + "step": 124220 + }, + { + "epoch": 5.15, + "grad_norm": 0.498046875, + "learning_rate": 0.0004646215745998372, + "loss": 0.2386, + "step": 124230 + }, + { + "epoch": 5.15, + "grad_norm": 0.7734375, + "learning_rate": 0.0004646160125861431, + "loss": 0.215, + "step": 124240 + }, + { + "epoch": 5.15, + "grad_norm": 0.462890625, + "learning_rate": 0.00046461045016856487, + "loss": 0.203, + "step": 124250 + }, + { + "epoch": 5.15, + "grad_norm": 0.92578125, + "learning_rate": 0.0004646048873471129, + "loss": 0.2268, + "step": 124260 + }, + { + "epoch": 5.15, + "grad_norm": 0.330078125, + "learning_rate": 0.00046459932412179776, + "loss": 0.219, + "step": 124270 + }, + { + "epoch": 5.15, + "grad_norm": 0.51171875, + "learning_rate": 0.00046459376049262993, + "loss": 0.1666, + "step": 124280 + }, + { + "epoch": 5.15, + "grad_norm": 0.32421875, + "learning_rate": 0.00046458819645961967, + "loss": 0.1756, + "step": 124290 + }, + { + "epoch": 5.15, + "grad_norm": 0.8359375, + "learning_rate": 0.0004645826320227777, + "loss": 0.2005, + "step": 124300 + }, + { + "epoch": 5.15, + "grad_norm": 0.671875, + "learning_rate": 0.00046457706718211434, + "loss": 0.2387, + "step": 124310 + }, + { + "epoch": 5.15, + "grad_norm": 1.046875, + "learning_rate": 0.0004645715019376402, + "loss": 0.2249, + "step": 124320 + }, + { + "epoch": 5.15, + "grad_norm": 0.474609375, + "learning_rate": 0.00046456593628936555, + "loss": 0.1952, + "step": 124330 + }, + { + "epoch": 5.15, + "grad_norm": 1.453125, + "learning_rate": 0.00046456037023730103, + "loss": 0.2063, + "step": 124340 + }, + { + "epoch": 5.15, + "grad_norm": 0.67578125, + "learning_rate": 0.0004645548037814571, + "loss": 0.2151, + "step": 124350 + }, + { + "epoch": 5.15, + "grad_norm": 0.59765625, + "learning_rate": 0.00046454923692184415, + "loss": 0.1975, + "step": 124360 + }, + { + "epoch": 5.15, + "grad_norm": 1.390625, + "learning_rate": 0.0004645436696584727, + "loss": 0.2754, + "step": 124370 + }, + { + "epoch": 5.15, + "grad_norm": 0.84765625, + "learning_rate": 0.00046453810199135326, + "loss": 0.2582, + "step": 124380 + }, + { + "epoch": 5.15, + "grad_norm": 0.86328125, + "learning_rate": 0.0004645325339204963, + "loss": 0.2468, + "step": 124390 + }, + { + "epoch": 5.15, + "grad_norm": 0.578125, + "learning_rate": 0.0004645269654459122, + "loss": 0.2203, + "step": 124400 + }, + { + "epoch": 5.15, + "grad_norm": 0.7421875, + "learning_rate": 0.00046452139656761156, + "loss": 0.27, + "step": 124410 + }, + { + "epoch": 5.15, + "grad_norm": 0.54296875, + "learning_rate": 0.0004645158272856048, + "loss": 0.2193, + "step": 124420 + }, + { + "epoch": 5.15, + "grad_norm": 0.9765625, + "learning_rate": 0.0004645102575999025, + "loss": 0.1477, + "step": 124430 + }, + { + "epoch": 5.15, + "grad_norm": 0.83984375, + "learning_rate": 0.000464504687510515, + "loss": 0.1127, + "step": 124440 + }, + { + "epoch": 5.15, + "grad_norm": 0.58984375, + "learning_rate": 0.00046449911701745286, + "loss": 0.2374, + "step": 124450 + }, + { + "epoch": 5.16, + "grad_norm": 0.36328125, + "learning_rate": 0.00046449354612072645, + "loss": 0.1906, + "step": 124460 + }, + { + "epoch": 5.16, + "grad_norm": 0.734375, + "learning_rate": 0.00046448797482034643, + "loss": 0.2344, + "step": 124470 + }, + { + "epoch": 5.16, + "grad_norm": 0.73046875, + "learning_rate": 0.00046448240311632326, + "loss": 0.1851, + "step": 124480 + }, + { + "epoch": 5.16, + "grad_norm": 0.48046875, + "learning_rate": 0.0004644768310086673, + "loss": 0.2087, + "step": 124490 + }, + { + "epoch": 5.16, + "grad_norm": 0.32421875, + "learning_rate": 0.0004644712584973891, + "loss": 0.1854, + "step": 124500 + }, + { + "epoch": 5.16, + "grad_norm": 0.87890625, + "learning_rate": 0.0004644656855824992, + "loss": 0.1953, + "step": 124510 + }, + { + "epoch": 5.16, + "grad_norm": 0.71484375, + "learning_rate": 0.00046446011226400797, + "loss": 0.2147, + "step": 124520 + }, + { + "epoch": 5.16, + "grad_norm": 0.470703125, + "learning_rate": 0.00046445453854192606, + "loss": 0.1495, + "step": 124530 + }, + { + "epoch": 5.16, + "grad_norm": 0.55078125, + "learning_rate": 0.0004644489644162638, + "loss": 0.1836, + "step": 124540 + }, + { + "epoch": 5.16, + "grad_norm": 1.125, + "learning_rate": 0.0004644433898870317, + "loss": 0.2181, + "step": 124550 + }, + { + "epoch": 5.16, + "grad_norm": 0.5234375, + "learning_rate": 0.00046443781495424043, + "loss": 0.2702, + "step": 124560 + }, + { + "epoch": 5.16, + "grad_norm": 0.4609375, + "learning_rate": 0.00046443223961790026, + "loss": 0.2116, + "step": 124570 + }, + { + "epoch": 5.16, + "grad_norm": 1.8125, + "learning_rate": 0.00046442666387802177, + "loss": 0.229, + "step": 124580 + }, + { + "epoch": 5.16, + "grad_norm": 0.59765625, + "learning_rate": 0.00046442108773461554, + "loss": 0.2035, + "step": 124590 + }, + { + "epoch": 5.16, + "grad_norm": 1.2109375, + "learning_rate": 0.00046441551118769187, + "loss": 0.2041, + "step": 124600 + }, + { + "epoch": 5.16, + "grad_norm": 0.56640625, + "learning_rate": 0.00046440993423726144, + "loss": 0.2541, + "step": 124610 + }, + { + "epoch": 5.16, + "grad_norm": 0.55859375, + "learning_rate": 0.0004644043568833346, + "loss": 0.1907, + "step": 124620 + }, + { + "epoch": 5.16, + "grad_norm": 0.369140625, + "learning_rate": 0.00046439877912592196, + "loss": 0.1386, + "step": 124630 + }, + { + "epoch": 5.16, + "grad_norm": 2.5, + "learning_rate": 0.000464393200965034, + "loss": 0.1974, + "step": 124640 + }, + { + "epoch": 5.16, + "grad_norm": 1.171875, + "learning_rate": 0.0004643876224006811, + "loss": 0.1534, + "step": 124650 + }, + { + "epoch": 5.16, + "grad_norm": 0.33984375, + "learning_rate": 0.00046438204343287394, + "loss": 0.2168, + "step": 124660 + }, + { + "epoch": 5.16, + "grad_norm": 2.296875, + "learning_rate": 0.00046437646406162286, + "loss": 0.2342, + "step": 124670 + }, + { + "epoch": 5.16, + "grad_norm": 0.68359375, + "learning_rate": 0.00046437088428693846, + "loss": 0.1878, + "step": 124680 + }, + { + "epoch": 5.16, + "grad_norm": 0.61328125, + "learning_rate": 0.0004643653041088312, + "loss": 0.1646, + "step": 124690 + }, + { + "epoch": 5.17, + "grad_norm": 1.0078125, + "learning_rate": 0.0004643597235273116, + "loss": 0.1992, + "step": 124700 + }, + { + "epoch": 5.17, + "grad_norm": 0.8515625, + "learning_rate": 0.0004643541425423901, + "loss": 0.1929, + "step": 124710 + }, + { + "epoch": 5.17, + "grad_norm": 0.91796875, + "learning_rate": 0.0004643485611540773, + "loss": 0.1923, + "step": 124720 + }, + { + "epoch": 5.17, + "grad_norm": 0.78125, + "learning_rate": 0.0004643429793623837, + "loss": 0.1926, + "step": 124730 + }, + { + "epoch": 5.17, + "grad_norm": 0.5, + "learning_rate": 0.00046433739716731964, + "loss": 0.2037, + "step": 124740 + }, + { + "epoch": 5.17, + "grad_norm": 0.84375, + "learning_rate": 0.00046433181456889585, + "loss": 0.2222, + "step": 124750 + }, + { + "epoch": 5.17, + "grad_norm": 0.2197265625, + "learning_rate": 0.0004643262315671226, + "loss": 0.1932, + "step": 124760 + }, + { + "epoch": 5.17, + "grad_norm": 0.466796875, + "learning_rate": 0.00046432064816201066, + "loss": 0.1969, + "step": 124770 + }, + { + "epoch": 5.17, + "grad_norm": 0.72265625, + "learning_rate": 0.00046431506435357035, + "loss": 0.1124, + "step": 124780 + }, + { + "epoch": 5.17, + "grad_norm": 0.462890625, + "learning_rate": 0.00046430948014181226, + "loss": 0.1243, + "step": 124790 + }, + { + "epoch": 5.17, + "grad_norm": 0.578125, + "learning_rate": 0.0004643038955267468, + "loss": 0.1749, + "step": 124800 + }, + { + "epoch": 5.17, + "grad_norm": 0.49609375, + "learning_rate": 0.0004642983105083846, + "loss": 0.2211, + "step": 124810 + }, + { + "epoch": 5.17, + "grad_norm": 0.94921875, + "learning_rate": 0.00046429272508673614, + "loss": 0.1058, + "step": 124820 + }, + { + "epoch": 5.17, + "grad_norm": 0.75390625, + "learning_rate": 0.00046428713926181185, + "loss": 0.2054, + "step": 124830 + }, + { + "epoch": 5.17, + "grad_norm": 0.75390625, + "learning_rate": 0.0004642815530336224, + "loss": 0.138, + "step": 124840 + }, + { + "epoch": 5.17, + "grad_norm": 0.37109375, + "learning_rate": 0.00046427596640217817, + "loss": 0.2129, + "step": 124850 + }, + { + "epoch": 5.17, + "grad_norm": 0.58203125, + "learning_rate": 0.0004642703793674897, + "loss": 0.209, + "step": 124860 + }, + { + "epoch": 5.17, + "grad_norm": 0.75, + "learning_rate": 0.0004642647919295675, + "loss": 0.1651, + "step": 124870 + }, + { + "epoch": 5.17, + "grad_norm": 0.6328125, + "learning_rate": 0.0004642592040884222, + "loss": 0.2197, + "step": 124880 + }, + { + "epoch": 5.17, + "grad_norm": 0.421875, + "learning_rate": 0.00046425361584406403, + "loss": 0.2225, + "step": 124890 + }, + { + "epoch": 5.17, + "grad_norm": 0.828125, + "learning_rate": 0.00046424802719650384, + "loss": 0.206, + "step": 124900 + }, + { + "epoch": 5.17, + "grad_norm": 0.65625, + "learning_rate": 0.00046424243814575196, + "loss": 0.2465, + "step": 124910 + }, + { + "epoch": 5.17, + "grad_norm": 0.43359375, + "learning_rate": 0.00046423684869181895, + "loss": 0.2327, + "step": 124920 + }, + { + "epoch": 5.17, + "grad_norm": 0.37890625, + "learning_rate": 0.0004642312588347153, + "loss": 0.1885, + "step": 124930 + }, + { + "epoch": 5.17, + "grad_norm": 2.03125, + "learning_rate": 0.0004642256685744516, + "loss": 0.2323, + "step": 124940 + }, + { + "epoch": 5.18, + "grad_norm": 0.66796875, + "learning_rate": 0.00046422007791103824, + "loss": 0.1923, + "step": 124950 + }, + { + "epoch": 5.18, + "grad_norm": 0.306640625, + "learning_rate": 0.00046421448684448585, + "loss": 0.2235, + "step": 124960 + }, + { + "epoch": 5.18, + "grad_norm": 0.5078125, + "learning_rate": 0.00046420889537480503, + "loss": 0.2125, + "step": 124970 + }, + { + "epoch": 5.18, + "grad_norm": 0.333984375, + "learning_rate": 0.00046420330350200613, + "loss": 0.2288, + "step": 124980 + }, + { + "epoch": 5.18, + "grad_norm": 1.1953125, + "learning_rate": 0.0004641977112260998, + "loss": 0.2492, + "step": 124990 + }, + { + "epoch": 5.18, + "grad_norm": 0.828125, + "learning_rate": 0.00046419211854709645, + "loss": 0.2255, + "step": 125000 + }, + { + "epoch": 5.18, + "grad_norm": 1.0703125, + "learning_rate": 0.00046418652546500663, + "loss": 0.2108, + "step": 125010 + }, + { + "epoch": 5.18, + "grad_norm": 0.43359375, + "learning_rate": 0.000464180931979841, + "loss": 0.2099, + "step": 125020 + }, + { + "epoch": 5.18, + "grad_norm": 0.53125, + "learning_rate": 0.0004641753380916099, + "loss": 0.2409, + "step": 125030 + }, + { + "epoch": 5.18, + "grad_norm": 1.15625, + "learning_rate": 0.00046416974380032397, + "loss": 0.1905, + "step": 125040 + }, + { + "epoch": 5.18, + "grad_norm": 0.609375, + "learning_rate": 0.0004641641491059937, + "loss": 0.1965, + "step": 125050 + }, + { + "epoch": 5.18, + "grad_norm": 0.71484375, + "learning_rate": 0.0004641585540086296, + "loss": 0.2425, + "step": 125060 + }, + { + "epoch": 5.18, + "grad_norm": 0.91015625, + "learning_rate": 0.00046415295850824235, + "loss": 0.2234, + "step": 125070 + }, + { + "epoch": 5.18, + "grad_norm": 0.7265625, + "learning_rate": 0.00046414736260484224, + "loss": 0.2279, + "step": 125080 + }, + { + "epoch": 5.18, + "grad_norm": 0.56640625, + "learning_rate": 0.00046414176629843996, + "loss": 0.1981, + "step": 125090 + }, + { + "epoch": 5.18, + "grad_norm": 0.75, + "learning_rate": 0.000464136169589046, + "loss": 0.2038, + "step": 125100 + }, + { + "epoch": 5.18, + "grad_norm": 0.5, + "learning_rate": 0.0004641305724766709, + "loss": 0.2179, + "step": 125110 + }, + { + "epoch": 5.18, + "grad_norm": 0.51953125, + "learning_rate": 0.00046412497496132516, + "loss": 0.1876, + "step": 125120 + }, + { + "epoch": 5.18, + "grad_norm": 0.66796875, + "learning_rate": 0.00046411937704301934, + "loss": 0.2141, + "step": 125130 + }, + { + "epoch": 5.18, + "grad_norm": 0.76953125, + "learning_rate": 0.00046411377872176404, + "loss": 0.2533, + "step": 125140 + }, + { + "epoch": 5.18, + "grad_norm": 0.53125, + "learning_rate": 0.0004641081799975697, + "loss": 0.2209, + "step": 125150 + }, + { + "epoch": 5.18, + "grad_norm": 0.419921875, + "learning_rate": 0.0004641025808704469, + "loss": 0.1888, + "step": 125160 + }, + { + "epoch": 5.18, + "grad_norm": 0.63671875, + "learning_rate": 0.00046409698134040607, + "loss": 0.1717, + "step": 125170 + }, + { + "epoch": 5.18, + "grad_norm": 0.84375, + "learning_rate": 0.0004640913814074579, + "loss": 0.193, + "step": 125180 + }, + { + "epoch": 5.19, + "grad_norm": 0.703125, + "learning_rate": 0.0004640857810716129, + "loss": 0.2279, + "step": 125190 + }, + { + "epoch": 5.19, + "grad_norm": 0.375, + "learning_rate": 0.00046408018033288156, + "loss": 0.218, + "step": 125200 + }, + { + "epoch": 5.19, + "grad_norm": 0.98828125, + "learning_rate": 0.0004640745791912744, + "loss": 0.1925, + "step": 125210 + }, + { + "epoch": 5.19, + "grad_norm": 0.99609375, + "learning_rate": 0.00046406897764680204, + "loss": 0.2324, + "step": 125220 + }, + { + "epoch": 5.19, + "grad_norm": 0.55859375, + "learning_rate": 0.00046406337569947497, + "loss": 0.2234, + "step": 125230 + }, + { + "epoch": 5.19, + "grad_norm": 0.51953125, + "learning_rate": 0.0004640577733493038, + "loss": 0.2, + "step": 125240 + }, + { + "epoch": 5.19, + "grad_norm": 0.482421875, + "learning_rate": 0.000464052170596299, + "loss": 0.1861, + "step": 125250 + }, + { + "epoch": 5.19, + "grad_norm": 0.427734375, + "learning_rate": 0.00046404656744047104, + "loss": 0.2439, + "step": 125260 + }, + { + "epoch": 5.19, + "grad_norm": 0.74609375, + "learning_rate": 0.0004640409638818306, + "loss": 0.2582, + "step": 125270 + }, + { + "epoch": 5.19, + "grad_norm": 0.9375, + "learning_rate": 0.00046403535992038826, + "loss": 0.2124, + "step": 125280 + }, + { + "epoch": 5.19, + "grad_norm": 1.0546875, + "learning_rate": 0.00046402975555615434, + "loss": 0.1827, + "step": 125290 + }, + { + "epoch": 5.19, + "grad_norm": 0.71875, + "learning_rate": 0.0004640241507891397, + "loss": 0.2038, + "step": 125300 + }, + { + "epoch": 5.19, + "grad_norm": 0.412109375, + "learning_rate": 0.00046401854561935456, + "loss": 0.2025, + "step": 125310 + }, + { + "epoch": 5.19, + "grad_norm": 0.11767578125, + "learning_rate": 0.00046401294004680974, + "loss": 0.2508, + "step": 125320 + }, + { + "epoch": 5.19, + "grad_norm": 0.58203125, + "learning_rate": 0.0004640073340715156, + "loss": 0.2293, + "step": 125330 + }, + { + "epoch": 5.19, + "grad_norm": 0.73046875, + "learning_rate": 0.00046400172769348286, + "loss": 0.2302, + "step": 125340 + }, + { + "epoch": 5.19, + "grad_norm": 0.8515625, + "learning_rate": 0.00046399612091272194, + "loss": 0.2131, + "step": 125350 + }, + { + "epoch": 5.19, + "grad_norm": 0.74609375, + "learning_rate": 0.0004639905137292434, + "loss": 0.1948, + "step": 125360 + }, + { + "epoch": 5.19, + "grad_norm": 0.82421875, + "learning_rate": 0.0004639849061430579, + "loss": 0.2467, + "step": 125370 + }, + { + "epoch": 5.19, + "grad_norm": 0.34765625, + "learning_rate": 0.0004639792981541758, + "loss": 0.1704, + "step": 125380 + }, + { + "epoch": 5.19, + "grad_norm": 1.1875, + "learning_rate": 0.00046397368976260783, + "loss": 0.2056, + "step": 125390 + }, + { + "epoch": 5.19, + "grad_norm": 0.6796875, + "learning_rate": 0.00046396808096836443, + "loss": 0.1772, + "step": 125400 + }, + { + "epoch": 5.19, + "grad_norm": 0.921875, + "learning_rate": 0.0004639624717714563, + "loss": 0.2107, + "step": 125410 + }, + { + "epoch": 5.19, + "grad_norm": 1.0859375, + "learning_rate": 0.00046395686217189384, + "loss": 0.1893, + "step": 125420 + }, + { + "epoch": 5.2, + "grad_norm": 0.51171875, + "learning_rate": 0.0004639512521696877, + "loss": 0.1995, + "step": 125430 + }, + { + "epoch": 5.2, + "grad_norm": 0.77734375, + "learning_rate": 0.00046394564176484846, + "loss": 0.1912, + "step": 125440 + }, + { + "epoch": 5.2, + "grad_norm": 0.357421875, + "learning_rate": 0.00046394003095738654, + "loss": 0.1684, + "step": 125450 + }, + { + "epoch": 5.2, + "grad_norm": 0.75390625, + "learning_rate": 0.0004639344197473126, + "loss": 0.2508, + "step": 125460 + }, + { + "epoch": 5.2, + "grad_norm": 0.45703125, + "learning_rate": 0.00046392880813463723, + "loss": 0.2444, + "step": 125470 + }, + { + "epoch": 5.2, + "grad_norm": 0.984375, + "learning_rate": 0.00046392319611937083, + "loss": 0.2274, + "step": 125480 + }, + { + "epoch": 5.2, + "grad_norm": 1.046875, + "learning_rate": 0.0004639175837015242, + "loss": 0.2236, + "step": 125490 + }, + { + "epoch": 5.2, + "grad_norm": 1.4296875, + "learning_rate": 0.0004639119708811078, + "loss": 0.2084, + "step": 125500 + }, + { + "epoch": 5.2, + "grad_norm": 0.65234375, + "learning_rate": 0.00046390635765813205, + "loss": 0.2063, + "step": 125510 + }, + { + "epoch": 5.2, + "grad_norm": 0.89453125, + "learning_rate": 0.00046390074403260766, + "loss": 0.1679, + "step": 125520 + }, + { + "epoch": 5.2, + "grad_norm": 0.66015625, + "learning_rate": 0.0004638951300045452, + "loss": 0.226, + "step": 125530 + }, + { + "epoch": 5.2, + "grad_norm": 1.0703125, + "learning_rate": 0.00046388951557395526, + "loss": 0.2131, + "step": 125540 + }, + { + "epoch": 5.2, + "grad_norm": 0.240234375, + "learning_rate": 0.0004638839007408483, + "loss": 0.19, + "step": 125550 + }, + { + "epoch": 5.2, + "grad_norm": 0.921875, + "learning_rate": 0.00046387828550523494, + "loss": 0.2533, + "step": 125560 + }, + { + "epoch": 5.2, + "grad_norm": 0.59375, + "learning_rate": 0.00046387266986712574, + "loss": 0.1952, + "step": 125570 + }, + { + "epoch": 5.2, + "grad_norm": 1.0, + "learning_rate": 0.00046386705382653125, + "loss": 0.2584, + "step": 125580 + }, + { + "epoch": 5.2, + "grad_norm": 1.0234375, + "learning_rate": 0.0004638614373834621, + "loss": 0.1902, + "step": 125590 + }, + { + "epoch": 5.2, + "grad_norm": 0.578125, + "learning_rate": 0.0004638558205379287, + "loss": 0.1911, + "step": 125600 + }, + { + "epoch": 5.2, + "grad_norm": 0.703125, + "learning_rate": 0.00046385020328994187, + "loss": 0.2754, + "step": 125610 + }, + { + "epoch": 5.2, + "grad_norm": 0.33203125, + "learning_rate": 0.000463844585639512, + "loss": 0.1913, + "step": 125620 + }, + { + "epoch": 5.2, + "grad_norm": 0.58984375, + "learning_rate": 0.0004638389675866498, + "loss": 0.2109, + "step": 125630 + }, + { + "epoch": 5.2, + "grad_norm": 0.859375, + "learning_rate": 0.00046383334913136567, + "loss": 0.1788, + "step": 125640 + }, + { + "epoch": 5.2, + "grad_norm": 0.46484375, + "learning_rate": 0.00046382773027367025, + "loss": 0.1963, + "step": 125650 + }, + { + "epoch": 5.2, + "grad_norm": 0.90625, + "learning_rate": 0.00046382211101357417, + "loss": 0.1775, + "step": 125660 + }, + { + "epoch": 5.21, + "grad_norm": 0.80078125, + "learning_rate": 0.000463816491351088, + "loss": 0.221, + "step": 125670 + }, + { + "epoch": 5.21, + "grad_norm": 1.390625, + "learning_rate": 0.00046381087128622225, + "loss": 0.2091, + "step": 125680 + }, + { + "epoch": 5.21, + "grad_norm": 1.0078125, + "learning_rate": 0.0004638052508189875, + "loss": 0.1881, + "step": 125690 + }, + { + "epoch": 5.21, + "grad_norm": 0.58984375, + "learning_rate": 0.0004637996299493944, + "loss": 0.2023, + "step": 125700 + }, + { + "epoch": 5.21, + "grad_norm": 0.5390625, + "learning_rate": 0.00046379400867745346, + "loss": 0.2154, + "step": 125710 + }, + { + "epoch": 5.21, + "grad_norm": 0.2353515625, + "learning_rate": 0.0004637883870031753, + "loss": 0.2427, + "step": 125720 + }, + { + "epoch": 5.21, + "grad_norm": 0.98828125, + "learning_rate": 0.0004637827649265705, + "loss": 0.2249, + "step": 125730 + }, + { + "epoch": 5.21, + "grad_norm": 0.5859375, + "learning_rate": 0.0004637771424476496, + "loss": 0.1909, + "step": 125740 + }, + { + "epoch": 5.21, + "grad_norm": 0.2177734375, + "learning_rate": 0.0004637715195664232, + "loss": 0.2093, + "step": 125750 + }, + { + "epoch": 5.21, + "grad_norm": 1.0390625, + "learning_rate": 0.0004637658962829019, + "loss": 0.2086, + "step": 125760 + }, + { + "epoch": 5.21, + "grad_norm": 0.8515625, + "learning_rate": 0.00046376027259709623, + "loss": 0.2514, + "step": 125770 + }, + { + "epoch": 5.21, + "grad_norm": 0.9765625, + "learning_rate": 0.0004637546485090168, + "loss": 0.2199, + "step": 125780 + }, + { + "epoch": 5.21, + "grad_norm": 1.0625, + "learning_rate": 0.0004637490240186744, + "loss": 0.181, + "step": 125790 + }, + { + "epoch": 5.21, + "grad_norm": 0.453125, + "learning_rate": 0.0004637433991260792, + "loss": 0.2047, + "step": 125800 + }, + { + "epoch": 5.21, + "grad_norm": 0.3828125, + "learning_rate": 0.00046373777383124207, + "loss": 0.2607, + "step": 125810 + }, + { + "epoch": 5.21, + "grad_norm": 0.5546875, + "learning_rate": 0.0004637321481341735, + "loss": 0.191, + "step": 125820 + }, + { + "epoch": 5.21, + "grad_norm": 0.73046875, + "learning_rate": 0.0004637265220348842, + "loss": 0.2239, + "step": 125830 + }, + { + "epoch": 5.21, + "grad_norm": 0.66796875, + "learning_rate": 0.0004637208955333846, + "loss": 0.1748, + "step": 125840 + }, + { + "epoch": 5.21, + "grad_norm": 0.57421875, + "learning_rate": 0.0004637152686296853, + "loss": 0.1754, + "step": 125850 + }, + { + "epoch": 5.21, + "grad_norm": 0.49609375, + "learning_rate": 0.000463709641323797, + "loss": 0.1898, + "step": 125860 + }, + { + "epoch": 5.21, + "grad_norm": 0.5078125, + "learning_rate": 0.00046370401361573024, + "loss": 0.1808, + "step": 125870 + }, + { + "epoch": 5.21, + "grad_norm": 0.63671875, + "learning_rate": 0.0004636983855054956, + "loss": 0.2154, + "step": 125880 + }, + { + "epoch": 5.21, + "grad_norm": 1.09375, + "learning_rate": 0.00046369275699310365, + "loss": 0.1626, + "step": 125890 + }, + { + "epoch": 5.21, + "grad_norm": 0.5625, + "learning_rate": 0.00046368712807856507, + "loss": 0.2667, + "step": 125900 + }, + { + "epoch": 5.22, + "grad_norm": 0.5625, + "learning_rate": 0.00046368149876189037, + "loss": 0.2142, + "step": 125910 + }, + { + "epoch": 5.22, + "grad_norm": 0.62890625, + "learning_rate": 0.00046367586904309013, + "loss": 0.2254, + "step": 125920 + }, + { + "epoch": 5.22, + "grad_norm": 0.322265625, + "learning_rate": 0.000463670238922175, + "loss": 0.2021, + "step": 125930 + }, + { + "epoch": 5.22, + "grad_norm": 0.84765625, + "learning_rate": 0.00046366460839915557, + "loss": 0.2081, + "step": 125940 + }, + { + "epoch": 5.22, + "grad_norm": 1.0, + "learning_rate": 0.0004636589774740424, + "loss": 0.2251, + "step": 125950 + }, + { + "epoch": 5.22, + "grad_norm": 0.5, + "learning_rate": 0.0004636533461468461, + "loss": 0.1969, + "step": 125960 + }, + { + "epoch": 5.22, + "grad_norm": 0.296875, + "learning_rate": 0.0004636477144175773, + "loss": 0.2164, + "step": 125970 + }, + { + "epoch": 5.22, + "grad_norm": 0.703125, + "learning_rate": 0.0004636420822862466, + "loss": 0.2322, + "step": 125980 + }, + { + "epoch": 5.22, + "grad_norm": 0.70703125, + "learning_rate": 0.0004636364497528645, + "loss": 0.263, + "step": 125990 + }, + { + "epoch": 5.22, + "grad_norm": 0.71875, + "learning_rate": 0.00046363081681744173, + "loss": 0.2164, + "step": 126000 + }, + { + "epoch": 5.22, + "grad_norm": 0.7421875, + "learning_rate": 0.00046362518347998886, + "loss": 0.2397, + "step": 126010 + }, + { + "epoch": 5.22, + "grad_norm": 0.96875, + "learning_rate": 0.00046361954974051637, + "loss": 0.1947, + "step": 126020 + }, + { + "epoch": 5.22, + "grad_norm": 0.9296875, + "learning_rate": 0.000463613915599035, + "loss": 0.2115, + "step": 126030 + }, + { + "epoch": 5.22, + "grad_norm": 1.671875, + "learning_rate": 0.0004636082810555553, + "loss": 0.1711, + "step": 126040 + }, + { + "epoch": 5.22, + "grad_norm": 0.6953125, + "learning_rate": 0.0004636026461100879, + "loss": 0.1712, + "step": 126050 + }, + { + "epoch": 5.22, + "grad_norm": 1.890625, + "learning_rate": 0.0004635970107626434, + "loss": 0.1896, + "step": 126060 + }, + { + "epoch": 5.22, + "grad_norm": 0.859375, + "learning_rate": 0.00046359137501323234, + "loss": 0.2165, + "step": 126070 + }, + { + "epoch": 5.22, + "grad_norm": 0.6484375, + "learning_rate": 0.0004635857388618654, + "loss": 0.2224, + "step": 126080 + }, + { + "epoch": 5.22, + "grad_norm": 0.765625, + "learning_rate": 0.0004635801023085532, + "loss": 0.1645, + "step": 126090 + }, + { + "epoch": 5.22, + "grad_norm": 0.59375, + "learning_rate": 0.00046357446535330625, + "loss": 0.2181, + "step": 126100 + }, + { + "epoch": 5.22, + "grad_norm": 0.462890625, + "learning_rate": 0.00046356882799613534, + "loss": 0.2156, + "step": 126110 + }, + { + "epoch": 5.22, + "grad_norm": 0.330078125, + "learning_rate": 0.00046356319023705084, + "loss": 0.1976, + "step": 126120 + }, + { + "epoch": 5.22, + "grad_norm": 0.2392578125, + "learning_rate": 0.00046355755207606354, + "loss": 0.1703, + "step": 126130 + }, + { + "epoch": 5.22, + "grad_norm": 1.0625, + "learning_rate": 0.0004635519135131839, + "loss": 0.2423, + "step": 126140 + }, + { + "epoch": 5.23, + "grad_norm": 2.1875, + "learning_rate": 0.0004635462745484227, + "loss": 0.1869, + "step": 126150 + }, + { + "epoch": 5.23, + "grad_norm": 0.51171875, + "learning_rate": 0.00046354063518179044, + "loss": 0.2169, + "step": 126160 + }, + { + "epoch": 5.23, + "grad_norm": 0.3203125, + "learning_rate": 0.0004635349954132978, + "loss": 0.2293, + "step": 126170 + }, + { + "epoch": 5.23, + "grad_norm": 1.3359375, + "learning_rate": 0.00046352935524295535, + "loss": 0.175, + "step": 126180 + }, + { + "epoch": 5.23, + "grad_norm": 0.9375, + "learning_rate": 0.0004635237146707737, + "loss": 0.2165, + "step": 126190 + }, + { + "epoch": 5.23, + "grad_norm": 0.9375, + "learning_rate": 0.0004635180736967635, + "loss": 0.1311, + "step": 126200 + }, + { + "epoch": 5.23, + "grad_norm": 0.6484375, + "learning_rate": 0.0004635124323209353, + "loss": 0.225, + "step": 126210 + }, + { + "epoch": 5.23, + "grad_norm": 1.0, + "learning_rate": 0.0004635067905432998, + "loss": 0.1756, + "step": 126220 + }, + { + "epoch": 5.23, + "grad_norm": 0.76171875, + "learning_rate": 0.0004635011483638676, + "loss": 0.2247, + "step": 126230 + }, + { + "epoch": 5.23, + "grad_norm": 1.78125, + "learning_rate": 0.0004634955057826492, + "loss": 0.2158, + "step": 126240 + }, + { + "epoch": 5.23, + "grad_norm": 1.0390625, + "learning_rate": 0.0004634898627996554, + "loss": 0.2414, + "step": 126250 + }, + { + "epoch": 5.23, + "grad_norm": 0.384765625, + "learning_rate": 0.0004634842194148967, + "loss": 0.2023, + "step": 126260 + }, + { + "epoch": 5.23, + "grad_norm": 0.63671875, + "learning_rate": 0.00046347857562838374, + "loss": 0.2233, + "step": 126270 + }, + { + "epoch": 5.23, + "grad_norm": 0.466796875, + "learning_rate": 0.00046347293144012716, + "loss": 0.2131, + "step": 126280 + }, + { + "epoch": 5.23, + "grad_norm": 0.162109375, + "learning_rate": 0.0004634672868501376, + "loss": 0.2312, + "step": 126290 + }, + { + "epoch": 5.23, + "grad_norm": 0.6484375, + "learning_rate": 0.00046346164185842565, + "loss": 0.2405, + "step": 126300 + }, + { + "epoch": 5.23, + "grad_norm": 0.18359375, + "learning_rate": 0.0004634559964650019, + "loss": 0.1948, + "step": 126310 + }, + { + "epoch": 5.23, + "grad_norm": 0.33984375, + "learning_rate": 0.00046345035066987705, + "loss": 0.2388, + "step": 126320 + }, + { + "epoch": 5.23, + "grad_norm": 0.4921875, + "learning_rate": 0.00046344470447306173, + "loss": 0.2394, + "step": 126330 + }, + { + "epoch": 5.23, + "grad_norm": 1.109375, + "learning_rate": 0.00046343905787456653, + "loss": 0.2767, + "step": 126340 + }, + { + "epoch": 5.23, + "grad_norm": 1.1015625, + "learning_rate": 0.000463433410874402, + "loss": 0.2343, + "step": 126350 + }, + { + "epoch": 5.23, + "grad_norm": 0.474609375, + "learning_rate": 0.00046342776347257887, + "loss": 0.2502, + "step": 126360 + }, + { + "epoch": 5.23, + "grad_norm": 1.703125, + "learning_rate": 0.00046342211566910776, + "loss": 0.1723, + "step": 126370 + }, + { + "epoch": 5.23, + "grad_norm": 0.3203125, + "learning_rate": 0.00046341646746399923, + "loss": 0.1896, + "step": 126380 + }, + { + "epoch": 5.24, + "grad_norm": 0.58203125, + "learning_rate": 0.000463410818857264, + "loss": 0.1898, + "step": 126390 + }, + { + "epoch": 5.24, + "grad_norm": 1.5390625, + "learning_rate": 0.00046340516984891256, + "loss": 0.1387, + "step": 126400 + }, + { + "epoch": 5.24, + "grad_norm": 0.625, + "learning_rate": 0.00046339952043895574, + "loss": 0.1858, + "step": 126410 + }, + { + "epoch": 5.24, + "grad_norm": 0.52734375, + "learning_rate": 0.0004633938706274041, + "loss": 0.1832, + "step": 126420 + }, + { + "epoch": 5.24, + "grad_norm": 1.8828125, + "learning_rate": 0.0004633882204142681, + "loss": 0.2311, + "step": 126430 + }, + { + "epoch": 5.24, + "grad_norm": 0.69140625, + "learning_rate": 0.00046338256979955864, + "loss": 0.1915, + "step": 126440 + }, + { + "epoch": 5.24, + "grad_norm": 1.171875, + "learning_rate": 0.00046337691878328615, + "loss": 0.2468, + "step": 126450 + }, + { + "epoch": 5.24, + "grad_norm": 0.9921875, + "learning_rate": 0.00046337126736546144, + "loss": 0.1939, + "step": 126460 + }, + { + "epoch": 5.24, + "grad_norm": 0.63671875, + "learning_rate": 0.0004633656155460949, + "loss": 0.194, + "step": 126470 + }, + { + "epoch": 5.24, + "grad_norm": 1.0703125, + "learning_rate": 0.00046335996332519744, + "loss": 0.2073, + "step": 126480 + }, + { + "epoch": 5.24, + "grad_norm": 0.375, + "learning_rate": 0.0004633543107027795, + "loss": 0.1957, + "step": 126490 + }, + { + "epoch": 5.24, + "grad_norm": 1.1953125, + "learning_rate": 0.0004633486576788518, + "loss": 0.2257, + "step": 126500 + }, + { + "epoch": 5.24, + "grad_norm": 0.8828125, + "learning_rate": 0.000463343004253425, + "loss": 0.2019, + "step": 126510 + }, + { + "epoch": 5.24, + "grad_norm": 0.5078125, + "learning_rate": 0.00046333735042650967, + "loss": 0.176, + "step": 126520 + }, + { + "epoch": 5.24, + "grad_norm": 0.421875, + "learning_rate": 0.00046333169619811645, + "loss": 0.201, + "step": 126530 + }, + { + "epoch": 5.24, + "grad_norm": 1.1953125, + "learning_rate": 0.00046332604156825607, + "loss": 0.2304, + "step": 126540 + }, + { + "epoch": 5.24, + "grad_norm": 0.29296875, + "learning_rate": 0.00046332038653693915, + "loss": 0.2219, + "step": 126550 + }, + { + "epoch": 5.24, + "grad_norm": 0.37109375, + "learning_rate": 0.0004633147311041762, + "loss": 0.2038, + "step": 126560 + }, + { + "epoch": 5.24, + "grad_norm": 0.9609375, + "learning_rate": 0.000463309075269978, + "loss": 0.2059, + "step": 126570 + }, + { + "epoch": 5.24, + "grad_norm": 0.76953125, + "learning_rate": 0.00046330341903435526, + "loss": 0.2192, + "step": 126580 + }, + { + "epoch": 5.24, + "grad_norm": 0.98046875, + "learning_rate": 0.0004632977623973184, + "loss": 0.2255, + "step": 126590 + }, + { + "epoch": 5.24, + "grad_norm": 0.6015625, + "learning_rate": 0.0004632921053588782, + "loss": 0.1786, + "step": 126600 + }, + { + "epoch": 5.24, + "grad_norm": 0.7421875, + "learning_rate": 0.00046328644791904526, + "loss": 0.1789, + "step": 126610 + }, + { + "epoch": 5.24, + "grad_norm": 0.3984375, + "learning_rate": 0.0004632807900778304, + "loss": 0.2215, + "step": 126620 + }, + { + "epoch": 5.24, + "grad_norm": 0.765625, + "learning_rate": 0.0004632751318352441, + "loss": 0.2619, + "step": 126630 + }, + { + "epoch": 5.25, + "grad_norm": 0.96875, + "learning_rate": 0.00046326947319129684, + "loss": 0.2046, + "step": 126640 + }, + { + "epoch": 5.25, + "grad_norm": 0.8125, + "learning_rate": 0.0004632638141459996, + "loss": 0.1796, + "step": 126650 + }, + { + "epoch": 5.25, + "grad_norm": 0.82421875, + "learning_rate": 0.00046325815469936294, + "loss": 0.218, + "step": 126660 + }, + { + "epoch": 5.25, + "grad_norm": 0.57421875, + "learning_rate": 0.0004632524948513974, + "loss": 0.2308, + "step": 126670 + }, + { + "epoch": 5.25, + "grad_norm": 0.84765625, + "learning_rate": 0.0004632468346021137, + "loss": 0.2028, + "step": 126680 + }, + { + "epoch": 5.25, + "grad_norm": 1.234375, + "learning_rate": 0.00046324117395152244, + "loss": 0.1826, + "step": 126690 + }, + { + "epoch": 5.25, + "grad_norm": 0.5, + "learning_rate": 0.0004632355128996344, + "loss": 0.2404, + "step": 126700 + }, + { + "epoch": 5.25, + "grad_norm": 0.7734375, + "learning_rate": 0.0004632298514464601, + "loss": 0.1723, + "step": 126710 + }, + { + "epoch": 5.25, + "grad_norm": 0.53125, + "learning_rate": 0.0004632241895920103, + "loss": 0.1816, + "step": 126720 + }, + { + "epoch": 5.25, + "grad_norm": 0.67578125, + "learning_rate": 0.00046321852733629556, + "loss": 0.1843, + "step": 126730 + }, + { + "epoch": 5.25, + "grad_norm": 0.83203125, + "learning_rate": 0.0004632128646793266, + "loss": 0.2442, + "step": 126740 + }, + { + "epoch": 5.25, + "grad_norm": 0.58203125, + "learning_rate": 0.00046320720162111394, + "loss": 0.165, + "step": 126750 + }, + { + "epoch": 5.25, + "grad_norm": 0.64453125, + "learning_rate": 0.0004632015381616685, + "loss": 0.2186, + "step": 126760 + }, + { + "epoch": 5.25, + "grad_norm": 0.41015625, + "learning_rate": 0.00046319587430100075, + "loss": 0.1294, + "step": 126770 + }, + { + "epoch": 5.25, + "grad_norm": 0.451171875, + "learning_rate": 0.00046319021003912134, + "loss": 0.2167, + "step": 126780 + }, + { + "epoch": 5.25, + "grad_norm": 0.451171875, + "learning_rate": 0.00046318454537604104, + "loss": 0.2226, + "step": 126790 + }, + { + "epoch": 5.25, + "grad_norm": 0.384765625, + "learning_rate": 0.0004631788803117704, + "loss": 0.1842, + "step": 126800 + }, + { + "epoch": 5.25, + "grad_norm": 1.1171875, + "learning_rate": 0.00046317321484632014, + "loss": 0.2069, + "step": 126810 + }, + { + "epoch": 5.25, + "grad_norm": 1.078125, + "learning_rate": 0.00046316754897970095, + "loss": 0.2602, + "step": 126820 + }, + { + "epoch": 5.25, + "grad_norm": 1.4140625, + "learning_rate": 0.0004631618827119234, + "loss": 0.2237, + "step": 126830 + }, + { + "epoch": 5.25, + "grad_norm": 0.9453125, + "learning_rate": 0.00046315621604299816, + "loss": 0.2289, + "step": 126840 + }, + { + "epoch": 5.25, + "grad_norm": 0.59375, + "learning_rate": 0.0004631505489729361, + "loss": 0.2499, + "step": 126850 + }, + { + "epoch": 5.25, + "grad_norm": 0.796875, + "learning_rate": 0.0004631448815017476, + "loss": 0.196, + "step": 126860 + }, + { + "epoch": 5.25, + "grad_norm": 1.2578125, + "learning_rate": 0.00046313921362944345, + "loss": 0.1909, + "step": 126870 + }, + { + "epoch": 5.26, + "grad_norm": 0.625, + "learning_rate": 0.0004631335453560344, + "loss": 0.1832, + "step": 126880 + }, + { + "epoch": 5.26, + "grad_norm": 0.546875, + "learning_rate": 0.00046312787668153094, + "loss": 0.2506, + "step": 126890 + }, + { + "epoch": 5.26, + "grad_norm": 0.2265625, + "learning_rate": 0.00046312220760594394, + "loss": 0.1778, + "step": 126900 + }, + { + "epoch": 5.26, + "grad_norm": 0.6171875, + "learning_rate": 0.0004631165381292838, + "loss": 0.2135, + "step": 126910 + }, + { + "epoch": 5.26, + "grad_norm": 1.015625, + "learning_rate": 0.00046311086825156144, + "loss": 0.2729, + "step": 126920 + }, + { + "epoch": 5.26, + "grad_norm": 0.54296875, + "learning_rate": 0.0004631051979727875, + "loss": 0.203, + "step": 126930 + }, + { + "epoch": 5.26, + "grad_norm": 2.5, + "learning_rate": 0.00046309952729297254, + "loss": 0.2131, + "step": 126940 + }, + { + "epoch": 5.26, + "grad_norm": 0.47265625, + "learning_rate": 0.0004630938562121272, + "loss": 0.3026, + "step": 126950 + }, + { + "epoch": 5.26, + "grad_norm": 0.671875, + "learning_rate": 0.0004630881847302624, + "loss": 0.2168, + "step": 126960 + }, + { + "epoch": 5.26, + "grad_norm": 0.388671875, + "learning_rate": 0.00046308251284738855, + "loss": 0.1945, + "step": 126970 + }, + { + "epoch": 5.26, + "grad_norm": 0.90625, + "learning_rate": 0.0004630768405635164, + "loss": 0.2472, + "step": 126980 + }, + { + "epoch": 5.26, + "grad_norm": 0.9140625, + "learning_rate": 0.0004630711678786567, + "loss": 0.2101, + "step": 126990 + }, + { + "epoch": 5.26, + "grad_norm": 1.7109375, + "learning_rate": 0.00046306549479282, + "loss": 0.2476, + "step": 127000 + }, + { + "epoch": 5.26, + "grad_norm": 0.486328125, + "learning_rate": 0.00046305982130601716, + "loss": 0.2312, + "step": 127010 + }, + { + "epoch": 5.26, + "grad_norm": 0.93359375, + "learning_rate": 0.00046305414741825866, + "loss": 0.2249, + "step": 127020 + }, + { + "epoch": 5.26, + "grad_norm": 0.50390625, + "learning_rate": 0.00046304847312955526, + "loss": 0.179, + "step": 127030 + }, + { + "epoch": 5.26, + "grad_norm": 1.0078125, + "learning_rate": 0.0004630427984399177, + "loss": 0.2185, + "step": 127040 + }, + { + "epoch": 5.26, + "grad_norm": 0.000141143798828125, + "learning_rate": 0.00046303712334935657, + "loss": 0.1818, + "step": 127050 + }, + { + "epoch": 5.26, + "grad_norm": 0.79296875, + "learning_rate": 0.00046303144785788254, + "loss": 0.2109, + "step": 127060 + }, + { + "epoch": 5.26, + "grad_norm": 0.60546875, + "learning_rate": 0.0004630257719655064, + "loss": 0.2266, + "step": 127070 + }, + { + "epoch": 5.26, + "grad_norm": 0.5, + "learning_rate": 0.0004630200956722387, + "loss": 0.1917, + "step": 127080 + }, + { + "epoch": 5.26, + "grad_norm": 0.349609375, + "learning_rate": 0.00046301441897809027, + "loss": 0.1914, + "step": 127090 + }, + { + "epoch": 5.26, + "grad_norm": 1.46875, + "learning_rate": 0.00046300874188307163, + "loss": 0.2055, + "step": 127100 + }, + { + "epoch": 5.26, + "grad_norm": 0.78125, + "learning_rate": 0.0004630030643871935, + "loss": 0.2101, + "step": 127110 + }, + { + "epoch": 5.27, + "grad_norm": 0.51953125, + "learning_rate": 0.0004629973864904667, + "loss": 0.1741, + "step": 127120 + }, + { + "epoch": 5.27, + "grad_norm": 1.796875, + "learning_rate": 0.00046299170819290184, + "loss": 0.2262, + "step": 127130 + }, + { + "epoch": 5.27, + "grad_norm": 1.09375, + "learning_rate": 0.0004629860294945095, + "loss": 0.1868, + "step": 127140 + }, + { + "epoch": 5.27, + "grad_norm": 0.734375, + "learning_rate": 0.00046298035039530044, + "loss": 0.1813, + "step": 127150 + }, + { + "epoch": 5.27, + "grad_norm": 0.26953125, + "learning_rate": 0.00046297467089528546, + "loss": 0.1926, + "step": 127160 + }, + { + "epoch": 5.27, + "grad_norm": 0.640625, + "learning_rate": 0.0004629689909944751, + "loss": 0.2039, + "step": 127170 + }, + { + "epoch": 5.27, + "grad_norm": 0.55859375, + "learning_rate": 0.00046296331069288005, + "loss": 0.3192, + "step": 127180 + }, + { + "epoch": 5.27, + "grad_norm": 0.44140625, + "learning_rate": 0.0004629576299905111, + "loss": 0.1992, + "step": 127190 + }, + { + "epoch": 5.27, + "grad_norm": 0.5859375, + "learning_rate": 0.0004629519488873789, + "loss": 0.1839, + "step": 127200 + }, + { + "epoch": 5.27, + "grad_norm": 0.486328125, + "learning_rate": 0.0004629462673834941, + "loss": 0.2168, + "step": 127210 + }, + { + "epoch": 5.27, + "grad_norm": 0.71875, + "learning_rate": 0.00046294058547886744, + "loss": 0.2272, + "step": 127220 + }, + { + "epoch": 5.27, + "grad_norm": 0.490234375, + "learning_rate": 0.0004629349031735096, + "loss": 0.2251, + "step": 127230 + }, + { + "epoch": 5.27, + "grad_norm": 0.86328125, + "learning_rate": 0.0004629292204674312, + "loss": 0.2238, + "step": 127240 + }, + { + "epoch": 5.27, + "grad_norm": 0.66015625, + "learning_rate": 0.0004629235373606431, + "loss": 0.1782, + "step": 127250 + }, + { + "epoch": 5.27, + "grad_norm": 0.5390625, + "learning_rate": 0.00046291785385315574, + "loss": 0.2076, + "step": 127260 + }, + { + "epoch": 5.27, + "grad_norm": 0.3359375, + "learning_rate": 0.0004629121699449801, + "loss": 0.1794, + "step": 127270 + }, + { + "epoch": 5.27, + "grad_norm": 0.4453125, + "learning_rate": 0.00046290648563612676, + "loss": 0.2129, + "step": 127280 + }, + { + "epoch": 5.27, + "grad_norm": 0.412109375, + "learning_rate": 0.00046290080092660637, + "loss": 0.1704, + "step": 127290 + }, + { + "epoch": 5.27, + "grad_norm": 0.453125, + "learning_rate": 0.00046289511581642964, + "loss": 0.1781, + "step": 127300 + }, + { + "epoch": 5.27, + "grad_norm": 1.75, + "learning_rate": 0.00046288943030560736, + "loss": 0.1931, + "step": 127310 + }, + { + "epoch": 5.27, + "grad_norm": 0.5, + "learning_rate": 0.0004628837443941501, + "loss": 0.232, + "step": 127320 + }, + { + "epoch": 5.27, + "grad_norm": 1.2578125, + "learning_rate": 0.0004628780580820686, + "loss": 0.1811, + "step": 127330 + }, + { + "epoch": 5.27, + "grad_norm": 0.2236328125, + "learning_rate": 0.00046287237136937364, + "loss": 0.1694, + "step": 127340 + }, + { + "epoch": 5.27, + "grad_norm": 0.451171875, + "learning_rate": 0.00046286668425607594, + "loss": 0.2077, + "step": 127350 + }, + { + "epoch": 5.28, + "grad_norm": 1.15625, + "learning_rate": 0.00046286099674218596, + "loss": 0.204, + "step": 127360 + }, + { + "epoch": 5.28, + "grad_norm": 0.9375, + "learning_rate": 0.0004628553088277147, + "loss": 0.269, + "step": 127370 + }, + { + "epoch": 5.28, + "grad_norm": 0.82421875, + "learning_rate": 0.00046284962051267264, + "loss": 0.2171, + "step": 127380 + }, + { + "epoch": 5.28, + "grad_norm": 1.0234375, + "learning_rate": 0.0004628439317970706, + "loss": 0.2365, + "step": 127390 + }, + { + "epoch": 5.28, + "grad_norm": 0.7265625, + "learning_rate": 0.00046283824268091934, + "loss": 0.1926, + "step": 127400 + }, + { + "epoch": 5.28, + "grad_norm": 0.77734375, + "learning_rate": 0.00046283255316422947, + "loss": 0.2006, + "step": 127410 + }, + { + "epoch": 5.28, + "grad_norm": 1.59375, + "learning_rate": 0.0004628268632470117, + "loss": 0.1723, + "step": 127420 + }, + { + "epoch": 5.28, + "grad_norm": 0.7734375, + "learning_rate": 0.0004628211729292767, + "loss": 0.192, + "step": 127430 + }, + { + "epoch": 5.28, + "grad_norm": 1.21875, + "learning_rate": 0.0004628154822110353, + "loss": 0.2332, + "step": 127440 + }, + { + "epoch": 5.28, + "grad_norm": 0.515625, + "learning_rate": 0.0004628097910922982, + "loss": 0.1922, + "step": 127450 + }, + { + "epoch": 5.28, + "grad_norm": 0.365234375, + "learning_rate": 0.0004628040995730759, + "loss": 0.2112, + "step": 127460 + }, + { + "epoch": 5.28, + "grad_norm": 0.2138671875, + "learning_rate": 0.00046279840765337937, + "loss": 0.2347, + "step": 127470 + }, + { + "epoch": 5.28, + "grad_norm": 0.84765625, + "learning_rate": 0.0004627927153332192, + "loss": 0.2116, + "step": 127480 + }, + { + "epoch": 5.28, + "grad_norm": 0.6015625, + "learning_rate": 0.00046278702261260606, + "loss": 0.2439, + "step": 127490 + }, + { + "epoch": 5.28, + "grad_norm": 0.515625, + "learning_rate": 0.00046278132949155085, + "loss": 0.2273, + "step": 127500 + }, + { + "epoch": 5.28, + "grad_norm": 0.9375, + "learning_rate": 0.00046277563597006406, + "loss": 0.2144, + "step": 127510 + }, + { + "epoch": 5.28, + "grad_norm": 0.451171875, + "learning_rate": 0.0004627699420481565, + "loss": 0.2224, + "step": 127520 + }, + { + "epoch": 5.28, + "grad_norm": 0.45703125, + "learning_rate": 0.0004627642477258389, + "loss": 0.2046, + "step": 127530 + }, + { + "epoch": 5.28, + "grad_norm": 1.3828125, + "learning_rate": 0.00046275855300312204, + "loss": 0.2209, + "step": 127540 + }, + { + "epoch": 5.28, + "grad_norm": 1.0, + "learning_rate": 0.00046275285788001646, + "loss": 0.228, + "step": 127550 + }, + { + "epoch": 5.28, + "grad_norm": 0.9609375, + "learning_rate": 0.000462747162356533, + "loss": 0.2382, + "step": 127560 + }, + { + "epoch": 5.28, + "grad_norm": 1.3515625, + "learning_rate": 0.0004627414664326824, + "loss": 0.1869, + "step": 127570 + }, + { + "epoch": 5.28, + "grad_norm": 0.421875, + "learning_rate": 0.0004627357701084753, + "loss": 0.2711, + "step": 127580 + }, + { + "epoch": 5.28, + "grad_norm": 0.765625, + "learning_rate": 0.00046273007338392243, + "loss": 0.1879, + "step": 127590 + }, + { + "epoch": 5.29, + "grad_norm": 1.703125, + "learning_rate": 0.0004627243762590346, + "loss": 0.1574, + "step": 127600 + }, + { + "epoch": 5.29, + "grad_norm": 0.16796875, + "learning_rate": 0.0004627186787338224, + "loss": 0.1457, + "step": 127610 + }, + { + "epoch": 5.29, + "grad_norm": 0.70703125, + "learning_rate": 0.0004627129808082966, + "loss": 0.221, + "step": 127620 + }, + { + "epoch": 5.29, + "grad_norm": 0.451171875, + "learning_rate": 0.000462707282482468, + "loss": 0.2447, + "step": 127630 + }, + { + "epoch": 5.29, + "grad_norm": 0.498046875, + "learning_rate": 0.0004627015837563473, + "loss": 0.2106, + "step": 127640 + }, + { + "epoch": 5.29, + "grad_norm": 1.015625, + "learning_rate": 0.0004626958846299451, + "loss": 0.26, + "step": 127650 + }, + { + "epoch": 5.29, + "grad_norm": 0.703125, + "learning_rate": 0.00046269018510327226, + "loss": 0.1906, + "step": 127660 + }, + { + "epoch": 5.29, + "grad_norm": 0.59375, + "learning_rate": 0.0004626844851763394, + "loss": 0.2656, + "step": 127670 + }, + { + "epoch": 5.29, + "grad_norm": 0.515625, + "learning_rate": 0.00046267878484915735, + "loss": 0.1987, + "step": 127680 + }, + { + "epoch": 5.29, + "grad_norm": 0.91015625, + "learning_rate": 0.00046267308412173685, + "loss": 0.2102, + "step": 127690 + }, + { + "epoch": 5.29, + "grad_norm": 0.42578125, + "learning_rate": 0.0004626673829940885, + "loss": 0.1896, + "step": 127700 + }, + { + "epoch": 5.29, + "grad_norm": 0.392578125, + "learning_rate": 0.0004626616814662232, + "loss": 0.203, + "step": 127710 + }, + { + "epoch": 5.29, + "grad_norm": 0.47265625, + "learning_rate": 0.00046265597953815146, + "loss": 0.2171, + "step": 127720 + }, + { + "epoch": 5.29, + "grad_norm": 0.81640625, + "learning_rate": 0.00046265027720988417, + "loss": 0.1849, + "step": 127730 + }, + { + "epoch": 5.29, + "grad_norm": 0.041015625, + "learning_rate": 0.00046264457448143205, + "loss": 0.1763, + "step": 127740 + }, + { + "epoch": 5.29, + "grad_norm": 0.90234375, + "learning_rate": 0.0004626388713528058, + "loss": 0.2171, + "step": 127750 + }, + { + "epoch": 5.29, + "grad_norm": 1.3203125, + "learning_rate": 0.0004626331678240161, + "loss": 0.1707, + "step": 127760 + }, + { + "epoch": 5.29, + "grad_norm": 0.197265625, + "learning_rate": 0.00046262746389507384, + "loss": 0.1711, + "step": 127770 + }, + { + "epoch": 5.29, + "grad_norm": 1.265625, + "learning_rate": 0.0004626217595659896, + "loss": 0.2858, + "step": 127780 + }, + { + "epoch": 5.29, + "grad_norm": 0.361328125, + "learning_rate": 0.0004626160548367742, + "loss": 0.2448, + "step": 127790 + }, + { + "epoch": 5.29, + "grad_norm": 0.392578125, + "learning_rate": 0.00046261034970743826, + "loss": 0.1761, + "step": 127800 + }, + { + "epoch": 5.29, + "grad_norm": 0.75, + "learning_rate": 0.00046260464417799266, + "loss": 0.2231, + "step": 127810 + }, + { + "epoch": 5.29, + "grad_norm": 0.6484375, + "learning_rate": 0.00046259893824844806, + "loss": 0.2038, + "step": 127820 + }, + { + "epoch": 5.29, + "grad_norm": 0.59765625, + "learning_rate": 0.00046259323191881523, + "loss": 0.2055, + "step": 127830 + }, + { + "epoch": 5.3, + "grad_norm": 1.7734375, + "learning_rate": 0.0004625875251891049, + "loss": 0.2029, + "step": 127840 + }, + { + "epoch": 5.3, + "grad_norm": 0.185546875, + "learning_rate": 0.0004625818180593278, + "loss": 0.2097, + "step": 127850 + }, + { + "epoch": 5.3, + "grad_norm": 0.5859375, + "learning_rate": 0.00046257611052949465, + "loss": 0.2139, + "step": 127860 + }, + { + "epoch": 5.3, + "grad_norm": 0.478515625, + "learning_rate": 0.0004625704025996162, + "loss": 0.2233, + "step": 127870 + }, + { + "epoch": 5.3, + "grad_norm": 1.328125, + "learning_rate": 0.0004625646942697033, + "loss": 0.2305, + "step": 127880 + }, + { + "epoch": 5.3, + "grad_norm": 0.828125, + "learning_rate": 0.0004625589855397666, + "loss": 0.233, + "step": 127890 + }, + { + "epoch": 5.3, + "grad_norm": 0.796875, + "learning_rate": 0.0004625532764098168, + "loss": 0.181, + "step": 127900 + }, + { + "epoch": 5.3, + "grad_norm": 0.890625, + "learning_rate": 0.00046254756687986467, + "loss": 0.1709, + "step": 127910 + }, + { + "epoch": 5.3, + "grad_norm": 0.59375, + "learning_rate": 0.000462541856949921, + "loss": 0.169, + "step": 127920 + }, + { + "epoch": 5.3, + "grad_norm": 0.61328125, + "learning_rate": 0.0004625361466199964, + "loss": 0.2227, + "step": 127930 + }, + { + "epoch": 5.3, + "grad_norm": 0.65625, + "learning_rate": 0.0004625304358901019, + "loss": 0.2193, + "step": 127940 + }, + { + "epoch": 5.3, + "grad_norm": 1.5078125, + "learning_rate": 0.000462524724760248, + "loss": 0.1874, + "step": 127950 + }, + { + "epoch": 5.3, + "grad_norm": 0.8671875, + "learning_rate": 0.0004625190132304455, + "loss": 0.2301, + "step": 127960 + }, + { + "epoch": 5.3, + "grad_norm": 0.80859375, + "learning_rate": 0.00046251330130070515, + "loss": 0.2482, + "step": 127970 + }, + { + "epoch": 5.3, + "grad_norm": 0.3203125, + "learning_rate": 0.0004625075889710377, + "loss": 0.1928, + "step": 127980 + }, + { + "epoch": 5.3, + "grad_norm": 0.447265625, + "learning_rate": 0.000462501876241454, + "loss": 0.1628, + "step": 127990 + }, + { + "epoch": 5.3, + "grad_norm": 2.984375, + "learning_rate": 0.0004624961631119647, + "loss": 0.1917, + "step": 128000 + }, + { + "epoch": 5.3, + "grad_norm": 0.54296875, + "learning_rate": 0.0004624904495825805, + "loss": 0.1556, + "step": 128010 + }, + { + "epoch": 5.3, + "grad_norm": 1.0234375, + "learning_rate": 0.0004624847356533123, + "loss": 0.2249, + "step": 128020 + }, + { + "epoch": 5.3, + "grad_norm": 0.78515625, + "learning_rate": 0.00046247902132417074, + "loss": 0.2153, + "step": 128030 + }, + { + "epoch": 5.3, + "grad_norm": 0.83984375, + "learning_rate": 0.0004624733065951666, + "loss": 0.2027, + "step": 128040 + }, + { + "epoch": 5.3, + "grad_norm": 0.984375, + "learning_rate": 0.00046246759146631065, + "loss": 0.2315, + "step": 128050 + }, + { + "epoch": 5.3, + "grad_norm": 0.455078125, + "learning_rate": 0.00046246187593761357, + "loss": 0.1946, + "step": 128060 + }, + { + "epoch": 5.3, + "grad_norm": 1.0078125, + "learning_rate": 0.0004624561600090863, + "loss": 0.218, + "step": 128070 + }, + { + "epoch": 5.31, + "grad_norm": 0.36328125, + "learning_rate": 0.0004624504436807394, + "loss": 0.159, + "step": 128080 + }, + { + "epoch": 5.31, + "grad_norm": 0.84375, + "learning_rate": 0.00046244472695258375, + "loss": 0.1639, + "step": 128090 + }, + { + "epoch": 5.31, + "grad_norm": 0.234375, + "learning_rate": 0.0004624390098246301, + "loss": 0.1904, + "step": 128100 + }, + { + "epoch": 5.31, + "grad_norm": 0.4453125, + "learning_rate": 0.0004624332922968891, + "loss": 0.2017, + "step": 128110 + }, + { + "epoch": 5.31, + "grad_norm": 0.61328125, + "learning_rate": 0.0004624275743693716, + "loss": 0.2079, + "step": 128120 + }, + { + "epoch": 5.31, + "grad_norm": 0.6484375, + "learning_rate": 0.00046242185604208837, + "loss": 0.1926, + "step": 128130 + }, + { + "epoch": 5.31, + "grad_norm": 0.6640625, + "learning_rate": 0.0004624161373150501, + "loss": 0.2102, + "step": 128140 + }, + { + "epoch": 5.31, + "grad_norm": 1.1875, + "learning_rate": 0.00046241041818826766, + "loss": 0.1555, + "step": 128150 + }, + { + "epoch": 5.31, + "grad_norm": 0.5546875, + "learning_rate": 0.00046240469866175174, + "loss": 0.1652, + "step": 128160 + }, + { + "epoch": 5.31, + "grad_norm": 0.69921875, + "learning_rate": 0.0004623989787355131, + "loss": 0.1766, + "step": 128170 + }, + { + "epoch": 5.31, + "grad_norm": 0.60546875, + "learning_rate": 0.0004623932584095625, + "loss": 0.1758, + "step": 128180 + }, + { + "epoch": 5.31, + "grad_norm": 0.7265625, + "learning_rate": 0.0004623875376839107, + "loss": 0.2048, + "step": 128190 + }, + { + "epoch": 5.31, + "grad_norm": 0.3671875, + "learning_rate": 0.0004623818165585686, + "loss": 0.2171, + "step": 128200 + }, + { + "epoch": 5.31, + "grad_norm": 0.92578125, + "learning_rate": 0.0004623760950335467, + "loss": 0.1991, + "step": 128210 + }, + { + "epoch": 5.31, + "grad_norm": 0.625, + "learning_rate": 0.000462370373108856, + "loss": 0.1913, + "step": 128220 + }, + { + "epoch": 5.31, + "grad_norm": 1.6171875, + "learning_rate": 0.0004623646507845073, + "loss": 0.1698, + "step": 128230 + }, + { + "epoch": 5.31, + "grad_norm": 0.890625, + "learning_rate": 0.0004623589280605111, + "loss": 0.2173, + "step": 128240 + }, + { + "epoch": 5.31, + "grad_norm": 0.486328125, + "learning_rate": 0.0004623532049368784, + "loss": 0.1996, + "step": 128250 + }, + { + "epoch": 5.31, + "grad_norm": 0.53515625, + "learning_rate": 0.00046234748141361996, + "loss": 0.2039, + "step": 128260 + }, + { + "epoch": 5.31, + "grad_norm": 0.48046875, + "learning_rate": 0.00046234175749074635, + "loss": 0.2101, + "step": 128270 + }, + { + "epoch": 5.31, + "grad_norm": 1.4375, + "learning_rate": 0.0004623360331682686, + "loss": 0.1908, + "step": 128280 + }, + { + "epoch": 5.31, + "grad_norm": 1.5625, + "learning_rate": 0.0004623303084461973, + "loss": 0.2395, + "step": 128290 + }, + { + "epoch": 5.31, + "grad_norm": 0.73828125, + "learning_rate": 0.0004623245833245433, + "loss": 0.204, + "step": 128300 + }, + { + "epoch": 5.31, + "grad_norm": 0.78515625, + "learning_rate": 0.0004623188578033174, + "loss": 0.1837, + "step": 128310 + }, + { + "epoch": 5.31, + "grad_norm": 0.6328125, + "learning_rate": 0.00046231313188253034, + "loss": 0.2211, + "step": 128320 + }, + { + "epoch": 5.32, + "grad_norm": 0.546875, + "learning_rate": 0.00046230740556219284, + "loss": 0.1564, + "step": 128330 + }, + { + "epoch": 5.32, + "grad_norm": 0.49609375, + "learning_rate": 0.0004623016788423158, + "loss": 0.1948, + "step": 128340 + }, + { + "epoch": 5.32, + "grad_norm": 0.91015625, + "learning_rate": 0.00046229595172290984, + "loss": 0.1744, + "step": 128350 + }, + { + "epoch": 5.32, + "grad_norm": 0.30859375, + "learning_rate": 0.0004622902242039859, + "loss": 0.201, + "step": 128360 + }, + { + "epoch": 5.32, + "grad_norm": 0.75, + "learning_rate": 0.0004622844962855546, + "loss": 0.2436, + "step": 128370 + }, + { + "epoch": 5.32, + "grad_norm": 0.5703125, + "learning_rate": 0.00046227876796762693, + "loss": 0.1822, + "step": 128380 + }, + { + "epoch": 5.32, + "grad_norm": 0.57421875, + "learning_rate": 0.0004622730392502134, + "loss": 0.2262, + "step": 128390 + }, + { + "epoch": 5.32, + "grad_norm": 0.59765625, + "learning_rate": 0.00046226731013332504, + "loss": 0.2592, + "step": 128400 + }, + { + "epoch": 5.32, + "grad_norm": 0.9921875, + "learning_rate": 0.00046226158061697247, + "loss": 0.1754, + "step": 128410 + }, + { + "epoch": 5.32, + "grad_norm": 0.82421875, + "learning_rate": 0.0004622558507011666, + "loss": 0.2149, + "step": 128420 + }, + { + "epoch": 5.32, + "grad_norm": 1.40625, + "learning_rate": 0.00046225012038591807, + "loss": 0.2334, + "step": 128430 + }, + { + "epoch": 5.32, + "grad_norm": 1.0078125, + "learning_rate": 0.0004622443896712377, + "loss": 0.1976, + "step": 128440 + }, + { + "epoch": 5.32, + "grad_norm": 0.8515625, + "learning_rate": 0.00046223865855713636, + "loss": 0.2439, + "step": 128450 + }, + { + "epoch": 5.32, + "grad_norm": 0.435546875, + "learning_rate": 0.00046223292704362484, + "loss": 0.2001, + "step": 128460 + }, + { + "epoch": 5.32, + "grad_norm": 0.609375, + "learning_rate": 0.0004622271951307138, + "loss": 0.2086, + "step": 128470 + }, + { + "epoch": 5.32, + "grad_norm": 0.4765625, + "learning_rate": 0.0004622214628184141, + "loss": 0.1986, + "step": 128480 + }, + { + "epoch": 5.32, + "grad_norm": 0.1943359375, + "learning_rate": 0.00046221573010673655, + "loss": 0.1772, + "step": 128490 + }, + { + "epoch": 5.32, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004622099969956919, + "loss": 0.2376, + "step": 128500 + }, + { + "epoch": 5.32, + "grad_norm": 1.0234375, + "learning_rate": 0.00046220426348529095, + "loss": 0.1673, + "step": 128510 + }, + { + "epoch": 5.32, + "grad_norm": 0.376953125, + "learning_rate": 0.0004621985295755445, + "loss": 0.1942, + "step": 128520 + }, + { + "epoch": 5.32, + "grad_norm": 1.015625, + "learning_rate": 0.0004621927952664633, + "loss": 0.2042, + "step": 128530 + }, + { + "epoch": 5.32, + "grad_norm": 1.4609375, + "learning_rate": 0.00046218706055805827, + "loss": 0.2447, + "step": 128540 + }, + { + "epoch": 5.32, + "grad_norm": 0.80859375, + "learning_rate": 0.00046218132545033995, + "loss": 0.2204, + "step": 128550 + }, + { + "epoch": 5.32, + "grad_norm": 0.1982421875, + "learning_rate": 0.00046217558994331946, + "loss": 0.2065, + "step": 128560 + }, + { + "epoch": 5.33, + "grad_norm": 1.421875, + "learning_rate": 0.0004621698540370073, + "loss": 0.2149, + "step": 128570 + }, + { + "epoch": 5.33, + "grad_norm": 0.66796875, + "learning_rate": 0.00046216411773141445, + "loss": 0.2163, + "step": 128580 + }, + { + "epoch": 5.33, + "grad_norm": 1.484375, + "learning_rate": 0.0004621583810265516, + "loss": 0.241, + "step": 128590 + }, + { + "epoch": 5.33, + "grad_norm": 0.96484375, + "learning_rate": 0.0004621526439224296, + "loss": 0.2561, + "step": 128600 + }, + { + "epoch": 5.33, + "grad_norm": 1.2265625, + "learning_rate": 0.0004621469064190592, + "loss": 0.2288, + "step": 128610 + }, + { + "epoch": 5.33, + "grad_norm": 0.33203125, + "learning_rate": 0.00046214116851645134, + "loss": 0.189, + "step": 128620 + }, + { + "epoch": 5.33, + "grad_norm": 0.79296875, + "learning_rate": 0.0004621354302146166, + "loss": 0.2479, + "step": 128630 + }, + { + "epoch": 5.33, + "grad_norm": 0.5625, + "learning_rate": 0.00046212969151356595, + "loss": 0.2659, + "step": 128640 + }, + { + "epoch": 5.33, + "grad_norm": 0.478515625, + "learning_rate": 0.0004621239524133101, + "loss": 0.214, + "step": 128650 + }, + { + "epoch": 5.33, + "grad_norm": 0.49609375, + "learning_rate": 0.0004621182129138599, + "loss": 0.2093, + "step": 128660 + }, + { + "epoch": 5.33, + "grad_norm": 0.69140625, + "learning_rate": 0.00046211247301522615, + "loss": 0.2241, + "step": 128670 + }, + { + "epoch": 5.33, + "grad_norm": 0.49609375, + "learning_rate": 0.00046210673271741956, + "loss": 0.2067, + "step": 128680 + }, + { + "epoch": 5.33, + "grad_norm": 0.5078125, + "learning_rate": 0.0004621009920204511, + "loss": 0.1674, + "step": 128690 + }, + { + "epoch": 5.33, + "grad_norm": 0.78125, + "learning_rate": 0.0004620952509243314, + "loss": 0.2117, + "step": 128700 + }, + { + "epoch": 5.33, + "grad_norm": 0.47265625, + "learning_rate": 0.00046208950942907136, + "loss": 0.1988, + "step": 128710 + }, + { + "epoch": 5.33, + "grad_norm": 0.6875, + "learning_rate": 0.0004620837675346817, + "loss": 0.2667, + "step": 128720 + }, + { + "epoch": 5.33, + "grad_norm": 0.9140625, + "learning_rate": 0.00046207802524117345, + "loss": 0.2061, + "step": 128730 + }, + { + "epoch": 5.33, + "grad_norm": 0.5625, + "learning_rate": 0.00046207228254855715, + "loss": 0.2094, + "step": 128740 + }, + { + "epoch": 5.33, + "grad_norm": 1.2421875, + "learning_rate": 0.00046206653945684374, + "loss": 0.1592, + "step": 128750 + }, + { + "epoch": 5.33, + "grad_norm": 0.52734375, + "learning_rate": 0.000462060795966044, + "loss": 0.2182, + "step": 128760 + }, + { + "epoch": 5.33, + "grad_norm": 0.5546875, + "learning_rate": 0.00046205505207616874, + "loss": 0.2022, + "step": 128770 + }, + { + "epoch": 5.33, + "grad_norm": 0.83984375, + "learning_rate": 0.00046204930778722883, + "loss": 0.1991, + "step": 128780 + }, + { + "epoch": 5.33, + "grad_norm": 0.6953125, + "learning_rate": 0.00046204356309923497, + "loss": 0.1961, + "step": 128790 + }, + { + "epoch": 5.33, + "grad_norm": 0.279296875, + "learning_rate": 0.000462037818012198, + "loss": 0.2121, + "step": 128800 + }, + { + "epoch": 5.34, + "grad_norm": 0.5234375, + "learning_rate": 0.0004620320725261288, + "loss": 0.1856, + "step": 128810 + }, + { + "epoch": 5.34, + "grad_norm": 1.25, + "learning_rate": 0.0004620263266410381, + "loss": 0.2374, + "step": 128820 + }, + { + "epoch": 5.34, + "grad_norm": 0.396484375, + "learning_rate": 0.00046202058035693677, + "loss": 0.2321, + "step": 128830 + }, + { + "epoch": 5.34, + "grad_norm": 0.69140625, + "learning_rate": 0.00046201483367383554, + "loss": 0.1902, + "step": 128840 + }, + { + "epoch": 5.34, + "grad_norm": 0.96875, + "learning_rate": 0.0004620090865917455, + "loss": 0.2183, + "step": 128850 + }, + { + "epoch": 5.34, + "grad_norm": 0.625, + "learning_rate": 0.00046200333911067704, + "loss": 0.1763, + "step": 128860 + }, + { + "epoch": 5.34, + "grad_norm": 1.1875, + "learning_rate": 0.00046199759123064123, + "loss": 0.1793, + "step": 128870 + }, + { + "epoch": 5.34, + "grad_norm": 0.73828125, + "learning_rate": 0.0004619918429516489, + "loss": 0.2057, + "step": 128880 + }, + { + "epoch": 5.34, + "grad_norm": 0.302734375, + "learning_rate": 0.00046198609427371075, + "loss": 0.1565, + "step": 128890 + }, + { + "epoch": 5.34, + "grad_norm": 1.1796875, + "learning_rate": 0.0004619803451968377, + "loss": 0.2003, + "step": 128900 + }, + { + "epoch": 5.34, + "grad_norm": 0.26953125, + "learning_rate": 0.0004619745957210405, + "loss": 0.1509, + "step": 128910 + }, + { + "epoch": 5.34, + "grad_norm": 0.5703125, + "learning_rate": 0.00046196884584633004, + "loss": 0.164, + "step": 128920 + }, + { + "epoch": 5.34, + "grad_norm": 1.828125, + "learning_rate": 0.0004619630955727171, + "loss": 0.2489, + "step": 128930 + }, + { + "epoch": 5.34, + "grad_norm": 0.796875, + "learning_rate": 0.00046195734490021254, + "loss": 0.254, + "step": 128940 + }, + { + "epoch": 5.34, + "grad_norm": 0.93359375, + "learning_rate": 0.00046195159382882714, + "loss": 0.252, + "step": 128950 + }, + { + "epoch": 5.34, + "grad_norm": 0.69140625, + "learning_rate": 0.00046194584235857163, + "loss": 0.2225, + "step": 128960 + }, + { + "epoch": 5.34, + "grad_norm": 1.2578125, + "learning_rate": 0.00046194009048945706, + "loss": 0.198, + "step": 128970 + }, + { + "epoch": 5.34, + "grad_norm": 0.703125, + "learning_rate": 0.000461934338221494, + "loss": 0.2139, + "step": 128980 + }, + { + "epoch": 5.34, + "grad_norm": 0.5859375, + "learning_rate": 0.0004619285855546935, + "loss": 0.2321, + "step": 128990 + }, + { + "epoch": 5.34, + "grad_norm": 0.87890625, + "learning_rate": 0.00046192283248906626, + "loss": 0.2587, + "step": 129000 + }, + { + "epoch": 5.34, + "grad_norm": 0.8359375, + "learning_rate": 0.0004619170790246231, + "loss": 0.1752, + "step": 129010 + }, + { + "epoch": 5.34, + "grad_norm": 1.0078125, + "learning_rate": 0.0004619113251613749, + "loss": 0.2364, + "step": 129020 + }, + { + "epoch": 5.34, + "grad_norm": 0.2265625, + "learning_rate": 0.0004619055708993325, + "loss": 0.242, + "step": 129030 + }, + { + "epoch": 5.34, + "grad_norm": 0.7265625, + "learning_rate": 0.00046189981623850673, + "loss": 0.2195, + "step": 129040 + }, + { + "epoch": 5.35, + "grad_norm": 1.328125, + "learning_rate": 0.0004618940611789083, + "loss": 0.2189, + "step": 129050 + }, + { + "epoch": 5.35, + "grad_norm": 2.34375, + "learning_rate": 0.0004618883057205482, + "loss": 0.2212, + "step": 129060 + }, + { + "epoch": 5.35, + "grad_norm": 0.31640625, + "learning_rate": 0.00046188254986343717, + "loss": 0.2148, + "step": 129070 + }, + { + "epoch": 5.35, + "grad_norm": 0.91015625, + "learning_rate": 0.00046187679360758607, + "loss": 0.2324, + "step": 129080 + }, + { + "epoch": 5.35, + "grad_norm": 0.6953125, + "learning_rate": 0.0004618710369530057, + "loss": 0.1709, + "step": 129090 + }, + { + "epoch": 5.35, + "grad_norm": 0.51953125, + "learning_rate": 0.0004618652798997069, + "loss": 0.1897, + "step": 129100 + }, + { + "epoch": 5.35, + "grad_norm": 0.65625, + "learning_rate": 0.00046185952244770056, + "loss": 0.247, + "step": 129110 + }, + { + "epoch": 5.35, + "grad_norm": 0.8125, + "learning_rate": 0.00046185376459699745, + "loss": 0.233, + "step": 129120 + }, + { + "epoch": 5.35, + "grad_norm": 0.61328125, + "learning_rate": 0.0004618480063476085, + "loss": 0.2228, + "step": 129130 + }, + { + "epoch": 5.35, + "grad_norm": 0.640625, + "learning_rate": 0.00046184224769954443, + "loss": 0.21, + "step": 129140 + }, + { + "epoch": 5.35, + "grad_norm": 0.349609375, + "learning_rate": 0.00046183648865281616, + "loss": 0.189, + "step": 129150 + }, + { + "epoch": 5.35, + "grad_norm": 0.98046875, + "learning_rate": 0.0004618307292074344, + "loss": 0.213, + "step": 129160 + }, + { + "epoch": 5.35, + "grad_norm": 1.0078125, + "learning_rate": 0.00046182496936341013, + "loss": 0.244, + "step": 129170 + }, + { + "epoch": 5.35, + "grad_norm": 2.640625, + "learning_rate": 0.00046181920912075416, + "loss": 0.1937, + "step": 129180 + }, + { + "epoch": 5.35, + "grad_norm": 0.6015625, + "learning_rate": 0.00046181344847947734, + "loss": 0.2117, + "step": 129190 + }, + { + "epoch": 5.35, + "grad_norm": 1.21875, + "learning_rate": 0.00046180768743959045, + "loss": 0.211, + "step": 129200 + }, + { + "epoch": 5.35, + "grad_norm": 0.71484375, + "learning_rate": 0.0004618019260011044, + "loss": 0.2759, + "step": 129210 + }, + { + "epoch": 5.35, + "grad_norm": 0.43359375, + "learning_rate": 0.0004617961641640299, + "loss": 0.1988, + "step": 129220 + }, + { + "epoch": 5.35, + "grad_norm": 0.7109375, + "learning_rate": 0.000461790401928378, + "loss": 0.2061, + "step": 129230 + }, + { + "epoch": 5.35, + "grad_norm": 0.6953125, + "learning_rate": 0.00046178463929415935, + "loss": 0.1989, + "step": 129240 + }, + { + "epoch": 5.35, + "grad_norm": 0.390625, + "learning_rate": 0.000461778876261385, + "loss": 0.2088, + "step": 129250 + }, + { + "epoch": 5.35, + "grad_norm": 0.345703125, + "learning_rate": 0.00046177311283006556, + "loss": 0.1817, + "step": 129260 + }, + { + "epoch": 5.35, + "grad_norm": 0.32421875, + "learning_rate": 0.00046176734900021206, + "loss": 0.1951, + "step": 129270 + }, + { + "epoch": 5.35, + "grad_norm": 0.98828125, + "learning_rate": 0.0004617615847718352, + "loss": 0.2283, + "step": 129280 + }, + { + "epoch": 5.36, + "grad_norm": 0.64453125, + "learning_rate": 0.000461755820144946, + "loss": 0.2206, + "step": 129290 + }, + { + "epoch": 5.36, + "grad_norm": 1.125, + "learning_rate": 0.00046175005511955516, + "loss": 0.2522, + "step": 129300 + }, + { + "epoch": 5.36, + "grad_norm": 0.359375, + "learning_rate": 0.00046174428969567363, + "loss": 0.1741, + "step": 129310 + }, + { + "epoch": 5.36, + "grad_norm": 0.5390625, + "learning_rate": 0.0004617385238733122, + "loss": 0.2149, + "step": 129320 + }, + { + "epoch": 5.36, + "grad_norm": 0.8515625, + "learning_rate": 0.0004617327576524817, + "loss": 0.2311, + "step": 129330 + }, + { + "epoch": 5.36, + "grad_norm": 0.62890625, + "learning_rate": 0.000461726991033193, + "loss": 0.1966, + "step": 129340 + }, + { + "epoch": 5.36, + "grad_norm": 0.5078125, + "learning_rate": 0.00046172122401545696, + "loss": 0.2169, + "step": 129350 + }, + { + "epoch": 5.36, + "grad_norm": 0.259765625, + "learning_rate": 0.0004617154565992845, + "loss": 0.1603, + "step": 129360 + }, + { + "epoch": 5.36, + "grad_norm": 0.78125, + "learning_rate": 0.00046170968878468633, + "loss": 0.2613, + "step": 129370 + }, + { + "epoch": 5.36, + "grad_norm": 0.83984375, + "learning_rate": 0.00046170392057167346, + "loss": 0.2346, + "step": 129380 + }, + { + "epoch": 5.36, + "grad_norm": 0.94921875, + "learning_rate": 0.0004616981519602567, + "loss": 0.215, + "step": 129390 + }, + { + "epoch": 5.36, + "grad_norm": 0.921875, + "learning_rate": 0.0004616923829504468, + "loss": 0.2129, + "step": 129400 + }, + { + "epoch": 5.36, + "grad_norm": 0.50390625, + "learning_rate": 0.0004616866135422547, + "loss": 0.1675, + "step": 129410 + }, + { + "epoch": 5.36, + "grad_norm": 0.8828125, + "learning_rate": 0.00046168084373569135, + "loss": 0.1869, + "step": 129420 + }, + { + "epoch": 5.36, + "grad_norm": 0.5234375, + "learning_rate": 0.00046167507353076745, + "loss": 0.24, + "step": 129430 + }, + { + "epoch": 5.36, + "grad_norm": 0.68359375, + "learning_rate": 0.00046166930292749385, + "loss": 0.1692, + "step": 129440 + }, + { + "epoch": 5.36, + "grad_norm": 0.9609375, + "learning_rate": 0.0004616635319258816, + "loss": 0.2103, + "step": 129450 + }, + { + "epoch": 5.36, + "grad_norm": 1.1875, + "learning_rate": 0.00046165776052594146, + "loss": 0.2224, + "step": 129460 + }, + { + "epoch": 5.36, + "grad_norm": 0.5390625, + "learning_rate": 0.00046165198872768413, + "loss": 0.217, + "step": 129470 + }, + { + "epoch": 5.36, + "grad_norm": 0.859375, + "learning_rate": 0.0004616462165311207, + "loss": 0.2157, + "step": 129480 + }, + { + "epoch": 5.36, + "grad_norm": 0.39453125, + "learning_rate": 0.0004616404439362619, + "loss": 0.2152, + "step": 129490 + }, + { + "epoch": 5.36, + "grad_norm": 0.70703125, + "learning_rate": 0.0004616346709431187, + "loss": 0.2284, + "step": 129500 + }, + { + "epoch": 5.36, + "grad_norm": 0.97265625, + "learning_rate": 0.0004616288975517019, + "loss": 0.2258, + "step": 129510 + }, + { + "epoch": 5.36, + "grad_norm": 0.67578125, + "learning_rate": 0.0004616231237620223, + "loss": 0.1997, + "step": 129520 + }, + { + "epoch": 5.37, + "grad_norm": 1.09375, + "learning_rate": 0.0004616173495740909, + "loss": 0.2387, + "step": 129530 + }, + { + "epoch": 5.37, + "grad_norm": 0.9765625, + "learning_rate": 0.0004616115749879185, + "loss": 0.2465, + "step": 129540 + }, + { + "epoch": 5.37, + "grad_norm": 0.435546875, + "learning_rate": 0.00046160580000351593, + "loss": 0.1884, + "step": 129550 + }, + { + "epoch": 5.37, + "grad_norm": 0.486328125, + "learning_rate": 0.00046160002462089413, + "loss": 0.1573, + "step": 129560 + }, + { + "epoch": 5.37, + "grad_norm": 0.6328125, + "learning_rate": 0.0004615942488400639, + "loss": 0.2253, + "step": 129570 + }, + { + "epoch": 5.37, + "grad_norm": 0.408203125, + "learning_rate": 0.0004615884726610362, + "loss": 0.2313, + "step": 129580 + }, + { + "epoch": 5.37, + "grad_norm": 0.734375, + "learning_rate": 0.00046158269608382175, + "loss": 0.169, + "step": 129590 + }, + { + "epoch": 5.37, + "grad_norm": 1.171875, + "learning_rate": 0.00046157691910843156, + "loss": 0.1819, + "step": 129600 + }, + { + "epoch": 5.37, + "grad_norm": 1.0703125, + "learning_rate": 0.0004615711417348765, + "loss": 0.1835, + "step": 129610 + }, + { + "epoch": 5.37, + "grad_norm": 0.91796875, + "learning_rate": 0.0004615653639631674, + "loss": 0.2458, + "step": 129620 + }, + { + "epoch": 5.37, + "grad_norm": 0.54296875, + "learning_rate": 0.00046155958579331506, + "loss": 0.1984, + "step": 129630 + }, + { + "epoch": 5.37, + "grad_norm": 0.88671875, + "learning_rate": 0.0004615538072253305, + "loss": 0.2299, + "step": 129640 + }, + { + "epoch": 5.37, + "grad_norm": 1.4140625, + "learning_rate": 0.0004615480282592245, + "loss": 0.1609, + "step": 129650 + }, + { + "epoch": 5.37, + "grad_norm": 1.5859375, + "learning_rate": 0.0004615422488950079, + "loss": 0.2564, + "step": 129660 + }, + { + "epoch": 5.37, + "grad_norm": 0.59375, + "learning_rate": 0.00046153646913269164, + "loss": 0.2241, + "step": 129670 + }, + { + "epoch": 5.37, + "grad_norm": 0.6015625, + "learning_rate": 0.00046153068897228665, + "loss": 0.2118, + "step": 129680 + }, + { + "epoch": 5.37, + "grad_norm": 0.53125, + "learning_rate": 0.00046152490841380376, + "loss": 0.2175, + "step": 129690 + }, + { + "epoch": 5.37, + "grad_norm": 0.68359375, + "learning_rate": 0.00046151912745725376, + "loss": 0.2236, + "step": 129700 + }, + { + "epoch": 5.37, + "grad_norm": 0.416015625, + "learning_rate": 0.0004615133461026476, + "loss": 0.2146, + "step": 129710 + }, + { + "epoch": 5.37, + "grad_norm": 0.421875, + "learning_rate": 0.00046150756434999626, + "loss": 0.2207, + "step": 129720 + }, + { + "epoch": 5.37, + "grad_norm": 0.71484375, + "learning_rate": 0.0004615017821993105, + "loss": 0.2241, + "step": 129730 + }, + { + "epoch": 5.37, + "grad_norm": 0.7421875, + "learning_rate": 0.0004614959996506012, + "loss": 0.2086, + "step": 129740 + }, + { + "epoch": 5.37, + "grad_norm": 1.265625, + "learning_rate": 0.00046149021670387923, + "loss": 0.2429, + "step": 129750 + }, + { + "epoch": 5.37, + "grad_norm": 0.94921875, + "learning_rate": 0.00046148443335915554, + "loss": 0.196, + "step": 129760 + }, + { + "epoch": 5.38, + "grad_norm": 0.7265625, + "learning_rate": 0.00046147864961644096, + "loss": 0.2615, + "step": 129770 + }, + { + "epoch": 5.38, + "grad_norm": 0.84765625, + "learning_rate": 0.0004614728654757464, + "loss": 0.1792, + "step": 129780 + }, + { + "epoch": 5.38, + "grad_norm": 0.404296875, + "learning_rate": 0.0004614670809370828, + "loss": 0.1801, + "step": 129790 + }, + { + "epoch": 5.38, + "grad_norm": 0.3984375, + "learning_rate": 0.00046146129600046094, + "loss": 0.2111, + "step": 129800 + }, + { + "epoch": 5.38, + "grad_norm": 1.046875, + "learning_rate": 0.00046145551066589184, + "loss": 0.213, + "step": 129810 + }, + { + "epoch": 5.38, + "grad_norm": 0.84765625, + "learning_rate": 0.0004614497249333862, + "loss": 0.2489, + "step": 129820 + }, + { + "epoch": 5.38, + "grad_norm": 0.8984375, + "learning_rate": 0.0004614439388029551, + "loss": 0.2275, + "step": 129830 + }, + { + "epoch": 5.38, + "grad_norm": 0.73046875, + "learning_rate": 0.00046143815227460926, + "loss": 0.2593, + "step": 129840 + }, + { + "epoch": 5.38, + "grad_norm": 0.9453125, + "learning_rate": 0.0004614323653483597, + "loss": 0.1726, + "step": 129850 + }, + { + "epoch": 5.38, + "grad_norm": 1.046875, + "learning_rate": 0.00046142657802421727, + "loss": 0.1961, + "step": 129860 + }, + { + "epoch": 5.38, + "grad_norm": 1.6015625, + "learning_rate": 0.0004614207903021928, + "loss": 0.2276, + "step": 129870 + }, + { + "epoch": 5.38, + "grad_norm": 0.427734375, + "learning_rate": 0.00046141500218229727, + "loss": 0.1944, + "step": 129880 + }, + { + "epoch": 5.38, + "grad_norm": 0.859375, + "learning_rate": 0.00046140921366454154, + "loss": 0.1957, + "step": 129890 + }, + { + "epoch": 5.38, + "grad_norm": 0.451171875, + "learning_rate": 0.0004614034247489365, + "loss": 0.1804, + "step": 129900 + }, + { + "epoch": 5.38, + "grad_norm": 1.0703125, + "learning_rate": 0.000461397635435493, + "loss": 0.2329, + "step": 129910 + }, + { + "epoch": 5.38, + "grad_norm": 0.287109375, + "learning_rate": 0.000461391845724222, + "loss": 0.1862, + "step": 129920 + }, + { + "epoch": 5.38, + "grad_norm": 0.216796875, + "learning_rate": 0.0004613860556151344, + "loss": 0.2049, + "step": 129930 + }, + { + "epoch": 5.38, + "grad_norm": 1.015625, + "learning_rate": 0.00046138026510824103, + "loss": 0.2027, + "step": 129940 + }, + { + "epoch": 5.38, + "grad_norm": 0.375, + "learning_rate": 0.0004613744742035529, + "loss": 0.2362, + "step": 129950 + }, + { + "epoch": 5.38, + "grad_norm": 3.59375, + "learning_rate": 0.0004613686829010808, + "loss": 0.2056, + "step": 129960 + }, + { + "epoch": 5.38, + "grad_norm": 0.609375, + "learning_rate": 0.0004613628912008356, + "loss": 0.2409, + "step": 129970 + }, + { + "epoch": 5.38, + "grad_norm": 0.3828125, + "learning_rate": 0.0004613570991028284, + "loss": 0.2104, + "step": 129980 + }, + { + "epoch": 5.38, + "grad_norm": 0.35546875, + "learning_rate": 0.00046135130660706985, + "loss": 0.2034, + "step": 129990 + }, + { + "epoch": 5.38, + "grad_norm": 0.61328125, + "learning_rate": 0.000461345513713571, + "loss": 0.2532, + "step": 130000 + }, + { + "epoch": 5.38, + "grad_norm": 0.33203125, + "learning_rate": 0.0004613397204223427, + "loss": 0.2011, + "step": 130010 + }, + { + "epoch": 5.39, + "grad_norm": 0.474609375, + "learning_rate": 0.00046133392673339594, + "loss": 0.1616, + "step": 130020 + }, + { + "epoch": 5.39, + "grad_norm": 1.6328125, + "learning_rate": 0.00046132813264674146, + "loss": 0.2147, + "step": 130030 + }, + { + "epoch": 5.39, + "grad_norm": 0.3828125, + "learning_rate": 0.00046132233816239035, + "loss": 0.2247, + "step": 130040 + }, + { + "epoch": 5.39, + "grad_norm": 0.421875, + "learning_rate": 0.00046131654328035335, + "loss": 0.1933, + "step": 130050 + }, + { + "epoch": 5.39, + "grad_norm": 0.65625, + "learning_rate": 0.00046131074800064145, + "loss": 0.2065, + "step": 130060 + }, + { + "epoch": 5.39, + "grad_norm": 0.408203125, + "learning_rate": 0.00046130495232326555, + "loss": 0.1783, + "step": 130070 + }, + { + "epoch": 5.39, + "grad_norm": 0.90625, + "learning_rate": 0.0004612991562482365, + "loss": 0.2508, + "step": 130080 + }, + { + "epoch": 5.39, + "grad_norm": 1.3984375, + "learning_rate": 0.0004612933597755653, + "loss": 0.2387, + "step": 130090 + }, + { + "epoch": 5.39, + "grad_norm": 0.5234375, + "learning_rate": 0.0004612875629052629, + "loss": 0.1701, + "step": 130100 + }, + { + "epoch": 5.39, + "grad_norm": 0.84375, + "learning_rate": 0.00046128176563733995, + "loss": 0.1923, + "step": 130110 + }, + { + "epoch": 5.39, + "grad_norm": 0.6640625, + "learning_rate": 0.0004612759679718076, + "loss": 0.1991, + "step": 130120 + }, + { + "epoch": 5.39, + "grad_norm": 0.74609375, + "learning_rate": 0.00046127016990867676, + "loss": 0.267, + "step": 130130 + }, + { + "epoch": 5.39, + "grad_norm": 0.671875, + "learning_rate": 0.0004612643714479582, + "loss": 0.2361, + "step": 130140 + }, + { + "epoch": 5.39, + "grad_norm": 0.43359375, + "learning_rate": 0.00046125857258966295, + "loss": 0.2124, + "step": 130150 + }, + { + "epoch": 5.39, + "grad_norm": 1.0234375, + "learning_rate": 0.0004612527733338019, + "loss": 0.1829, + "step": 130160 + }, + { + "epoch": 5.39, + "grad_norm": 1.09375, + "learning_rate": 0.0004612469736803859, + "loss": 0.2302, + "step": 130170 + }, + { + "epoch": 5.39, + "grad_norm": 0.70703125, + "learning_rate": 0.0004612411736294259, + "loss": 0.1696, + "step": 130180 + }, + { + "epoch": 5.39, + "grad_norm": 0.435546875, + "learning_rate": 0.0004612353731809328, + "loss": 0.2221, + "step": 130190 + }, + { + "epoch": 5.39, + "grad_norm": 0.51171875, + "learning_rate": 0.0004612295723349176, + "loss": 0.2024, + "step": 130200 + }, + { + "epoch": 5.39, + "grad_norm": 0.57421875, + "learning_rate": 0.0004612237710913911, + "loss": 0.2249, + "step": 130210 + }, + { + "epoch": 5.39, + "grad_norm": 1.3671875, + "learning_rate": 0.00046121796945036437, + "loss": 0.2086, + "step": 130220 + }, + { + "epoch": 5.39, + "grad_norm": 3.078125, + "learning_rate": 0.0004612121674118482, + "loss": 0.2445, + "step": 130230 + }, + { + "epoch": 5.39, + "grad_norm": 1.2421875, + "learning_rate": 0.00046120636497585344, + "loss": 0.2521, + "step": 130240 + }, + { + "epoch": 5.39, + "grad_norm": 0.37890625, + "learning_rate": 0.0004612005621423913, + "loss": 0.2058, + "step": 130250 + }, + { + "epoch": 5.4, + "grad_norm": 1.3515625, + "learning_rate": 0.00046119475891147235, + "loss": 0.2082, + "step": 130260 + }, + { + "epoch": 5.4, + "grad_norm": 0.71484375, + "learning_rate": 0.0004611889552831077, + "loss": 0.1633, + "step": 130270 + }, + { + "epoch": 5.4, + "grad_norm": 0.0, + "learning_rate": 0.00046118315125730827, + "loss": 0.1863, + "step": 130280 + }, + { + "epoch": 5.4, + "grad_norm": 0.8125, + "learning_rate": 0.0004611773468340849, + "loss": 0.2206, + "step": 130290 + }, + { + "epoch": 5.4, + "grad_norm": 1.71875, + "learning_rate": 0.0004611715420134486, + "loss": 0.1776, + "step": 130300 + }, + { + "epoch": 5.4, + "grad_norm": 0.7734375, + "learning_rate": 0.0004611657367954103, + "loss": 0.2193, + "step": 130310 + }, + { + "epoch": 5.4, + "grad_norm": 0.54296875, + "learning_rate": 0.00046115993117998083, + "loss": 0.2203, + "step": 130320 + }, + { + "epoch": 5.4, + "grad_norm": 0.6171875, + "learning_rate": 0.0004611541251671712, + "loss": 0.1838, + "step": 130330 + }, + { + "epoch": 5.4, + "grad_norm": 0.73046875, + "learning_rate": 0.0004611483187569923, + "loss": 0.1754, + "step": 130340 + }, + { + "epoch": 5.4, + "grad_norm": 1.0390625, + "learning_rate": 0.0004611425119494551, + "loss": 0.1935, + "step": 130350 + }, + { + "epoch": 5.4, + "grad_norm": 0.953125, + "learning_rate": 0.0004611367047445705, + "loss": 0.1583, + "step": 130360 + }, + { + "epoch": 5.4, + "grad_norm": 0.55078125, + "learning_rate": 0.00046113089714234937, + "loss": 0.2109, + "step": 130370 + }, + { + "epoch": 5.4, + "grad_norm": 0.0, + "learning_rate": 0.00046112508914280273, + "loss": 0.2406, + "step": 130380 + }, + { + "epoch": 5.4, + "grad_norm": 1.1328125, + "learning_rate": 0.00046111928074594143, + "loss": 0.2242, + "step": 130390 + }, + { + "epoch": 5.4, + "grad_norm": 0.50390625, + "learning_rate": 0.00046111347195177646, + "loss": 0.1855, + "step": 130400 + }, + { + "epoch": 5.4, + "grad_norm": 0.65625, + "learning_rate": 0.00046110766276031873, + "loss": 0.2767, + "step": 130410 + }, + { + "epoch": 5.4, + "grad_norm": 1.3046875, + "learning_rate": 0.0004611018531715792, + "loss": 0.1993, + "step": 130420 + }, + { + "epoch": 5.4, + "grad_norm": 0.421875, + "learning_rate": 0.00046109604318556876, + "loss": 0.1646, + "step": 130430 + }, + { + "epoch": 5.4, + "grad_norm": 0.8046875, + "learning_rate": 0.0004610902328022983, + "loss": 0.2002, + "step": 130440 + }, + { + "epoch": 5.4, + "grad_norm": 1.0234375, + "learning_rate": 0.00046108442202177897, + "loss": 0.1686, + "step": 130450 + }, + { + "epoch": 5.4, + "grad_norm": 0.921875, + "learning_rate": 0.0004610786108440215, + "loss": 0.2066, + "step": 130460 + }, + { + "epoch": 5.4, + "grad_norm": 0.86328125, + "learning_rate": 0.0004610727992690368, + "loss": 0.2248, + "step": 130470 + }, + { + "epoch": 5.4, + "grad_norm": 0.376953125, + "learning_rate": 0.0004610669872968359, + "loss": 0.1899, + "step": 130480 + }, + { + "epoch": 5.4, + "grad_norm": 0.4453125, + "learning_rate": 0.0004610611749274298, + "loss": 0.1678, + "step": 130490 + }, + { + "epoch": 5.41, + "grad_norm": 4.90625, + "learning_rate": 0.0004610553621608293, + "loss": 0.1344, + "step": 130500 + }, + { + "epoch": 5.41, + "grad_norm": 0.890625, + "learning_rate": 0.0004610495489970454, + "loss": 0.1715, + "step": 130510 + }, + { + "epoch": 5.41, + "grad_norm": 0.486328125, + "learning_rate": 0.00046104373543608915, + "loss": 0.2128, + "step": 130520 + }, + { + "epoch": 5.41, + "grad_norm": 0.85546875, + "learning_rate": 0.00046103792147797126, + "loss": 0.2522, + "step": 130530 + }, + { + "epoch": 5.41, + "grad_norm": 0.77734375, + "learning_rate": 0.00046103210712270285, + "loss": 0.1851, + "step": 130540 + }, + { + "epoch": 5.41, + "grad_norm": 0.33203125, + "learning_rate": 0.0004610262923702947, + "loss": 0.1815, + "step": 130550 + }, + { + "epoch": 5.41, + "grad_norm": 1.015625, + "learning_rate": 0.000461020477220758, + "loss": 0.215, + "step": 130560 + }, + { + "epoch": 5.41, + "grad_norm": 1.375, + "learning_rate": 0.00046101466167410346, + "loss": 0.1971, + "step": 130570 + }, + { + "epoch": 5.41, + "grad_norm": 1.40625, + "learning_rate": 0.00046100884573034215, + "loss": 0.1861, + "step": 130580 + }, + { + "epoch": 5.41, + "grad_norm": 0.6796875, + "learning_rate": 0.00046100302938948496, + "loss": 0.1658, + "step": 130590 + }, + { + "epoch": 5.41, + "grad_norm": 0.9453125, + "learning_rate": 0.00046099721265154285, + "loss": 0.2417, + "step": 130600 + }, + { + "epoch": 5.41, + "grad_norm": 0.8828125, + "learning_rate": 0.0004609913955165268, + "loss": 0.2622, + "step": 130610 + }, + { + "epoch": 5.41, + "grad_norm": 0.271484375, + "learning_rate": 0.00046098557798444776, + "loss": 0.1987, + "step": 130620 + }, + { + "epoch": 5.41, + "grad_norm": 0.70703125, + "learning_rate": 0.00046097976005531657, + "loss": 0.1976, + "step": 130630 + }, + { + "epoch": 5.41, + "grad_norm": 0.0, + "learning_rate": 0.00046097394172914426, + "loss": 0.2311, + "step": 130640 + }, + { + "epoch": 5.41, + "grad_norm": 0.48046875, + "learning_rate": 0.0004609681230059418, + "loss": 0.2073, + "step": 130650 + }, + { + "epoch": 5.41, + "grad_norm": 1.6953125, + "learning_rate": 0.0004609623038857201, + "loss": 0.2428, + "step": 130660 + }, + { + "epoch": 5.41, + "grad_norm": 0.51953125, + "learning_rate": 0.00046095648436849014, + "loss": 0.2192, + "step": 130670 + }, + { + "epoch": 5.41, + "grad_norm": 0.5859375, + "learning_rate": 0.0004609506644542629, + "loss": 0.2307, + "step": 130680 + }, + { + "epoch": 5.41, + "grad_norm": 1.984375, + "learning_rate": 0.00046094484414304927, + "loss": 0.1659, + "step": 130690 + }, + { + "epoch": 5.41, + "grad_norm": 1.1484375, + "learning_rate": 0.00046093902343486017, + "loss": 0.2082, + "step": 130700 + }, + { + "epoch": 5.41, + "grad_norm": 1.140625, + "learning_rate": 0.0004609332023297066, + "loss": 0.2479, + "step": 130710 + }, + { + "epoch": 5.41, + "grad_norm": 0.83203125, + "learning_rate": 0.0004609273808275996, + "loss": 0.1875, + "step": 130720 + }, + { + "epoch": 5.41, + "grad_norm": 0.6484375, + "learning_rate": 0.00046092155892855, + "loss": 0.2338, + "step": 130730 + }, + { + "epoch": 5.42, + "grad_norm": 0.3828125, + "learning_rate": 0.0004609157366325688, + "loss": 0.2075, + "step": 130740 + }, + { + "epoch": 5.42, + "grad_norm": 0.9453125, + "learning_rate": 0.00046090991393966696, + "loss": 0.2275, + "step": 130750 + }, + { + "epoch": 5.42, + "grad_norm": 1.1171875, + "learning_rate": 0.00046090409084985546, + "loss": 0.2403, + "step": 130760 + }, + { + "epoch": 5.42, + "grad_norm": 0.62890625, + "learning_rate": 0.0004608982673631453, + "loss": 0.1991, + "step": 130770 + }, + { + "epoch": 5.42, + "grad_norm": 0.279296875, + "learning_rate": 0.00046089244347954727, + "loss": 0.2082, + "step": 130780 + }, + { + "epoch": 5.42, + "grad_norm": 0.84765625, + "learning_rate": 0.0004608866191990725, + "loss": 0.1776, + "step": 130790 + }, + { + "epoch": 5.42, + "grad_norm": 1.1328125, + "learning_rate": 0.0004608807945217318, + "loss": 0.2329, + "step": 130800 + }, + { + "epoch": 5.42, + "grad_norm": 0.2138671875, + "learning_rate": 0.00046087496944753625, + "loss": 0.2178, + "step": 130810 + }, + { + "epoch": 5.42, + "grad_norm": 0.82421875, + "learning_rate": 0.00046086914397649683, + "loss": 0.1878, + "step": 130820 + }, + { + "epoch": 5.42, + "grad_norm": 0.66796875, + "learning_rate": 0.00046086331810862445, + "loss": 0.2471, + "step": 130830 + }, + { + "epoch": 5.42, + "grad_norm": 0.5078125, + "learning_rate": 0.00046085749184393, + "loss": 0.2128, + "step": 130840 + }, + { + "epoch": 5.42, + "grad_norm": 0.6015625, + "learning_rate": 0.00046085166518242463, + "loss": 0.1552, + "step": 130850 + }, + { + "epoch": 5.42, + "grad_norm": 0.9140625, + "learning_rate": 0.00046084583812411913, + "loss": 0.2162, + "step": 130860 + }, + { + "epoch": 5.42, + "grad_norm": 0.1728515625, + "learning_rate": 0.00046084001066902454, + "loss": 0.2184, + "step": 130870 + }, + { + "epoch": 5.42, + "grad_norm": 1.375, + "learning_rate": 0.00046083418281715185, + "loss": 0.1732, + "step": 130880 + }, + { + "epoch": 5.42, + "grad_norm": 1.0, + "learning_rate": 0.00046082835456851196, + "loss": 0.2464, + "step": 130890 + }, + { + "epoch": 5.42, + "grad_norm": 1.15625, + "learning_rate": 0.00046082252592311593, + "loss": 0.2315, + "step": 130900 + }, + { + "epoch": 5.42, + "grad_norm": 0.431640625, + "learning_rate": 0.00046081669688097454, + "loss": 0.1442, + "step": 130910 + }, + { + "epoch": 5.42, + "grad_norm": 1.03125, + "learning_rate": 0.000460810867442099, + "loss": 0.2533, + "step": 130920 + }, + { + "epoch": 5.42, + "grad_norm": 0.5234375, + "learning_rate": 0.00046080503760650017, + "loss": 0.1897, + "step": 130930 + }, + { + "epoch": 5.42, + "grad_norm": 0.61328125, + "learning_rate": 0.000460799207374189, + "loss": 0.1817, + "step": 130940 + }, + { + "epoch": 5.42, + "grad_norm": 0.439453125, + "learning_rate": 0.0004607933767451765, + "loss": 0.2083, + "step": 130950 + }, + { + "epoch": 5.42, + "grad_norm": 0.66015625, + "learning_rate": 0.00046078754571947356, + "loss": 0.15, + "step": 130960 + }, + { + "epoch": 5.42, + "grad_norm": 0.94140625, + "learning_rate": 0.0004607817142970913, + "loss": 0.2252, + "step": 130970 + }, + { + "epoch": 5.43, + "grad_norm": 1.0234375, + "learning_rate": 0.0004607758824780406, + "loss": 0.2331, + "step": 130980 + }, + { + "epoch": 5.43, + "grad_norm": 1.8515625, + "learning_rate": 0.00046077005026233243, + "loss": 0.224, + "step": 130990 + }, + { + "epoch": 5.43, + "grad_norm": 0.8515625, + "learning_rate": 0.00046076421764997786, + "loss": 0.2362, + "step": 131000 + }, + { + "epoch": 5.43, + "grad_norm": 1.0703125, + "learning_rate": 0.00046075838464098776, + "loss": 0.1729, + "step": 131010 + }, + { + "epoch": 5.43, + "grad_norm": 0.94921875, + "learning_rate": 0.0004607525512353731, + "loss": 0.2273, + "step": 131020 + }, + { + "epoch": 5.43, + "grad_norm": 1.2734375, + "learning_rate": 0.0004607467174331449, + "loss": 0.2713, + "step": 131030 + }, + { + "epoch": 5.43, + "grad_norm": 0.51953125, + "learning_rate": 0.0004607408832343142, + "loss": 0.1979, + "step": 131040 + }, + { + "epoch": 5.43, + "grad_norm": 0.875, + "learning_rate": 0.00046073504863889184, + "loss": 0.1902, + "step": 131050 + }, + { + "epoch": 5.43, + "grad_norm": 0.3828125, + "learning_rate": 0.0004607292136468889, + "loss": 0.2645, + "step": 131060 + }, + { + "epoch": 5.43, + "grad_norm": 0.361328125, + "learning_rate": 0.00046072337825831634, + "loss": 0.1861, + "step": 131070 + }, + { + "epoch": 5.43, + "grad_norm": 0.44921875, + "learning_rate": 0.0004607175424731851, + "loss": 0.2101, + "step": 131080 + }, + { + "epoch": 5.43, + "grad_norm": 0.6796875, + "learning_rate": 0.0004607117062915063, + "loss": 0.1942, + "step": 131090 + }, + { + "epoch": 5.43, + "grad_norm": 1.765625, + "learning_rate": 0.00046070586971329075, + "loss": 0.1709, + "step": 131100 + }, + { + "epoch": 5.43, + "grad_norm": 0.302734375, + "learning_rate": 0.0004607000327385495, + "loss": 0.186, + "step": 131110 + }, + { + "epoch": 5.43, + "grad_norm": 0.5625, + "learning_rate": 0.00046069419536729357, + "loss": 0.1885, + "step": 131120 + }, + { + "epoch": 5.43, + "grad_norm": 0.87109375, + "learning_rate": 0.00046068835759953386, + "loss": 0.2074, + "step": 131130 + }, + { + "epoch": 5.43, + "grad_norm": 0.72265625, + "learning_rate": 0.0004606825194352815, + "loss": 0.2434, + "step": 131140 + }, + { + "epoch": 5.43, + "grad_norm": 0.494140625, + "learning_rate": 0.00046067668087454737, + "loss": 0.2162, + "step": 131150 + }, + { + "epoch": 5.43, + "grad_norm": 0.302734375, + "learning_rate": 0.0004606708419173424, + "loss": 0.2398, + "step": 131160 + }, + { + "epoch": 5.43, + "grad_norm": 0.5859375, + "learning_rate": 0.00046066500256367775, + "loss": 0.2043, + "step": 131170 + }, + { + "epoch": 5.43, + "grad_norm": 0.333984375, + "learning_rate": 0.0004606591628135643, + "loss": 0.1801, + "step": 131180 + }, + { + "epoch": 5.43, + "grad_norm": 1.1328125, + "learning_rate": 0.00046065332266701306, + "loss": 0.1945, + "step": 131190 + }, + { + "epoch": 5.43, + "grad_norm": 0.306640625, + "learning_rate": 0.00046064748212403493, + "loss": 0.2014, + "step": 131200 + }, + { + "epoch": 5.43, + "grad_norm": 0.77734375, + "learning_rate": 0.00046064164118464116, + "loss": 0.1827, + "step": 131210 + }, + { + "epoch": 5.44, + "grad_norm": 0.51953125, + "learning_rate": 0.0004606357998488424, + "loss": 0.237, + "step": 131220 + }, + { + "epoch": 5.44, + "grad_norm": 0.78515625, + "learning_rate": 0.00046062995811664987, + "loss": 0.2325, + "step": 131230 + }, + { + "epoch": 5.44, + "grad_norm": 0.20703125, + "learning_rate": 0.00046062411598807453, + "loss": 0.2808, + "step": 131240 + }, + { + "epoch": 5.44, + "grad_norm": 0.47265625, + "learning_rate": 0.0004606182734631273, + "loss": 0.139, + "step": 131250 + }, + { + "epoch": 5.44, + "grad_norm": 0.2060546875, + "learning_rate": 0.0004606124305418192, + "loss": 0.2182, + "step": 131260 + }, + { + "epoch": 5.44, + "grad_norm": 0.4296875, + "learning_rate": 0.0004606065872241614, + "loss": 0.1772, + "step": 131270 + }, + { + "epoch": 5.44, + "grad_norm": 0.314453125, + "learning_rate": 0.00046060074351016454, + "loss": 0.1683, + "step": 131280 + }, + { + "epoch": 5.44, + "grad_norm": 0.609375, + "learning_rate": 0.00046059489939984, + "loss": 0.1674, + "step": 131290 + }, + { + "epoch": 5.44, + "grad_norm": 0.56640625, + "learning_rate": 0.00046058905489319846, + "loss": 0.2008, + "step": 131300 + }, + { + "epoch": 5.44, + "grad_norm": 0.6875, + "learning_rate": 0.0004605832099902512, + "loss": 0.2421, + "step": 131310 + }, + { + "epoch": 5.44, + "grad_norm": 0.8671875, + "learning_rate": 0.000460577364691009, + "loss": 0.1885, + "step": 131320 + }, + { + "epoch": 5.44, + "grad_norm": 0.314453125, + "learning_rate": 0.00046057151899548293, + "loss": 0.2213, + "step": 131330 + }, + { + "epoch": 5.44, + "grad_norm": 0.6484375, + "learning_rate": 0.000460565672903684, + "loss": 0.2218, + "step": 131340 + }, + { + "epoch": 5.44, + "grad_norm": 0.48046875, + "learning_rate": 0.0004605598264156232, + "loss": 0.1896, + "step": 131350 + }, + { + "epoch": 5.44, + "grad_norm": 0.9453125, + "learning_rate": 0.00046055397953131163, + "loss": 0.1925, + "step": 131360 + }, + { + "epoch": 5.44, + "grad_norm": 0.396484375, + "learning_rate": 0.0004605481322507602, + "loss": 0.2557, + "step": 131370 + }, + { + "epoch": 5.44, + "grad_norm": 0.75, + "learning_rate": 0.0004605422845739798, + "loss": 0.213, + "step": 131380 + }, + { + "epoch": 5.44, + "grad_norm": 0.431640625, + "learning_rate": 0.0004605364365009816, + "loss": 0.2816, + "step": 131390 + }, + { + "epoch": 5.44, + "grad_norm": 1.46875, + "learning_rate": 0.00046053058803177666, + "loss": 0.2139, + "step": 131400 + }, + { + "epoch": 5.44, + "grad_norm": 0.306640625, + "learning_rate": 0.0004605247391663758, + "loss": 0.183, + "step": 131410 + }, + { + "epoch": 5.44, + "grad_norm": 0.76953125, + "learning_rate": 0.0004605188899047902, + "loss": 0.1581, + "step": 131420 + }, + { + "epoch": 5.44, + "grad_norm": 1.140625, + "learning_rate": 0.00046051304024703066, + "loss": 0.2269, + "step": 131430 + }, + { + "epoch": 5.44, + "grad_norm": 1.265625, + "learning_rate": 0.0004605071901931084, + "loss": 0.1727, + "step": 131440 + }, + { + "epoch": 5.44, + "grad_norm": 0.7890625, + "learning_rate": 0.00046050133974303437, + "loss": 0.2073, + "step": 131450 + }, + { + "epoch": 5.45, + "grad_norm": 0.70703125, + "learning_rate": 0.00046049548889681946, + "loss": 0.1775, + "step": 131460 + }, + { + "epoch": 5.45, + "grad_norm": 0.640625, + "learning_rate": 0.0004604896376544748, + "loss": 0.2095, + "step": 131470 + }, + { + "epoch": 5.45, + "grad_norm": 0.44140625, + "learning_rate": 0.0004604837860160114, + "loss": 0.1849, + "step": 131480 + }, + { + "epoch": 5.45, + "grad_norm": 0.65234375, + "learning_rate": 0.00046047793398144025, + "loss": 0.2238, + "step": 131490 + }, + { + "epoch": 5.45, + "grad_norm": 1.640625, + "learning_rate": 0.00046047208155077234, + "loss": 0.2087, + "step": 131500 + }, + { + "epoch": 5.45, + "grad_norm": 0.42578125, + "learning_rate": 0.00046046622872401865, + "loss": 0.1822, + "step": 131510 + }, + { + "epoch": 5.45, + "grad_norm": 0.62109375, + "learning_rate": 0.00046046037550119036, + "loss": 0.2281, + "step": 131520 + }, + { + "epoch": 5.45, + "grad_norm": 0.73828125, + "learning_rate": 0.0004604545218822983, + "loss": 0.2309, + "step": 131530 + }, + { + "epoch": 5.45, + "grad_norm": 0.77734375, + "learning_rate": 0.0004604486678673535, + "loss": 0.242, + "step": 131540 + }, + { + "epoch": 5.45, + "grad_norm": 0.7421875, + "learning_rate": 0.00046044281345636713, + "loss": 0.1873, + "step": 131550 + }, + { + "epoch": 5.45, + "grad_norm": 1.4140625, + "learning_rate": 0.0004604369586493501, + "loss": 0.2426, + "step": 131560 + }, + { + "epoch": 5.45, + "grad_norm": 0.40625, + "learning_rate": 0.00046043110344631344, + "loss": 0.2001, + "step": 131570 + }, + { + "epoch": 5.45, + "grad_norm": 1.078125, + "learning_rate": 0.00046042524784726814, + "loss": 0.2109, + "step": 131580 + }, + { + "epoch": 5.45, + "grad_norm": 0.765625, + "learning_rate": 0.00046041939185222525, + "loss": 0.2253, + "step": 131590 + }, + { + "epoch": 5.45, + "grad_norm": 0.447265625, + "learning_rate": 0.0004604135354611958, + "loss": 0.1714, + "step": 131600 + }, + { + "epoch": 5.45, + "grad_norm": 1.0, + "learning_rate": 0.00046040767867419076, + "loss": 0.2324, + "step": 131610 + }, + { + "epoch": 5.45, + "grad_norm": 0.8046875, + "learning_rate": 0.0004604018214912212, + "loss": 0.1433, + "step": 131620 + }, + { + "epoch": 5.45, + "grad_norm": 1.03125, + "learning_rate": 0.0004603959639122982, + "loss": 0.2178, + "step": 131630 + }, + { + "epoch": 5.45, + "grad_norm": 0.486328125, + "learning_rate": 0.00046039010593743263, + "loss": 0.2102, + "step": 131640 + }, + { + "epoch": 5.45, + "grad_norm": 0.5703125, + "learning_rate": 0.00046038424756663564, + "loss": 0.1672, + "step": 131650 + }, + { + "epoch": 5.45, + "grad_norm": 1.171875, + "learning_rate": 0.0004603783887999182, + "loss": 0.2439, + "step": 131660 + }, + { + "epoch": 5.45, + "grad_norm": 1.109375, + "learning_rate": 0.0004603725296372914, + "loss": 0.2713, + "step": 131670 + }, + { + "epoch": 5.45, + "grad_norm": 0.65234375, + "learning_rate": 0.00046036667007876616, + "loss": 0.1853, + "step": 131680 + }, + { + "epoch": 5.45, + "grad_norm": 0.353515625, + "learning_rate": 0.0004603608101243536, + "loss": 0.1844, + "step": 131690 + }, + { + "epoch": 5.45, + "grad_norm": 0.7421875, + "learning_rate": 0.0004603549497740647, + "loss": 0.2294, + "step": 131700 + }, + { + "epoch": 5.46, + "grad_norm": 0.5078125, + "learning_rate": 0.0004603490890279105, + "loss": 0.2015, + "step": 131710 + }, + { + "epoch": 5.46, + "grad_norm": 1.453125, + "learning_rate": 0.000460343227885902, + "loss": 0.2048, + "step": 131720 + }, + { + "epoch": 5.46, + "grad_norm": 1.0390625, + "learning_rate": 0.0004603373663480503, + "loss": 0.2185, + "step": 131730 + }, + { + "epoch": 5.46, + "grad_norm": 0.84375, + "learning_rate": 0.0004603315044143664, + "loss": 0.2022, + "step": 131740 + }, + { + "epoch": 5.46, + "grad_norm": 0.44921875, + "learning_rate": 0.0004603256420848613, + "loss": 0.2253, + "step": 131750 + }, + { + "epoch": 5.46, + "grad_norm": 0.373046875, + "learning_rate": 0.0004603197793595461, + "loss": 0.1738, + "step": 131760 + }, + { + "epoch": 5.46, + "grad_norm": 1.1484375, + "learning_rate": 0.0004603139162384317, + "loss": 0.2195, + "step": 131770 + }, + { + "epoch": 5.46, + "grad_norm": 0.5859375, + "learning_rate": 0.00046030805272152933, + "loss": 0.2364, + "step": 131780 + }, + { + "epoch": 5.46, + "grad_norm": 0.53125, + "learning_rate": 0.00046030218880884985, + "loss": 0.1487, + "step": 131790 + }, + { + "epoch": 5.46, + "grad_norm": 0.65625, + "learning_rate": 0.0004602963245004043, + "loss": 0.2068, + "step": 131800 + }, + { + "epoch": 5.46, + "grad_norm": 1.5078125, + "learning_rate": 0.00046029045979620385, + "loss": 0.1816, + "step": 131810 + }, + { + "epoch": 5.46, + "grad_norm": 0.74609375, + "learning_rate": 0.0004602845946962595, + "loss": 0.2121, + "step": 131820 + }, + { + "epoch": 5.46, + "grad_norm": 0.7421875, + "learning_rate": 0.00046027872920058224, + "loss": 0.165, + "step": 131830 + }, + { + "epoch": 5.46, + "grad_norm": 1.46875, + "learning_rate": 0.00046027286330918305, + "loss": 0.2123, + "step": 131840 + }, + { + "epoch": 5.46, + "grad_norm": 0.625, + "learning_rate": 0.0004602669970220731, + "loss": 0.1784, + "step": 131850 + }, + { + "epoch": 5.46, + "grad_norm": 0.7421875, + "learning_rate": 0.0004602611303392633, + "loss": 0.2161, + "step": 131860 + }, + { + "epoch": 5.46, + "grad_norm": 0.69921875, + "learning_rate": 0.0004602552632607648, + "loss": 0.1975, + "step": 131870 + }, + { + "epoch": 5.46, + "grad_norm": 0.796875, + "learning_rate": 0.00046024939578658865, + "loss": 0.1918, + "step": 131880 + }, + { + "epoch": 5.46, + "grad_norm": 1.5390625, + "learning_rate": 0.0004602435279167458, + "loss": 0.2346, + "step": 131890 + }, + { + "epoch": 5.46, + "grad_norm": 0.6875, + "learning_rate": 0.0004602376596512473, + "loss": 0.2131, + "step": 131900 + }, + { + "epoch": 5.46, + "grad_norm": 1.2265625, + "learning_rate": 0.00046023179099010427, + "loss": 0.251, + "step": 131910 + }, + { + "epoch": 5.46, + "grad_norm": 0.5078125, + "learning_rate": 0.0004602259219333277, + "loss": 0.2397, + "step": 131920 + }, + { + "epoch": 5.46, + "grad_norm": 0.77734375, + "learning_rate": 0.0004602200524809287, + "loss": 0.1728, + "step": 131930 + }, + { + "epoch": 5.46, + "grad_norm": 1.1015625, + "learning_rate": 0.00046021418263291814, + "loss": 0.196, + "step": 131940 + }, + { + "epoch": 5.47, + "grad_norm": 0.69140625, + "learning_rate": 0.0004602083123893073, + "loss": 0.1929, + "step": 131950 + }, + { + "epoch": 5.47, + "grad_norm": 0.640625, + "learning_rate": 0.0004602024417501071, + "loss": 0.182, + "step": 131960 + }, + { + "epoch": 5.47, + "grad_norm": 0.55078125, + "learning_rate": 0.00046019657071532863, + "loss": 0.2369, + "step": 131970 + }, + { + "epoch": 5.47, + "grad_norm": 0.9765625, + "learning_rate": 0.0004601906992849828, + "loss": 0.153, + "step": 131980 + }, + { + "epoch": 5.47, + "grad_norm": 0.640625, + "learning_rate": 0.00046018482745908084, + "loss": 0.1743, + "step": 131990 + }, + { + "epoch": 5.47, + "grad_norm": 0.578125, + "learning_rate": 0.0004601789552376338, + "loss": 0.1832, + "step": 132000 + }, + { + "epoch": 5.47, + "grad_norm": 1.2578125, + "learning_rate": 0.00046017308262065253, + "loss": 0.2485, + "step": 132010 + }, + { + "epoch": 5.47, + "grad_norm": 1.6328125, + "learning_rate": 0.00046016720960814826, + "loss": 0.2025, + "step": 132020 + }, + { + "epoch": 5.47, + "grad_norm": 0.65625, + "learning_rate": 0.000460161336200132, + "loss": 0.2349, + "step": 132030 + }, + { + "epoch": 5.47, + "grad_norm": 0.8203125, + "learning_rate": 0.0004601554623966149, + "loss": 0.2127, + "step": 132040 + }, + { + "epoch": 5.47, + "grad_norm": 0.70703125, + "learning_rate": 0.00046014958819760784, + "loss": 0.171, + "step": 132050 + }, + { + "epoch": 5.47, + "grad_norm": 0.6640625, + "learning_rate": 0.0004601437136031219, + "loss": 0.2154, + "step": 132060 + }, + { + "epoch": 5.47, + "grad_norm": 0.9921875, + "learning_rate": 0.0004601378386131683, + "loss": 0.2025, + "step": 132070 + }, + { + "epoch": 5.47, + "grad_norm": 0.68359375, + "learning_rate": 0.0004601319632277579, + "loss": 0.1834, + "step": 132080 + }, + { + "epoch": 5.47, + "grad_norm": 0.486328125, + "learning_rate": 0.0004601260874469018, + "loss": 0.1821, + "step": 132090 + }, + { + "epoch": 5.47, + "grad_norm": 0.80078125, + "learning_rate": 0.00046012021127061115, + "loss": 0.2222, + "step": 132100 + }, + { + "epoch": 5.47, + "grad_norm": 0.99609375, + "learning_rate": 0.000460114334698897, + "loss": 0.2201, + "step": 132110 + }, + { + "epoch": 5.47, + "grad_norm": 0.9140625, + "learning_rate": 0.0004601084577317702, + "loss": 0.2196, + "step": 132120 + }, + { + "epoch": 5.47, + "grad_norm": 0.1484375, + "learning_rate": 0.00046010258036924213, + "loss": 0.1957, + "step": 132130 + }, + { + "epoch": 5.47, + "grad_norm": 0.8515625, + "learning_rate": 0.0004600967026113236, + "loss": 0.1858, + "step": 132140 + }, + { + "epoch": 5.47, + "grad_norm": 0.640625, + "learning_rate": 0.00046009082445802585, + "loss": 0.1653, + "step": 132150 + }, + { + "epoch": 5.47, + "grad_norm": 1.1953125, + "learning_rate": 0.00046008494590935977, + "loss": 0.2234, + "step": 132160 + }, + { + "epoch": 5.47, + "grad_norm": 0.875, + "learning_rate": 0.00046007906696533666, + "loss": 0.1621, + "step": 132170 + }, + { + "epoch": 5.47, + "grad_norm": 0.5234375, + "learning_rate": 0.0004600731876259673, + "loss": 0.1971, + "step": 132180 + }, + { + "epoch": 5.48, + "grad_norm": 0.70703125, + "learning_rate": 0.0004600673078912628, + "loss": 0.19, + "step": 132190 + }, + { + "epoch": 5.48, + "grad_norm": 0.75, + "learning_rate": 0.00046006142776123447, + "loss": 0.2117, + "step": 132200 + }, + { + "epoch": 5.48, + "grad_norm": 0.69921875, + "learning_rate": 0.0004600555472358932, + "loss": 0.2296, + "step": 132210 + }, + { + "epoch": 5.48, + "grad_norm": 1.0390625, + "learning_rate": 0.00046004966631525, + "loss": 0.208, + "step": 132220 + }, + { + "epoch": 5.48, + "grad_norm": 0.7265625, + "learning_rate": 0.0004600437849993161, + "loss": 0.1961, + "step": 132230 + }, + { + "epoch": 5.48, + "grad_norm": 0.2392578125, + "learning_rate": 0.00046003790328810247, + "loss": 0.2247, + "step": 132240 + }, + { + "epoch": 5.48, + "grad_norm": 0.85546875, + "learning_rate": 0.0004600320211816201, + "loss": 0.2546, + "step": 132250 + }, + { + "epoch": 5.48, + "grad_norm": 1.109375, + "learning_rate": 0.00046002613867988023, + "loss": 0.235, + "step": 132260 + }, + { + "epoch": 5.48, + "grad_norm": 0.91015625, + "learning_rate": 0.0004600202557828938, + "loss": 0.2309, + "step": 132270 + }, + { + "epoch": 5.48, + "grad_norm": 1.2890625, + "learning_rate": 0.00046001437249067195, + "loss": 0.2178, + "step": 132280 + }, + { + "epoch": 5.48, + "grad_norm": 0.470703125, + "learning_rate": 0.0004600084888032258, + "loss": 0.2021, + "step": 132290 + }, + { + "epoch": 5.48, + "grad_norm": 0.357421875, + "learning_rate": 0.0004600026047205662, + "loss": 0.2276, + "step": 132300 + }, + { + "epoch": 5.48, + "grad_norm": 0.76953125, + "learning_rate": 0.00045999672024270446, + "loss": 0.216, + "step": 132310 + }, + { + "epoch": 5.48, + "grad_norm": 2.53125, + "learning_rate": 0.00045999083536965156, + "loss": 0.2268, + "step": 132320 + }, + { + "epoch": 5.48, + "grad_norm": 1.1796875, + "learning_rate": 0.00045998495010141863, + "loss": 0.2636, + "step": 132330 + }, + { + "epoch": 5.48, + "grad_norm": 0.53125, + "learning_rate": 0.00045997906443801667, + "loss": 0.1565, + "step": 132340 + }, + { + "epoch": 5.48, + "grad_norm": 0.671875, + "learning_rate": 0.00045997317837945674, + "loss": 0.2088, + "step": 132350 + }, + { + "epoch": 5.48, + "grad_norm": 0.251953125, + "learning_rate": 0.00045996729192575005, + "loss": 0.2215, + "step": 132360 + }, + { + "epoch": 5.48, + "grad_norm": 0.91796875, + "learning_rate": 0.00045996140507690756, + "loss": 0.1893, + "step": 132370 + }, + { + "epoch": 5.48, + "grad_norm": 2.140625, + "learning_rate": 0.00045995551783294033, + "loss": 0.3131, + "step": 132380 + }, + { + "epoch": 5.48, + "grad_norm": 0.0, + "learning_rate": 0.0004599496301938595, + "loss": 0.209, + "step": 132390 + }, + { + "epoch": 5.48, + "grad_norm": 1.125, + "learning_rate": 0.0004599437421596762, + "loss": 0.2206, + "step": 132400 + }, + { + "epoch": 5.48, + "grad_norm": 0.3515625, + "learning_rate": 0.0004599378537304014, + "loss": 0.231, + "step": 132410 + }, + { + "epoch": 5.48, + "grad_norm": 0.89453125, + "learning_rate": 0.0004599319649060463, + "loss": 0.2032, + "step": 132420 + }, + { + "epoch": 5.49, + "grad_norm": 1.5625, + "learning_rate": 0.00045992607568662183, + "loss": 0.2305, + "step": 132430 + }, + { + "epoch": 5.49, + "grad_norm": 0.60546875, + "learning_rate": 0.00045992018607213914, + "loss": 0.1635, + "step": 132440 + }, + { + "epoch": 5.49, + "grad_norm": 0.80078125, + "learning_rate": 0.0004599142960626094, + "loss": 0.1655, + "step": 132450 + }, + { + "epoch": 5.49, + "grad_norm": 0.162109375, + "learning_rate": 0.0004599084056580436, + "loss": 0.1878, + "step": 132460 + }, + { + "epoch": 5.49, + "grad_norm": 0.294921875, + "learning_rate": 0.0004599025148584528, + "loss": 0.1932, + "step": 132470 + }, + { + "epoch": 5.49, + "grad_norm": 0.93359375, + "learning_rate": 0.0004598966236638482, + "loss": 0.2358, + "step": 132480 + }, + { + "epoch": 5.49, + "grad_norm": 0.640625, + "learning_rate": 0.0004598907320742408, + "loss": 0.181, + "step": 132490 + }, + { + "epoch": 5.49, + "grad_norm": 0.5390625, + "learning_rate": 0.0004598848400896417, + "loss": 0.2235, + "step": 132500 + }, + { + "epoch": 5.49, + "grad_norm": 0.6796875, + "learning_rate": 0.000459878947710062, + "loss": 0.1702, + "step": 132510 + }, + { + "epoch": 5.49, + "grad_norm": 1.0234375, + "learning_rate": 0.0004598730549355128, + "loss": 0.2628, + "step": 132520 + }, + { + "epoch": 5.49, + "grad_norm": 0.48046875, + "learning_rate": 0.0004598671617660052, + "loss": 0.2186, + "step": 132530 + }, + { + "epoch": 5.49, + "grad_norm": 0.78125, + "learning_rate": 0.00045986126820155016, + "loss": 0.1823, + "step": 132540 + }, + { + "epoch": 5.49, + "grad_norm": 0.8046875, + "learning_rate": 0.00045985537424215897, + "loss": 0.2274, + "step": 132550 + }, + { + "epoch": 5.49, + "grad_norm": 0.5625, + "learning_rate": 0.00045984947988784265, + "loss": 0.233, + "step": 132560 + }, + { + "epoch": 5.49, + "grad_norm": 1.0546875, + "learning_rate": 0.0004598435851386122, + "loss": 0.2115, + "step": 132570 + }, + { + "epoch": 5.49, + "grad_norm": 0.4453125, + "learning_rate": 0.00045983768999447876, + "loss": 0.1918, + "step": 132580 + }, + { + "epoch": 5.49, + "grad_norm": 0.7578125, + "learning_rate": 0.0004598317944554535, + "loss": 0.2352, + "step": 132590 + }, + { + "epoch": 5.49, + "grad_norm": 0.609375, + "learning_rate": 0.00045982589852154744, + "loss": 0.2382, + "step": 132600 + }, + { + "epoch": 5.49, + "grad_norm": 0.412109375, + "learning_rate": 0.00045982000219277173, + "loss": 0.2302, + "step": 132610 + }, + { + "epoch": 5.49, + "grad_norm": 1.1015625, + "learning_rate": 0.00045981410546913745, + "loss": 0.2354, + "step": 132620 + }, + { + "epoch": 5.49, + "grad_norm": 1.109375, + "learning_rate": 0.0004598082083506556, + "loss": 0.1949, + "step": 132630 + }, + { + "epoch": 5.49, + "grad_norm": 0.7890625, + "learning_rate": 0.00045980231083733746, + "loss": 0.192, + "step": 132640 + }, + { + "epoch": 5.49, + "grad_norm": 0.48046875, + "learning_rate": 0.00045979641292919393, + "loss": 0.2222, + "step": 132650 + }, + { + "epoch": 5.49, + "grad_norm": 2.375, + "learning_rate": 0.00045979051462623633, + "loss": 0.2425, + "step": 132660 + }, + { + "epoch": 5.5, + "grad_norm": 1.078125, + "learning_rate": 0.0004597846159284755, + "loss": 0.1966, + "step": 132670 + }, + { + "epoch": 5.5, + "grad_norm": 0.55859375, + "learning_rate": 0.00045977871683592275, + "loss": 0.2111, + "step": 132680 + }, + { + "epoch": 5.5, + "grad_norm": 0.65625, + "learning_rate": 0.0004597728173485891, + "loss": 0.1532, + "step": 132690 + }, + { + "epoch": 5.5, + "grad_norm": 0.546875, + "learning_rate": 0.00045976691746648567, + "loss": 0.2525, + "step": 132700 + }, + { + "epoch": 5.5, + "grad_norm": 0.5078125, + "learning_rate": 0.0004597610171896236, + "loss": 0.1789, + "step": 132710 + }, + { + "epoch": 5.5, + "grad_norm": 0.431640625, + "learning_rate": 0.00045975511651801383, + "loss": 0.2084, + "step": 132720 + }, + { + "epoch": 5.5, + "grad_norm": 0.59375, + "learning_rate": 0.0004597492154516677, + "loss": 0.2186, + "step": 132730 + }, + { + "epoch": 5.5, + "grad_norm": 0.51171875, + "learning_rate": 0.0004597433139905961, + "loss": 0.17, + "step": 132740 + }, + { + "epoch": 5.5, + "grad_norm": 0.66796875, + "learning_rate": 0.0004597374121348103, + "loss": 0.2018, + "step": 132750 + }, + { + "epoch": 5.5, + "grad_norm": 0.3984375, + "learning_rate": 0.00045973150988432135, + "loss": 0.1629, + "step": 132760 + }, + { + "epoch": 5.5, + "grad_norm": 0.65625, + "learning_rate": 0.00045972560723914023, + "loss": 0.2305, + "step": 132770 + }, + { + "epoch": 5.5, + "grad_norm": 0.1875, + "learning_rate": 0.0004597197041992783, + "loss": 0.2098, + "step": 132780 + }, + { + "epoch": 5.5, + "grad_norm": 0.76171875, + "learning_rate": 0.00045971380076474644, + "loss": 0.1824, + "step": 132790 + }, + { + "epoch": 5.5, + "grad_norm": 0.609375, + "learning_rate": 0.00045970789693555595, + "loss": 0.1791, + "step": 132800 + }, + { + "epoch": 5.5, + "grad_norm": 0.97265625, + "learning_rate": 0.00045970199271171776, + "loss": 0.1881, + "step": 132810 + }, + { + "epoch": 5.5, + "grad_norm": 0.341796875, + "learning_rate": 0.0004596960880932431, + "loss": 0.2121, + "step": 132820 + }, + { + "epoch": 5.5, + "grad_norm": 0.8359375, + "learning_rate": 0.000459690183080143, + "loss": 0.1592, + "step": 132830 + }, + { + "epoch": 5.5, + "grad_norm": 0.6171875, + "learning_rate": 0.0004596842776724287, + "loss": 0.2189, + "step": 132840 + }, + { + "epoch": 5.5, + "grad_norm": 0.55078125, + "learning_rate": 0.0004596783718701112, + "loss": 0.1813, + "step": 132850 + }, + { + "epoch": 5.5, + "grad_norm": 1.5625, + "learning_rate": 0.00045967246567320166, + "loss": 0.2638, + "step": 132860 + }, + { + "epoch": 5.5, + "grad_norm": 0.8203125, + "learning_rate": 0.00045966655908171117, + "loss": 0.2251, + "step": 132870 + }, + { + "epoch": 5.5, + "grad_norm": 0.55078125, + "learning_rate": 0.0004596606520956509, + "loss": 0.1976, + "step": 132880 + }, + { + "epoch": 5.5, + "grad_norm": 0.2890625, + "learning_rate": 0.0004596547447150318, + "loss": 0.1722, + "step": 132890 + }, + { + "epoch": 5.5, + "grad_norm": 0.21484375, + "learning_rate": 0.00045964883693986523, + "loss": 0.2164, + "step": 132900 + }, + { + "epoch": 5.51, + "grad_norm": 0.72265625, + "learning_rate": 0.00045964292877016214, + "loss": 0.1811, + "step": 132910 + }, + { + "epoch": 5.51, + "grad_norm": 0.33984375, + "learning_rate": 0.00045963702020593365, + "loss": 0.2649, + "step": 132920 + }, + { + "epoch": 5.51, + "grad_norm": 0.66015625, + "learning_rate": 0.00045963111124719104, + "loss": 0.2055, + "step": 132930 + }, + { + "epoch": 5.51, + "grad_norm": 0.60546875, + "learning_rate": 0.0004596252018939453, + "loss": 0.1784, + "step": 132940 + }, + { + "epoch": 5.51, + "grad_norm": 2.09375, + "learning_rate": 0.00045961929214620743, + "loss": 0.1923, + "step": 132950 + }, + { + "epoch": 5.51, + "grad_norm": 0.53125, + "learning_rate": 0.0004596133820039888, + "loss": 0.2232, + "step": 132960 + }, + { + "epoch": 5.51, + "grad_norm": 0.8984375, + "learning_rate": 0.0004596074714673005, + "loss": 0.206, + "step": 132970 + }, + { + "epoch": 5.51, + "grad_norm": 0.60546875, + "learning_rate": 0.00045960156053615343, + "loss": 0.1993, + "step": 132980 + }, + { + "epoch": 5.51, + "grad_norm": 1.8359375, + "learning_rate": 0.00045959564921055894, + "loss": 0.1834, + "step": 132990 + }, + { + "epoch": 5.51, + "grad_norm": 0.57421875, + "learning_rate": 0.000459589737490528, + "loss": 0.235, + "step": 133000 + }, + { + "epoch": 5.51, + "grad_norm": 0.6171875, + "learning_rate": 0.0004595838253760719, + "loss": 0.2325, + "step": 133010 + }, + { + "epoch": 5.51, + "grad_norm": 0.4375, + "learning_rate": 0.0004595779128672016, + "loss": 0.179, + "step": 133020 + }, + { + "epoch": 5.51, + "grad_norm": 0.50390625, + "learning_rate": 0.0004595719999639283, + "loss": 0.2174, + "step": 133030 + }, + { + "epoch": 5.51, + "grad_norm": 1.2578125, + "learning_rate": 0.0004595660866662632, + "loss": 0.1762, + "step": 133040 + }, + { + "epoch": 5.51, + "grad_norm": 1.1015625, + "learning_rate": 0.00045956017297421733, + "loss": 0.229, + "step": 133050 + }, + { + "epoch": 5.51, + "grad_norm": 1.0234375, + "learning_rate": 0.0004595542588878018, + "loss": 0.2048, + "step": 133060 + }, + { + "epoch": 5.51, + "grad_norm": 0.7421875, + "learning_rate": 0.0004595483444070278, + "loss": 0.1893, + "step": 133070 + }, + { + "epoch": 5.51, + "grad_norm": 0.8046875, + "learning_rate": 0.00045954242953190653, + "loss": 0.1785, + "step": 133080 + }, + { + "epoch": 5.51, + "grad_norm": 0.453125, + "learning_rate": 0.0004595365142624489, + "loss": 0.2155, + "step": 133090 + }, + { + "epoch": 5.51, + "grad_norm": 2.1875, + "learning_rate": 0.0004595305985986662, + "loss": 0.2586, + "step": 133100 + }, + { + "epoch": 5.51, + "grad_norm": 0.369140625, + "learning_rate": 0.00045952468254056964, + "loss": 0.1774, + "step": 133110 + }, + { + "epoch": 5.51, + "grad_norm": 0.66796875, + "learning_rate": 0.0004595187660881702, + "loss": 0.1908, + "step": 133120 + }, + { + "epoch": 5.51, + "grad_norm": 0.94921875, + "learning_rate": 0.0004595128492414791, + "loss": 0.2015, + "step": 133130 + }, + { + "epoch": 5.51, + "grad_norm": 0.89453125, + "learning_rate": 0.0004595069320005073, + "loss": 0.2332, + "step": 133140 + }, + { + "epoch": 5.52, + "grad_norm": 1.4609375, + "learning_rate": 0.00045950101436526626, + "loss": 0.2079, + "step": 133150 + }, + { + "epoch": 5.52, + "grad_norm": 4.84375, + "learning_rate": 0.0004594950963357668, + "loss": 0.2171, + "step": 133160 + }, + { + "epoch": 5.52, + "grad_norm": 0.7421875, + "learning_rate": 0.0004594891779120203, + "loss": 0.2532, + "step": 133170 + }, + { + "epoch": 5.52, + "grad_norm": 0.6875, + "learning_rate": 0.00045948325909403773, + "loss": 0.1809, + "step": 133180 + }, + { + "epoch": 5.52, + "grad_norm": 0.640625, + "learning_rate": 0.0004594773398818303, + "loss": 0.2421, + "step": 133190 + }, + { + "epoch": 5.52, + "grad_norm": 0.4453125, + "learning_rate": 0.0004594714202754091, + "loss": 0.1987, + "step": 133200 + }, + { + "epoch": 5.52, + "grad_norm": 0.66015625, + "learning_rate": 0.0004594655002747854, + "loss": 0.2297, + "step": 133210 + }, + { + "epoch": 5.52, + "grad_norm": 0.3125, + "learning_rate": 0.00045945957987997017, + "loss": 0.1732, + "step": 133220 + }, + { + "epoch": 5.52, + "grad_norm": 0.41015625, + "learning_rate": 0.00045945365909097463, + "loss": 0.2269, + "step": 133230 + }, + { + "epoch": 5.52, + "grad_norm": 0.546875, + "learning_rate": 0.00045944773790781, + "loss": 0.2027, + "step": 133240 + }, + { + "epoch": 5.52, + "grad_norm": 0.59765625, + "learning_rate": 0.00045944181633048725, + "loss": 0.1732, + "step": 133250 + }, + { + "epoch": 5.52, + "grad_norm": 0.52734375, + "learning_rate": 0.0004594358943590177, + "loss": 0.1862, + "step": 133260 + }, + { + "epoch": 5.52, + "grad_norm": 0.41796875, + "learning_rate": 0.0004594299719934124, + "loss": 0.2255, + "step": 133270 + }, + { + "epoch": 5.52, + "grad_norm": 0.76171875, + "learning_rate": 0.0004594240492336824, + "loss": 0.2171, + "step": 133280 + }, + { + "epoch": 5.52, + "grad_norm": 0.4375, + "learning_rate": 0.00045941812607983907, + "loss": 0.2254, + "step": 133290 + }, + { + "epoch": 5.52, + "grad_norm": 0.376953125, + "learning_rate": 0.0004594122025318934, + "loss": 0.2003, + "step": 133300 + }, + { + "epoch": 5.52, + "grad_norm": 0.314453125, + "learning_rate": 0.0004594062785898566, + "loss": 0.2676, + "step": 133310 + }, + { + "epoch": 5.52, + "grad_norm": 0.66015625, + "learning_rate": 0.00045940035425373984, + "loss": 0.1666, + "step": 133320 + }, + { + "epoch": 5.52, + "grad_norm": 1.0234375, + "learning_rate": 0.00045939442952355416, + "loss": 0.1651, + "step": 133330 + }, + { + "epoch": 5.52, + "grad_norm": 0.6015625, + "learning_rate": 0.0004593885043993108, + "loss": 0.2153, + "step": 133340 + }, + { + "epoch": 5.52, + "grad_norm": 0.7890625, + "learning_rate": 0.00045938257888102085, + "loss": 0.2163, + "step": 133350 + }, + { + "epoch": 5.52, + "grad_norm": 0.70703125, + "learning_rate": 0.00045937665296869555, + "loss": 0.192, + "step": 133360 + }, + { + "epoch": 5.52, + "grad_norm": 0.380859375, + "learning_rate": 0.000459370726662346, + "loss": 0.1857, + "step": 133370 + }, + { + "epoch": 5.52, + "grad_norm": 0.82421875, + "learning_rate": 0.00045936479996198335, + "loss": 0.2135, + "step": 133380 + }, + { + "epoch": 5.52, + "grad_norm": 0.55078125, + "learning_rate": 0.00045935887286761867, + "loss": 0.2724, + "step": 133390 + }, + { + "epoch": 5.53, + "grad_norm": 1.1953125, + "learning_rate": 0.0004593529453792633, + "loss": 0.2069, + "step": 133400 + }, + { + "epoch": 5.53, + "grad_norm": 0.71484375, + "learning_rate": 0.00045934701749692825, + "loss": 0.2122, + "step": 133410 + }, + { + "epoch": 5.53, + "grad_norm": 1.1171875, + "learning_rate": 0.00045934108922062475, + "loss": 0.2058, + "step": 133420 + }, + { + "epoch": 5.53, + "grad_norm": 2.015625, + "learning_rate": 0.0004593351605503639, + "loss": 0.213, + "step": 133430 + }, + { + "epoch": 5.53, + "grad_norm": 0.59765625, + "learning_rate": 0.0004593292314861569, + "loss": 0.209, + "step": 133440 + }, + { + "epoch": 5.53, + "grad_norm": 0.6328125, + "learning_rate": 0.0004593233020280149, + "loss": 0.2442, + "step": 133450 + }, + { + "epoch": 5.53, + "grad_norm": 1.2109375, + "learning_rate": 0.000459317372175949, + "loss": 0.2298, + "step": 133460 + }, + { + "epoch": 5.53, + "grad_norm": 0.92578125, + "learning_rate": 0.00045931144192997044, + "loss": 0.2572, + "step": 133470 + }, + { + "epoch": 5.53, + "grad_norm": 0.69140625, + "learning_rate": 0.00045930551129009037, + "loss": 0.2309, + "step": 133480 + }, + { + "epoch": 5.53, + "grad_norm": 0.1953125, + "learning_rate": 0.0004592995802563199, + "loss": 0.1363, + "step": 133490 + }, + { + "epoch": 5.53, + "grad_norm": 0.7265625, + "learning_rate": 0.00045929364882867026, + "loss": 0.1966, + "step": 133500 + }, + { + "epoch": 5.53, + "grad_norm": 0.640625, + "learning_rate": 0.00045928771700715255, + "loss": 0.204, + "step": 133510 + }, + { + "epoch": 5.53, + "grad_norm": 0.41015625, + "learning_rate": 0.000459281784791778, + "loss": 0.2323, + "step": 133520 + }, + { + "epoch": 5.53, + "grad_norm": 1.2265625, + "learning_rate": 0.00045927585218255763, + "loss": 0.1735, + "step": 133530 + }, + { + "epoch": 5.53, + "grad_norm": 0.380859375, + "learning_rate": 0.0004592699191795028, + "loss": 0.1521, + "step": 133540 + }, + { + "epoch": 5.53, + "grad_norm": 0.578125, + "learning_rate": 0.00045926398578262454, + "loss": 0.2253, + "step": 133550 + }, + { + "epoch": 5.53, + "grad_norm": 0.458984375, + "learning_rate": 0.000459258051991934, + "loss": 0.2286, + "step": 133560 + }, + { + "epoch": 5.53, + "grad_norm": 0.59375, + "learning_rate": 0.0004592521178074425, + "loss": 0.2392, + "step": 133570 + }, + { + "epoch": 5.53, + "grad_norm": 1.1171875, + "learning_rate": 0.00045924618322916113, + "loss": 0.2249, + "step": 133580 + }, + { + "epoch": 5.53, + "grad_norm": 0.9921875, + "learning_rate": 0.000459240248257101, + "loss": 0.2276, + "step": 133590 + }, + { + "epoch": 5.53, + "grad_norm": 0.859375, + "learning_rate": 0.00045923431289127326, + "loss": 0.1856, + "step": 133600 + }, + { + "epoch": 5.53, + "grad_norm": 1.3671875, + "learning_rate": 0.0004592283771316892, + "loss": 0.238, + "step": 133610 + }, + { + "epoch": 5.53, + "grad_norm": 0.498046875, + "learning_rate": 0.00045922244097835996, + "loss": 0.1481, + "step": 133620 + }, + { + "epoch": 5.53, + "grad_norm": 0.546875, + "learning_rate": 0.00045921650443129657, + "loss": 0.2403, + "step": 133630 + }, + { + "epoch": 5.54, + "grad_norm": 0.50390625, + "learning_rate": 0.0004592105674905104, + "loss": 0.2246, + "step": 133640 + }, + { + "epoch": 5.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00045920463015601255, + "loss": 0.1876, + "step": 133650 + }, + { + "epoch": 5.54, + "grad_norm": 1.15625, + "learning_rate": 0.0004591986924278141, + "loss": 0.2526, + "step": 133660 + }, + { + "epoch": 5.54, + "grad_norm": 0.6171875, + "learning_rate": 0.00045919275430592635, + "loss": 0.2175, + "step": 133670 + }, + { + "epoch": 5.54, + "grad_norm": 1.3359375, + "learning_rate": 0.0004591868157903605, + "loss": 0.2181, + "step": 133680 + }, + { + "epoch": 5.54, + "grad_norm": 0.37890625, + "learning_rate": 0.00045918087688112756, + "loss": 0.1761, + "step": 133690 + }, + { + "epoch": 5.54, + "grad_norm": 0.55859375, + "learning_rate": 0.0004591749375782388, + "loss": 0.2214, + "step": 133700 + }, + { + "epoch": 5.54, + "grad_norm": 0.90234375, + "learning_rate": 0.0004591689978817054, + "loss": 0.2188, + "step": 133710 + }, + { + "epoch": 5.54, + "grad_norm": 0.9296875, + "learning_rate": 0.00045916305779153854, + "loss": 0.185, + "step": 133720 + }, + { + "epoch": 5.54, + "grad_norm": 0.88671875, + "learning_rate": 0.0004591571173077495, + "loss": 0.2296, + "step": 133730 + }, + { + "epoch": 5.54, + "grad_norm": 0.373046875, + "learning_rate": 0.00045915117643034915, + "loss": 0.2206, + "step": 133740 + }, + { + "epoch": 5.54, + "grad_norm": 0.578125, + "learning_rate": 0.00045914523515934897, + "loss": 0.1885, + "step": 133750 + }, + { + "epoch": 5.54, + "grad_norm": 0.439453125, + "learning_rate": 0.00045913929349476, + "loss": 0.2056, + "step": 133760 + }, + { + "epoch": 5.54, + "grad_norm": 1.703125, + "learning_rate": 0.00045913335143659355, + "loss": 0.2528, + "step": 133770 + }, + { + "epoch": 5.54, + "grad_norm": 0.5546875, + "learning_rate": 0.0004591274089848607, + "loss": 0.2106, + "step": 133780 + }, + { + "epoch": 5.54, + "grad_norm": 0.298828125, + "learning_rate": 0.00045912146613957263, + "loss": 0.2187, + "step": 133790 + }, + { + "epoch": 5.54, + "grad_norm": 0.8203125, + "learning_rate": 0.00045911552290074057, + "loss": 0.2013, + "step": 133800 + }, + { + "epoch": 5.54, + "grad_norm": 0.9921875, + "learning_rate": 0.0004591095792683756, + "loss": 0.2131, + "step": 133810 + }, + { + "epoch": 5.54, + "grad_norm": 0.59375, + "learning_rate": 0.000459103635242489, + "loss": 0.1538, + "step": 133820 + }, + { + "epoch": 5.54, + "grad_norm": 0.72265625, + "learning_rate": 0.00045909769082309204, + "loss": 0.227, + "step": 133830 + }, + { + "epoch": 5.54, + "grad_norm": 1.171875, + "learning_rate": 0.00045909174601019574, + "loss": 0.2353, + "step": 133840 + }, + { + "epoch": 5.54, + "grad_norm": 2.328125, + "learning_rate": 0.00045908580080381136, + "loss": 0.1808, + "step": 133850 + }, + { + "epoch": 5.54, + "grad_norm": 0.734375, + "learning_rate": 0.00045907985520395007, + "loss": 0.2287, + "step": 133860 + }, + { + "epoch": 5.54, + "grad_norm": 0.9140625, + "learning_rate": 0.0004590739092106231, + "loss": 0.238, + "step": 133870 + }, + { + "epoch": 5.55, + "grad_norm": 0.2119140625, + "learning_rate": 0.00045906796282384156, + "loss": 0.1944, + "step": 133880 + }, + { + "epoch": 5.55, + "grad_norm": 0.25, + "learning_rate": 0.00045906201604361673, + "loss": 0.1896, + "step": 133890 + }, + { + "epoch": 5.55, + "grad_norm": 3.109375, + "learning_rate": 0.00045905606886995976, + "loss": 0.2333, + "step": 133900 + }, + { + "epoch": 5.55, + "grad_norm": 0.9296875, + "learning_rate": 0.0004590501213028818, + "loss": 0.1931, + "step": 133910 + }, + { + "epoch": 5.55, + "grad_norm": 0.8984375, + "learning_rate": 0.00045904417334239413, + "loss": 0.1976, + "step": 133920 + }, + { + "epoch": 5.55, + "grad_norm": 0.56640625, + "learning_rate": 0.0004590382249885079, + "loss": 0.1994, + "step": 133930 + }, + { + "epoch": 5.55, + "grad_norm": 1.1015625, + "learning_rate": 0.0004590322762412343, + "loss": 0.1979, + "step": 133940 + }, + { + "epoch": 5.55, + "grad_norm": 0.8203125, + "learning_rate": 0.00045902632710058454, + "loss": 0.2016, + "step": 133950 + }, + { + "epoch": 5.55, + "grad_norm": 1.046875, + "learning_rate": 0.0004590203775665698, + "loss": 0.2153, + "step": 133960 + }, + { + "epoch": 5.55, + "grad_norm": 0.73828125, + "learning_rate": 0.0004590144276392013, + "loss": 0.1832, + "step": 133970 + }, + { + "epoch": 5.55, + "grad_norm": 0.66796875, + "learning_rate": 0.00045900847731849013, + "loss": 0.1886, + "step": 133980 + }, + { + "epoch": 5.55, + "grad_norm": 0.6328125, + "learning_rate": 0.0004590025266044477, + "loss": 0.2072, + "step": 133990 + }, + { + "epoch": 5.55, + "grad_norm": 0.546875, + "learning_rate": 0.000458996575497085, + "loss": 0.1666, + "step": 134000 + }, + { + "epoch": 5.55, + "grad_norm": 0.65234375, + "learning_rate": 0.00045899062399641335, + "loss": 0.2226, + "step": 134010 + }, + { + "epoch": 5.55, + "grad_norm": 0.5625, + "learning_rate": 0.00045898467210244386, + "loss": 0.1971, + "step": 134020 + }, + { + "epoch": 5.55, + "grad_norm": 0.765625, + "learning_rate": 0.0004589787198151879, + "loss": 0.1953, + "step": 134030 + }, + { + "epoch": 5.55, + "grad_norm": 1.015625, + "learning_rate": 0.00045897276713465645, + "loss": 0.2144, + "step": 134040 + }, + { + "epoch": 5.55, + "grad_norm": 0.53125, + "learning_rate": 0.00045896681406086087, + "loss": 0.2246, + "step": 134050 + }, + { + "epoch": 5.55, + "grad_norm": 0.55859375, + "learning_rate": 0.0004589608605938123, + "loss": 0.2214, + "step": 134060 + }, + { + "epoch": 5.55, + "grad_norm": 0.578125, + "learning_rate": 0.00045895490673352195, + "loss": 0.1675, + "step": 134070 + }, + { + "epoch": 5.55, + "grad_norm": 0.466796875, + "learning_rate": 0.00045894895248000105, + "loss": 0.1703, + "step": 134080 + }, + { + "epoch": 5.55, + "grad_norm": 1.6015625, + "learning_rate": 0.0004589429978332608, + "loss": 0.1722, + "step": 134090 + }, + { + "epoch": 5.55, + "grad_norm": 0.51953125, + "learning_rate": 0.0004589370427933124, + "loss": 0.1962, + "step": 134100 + }, + { + "epoch": 5.55, + "grad_norm": 0.6328125, + "learning_rate": 0.000458931087360167, + "loss": 0.2311, + "step": 134110 + }, + { + "epoch": 5.56, + "grad_norm": 0.34765625, + "learning_rate": 0.00045892513153383585, + "loss": 0.1697, + "step": 134120 + }, + { + "epoch": 5.56, + "grad_norm": 0.5546875, + "learning_rate": 0.0004589191753143301, + "loss": 0.2164, + "step": 134130 + }, + { + "epoch": 5.56, + "grad_norm": 0.796875, + "learning_rate": 0.00045891321870166114, + "loss": 0.1991, + "step": 134140 + }, + { + "epoch": 5.56, + "grad_norm": 2.0, + "learning_rate": 0.00045890726169584006, + "loss": 0.2125, + "step": 134150 + }, + { + "epoch": 5.56, + "grad_norm": 0.53515625, + "learning_rate": 0.00045890130429687806, + "loss": 0.1775, + "step": 134160 + }, + { + "epoch": 5.56, + "grad_norm": 0.5546875, + "learning_rate": 0.00045889534650478636, + "loss": 0.1791, + "step": 134170 + }, + { + "epoch": 5.56, + "grad_norm": 1.109375, + "learning_rate": 0.0004588893883195762, + "loss": 0.1985, + "step": 134180 + }, + { + "epoch": 5.56, + "grad_norm": 0.86328125, + "learning_rate": 0.0004588834297412587, + "loss": 0.1864, + "step": 134190 + }, + { + "epoch": 5.56, + "grad_norm": 1.0703125, + "learning_rate": 0.0004588774707698452, + "loss": 0.2324, + "step": 134200 + }, + { + "epoch": 5.56, + "grad_norm": 1.109375, + "learning_rate": 0.0004588715114053468, + "loss": 0.2408, + "step": 134210 + }, + { + "epoch": 5.56, + "grad_norm": 0.498046875, + "learning_rate": 0.00045886555164777475, + "loss": 0.1927, + "step": 134220 + }, + { + "epoch": 5.56, + "grad_norm": 1.03125, + "learning_rate": 0.00045885959149714043, + "loss": 0.256, + "step": 134230 + }, + { + "epoch": 5.56, + "grad_norm": 0.75390625, + "learning_rate": 0.0004588536309534548, + "loss": 0.1883, + "step": 134240 + }, + { + "epoch": 5.56, + "grad_norm": 1.6953125, + "learning_rate": 0.00045884767001672924, + "loss": 0.1979, + "step": 134250 + }, + { + "epoch": 5.56, + "grad_norm": 0.6328125, + "learning_rate": 0.00045884170868697486, + "loss": 0.1868, + "step": 134260 + }, + { + "epoch": 5.56, + "grad_norm": 0.734375, + "learning_rate": 0.000458835746964203, + "loss": 0.1903, + "step": 134270 + }, + { + "epoch": 5.56, + "grad_norm": 0.90234375, + "learning_rate": 0.0004588297848484248, + "loss": 0.2163, + "step": 134280 + }, + { + "epoch": 5.56, + "grad_norm": 0.9609375, + "learning_rate": 0.0004588238223396515, + "loss": 0.211, + "step": 134290 + }, + { + "epoch": 5.56, + "grad_norm": 0.70703125, + "learning_rate": 0.00045881785943789426, + "loss": 0.2209, + "step": 134300 + }, + { + "epoch": 5.56, + "grad_norm": 1.34375, + "learning_rate": 0.00045881189614316444, + "loss": 0.1761, + "step": 134310 + }, + { + "epoch": 5.56, + "grad_norm": 0.4765625, + "learning_rate": 0.00045880593245547314, + "loss": 0.1945, + "step": 134320 + }, + { + "epoch": 5.56, + "grad_norm": 0.30078125, + "learning_rate": 0.0004587999683748316, + "loss": 0.226, + "step": 134330 + }, + { + "epoch": 5.56, + "grad_norm": 0.4921875, + "learning_rate": 0.0004587940039012511, + "loss": 0.1762, + "step": 134340 + }, + { + "epoch": 5.56, + "grad_norm": 0.828125, + "learning_rate": 0.00045878803903474284, + "loss": 0.1762, + "step": 134350 + }, + { + "epoch": 5.57, + "grad_norm": 0.50390625, + "learning_rate": 0.0004587820737753181, + "loss": 0.209, + "step": 134360 + }, + { + "epoch": 5.57, + "grad_norm": 0.6015625, + "learning_rate": 0.00045877610812298787, + "loss": 0.2532, + "step": 134370 + }, + { + "epoch": 5.57, + "grad_norm": 0.6875, + "learning_rate": 0.00045877014207776367, + "loss": 0.2109, + "step": 134380 + }, + { + "epoch": 5.57, + "grad_norm": 0.7578125, + "learning_rate": 0.00045876417563965653, + "loss": 0.2278, + "step": 134390 + }, + { + "epoch": 5.57, + "grad_norm": 0.458984375, + "learning_rate": 0.0004587582088086779, + "loss": 0.2018, + "step": 134400 + }, + { + "epoch": 5.57, + "grad_norm": 0.71875, + "learning_rate": 0.00045875224158483876, + "loss": 0.2274, + "step": 134410 + }, + { + "epoch": 5.57, + "grad_norm": 2.046875, + "learning_rate": 0.0004587462739681505, + "loss": 0.2016, + "step": 134420 + }, + { + "epoch": 5.57, + "grad_norm": 0.76953125, + "learning_rate": 0.0004587403059586243, + "loss": 0.2078, + "step": 134430 + }, + { + "epoch": 5.57, + "grad_norm": 0.8125, + "learning_rate": 0.0004587343375562713, + "loss": 0.2027, + "step": 134440 + }, + { + "epoch": 5.57, + "grad_norm": 0.578125, + "learning_rate": 0.00045872836876110286, + "loss": 0.1798, + "step": 134450 + }, + { + "epoch": 5.57, + "grad_norm": 0.578125, + "learning_rate": 0.0004587223995731302, + "loss": 0.2238, + "step": 134460 + }, + { + "epoch": 5.57, + "grad_norm": 1.2734375, + "learning_rate": 0.0004587164299923645, + "loss": 0.1525, + "step": 134470 + }, + { + "epoch": 5.57, + "grad_norm": 0.369140625, + "learning_rate": 0.000458710460018817, + "loss": 0.1882, + "step": 134480 + }, + { + "epoch": 5.57, + "grad_norm": 1.4375, + "learning_rate": 0.00045870448965249893, + "loss": 0.1998, + "step": 134490 + }, + { + "epoch": 5.57, + "grad_norm": 1.1484375, + "learning_rate": 0.0004586985188934216, + "loss": 0.167, + "step": 134500 + }, + { + "epoch": 5.57, + "grad_norm": 0.88671875, + "learning_rate": 0.0004586925477415962, + "loss": 0.24, + "step": 134510 + }, + { + "epoch": 5.57, + "grad_norm": 0.5703125, + "learning_rate": 0.0004586865761970339, + "loss": 0.18, + "step": 134520 + }, + { + "epoch": 5.57, + "grad_norm": 0.5859375, + "learning_rate": 0.00045868060425974613, + "loss": 0.2133, + "step": 134530 + }, + { + "epoch": 5.57, + "grad_norm": 1.0546875, + "learning_rate": 0.0004586746319297439, + "loss": 0.1903, + "step": 134540 + }, + { + "epoch": 5.57, + "grad_norm": 0.60546875, + "learning_rate": 0.0004586686592070385, + "loss": 0.1459, + "step": 134550 + }, + { + "epoch": 5.57, + "grad_norm": 0.486328125, + "learning_rate": 0.0004586626860916413, + "loss": 0.1762, + "step": 134560 + }, + { + "epoch": 5.57, + "grad_norm": 0.6328125, + "learning_rate": 0.00045865671258356344, + "loss": 0.216, + "step": 134570 + }, + { + "epoch": 5.57, + "grad_norm": 0.58984375, + "learning_rate": 0.00045865073868281615, + "loss": 0.2448, + "step": 134580 + }, + { + "epoch": 5.57, + "grad_norm": 0.458984375, + "learning_rate": 0.00045864476438941073, + "loss": 0.2045, + "step": 134590 + }, + { + "epoch": 5.58, + "grad_norm": 0.328125, + "learning_rate": 0.0004586387897033584, + "loss": 0.2612, + "step": 134600 + }, + { + "epoch": 5.58, + "grad_norm": 0.6640625, + "learning_rate": 0.00045863281462467045, + "loss": 0.1698, + "step": 134610 + }, + { + "epoch": 5.58, + "grad_norm": 0.5625, + "learning_rate": 0.00045862683915335803, + "loss": 0.1989, + "step": 134620 + }, + { + "epoch": 5.58, + "grad_norm": 0.6796875, + "learning_rate": 0.00045862086328943237, + "loss": 0.2784, + "step": 134630 + }, + { + "epoch": 5.58, + "grad_norm": 0.58984375, + "learning_rate": 0.00045861488703290485, + "loss": 0.2035, + "step": 134640 + }, + { + "epoch": 5.58, + "grad_norm": 0.58984375, + "learning_rate": 0.0004586089103837866, + "loss": 0.2116, + "step": 134650 + }, + { + "epoch": 5.58, + "grad_norm": 0.859375, + "learning_rate": 0.0004586029333420889, + "loss": 0.2238, + "step": 134660 + }, + { + "epoch": 5.58, + "grad_norm": 0.640625, + "learning_rate": 0.00045859695590782303, + "loss": 0.1823, + "step": 134670 + }, + { + "epoch": 5.58, + "grad_norm": 0.5234375, + "learning_rate": 0.00045859097808100024, + "loss": 0.2044, + "step": 134680 + }, + { + "epoch": 5.58, + "grad_norm": 0.56640625, + "learning_rate": 0.0004585849998616317, + "loss": 0.2441, + "step": 134690 + }, + { + "epoch": 5.58, + "grad_norm": 1.15625, + "learning_rate": 0.00045857902124972873, + "loss": 0.2201, + "step": 134700 + }, + { + "epoch": 5.58, + "grad_norm": 0.796875, + "learning_rate": 0.0004585730422453026, + "loss": 0.1925, + "step": 134710 + }, + { + "epoch": 5.58, + "grad_norm": 1.015625, + "learning_rate": 0.0004585670628483645, + "loss": 0.1907, + "step": 134720 + }, + { + "epoch": 5.58, + "grad_norm": 0.62890625, + "learning_rate": 0.00045856108305892575, + "loss": 0.1801, + "step": 134730 + }, + { + "epoch": 5.58, + "grad_norm": 0.423828125, + "learning_rate": 0.00045855510287699753, + "loss": 0.1968, + "step": 134740 + }, + { + "epoch": 5.58, + "grad_norm": 0.43359375, + "learning_rate": 0.0004585491223025911, + "loss": 0.206, + "step": 134750 + }, + { + "epoch": 5.58, + "grad_norm": 0.77734375, + "learning_rate": 0.00045854314133571776, + "loss": 0.1585, + "step": 134760 + }, + { + "epoch": 5.58, + "grad_norm": 0.275390625, + "learning_rate": 0.00045853715997638876, + "loss": 0.2002, + "step": 134770 + }, + { + "epoch": 5.58, + "grad_norm": 0.5078125, + "learning_rate": 0.0004585311782246154, + "loss": 0.2074, + "step": 134780 + }, + { + "epoch": 5.58, + "grad_norm": 0.6796875, + "learning_rate": 0.0004585251960804088, + "loss": 0.192, + "step": 134790 + }, + { + "epoch": 5.58, + "grad_norm": 0.78515625, + "learning_rate": 0.0004585192135437803, + "loss": 0.2214, + "step": 134800 + }, + { + "epoch": 5.58, + "grad_norm": 0.6484375, + "learning_rate": 0.0004585132306147412, + "loss": 0.1793, + "step": 134810 + }, + { + "epoch": 5.58, + "grad_norm": 0.349609375, + "learning_rate": 0.00045850724729330273, + "loss": 0.1853, + "step": 134820 + }, + { + "epoch": 5.58, + "grad_norm": 1.3828125, + "learning_rate": 0.00045850126357947606, + "loss": 0.2174, + "step": 134830 + }, + { + "epoch": 5.59, + "grad_norm": 0.49609375, + "learning_rate": 0.00045849527947327266, + "loss": 0.1877, + "step": 134840 + }, + { + "epoch": 5.59, + "grad_norm": 1.0234375, + "learning_rate": 0.00045848929497470354, + "loss": 0.1892, + "step": 134850 + }, + { + "epoch": 5.59, + "grad_norm": 0.0, + "learning_rate": 0.00045848331008378014, + "loss": 0.2072, + "step": 134860 + }, + { + "epoch": 5.59, + "grad_norm": 0.9296875, + "learning_rate": 0.00045847732480051363, + "loss": 0.2336, + "step": 134870 + }, + { + "epoch": 5.59, + "grad_norm": 0.609375, + "learning_rate": 0.00045847133912491533, + "loss": 0.2297, + "step": 134880 + }, + { + "epoch": 5.59, + "grad_norm": 0.640625, + "learning_rate": 0.0004584653530569964, + "loss": 0.2259, + "step": 134890 + }, + { + "epoch": 5.59, + "grad_norm": 0.328125, + "learning_rate": 0.00045845936659676833, + "loss": 0.2045, + "step": 134900 + }, + { + "epoch": 5.59, + "grad_norm": 0.7109375, + "learning_rate": 0.0004584533797442422, + "loss": 0.2502, + "step": 134910 + }, + { + "epoch": 5.59, + "grad_norm": 0.7265625, + "learning_rate": 0.0004584473924994293, + "loss": 0.2953, + "step": 134920 + }, + { + "epoch": 5.59, + "grad_norm": 0.56640625, + "learning_rate": 0.0004584414048623409, + "loss": 0.2581, + "step": 134930 + }, + { + "epoch": 5.59, + "grad_norm": 0.78125, + "learning_rate": 0.0004584354168329883, + "loss": 0.192, + "step": 134940 + }, + { + "epoch": 5.59, + "grad_norm": 0.90625, + "learning_rate": 0.0004584294284113828, + "loss": 0.2191, + "step": 134950 + }, + { + "epoch": 5.59, + "grad_norm": 0.77734375, + "learning_rate": 0.0004584234395975355, + "loss": 0.1907, + "step": 134960 + }, + { + "epoch": 5.59, + "grad_norm": 0.408203125, + "learning_rate": 0.00045841745039145793, + "loss": 0.1988, + "step": 134970 + }, + { + "epoch": 5.59, + "grad_norm": 0.4453125, + "learning_rate": 0.00045841146079316115, + "loss": 0.2206, + "step": 134980 + }, + { + "epoch": 5.59, + "grad_norm": 0.98046875, + "learning_rate": 0.0004584054708026565, + "loss": 0.194, + "step": 134990 + }, + { + "epoch": 5.59, + "grad_norm": 0.68359375, + "learning_rate": 0.0004583994804199554, + "loss": 0.1853, + "step": 135000 + }, + { + "epoch": 5.59, + "grad_norm": 0.97265625, + "learning_rate": 0.00045839348964506877, + "loss": 0.168, + "step": 135010 + }, + { + "epoch": 5.59, + "grad_norm": 2.796875, + "learning_rate": 0.0004583874984780082, + "loss": 0.2112, + "step": 135020 + }, + { + "epoch": 5.59, + "grad_norm": 0.67578125, + "learning_rate": 0.0004583815069187849, + "loss": 0.2071, + "step": 135030 + }, + { + "epoch": 5.59, + "grad_norm": 0.9140625, + "learning_rate": 0.0004583755149674101, + "loss": 0.2048, + "step": 135040 + }, + { + "epoch": 5.59, + "grad_norm": 1.1015625, + "learning_rate": 0.00045836952262389504, + "loss": 0.2275, + "step": 135050 + }, + { + "epoch": 5.59, + "grad_norm": 0.66015625, + "learning_rate": 0.00045836352988825105, + "loss": 0.1963, + "step": 135060 + }, + { + "epoch": 5.59, + "grad_norm": 1.59375, + "learning_rate": 0.00045835753676048936, + "loss": 0.1822, + "step": 135070 + }, + { + "epoch": 5.59, + "grad_norm": 0.44921875, + "learning_rate": 0.00045835154324062134, + "loss": 0.2181, + "step": 135080 + }, + { + "epoch": 5.6, + "grad_norm": 0.80859375, + "learning_rate": 0.0004583455493286582, + "loss": 0.2089, + "step": 135090 + }, + { + "epoch": 5.6, + "grad_norm": 0.58203125, + "learning_rate": 0.0004583395550246112, + "loss": 0.2282, + "step": 135100 + }, + { + "epoch": 5.6, + "grad_norm": 0.78125, + "learning_rate": 0.0004583335603284917, + "loss": 0.1593, + "step": 135110 + }, + { + "epoch": 5.6, + "grad_norm": 0.70703125, + "learning_rate": 0.000458327565240311, + "loss": 0.194, + "step": 135120 + }, + { + "epoch": 5.6, + "grad_norm": 0.212890625, + "learning_rate": 0.00045832156976008024, + "loss": 0.1764, + "step": 135130 + }, + { + "epoch": 5.6, + "grad_norm": 0.67578125, + "learning_rate": 0.00045831557388781075, + "loss": 0.2245, + "step": 135140 + }, + { + "epoch": 5.6, + "grad_norm": 0.99609375, + "learning_rate": 0.00045830957762351386, + "loss": 0.1929, + "step": 135150 + }, + { + "epoch": 5.6, + "grad_norm": 1.1328125, + "learning_rate": 0.0004583035809672009, + "loss": 0.1935, + "step": 135160 + }, + { + "epoch": 5.6, + "grad_norm": 0.43359375, + "learning_rate": 0.0004582975839188831, + "loss": 0.2275, + "step": 135170 + }, + { + "epoch": 5.6, + "grad_norm": 0.6640625, + "learning_rate": 0.0004582915864785717, + "loss": 0.1882, + "step": 135180 + }, + { + "epoch": 5.6, + "grad_norm": 1.109375, + "learning_rate": 0.000458285588646278, + "loss": 0.209, + "step": 135190 + }, + { + "epoch": 5.6, + "grad_norm": 1.2890625, + "learning_rate": 0.0004582795904220134, + "loss": 0.2119, + "step": 135200 + }, + { + "epoch": 5.6, + "grad_norm": 0.6171875, + "learning_rate": 0.00045827359180578903, + "loss": 0.2515, + "step": 135210 + }, + { + "epoch": 5.6, + "grad_norm": 0.57421875, + "learning_rate": 0.00045826759279761633, + "loss": 0.1923, + "step": 135220 + }, + { + "epoch": 5.6, + "grad_norm": 0.59375, + "learning_rate": 0.0004582615933975064, + "loss": 0.2232, + "step": 135230 + }, + { + "epoch": 5.6, + "grad_norm": 0.9453125, + "learning_rate": 0.00045825559360547074, + "loss": 0.1945, + "step": 135240 + }, + { + "epoch": 5.6, + "grad_norm": 0.70703125, + "learning_rate": 0.0004582495934215205, + "loss": 0.2355, + "step": 135250 + }, + { + "epoch": 5.6, + "grad_norm": 1.1328125, + "learning_rate": 0.000458243592845667, + "loss": 0.258, + "step": 135260 + }, + { + "epoch": 5.6, + "grad_norm": 0.58203125, + "learning_rate": 0.0004582375918779215, + "loss": 0.2047, + "step": 135270 + }, + { + "epoch": 5.6, + "grad_norm": 0.78125, + "learning_rate": 0.0004582315905182954, + "loss": 0.182, + "step": 135280 + }, + { + "epoch": 5.6, + "grad_norm": 0.578125, + "learning_rate": 0.0004582255887668, + "loss": 0.2189, + "step": 135290 + }, + { + "epoch": 5.6, + "grad_norm": 1.3671875, + "learning_rate": 0.00045821958662344643, + "loss": 0.2334, + "step": 135300 + }, + { + "epoch": 5.6, + "grad_norm": 1.203125, + "learning_rate": 0.0004582135840882461, + "loss": 0.2104, + "step": 135310 + }, + { + "epoch": 5.6, + "grad_norm": 0.328125, + "learning_rate": 0.00045820758116121033, + "loss": 0.1874, + "step": 135320 + }, + { + "epoch": 5.61, + "grad_norm": 0.9375, + "learning_rate": 0.0004582015778423504, + "loss": 0.2351, + "step": 135330 + }, + { + "epoch": 5.61, + "grad_norm": 0.73828125, + "learning_rate": 0.0004581955741316775, + "loss": 0.2353, + "step": 135340 + }, + { + "epoch": 5.61, + "grad_norm": 0.734375, + "learning_rate": 0.00045818957002920305, + "loss": 0.2088, + "step": 135350 + }, + { + "epoch": 5.61, + "grad_norm": 0.31640625, + "learning_rate": 0.00045818356553493835, + "loss": 0.2506, + "step": 135360 + }, + { + "epoch": 5.61, + "grad_norm": 1.0625, + "learning_rate": 0.0004581775606488946, + "loss": 0.192, + "step": 135370 + }, + { + "epoch": 5.61, + "grad_norm": 0.490234375, + "learning_rate": 0.0004581715553710831, + "loss": 0.1722, + "step": 135380 + }, + { + "epoch": 5.61, + "grad_norm": 0.388671875, + "learning_rate": 0.0004581655497015154, + "loss": 0.1746, + "step": 135390 + }, + { + "epoch": 5.61, + "grad_norm": 0.93359375, + "learning_rate": 0.00045815954364020244, + "loss": 0.1741, + "step": 135400 + }, + { + "epoch": 5.61, + "grad_norm": 0.376953125, + "learning_rate": 0.0004581535371871558, + "loss": 0.2912, + "step": 135410 + }, + { + "epoch": 5.61, + "grad_norm": 0.7265625, + "learning_rate": 0.00045814753034238665, + "loss": 0.1962, + "step": 135420 + }, + { + "epoch": 5.61, + "grad_norm": 0.263671875, + "learning_rate": 0.0004581415231059063, + "loss": 0.2266, + "step": 135430 + }, + { + "epoch": 5.61, + "grad_norm": 0.0, + "learning_rate": 0.00045813551547772613, + "loss": 0.1531, + "step": 135440 + }, + { + "epoch": 5.61, + "grad_norm": 0.66015625, + "learning_rate": 0.0004581295074578574, + "loss": 0.2065, + "step": 135450 + }, + { + "epoch": 5.61, + "grad_norm": 0.5859375, + "learning_rate": 0.0004581234990463114, + "loss": 0.2486, + "step": 135460 + }, + { + "epoch": 5.61, + "grad_norm": 1.140625, + "learning_rate": 0.00045811749024309944, + "loss": 0.1844, + "step": 135470 + }, + { + "epoch": 5.61, + "grad_norm": 0.73828125, + "learning_rate": 0.00045811148104823286, + "loss": 0.2453, + "step": 135480 + }, + { + "epoch": 5.61, + "grad_norm": 0.86328125, + "learning_rate": 0.0004581054714617229, + "loss": 0.2565, + "step": 135490 + }, + { + "epoch": 5.61, + "grad_norm": 0.5703125, + "learning_rate": 0.00045809946148358096, + "loss": 0.2758, + "step": 135500 + }, + { + "epoch": 5.61, + "grad_norm": 0.734375, + "learning_rate": 0.0004580934511138183, + "loss": 0.2284, + "step": 135510 + }, + { + "epoch": 5.61, + "grad_norm": 0.96875, + "learning_rate": 0.0004580874403524462, + "loss": 0.163, + "step": 135520 + }, + { + "epoch": 5.61, + "grad_norm": 0.54296875, + "learning_rate": 0.00045808142919947603, + "loss": 0.2029, + "step": 135530 + }, + { + "epoch": 5.61, + "grad_norm": 1.328125, + "learning_rate": 0.0004580754176549191, + "loss": 0.1637, + "step": 135540 + }, + { + "epoch": 5.61, + "grad_norm": 0.66796875, + "learning_rate": 0.00045806940571878666, + "loss": 0.195, + "step": 135550 + }, + { + "epoch": 5.61, + "grad_norm": 1.3203125, + "learning_rate": 0.0004580633933910901, + "loss": 0.2308, + "step": 135560 + }, + { + "epoch": 5.62, + "grad_norm": 0.2431640625, + "learning_rate": 0.00045805738067184067, + "loss": 0.1717, + "step": 135570 + }, + { + "epoch": 5.62, + "grad_norm": 0.65234375, + "learning_rate": 0.0004580513675610498, + "loss": 0.2253, + "step": 135580 + }, + { + "epoch": 5.62, + "grad_norm": 0.875, + "learning_rate": 0.0004580453540587286, + "loss": 0.2312, + "step": 135590 + }, + { + "epoch": 5.62, + "grad_norm": 0.447265625, + "learning_rate": 0.00045803934016488864, + "loss": 0.1669, + "step": 135600 + }, + { + "epoch": 5.62, + "grad_norm": 0.1865234375, + "learning_rate": 0.00045803332587954107, + "loss": 0.2113, + "step": 135610 + }, + { + "epoch": 5.62, + "grad_norm": 0.8359375, + "learning_rate": 0.0004580273112026972, + "loss": 0.2581, + "step": 135620 + }, + { + "epoch": 5.62, + "grad_norm": 1.265625, + "learning_rate": 0.0004580212961343684, + "loss": 0.2322, + "step": 135630 + }, + { + "epoch": 5.62, + "grad_norm": 0.4765625, + "learning_rate": 0.000458015280674566, + "loss": 0.237, + "step": 135640 + }, + { + "epoch": 5.62, + "grad_norm": 0.408203125, + "learning_rate": 0.0004580092648233013, + "loss": 0.2004, + "step": 135650 + }, + { + "epoch": 5.62, + "grad_norm": 0.361328125, + "learning_rate": 0.0004580032485805856, + "loss": 0.1246, + "step": 135660 + }, + { + "epoch": 5.62, + "grad_norm": 1.015625, + "learning_rate": 0.0004579972319464304, + "loss": 0.2001, + "step": 135670 + }, + { + "epoch": 5.62, + "grad_norm": 1.03125, + "learning_rate": 0.00045799121492084673, + "loss": 0.2516, + "step": 135680 + }, + { + "epoch": 5.62, + "grad_norm": 0.9296875, + "learning_rate": 0.000457985197503846, + "loss": 0.232, + "step": 135690 + }, + { + "epoch": 5.62, + "grad_norm": 0.37109375, + "learning_rate": 0.00045797917969543975, + "loss": 0.2372, + "step": 135700 + }, + { + "epoch": 5.62, + "grad_norm": 0.69921875, + "learning_rate": 0.00045797316149563905, + "loss": 0.2105, + "step": 135710 + }, + { + "epoch": 5.62, + "grad_norm": 0.39453125, + "learning_rate": 0.00045796714290445535, + "loss": 0.2208, + "step": 135720 + }, + { + "epoch": 5.62, + "grad_norm": 0.609375, + "learning_rate": 0.0004579611239218999, + "loss": 0.2175, + "step": 135730 + }, + { + "epoch": 5.62, + "grad_norm": 2.078125, + "learning_rate": 0.0004579551045479841, + "loss": 0.2119, + "step": 135740 + }, + { + "epoch": 5.62, + "grad_norm": 1.0703125, + "learning_rate": 0.00045794908478271923, + "loss": 0.2058, + "step": 135750 + }, + { + "epoch": 5.62, + "grad_norm": 0.9140625, + "learning_rate": 0.0004579430646261167, + "loss": 0.1678, + "step": 135760 + }, + { + "epoch": 5.62, + "grad_norm": 0.71875, + "learning_rate": 0.00045793704407818777, + "loss": 0.2603, + "step": 135770 + }, + { + "epoch": 5.62, + "grad_norm": 0.51953125, + "learning_rate": 0.00045793102313894374, + "loss": 0.2191, + "step": 135780 + }, + { + "epoch": 5.62, + "grad_norm": 0.76171875, + "learning_rate": 0.00045792500180839597, + "loss": 0.147, + "step": 135790 + }, + { + "epoch": 5.62, + "grad_norm": 1.0078125, + "learning_rate": 0.00045791898008655583, + "loss": 0.29, + "step": 135800 + }, + { + "epoch": 5.63, + "grad_norm": 0.60546875, + "learning_rate": 0.00045791295797343456, + "loss": 0.2268, + "step": 135810 + }, + { + "epoch": 5.63, + "grad_norm": 0.51171875, + "learning_rate": 0.0004579069354690436, + "loss": 0.24, + "step": 135820 + }, + { + "epoch": 5.63, + "grad_norm": 0.95703125, + "learning_rate": 0.00045790091257339426, + "loss": 0.2023, + "step": 135830 + }, + { + "epoch": 5.63, + "grad_norm": 0.400390625, + "learning_rate": 0.00045789488928649793, + "loss": 0.1858, + "step": 135840 + }, + { + "epoch": 5.63, + "grad_norm": 0.5390625, + "learning_rate": 0.00045788886560836574, + "loss": 0.2213, + "step": 135850 + }, + { + "epoch": 5.63, + "grad_norm": 0.26953125, + "learning_rate": 0.00045788284153900916, + "loss": 0.2283, + "step": 135860 + }, + { + "epoch": 5.63, + "grad_norm": 0.6640625, + "learning_rate": 0.0004578768170784396, + "loss": 0.1755, + "step": 135870 + }, + { + "epoch": 5.63, + "grad_norm": 0.9140625, + "learning_rate": 0.00045787079222666825, + "loss": 0.2508, + "step": 135880 + }, + { + "epoch": 5.63, + "grad_norm": 0.302734375, + "learning_rate": 0.0004578647669837066, + "loss": 0.166, + "step": 135890 + }, + { + "epoch": 5.63, + "grad_norm": 0.76953125, + "learning_rate": 0.0004578587413495658, + "loss": 0.2535, + "step": 135900 + }, + { + "epoch": 5.63, + "grad_norm": 0.90625, + "learning_rate": 0.0004578527153242573, + "loss": 0.2396, + "step": 135910 + }, + { + "epoch": 5.63, + "grad_norm": 0.53125, + "learning_rate": 0.0004578466889077925, + "loss": 0.2277, + "step": 135920 + }, + { + "epoch": 5.63, + "grad_norm": 0.43359375, + "learning_rate": 0.0004578406621001827, + "loss": 0.2405, + "step": 135930 + }, + { + "epoch": 5.63, + "grad_norm": 1.0703125, + "learning_rate": 0.00045783463490143916, + "loss": 0.2085, + "step": 135940 + }, + { + "epoch": 5.63, + "grad_norm": 0.50390625, + "learning_rate": 0.0004578286073115733, + "loss": 0.1511, + "step": 135950 + }, + { + "epoch": 5.63, + "grad_norm": 0.57421875, + "learning_rate": 0.00045782257933059644, + "loss": 0.2217, + "step": 135960 + }, + { + "epoch": 5.63, + "grad_norm": 1.5703125, + "learning_rate": 0.0004578165509585199, + "loss": 0.2137, + "step": 135970 + }, + { + "epoch": 5.63, + "grad_norm": 0.7109375, + "learning_rate": 0.0004578105221953551, + "loss": 0.2115, + "step": 135980 + }, + { + "epoch": 5.63, + "grad_norm": 0.482421875, + "learning_rate": 0.00045780449304111333, + "loss": 0.1816, + "step": 135990 + }, + { + "epoch": 5.63, + "grad_norm": 1.859375, + "learning_rate": 0.00045779846349580595, + "loss": 0.2077, + "step": 136000 + }, + { + "epoch": 5.63, + "grad_norm": 0.64453125, + "learning_rate": 0.00045779243355944426, + "loss": 0.2612, + "step": 136010 + }, + { + "epoch": 5.63, + "grad_norm": 2.59375, + "learning_rate": 0.0004577864032320397, + "loss": 0.1897, + "step": 136020 + }, + { + "epoch": 5.63, + "grad_norm": 0.57421875, + "learning_rate": 0.0004577803725136035, + "loss": 0.2819, + "step": 136030 + }, + { + "epoch": 5.63, + "grad_norm": 0.8515625, + "learning_rate": 0.0004577743414041472, + "loss": 0.2281, + "step": 136040 + }, + { + "epoch": 5.64, + "grad_norm": 0.380859375, + "learning_rate": 0.0004577683099036819, + "loss": 0.3223, + "step": 136050 + }, + { + "epoch": 5.64, + "grad_norm": 0.376953125, + "learning_rate": 0.00045776227801221916, + "loss": 0.2437, + "step": 136060 + }, + { + "epoch": 5.64, + "grad_norm": 0.875, + "learning_rate": 0.00045775624572977016, + "loss": 0.2218, + "step": 136070 + }, + { + "epoch": 5.64, + "grad_norm": 0.88671875, + "learning_rate": 0.0004577502130563465, + "loss": 0.2028, + "step": 136080 + }, + { + "epoch": 5.64, + "grad_norm": 0.6328125, + "learning_rate": 0.0004577441799919592, + "loss": 0.1601, + "step": 136090 + }, + { + "epoch": 5.64, + "grad_norm": 0.26953125, + "learning_rate": 0.0004577381465366199, + "loss": 0.1983, + "step": 136100 + }, + { + "epoch": 5.64, + "grad_norm": 0.84765625, + "learning_rate": 0.0004577321126903398, + "loss": 0.2272, + "step": 136110 + }, + { + "epoch": 5.64, + "grad_norm": 0.71875, + "learning_rate": 0.0004577260784531303, + "loss": 0.1862, + "step": 136120 + }, + { + "epoch": 5.64, + "grad_norm": 0.78515625, + "learning_rate": 0.00045772004382500276, + "loss": 0.2489, + "step": 136130 + }, + { + "epoch": 5.64, + "grad_norm": 0.53125, + "learning_rate": 0.00045771400880596845, + "loss": 0.2238, + "step": 136140 + }, + { + "epoch": 5.64, + "grad_norm": 0.80859375, + "learning_rate": 0.0004577079733960389, + "loss": 0.2052, + "step": 136150 + }, + { + "epoch": 5.64, + "grad_norm": 1.8359375, + "learning_rate": 0.0004577019375952254, + "loss": 0.2642, + "step": 136160 + }, + { + "epoch": 5.64, + "grad_norm": 0.67578125, + "learning_rate": 0.0004576959014035392, + "loss": 0.2182, + "step": 136170 + }, + { + "epoch": 5.64, + "grad_norm": 0.5625, + "learning_rate": 0.0004576898648209918, + "loss": 0.2058, + "step": 136180 + }, + { + "epoch": 5.64, + "grad_norm": 1.1015625, + "learning_rate": 0.0004576838278475944, + "loss": 0.1866, + "step": 136190 + }, + { + "epoch": 5.64, + "grad_norm": 1.203125, + "learning_rate": 0.0004576777904833586, + "loss": 0.2168, + "step": 136200 + }, + { + "epoch": 5.64, + "grad_norm": 1.0859375, + "learning_rate": 0.0004576717527282956, + "loss": 0.2033, + "step": 136210 + }, + { + "epoch": 5.64, + "grad_norm": 0.55859375, + "learning_rate": 0.0004576657145824168, + "loss": 0.1974, + "step": 136220 + }, + { + "epoch": 5.64, + "grad_norm": 0.703125, + "learning_rate": 0.00045765967604573346, + "loss": 0.2198, + "step": 136230 + }, + { + "epoch": 5.64, + "grad_norm": 0.57421875, + "learning_rate": 0.0004576536371182571, + "loss": 0.1752, + "step": 136240 + }, + { + "epoch": 5.64, + "grad_norm": 1.3515625, + "learning_rate": 0.000457647597799999, + "loss": 0.2386, + "step": 136250 + }, + { + "epoch": 5.64, + "grad_norm": 1.015625, + "learning_rate": 0.0004576415580909705, + "loss": 0.2168, + "step": 136260 + }, + { + "epoch": 5.64, + "grad_norm": 1.4296875, + "learning_rate": 0.00045763551799118306, + "loss": 0.2106, + "step": 136270 + }, + { + "epoch": 5.64, + "grad_norm": 0.72265625, + "learning_rate": 0.0004576294775006481, + "loss": 0.2603, + "step": 136280 + }, + { + "epoch": 5.65, + "grad_norm": 0.7109375, + "learning_rate": 0.0004576234366193767, + "loss": 0.1929, + "step": 136290 + }, + { + "epoch": 5.65, + "grad_norm": 0.78125, + "learning_rate": 0.00045761739534738054, + "loss": 0.1986, + "step": 136300 + }, + { + "epoch": 5.65, + "grad_norm": 0.8515625, + "learning_rate": 0.0004576113536846708, + "loss": 0.2183, + "step": 136310 + }, + { + "epoch": 5.65, + "grad_norm": 0.61328125, + "learning_rate": 0.00045760531163125897, + "loss": 0.2039, + "step": 136320 + }, + { + "epoch": 5.65, + "grad_norm": 0.384765625, + "learning_rate": 0.00045759926918715634, + "loss": 0.1635, + "step": 136330 + }, + { + "epoch": 5.65, + "grad_norm": 0.765625, + "learning_rate": 0.00045759322635237424, + "loss": 0.2197, + "step": 136340 + }, + { + "epoch": 5.65, + "grad_norm": 0.427734375, + "learning_rate": 0.0004575871831269242, + "loss": 0.2203, + "step": 136350 + }, + { + "epoch": 5.65, + "grad_norm": 0.302734375, + "learning_rate": 0.0004575811395108175, + "loss": 0.1984, + "step": 136360 + }, + { + "epoch": 5.65, + "grad_norm": 0.80078125, + "learning_rate": 0.00045757509550406537, + "loss": 0.198, + "step": 136370 + }, + { + "epoch": 5.65, + "grad_norm": 0.38671875, + "learning_rate": 0.00045756905110667944, + "loss": 0.198, + "step": 136380 + }, + { + "epoch": 5.65, + "grad_norm": 0.5546875, + "learning_rate": 0.000457563006318671, + "loss": 0.1936, + "step": 136390 + }, + { + "epoch": 5.65, + "grad_norm": 0.61328125, + "learning_rate": 0.0004575569611400513, + "loss": 0.2173, + "step": 136400 + }, + { + "epoch": 5.65, + "grad_norm": 0.41015625, + "learning_rate": 0.0004575509155708319, + "loss": 0.1802, + "step": 136410 + }, + { + "epoch": 5.65, + "grad_norm": 0.92578125, + "learning_rate": 0.00045754486961102403, + "loss": 0.1857, + "step": 136420 + }, + { + "epoch": 5.65, + "grad_norm": 1.4453125, + "learning_rate": 0.0004575388232606392, + "loss": 0.2073, + "step": 136430 + }, + { + "epoch": 5.65, + "grad_norm": 0.4609375, + "learning_rate": 0.0004575327765196887, + "loss": 0.2216, + "step": 136440 + }, + { + "epoch": 5.65, + "grad_norm": 1.78125, + "learning_rate": 0.00045752672938818384, + "loss": 0.181, + "step": 136450 + }, + { + "epoch": 5.65, + "grad_norm": 0.44921875, + "learning_rate": 0.00045752068186613614, + "loss": 0.1567, + "step": 136460 + }, + { + "epoch": 5.65, + "grad_norm": 1.7265625, + "learning_rate": 0.000457514633953557, + "loss": 0.1944, + "step": 136470 + }, + { + "epoch": 5.65, + "grad_norm": 1.171875, + "learning_rate": 0.0004575085856504576, + "loss": 0.1506, + "step": 136480 + }, + { + "epoch": 5.65, + "grad_norm": 0.77734375, + "learning_rate": 0.0004575025369568495, + "loss": 0.2401, + "step": 136490 + }, + { + "epoch": 5.65, + "grad_norm": 0.76953125, + "learning_rate": 0.00045749648787274405, + "loss": 0.191, + "step": 136500 + }, + { + "epoch": 5.65, + "grad_norm": 0.76953125, + "learning_rate": 0.00045749043839815265, + "loss": 0.2059, + "step": 136510 + }, + { + "epoch": 5.65, + "grad_norm": 1.3046875, + "learning_rate": 0.0004574843885330866, + "loss": 0.2129, + "step": 136520 + }, + { + "epoch": 5.66, + "grad_norm": 0.373046875, + "learning_rate": 0.0004574783382775574, + "loss": 0.1789, + "step": 136530 + }, + { + "epoch": 5.66, + "grad_norm": 0.32421875, + "learning_rate": 0.0004574722876315762, + "loss": 0.1509, + "step": 136540 + }, + { + "epoch": 5.66, + "grad_norm": 0.92578125, + "learning_rate": 0.0004574662365951547, + "loss": 0.2508, + "step": 136550 + }, + { + "epoch": 5.66, + "grad_norm": 1.0859375, + "learning_rate": 0.00045746018516830417, + "loss": 0.2044, + "step": 136560 + }, + { + "epoch": 5.66, + "grad_norm": 0.74609375, + "learning_rate": 0.00045745413335103593, + "loss": 0.2268, + "step": 136570 + }, + { + "epoch": 5.66, + "grad_norm": 0.92578125, + "learning_rate": 0.00045744808114336145, + "loss": 0.2018, + "step": 136580 + }, + { + "epoch": 5.66, + "grad_norm": 1.0625, + "learning_rate": 0.00045744202854529207, + "loss": 0.2152, + "step": 136590 + }, + { + "epoch": 5.66, + "grad_norm": 0.3984375, + "learning_rate": 0.00045743597555683916, + "loss": 0.2279, + "step": 136600 + }, + { + "epoch": 5.66, + "grad_norm": 0.71484375, + "learning_rate": 0.00045742992217801415, + "loss": 0.2234, + "step": 136610 + }, + { + "epoch": 5.66, + "grad_norm": 0.302734375, + "learning_rate": 0.00045742386840882845, + "loss": 0.2043, + "step": 136620 + }, + { + "epoch": 5.66, + "grad_norm": 0.306640625, + "learning_rate": 0.0004574178142492934, + "loss": 0.2249, + "step": 136630 + }, + { + "epoch": 5.66, + "grad_norm": 2.875, + "learning_rate": 0.00045741175969942047, + "loss": 0.2119, + "step": 136640 + }, + { + "epoch": 5.66, + "grad_norm": 0.87109375, + "learning_rate": 0.00045740570475922095, + "loss": 0.2322, + "step": 136650 + }, + { + "epoch": 5.66, + "grad_norm": 0.294921875, + "learning_rate": 0.00045739964942870633, + "loss": 0.164, + "step": 136660 + }, + { + "epoch": 5.66, + "grad_norm": 0.40625, + "learning_rate": 0.000457393593707888, + "loss": 0.1875, + "step": 136670 + }, + { + "epoch": 5.66, + "grad_norm": 1.1171875, + "learning_rate": 0.0004573875375967773, + "loss": 0.2006, + "step": 136680 + }, + { + "epoch": 5.66, + "grad_norm": 0.578125, + "learning_rate": 0.00045738148109538557, + "loss": 0.2064, + "step": 136690 + }, + { + "epoch": 5.66, + "grad_norm": 0.4609375, + "learning_rate": 0.00045737542420372433, + "loss": 0.1778, + "step": 136700 + }, + { + "epoch": 5.66, + "grad_norm": 0.56640625, + "learning_rate": 0.00045736936692180496, + "loss": 0.1917, + "step": 136710 + }, + { + "epoch": 5.66, + "grad_norm": 0.5, + "learning_rate": 0.0004573633092496389, + "loss": 0.1509, + "step": 136720 + }, + { + "epoch": 5.66, + "grad_norm": 1.0625, + "learning_rate": 0.0004573572511872374, + "loss": 0.1999, + "step": 136730 + }, + { + "epoch": 5.66, + "grad_norm": 0.91015625, + "learning_rate": 0.0004573511927346119, + "loss": 0.1698, + "step": 136740 + }, + { + "epoch": 5.66, + "grad_norm": 0.41015625, + "learning_rate": 0.0004573451338917739, + "loss": 0.1694, + "step": 136750 + }, + { + "epoch": 5.66, + "grad_norm": 1.015625, + "learning_rate": 0.00045733907465873484, + "loss": 0.199, + "step": 136760 + }, + { + "epoch": 5.66, + "grad_norm": 0.71875, + "learning_rate": 0.00045733301503550597, + "loss": 0.2295, + "step": 136770 + }, + { + "epoch": 5.67, + "grad_norm": 0.7734375, + "learning_rate": 0.00045732695502209865, + "loss": 0.2381, + "step": 136780 + }, + { + "epoch": 5.67, + "grad_norm": 0.76171875, + "learning_rate": 0.0004573208946185245, + "loss": 0.1952, + "step": 136790 + }, + { + "epoch": 5.67, + "grad_norm": 1.875, + "learning_rate": 0.00045731483382479475, + "loss": 0.2091, + "step": 136800 + }, + { + "epoch": 5.67, + "grad_norm": 0.6171875, + "learning_rate": 0.000457308772640921, + "loss": 0.1687, + "step": 136810 + }, + { + "epoch": 5.67, + "grad_norm": 1.0, + "learning_rate": 0.0004573027110669144, + "loss": 0.1997, + "step": 136820 + }, + { + "epoch": 5.67, + "grad_norm": 0.76953125, + "learning_rate": 0.00045729664910278654, + "loss": 0.1998, + "step": 136830 + }, + { + "epoch": 5.67, + "grad_norm": 0.69921875, + "learning_rate": 0.00045729058674854874, + "loss": 0.1914, + "step": 136840 + }, + { + "epoch": 5.67, + "grad_norm": 0.3984375, + "learning_rate": 0.0004572845240042125, + "loss": 0.1628, + "step": 136850 + }, + { + "epoch": 5.67, + "grad_norm": 0.5390625, + "learning_rate": 0.0004572784608697891, + "loss": 0.1863, + "step": 136860 + }, + { + "epoch": 5.67, + "grad_norm": 0.76953125, + "learning_rate": 0.00045727239734529, + "loss": 0.2581, + "step": 136870 + }, + { + "epoch": 5.67, + "grad_norm": 1.234375, + "learning_rate": 0.0004572663334307267, + "loss": 0.2421, + "step": 136880 + }, + { + "epoch": 5.67, + "grad_norm": 0.703125, + "learning_rate": 0.0004572602691261105, + "loss": 0.1945, + "step": 136890 + }, + { + "epoch": 5.67, + "grad_norm": 0.79296875, + "learning_rate": 0.0004572542044314529, + "loss": 0.1838, + "step": 136900 + }, + { + "epoch": 5.67, + "grad_norm": 0.7890625, + "learning_rate": 0.00045724813934676524, + "loss": 0.2294, + "step": 136910 + }, + { + "epoch": 5.67, + "grad_norm": 0.671875, + "learning_rate": 0.000457242073872059, + "loss": 0.1664, + "step": 136920 + }, + { + "epoch": 5.67, + "grad_norm": 0.458984375, + "learning_rate": 0.00045723600800734557, + "loss": 0.2165, + "step": 136930 + }, + { + "epoch": 5.67, + "grad_norm": 0.275390625, + "learning_rate": 0.0004572299417526363, + "loss": 0.1899, + "step": 136940 + }, + { + "epoch": 5.67, + "grad_norm": 0.44140625, + "learning_rate": 0.00045722387510794264, + "loss": 0.1693, + "step": 136950 + }, + { + "epoch": 5.67, + "grad_norm": 0.48046875, + "learning_rate": 0.00045721780807327604, + "loss": 0.2051, + "step": 136960 + }, + { + "epoch": 5.67, + "grad_norm": 0.31640625, + "learning_rate": 0.0004572117406486479, + "loss": 0.2419, + "step": 136970 + }, + { + "epoch": 5.67, + "grad_norm": 0.515625, + "learning_rate": 0.0004572056728340697, + "loss": 0.2747, + "step": 136980 + }, + { + "epoch": 5.67, + "grad_norm": 1.0234375, + "learning_rate": 0.00045719960462955274, + "loss": 0.2486, + "step": 136990 + }, + { + "epoch": 5.67, + "grad_norm": 0.75, + "learning_rate": 0.0004571935360351085, + "loss": 0.2061, + "step": 137000 + }, + { + "epoch": 5.67, + "grad_norm": 0.68359375, + "learning_rate": 0.0004571874670507484, + "loss": 0.2348, + "step": 137010 + }, + { + "epoch": 5.68, + "grad_norm": 1.15625, + "learning_rate": 0.00045718139767648395, + "loss": 0.2183, + "step": 137020 + }, + { + "epoch": 5.68, + "grad_norm": 0.494140625, + "learning_rate": 0.0004571753279123264, + "loss": 0.1893, + "step": 137030 + }, + { + "epoch": 5.68, + "grad_norm": 0.55859375, + "learning_rate": 0.00045716925775828726, + "loss": 0.2423, + "step": 137040 + }, + { + "epoch": 5.68, + "grad_norm": 0.439453125, + "learning_rate": 0.000457163187214378, + "loss": 0.1909, + "step": 137050 + }, + { + "epoch": 5.68, + "grad_norm": 0.71484375, + "learning_rate": 0.0004571571162806099, + "loss": 0.2228, + "step": 137060 + }, + { + "epoch": 5.68, + "grad_norm": 0.6484375, + "learning_rate": 0.00045715104495699455, + "loss": 0.2204, + "step": 137070 + }, + { + "epoch": 5.68, + "grad_norm": 0.7890625, + "learning_rate": 0.0004571449732435433, + "loss": 0.2472, + "step": 137080 + }, + { + "epoch": 5.68, + "grad_norm": 0.8125, + "learning_rate": 0.00045713890114026755, + "loss": 0.2591, + "step": 137090 + }, + { + "epoch": 5.68, + "grad_norm": 0.5625, + "learning_rate": 0.0004571328286471788, + "loss": 0.2361, + "step": 137100 + }, + { + "epoch": 5.68, + "grad_norm": 0.5078125, + "learning_rate": 0.00045712675576428836, + "loss": 0.1578, + "step": 137110 + }, + { + "epoch": 5.68, + "grad_norm": 1.015625, + "learning_rate": 0.00045712068249160776, + "loss": 0.1411, + "step": 137120 + }, + { + "epoch": 5.68, + "grad_norm": 0.427734375, + "learning_rate": 0.00045711460882914845, + "loss": 0.258, + "step": 137130 + }, + { + "epoch": 5.68, + "grad_norm": 0.64453125, + "learning_rate": 0.0004571085347769218, + "loss": 0.2015, + "step": 137140 + }, + { + "epoch": 5.68, + "grad_norm": 0.70703125, + "learning_rate": 0.0004571024603349392, + "loss": 0.2462, + "step": 137150 + }, + { + "epoch": 5.68, + "grad_norm": 0.71484375, + "learning_rate": 0.0004570963855032122, + "loss": 0.2025, + "step": 137160 + }, + { + "epoch": 5.68, + "grad_norm": 0.7421875, + "learning_rate": 0.0004570903102817521, + "loss": 0.2364, + "step": 137170 + }, + { + "epoch": 5.68, + "grad_norm": 0.62109375, + "learning_rate": 0.0004570842346705705, + "loss": 0.2156, + "step": 137180 + }, + { + "epoch": 5.68, + "grad_norm": 0.6171875, + "learning_rate": 0.0004570781586696786, + "loss": 0.2292, + "step": 137190 + }, + { + "epoch": 5.68, + "grad_norm": 0.6796875, + "learning_rate": 0.0004570720822790881, + "loss": 0.2176, + "step": 137200 + }, + { + "epoch": 5.68, + "grad_norm": 0.98828125, + "learning_rate": 0.00045706600549881023, + "loss": 0.1935, + "step": 137210 + }, + { + "epoch": 5.68, + "grad_norm": 0.5078125, + "learning_rate": 0.0004570599283288565, + "loss": 0.2117, + "step": 137220 + }, + { + "epoch": 5.68, + "grad_norm": 0.9609375, + "learning_rate": 0.00045705385076923833, + "loss": 0.1682, + "step": 137230 + }, + { + "epoch": 5.68, + "grad_norm": 0.470703125, + "learning_rate": 0.00045704777281996723, + "loss": 0.2107, + "step": 137240 + }, + { + "epoch": 5.68, + "grad_norm": 0.79296875, + "learning_rate": 0.0004570416944810546, + "loss": 0.1832, + "step": 137250 + }, + { + "epoch": 5.69, + "grad_norm": 1.078125, + "learning_rate": 0.00045703561575251175, + "loss": 0.2362, + "step": 137260 + }, + { + "epoch": 5.69, + "grad_norm": 2.40625, + "learning_rate": 0.0004570295366343503, + "loss": 0.1669, + "step": 137270 + }, + { + "epoch": 5.69, + "grad_norm": 9.3125, + "learning_rate": 0.0004570234571265817, + "loss": 0.208, + "step": 137280 + }, + { + "epoch": 5.69, + "grad_norm": 0.494140625, + "learning_rate": 0.00045701737722921715, + "loss": 0.2585, + "step": 137290 + }, + { + "epoch": 5.69, + "grad_norm": 0.4296875, + "learning_rate": 0.0004570112969422683, + "loss": 0.2051, + "step": 137300 + }, + { + "epoch": 5.69, + "grad_norm": 0.78515625, + "learning_rate": 0.0004570052162657466, + "loss": 0.1904, + "step": 137310 + }, + { + "epoch": 5.69, + "grad_norm": 0.82421875, + "learning_rate": 0.0004569991351996634, + "loss": 0.2311, + "step": 137320 + }, + { + "epoch": 5.69, + "grad_norm": 0.46875, + "learning_rate": 0.00045699305374403016, + "loss": 0.1975, + "step": 137330 + }, + { + "epoch": 5.69, + "grad_norm": 1.1640625, + "learning_rate": 0.0004569869718988584, + "loss": 0.2457, + "step": 137340 + }, + { + "epoch": 5.69, + "grad_norm": 0.69140625, + "learning_rate": 0.00045698088966415946, + "loss": 0.2283, + "step": 137350 + }, + { + "epoch": 5.69, + "grad_norm": 0.859375, + "learning_rate": 0.0004569748070399449, + "loss": 0.2201, + "step": 137360 + }, + { + "epoch": 5.69, + "grad_norm": 1.5703125, + "learning_rate": 0.000456968724026226, + "loss": 0.2475, + "step": 137370 + }, + { + "epoch": 5.69, + "grad_norm": 0.62109375, + "learning_rate": 0.00045696264062301443, + "loss": 0.1951, + "step": 137380 + }, + { + "epoch": 5.69, + "grad_norm": 0.287109375, + "learning_rate": 0.00045695655683032144, + "loss": 0.2205, + "step": 137390 + }, + { + "epoch": 5.69, + "grad_norm": 0.52734375, + "learning_rate": 0.0004569504726481586, + "loss": 0.2293, + "step": 137400 + }, + { + "epoch": 5.69, + "grad_norm": 1.0546875, + "learning_rate": 0.00045694438807653725, + "loss": 0.2403, + "step": 137410 + }, + { + "epoch": 5.69, + "grad_norm": 0.515625, + "learning_rate": 0.000456938303115469, + "loss": 0.2139, + "step": 137420 + }, + { + "epoch": 5.69, + "grad_norm": 0.73046875, + "learning_rate": 0.0004569322177649652, + "loss": 0.2258, + "step": 137430 + }, + { + "epoch": 5.69, + "grad_norm": 1.046875, + "learning_rate": 0.0004569261320250373, + "loss": 0.1772, + "step": 137440 + }, + { + "epoch": 5.69, + "grad_norm": 0.99609375, + "learning_rate": 0.00045692004589569673, + "loss": 0.1631, + "step": 137450 + }, + { + "epoch": 5.69, + "grad_norm": 0.458984375, + "learning_rate": 0.00045691395937695503, + "loss": 0.2366, + "step": 137460 + }, + { + "epoch": 5.69, + "grad_norm": 0.703125, + "learning_rate": 0.0004569078724688236, + "loss": 0.2532, + "step": 137470 + }, + { + "epoch": 5.69, + "grad_norm": 0.96484375, + "learning_rate": 0.0004569017851713139, + "loss": 0.165, + "step": 137480 + }, + { + "epoch": 5.69, + "grad_norm": 0.7109375, + "learning_rate": 0.00045689569748443725, + "loss": 0.2441, + "step": 137490 + }, + { + "epoch": 5.7, + "grad_norm": 0.44140625, + "learning_rate": 0.0004568896094082054, + "loss": 0.2123, + "step": 137500 + }, + { + "epoch": 5.7, + "grad_norm": 0.55859375, + "learning_rate": 0.00045688352094262963, + "loss": 0.1848, + "step": 137510 + }, + { + "epoch": 5.7, + "grad_norm": 0.73046875, + "learning_rate": 0.00045687743208772135, + "loss": 0.2432, + "step": 137520 + }, + { + "epoch": 5.7, + "grad_norm": 0.76953125, + "learning_rate": 0.00045687134284349206, + "loss": 0.1769, + "step": 137530 + }, + { + "epoch": 5.7, + "grad_norm": 0.5390625, + "learning_rate": 0.0004568652532099533, + "loss": 0.1864, + "step": 137540 + }, + { + "epoch": 5.7, + "grad_norm": 0.8828125, + "learning_rate": 0.0004568591631871165, + "loss": 0.2669, + "step": 137550 + }, + { + "epoch": 5.7, + "grad_norm": 0.291015625, + "learning_rate": 0.00045685307277499313, + "loss": 0.2054, + "step": 137560 + }, + { + "epoch": 5.7, + "grad_norm": 0.439453125, + "learning_rate": 0.0004568469819735945, + "loss": 0.1912, + "step": 137570 + }, + { + "epoch": 5.7, + "grad_norm": 1.046875, + "learning_rate": 0.00045684089078293225, + "loss": 0.2089, + "step": 137580 + }, + { + "epoch": 5.7, + "grad_norm": 0.69921875, + "learning_rate": 0.0004568347992030177, + "loss": 0.164, + "step": 137590 + }, + { + "epoch": 5.7, + "grad_norm": 0.87109375, + "learning_rate": 0.0004568287072338625, + "loss": 0.1928, + "step": 137600 + }, + { + "epoch": 5.7, + "grad_norm": 0.5703125, + "learning_rate": 0.0004568226148754781, + "loss": 0.2206, + "step": 137610 + }, + { + "epoch": 5.7, + "grad_norm": 0.796875, + "learning_rate": 0.00045681652212787563, + "loss": 0.1853, + "step": 137620 + }, + { + "epoch": 5.7, + "grad_norm": 0.6015625, + "learning_rate": 0.00045681042899106694, + "loss": 0.2116, + "step": 137630 + }, + { + "epoch": 5.7, + "grad_norm": 1.390625, + "learning_rate": 0.0004568043354650633, + "loss": 0.2142, + "step": 137640 + }, + { + "epoch": 5.7, + "grad_norm": 0.8125, + "learning_rate": 0.00045679824154987625, + "loss": 0.2037, + "step": 137650 + }, + { + "epoch": 5.7, + "grad_norm": 0.69921875, + "learning_rate": 0.00045679214724551735, + "loss": 0.2376, + "step": 137660 + }, + { + "epoch": 5.7, + "grad_norm": 1.015625, + "learning_rate": 0.0004567860525519978, + "loss": 0.2403, + "step": 137670 + }, + { + "epoch": 5.7, + "grad_norm": 0.357421875, + "learning_rate": 0.00045677995746932934, + "loss": 0.2132, + "step": 137680 + }, + { + "epoch": 5.7, + "grad_norm": 0.56640625, + "learning_rate": 0.0004567738619975232, + "loss": 0.189, + "step": 137690 + }, + { + "epoch": 5.7, + "grad_norm": 0.640625, + "learning_rate": 0.0004567677661365911, + "loss": 0.2103, + "step": 137700 + }, + { + "epoch": 5.7, + "grad_norm": 0.431640625, + "learning_rate": 0.0004567616698865444, + "loss": 0.2444, + "step": 137710 + }, + { + "epoch": 5.7, + "grad_norm": 0.94921875, + "learning_rate": 0.00045675557324739447, + "loss": 0.1574, + "step": 137720 + }, + { + "epoch": 5.7, + "grad_norm": 0.419921875, + "learning_rate": 0.000456749476219153, + "loss": 0.2003, + "step": 137730 + }, + { + "epoch": 5.71, + "grad_norm": 0.796875, + "learning_rate": 0.0004567433788018313, + "loss": 0.218, + "step": 137740 + }, + { + "epoch": 5.71, + "grad_norm": 0.80078125, + "learning_rate": 0.0004567372809954408, + "loss": 0.2321, + "step": 137750 + }, + { + "epoch": 5.71, + "grad_norm": 0.400390625, + "learning_rate": 0.0004567311827999932, + "loss": 0.2218, + "step": 137760 + }, + { + "epoch": 5.71, + "grad_norm": 0.64453125, + "learning_rate": 0.00045672508421549976, + "loss": 0.2166, + "step": 137770 + }, + { + "epoch": 5.71, + "grad_norm": 1.375, + "learning_rate": 0.000456718985241972, + "loss": 0.2107, + "step": 137780 + }, + { + "epoch": 5.71, + "grad_norm": 0.431640625, + "learning_rate": 0.0004567128858794215, + "loss": 0.2332, + "step": 137790 + }, + { + "epoch": 5.71, + "grad_norm": 1.2421875, + "learning_rate": 0.00045670678612785967, + "loss": 0.2052, + "step": 137800 + }, + { + "epoch": 5.71, + "grad_norm": 0.890625, + "learning_rate": 0.00045670068598729793, + "loss": 0.1495, + "step": 137810 + }, + { + "epoch": 5.71, + "grad_norm": 0.8828125, + "learning_rate": 0.0004566945854577479, + "loss": 0.2371, + "step": 137820 + }, + { + "epoch": 5.71, + "grad_norm": 0.63671875, + "learning_rate": 0.0004566884845392209, + "loss": 0.1865, + "step": 137830 + }, + { + "epoch": 5.71, + "grad_norm": 0.41796875, + "learning_rate": 0.0004566823832317285, + "loss": 0.1739, + "step": 137840 + }, + { + "epoch": 5.71, + "grad_norm": 1.078125, + "learning_rate": 0.0004566762815352822, + "loss": 0.2081, + "step": 137850 + }, + { + "epoch": 5.71, + "grad_norm": 0.70703125, + "learning_rate": 0.00045667017944989353, + "loss": 0.22, + "step": 137860 + }, + { + "epoch": 5.71, + "grad_norm": 0.6640625, + "learning_rate": 0.00045666407697557383, + "loss": 0.2302, + "step": 137870 + }, + { + "epoch": 5.71, + "grad_norm": 1.2578125, + "learning_rate": 0.00045665797411233465, + "loss": 0.2196, + "step": 137880 + }, + { + "epoch": 5.71, + "grad_norm": 0.298828125, + "learning_rate": 0.0004566518708601875, + "loss": 0.2396, + "step": 137890 + }, + { + "epoch": 5.71, + "grad_norm": 0.83984375, + "learning_rate": 0.00045664576721914384, + "loss": 0.2073, + "step": 137900 + }, + { + "epoch": 5.71, + "grad_norm": 0.9609375, + "learning_rate": 0.00045663966318921514, + "loss": 0.2203, + "step": 137910 + }, + { + "epoch": 5.71, + "grad_norm": 0.55859375, + "learning_rate": 0.0004566335587704129, + "loss": 0.1693, + "step": 137920 + }, + { + "epoch": 5.71, + "grad_norm": 0.921875, + "learning_rate": 0.00045662745396274866, + "loss": 0.1446, + "step": 137930 + }, + { + "epoch": 5.71, + "grad_norm": 0.82421875, + "learning_rate": 0.0004566213487662338, + "loss": 0.2162, + "step": 137940 + }, + { + "epoch": 5.71, + "grad_norm": 0.296875, + "learning_rate": 0.00045661524318088, + "loss": 0.2057, + "step": 137950 + }, + { + "epoch": 5.71, + "grad_norm": 0.99609375, + "learning_rate": 0.00045660913720669855, + "loss": 0.164, + "step": 137960 + }, + { + "epoch": 5.71, + "grad_norm": 0.421875, + "learning_rate": 0.00045660303084370105, + "loss": 0.2083, + "step": 137970 + }, + { + "epoch": 5.72, + "grad_norm": 0.5703125, + "learning_rate": 0.0004565969240918989, + "loss": 0.2201, + "step": 137980 + }, + { + "epoch": 5.72, + "grad_norm": 0.73828125, + "learning_rate": 0.0004565908169513037, + "loss": 0.2045, + "step": 137990 + }, + { + "epoch": 5.72, + "grad_norm": 1.0078125, + "learning_rate": 0.00045658470942192686, + "loss": 0.2035, + "step": 138000 + }, + { + "epoch": 5.72, + "grad_norm": 1.125, + "learning_rate": 0.0004565786015037799, + "loss": 0.1896, + "step": 138010 + }, + { + "epoch": 5.72, + "grad_norm": 0.482421875, + "learning_rate": 0.0004565724931968743, + "loss": 0.2144, + "step": 138020 + }, + { + "epoch": 5.72, + "grad_norm": 0.39453125, + "learning_rate": 0.0004565663845012216, + "loss": 0.2309, + "step": 138030 + }, + { + "epoch": 5.72, + "grad_norm": 2.15625, + "learning_rate": 0.00045656027541683333, + "loss": 0.24, + "step": 138040 + }, + { + "epoch": 5.72, + "grad_norm": 0.73828125, + "learning_rate": 0.00045655416594372086, + "loss": 0.2309, + "step": 138050 + }, + { + "epoch": 5.72, + "grad_norm": 0.65234375, + "learning_rate": 0.0004565480560818958, + "loss": 0.2678, + "step": 138060 + }, + { + "epoch": 5.72, + "grad_norm": 0.80078125, + "learning_rate": 0.00045654194583136953, + "loss": 0.2286, + "step": 138070 + }, + { + "epoch": 5.72, + "grad_norm": 0.8359375, + "learning_rate": 0.0004565358351921537, + "loss": 0.2393, + "step": 138080 + }, + { + "epoch": 5.72, + "grad_norm": 0.8984375, + "learning_rate": 0.00045652972416425965, + "loss": 0.2033, + "step": 138090 + }, + { + "epoch": 5.72, + "grad_norm": 0.84765625, + "learning_rate": 0.00045652361274769906, + "loss": 0.2137, + "step": 138100 + }, + { + "epoch": 5.72, + "grad_norm": 2.515625, + "learning_rate": 0.0004565175009424832, + "loss": 0.2374, + "step": 138110 + }, + { + "epoch": 5.72, + "grad_norm": 0.326171875, + "learning_rate": 0.00045651138874862383, + "loss": 0.1714, + "step": 138120 + }, + { + "epoch": 5.72, + "grad_norm": 1.0, + "learning_rate": 0.00045650527616613234, + "loss": 0.2274, + "step": 138130 + }, + { + "epoch": 5.72, + "grad_norm": 1.046875, + "learning_rate": 0.0004564991631950202, + "loss": 0.1855, + "step": 138140 + }, + { + "epoch": 5.72, + "grad_norm": 0.71484375, + "learning_rate": 0.00045649304983529885, + "loss": 0.2385, + "step": 138150 + }, + { + "epoch": 5.72, + "grad_norm": 1.53125, + "learning_rate": 0.00045648693608697994, + "loss": 0.2153, + "step": 138160 + }, + { + "epoch": 5.72, + "grad_norm": 1.0390625, + "learning_rate": 0.0004564808219500749, + "loss": 0.2133, + "step": 138170 + }, + { + "epoch": 5.72, + "grad_norm": 0.78125, + "learning_rate": 0.0004564747074245953, + "loss": 0.2114, + "step": 138180 + }, + { + "epoch": 5.72, + "grad_norm": 1.2890625, + "learning_rate": 0.0004564685925105525, + "loss": 0.2152, + "step": 138190 + }, + { + "epoch": 5.72, + "grad_norm": 0.44921875, + "learning_rate": 0.0004564624772079582, + "loss": 0.1738, + "step": 138200 + }, + { + "epoch": 5.72, + "grad_norm": 0.5859375, + "learning_rate": 0.0004564563615168238, + "loss": 0.1945, + "step": 138210 + }, + { + "epoch": 5.73, + "grad_norm": 0.1923828125, + "learning_rate": 0.00045645024543716073, + "loss": 0.143, + "step": 138220 + }, + { + "epoch": 5.73, + "grad_norm": 1.078125, + "learning_rate": 0.0004564441289689807, + "loss": 0.201, + "step": 138230 + }, + { + "epoch": 5.73, + "grad_norm": 0.828125, + "learning_rate": 0.0004564380121122951, + "loss": 0.222, + "step": 138240 + }, + { + "epoch": 5.73, + "grad_norm": 1.078125, + "learning_rate": 0.00045643189486711545, + "loss": 0.1744, + "step": 138250 + }, + { + "epoch": 5.73, + "grad_norm": 0.498046875, + "learning_rate": 0.00045642577723345326, + "loss": 0.1908, + "step": 138260 + }, + { + "epoch": 5.73, + "grad_norm": 0.4375, + "learning_rate": 0.00045641965921132, + "loss": 0.2021, + "step": 138270 + }, + { + "epoch": 5.73, + "grad_norm": 0.64453125, + "learning_rate": 0.0004564135408007273, + "loss": 0.2396, + "step": 138280 + }, + { + "epoch": 5.73, + "grad_norm": 2.296875, + "learning_rate": 0.00045640742200168656, + "loss": 0.218, + "step": 138290 + }, + { + "epoch": 5.73, + "grad_norm": 0.146484375, + "learning_rate": 0.0004564013028142093, + "loss": 0.168, + "step": 138300 + }, + { + "epoch": 5.73, + "grad_norm": 0.2451171875, + "learning_rate": 0.0004563951832383072, + "loss": 0.208, + "step": 138310 + }, + { + "epoch": 5.73, + "grad_norm": 0.333984375, + "learning_rate": 0.00045638906327399154, + "loss": 0.1957, + "step": 138320 + }, + { + "epoch": 5.73, + "grad_norm": 0.60546875, + "learning_rate": 0.000456382942921274, + "loss": 0.1876, + "step": 138330 + }, + { + "epoch": 5.73, + "grad_norm": 0.76171875, + "learning_rate": 0.00045637682218016605, + "loss": 0.2091, + "step": 138340 + }, + { + "epoch": 5.73, + "grad_norm": 0.72265625, + "learning_rate": 0.00045637070105067926, + "loss": 0.2178, + "step": 138350 + }, + { + "epoch": 5.73, + "grad_norm": 0.58984375, + "learning_rate": 0.00045636457953282506, + "loss": 0.1745, + "step": 138360 + }, + { + "epoch": 5.73, + "grad_norm": 0.8125, + "learning_rate": 0.000456358457626615, + "loss": 0.1934, + "step": 138370 + }, + { + "epoch": 5.73, + "grad_norm": 0.80859375, + "learning_rate": 0.00045635233533206055, + "loss": 0.192, + "step": 138380 + }, + { + "epoch": 5.73, + "grad_norm": 0.087890625, + "learning_rate": 0.0004563462126491734, + "loss": 0.1914, + "step": 138390 + }, + { + "epoch": 5.73, + "grad_norm": 1.2265625, + "learning_rate": 0.00045634008957796485, + "loss": 0.2222, + "step": 138400 + }, + { + "epoch": 5.73, + "grad_norm": 0.82421875, + "learning_rate": 0.0004563339661184466, + "loss": 0.219, + "step": 138410 + }, + { + "epoch": 5.73, + "grad_norm": 1.21875, + "learning_rate": 0.00045632784227063006, + "loss": 0.2123, + "step": 138420 + }, + { + "epoch": 5.73, + "grad_norm": 1.109375, + "learning_rate": 0.00045632171803452684, + "loss": 0.2114, + "step": 138430 + }, + { + "epoch": 5.73, + "grad_norm": 0.39453125, + "learning_rate": 0.0004563155934101484, + "loss": 0.2085, + "step": 138440 + }, + { + "epoch": 5.73, + "grad_norm": 1.0234375, + "learning_rate": 0.0004563094683975063, + "loss": 0.2227, + "step": 138450 + }, + { + "epoch": 5.73, + "grad_norm": 0.41015625, + "learning_rate": 0.00045630334299661215, + "loss": 0.2013, + "step": 138460 + }, + { + "epoch": 5.74, + "grad_norm": 0.64453125, + "learning_rate": 0.0004562972172074773, + "loss": 0.191, + "step": 138470 + }, + { + "epoch": 5.74, + "grad_norm": 0.546875, + "learning_rate": 0.00045629109103011335, + "loss": 0.2925, + "step": 138480 + }, + { + "epoch": 5.74, + "grad_norm": 0.455078125, + "learning_rate": 0.0004562849644645319, + "loss": 0.2209, + "step": 138490 + }, + { + "epoch": 5.74, + "grad_norm": 1.2421875, + "learning_rate": 0.0004562788375107444, + "loss": 0.1754, + "step": 138500 + }, + { + "epoch": 5.74, + "grad_norm": 0.81640625, + "learning_rate": 0.00045627271016876236, + "loss": 0.1651, + "step": 138510 + }, + { + "epoch": 5.74, + "grad_norm": 1.5859375, + "learning_rate": 0.00045626658243859745, + "loss": 0.1663, + "step": 138520 + }, + { + "epoch": 5.74, + "grad_norm": 0.22265625, + "learning_rate": 0.000456260454320261, + "loss": 0.2523, + "step": 138530 + }, + { + "epoch": 5.74, + "grad_norm": 1.078125, + "learning_rate": 0.0004562543258137647, + "loss": 0.2283, + "step": 138540 + }, + { + "epoch": 5.74, + "grad_norm": 0.6171875, + "learning_rate": 0.0004562481969191201, + "loss": 0.2082, + "step": 138550 + }, + { + "epoch": 5.74, + "grad_norm": 0.6015625, + "learning_rate": 0.0004562420676363386, + "loss": 0.2032, + "step": 138560 + }, + { + "epoch": 5.74, + "grad_norm": 0.90234375, + "learning_rate": 0.0004562359379654317, + "loss": 0.185, + "step": 138570 + }, + { + "epoch": 5.74, + "grad_norm": 0.61328125, + "learning_rate": 0.0004562298079064112, + "loss": 0.3242, + "step": 138580 + }, + { + "epoch": 5.74, + "grad_norm": 1.0390625, + "learning_rate": 0.0004562236774592884, + "loss": 0.1681, + "step": 138590 + }, + { + "epoch": 5.74, + "grad_norm": 0.8515625, + "learning_rate": 0.00045621754662407486, + "loss": 0.1687, + "step": 138600 + }, + { + "epoch": 5.74, + "grad_norm": 0.85546875, + "learning_rate": 0.0004562114154007823, + "loss": 0.1908, + "step": 138610 + }, + { + "epoch": 5.74, + "grad_norm": 0.72265625, + "learning_rate": 0.00045620528378942203, + "loss": 0.2095, + "step": 138620 + }, + { + "epoch": 5.74, + "grad_norm": 0.8671875, + "learning_rate": 0.0004561991517900057, + "loss": 0.1887, + "step": 138630 + }, + { + "epoch": 5.74, + "grad_norm": 0.96484375, + "learning_rate": 0.00045619301940254487, + "loss": 0.191, + "step": 138640 + }, + { + "epoch": 5.74, + "grad_norm": 0.52734375, + "learning_rate": 0.000456186886627051, + "loss": 0.1624, + "step": 138650 + }, + { + "epoch": 5.74, + "grad_norm": 0.8125, + "learning_rate": 0.00045618075346353564, + "loss": 0.2021, + "step": 138660 + }, + { + "epoch": 5.74, + "grad_norm": 0.890625, + "learning_rate": 0.0004561746199120105, + "loss": 0.2075, + "step": 138670 + }, + { + "epoch": 5.74, + "grad_norm": 1.0546875, + "learning_rate": 0.0004561684859724868, + "loss": 0.194, + "step": 138680 + }, + { + "epoch": 5.74, + "grad_norm": 0.61328125, + "learning_rate": 0.00045616235164497646, + "loss": 0.1967, + "step": 138690 + }, + { + "epoch": 5.74, + "grad_norm": 0.470703125, + "learning_rate": 0.00045615621692949074, + "loss": 0.1425, + "step": 138700 + }, + { + "epoch": 5.75, + "grad_norm": 1.3046875, + "learning_rate": 0.0004561500818260413, + "loss": 0.1961, + "step": 138710 + }, + { + "epoch": 5.75, + "grad_norm": 0.400390625, + "learning_rate": 0.0004561439463346396, + "loss": 0.253, + "step": 138720 + }, + { + "epoch": 5.75, + "grad_norm": 0.0, + "learning_rate": 0.00045613781045529734, + "loss": 0.1832, + "step": 138730 + }, + { + "epoch": 5.75, + "grad_norm": 0.478515625, + "learning_rate": 0.00045613167418802596, + "loss": 0.2215, + "step": 138740 + }, + { + "epoch": 5.75, + "grad_norm": 0.38671875, + "learning_rate": 0.000456125537532837, + "loss": 0.2408, + "step": 138750 + }, + { + "epoch": 5.75, + "grad_norm": 1.0546875, + "learning_rate": 0.0004561194004897422, + "loss": 0.1994, + "step": 138760 + }, + { + "epoch": 5.75, + "grad_norm": 0.953125, + "learning_rate": 0.0004561132630587528, + "loss": 0.2246, + "step": 138770 + }, + { + "epoch": 5.75, + "grad_norm": 1.046875, + "learning_rate": 0.00045610712523988045, + "loss": 0.1857, + "step": 138780 + }, + { + "epoch": 5.75, + "grad_norm": 0.4609375, + "learning_rate": 0.00045610098703313684, + "loss": 0.2306, + "step": 138790 + }, + { + "epoch": 5.75, + "grad_norm": 0.76171875, + "learning_rate": 0.00045609484843853345, + "loss": 0.1764, + "step": 138800 + }, + { + "epoch": 5.75, + "grad_norm": 1.0859375, + "learning_rate": 0.0004560887094560817, + "loss": 0.1662, + "step": 138810 + }, + { + "epoch": 5.75, + "grad_norm": 0.30859375, + "learning_rate": 0.00045608257008579337, + "loss": 0.2198, + "step": 138820 + }, + { + "epoch": 5.75, + "grad_norm": 0.9609375, + "learning_rate": 0.0004560764303276798, + "loss": 0.2134, + "step": 138830 + }, + { + "epoch": 5.75, + "grad_norm": 0.54296875, + "learning_rate": 0.00045607029018175275, + "loss": 0.1445, + "step": 138840 + }, + { + "epoch": 5.75, + "grad_norm": 1.0625, + "learning_rate": 0.0004560641496480236, + "loss": 0.1547, + "step": 138850 + }, + { + "epoch": 5.75, + "grad_norm": 0.24609375, + "learning_rate": 0.0004560580087265039, + "loss": 0.2084, + "step": 138860 + }, + { + "epoch": 5.75, + "grad_norm": 0.68359375, + "learning_rate": 0.0004560518674172054, + "loss": 0.2272, + "step": 138870 + }, + { + "epoch": 5.75, + "grad_norm": 0.6953125, + "learning_rate": 0.0004560457257201395, + "loss": 0.2022, + "step": 138880 + }, + { + "epoch": 5.75, + "grad_norm": 0.484375, + "learning_rate": 0.00045603958363531785, + "loss": 0.2492, + "step": 138890 + }, + { + "epoch": 5.75, + "grad_norm": 0.390625, + "learning_rate": 0.00045603344116275184, + "loss": 0.1975, + "step": 138900 + }, + { + "epoch": 5.75, + "grad_norm": 0.86328125, + "learning_rate": 0.00045602729830245323, + "loss": 0.1839, + "step": 138910 + }, + { + "epoch": 5.75, + "grad_norm": 1.2265625, + "learning_rate": 0.0004560211550544334, + "loss": 0.2312, + "step": 138920 + }, + { + "epoch": 5.75, + "grad_norm": 0.58984375, + "learning_rate": 0.0004560150114187042, + "loss": 0.2111, + "step": 138930 + }, + { + "epoch": 5.75, + "grad_norm": 1.125, + "learning_rate": 0.0004560088673952768, + "loss": 0.1748, + "step": 138940 + }, + { + "epoch": 5.76, + "grad_norm": 0.578125, + "learning_rate": 0.000456002722984163, + "loss": 0.1853, + "step": 138950 + }, + { + "epoch": 5.76, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004559965781853744, + "loss": 0.2777, + "step": 138960 + }, + { + "epoch": 5.76, + "grad_norm": 0.52734375, + "learning_rate": 0.0004559904329989224, + "loss": 0.1896, + "step": 138970 + }, + { + "epoch": 5.76, + "grad_norm": 0.92578125, + "learning_rate": 0.0004559842874248187, + "loss": 0.2379, + "step": 138980 + }, + { + "epoch": 5.76, + "grad_norm": 0.90625, + "learning_rate": 0.00045597814146307477, + "loss": 0.2582, + "step": 138990 + }, + { + "epoch": 5.76, + "grad_norm": 0.59765625, + "learning_rate": 0.0004559719951137022, + "loss": 0.2123, + "step": 139000 + }, + { + "epoch": 5.76, + "grad_norm": 1.078125, + "learning_rate": 0.0004559658483767127, + "loss": 0.158, + "step": 139010 + }, + { + "epoch": 5.76, + "grad_norm": 0.224609375, + "learning_rate": 0.0004559597012521176, + "loss": 0.2333, + "step": 139020 + }, + { + "epoch": 5.76, + "grad_norm": 0.8359375, + "learning_rate": 0.00045595355373992863, + "loss": 0.1686, + "step": 139030 + }, + { + "epoch": 5.76, + "grad_norm": 0.33984375, + "learning_rate": 0.0004559474058401574, + "loss": 0.2127, + "step": 139040 + }, + { + "epoch": 5.76, + "grad_norm": 0.80859375, + "learning_rate": 0.00045594125755281525, + "loss": 0.1732, + "step": 139050 + }, + { + "epoch": 5.76, + "grad_norm": 0.1767578125, + "learning_rate": 0.0004559351088779139, + "loss": 0.2601, + "step": 139060 + }, + { + "epoch": 5.76, + "grad_norm": 0.435546875, + "learning_rate": 0.000455928959815465, + "loss": 0.2227, + "step": 139070 + }, + { + "epoch": 5.76, + "grad_norm": 0.640625, + "learning_rate": 0.00045592281036548, + "loss": 0.2292, + "step": 139080 + }, + { + "epoch": 5.76, + "grad_norm": 1.265625, + "learning_rate": 0.0004559166605279705, + "loss": 0.197, + "step": 139090 + }, + { + "epoch": 5.76, + "grad_norm": 0.66796875, + "learning_rate": 0.0004559105103029481, + "loss": 0.1701, + "step": 139100 + }, + { + "epoch": 5.76, + "grad_norm": 1.09375, + "learning_rate": 0.00045590435969042434, + "loss": 0.2198, + "step": 139110 + }, + { + "epoch": 5.76, + "grad_norm": 0.94921875, + "learning_rate": 0.00045589820869041076, + "loss": 0.2108, + "step": 139120 + }, + { + "epoch": 5.76, + "grad_norm": 0.96875, + "learning_rate": 0.00045589205730291903, + "loss": 0.2538, + "step": 139130 + }, + { + "epoch": 5.76, + "grad_norm": 0.447265625, + "learning_rate": 0.0004558859055279607, + "loss": 0.229, + "step": 139140 + }, + { + "epoch": 5.76, + "grad_norm": 0.494140625, + "learning_rate": 0.0004558797533655473, + "loss": 0.1303, + "step": 139150 + }, + { + "epoch": 5.76, + "grad_norm": 0.82421875, + "learning_rate": 0.0004558736008156905, + "loss": 0.1949, + "step": 139160 + }, + { + "epoch": 5.76, + "grad_norm": 1.3046875, + "learning_rate": 0.00045586744787840173, + "loss": 0.2325, + "step": 139170 + }, + { + "epoch": 5.76, + "grad_norm": 1.0546875, + "learning_rate": 0.00045586129455369275, + "loss": 0.2191, + "step": 139180 + }, + { + "epoch": 5.77, + "grad_norm": 0.90234375, + "learning_rate": 0.00045585514084157497, + "loss": 0.2076, + "step": 139190 + }, + { + "epoch": 5.77, + "grad_norm": 0.67578125, + "learning_rate": 0.0004558489867420601, + "loss": 0.1637, + "step": 139200 + }, + { + "epoch": 5.77, + "grad_norm": 0.66796875, + "learning_rate": 0.0004558428322551597, + "loss": 0.2068, + "step": 139210 + }, + { + "epoch": 5.77, + "grad_norm": 1.1328125, + "learning_rate": 0.0004558366773808852, + "loss": 0.1999, + "step": 139220 + }, + { + "epoch": 5.77, + "grad_norm": 0.80859375, + "learning_rate": 0.00045583052211924834, + "loss": 0.2219, + "step": 139230 + }, + { + "epoch": 5.77, + "grad_norm": 1.046875, + "learning_rate": 0.0004558243664702607, + "loss": 0.1763, + "step": 139240 + }, + { + "epoch": 5.77, + "grad_norm": 0.796875, + "learning_rate": 0.00045581821043393377, + "loss": 0.2209, + "step": 139250 + }, + { + "epoch": 5.77, + "grad_norm": 1.2265625, + "learning_rate": 0.00045581205401027926, + "loss": 0.2312, + "step": 139260 + }, + { + "epoch": 5.77, + "grad_norm": 0.6875, + "learning_rate": 0.00045580589719930865, + "loss": 0.152, + "step": 139270 + }, + { + "epoch": 5.77, + "grad_norm": 0.412109375, + "learning_rate": 0.0004557997400010336, + "loss": 0.1766, + "step": 139280 + }, + { + "epoch": 5.77, + "grad_norm": 0.3515625, + "learning_rate": 0.00045579358241546565, + "loss": 0.1753, + "step": 139290 + }, + { + "epoch": 5.77, + "grad_norm": 1.0234375, + "learning_rate": 0.00045578742444261634, + "loss": 0.2072, + "step": 139300 + }, + { + "epoch": 5.77, + "grad_norm": 0.99609375, + "learning_rate": 0.00045578126608249744, + "loss": 0.1883, + "step": 139310 + }, + { + "epoch": 5.77, + "grad_norm": 0.62890625, + "learning_rate": 0.0004557751073351203, + "loss": 0.1847, + "step": 139320 + }, + { + "epoch": 5.77, + "grad_norm": 0.298828125, + "learning_rate": 0.0004557689482004967, + "loss": 0.2834, + "step": 139330 + }, + { + "epoch": 5.77, + "grad_norm": 0.66015625, + "learning_rate": 0.0004557627886786382, + "loss": 0.2044, + "step": 139340 + }, + { + "epoch": 5.77, + "grad_norm": 0.58203125, + "learning_rate": 0.00045575662876955626, + "loss": 0.2069, + "step": 139350 + }, + { + "epoch": 5.77, + "grad_norm": 0.7734375, + "learning_rate": 0.0004557504684732626, + "loss": 0.2169, + "step": 139360 + }, + { + "epoch": 5.77, + "grad_norm": 1.2265625, + "learning_rate": 0.0004557443077897688, + "loss": 0.1911, + "step": 139370 + }, + { + "epoch": 5.77, + "grad_norm": 1.0, + "learning_rate": 0.00045573814671908644, + "loss": 0.1865, + "step": 139380 + }, + { + "epoch": 5.77, + "grad_norm": 0.5546875, + "learning_rate": 0.00045573198526122706, + "loss": 0.1945, + "step": 139390 + }, + { + "epoch": 5.77, + "grad_norm": 0.59765625, + "learning_rate": 0.0004557258234162023, + "loss": 0.1724, + "step": 139400 + }, + { + "epoch": 5.77, + "grad_norm": 0.66015625, + "learning_rate": 0.00045571966118402376, + "loss": 0.2349, + "step": 139410 + }, + { + "epoch": 5.77, + "grad_norm": 0.416015625, + "learning_rate": 0.0004557134985647031, + "loss": 0.2197, + "step": 139420 + }, + { + "epoch": 5.78, + "grad_norm": 1.5078125, + "learning_rate": 0.0004557073355582518, + "loss": 0.2076, + "step": 139430 + }, + { + "epoch": 5.78, + "grad_norm": 0.609375, + "learning_rate": 0.0004557011721646814, + "loss": 0.2085, + "step": 139440 + }, + { + "epoch": 5.78, + "grad_norm": 0.416015625, + "learning_rate": 0.00045569500838400377, + "loss": 0.2535, + "step": 139450 + }, + { + "epoch": 5.78, + "grad_norm": 0.87890625, + "learning_rate": 0.0004556888442162303, + "loss": 0.1933, + "step": 139460 + }, + { + "epoch": 5.78, + "grad_norm": 0.5390625, + "learning_rate": 0.00045568267966137265, + "loss": 0.2175, + "step": 139470 + }, + { + "epoch": 5.78, + "grad_norm": 0.3125, + "learning_rate": 0.0004556765147194424, + "loss": 0.1446, + "step": 139480 + }, + { + "epoch": 5.78, + "grad_norm": 0.427734375, + "learning_rate": 0.0004556703493904512, + "loss": 0.2221, + "step": 139490 + }, + { + "epoch": 5.78, + "grad_norm": 0.8125, + "learning_rate": 0.00045566418367441047, + "loss": 0.2018, + "step": 139500 + }, + { + "epoch": 5.78, + "grad_norm": 0.7265625, + "learning_rate": 0.0004556580175713321, + "loss": 0.2158, + "step": 139510 + }, + { + "epoch": 5.78, + "grad_norm": 1.28125, + "learning_rate": 0.00045565185108122747, + "loss": 0.1825, + "step": 139520 + }, + { + "epoch": 5.78, + "grad_norm": 0.6015625, + "learning_rate": 0.00045564568420410827, + "loss": 0.2586, + "step": 139530 + }, + { + "epoch": 5.78, + "grad_norm": 0.453125, + "learning_rate": 0.0004556395169399862, + "loss": 0.222, + "step": 139540 + }, + { + "epoch": 5.78, + "grad_norm": 0.8359375, + "learning_rate": 0.00045563334928887267, + "loss": 0.2419, + "step": 139550 + }, + { + "epoch": 5.78, + "grad_norm": 0.66796875, + "learning_rate": 0.0004556271812507794, + "loss": 0.2003, + "step": 139560 + }, + { + "epoch": 5.78, + "grad_norm": 0.80078125, + "learning_rate": 0.000455621012825718, + "loss": 0.1834, + "step": 139570 + }, + { + "epoch": 5.78, + "grad_norm": 0.478515625, + "learning_rate": 0.00045561484401370004, + "loss": 0.1904, + "step": 139580 + }, + { + "epoch": 5.78, + "grad_norm": 0.640625, + "learning_rate": 0.0004556086748147371, + "loss": 0.2326, + "step": 139590 + }, + { + "epoch": 5.78, + "grad_norm": 0.44140625, + "learning_rate": 0.00045560250522884096, + "loss": 0.1891, + "step": 139600 + }, + { + "epoch": 5.78, + "grad_norm": 0.9765625, + "learning_rate": 0.000455596335256023, + "loss": 0.2134, + "step": 139610 + }, + { + "epoch": 5.78, + "grad_norm": 0.08984375, + "learning_rate": 0.00045559016489629494, + "loss": 0.1555, + "step": 139620 + }, + { + "epoch": 5.78, + "grad_norm": 0.58984375, + "learning_rate": 0.0004555839941496684, + "loss": 0.2034, + "step": 139630 + }, + { + "epoch": 5.78, + "grad_norm": 0.546875, + "learning_rate": 0.0004555778230161551, + "loss": 0.2295, + "step": 139640 + }, + { + "epoch": 5.78, + "grad_norm": 0.62890625, + "learning_rate": 0.0004555716514957664, + "loss": 0.1799, + "step": 139650 + }, + { + "epoch": 5.78, + "grad_norm": 0.6171875, + "learning_rate": 0.00045556547958851414, + "loss": 0.2126, + "step": 139660 + }, + { + "epoch": 5.79, + "grad_norm": 0.41796875, + "learning_rate": 0.00045555930729440983, + "loss": 0.1318, + "step": 139670 + }, + { + "epoch": 5.79, + "grad_norm": 0.427734375, + "learning_rate": 0.00045555313461346505, + "loss": 0.1499, + "step": 139680 + }, + { + "epoch": 5.79, + "grad_norm": 0.5234375, + "learning_rate": 0.0004555469615456915, + "loss": 0.1686, + "step": 139690 + }, + { + "epoch": 5.79, + "grad_norm": 0.79296875, + "learning_rate": 0.0004555407880911008, + "loss": 0.2015, + "step": 139700 + }, + { + "epoch": 5.79, + "grad_norm": 0.439453125, + "learning_rate": 0.00045553461424970444, + "loss": 0.2136, + "step": 139710 + }, + { + "epoch": 5.79, + "grad_norm": 1.2421875, + "learning_rate": 0.0004555284400215142, + "loss": 0.2013, + "step": 139720 + }, + { + "epoch": 5.79, + "grad_norm": 1.3125, + "learning_rate": 0.0004555222654065416, + "loss": 0.1928, + "step": 139730 + }, + { + "epoch": 5.79, + "grad_norm": 0.1640625, + "learning_rate": 0.00045551609040479833, + "loss": 0.2185, + "step": 139740 + }, + { + "epoch": 5.79, + "grad_norm": 0.99609375, + "learning_rate": 0.000455509915016296, + "loss": 0.218, + "step": 139750 + }, + { + "epoch": 5.79, + "grad_norm": 0.66015625, + "learning_rate": 0.00045550373924104615, + "loss": 0.2046, + "step": 139760 + }, + { + "epoch": 5.79, + "grad_norm": 1.0546875, + "learning_rate": 0.00045549756307906045, + "loss": 0.2547, + "step": 139770 + }, + { + "epoch": 5.79, + "grad_norm": 1.15625, + "learning_rate": 0.00045549138653035057, + "loss": 0.2524, + "step": 139780 + }, + { + "epoch": 5.79, + "grad_norm": 0.255859375, + "learning_rate": 0.000455485209594928, + "loss": 0.1909, + "step": 139790 + }, + { + "epoch": 5.79, + "grad_norm": 0.765625, + "learning_rate": 0.00045547903227280455, + "loss": 0.2243, + "step": 139800 + }, + { + "epoch": 5.79, + "grad_norm": 0.4609375, + "learning_rate": 0.00045547285456399173, + "loss": 0.2276, + "step": 139810 + }, + { + "epoch": 5.79, + "grad_norm": 0.70703125, + "learning_rate": 0.00045546667646850114, + "loss": 0.222, + "step": 139820 + }, + { + "epoch": 5.79, + "grad_norm": 0.81640625, + "learning_rate": 0.0004554604979863445, + "loss": 0.1677, + "step": 139830 + }, + { + "epoch": 5.79, + "grad_norm": 0.52734375, + "learning_rate": 0.00045545431911753344, + "loss": 0.1851, + "step": 139840 + }, + { + "epoch": 5.79, + "grad_norm": 0.65625, + "learning_rate": 0.00045544813986207944, + "loss": 0.1944, + "step": 139850 + }, + { + "epoch": 5.79, + "grad_norm": 0.6015625, + "learning_rate": 0.0004554419602199943, + "loss": 0.1979, + "step": 139860 + }, + { + "epoch": 5.79, + "grad_norm": 0.8515625, + "learning_rate": 0.0004554357801912895, + "loss": 0.1819, + "step": 139870 + }, + { + "epoch": 5.79, + "grad_norm": 0.26953125, + "learning_rate": 0.0004554295997759768, + "loss": 0.1913, + "step": 139880 + }, + { + "epoch": 5.79, + "grad_norm": 0.96484375, + "learning_rate": 0.0004554234189740678, + "loss": 0.2286, + "step": 139890 + }, + { + "epoch": 5.79, + "grad_norm": 0.423828125, + "learning_rate": 0.00045541723778557406, + "loss": 0.1651, + "step": 139900 + }, + { + "epoch": 5.8, + "grad_norm": 0.7734375, + "learning_rate": 0.00045541105621050724, + "loss": 0.2193, + "step": 139910 + }, + { + "epoch": 5.8, + "grad_norm": 1.2890625, + "learning_rate": 0.00045540487424887904, + "loss": 0.2191, + "step": 139920 + }, + { + "epoch": 5.8, + "grad_norm": 0.384765625, + "learning_rate": 0.00045539869190070105, + "loss": 0.204, + "step": 139930 + }, + { + "epoch": 5.8, + "grad_norm": 1.609375, + "learning_rate": 0.0004553925091659849, + "loss": 0.2213, + "step": 139940 + }, + { + "epoch": 5.8, + "grad_norm": 0.259765625, + "learning_rate": 0.0004553863260447422, + "loss": 0.2057, + "step": 139950 + }, + { + "epoch": 5.8, + "grad_norm": 0.5625, + "learning_rate": 0.0004553801425369847, + "loss": 0.2152, + "step": 139960 + }, + { + "epoch": 5.8, + "grad_norm": 2.234375, + "learning_rate": 0.00045537395864272387, + "loss": 0.1793, + "step": 139970 + }, + { + "epoch": 5.8, + "grad_norm": 0.94921875, + "learning_rate": 0.0004553677743619714, + "loss": 0.2012, + "step": 139980 + }, + { + "epoch": 5.8, + "grad_norm": 0.640625, + "learning_rate": 0.00045536158969473905, + "loss": 0.1735, + "step": 139990 + }, + { + "epoch": 5.8, + "grad_norm": 0.89453125, + "learning_rate": 0.0004553554046410383, + "loss": 0.2213, + "step": 140000 + }, + { + "epoch": 5.8, + "grad_norm": 0.68359375, + "learning_rate": 0.0004553492192008809, + "loss": 0.2606, + "step": 140010 + }, + { + "epoch": 5.8, + "grad_norm": 0.69140625, + "learning_rate": 0.0004553430333742784, + "loss": 0.2275, + "step": 140020 + }, + { + "epoch": 5.8, + "grad_norm": 0.73828125, + "learning_rate": 0.00045533684716124245, + "loss": 0.1922, + "step": 140030 + }, + { + "epoch": 5.8, + "grad_norm": 0.5859375, + "learning_rate": 0.00045533066056178484, + "loss": 0.2144, + "step": 140040 + }, + { + "epoch": 5.8, + "grad_norm": 0.8359375, + "learning_rate": 0.000455324473575917, + "loss": 0.2384, + "step": 140050 + }, + { + "epoch": 5.8, + "grad_norm": 1.109375, + "learning_rate": 0.0004553182862036508, + "loss": 0.2109, + "step": 140060 + }, + { + "epoch": 5.8, + "grad_norm": 1.4921875, + "learning_rate": 0.00045531209844499763, + "loss": 0.1986, + "step": 140070 + }, + { + "epoch": 5.8, + "grad_norm": 0.76171875, + "learning_rate": 0.0004553059102999693, + "loss": 0.1774, + "step": 140080 + }, + { + "epoch": 5.8, + "grad_norm": 0.65625, + "learning_rate": 0.0004552997217685774, + "loss": 0.1882, + "step": 140090 + }, + { + "epoch": 5.8, + "grad_norm": 0.765625, + "learning_rate": 0.0004552935328508336, + "loss": 0.2315, + "step": 140100 + }, + { + "epoch": 5.8, + "grad_norm": 0.36328125, + "learning_rate": 0.0004552873435467496, + "loss": 0.1894, + "step": 140110 + }, + { + "epoch": 5.8, + "grad_norm": 0.8359375, + "learning_rate": 0.00045528115385633694, + "loss": 0.206, + "step": 140120 + }, + { + "epoch": 5.8, + "grad_norm": 0.74609375, + "learning_rate": 0.0004552749637796073, + "loss": 0.1977, + "step": 140130 + }, + { + "epoch": 5.8, + "grad_norm": 0.470703125, + "learning_rate": 0.0004552687733165724, + "loss": 0.197, + "step": 140140 + }, + { + "epoch": 5.8, + "grad_norm": 0.455078125, + "learning_rate": 0.0004552625824672438, + "loss": 0.1929, + "step": 140150 + }, + { + "epoch": 5.81, + "grad_norm": 1.1953125, + "learning_rate": 0.0004552563912316332, + "loss": 0.1852, + "step": 140160 + }, + { + "epoch": 5.81, + "grad_norm": 0.7578125, + "learning_rate": 0.0004552501996097522, + "loss": 0.1852, + "step": 140170 + }, + { + "epoch": 5.81, + "grad_norm": 0.59765625, + "learning_rate": 0.00045524400760161253, + "loss": 0.2606, + "step": 140180 + }, + { + "epoch": 5.81, + "grad_norm": 0.89453125, + "learning_rate": 0.0004552378152072258, + "loss": 0.2438, + "step": 140190 + }, + { + "epoch": 5.81, + "grad_norm": 0.62109375, + "learning_rate": 0.00045523162242660365, + "loss": 0.2246, + "step": 140200 + }, + { + "epoch": 5.81, + "grad_norm": 0.61328125, + "learning_rate": 0.0004552254292597578, + "loss": 0.2113, + "step": 140210 + }, + { + "epoch": 5.81, + "grad_norm": 0.9140625, + "learning_rate": 0.0004552192357066998, + "loss": 0.2389, + "step": 140220 + }, + { + "epoch": 5.81, + "grad_norm": 0.486328125, + "learning_rate": 0.0004552130417674414, + "loss": 0.1721, + "step": 140230 + }, + { + "epoch": 5.81, + "grad_norm": 0.703125, + "learning_rate": 0.0004552068474419941, + "loss": 0.2249, + "step": 140240 + }, + { + "epoch": 5.81, + "grad_norm": 0.1513671875, + "learning_rate": 0.0004552006527303698, + "loss": 0.2085, + "step": 140250 + }, + { + "epoch": 5.81, + "grad_norm": 0.67578125, + "learning_rate": 0.00045519445763258007, + "loss": 0.1475, + "step": 140260 + }, + { + "epoch": 5.81, + "grad_norm": 0.55078125, + "learning_rate": 0.0004551882621486364, + "loss": 0.2116, + "step": 140270 + }, + { + "epoch": 5.81, + "grad_norm": 0.90625, + "learning_rate": 0.0004551820662785506, + "loss": 0.1976, + "step": 140280 + }, + { + "epoch": 5.81, + "grad_norm": 0.61328125, + "learning_rate": 0.0004551758700223344, + "loss": 0.2249, + "step": 140290 + }, + { + "epoch": 5.81, + "grad_norm": 1.2421875, + "learning_rate": 0.0004551696733799993, + "loss": 0.228, + "step": 140300 + }, + { + "epoch": 5.81, + "grad_norm": 0.85546875, + "learning_rate": 0.00045516347635155707, + "loss": 0.1842, + "step": 140310 + }, + { + "epoch": 5.81, + "grad_norm": 1.09375, + "learning_rate": 0.0004551572789370193, + "loss": 0.1835, + "step": 140320 + }, + { + "epoch": 5.81, + "grad_norm": 0.5546875, + "learning_rate": 0.00045515108113639767, + "loss": 0.1739, + "step": 140330 + }, + { + "epoch": 5.81, + "grad_norm": 0.54296875, + "learning_rate": 0.0004551448829497039, + "loss": 0.1866, + "step": 140340 + }, + { + "epoch": 5.81, + "grad_norm": 0.439453125, + "learning_rate": 0.0004551386843769496, + "loss": 0.1783, + "step": 140350 + }, + { + "epoch": 5.81, + "grad_norm": 0.58203125, + "learning_rate": 0.00045513248541814645, + "loss": 0.2171, + "step": 140360 + }, + { + "epoch": 5.81, + "grad_norm": 0.2431640625, + "learning_rate": 0.0004551262860733061, + "loss": 0.1921, + "step": 140370 + }, + { + "epoch": 5.81, + "grad_norm": 1.109375, + "learning_rate": 0.00045512008634244024, + "loss": 0.1939, + "step": 140380 + }, + { + "epoch": 5.81, + "grad_norm": 0.59375, + "learning_rate": 0.0004551138862255605, + "loss": 0.2049, + "step": 140390 + }, + { + "epoch": 5.82, + "grad_norm": 0.68359375, + "learning_rate": 0.0004551076857226786, + "loss": 0.1963, + "step": 140400 + }, + { + "epoch": 5.82, + "grad_norm": 0.671875, + "learning_rate": 0.0004551014848338062, + "loss": 0.2356, + "step": 140410 + }, + { + "epoch": 5.82, + "grad_norm": 0.455078125, + "learning_rate": 0.00045509528355895494, + "loss": 0.1903, + "step": 140420 + }, + { + "epoch": 5.82, + "grad_norm": 0.431640625, + "learning_rate": 0.00045508908189813654, + "loss": 0.2014, + "step": 140430 + }, + { + "epoch": 5.82, + "grad_norm": 1.046875, + "learning_rate": 0.0004550828798513626, + "loss": 0.2641, + "step": 140440 + }, + { + "epoch": 5.82, + "grad_norm": 0.4375, + "learning_rate": 0.00045507667741864484, + "loss": 0.2085, + "step": 140450 + }, + { + "epoch": 5.82, + "grad_norm": 0.8046875, + "learning_rate": 0.0004550704745999949, + "loss": 0.2255, + "step": 140460 + }, + { + "epoch": 5.82, + "grad_norm": 0.5390625, + "learning_rate": 0.0004550642713954245, + "loss": 0.1971, + "step": 140470 + }, + { + "epoch": 5.82, + "grad_norm": 0.71484375, + "learning_rate": 0.0004550580678049453, + "loss": 0.2237, + "step": 140480 + }, + { + "epoch": 5.82, + "grad_norm": 0.71875, + "learning_rate": 0.0004550518638285689, + "loss": 0.1776, + "step": 140490 + }, + { + "epoch": 5.82, + "grad_norm": 0.75390625, + "learning_rate": 0.00045504565946630703, + "loss": 0.2071, + "step": 140500 + }, + { + "epoch": 5.82, + "grad_norm": 0.419921875, + "learning_rate": 0.00045503945471817145, + "loss": 0.2239, + "step": 140510 + }, + { + "epoch": 5.82, + "grad_norm": 0.73046875, + "learning_rate": 0.0004550332495841737, + "loss": 0.2079, + "step": 140520 + }, + { + "epoch": 5.82, + "grad_norm": 0.9609375, + "learning_rate": 0.00045502704406432553, + "loss": 0.221, + "step": 140530 + }, + { + "epoch": 5.82, + "grad_norm": 1.171875, + "learning_rate": 0.0004550208381586386, + "loss": 0.223, + "step": 140540 + }, + { + "epoch": 5.82, + "grad_norm": 0.78125, + "learning_rate": 0.00045501463186712465, + "loss": 0.1855, + "step": 140550 + }, + { + "epoch": 5.82, + "grad_norm": 0.427734375, + "learning_rate": 0.00045500842518979523, + "loss": 0.209, + "step": 140560 + }, + { + "epoch": 5.82, + "grad_norm": 0.9140625, + "learning_rate": 0.0004550022181266621, + "loss": 0.1833, + "step": 140570 + }, + { + "epoch": 5.82, + "grad_norm": 1.8984375, + "learning_rate": 0.000454996010677737, + "loss": 0.211, + "step": 140580 + }, + { + "epoch": 5.82, + "grad_norm": 0.5859375, + "learning_rate": 0.00045498980284303147, + "loss": 0.176, + "step": 140590 + }, + { + "epoch": 5.82, + "grad_norm": 0.734375, + "learning_rate": 0.0004549835946225573, + "loss": 0.1766, + "step": 140600 + }, + { + "epoch": 5.82, + "grad_norm": 0.66015625, + "learning_rate": 0.0004549773860163261, + "loss": 0.1857, + "step": 140610 + }, + { + "epoch": 5.82, + "grad_norm": 0.65625, + "learning_rate": 0.00045497117702434964, + "loss": 0.1992, + "step": 140620 + }, + { + "epoch": 5.82, + "grad_norm": 0.416015625, + "learning_rate": 0.0004549649676466396, + "loss": 0.1703, + "step": 140630 + }, + { + "epoch": 5.83, + "grad_norm": 0.62890625, + "learning_rate": 0.0004549587578832075, + "loss": 0.1948, + "step": 140640 + }, + { + "epoch": 5.83, + "grad_norm": 0.84375, + "learning_rate": 0.00045495254773406525, + "loss": 0.2507, + "step": 140650 + }, + { + "epoch": 5.83, + "grad_norm": 0.90625, + "learning_rate": 0.0004549463371992244, + "loss": 0.2059, + "step": 140660 + }, + { + "epoch": 5.83, + "grad_norm": 0.65234375, + "learning_rate": 0.00045494012627869673, + "loss": 0.2077, + "step": 140670 + }, + { + "epoch": 5.83, + "grad_norm": 1.109375, + "learning_rate": 0.0004549339149724938, + "loss": 0.2021, + "step": 140680 + }, + { + "epoch": 5.83, + "grad_norm": 0.68359375, + "learning_rate": 0.00045492770328062746, + "loss": 0.2139, + "step": 140690 + }, + { + "epoch": 5.83, + "grad_norm": 1.0546875, + "learning_rate": 0.0004549214912031092, + "loss": 0.1808, + "step": 140700 + }, + { + "epoch": 5.83, + "grad_norm": 0.291015625, + "learning_rate": 0.0004549152787399509, + "loss": 0.1945, + "step": 140710 + }, + { + "epoch": 5.83, + "grad_norm": 0.6328125, + "learning_rate": 0.0004549090658911642, + "loss": 0.1736, + "step": 140720 + }, + { + "epoch": 5.83, + "grad_norm": 0.400390625, + "learning_rate": 0.00045490285265676066, + "loss": 0.1825, + "step": 140730 + }, + { + "epoch": 5.83, + "grad_norm": 0.90625, + "learning_rate": 0.00045489663903675216, + "loss": 0.2378, + "step": 140740 + }, + { + "epoch": 5.83, + "grad_norm": 0.76171875, + "learning_rate": 0.0004548904250311503, + "loss": 0.2166, + "step": 140750 + }, + { + "epoch": 5.83, + "grad_norm": 0.66015625, + "learning_rate": 0.00045488421063996675, + "loss": 0.2102, + "step": 140760 + }, + { + "epoch": 5.83, + "grad_norm": 0.359375, + "learning_rate": 0.0004548779958632133, + "loss": 0.1974, + "step": 140770 + }, + { + "epoch": 5.83, + "grad_norm": 0.828125, + "learning_rate": 0.00045487178070090163, + "loss": 0.2509, + "step": 140780 + }, + { + "epoch": 5.83, + "grad_norm": 0.345703125, + "learning_rate": 0.0004548655651530433, + "loss": 0.1991, + "step": 140790 + }, + { + "epoch": 5.83, + "grad_norm": 1.1015625, + "learning_rate": 0.0004548593492196501, + "loss": 0.1949, + "step": 140800 + }, + { + "epoch": 5.83, + "grad_norm": 0.57421875, + "learning_rate": 0.00045485313290073373, + "loss": 0.1765, + "step": 140810 + }, + { + "epoch": 5.83, + "grad_norm": 0.953125, + "learning_rate": 0.0004548469161963059, + "loss": 0.2021, + "step": 140820 + }, + { + "epoch": 5.83, + "grad_norm": 1.2578125, + "learning_rate": 0.0004548406991063784, + "loss": 0.21, + "step": 140830 + }, + { + "epoch": 5.83, + "grad_norm": 0.55078125, + "learning_rate": 0.00045483448163096264, + "loss": 0.195, + "step": 140840 + }, + { + "epoch": 5.83, + "grad_norm": 0.25390625, + "learning_rate": 0.00045482826377007057, + "loss": 0.1832, + "step": 140850 + }, + { + "epoch": 5.83, + "grad_norm": 0.50390625, + "learning_rate": 0.0004548220455237139, + "loss": 0.2033, + "step": 140860 + }, + { + "epoch": 5.83, + "grad_norm": 0.78515625, + "learning_rate": 0.0004548158268919042, + "loss": 0.2046, + "step": 140870 + }, + { + "epoch": 5.84, + "grad_norm": 0.796875, + "learning_rate": 0.00045480960787465326, + "loss": 0.1822, + "step": 140880 + }, + { + "epoch": 5.84, + "grad_norm": 0.373046875, + "learning_rate": 0.00045480338847197273, + "loss": 0.2103, + "step": 140890 + }, + { + "epoch": 5.84, + "grad_norm": 1.109375, + "learning_rate": 0.0004547971686838743, + "loss": 0.1835, + "step": 140900 + }, + { + "epoch": 5.84, + "grad_norm": 0.69140625, + "learning_rate": 0.00045479094851036986, + "loss": 0.2394, + "step": 140910 + }, + { + "epoch": 5.84, + "grad_norm": 0.67578125, + "learning_rate": 0.0004547847279514708, + "loss": 0.1928, + "step": 140920 + }, + { + "epoch": 5.84, + "grad_norm": 0.5390625, + "learning_rate": 0.00045477850700718903, + "loss": 0.176, + "step": 140930 + }, + { + "epoch": 5.84, + "grad_norm": 0.953125, + "learning_rate": 0.0004547722856775363, + "loss": 0.2063, + "step": 140940 + }, + { + "epoch": 5.84, + "grad_norm": 0.53515625, + "learning_rate": 0.00045476606396252424, + "loss": 0.2255, + "step": 140950 + }, + { + "epoch": 5.84, + "grad_norm": 0.4921875, + "learning_rate": 0.0004547598418621645, + "loss": 0.2243, + "step": 140960 + }, + { + "epoch": 5.84, + "grad_norm": 1.5390625, + "learning_rate": 0.00045475361937646886, + "loss": 0.2048, + "step": 140970 + }, + { + "epoch": 5.84, + "grad_norm": 1.15625, + "learning_rate": 0.00045474739650544905, + "loss": 0.1818, + "step": 140980 + }, + { + "epoch": 5.84, + "grad_norm": 0.421875, + "learning_rate": 0.00045474117324911676, + "loss": 0.224, + "step": 140990 + }, + { + "epoch": 5.84, + "grad_norm": 0.279296875, + "learning_rate": 0.0004547349496074836, + "loss": 0.1484, + "step": 141000 + }, + { + "epoch": 5.84, + "grad_norm": 0.65625, + "learning_rate": 0.00045472872558056145, + "loss": 0.2431, + "step": 141010 + }, + { + "epoch": 5.84, + "grad_norm": 0.8984375, + "learning_rate": 0.000454722501168362, + "loss": 0.1909, + "step": 141020 + }, + { + "epoch": 5.84, + "grad_norm": 0.87109375, + "learning_rate": 0.00045471627637089685, + "loss": 0.296, + "step": 141030 + }, + { + "epoch": 5.84, + "grad_norm": 0.703125, + "learning_rate": 0.00045471005118817776, + "loss": 0.174, + "step": 141040 + }, + { + "epoch": 5.84, + "grad_norm": 1.3359375, + "learning_rate": 0.00045470382562021644, + "loss": 0.233, + "step": 141050 + }, + { + "epoch": 5.84, + "grad_norm": 0.83203125, + "learning_rate": 0.0004546975996670247, + "loss": 0.2369, + "step": 141060 + }, + { + "epoch": 5.84, + "grad_norm": 0.28515625, + "learning_rate": 0.0004546913733286141, + "loss": 0.1714, + "step": 141070 + }, + { + "epoch": 5.84, + "grad_norm": 0.81640625, + "learning_rate": 0.0004546851466049965, + "loss": 0.2091, + "step": 141080 + }, + { + "epoch": 5.84, + "grad_norm": 0.90234375, + "learning_rate": 0.00045467891949618354, + "loss": 0.2098, + "step": 141090 + }, + { + "epoch": 5.84, + "grad_norm": 0.8359375, + "learning_rate": 0.000454672692002187, + "loss": 0.2449, + "step": 141100 + }, + { + "epoch": 5.84, + "grad_norm": 1.359375, + "learning_rate": 0.0004546664641230185, + "loss": 0.2083, + "step": 141110 + }, + { + "epoch": 5.85, + "grad_norm": 1.0078125, + "learning_rate": 0.0004546602358586898, + "loss": 0.2498, + "step": 141120 + }, + { + "epoch": 5.85, + "grad_norm": 0.5390625, + "learning_rate": 0.0004546540072092127, + "loss": 0.2272, + "step": 141130 + }, + { + "epoch": 5.85, + "grad_norm": 0.16796875, + "learning_rate": 0.0004546477781745988, + "loss": 0.197, + "step": 141140 + }, + { + "epoch": 5.85, + "grad_norm": 0.62890625, + "learning_rate": 0.0004546415487548599, + "loss": 0.2142, + "step": 141150 + }, + { + "epoch": 5.85, + "grad_norm": 0.54296875, + "learning_rate": 0.00045463531895000774, + "loss": 0.1707, + "step": 141160 + }, + { + "epoch": 5.85, + "grad_norm": 0.69140625, + "learning_rate": 0.000454629088760054, + "loss": 0.1876, + "step": 141170 + }, + { + "epoch": 5.85, + "grad_norm": 0.7109375, + "learning_rate": 0.00045462285818501037, + "loss": 0.1983, + "step": 141180 + }, + { + "epoch": 5.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00045461662722488864, + "loss": 0.2403, + "step": 141190 + }, + { + "epoch": 5.85, + "grad_norm": 0.5234375, + "learning_rate": 0.00045461039587970053, + "loss": 0.2271, + "step": 141200 + }, + { + "epoch": 5.85, + "grad_norm": 0.439453125, + "learning_rate": 0.0004546041641494577, + "loss": 0.2363, + "step": 141210 + }, + { + "epoch": 5.85, + "grad_norm": 0.466796875, + "learning_rate": 0.000454597932034172, + "loss": 0.1892, + "step": 141220 + }, + { + "epoch": 5.85, + "grad_norm": 1.125, + "learning_rate": 0.00045459169953385506, + "loss": 0.23, + "step": 141230 + }, + { + "epoch": 5.85, + "grad_norm": 0.7734375, + "learning_rate": 0.0004545854666485186, + "loss": 0.2418, + "step": 141240 + }, + { + "epoch": 5.85, + "grad_norm": 0.73828125, + "learning_rate": 0.0004545792333781744, + "loss": 0.23, + "step": 141250 + }, + { + "epoch": 5.85, + "grad_norm": 0.54296875, + "learning_rate": 0.0004545729997228343, + "loss": 0.1702, + "step": 141260 + }, + { + "epoch": 5.85, + "grad_norm": 0.22265625, + "learning_rate": 0.00045456676568250974, + "loss": 0.1997, + "step": 141270 + }, + { + "epoch": 5.85, + "grad_norm": 0.82421875, + "learning_rate": 0.0004545605312572127, + "loss": 0.1362, + "step": 141280 + }, + { + "epoch": 5.85, + "grad_norm": 1.2109375, + "learning_rate": 0.0004545542964469548, + "loss": 0.1822, + "step": 141290 + }, + { + "epoch": 5.85, + "grad_norm": 0.8671875, + "learning_rate": 0.0004545480612517478, + "loss": 0.2369, + "step": 141300 + }, + { + "epoch": 5.85, + "grad_norm": 0.875, + "learning_rate": 0.00045454182567160345, + "loss": 0.1773, + "step": 141310 + }, + { + "epoch": 5.85, + "grad_norm": 0.98046875, + "learning_rate": 0.0004545355897065335, + "loss": 0.2238, + "step": 141320 + }, + { + "epoch": 5.85, + "grad_norm": 0.60546875, + "learning_rate": 0.0004545293533565496, + "loss": 0.1843, + "step": 141330 + }, + { + "epoch": 5.85, + "grad_norm": 0.66015625, + "learning_rate": 0.0004545231166216636, + "loss": 0.255, + "step": 141340 + }, + { + "epoch": 5.85, + "grad_norm": 0.3515625, + "learning_rate": 0.00045451687950188714, + "loss": 0.2097, + "step": 141350 + }, + { + "epoch": 5.86, + "grad_norm": 0.4609375, + "learning_rate": 0.000454510641997232, + "loss": 0.2234, + "step": 141360 + }, + { + "epoch": 5.86, + "grad_norm": 0.46875, + "learning_rate": 0.0004545044041077099, + "loss": 0.205, + "step": 141370 + }, + { + "epoch": 5.86, + "grad_norm": 1.0546875, + "learning_rate": 0.00045449816583333265, + "loss": 0.1979, + "step": 141380 + }, + { + "epoch": 5.86, + "grad_norm": 0.609375, + "learning_rate": 0.0004544919271741119, + "loss": 0.2354, + "step": 141390 + }, + { + "epoch": 5.86, + "grad_norm": 0.5859375, + "learning_rate": 0.0004544856881300594, + "loss": 0.2403, + "step": 141400 + }, + { + "epoch": 5.86, + "grad_norm": 0.671875, + "learning_rate": 0.00045447944870118703, + "loss": 0.1577, + "step": 141410 + }, + { + "epoch": 5.86, + "grad_norm": 0.64453125, + "learning_rate": 0.0004544732088875063, + "loss": 0.1696, + "step": 141420 + }, + { + "epoch": 5.86, + "grad_norm": 1.2109375, + "learning_rate": 0.00045446696868902916, + "loss": 0.2131, + "step": 141430 + }, + { + "epoch": 5.86, + "grad_norm": 0.4296875, + "learning_rate": 0.0004544607281057672, + "loss": 0.236, + "step": 141440 + }, + { + "epoch": 5.86, + "grad_norm": 1.359375, + "learning_rate": 0.0004544544871377323, + "loss": 0.2464, + "step": 141450 + }, + { + "epoch": 5.86, + "grad_norm": 0.30859375, + "learning_rate": 0.00045444824578493606, + "loss": 0.2456, + "step": 141460 + }, + { + "epoch": 5.86, + "grad_norm": 0.30078125, + "learning_rate": 0.00045444200404739035, + "loss": 0.1533, + "step": 141470 + }, + { + "epoch": 5.86, + "grad_norm": 2.828125, + "learning_rate": 0.0004544357619251068, + "loss": 0.1728, + "step": 141480 + }, + { + "epoch": 5.86, + "grad_norm": 0.55078125, + "learning_rate": 0.0004544295194180973, + "loss": 0.1937, + "step": 141490 + }, + { + "epoch": 5.86, + "grad_norm": 0.71875, + "learning_rate": 0.0004544232765263735, + "loss": 0.1932, + "step": 141500 + }, + { + "epoch": 5.86, + "grad_norm": 1.0, + "learning_rate": 0.0004544170332499471, + "loss": 0.2124, + "step": 141510 + }, + { + "epoch": 5.86, + "grad_norm": 0.1845703125, + "learning_rate": 0.00045441078958882996, + "loss": 0.1956, + "step": 141520 + }, + { + "epoch": 5.86, + "grad_norm": 1.4921875, + "learning_rate": 0.0004544045455430338, + "loss": 0.2891, + "step": 141530 + }, + { + "epoch": 5.86, + "grad_norm": 0.63671875, + "learning_rate": 0.00045439830111257043, + "loss": 0.2017, + "step": 141540 + }, + { + "epoch": 5.86, + "grad_norm": 0.9765625, + "learning_rate": 0.0004543920562974514, + "loss": 0.2516, + "step": 141550 + }, + { + "epoch": 5.86, + "grad_norm": 0.8125, + "learning_rate": 0.00045438581109768865, + "loss": 0.2391, + "step": 141560 + }, + { + "epoch": 5.86, + "grad_norm": 0.79296875, + "learning_rate": 0.0004543795655132939, + "loss": 0.2107, + "step": 141570 + }, + { + "epoch": 5.86, + "grad_norm": 0.26953125, + "learning_rate": 0.00045437331954427884, + "loss": 0.2053, + "step": 141580 + }, + { + "epoch": 5.86, + "grad_norm": 0.7109375, + "learning_rate": 0.0004543670731906553, + "loss": 0.217, + "step": 141590 + }, + { + "epoch": 5.87, + "grad_norm": 1.4296875, + "learning_rate": 0.00045436082645243503, + "loss": 0.2657, + "step": 141600 + }, + { + "epoch": 5.87, + "grad_norm": 0.36328125, + "learning_rate": 0.00045435457932962965, + "loss": 0.214, + "step": 141610 + }, + { + "epoch": 5.87, + "grad_norm": 0.9921875, + "learning_rate": 0.0004543483318222511, + "loss": 0.1939, + "step": 141620 + }, + { + "epoch": 5.87, + "grad_norm": 0.7421875, + "learning_rate": 0.00045434208393031105, + "loss": 0.2084, + "step": 141630 + }, + { + "epoch": 5.87, + "grad_norm": 0.72265625, + "learning_rate": 0.0004543358356538212, + "loss": 0.1973, + "step": 141640 + }, + { + "epoch": 5.87, + "grad_norm": 0.5859375, + "learning_rate": 0.00045432958699279346, + "loss": 0.1994, + "step": 141650 + }, + { + "epoch": 5.87, + "grad_norm": 1.0234375, + "learning_rate": 0.00045432333794723944, + "loss": 0.1978, + "step": 141660 + }, + { + "epoch": 5.87, + "grad_norm": 0.6875, + "learning_rate": 0.000454317088517171, + "loss": 0.2665, + "step": 141670 + }, + { + "epoch": 5.87, + "grad_norm": 0.62890625, + "learning_rate": 0.0004543108387025998, + "loss": 0.2555, + "step": 141680 + }, + { + "epoch": 5.87, + "grad_norm": 0.55078125, + "learning_rate": 0.0004543045885035377, + "loss": 0.2066, + "step": 141690 + }, + { + "epoch": 5.87, + "grad_norm": 0.98046875, + "learning_rate": 0.0004542983379199965, + "loss": 0.1539, + "step": 141700 + }, + { + "epoch": 5.87, + "grad_norm": 0.4296875, + "learning_rate": 0.0004542920869519879, + "loss": 0.2178, + "step": 141710 + }, + { + "epoch": 5.87, + "grad_norm": 0.68359375, + "learning_rate": 0.0004542858355995235, + "loss": 0.2334, + "step": 141720 + }, + { + "epoch": 5.87, + "grad_norm": 0.453125, + "learning_rate": 0.00045427958386261535, + "loss": 0.2465, + "step": 141730 + }, + { + "epoch": 5.87, + "grad_norm": 0.91796875, + "learning_rate": 0.00045427333174127507, + "loss": 0.2276, + "step": 141740 + }, + { + "epoch": 5.87, + "grad_norm": 0.61328125, + "learning_rate": 0.00045426707923551437, + "loss": 0.2152, + "step": 141750 + }, + { + "epoch": 5.87, + "grad_norm": 0.5703125, + "learning_rate": 0.00045426082634534513, + "loss": 0.1605, + "step": 141760 + }, + { + "epoch": 5.87, + "grad_norm": 0.412109375, + "learning_rate": 0.00045425457307077905, + "loss": 0.1844, + "step": 141770 + }, + { + "epoch": 5.87, + "grad_norm": 1.46875, + "learning_rate": 0.0004542483194118279, + "loss": 0.2034, + "step": 141780 + }, + { + "epoch": 5.87, + "grad_norm": 0.859375, + "learning_rate": 0.0004542420653685035, + "loss": 0.2204, + "step": 141790 + }, + { + "epoch": 5.87, + "grad_norm": 0.51171875, + "learning_rate": 0.0004542358109408177, + "loss": 0.1547, + "step": 141800 + }, + { + "epoch": 5.87, + "grad_norm": 0.7265625, + "learning_rate": 0.00045422955612878203, + "loss": 0.1924, + "step": 141810 + }, + { + "epoch": 5.87, + "grad_norm": 0.9921875, + "learning_rate": 0.0004542233009324084, + "loss": 0.2051, + "step": 141820 + }, + { + "epoch": 5.87, + "grad_norm": 0.6640625, + "learning_rate": 0.0004542170453517086, + "loss": 0.1739, + "step": 141830 + }, + { + "epoch": 5.87, + "grad_norm": 0.7109375, + "learning_rate": 0.00045421078938669433, + "loss": 0.1588, + "step": 141840 + }, + { + "epoch": 5.88, + "grad_norm": 0.87890625, + "learning_rate": 0.0004542045330373774, + "loss": 0.2202, + "step": 141850 + }, + { + "epoch": 5.88, + "grad_norm": 1.4375, + "learning_rate": 0.00045419827630376964, + "loss": 0.2615, + "step": 141860 + }, + { + "epoch": 5.88, + "grad_norm": 0.90234375, + "learning_rate": 0.0004541920191858828, + "loss": 0.1951, + "step": 141870 + }, + { + "epoch": 5.88, + "grad_norm": 0.70703125, + "learning_rate": 0.0004541857616837286, + "loss": 0.2012, + "step": 141880 + }, + { + "epoch": 5.88, + "grad_norm": 0.96875, + "learning_rate": 0.0004541795037973189, + "loss": 0.1686, + "step": 141890 + }, + { + "epoch": 5.88, + "grad_norm": 0.5625, + "learning_rate": 0.0004541732455266653, + "loss": 0.2054, + "step": 141900 + }, + { + "epoch": 5.88, + "grad_norm": 0.53515625, + "learning_rate": 0.00045416698687177976, + "loss": 0.218, + "step": 141910 + }, + { + "epoch": 5.88, + "grad_norm": 1.6484375, + "learning_rate": 0.000454160727832674, + "loss": 0.2512, + "step": 141920 + }, + { + "epoch": 5.88, + "grad_norm": 0.69921875, + "learning_rate": 0.0004541544684093598, + "loss": 0.1907, + "step": 141930 + }, + { + "epoch": 5.88, + "grad_norm": 0.466796875, + "learning_rate": 0.000454148208601849, + "loss": 0.221, + "step": 141940 + }, + { + "epoch": 5.88, + "grad_norm": 0.46875, + "learning_rate": 0.00045414194841015323, + "loss": 0.1713, + "step": 141950 + }, + { + "epoch": 5.88, + "grad_norm": 0.9375, + "learning_rate": 0.00045413568783428436, + "loss": 0.2024, + "step": 141960 + }, + { + "epoch": 5.88, + "grad_norm": 1.0625, + "learning_rate": 0.00045412942687425426, + "loss": 0.2371, + "step": 141970 + }, + { + "epoch": 5.88, + "grad_norm": 0.2470703125, + "learning_rate": 0.0004541231655300745, + "loss": 0.1794, + "step": 141980 + }, + { + "epoch": 5.88, + "grad_norm": 0.578125, + "learning_rate": 0.0004541169038017571, + "loss": 0.2074, + "step": 141990 + }, + { + "epoch": 5.88, + "grad_norm": 0.7265625, + "learning_rate": 0.00045411064168931364, + "loss": 0.2305, + "step": 142000 + }, + { + "epoch": 5.88, + "grad_norm": 0.337890625, + "learning_rate": 0.00045410437919275607, + "loss": 0.1912, + "step": 142010 + }, + { + "epoch": 5.88, + "grad_norm": 0.67578125, + "learning_rate": 0.0004540981163120961, + "loss": 0.2, + "step": 142020 + }, + { + "epoch": 5.88, + "grad_norm": 0.82421875, + "learning_rate": 0.0004540918530473454, + "loss": 0.2052, + "step": 142030 + }, + { + "epoch": 5.88, + "grad_norm": 1.203125, + "learning_rate": 0.00045408558939851596, + "loss": 0.2086, + "step": 142040 + }, + { + "epoch": 5.88, + "grad_norm": 1.109375, + "learning_rate": 0.00045407932536561946, + "loss": 0.229, + "step": 142050 + }, + { + "epoch": 5.88, + "grad_norm": 1.03125, + "learning_rate": 0.00045407306094866776, + "loss": 0.194, + "step": 142060 + }, + { + "epoch": 5.88, + "grad_norm": 1.0, + "learning_rate": 0.00045406679614767257, + "loss": 0.2097, + "step": 142070 + }, + { + "epoch": 5.88, + "grad_norm": 1.1328125, + "learning_rate": 0.00045406053096264565, + "loss": 0.1958, + "step": 142080 + }, + { + "epoch": 5.89, + "grad_norm": 0.66015625, + "learning_rate": 0.0004540542653935988, + "loss": 0.1959, + "step": 142090 + }, + { + "epoch": 5.89, + "grad_norm": 0.62109375, + "learning_rate": 0.00045404799944054395, + "loss": 0.2348, + "step": 142100 + }, + { + "epoch": 5.89, + "grad_norm": 0.62109375, + "learning_rate": 0.0004540417331034928, + "loss": 0.1989, + "step": 142110 + }, + { + "epoch": 5.89, + "grad_norm": 1.234375, + "learning_rate": 0.00045403546638245716, + "loss": 0.1999, + "step": 142120 + }, + { + "epoch": 5.89, + "grad_norm": 0.486328125, + "learning_rate": 0.00045402919927744887, + "loss": 0.266, + "step": 142130 + }, + { + "epoch": 5.89, + "grad_norm": 0.6328125, + "learning_rate": 0.00045402293178847954, + "loss": 0.2185, + "step": 142140 + }, + { + "epoch": 5.89, + "grad_norm": 0.53125, + "learning_rate": 0.0004540166639155611, + "loss": 0.1997, + "step": 142150 + }, + { + "epoch": 5.89, + "grad_norm": 0.85546875, + "learning_rate": 0.00045401039565870537, + "loss": 0.2059, + "step": 142160 + }, + { + "epoch": 5.89, + "grad_norm": 0.7890625, + "learning_rate": 0.0004540041270179241, + "loss": 0.2118, + "step": 142170 + }, + { + "epoch": 5.89, + "grad_norm": 1.4140625, + "learning_rate": 0.0004539978579932291, + "loss": 0.2229, + "step": 142180 + }, + { + "epoch": 5.89, + "grad_norm": 1.328125, + "learning_rate": 0.00045399158858463215, + "loss": 0.1633, + "step": 142190 + }, + { + "epoch": 5.89, + "grad_norm": 0.447265625, + "learning_rate": 0.000453985318792145, + "loss": 0.2428, + "step": 142200 + }, + { + "epoch": 5.89, + "grad_norm": 2.421875, + "learning_rate": 0.0004539790486157796, + "loss": 0.241, + "step": 142210 + }, + { + "epoch": 5.89, + "grad_norm": 0.91796875, + "learning_rate": 0.0004539727780555476, + "loss": 0.2349, + "step": 142220 + }, + { + "epoch": 5.89, + "grad_norm": 1.109375, + "learning_rate": 0.00045396650711146093, + "loss": 0.1772, + "step": 142230 + }, + { + "epoch": 5.89, + "grad_norm": 1.3515625, + "learning_rate": 0.0004539602357835312, + "loss": 0.1881, + "step": 142240 + }, + { + "epoch": 5.89, + "grad_norm": 0.6328125, + "learning_rate": 0.00045395396407177044, + "loss": 0.2337, + "step": 142250 + }, + { + "epoch": 5.89, + "grad_norm": 0.99609375, + "learning_rate": 0.0004539476919761903, + "loss": 0.223, + "step": 142260 + }, + { + "epoch": 5.89, + "grad_norm": 0.9453125, + "learning_rate": 0.0004539414194968026, + "loss": 0.1788, + "step": 142270 + }, + { + "epoch": 5.89, + "grad_norm": 0.87890625, + "learning_rate": 0.00045393514663361924, + "loss": 0.1965, + "step": 142280 + }, + { + "epoch": 5.89, + "grad_norm": 0.60546875, + "learning_rate": 0.0004539288733866519, + "loss": 0.2336, + "step": 142290 + }, + { + "epoch": 5.89, + "grad_norm": 0.9765625, + "learning_rate": 0.0004539225997559124, + "loss": 0.2025, + "step": 142300 + }, + { + "epoch": 5.89, + "grad_norm": 0.404296875, + "learning_rate": 0.00045391632574141275, + "loss": 0.2125, + "step": 142310 + }, + { + "epoch": 5.89, + "grad_norm": 0.79296875, + "learning_rate": 0.0004539100513431645, + "loss": 0.2268, + "step": 142320 + }, + { + "epoch": 5.9, + "grad_norm": 0.296875, + "learning_rate": 0.00045390377656117953, + "loss": 0.2543, + "step": 142330 + }, + { + "epoch": 5.9, + "grad_norm": 0.490234375, + "learning_rate": 0.0004538975013954697, + "loss": 0.2116, + "step": 142340 + }, + { + "epoch": 5.9, + "grad_norm": 0.99609375, + "learning_rate": 0.00045389122584604683, + "loss": 0.1906, + "step": 142350 + }, + { + "epoch": 5.9, + "grad_norm": 1.1484375, + "learning_rate": 0.0004538849499129226, + "loss": 0.2637, + "step": 142360 + }, + { + "epoch": 5.9, + "grad_norm": 0.859375, + "learning_rate": 0.00045387867359610897, + "loss": 0.1842, + "step": 142370 + }, + { + "epoch": 5.9, + "grad_norm": 0.83984375, + "learning_rate": 0.00045387239689561763, + "loss": 0.2054, + "step": 142380 + }, + { + "epoch": 5.9, + "grad_norm": 0.388671875, + "learning_rate": 0.0004538661198114606, + "loss": 0.1621, + "step": 142390 + }, + { + "epoch": 5.9, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004538598423436494, + "loss": 0.2204, + "step": 142400 + }, + { + "epoch": 5.9, + "grad_norm": 0.9296875, + "learning_rate": 0.00045385356449219604, + "loss": 0.2493, + "step": 142410 + }, + { + "epoch": 5.9, + "grad_norm": 1.25, + "learning_rate": 0.00045384728625711237, + "loss": 0.1944, + "step": 142420 + }, + { + "epoch": 5.9, + "grad_norm": 0.69140625, + "learning_rate": 0.00045384100763841, + "loss": 0.2307, + "step": 142430 + }, + { + "epoch": 5.9, + "grad_norm": 0.96875, + "learning_rate": 0.0004538347286361009, + "loss": 0.219, + "step": 142440 + }, + { + "epoch": 5.9, + "grad_norm": 0.734375, + "learning_rate": 0.0004538284492501969, + "loss": 0.2456, + "step": 142450 + }, + { + "epoch": 5.9, + "grad_norm": 0.62109375, + "learning_rate": 0.00045382216948070973, + "loss": 0.2103, + "step": 142460 + }, + { + "epoch": 5.9, + "grad_norm": 0.7734375, + "learning_rate": 0.0004538158893276513, + "loss": 0.2026, + "step": 142470 + }, + { + "epoch": 5.9, + "grad_norm": 0.70703125, + "learning_rate": 0.00045380960879103327, + "loss": 0.247, + "step": 142480 + }, + { + "epoch": 5.9, + "grad_norm": 0.5234375, + "learning_rate": 0.00045380332787086763, + "loss": 0.2112, + "step": 142490 + }, + { + "epoch": 5.9, + "grad_norm": 0.498046875, + "learning_rate": 0.0004537970465671661, + "loss": 0.2303, + "step": 142500 + }, + { + "epoch": 5.9, + "grad_norm": 0.59375, + "learning_rate": 0.00045379076487994067, + "loss": 0.2164, + "step": 142510 + }, + { + "epoch": 5.9, + "grad_norm": 0.53515625, + "learning_rate": 0.0004537844828092029, + "loss": 0.1903, + "step": 142520 + }, + { + "epoch": 5.9, + "grad_norm": 0.69140625, + "learning_rate": 0.0004537782003549648, + "loss": 0.2048, + "step": 142530 + }, + { + "epoch": 5.9, + "grad_norm": 0.37890625, + "learning_rate": 0.000453771917517238, + "loss": 0.2428, + "step": 142540 + }, + { + "epoch": 5.9, + "grad_norm": 1.3984375, + "learning_rate": 0.00045376563429603455, + "loss": 0.1883, + "step": 142550 + }, + { + "epoch": 5.9, + "grad_norm": 1.6484375, + "learning_rate": 0.0004537593506913662, + "loss": 0.1975, + "step": 142560 + }, + { + "epoch": 5.91, + "grad_norm": 0.298828125, + "learning_rate": 0.0004537530667032448, + "loss": 0.2267, + "step": 142570 + }, + { + "epoch": 5.91, + "grad_norm": 1.6796875, + "learning_rate": 0.00045374678233168206, + "loss": 0.2259, + "step": 142580 + }, + { + "epoch": 5.91, + "grad_norm": 0.58984375, + "learning_rate": 0.00045374049757668987, + "loss": 0.2381, + "step": 142590 + }, + { + "epoch": 5.91, + "grad_norm": 0.63671875, + "learning_rate": 0.0004537342124382801, + "loss": 0.181, + "step": 142600 + }, + { + "epoch": 5.91, + "grad_norm": 0.40234375, + "learning_rate": 0.00045372792691646455, + "loss": 0.1875, + "step": 142610 + }, + { + "epoch": 5.91, + "grad_norm": 0.3515625, + "learning_rate": 0.000453721641011255, + "loss": 0.1469, + "step": 142620 + }, + { + "epoch": 5.91, + "grad_norm": 0.91015625, + "learning_rate": 0.00045371535472266334, + "loss": 0.2038, + "step": 142630 + }, + { + "epoch": 5.91, + "grad_norm": 0.62890625, + "learning_rate": 0.00045370906805070147, + "loss": 0.1996, + "step": 142640 + }, + { + "epoch": 5.91, + "grad_norm": 1.09375, + "learning_rate": 0.000453702780995381, + "loss": 0.1957, + "step": 142650 + }, + { + "epoch": 5.91, + "grad_norm": 0.50390625, + "learning_rate": 0.00045369649355671396, + "loss": 0.1904, + "step": 142660 + }, + { + "epoch": 5.91, + "grad_norm": 0.67578125, + "learning_rate": 0.0004536902057347121, + "loss": 0.2175, + "step": 142670 + }, + { + "epoch": 5.91, + "grad_norm": 0.5546875, + "learning_rate": 0.00045368391752938724, + "loss": 0.1643, + "step": 142680 + }, + { + "epoch": 5.91, + "grad_norm": 0.921875, + "learning_rate": 0.00045367762894075125, + "loss": 0.2277, + "step": 142690 + }, + { + "epoch": 5.91, + "grad_norm": 0.75390625, + "learning_rate": 0.00045367133996881607, + "loss": 0.2014, + "step": 142700 + }, + { + "epoch": 5.91, + "grad_norm": 0.69140625, + "learning_rate": 0.0004536650506135933, + "loss": 0.1929, + "step": 142710 + }, + { + "epoch": 5.91, + "grad_norm": 2.84375, + "learning_rate": 0.00045365876087509493, + "loss": 0.2045, + "step": 142720 + }, + { + "epoch": 5.91, + "grad_norm": 1.4453125, + "learning_rate": 0.0004536524707533327, + "loss": 0.1778, + "step": 142730 + }, + { + "epoch": 5.91, + "grad_norm": 1.25, + "learning_rate": 0.0004536461802483186, + "loss": 0.2075, + "step": 142740 + }, + { + "epoch": 5.91, + "grad_norm": 0.64453125, + "learning_rate": 0.0004536398893600644, + "loss": 0.216, + "step": 142750 + }, + { + "epoch": 5.91, + "grad_norm": 0.98828125, + "learning_rate": 0.00045363359808858186, + "loss": 0.1723, + "step": 142760 + }, + { + "epoch": 5.91, + "grad_norm": 0.5546875, + "learning_rate": 0.00045362730643388295, + "loss": 0.1863, + "step": 142770 + }, + { + "epoch": 5.91, + "grad_norm": 0.1328125, + "learning_rate": 0.00045362101439597934, + "loss": 0.1908, + "step": 142780 + }, + { + "epoch": 5.91, + "grad_norm": 0.31640625, + "learning_rate": 0.00045361472197488306, + "loss": 0.2716, + "step": 142790 + }, + { + "epoch": 5.91, + "grad_norm": 0.98046875, + "learning_rate": 0.0004536084291706058, + "loss": 0.231, + "step": 142800 + }, + { + "epoch": 5.92, + "grad_norm": 0.9453125, + "learning_rate": 0.00045360213598315946, + "loss": 0.2043, + "step": 142810 + }, + { + "epoch": 5.92, + "grad_norm": 0.2041015625, + "learning_rate": 0.00045359584241255594, + "loss": 0.209, + "step": 142820 + }, + { + "epoch": 5.92, + "grad_norm": 0.58203125, + "learning_rate": 0.000453589548458807, + "loss": 0.1973, + "step": 142830 + }, + { + "epoch": 5.92, + "grad_norm": 0.9375, + "learning_rate": 0.00045358325412192454, + "loss": 0.2664, + "step": 142840 + }, + { + "epoch": 5.92, + "grad_norm": 0.515625, + "learning_rate": 0.00045357695940192034, + "loss": 0.264, + "step": 142850 + }, + { + "epoch": 5.92, + "grad_norm": 0.73828125, + "learning_rate": 0.0004535706642988063, + "loss": 0.2004, + "step": 142860 + }, + { + "epoch": 5.92, + "grad_norm": 0.404296875, + "learning_rate": 0.0004535643688125943, + "loss": 0.2074, + "step": 142870 + }, + { + "epoch": 5.92, + "grad_norm": 0.443359375, + "learning_rate": 0.0004535580729432961, + "loss": 0.1781, + "step": 142880 + }, + { + "epoch": 5.92, + "grad_norm": 0.7421875, + "learning_rate": 0.00045355177669092355, + "loss": 0.2067, + "step": 142890 + }, + { + "epoch": 5.92, + "grad_norm": 0.392578125, + "learning_rate": 0.00045354548005548855, + "loss": 0.195, + "step": 142900 + }, + { + "epoch": 5.92, + "grad_norm": 0.283203125, + "learning_rate": 0.000453539183037003, + "loss": 0.2007, + "step": 142910 + }, + { + "epoch": 5.92, + "grad_norm": 0.4453125, + "learning_rate": 0.00045353288563547867, + "loss": 0.1622, + "step": 142920 + }, + { + "epoch": 5.92, + "grad_norm": 0.8203125, + "learning_rate": 0.0004535265878509274, + "loss": 0.2296, + "step": 142930 + }, + { + "epoch": 5.92, + "grad_norm": 0.953125, + "learning_rate": 0.00045352028968336113, + "loss": 0.1949, + "step": 142940 + }, + { + "epoch": 5.92, + "grad_norm": 0.9375, + "learning_rate": 0.00045351399113279157, + "loss": 0.1999, + "step": 142950 + }, + { + "epoch": 5.92, + "grad_norm": 0.64453125, + "learning_rate": 0.00045350769219923073, + "loss": 0.1613, + "step": 142960 + }, + { + "epoch": 5.92, + "grad_norm": 0.3671875, + "learning_rate": 0.00045350139288269034, + "loss": 0.2016, + "step": 142970 + }, + { + "epoch": 5.92, + "grad_norm": 0.47265625, + "learning_rate": 0.00045349509318318225, + "loss": 0.2124, + "step": 142980 + }, + { + "epoch": 5.92, + "grad_norm": 0.7890625, + "learning_rate": 0.00045348879310071845, + "loss": 0.2191, + "step": 142990 + }, + { + "epoch": 5.92, + "grad_norm": 0.671875, + "learning_rate": 0.0004534824926353107, + "loss": 0.1965, + "step": 143000 + }, + { + "epoch": 5.92, + "grad_norm": 0.388671875, + "learning_rate": 0.00045347619178697093, + "loss": 0.1693, + "step": 143010 + }, + { + "epoch": 5.92, + "grad_norm": 1.8046875, + "learning_rate": 0.00045346989055571085, + "loss": 0.2409, + "step": 143020 + }, + { + "epoch": 5.92, + "grad_norm": 0.6484375, + "learning_rate": 0.0004534635889415425, + "loss": 0.2116, + "step": 143030 + }, + { + "epoch": 5.92, + "grad_norm": 0.6796875, + "learning_rate": 0.0004534572869444775, + "loss": 0.2449, + "step": 143040 + }, + { + "epoch": 5.93, + "grad_norm": 0.185546875, + "learning_rate": 0.000453450984564528, + "loss": 0.2342, + "step": 143050 + }, + { + "epoch": 5.93, + "grad_norm": 0.63671875, + "learning_rate": 0.00045344468180170565, + "loss": 0.1783, + "step": 143060 + }, + { + "epoch": 5.93, + "grad_norm": 2.609375, + "learning_rate": 0.00045343837865602246, + "loss": 0.2074, + "step": 143070 + }, + { + "epoch": 5.93, + "grad_norm": 0.625, + "learning_rate": 0.00045343207512749014, + "loss": 0.1559, + "step": 143080 + }, + { + "epoch": 5.93, + "grad_norm": 1.015625, + "learning_rate": 0.0004534257712161206, + "loss": 0.2014, + "step": 143090 + }, + { + "epoch": 5.93, + "grad_norm": 1.09375, + "learning_rate": 0.00045341946692192576, + "loss": 0.204, + "step": 143100 + }, + { + "epoch": 5.93, + "grad_norm": 1.2734375, + "learning_rate": 0.0004534131622449175, + "loss": 0.1675, + "step": 143110 + }, + { + "epoch": 5.93, + "grad_norm": 1.15625, + "learning_rate": 0.00045340685718510756, + "loss": 0.2144, + "step": 143120 + }, + { + "epoch": 5.93, + "grad_norm": 1.0703125, + "learning_rate": 0.00045340055174250787, + "loss": 0.2159, + "step": 143130 + }, + { + "epoch": 5.93, + "grad_norm": 1.1171875, + "learning_rate": 0.0004533942459171304, + "loss": 0.183, + "step": 143140 + }, + { + "epoch": 5.93, + "grad_norm": 0.55859375, + "learning_rate": 0.0004533879397089868, + "loss": 0.262, + "step": 143150 + }, + { + "epoch": 5.93, + "grad_norm": 0.21875, + "learning_rate": 0.00045338163311808914, + "loss": 0.2667, + "step": 143160 + }, + { + "epoch": 5.93, + "grad_norm": 0.71875, + "learning_rate": 0.00045337532614444923, + "loss": 0.2442, + "step": 143170 + }, + { + "epoch": 5.93, + "grad_norm": 0.59375, + "learning_rate": 0.0004533690187880789, + "loss": 0.1769, + "step": 143180 + }, + { + "epoch": 5.93, + "grad_norm": 0.46484375, + "learning_rate": 0.00045336271104899, + "loss": 0.2142, + "step": 143190 + }, + { + "epoch": 5.93, + "grad_norm": 0.76171875, + "learning_rate": 0.0004533564029271945, + "loss": 0.2109, + "step": 143200 + }, + { + "epoch": 5.93, + "grad_norm": 0.87890625, + "learning_rate": 0.0004533500944227041, + "loss": 0.1786, + "step": 143210 + }, + { + "epoch": 5.93, + "grad_norm": 0.9296875, + "learning_rate": 0.0004533437855355309, + "loss": 0.2158, + "step": 143220 + }, + { + "epoch": 5.93, + "grad_norm": 0.6796875, + "learning_rate": 0.00045333747626568667, + "loss": 0.2079, + "step": 143230 + }, + { + "epoch": 5.93, + "grad_norm": 1.078125, + "learning_rate": 0.00045333116661318317, + "loss": 0.2131, + "step": 143240 + }, + { + "epoch": 5.93, + "grad_norm": 0.6171875, + "learning_rate": 0.0004533248565780324, + "loss": 0.2381, + "step": 143250 + }, + { + "epoch": 5.93, + "grad_norm": 1.1796875, + "learning_rate": 0.00045331854616024623, + "loss": 0.1542, + "step": 143260 + }, + { + "epoch": 5.93, + "grad_norm": 0.859375, + "learning_rate": 0.00045331223535983653, + "loss": 0.2394, + "step": 143270 + }, + { + "epoch": 5.93, + "grad_norm": 0.65625, + "learning_rate": 0.0004533059241768151, + "loss": 0.1848, + "step": 143280 + }, + { + "epoch": 5.94, + "grad_norm": 0.67578125, + "learning_rate": 0.0004532996126111939, + "loss": 0.2263, + "step": 143290 + }, + { + "epoch": 5.94, + "grad_norm": 1.0703125, + "learning_rate": 0.0004532933006629848, + "loss": 0.2233, + "step": 143300 + }, + { + "epoch": 5.94, + "grad_norm": 0.55859375, + "learning_rate": 0.0004532869883321997, + "loss": 0.1892, + "step": 143310 + }, + { + "epoch": 5.94, + "grad_norm": 1.0546875, + "learning_rate": 0.00045328067561885033, + "loss": 0.211, + "step": 143320 + }, + { + "epoch": 5.94, + "grad_norm": 2.09375, + "learning_rate": 0.0004532743625229487, + "loss": 0.211, + "step": 143330 + }, + { + "epoch": 5.94, + "grad_norm": 0.484375, + "learning_rate": 0.0004532680490445068, + "loss": 0.1895, + "step": 143340 + }, + { + "epoch": 5.94, + "grad_norm": 1.65625, + "learning_rate": 0.00045326173518353633, + "loss": 0.2534, + "step": 143350 + }, + { + "epoch": 5.94, + "grad_norm": 1.0078125, + "learning_rate": 0.0004532554209400491, + "loss": 0.1946, + "step": 143360 + }, + { + "epoch": 5.94, + "grad_norm": 0.796875, + "learning_rate": 0.00045324910631405725, + "loss": 0.2381, + "step": 143370 + }, + { + "epoch": 5.94, + "grad_norm": 1.9921875, + "learning_rate": 0.0004532427913055725, + "loss": 0.2174, + "step": 143380 + }, + { + "epoch": 5.94, + "grad_norm": 0.796875, + "learning_rate": 0.00045323647591460675, + "loss": 0.2075, + "step": 143390 + }, + { + "epoch": 5.94, + "grad_norm": 0.78515625, + "learning_rate": 0.0004532301601411719, + "loss": 0.2151, + "step": 143400 + }, + { + "epoch": 5.94, + "grad_norm": 1.109375, + "learning_rate": 0.0004532238439852798, + "loss": 0.2078, + "step": 143410 + }, + { + "epoch": 5.94, + "grad_norm": 0.703125, + "learning_rate": 0.00045321752744694247, + "loss": 0.174, + "step": 143420 + }, + { + "epoch": 5.94, + "grad_norm": 0.48046875, + "learning_rate": 0.00045321121052617166, + "loss": 0.1975, + "step": 143430 + }, + { + "epoch": 5.94, + "grad_norm": 1.15625, + "learning_rate": 0.0004532048932229793, + "loss": 0.1669, + "step": 143440 + }, + { + "epoch": 5.94, + "grad_norm": 0.6484375, + "learning_rate": 0.0004531985755373772, + "loss": 0.1919, + "step": 143450 + }, + { + "epoch": 5.94, + "grad_norm": 0.73046875, + "learning_rate": 0.0004531922574693774, + "loss": 0.1925, + "step": 143460 + }, + { + "epoch": 5.94, + "grad_norm": 0.44140625, + "learning_rate": 0.0004531859390189917, + "loss": 0.2014, + "step": 143470 + }, + { + "epoch": 5.94, + "grad_norm": 0.5546875, + "learning_rate": 0.000453179620186232, + "loss": 0.2345, + "step": 143480 + }, + { + "epoch": 5.94, + "grad_norm": 0.412109375, + "learning_rate": 0.0004531733009711102, + "loss": 0.2191, + "step": 143490 + }, + { + "epoch": 5.94, + "grad_norm": 0.81640625, + "learning_rate": 0.0004531669813736382, + "loss": 0.258, + "step": 143500 + }, + { + "epoch": 5.94, + "grad_norm": 1.5390625, + "learning_rate": 0.0004531606613938278, + "loss": 0.2146, + "step": 143510 + }, + { + "epoch": 5.94, + "grad_norm": 2.125, + "learning_rate": 0.00045315434103169105, + "loss": 0.2301, + "step": 143520 + }, + { + "epoch": 5.94, + "grad_norm": 0.412109375, + "learning_rate": 0.0004531480202872398, + "loss": 0.2177, + "step": 143530 + }, + { + "epoch": 5.95, + "grad_norm": 0.66015625, + "learning_rate": 0.00045314169916048586, + "loss": 0.2103, + "step": 143540 + }, + { + "epoch": 5.95, + "grad_norm": 0.59765625, + "learning_rate": 0.00045313537765144117, + "loss": 0.24, + "step": 143550 + }, + { + "epoch": 5.95, + "grad_norm": 0.85546875, + "learning_rate": 0.0004531290557601177, + "loss": 0.2001, + "step": 143560 + }, + { + "epoch": 5.95, + "grad_norm": 1.375, + "learning_rate": 0.00045312273348652724, + "loss": 0.2262, + "step": 143570 + }, + { + "epoch": 5.95, + "grad_norm": 1.0625, + "learning_rate": 0.00045311641083068175, + "loss": 0.2694, + "step": 143580 + }, + { + "epoch": 5.95, + "grad_norm": 0.73828125, + "learning_rate": 0.00045311008779259313, + "loss": 0.1893, + "step": 143590 + }, + { + "epoch": 5.95, + "grad_norm": 0.52734375, + "learning_rate": 0.00045310376437227316, + "loss": 0.2044, + "step": 143600 + }, + { + "epoch": 5.95, + "grad_norm": 0.83984375, + "learning_rate": 0.0004530974405697339, + "loss": 0.2205, + "step": 143610 + }, + { + "epoch": 5.95, + "grad_norm": 0.84765625, + "learning_rate": 0.00045309111638498724, + "loss": 0.2125, + "step": 143620 + }, + { + "epoch": 5.95, + "grad_norm": 0.65625, + "learning_rate": 0.00045308479181804497, + "loss": 0.1856, + "step": 143630 + }, + { + "epoch": 5.95, + "grad_norm": 1.8359375, + "learning_rate": 0.000453078466868919, + "loss": 0.2191, + "step": 143640 + }, + { + "epoch": 5.95, + "grad_norm": 0.78125, + "learning_rate": 0.0004530721415376213, + "loss": 0.2513, + "step": 143650 + }, + { + "epoch": 5.95, + "grad_norm": 0.97265625, + "learning_rate": 0.0004530658158241639, + "loss": 0.1925, + "step": 143660 + }, + { + "epoch": 5.95, + "grad_norm": 0.4375, + "learning_rate": 0.0004530594897285585, + "loss": 0.2297, + "step": 143670 + }, + { + "epoch": 5.95, + "grad_norm": 0.37890625, + "learning_rate": 0.000453053163250817, + "loss": 0.2197, + "step": 143680 + }, + { + "epoch": 5.95, + "grad_norm": 0.5390625, + "learning_rate": 0.0004530468363909514, + "loss": 0.2028, + "step": 143690 + }, + { + "epoch": 5.95, + "grad_norm": 0.5078125, + "learning_rate": 0.0004530405091489736, + "loss": 0.2105, + "step": 143700 + }, + { + "epoch": 5.95, + "grad_norm": 1.625, + "learning_rate": 0.0004530341815248955, + "loss": 0.209, + "step": 143710 + }, + { + "epoch": 5.95, + "grad_norm": 0.6015625, + "learning_rate": 0.000453027853518729, + "loss": 0.2123, + "step": 143720 + }, + { + "epoch": 5.95, + "grad_norm": 0.8046875, + "learning_rate": 0.000453021525130486, + "loss": 0.2075, + "step": 143730 + }, + { + "epoch": 5.95, + "grad_norm": 0.53515625, + "learning_rate": 0.0004530151963601784, + "loss": 0.204, + "step": 143740 + }, + { + "epoch": 5.95, + "grad_norm": 0.9609375, + "learning_rate": 0.0004530088672078181, + "loss": 0.2023, + "step": 143750 + }, + { + "epoch": 5.95, + "grad_norm": 1.53125, + "learning_rate": 0.00045300253767341706, + "loss": 0.2041, + "step": 143760 + }, + { + "epoch": 5.95, + "grad_norm": 0.76171875, + "learning_rate": 0.0004529962077569871, + "loss": 0.2161, + "step": 143770 + }, + { + "epoch": 5.96, + "grad_norm": 0.515625, + "learning_rate": 0.0004529898774585403, + "loss": 0.241, + "step": 143780 + }, + { + "epoch": 5.96, + "grad_norm": 0.5546875, + "learning_rate": 0.0004529835467780885, + "loss": 0.2389, + "step": 143790 + }, + { + "epoch": 5.96, + "grad_norm": 0.41796875, + "learning_rate": 0.00045297721571564345, + "loss": 0.214, + "step": 143800 + }, + { + "epoch": 5.96, + "grad_norm": 1.140625, + "learning_rate": 0.00045297088427121725, + "loss": 0.2669, + "step": 143810 + }, + { + "epoch": 5.96, + "grad_norm": 1.171875, + "learning_rate": 0.0004529645524448218, + "loss": 0.2183, + "step": 143820 + }, + { + "epoch": 5.96, + "grad_norm": 0.68359375, + "learning_rate": 0.000452958220236469, + "loss": 0.1563, + "step": 143830 + }, + { + "epoch": 5.96, + "grad_norm": 0.70703125, + "learning_rate": 0.0004529518876461707, + "loss": 0.2018, + "step": 143840 + }, + { + "epoch": 5.96, + "grad_norm": 0.27734375, + "learning_rate": 0.0004529455546739389, + "loss": 0.2331, + "step": 143850 + }, + { + "epoch": 5.96, + "grad_norm": 0.8828125, + "learning_rate": 0.0004529392213197855, + "loss": 0.2212, + "step": 143860 + }, + { + "epoch": 5.96, + "grad_norm": 0.40234375, + "learning_rate": 0.0004529328875837223, + "loss": 0.2478, + "step": 143870 + }, + { + "epoch": 5.96, + "grad_norm": 0.56640625, + "learning_rate": 0.0004529265534657614, + "loss": 0.2394, + "step": 143880 + }, + { + "epoch": 5.96, + "grad_norm": 0.359375, + "learning_rate": 0.0004529202189659146, + "loss": 0.2276, + "step": 143890 + }, + { + "epoch": 5.96, + "grad_norm": 0.8125, + "learning_rate": 0.00045291388408419387, + "loss": 0.2002, + "step": 143900 + }, + { + "epoch": 5.96, + "grad_norm": 0.859375, + "learning_rate": 0.00045290754882061114, + "loss": 0.2303, + "step": 143910 + }, + { + "epoch": 5.96, + "grad_norm": 0.58203125, + "learning_rate": 0.0004529012131751783, + "loss": 0.2074, + "step": 143920 + }, + { + "epoch": 5.96, + "grad_norm": 0.455078125, + "learning_rate": 0.00045289487714790733, + "loss": 0.1695, + "step": 143930 + }, + { + "epoch": 5.96, + "grad_norm": 0.625, + "learning_rate": 0.00045288854073881015, + "loss": 0.191, + "step": 143940 + }, + { + "epoch": 5.96, + "grad_norm": 0.90625, + "learning_rate": 0.00045288220394789855, + "loss": 0.188, + "step": 143950 + }, + { + "epoch": 5.96, + "grad_norm": 1.03125, + "learning_rate": 0.00045287586677518455, + "loss": 0.2027, + "step": 143960 + }, + { + "epoch": 5.96, + "grad_norm": 0.75, + "learning_rate": 0.0004528695292206801, + "loss": 0.2251, + "step": 143970 + }, + { + "epoch": 5.96, + "grad_norm": 1.296875, + "learning_rate": 0.00045286319128439714, + "loss": 0.2057, + "step": 143980 + }, + { + "epoch": 5.96, + "grad_norm": 0.78125, + "learning_rate": 0.00045285685296634747, + "loss": 0.1851, + "step": 143990 + }, + { + "epoch": 5.96, + "grad_norm": 1.03125, + "learning_rate": 0.0004528505142665432, + "loss": 0.2754, + "step": 144000 + }, + { + "epoch": 5.96, + "grad_norm": 0.54296875, + "learning_rate": 0.00045284417518499616, + "loss": 0.202, + "step": 144010 + }, + { + "epoch": 5.97, + "grad_norm": 0.55859375, + "learning_rate": 0.0004528378357217182, + "loss": 0.2056, + "step": 144020 + }, + { + "epoch": 5.97, + "grad_norm": 1.375, + "learning_rate": 0.00045283149587672147, + "loss": 0.2075, + "step": 144030 + }, + { + "epoch": 5.97, + "grad_norm": 0.79296875, + "learning_rate": 0.0004528251556500177, + "loss": 0.2065, + "step": 144040 + }, + { + "epoch": 5.97, + "grad_norm": 0.294921875, + "learning_rate": 0.00045281881504161885, + "loss": 0.2037, + "step": 144050 + }, + { + "epoch": 5.97, + "grad_norm": 0.92578125, + "learning_rate": 0.000452812474051537, + "loss": 0.2235, + "step": 144060 + }, + { + "epoch": 5.97, + "grad_norm": 0.466796875, + "learning_rate": 0.00045280613267978387, + "loss": 0.2183, + "step": 144070 + }, + { + "epoch": 5.97, + "grad_norm": 0.5859375, + "learning_rate": 0.0004527997909263716, + "loss": 0.2042, + "step": 144080 + }, + { + "epoch": 5.97, + "grad_norm": 0.69140625, + "learning_rate": 0.0004527934487913119, + "loss": 0.2175, + "step": 144090 + }, + { + "epoch": 5.97, + "grad_norm": 0.4765625, + "learning_rate": 0.00045278710627461694, + "loss": 0.1755, + "step": 144100 + }, + { + "epoch": 5.97, + "grad_norm": 0.51171875, + "learning_rate": 0.0004527807633762985, + "loss": 0.1724, + "step": 144110 + }, + { + "epoch": 5.97, + "grad_norm": 1.109375, + "learning_rate": 0.0004527744200963685, + "loss": 0.2236, + "step": 144120 + }, + { + "epoch": 5.97, + "grad_norm": 1.4921875, + "learning_rate": 0.0004527680764348391, + "loss": 0.1753, + "step": 144130 + }, + { + "epoch": 5.97, + "grad_norm": 0.5546875, + "learning_rate": 0.0004527617323917219, + "loss": 0.2236, + "step": 144140 + }, + { + "epoch": 5.97, + "grad_norm": 0.7890625, + "learning_rate": 0.00045275538796702916, + "loss": 0.217, + "step": 144150 + }, + { + "epoch": 5.97, + "grad_norm": 0.65625, + "learning_rate": 0.0004527490431607726, + "loss": 0.2231, + "step": 144160 + }, + { + "epoch": 5.97, + "grad_norm": 0.3125, + "learning_rate": 0.0004527426979729642, + "loss": 0.2004, + "step": 144170 + }, + { + "epoch": 5.97, + "grad_norm": 1.5078125, + "learning_rate": 0.00045273635240361597, + "loss": 0.2404, + "step": 144180 + }, + { + "epoch": 5.97, + "grad_norm": 0.55078125, + "learning_rate": 0.00045273000645273986, + "loss": 0.2358, + "step": 144190 + }, + { + "epoch": 5.97, + "grad_norm": 1.5625, + "learning_rate": 0.0004527236601203477, + "loss": 0.1779, + "step": 144200 + }, + { + "epoch": 5.97, + "grad_norm": 0.34375, + "learning_rate": 0.0004527173134064516, + "loss": 0.2511, + "step": 144210 + }, + { + "epoch": 5.97, + "grad_norm": 0.49609375, + "learning_rate": 0.00045271096631106333, + "loss": 0.2177, + "step": 144220 + }, + { + "epoch": 5.97, + "grad_norm": 0.8828125, + "learning_rate": 0.00045270461883419494, + "loss": 0.1986, + "step": 144230 + }, + { + "epoch": 5.97, + "grad_norm": 0.875, + "learning_rate": 0.00045269827097585836, + "loss": 0.1649, + "step": 144240 + }, + { + "epoch": 5.97, + "grad_norm": 0.1982421875, + "learning_rate": 0.00045269192273606553, + "loss": 0.2095, + "step": 144250 + }, + { + "epoch": 5.98, + "grad_norm": 1.3203125, + "learning_rate": 0.00045268557411482836, + "loss": 0.1583, + "step": 144260 + }, + { + "epoch": 5.98, + "grad_norm": 0.9609375, + "learning_rate": 0.00045267922511215883, + "loss": 0.2661, + "step": 144270 + }, + { + "epoch": 5.98, + "grad_norm": 0.48046875, + "learning_rate": 0.0004526728757280689, + "loss": 0.228, + "step": 144280 + }, + { + "epoch": 5.98, + "grad_norm": 0.578125, + "learning_rate": 0.0004526665259625705, + "loss": 0.1962, + "step": 144290 + }, + { + "epoch": 5.98, + "grad_norm": 0.671875, + "learning_rate": 0.0004526601758156755, + "loss": 0.2289, + "step": 144300 + }, + { + "epoch": 5.98, + "grad_norm": 2.28125, + "learning_rate": 0.00045265382528739606, + "loss": 0.1956, + "step": 144310 + }, + { + "epoch": 5.98, + "grad_norm": 0.59375, + "learning_rate": 0.000452647474377744, + "loss": 0.2663, + "step": 144320 + }, + { + "epoch": 5.98, + "grad_norm": 0.322265625, + "learning_rate": 0.00045264112308673123, + "loss": 0.2108, + "step": 144330 + }, + { + "epoch": 5.98, + "grad_norm": 0.353515625, + "learning_rate": 0.0004526347714143697, + "loss": 0.1956, + "step": 144340 + }, + { + "epoch": 5.98, + "grad_norm": 0.80859375, + "learning_rate": 0.0004526284193606715, + "loss": 0.2008, + "step": 144350 + }, + { + "epoch": 5.98, + "grad_norm": 0.73828125, + "learning_rate": 0.00045262206692564847, + "loss": 0.1956, + "step": 144360 + }, + { + "epoch": 5.98, + "grad_norm": 1.0234375, + "learning_rate": 0.00045261571410931255, + "loss": 0.1887, + "step": 144370 + }, + { + "epoch": 5.98, + "grad_norm": 0.671875, + "learning_rate": 0.0004526093609116758, + "loss": 0.1776, + "step": 144380 + }, + { + "epoch": 5.98, + "grad_norm": 0.87890625, + "learning_rate": 0.00045260300733275007, + "loss": 0.2245, + "step": 144390 + }, + { + "epoch": 5.98, + "grad_norm": 0.70703125, + "learning_rate": 0.0004525966533725474, + "loss": 0.2004, + "step": 144400 + }, + { + "epoch": 5.98, + "grad_norm": 0.91015625, + "learning_rate": 0.0004525902990310797, + "loss": 0.161, + "step": 144410 + }, + { + "epoch": 5.98, + "grad_norm": 0.953125, + "learning_rate": 0.00045258394430835894, + "loss": 0.2513, + "step": 144420 + }, + { + "epoch": 5.98, + "grad_norm": 1.46875, + "learning_rate": 0.00045257758920439704, + "loss": 0.2275, + "step": 144430 + }, + { + "epoch": 5.98, + "grad_norm": 0.37109375, + "learning_rate": 0.00045257123371920596, + "loss": 0.1924, + "step": 144440 + }, + { + "epoch": 5.98, + "grad_norm": 1.734375, + "learning_rate": 0.00045256487785279775, + "loss": 0.2038, + "step": 144450 + }, + { + "epoch": 5.98, + "grad_norm": 0.515625, + "learning_rate": 0.00045255852160518427, + "loss": 0.1745, + "step": 144460 + }, + { + "epoch": 5.98, + "grad_norm": 0.4453125, + "learning_rate": 0.0004525521649763776, + "loss": 0.2124, + "step": 144470 + }, + { + "epoch": 5.98, + "grad_norm": 1.953125, + "learning_rate": 0.00045254580796638954, + "loss": 0.2096, + "step": 144480 + }, + { + "epoch": 5.98, + "grad_norm": 0.33984375, + "learning_rate": 0.00045253945057523225, + "loss": 0.1565, + "step": 144490 + }, + { + "epoch": 5.99, + "grad_norm": 0.81640625, + "learning_rate": 0.00045253309280291756, + "loss": 0.1959, + "step": 144500 + }, + { + "epoch": 5.99, + "grad_norm": 0.466796875, + "learning_rate": 0.0004525267346494574, + "loss": 0.2074, + "step": 144510 + }, + { + "epoch": 5.99, + "grad_norm": 0.578125, + "learning_rate": 0.00045252037611486385, + "loss": 0.2066, + "step": 144520 + }, + { + "epoch": 5.99, + "grad_norm": 0.53515625, + "learning_rate": 0.00045251401719914873, + "loss": 0.2028, + "step": 144530 + }, + { + "epoch": 5.99, + "grad_norm": 0.90625, + "learning_rate": 0.00045250765790232425, + "loss": 0.2044, + "step": 144540 + }, + { + "epoch": 5.99, + "grad_norm": 0.5390625, + "learning_rate": 0.0004525012982244021, + "loss": 0.1803, + "step": 144550 + }, + { + "epoch": 5.99, + "grad_norm": 0.66796875, + "learning_rate": 0.00045249493816539445, + "loss": 0.2122, + "step": 144560 + }, + { + "epoch": 5.99, + "grad_norm": 0.68359375, + "learning_rate": 0.00045248857772531314, + "loss": 0.1802, + "step": 144570 + }, + { + "epoch": 5.99, + "grad_norm": 0.95703125, + "learning_rate": 0.0004524822169041702, + "loss": 0.1955, + "step": 144580 + }, + { + "epoch": 5.99, + "grad_norm": 1.1171875, + "learning_rate": 0.0004524758557019776, + "loss": 0.2266, + "step": 144590 + }, + { + "epoch": 5.99, + "grad_norm": 0.58203125, + "learning_rate": 0.0004524694941187474, + "loss": 0.1678, + "step": 144600 + }, + { + "epoch": 5.99, + "grad_norm": 2.203125, + "learning_rate": 0.0004524631321544913, + "loss": 0.215, + "step": 144610 + }, + { + "epoch": 5.99, + "grad_norm": 0.4140625, + "learning_rate": 0.0004524567698092216, + "loss": 0.183, + "step": 144620 + }, + { + "epoch": 5.99, + "grad_norm": 0.51171875, + "learning_rate": 0.00045245040708295005, + "loss": 0.2199, + "step": 144630 + }, + { + "epoch": 5.99, + "grad_norm": 0.640625, + "learning_rate": 0.00045244404397568874, + "loss": 0.2133, + "step": 144640 + }, + { + "epoch": 5.99, + "grad_norm": 0.7265625, + "learning_rate": 0.00045243768048744957, + "loss": 0.2091, + "step": 144650 + }, + { + "epoch": 5.99, + "grad_norm": 0.52734375, + "learning_rate": 0.0004524313166182445, + "loss": 0.2231, + "step": 144660 + }, + { + "epoch": 5.99, + "grad_norm": 1.046875, + "learning_rate": 0.0004524249523680857, + "loss": 0.2118, + "step": 144670 + }, + { + "epoch": 5.99, + "grad_norm": 0.94921875, + "learning_rate": 0.00045241858773698484, + "loss": 0.198, + "step": 144680 + }, + { + "epoch": 5.99, + "grad_norm": 0.384765625, + "learning_rate": 0.00045241222272495406, + "loss": 0.1619, + "step": 144690 + }, + { + "epoch": 5.99, + "grad_norm": 0.361328125, + "learning_rate": 0.0004524058573320055, + "loss": 0.2008, + "step": 144700 + }, + { + "epoch": 5.99, + "grad_norm": 0.7578125, + "learning_rate": 0.0004523994915581509, + "loss": 0.227, + "step": 144710 + }, + { + "epoch": 5.99, + "grad_norm": 0.70703125, + "learning_rate": 0.0004523931254034022, + "loss": 0.2396, + "step": 144720 + }, + { + "epoch": 5.99, + "grad_norm": 0.7578125, + "learning_rate": 0.00045238675886777156, + "loss": 0.171, + "step": 144730 + }, + { + "epoch": 6.0, + "grad_norm": 0.60546875, + "learning_rate": 0.000452380391951271, + "loss": 0.1965, + "step": 144740 + }, + { + "epoch": 6.0, + "grad_norm": 0.70703125, + "learning_rate": 0.00045237402465391223, + "loss": 0.1961, + "step": 144750 + }, + { + "epoch": 6.0, + "grad_norm": 1.0859375, + "learning_rate": 0.00045236765697570747, + "loss": 0.2172, + "step": 144760 + }, + { + "epoch": 6.0, + "grad_norm": 0.58203125, + "learning_rate": 0.00045236128891666867, + "loss": 0.1862, + "step": 144770 + }, + { + "epoch": 6.0, + "grad_norm": 1.0390625, + "learning_rate": 0.00045235492047680776, + "loss": 0.253, + "step": 144780 + }, + { + "epoch": 6.0, + "grad_norm": 1.8359375, + "learning_rate": 0.0004523485516561368, + "loss": 0.221, + "step": 144790 + }, + { + "epoch": 6.0, + "grad_norm": 0.7578125, + "learning_rate": 0.00045234218245466764, + "loss": 0.1805, + "step": 144800 + }, + { + "epoch": 6.0, + "grad_norm": 1.5703125, + "learning_rate": 0.00045233581287241234, + "loss": 0.1864, + "step": 144810 + }, + { + "epoch": 6.0, + "grad_norm": 0.40234375, + "learning_rate": 0.0004523294429093829, + "loss": 0.1978, + "step": 144820 + }, + { + "epoch": 6.0, + "grad_norm": 0.66015625, + "learning_rate": 0.0004523230725655913, + "loss": 0.1748, + "step": 144830 + }, + { + "epoch": 6.0, + "grad_norm": 0.94140625, + "learning_rate": 0.0004523167018410495, + "loss": 0.2275, + "step": 144840 + }, + { + "epoch": 6.0, + "grad_norm": 0.26953125, + "learning_rate": 0.00045231033073576954, + "loss": 0.2668, + "step": 144850 + }, + { + "epoch": 6.0, + "grad_norm": 0.65625, + "learning_rate": 0.0004523039592497634, + "loss": 0.1528, + "step": 144860 + }, + { + "epoch": 6.0, + "grad_norm": 0.96484375, + "learning_rate": 0.000452297587383043, + "loss": 0.1482, + "step": 144870 + }, + { + "epoch": 6.0, + "grad_norm": 1.7109375, + "learning_rate": 0.0004522912151356205, + "loss": 0.2558, + "step": 144880 + }, + { + "epoch": 6.0, + "grad_norm": 0.83203125, + "learning_rate": 0.00045228484250750767, + "loss": 0.192, + "step": 144890 + }, + { + "epoch": 6.0, + "grad_norm": 1.7578125, + "learning_rate": 0.00045227846949871673, + "loss": 0.1697, + "step": 144900 + }, + { + "epoch": 6.0, + "grad_norm": 0.39453125, + "learning_rate": 0.0004522720961092595, + "loss": 0.1961, + "step": 144910 + }, + { + "epoch": 6.0, + "grad_norm": 0.5078125, + "learning_rate": 0.000452265722339148, + "loss": 0.2096, + "step": 144920 + }, + { + "epoch": 6.0, + "grad_norm": 0.8671875, + "learning_rate": 0.0004522593481883942, + "loss": 0.1812, + "step": 144930 + }, + { + "epoch": 6.0, + "grad_norm": 0.7578125, + "learning_rate": 0.00045225297365701026, + "loss": 0.1799, + "step": 144940 + }, + { + "epoch": 6.0, + "grad_norm": 0.61328125, + "learning_rate": 0.00045224659874500795, + "loss": 0.1827, + "step": 144950 + }, + { + "epoch": 6.0, + "grad_norm": 0.28515625, + "learning_rate": 0.00045224022345239945, + "loss": 0.2156, + "step": 144960 + }, + { + "epoch": 6.0, + "grad_norm": 1.1640625, + "learning_rate": 0.00045223384777919674, + "loss": 0.2106, + "step": 144970 + }, + { + "epoch": 6.01, + "grad_norm": 0.77734375, + "learning_rate": 0.0004522274717254117, + "loss": 0.1504, + "step": 144980 + }, + { + "epoch": 6.01, + "grad_norm": 0.78125, + "learning_rate": 0.0004522210952910564, + "loss": 0.2457, + "step": 144990 + }, + { + "epoch": 6.01, + "grad_norm": 0.55078125, + "learning_rate": 0.00045221471847614283, + "loss": 0.1698, + "step": 145000 + }, + { + "epoch": 6.01, + "grad_norm": 0.77734375, + "learning_rate": 0.000452208341280683, + "loss": 0.2101, + "step": 145010 + }, + { + "epoch": 6.01, + "grad_norm": 0.54296875, + "learning_rate": 0.00045220196370468897, + "loss": 0.2413, + "step": 145020 + }, + { + "epoch": 6.01, + "grad_norm": 0.796875, + "learning_rate": 0.00045219558574817264, + "loss": 0.1854, + "step": 145030 + }, + { + "epoch": 6.01, + "grad_norm": 0.71875, + "learning_rate": 0.000452189207411146, + "loss": 0.2583, + "step": 145040 + }, + { + "epoch": 6.01, + "grad_norm": 0.76171875, + "learning_rate": 0.00045218282869362113, + "loss": 0.2475, + "step": 145050 + }, + { + "epoch": 6.01, + "grad_norm": 0.97265625, + "learning_rate": 0.00045217644959561013, + "loss": 0.2098, + "step": 145060 + }, + { + "epoch": 6.01, + "grad_norm": 0.63671875, + "learning_rate": 0.00045217007011712473, + "loss": 0.1898, + "step": 145070 + }, + { + "epoch": 6.01, + "grad_norm": 0.7109375, + "learning_rate": 0.00045216369025817717, + "loss": 0.2057, + "step": 145080 + }, + { + "epoch": 6.01, + "grad_norm": 0.4765625, + "learning_rate": 0.0004521573100187794, + "loss": 0.2296, + "step": 145090 + }, + { + "epoch": 6.01, + "grad_norm": 0.2001953125, + "learning_rate": 0.0004521509293989433, + "loss": 0.2046, + "step": 145100 + }, + { + "epoch": 6.01, + "grad_norm": 0.7109375, + "learning_rate": 0.00045214454839868104, + "loss": 0.166, + "step": 145110 + }, + { + "epoch": 6.01, + "grad_norm": 0.6796875, + "learning_rate": 0.0004521381670180046, + "loss": 0.2225, + "step": 145120 + }, + { + "epoch": 6.01, + "grad_norm": 0.439453125, + "learning_rate": 0.0004521317852569259, + "loss": 0.2289, + "step": 145130 + }, + { + "epoch": 6.01, + "grad_norm": 0.89453125, + "learning_rate": 0.00045212540311545703, + "loss": 0.2739, + "step": 145140 + }, + { + "epoch": 6.01, + "grad_norm": 0.95703125, + "learning_rate": 0.00045211902059361, + "loss": 0.2, + "step": 145150 + }, + { + "epoch": 6.01, + "grad_norm": 0.42578125, + "learning_rate": 0.00045211263769139677, + "loss": 0.1738, + "step": 145160 + }, + { + "epoch": 6.01, + "grad_norm": 0.51953125, + "learning_rate": 0.0004521062544088294, + "loss": 0.2204, + "step": 145170 + }, + { + "epoch": 6.01, + "grad_norm": 0.66796875, + "learning_rate": 0.0004520998707459199, + "loss": 0.2074, + "step": 145180 + }, + { + "epoch": 6.01, + "grad_norm": 0.5390625, + "learning_rate": 0.00045209348670268026, + "loss": 0.2583, + "step": 145190 + }, + { + "epoch": 6.01, + "grad_norm": 0.384765625, + "learning_rate": 0.00045208710227912245, + "loss": 0.1286, + "step": 145200 + }, + { + "epoch": 6.01, + "grad_norm": 0.5234375, + "learning_rate": 0.00045208071747525856, + "loss": 0.2046, + "step": 145210 + }, + { + "epoch": 6.01, + "grad_norm": 0.9765625, + "learning_rate": 0.0004520743322911006, + "loss": 0.1848, + "step": 145220 + }, + { + "epoch": 6.02, + "grad_norm": 0.90625, + "learning_rate": 0.0004520679467266606, + "loss": 0.1738, + "step": 145230 + }, + { + "epoch": 6.02, + "grad_norm": 0.388671875, + "learning_rate": 0.00045206156078195047, + "loss": 0.2169, + "step": 145240 + }, + { + "epoch": 6.02, + "grad_norm": 0.58203125, + "learning_rate": 0.0004520551744569824, + "loss": 0.1755, + "step": 145250 + }, + { + "epoch": 6.02, + "grad_norm": 0.5546875, + "learning_rate": 0.00045204878775176825, + "loss": 0.2216, + "step": 145260 + }, + { + "epoch": 6.02, + "grad_norm": 0.84765625, + "learning_rate": 0.00045204240066632016, + "loss": 0.2381, + "step": 145270 + }, + { + "epoch": 6.02, + "grad_norm": 0.67578125, + "learning_rate": 0.00045203601320065, + "loss": 0.2155, + "step": 145280 + }, + { + "epoch": 6.02, + "grad_norm": 0.73046875, + "learning_rate": 0.00045202962535477, + "loss": 0.1933, + "step": 145290 + }, + { + "epoch": 6.02, + "grad_norm": 0.298828125, + "learning_rate": 0.00045202323712869197, + "loss": 0.1926, + "step": 145300 + }, + { + "epoch": 6.02, + "grad_norm": 1.828125, + "learning_rate": 0.000452016848522428, + "loss": 0.1605, + "step": 145310 + }, + { + "epoch": 6.02, + "grad_norm": 0.76171875, + "learning_rate": 0.0004520104595359902, + "loss": 0.1939, + "step": 145320 + }, + { + "epoch": 6.02, + "grad_norm": 0.7578125, + "learning_rate": 0.00045200407016939047, + "loss": 0.1695, + "step": 145330 + }, + { + "epoch": 6.02, + "grad_norm": 0.408203125, + "learning_rate": 0.000451997680422641, + "loss": 0.1985, + "step": 145340 + }, + { + "epoch": 6.02, + "grad_norm": 1.0703125, + "learning_rate": 0.0004519912902957536, + "loss": 0.2093, + "step": 145350 + }, + { + "epoch": 6.02, + "grad_norm": 0.353515625, + "learning_rate": 0.0004519848997887405, + "loss": 0.2457, + "step": 145360 + }, + { + "epoch": 6.02, + "grad_norm": 0.69140625, + "learning_rate": 0.0004519785089016135, + "loss": 0.2044, + "step": 145370 + }, + { + "epoch": 6.02, + "grad_norm": 0.41796875, + "learning_rate": 0.0004519721176343849, + "loss": 0.1874, + "step": 145380 + }, + { + "epoch": 6.02, + "grad_norm": 0.5859375, + "learning_rate": 0.00045196572598706655, + "loss": 0.1511, + "step": 145390 + }, + { + "epoch": 6.02, + "grad_norm": 0.73828125, + "learning_rate": 0.00045195933395967045, + "loss": 0.18, + "step": 145400 + }, + { + "epoch": 6.02, + "grad_norm": 0.5, + "learning_rate": 0.0004519529415522087, + "loss": 0.1799, + "step": 145410 + }, + { + "epoch": 6.02, + "grad_norm": 1.015625, + "learning_rate": 0.00045194654876469335, + "loss": 0.2357, + "step": 145420 + }, + { + "epoch": 6.02, + "grad_norm": 0.5859375, + "learning_rate": 0.0004519401555971364, + "loss": 0.2674, + "step": 145430 + }, + { + "epoch": 6.02, + "grad_norm": 0.4609375, + "learning_rate": 0.00045193376204954994, + "loss": 0.2539, + "step": 145440 + }, + { + "epoch": 6.02, + "grad_norm": 0.92578125, + "learning_rate": 0.0004519273681219459, + "loss": 0.2384, + "step": 145450 + }, + { + "epoch": 6.02, + "grad_norm": 0.42578125, + "learning_rate": 0.0004519209738143363, + "loss": 0.2105, + "step": 145460 + }, + { + "epoch": 6.03, + "grad_norm": 0.95703125, + "learning_rate": 0.00045191457912673326, + "loss": 0.1746, + "step": 145470 + }, + { + "epoch": 6.03, + "grad_norm": 0.59765625, + "learning_rate": 0.0004519081840591489, + "loss": 0.2125, + "step": 145480 + }, + { + "epoch": 6.03, + "grad_norm": 0.3671875, + "learning_rate": 0.000451901788611595, + "loss": 0.1828, + "step": 145490 + }, + { + "epoch": 6.03, + "grad_norm": 0.421875, + "learning_rate": 0.00045189539278408386, + "loss": 0.193, + "step": 145500 + }, + { + "epoch": 6.03, + "grad_norm": 0.279296875, + "learning_rate": 0.00045188899657662727, + "loss": 0.1993, + "step": 145510 + }, + { + "epoch": 6.03, + "grad_norm": 0.70703125, + "learning_rate": 0.00045188259998923746, + "loss": 0.1773, + "step": 145520 + }, + { + "epoch": 6.03, + "grad_norm": 0.42578125, + "learning_rate": 0.0004518762030219264, + "loss": 0.2174, + "step": 145530 + }, + { + "epoch": 6.03, + "grad_norm": 0.515625, + "learning_rate": 0.00045186980567470613, + "loss": 0.1779, + "step": 145540 + }, + { + "epoch": 6.03, + "grad_norm": 0.55078125, + "learning_rate": 0.0004518634079475887, + "loss": 0.2354, + "step": 145550 + }, + { + "epoch": 6.03, + "grad_norm": 1.15625, + "learning_rate": 0.0004518570098405861, + "loss": 0.1774, + "step": 145560 + }, + { + "epoch": 6.03, + "grad_norm": 0.44140625, + "learning_rate": 0.0004518506113537104, + "loss": 0.2192, + "step": 145570 + }, + { + "epoch": 6.03, + "grad_norm": 0.83984375, + "learning_rate": 0.0004518442124869737, + "loss": 0.2021, + "step": 145580 + }, + { + "epoch": 6.03, + "grad_norm": 0.8359375, + "learning_rate": 0.0004518378132403879, + "loss": 0.223, + "step": 145590 + }, + { + "epoch": 6.03, + "grad_norm": 0.369140625, + "learning_rate": 0.00045183141361396516, + "loss": 0.2102, + "step": 145600 + }, + { + "epoch": 6.03, + "grad_norm": 1.0390625, + "learning_rate": 0.00045182501360771754, + "loss": 0.2071, + "step": 145610 + }, + { + "epoch": 6.03, + "grad_norm": 0.58203125, + "learning_rate": 0.00045181861322165704, + "loss": 0.1873, + "step": 145620 + }, + { + "epoch": 6.03, + "grad_norm": 0.60546875, + "learning_rate": 0.0004518122124557957, + "loss": 0.2251, + "step": 145630 + }, + { + "epoch": 6.03, + "grad_norm": 0.546875, + "learning_rate": 0.00045180581131014553, + "loss": 0.1827, + "step": 145640 + }, + { + "epoch": 6.03, + "grad_norm": 1.421875, + "learning_rate": 0.0004517994097847186, + "loss": 0.2281, + "step": 145650 + }, + { + "epoch": 6.03, + "grad_norm": 0.7890625, + "learning_rate": 0.00045179300787952703, + "loss": 0.2553, + "step": 145660 + }, + { + "epoch": 6.03, + "grad_norm": 1.0625, + "learning_rate": 0.0004517866055945828, + "loss": 0.1943, + "step": 145670 + }, + { + "epoch": 6.03, + "grad_norm": 0.515625, + "learning_rate": 0.00045178020292989797, + "loss": 0.1643, + "step": 145680 + }, + { + "epoch": 6.03, + "grad_norm": 1.5703125, + "learning_rate": 0.00045177379988548455, + "loss": 0.236, + "step": 145690 + }, + { + "epoch": 6.03, + "grad_norm": 0.671875, + "learning_rate": 0.0004517673964613547, + "loss": 0.1583, + "step": 145700 + }, + { + "epoch": 6.04, + "grad_norm": 0.302734375, + "learning_rate": 0.00045176099265752036, + "loss": 0.223, + "step": 145710 + }, + { + "epoch": 6.04, + "grad_norm": 0.50390625, + "learning_rate": 0.0004517545884739936, + "loss": 0.2277, + "step": 145720 + }, + { + "epoch": 6.04, + "grad_norm": 0.83984375, + "learning_rate": 0.0004517481839107865, + "loss": 0.2201, + "step": 145730 + }, + { + "epoch": 6.04, + "grad_norm": 0.76953125, + "learning_rate": 0.00045174177896791114, + "loss": 0.1861, + "step": 145740 + }, + { + "epoch": 6.04, + "grad_norm": 0.9609375, + "learning_rate": 0.0004517353736453795, + "loss": 0.1996, + "step": 145750 + }, + { + "epoch": 6.04, + "grad_norm": 0.828125, + "learning_rate": 0.00045172896794320365, + "loss": 0.1949, + "step": 145760 + }, + { + "epoch": 6.04, + "grad_norm": 0.51953125, + "learning_rate": 0.00045172256186139573, + "loss": 0.1712, + "step": 145770 + }, + { + "epoch": 6.04, + "grad_norm": 0.859375, + "learning_rate": 0.0004517161553999677, + "loss": 0.1879, + "step": 145780 + }, + { + "epoch": 6.04, + "grad_norm": 0.8671875, + "learning_rate": 0.00045170974855893165, + "loss": 0.2586, + "step": 145790 + }, + { + "epoch": 6.04, + "grad_norm": 0.921875, + "learning_rate": 0.0004517033413382996, + "loss": 0.272, + "step": 145800 + }, + { + "epoch": 6.04, + "grad_norm": 0.65234375, + "learning_rate": 0.0004516969337380837, + "loss": 0.2032, + "step": 145810 + }, + { + "epoch": 6.04, + "grad_norm": 1.1015625, + "learning_rate": 0.00045169052575829593, + "loss": 0.1794, + "step": 145820 + }, + { + "epoch": 6.04, + "grad_norm": 0.6015625, + "learning_rate": 0.0004516841173989484, + "loss": 0.1701, + "step": 145830 + }, + { + "epoch": 6.04, + "grad_norm": 1.125, + "learning_rate": 0.0004516777086600531, + "loss": 0.2191, + "step": 145840 + }, + { + "epoch": 6.04, + "grad_norm": 0.67578125, + "learning_rate": 0.0004516712995416222, + "loss": 0.2333, + "step": 145850 + }, + { + "epoch": 6.04, + "grad_norm": 0.51171875, + "learning_rate": 0.0004516648900436676, + "loss": 0.1968, + "step": 145860 + }, + { + "epoch": 6.04, + "grad_norm": 0.81640625, + "learning_rate": 0.00045165848016620147, + "loss": 0.1755, + "step": 145870 + }, + { + "epoch": 6.04, + "grad_norm": 0.90234375, + "learning_rate": 0.00045165206990923593, + "loss": 0.2107, + "step": 145880 + }, + { + "epoch": 6.04, + "grad_norm": 0.59375, + "learning_rate": 0.0004516456592727829, + "loss": 0.2255, + "step": 145890 + }, + { + "epoch": 6.04, + "grad_norm": 0.77734375, + "learning_rate": 0.0004516392482568545, + "loss": 0.2073, + "step": 145900 + }, + { + "epoch": 6.04, + "grad_norm": 0.68359375, + "learning_rate": 0.00045163283686146293, + "loss": 0.1802, + "step": 145910 + }, + { + "epoch": 6.04, + "grad_norm": 0.86328125, + "learning_rate": 0.00045162642508662, + "loss": 0.2322, + "step": 145920 + }, + { + "epoch": 6.04, + "grad_norm": 0.4609375, + "learning_rate": 0.000451620012932338, + "loss": 0.2296, + "step": 145930 + }, + { + "epoch": 6.04, + "grad_norm": 0.68359375, + "learning_rate": 0.0004516136003986289, + "loss": 0.2113, + "step": 145940 + }, + { + "epoch": 6.05, + "grad_norm": 0.625, + "learning_rate": 0.00045160718748550474, + "loss": 0.2059, + "step": 145950 + }, + { + "epoch": 6.05, + "grad_norm": 1.015625, + "learning_rate": 0.0004516007741929776, + "loss": 0.2351, + "step": 145960 + }, + { + "epoch": 6.05, + "grad_norm": 1.0234375, + "learning_rate": 0.0004515943605210596, + "loss": 0.1591, + "step": 145970 + }, + { + "epoch": 6.05, + "grad_norm": 0.51953125, + "learning_rate": 0.00045158794646976285, + "loss": 0.174, + "step": 145980 + }, + { + "epoch": 6.05, + "grad_norm": 1.640625, + "learning_rate": 0.0004515815320390993, + "loss": 0.1879, + "step": 145990 + }, + { + "epoch": 6.05, + "grad_norm": 0.58203125, + "learning_rate": 0.00045157511722908106, + "loss": 0.18, + "step": 146000 + }, + { + "epoch": 6.05, + "grad_norm": 0.84765625, + "learning_rate": 0.0004515687020397202, + "loss": 0.2063, + "step": 146010 + }, + { + "epoch": 6.05, + "grad_norm": 1.3203125, + "learning_rate": 0.00045156228647102894, + "loss": 0.2115, + "step": 146020 + }, + { + "epoch": 6.05, + "grad_norm": 0.34375, + "learning_rate": 0.00045155587052301904, + "loss": 0.2286, + "step": 146030 + }, + { + "epoch": 6.05, + "grad_norm": 0.65234375, + "learning_rate": 0.0004515494541957029, + "loss": 0.2078, + "step": 146040 + }, + { + "epoch": 6.05, + "grad_norm": 0.84375, + "learning_rate": 0.0004515430374890924, + "loss": 0.2446, + "step": 146050 + }, + { + "epoch": 6.05, + "grad_norm": 0.66796875, + "learning_rate": 0.0004515366204031997, + "loss": 0.2192, + "step": 146060 + }, + { + "epoch": 6.05, + "grad_norm": 0.703125, + "learning_rate": 0.00045153020293803677, + "loss": 0.2499, + "step": 146070 + }, + { + "epoch": 6.05, + "grad_norm": 0.447265625, + "learning_rate": 0.0004515237850936158, + "loss": 0.1918, + "step": 146080 + }, + { + "epoch": 6.05, + "grad_norm": 0.63671875, + "learning_rate": 0.00045151736686994883, + "loss": 0.2151, + "step": 146090 + }, + { + "epoch": 6.05, + "grad_norm": 1.078125, + "learning_rate": 0.0004515109482670479, + "loss": 0.195, + "step": 146100 + }, + { + "epoch": 6.05, + "grad_norm": 1.0625, + "learning_rate": 0.00045150452928492517, + "loss": 0.1827, + "step": 146110 + }, + { + "epoch": 6.05, + "grad_norm": 0.9765625, + "learning_rate": 0.00045149810992359263, + "loss": 0.1868, + "step": 146120 + }, + { + "epoch": 6.05, + "grad_norm": 2.046875, + "learning_rate": 0.00045149169018306244, + "loss": 0.1836, + "step": 146130 + }, + { + "epoch": 6.05, + "grad_norm": 0.359375, + "learning_rate": 0.00045148527006334674, + "loss": 0.1895, + "step": 146140 + }, + { + "epoch": 6.05, + "grad_norm": 0.2451171875, + "learning_rate": 0.0004514788495644574, + "loss": 0.1674, + "step": 146150 + }, + { + "epoch": 6.05, + "grad_norm": 1.0390625, + "learning_rate": 0.0004514724286864067, + "loss": 0.1766, + "step": 146160 + }, + { + "epoch": 6.05, + "grad_norm": 1.1328125, + "learning_rate": 0.00045146600742920664, + "loss": 0.2255, + "step": 146170 + }, + { + "epoch": 6.05, + "grad_norm": 0.66796875, + "learning_rate": 0.0004514595857928692, + "loss": 0.2209, + "step": 146180 + }, + { + "epoch": 6.06, + "grad_norm": 0.4609375, + "learning_rate": 0.00045145316377740666, + "loss": 0.1878, + "step": 146190 + }, + { + "epoch": 6.06, + "grad_norm": 1.125, + "learning_rate": 0.00045144674138283104, + "loss": 0.1615, + "step": 146200 + }, + { + "epoch": 6.06, + "grad_norm": 0.8046875, + "learning_rate": 0.00045144031860915436, + "loss": 0.2061, + "step": 146210 + }, + { + "epoch": 6.06, + "grad_norm": 0.318359375, + "learning_rate": 0.0004514338954563888, + "loss": 0.2011, + "step": 146220 + }, + { + "epoch": 6.06, + "grad_norm": 0.50390625, + "learning_rate": 0.0004514274719245464, + "loss": 0.2233, + "step": 146230 + }, + { + "epoch": 6.06, + "grad_norm": 0.4375, + "learning_rate": 0.00045142104801363915, + "loss": 0.1705, + "step": 146240 + }, + { + "epoch": 6.06, + "grad_norm": 0.322265625, + "learning_rate": 0.00045141462372367934, + "loss": 0.1869, + "step": 146250 + }, + { + "epoch": 6.06, + "grad_norm": 0.9453125, + "learning_rate": 0.0004514081990546789, + "loss": 0.1676, + "step": 146260 + }, + { + "epoch": 6.06, + "grad_norm": 0.390625, + "learning_rate": 0.00045140177400665004, + "loss": 0.1785, + "step": 146270 + }, + { + "epoch": 6.06, + "grad_norm": 0.6015625, + "learning_rate": 0.0004513953485796048, + "loss": 0.2183, + "step": 146280 + }, + { + "epoch": 6.06, + "grad_norm": 1.203125, + "learning_rate": 0.00045138892277355526, + "loss": 0.2046, + "step": 146290 + }, + { + "epoch": 6.06, + "grad_norm": 1.109375, + "learning_rate": 0.00045138249658851343, + "loss": 0.203, + "step": 146300 + }, + { + "epoch": 6.06, + "grad_norm": 1.046875, + "learning_rate": 0.00045137607002449155, + "loss": 0.2277, + "step": 146310 + }, + { + "epoch": 6.06, + "grad_norm": 1.1953125, + "learning_rate": 0.00045136964308150165, + "loss": 0.2329, + "step": 146320 + }, + { + "epoch": 6.06, + "grad_norm": 0.80859375, + "learning_rate": 0.0004513632157595559, + "loss": 0.2226, + "step": 146330 + }, + { + "epoch": 6.06, + "grad_norm": 0.7578125, + "learning_rate": 0.00045135678805866615, + "loss": 0.142, + "step": 146340 + }, + { + "epoch": 6.06, + "grad_norm": 0.77734375, + "learning_rate": 0.0004513503599788448, + "loss": 0.1618, + "step": 146350 + }, + { + "epoch": 6.06, + "grad_norm": 0.244140625, + "learning_rate": 0.0004513439315201038, + "loss": 0.2212, + "step": 146360 + }, + { + "epoch": 6.06, + "grad_norm": 1.1875, + "learning_rate": 0.00045133750268245524, + "loss": 0.164, + "step": 146370 + }, + { + "epoch": 6.06, + "grad_norm": 1.109375, + "learning_rate": 0.00045133107346591116, + "loss": 0.2637, + "step": 146380 + }, + { + "epoch": 6.06, + "grad_norm": 0.36328125, + "learning_rate": 0.0004513246438704839, + "loss": 0.1945, + "step": 146390 + }, + { + "epoch": 6.06, + "grad_norm": 0.33984375, + "learning_rate": 0.0004513182138961853, + "loss": 0.1894, + "step": 146400 + }, + { + "epoch": 6.06, + "grad_norm": 0.58984375, + "learning_rate": 0.0004513117835430276, + "loss": 0.1848, + "step": 146410 + }, + { + "epoch": 6.06, + "grad_norm": 2.421875, + "learning_rate": 0.00045130535281102285, + "loss": 0.2553, + "step": 146420 + }, + { + "epoch": 6.07, + "grad_norm": 0.4765625, + "learning_rate": 0.0004512989217001832, + "loss": 0.2116, + "step": 146430 + }, + { + "epoch": 6.07, + "grad_norm": 0.7578125, + "learning_rate": 0.00045129249021052067, + "loss": 0.2157, + "step": 146440 + }, + { + "epoch": 6.07, + "grad_norm": 0.90625, + "learning_rate": 0.0004512860583420474, + "loss": 0.1851, + "step": 146450 + }, + { + "epoch": 6.07, + "grad_norm": 0.86328125, + "learning_rate": 0.0004512796260947756, + "loss": 0.2072, + "step": 146460 + }, + { + "epoch": 6.07, + "grad_norm": 1.03125, + "learning_rate": 0.0004512731934687172, + "loss": 0.2168, + "step": 146470 + }, + { + "epoch": 6.07, + "grad_norm": 0.984375, + "learning_rate": 0.0004512667604638844, + "loss": 0.201, + "step": 146480 + }, + { + "epoch": 6.07, + "grad_norm": 0.9140625, + "learning_rate": 0.0004512603270802893, + "loss": 0.2434, + "step": 146490 + }, + { + "epoch": 6.07, + "grad_norm": 1.0625, + "learning_rate": 0.00045125389331794396, + "loss": 0.1938, + "step": 146500 + }, + { + "epoch": 6.07, + "grad_norm": 0.91796875, + "learning_rate": 0.00045124745917686054, + "loss": 0.1728, + "step": 146510 + }, + { + "epoch": 6.07, + "grad_norm": 0.80078125, + "learning_rate": 0.0004512410246570512, + "loss": 0.1961, + "step": 146520 + }, + { + "epoch": 6.07, + "grad_norm": 1.6328125, + "learning_rate": 0.00045123458975852794, + "loss": 0.1879, + "step": 146530 + }, + { + "epoch": 6.07, + "grad_norm": 0.71875, + "learning_rate": 0.0004512281544813029, + "loss": 0.1587, + "step": 146540 + }, + { + "epoch": 6.07, + "grad_norm": 0.765625, + "learning_rate": 0.00045122171882538823, + "loss": 0.221, + "step": 146550 + }, + { + "epoch": 6.07, + "grad_norm": 0.7265625, + "learning_rate": 0.00045121528279079597, + "loss": 0.1949, + "step": 146560 + }, + { + "epoch": 6.07, + "grad_norm": 0.59765625, + "learning_rate": 0.00045120884637753835, + "loss": 0.1788, + "step": 146570 + }, + { + "epoch": 6.07, + "grad_norm": 0.408203125, + "learning_rate": 0.0004512024095856273, + "loss": 0.2579, + "step": 146580 + }, + { + "epoch": 6.07, + "grad_norm": 0.8984375, + "learning_rate": 0.00045119597241507516, + "loss": 0.2298, + "step": 146590 + }, + { + "epoch": 6.07, + "grad_norm": 0.6015625, + "learning_rate": 0.0004511895348658939, + "loss": 0.2429, + "step": 146600 + }, + { + "epoch": 6.07, + "grad_norm": 0.5078125, + "learning_rate": 0.00045118309693809554, + "loss": 0.1606, + "step": 146610 + }, + { + "epoch": 6.07, + "grad_norm": 0.45703125, + "learning_rate": 0.00045117665863169244, + "loss": 0.2024, + "step": 146620 + }, + { + "epoch": 6.07, + "grad_norm": 0.1962890625, + "learning_rate": 0.0004511702199466966, + "loss": 0.1439, + "step": 146630 + }, + { + "epoch": 6.07, + "grad_norm": 0.498046875, + "learning_rate": 0.0004511637808831201, + "loss": 0.2264, + "step": 146640 + }, + { + "epoch": 6.07, + "grad_norm": 0.0, + "learning_rate": 0.0004511573414409751, + "loss": 0.209, + "step": 146650 + }, + { + "epoch": 6.07, + "grad_norm": 1.3984375, + "learning_rate": 0.0004511509016202737, + "loss": 0.2041, + "step": 146660 + }, + { + "epoch": 6.08, + "grad_norm": 0.30078125, + "learning_rate": 0.000451144461421028, + "loss": 0.167, + "step": 146670 + }, + { + "epoch": 6.08, + "grad_norm": 1.2109375, + "learning_rate": 0.00045113802084325016, + "loss": 0.2213, + "step": 146680 + }, + { + "epoch": 6.08, + "grad_norm": 0.6796875, + "learning_rate": 0.00045113157988695233, + "loss": 0.2059, + "step": 146690 + }, + { + "epoch": 6.08, + "grad_norm": 0.73828125, + "learning_rate": 0.00045112513855214656, + "loss": 0.1358, + "step": 146700 + }, + { + "epoch": 6.08, + "grad_norm": 0.93359375, + "learning_rate": 0.00045111869683884495, + "loss": 0.2311, + "step": 146710 + }, + { + "epoch": 6.08, + "grad_norm": 0.76953125, + "learning_rate": 0.0004511122547470597, + "loss": 0.1763, + "step": 146720 + }, + { + "epoch": 6.08, + "grad_norm": 0.51953125, + "learning_rate": 0.00045110581227680293, + "loss": 0.161, + "step": 146730 + }, + { + "epoch": 6.08, + "grad_norm": 0.60546875, + "learning_rate": 0.0004510993694280867, + "loss": 0.2181, + "step": 146740 + }, + { + "epoch": 6.08, + "grad_norm": 0.71875, + "learning_rate": 0.00045109292620092325, + "loss": 0.2045, + "step": 146750 + }, + { + "epoch": 6.08, + "grad_norm": 1.15625, + "learning_rate": 0.00045108648259532455, + "loss": 0.1924, + "step": 146760 + }, + { + "epoch": 6.08, + "grad_norm": 1.5, + "learning_rate": 0.0004510800386113028, + "loss": 0.209, + "step": 146770 + }, + { + "epoch": 6.08, + "grad_norm": 0.3359375, + "learning_rate": 0.0004510735942488702, + "loss": 0.1916, + "step": 146780 + }, + { + "epoch": 6.08, + "grad_norm": 1.3515625, + "learning_rate": 0.00045106714950803874, + "loss": 0.2382, + "step": 146790 + }, + { + "epoch": 6.08, + "grad_norm": 0.6640625, + "learning_rate": 0.00045106070438882064, + "loss": 0.1749, + "step": 146800 + }, + { + "epoch": 6.08, + "grad_norm": 0.59765625, + "learning_rate": 0.00045105425889122806, + "loss": 0.1708, + "step": 146810 + }, + { + "epoch": 6.08, + "grad_norm": 0.34375, + "learning_rate": 0.000451047813015273, + "loss": 0.2196, + "step": 146820 + }, + { + "epoch": 6.08, + "grad_norm": 0.65234375, + "learning_rate": 0.00045104136676096774, + "loss": 0.2116, + "step": 146830 + }, + { + "epoch": 6.08, + "grad_norm": 0.39453125, + "learning_rate": 0.00045103492012832423, + "loss": 0.1945, + "step": 146840 + }, + { + "epoch": 6.08, + "grad_norm": 2.1875, + "learning_rate": 0.00045102847311735483, + "loss": 0.2201, + "step": 146850 + }, + { + "epoch": 6.08, + "grad_norm": 0.65234375, + "learning_rate": 0.0004510220257280715, + "loss": 0.2064, + "step": 146860 + }, + { + "epoch": 6.08, + "grad_norm": 1.328125, + "learning_rate": 0.0004510155779604864, + "loss": 0.2082, + "step": 146870 + }, + { + "epoch": 6.08, + "grad_norm": 0.7265625, + "learning_rate": 0.00045100912981461174, + "loss": 0.2216, + "step": 146880 + }, + { + "epoch": 6.08, + "grad_norm": 0.87109375, + "learning_rate": 0.00045100268129045964, + "loss": 0.2353, + "step": 146890 + }, + { + "epoch": 6.08, + "grad_norm": 0.92578125, + "learning_rate": 0.0004509962323880421, + "loss": 0.156, + "step": 146900 + }, + { + "epoch": 6.08, + "grad_norm": 0.58984375, + "learning_rate": 0.0004509897831073714, + "loss": 0.2228, + "step": 146910 + }, + { + "epoch": 6.09, + "grad_norm": 0.79296875, + "learning_rate": 0.00045098333344845967, + "loss": 0.1932, + "step": 146920 + }, + { + "epoch": 6.09, + "grad_norm": 0.6171875, + "learning_rate": 0.00045097688341131894, + "loss": 0.2028, + "step": 146930 + }, + { + "epoch": 6.09, + "grad_norm": 0.328125, + "learning_rate": 0.00045097043299596143, + "loss": 0.1912, + "step": 146940 + }, + { + "epoch": 6.09, + "grad_norm": 0.27734375, + "learning_rate": 0.0004509639822023993, + "loss": 0.194, + "step": 146950 + }, + { + "epoch": 6.09, + "grad_norm": 0.54296875, + "learning_rate": 0.0004509575310306446, + "loss": 0.2248, + "step": 146960 + }, + { + "epoch": 6.09, + "grad_norm": 0.5703125, + "learning_rate": 0.0004509510794807096, + "loss": 0.2501, + "step": 146970 + }, + { + "epoch": 6.09, + "grad_norm": 0.6328125, + "learning_rate": 0.00045094462755260634, + "loss": 0.1792, + "step": 146980 + }, + { + "epoch": 6.09, + "grad_norm": 0.6875, + "learning_rate": 0.00045093817524634707, + "loss": 0.1523, + "step": 146990 + }, + { + "epoch": 6.09, + "grad_norm": 1.203125, + "learning_rate": 0.0004509317225619437, + "loss": 0.1988, + "step": 147000 + }, + { + "epoch": 6.09, + "grad_norm": 0.65234375, + "learning_rate": 0.00045092526949940865, + "loss": 0.2343, + "step": 147010 + }, + { + "epoch": 6.09, + "grad_norm": 0.6796875, + "learning_rate": 0.0004509188160587539, + "loss": 0.1703, + "step": 147020 + }, + { + "epoch": 6.09, + "grad_norm": 0.59375, + "learning_rate": 0.0004509123622399917, + "loss": 0.2486, + "step": 147030 + }, + { + "epoch": 6.09, + "grad_norm": 0.7890625, + "learning_rate": 0.00045090590804313405, + "loss": 0.1831, + "step": 147040 + }, + { + "epoch": 6.09, + "grad_norm": 0.63671875, + "learning_rate": 0.00045089945346819317, + "loss": 0.2141, + "step": 147050 + }, + { + "epoch": 6.09, + "grad_norm": 0.86328125, + "learning_rate": 0.0004508929985151812, + "loss": 0.2265, + "step": 147060 + }, + { + "epoch": 6.09, + "grad_norm": 0.69140625, + "learning_rate": 0.0004508865431841104, + "loss": 0.2057, + "step": 147070 + }, + { + "epoch": 6.09, + "grad_norm": 0.7890625, + "learning_rate": 0.0004508800874749927, + "loss": 0.1769, + "step": 147080 + }, + { + "epoch": 6.09, + "grad_norm": 0.7578125, + "learning_rate": 0.0004508736313878404, + "loss": 0.1694, + "step": 147090 + }, + { + "epoch": 6.09, + "grad_norm": 0.58984375, + "learning_rate": 0.00045086717492266563, + "loss": 0.1964, + "step": 147100 + }, + { + "epoch": 6.09, + "grad_norm": 0.8046875, + "learning_rate": 0.0004508607180794806, + "loss": 0.1832, + "step": 147110 + }, + { + "epoch": 6.09, + "grad_norm": 1.296875, + "learning_rate": 0.0004508542608582973, + "loss": 0.2065, + "step": 147120 + }, + { + "epoch": 6.09, + "grad_norm": 0.7578125, + "learning_rate": 0.000450847803259128, + "loss": 0.2343, + "step": 147130 + }, + { + "epoch": 6.09, + "grad_norm": 0.32421875, + "learning_rate": 0.00045084134528198486, + "loss": 0.2132, + "step": 147140 + }, + { + "epoch": 6.09, + "grad_norm": 0.466796875, + "learning_rate": 0.00045083488692688, + "loss": 0.2288, + "step": 147150 + }, + { + "epoch": 6.1, + "grad_norm": 0.388671875, + "learning_rate": 0.0004508284281938255, + "loss": 0.2086, + "step": 147160 + }, + { + "epoch": 6.1, + "grad_norm": 0.44140625, + "learning_rate": 0.00045082196908283363, + "loss": 0.2024, + "step": 147170 + }, + { + "epoch": 6.1, + "grad_norm": 2.0, + "learning_rate": 0.00045081550959391646, + "loss": 0.2009, + "step": 147180 + }, + { + "epoch": 6.1, + "grad_norm": 0.5078125, + "learning_rate": 0.00045080904972708626, + "loss": 0.1738, + "step": 147190 + }, + { + "epoch": 6.1, + "grad_norm": 0.79296875, + "learning_rate": 0.0004508025894823551, + "loss": 0.247, + "step": 147200 + }, + { + "epoch": 6.1, + "grad_norm": 0.63671875, + "learning_rate": 0.0004507961288597351, + "loss": 0.2377, + "step": 147210 + }, + { + "epoch": 6.1, + "grad_norm": 0.85546875, + "learning_rate": 0.00045078966785923845, + "loss": 0.1953, + "step": 147220 + }, + { + "epoch": 6.1, + "grad_norm": 0.69921875, + "learning_rate": 0.0004507832064808774, + "loss": 0.2449, + "step": 147230 + }, + { + "epoch": 6.1, + "grad_norm": 1.3515625, + "learning_rate": 0.000450776744724664, + "loss": 0.1654, + "step": 147240 + }, + { + "epoch": 6.1, + "grad_norm": 0.80859375, + "learning_rate": 0.0004507702825906105, + "loss": 0.228, + "step": 147250 + }, + { + "epoch": 6.1, + "grad_norm": 1.21875, + "learning_rate": 0.00045076382007872896, + "loss": 0.1511, + "step": 147260 + }, + { + "epoch": 6.1, + "grad_norm": 0.69921875, + "learning_rate": 0.00045075735718903153, + "loss": 0.1935, + "step": 147270 + }, + { + "epoch": 6.1, + "grad_norm": 0.412109375, + "learning_rate": 0.0004507508939215306, + "loss": 0.1922, + "step": 147280 + }, + { + "epoch": 6.1, + "grad_norm": 0.57421875, + "learning_rate": 0.000450744430276238, + "loss": 0.2019, + "step": 147290 + }, + { + "epoch": 6.1, + "grad_norm": 0.2294921875, + "learning_rate": 0.0004507379662531662, + "loss": 0.206, + "step": 147300 + }, + { + "epoch": 6.1, + "grad_norm": 0.75390625, + "learning_rate": 0.00045073150185232706, + "loss": 0.186, + "step": 147310 + }, + { + "epoch": 6.1, + "grad_norm": 0.484375, + "learning_rate": 0.00045072503707373305, + "loss": 0.2382, + "step": 147320 + }, + { + "epoch": 6.1, + "grad_norm": 0.84375, + "learning_rate": 0.00045071857191739616, + "loss": 0.226, + "step": 147330 + }, + { + "epoch": 6.1, + "grad_norm": 0.4140625, + "learning_rate": 0.0004507121063833286, + "loss": 0.1746, + "step": 147340 + }, + { + "epoch": 6.1, + "grad_norm": 0.7890625, + "learning_rate": 0.0004507056404715425, + "loss": 0.1665, + "step": 147350 + }, + { + "epoch": 6.1, + "grad_norm": 0.54296875, + "learning_rate": 0.00045069917418205007, + "loss": 0.2413, + "step": 147360 + }, + { + "epoch": 6.1, + "grad_norm": 0.345703125, + "learning_rate": 0.00045069270751486347, + "loss": 0.222, + "step": 147370 + }, + { + "epoch": 6.1, + "grad_norm": 1.5, + "learning_rate": 0.0004506862404699949, + "loss": 0.1822, + "step": 147380 + }, + { + "epoch": 6.1, + "grad_norm": 0.74609375, + "learning_rate": 0.00045067977304745645, + "loss": 0.2626, + "step": 147390 + }, + { + "epoch": 6.11, + "grad_norm": 1.078125, + "learning_rate": 0.00045067330524726037, + "loss": 0.2078, + "step": 147400 + }, + { + "epoch": 6.11, + "grad_norm": 0.490234375, + "learning_rate": 0.0004506668370694188, + "loss": 0.1752, + "step": 147410 + }, + { + "epoch": 6.11, + "grad_norm": 0.37890625, + "learning_rate": 0.0004506603685139439, + "loss": 0.2053, + "step": 147420 + }, + { + "epoch": 6.11, + "grad_norm": 0.373046875, + "learning_rate": 0.0004506538995808478, + "loss": 0.2109, + "step": 147430 + }, + { + "epoch": 6.11, + "grad_norm": 0.61328125, + "learning_rate": 0.00045064743027014285, + "loss": 0.2506, + "step": 147440 + }, + { + "epoch": 6.11, + "grad_norm": 0.8359375, + "learning_rate": 0.000450640960581841, + "loss": 0.2065, + "step": 147450 + }, + { + "epoch": 6.11, + "grad_norm": 0.94921875, + "learning_rate": 0.0004506344905159546, + "loss": 0.2131, + "step": 147460 + }, + { + "epoch": 6.11, + "grad_norm": 0.56640625, + "learning_rate": 0.0004506280200724957, + "loss": 0.1806, + "step": 147470 + }, + { + "epoch": 6.11, + "grad_norm": 0.83203125, + "learning_rate": 0.00045062154925147656, + "loss": 0.2074, + "step": 147480 + }, + { + "epoch": 6.11, + "grad_norm": 0.470703125, + "learning_rate": 0.00045061507805290937, + "loss": 0.2024, + "step": 147490 + }, + { + "epoch": 6.11, + "grad_norm": 1.109375, + "learning_rate": 0.00045060860647680624, + "loss": 0.2016, + "step": 147500 + }, + { + "epoch": 6.11, + "grad_norm": 0.859375, + "learning_rate": 0.0004506021345231793, + "loss": 0.1863, + "step": 147510 + }, + { + "epoch": 6.11, + "grad_norm": 0.55859375, + "learning_rate": 0.0004505956621920409, + "loss": 0.2425, + "step": 147520 + }, + { + "epoch": 6.11, + "grad_norm": 0.17578125, + "learning_rate": 0.0004505891894834031, + "loss": 0.2081, + "step": 147530 + }, + { + "epoch": 6.11, + "grad_norm": 0.76953125, + "learning_rate": 0.0004505827163972781, + "loss": 0.1794, + "step": 147540 + }, + { + "epoch": 6.11, + "grad_norm": 0.8046875, + "learning_rate": 0.0004505762429336781, + "loss": 0.2116, + "step": 147550 + }, + { + "epoch": 6.11, + "grad_norm": 0.71484375, + "learning_rate": 0.0004505697690926153, + "loss": 0.2255, + "step": 147560 + }, + { + "epoch": 6.11, + "grad_norm": 0.0, + "learning_rate": 0.00045056329487410177, + "loss": 0.2028, + "step": 147570 + }, + { + "epoch": 6.11, + "grad_norm": 2.015625, + "learning_rate": 0.00045055682027814983, + "loss": 0.2194, + "step": 147580 + }, + { + "epoch": 6.11, + "grad_norm": 0.212890625, + "learning_rate": 0.00045055034530477157, + "loss": 0.1672, + "step": 147590 + }, + { + "epoch": 6.11, + "grad_norm": 0.63671875, + "learning_rate": 0.00045054386995397927, + "loss": 0.2457, + "step": 147600 + }, + { + "epoch": 6.11, + "grad_norm": 0.703125, + "learning_rate": 0.0004505373942257851, + "loss": 0.2049, + "step": 147610 + }, + { + "epoch": 6.11, + "grad_norm": 1.625, + "learning_rate": 0.00045053091812020116, + "loss": 0.1573, + "step": 147620 + }, + { + "epoch": 6.11, + "grad_norm": 1.578125, + "learning_rate": 0.00045052444163723964, + "loss": 0.2616, + "step": 147630 + }, + { + "epoch": 6.12, + "grad_norm": 0.76171875, + "learning_rate": 0.0004505179647769128, + "loss": 0.1705, + "step": 147640 + }, + { + "epoch": 6.12, + "grad_norm": 0.447265625, + "learning_rate": 0.0004505114875392329, + "loss": 0.2111, + "step": 147650 + }, + { + "epoch": 6.12, + "grad_norm": 0.5703125, + "learning_rate": 0.000450505009924212, + "loss": 0.2262, + "step": 147660 + }, + { + "epoch": 6.12, + "grad_norm": 0.44140625, + "learning_rate": 0.0004504985319318622, + "loss": 0.1722, + "step": 147670 + }, + { + "epoch": 6.12, + "grad_norm": 1.5390625, + "learning_rate": 0.0004504920535621959, + "loss": 0.2367, + "step": 147680 + }, + { + "epoch": 6.12, + "grad_norm": 0.78515625, + "learning_rate": 0.00045048557481522524, + "loss": 0.2184, + "step": 147690 + }, + { + "epoch": 6.12, + "grad_norm": 0.462890625, + "learning_rate": 0.00045047909569096236, + "loss": 0.2037, + "step": 147700 + }, + { + "epoch": 6.12, + "grad_norm": 0.81640625, + "learning_rate": 0.00045047261618941936, + "loss": 0.2326, + "step": 147710 + }, + { + "epoch": 6.12, + "grad_norm": 1.0625, + "learning_rate": 0.0004504661363106087, + "loss": 0.1741, + "step": 147720 + }, + { + "epoch": 6.12, + "grad_norm": 0.58984375, + "learning_rate": 0.00045045965605454235, + "loss": 0.2363, + "step": 147730 + }, + { + "epoch": 6.12, + "grad_norm": 1.03125, + "learning_rate": 0.00045045317542123257, + "loss": 0.2016, + "step": 147740 + }, + { + "epoch": 6.12, + "grad_norm": 0.482421875, + "learning_rate": 0.0004504466944106916, + "loss": 0.2139, + "step": 147750 + }, + { + "epoch": 6.12, + "grad_norm": 0.5859375, + "learning_rate": 0.0004504402130229316, + "loss": 0.1834, + "step": 147760 + }, + { + "epoch": 6.12, + "grad_norm": 0.62109375, + "learning_rate": 0.0004504337312579647, + "loss": 0.2173, + "step": 147770 + }, + { + "epoch": 6.12, + "grad_norm": 0.97265625, + "learning_rate": 0.0004504272491158032, + "loss": 0.2528, + "step": 147780 + }, + { + "epoch": 6.12, + "grad_norm": 0.5546875, + "learning_rate": 0.00045042076659645926, + "loss": 0.1975, + "step": 147790 + }, + { + "epoch": 6.12, + "grad_norm": 0.71875, + "learning_rate": 0.0004504142836999451, + "loss": 0.2097, + "step": 147800 + }, + { + "epoch": 6.12, + "grad_norm": 0.53515625, + "learning_rate": 0.0004504078004262729, + "loss": 0.1629, + "step": 147810 + }, + { + "epoch": 6.12, + "grad_norm": 0.5078125, + "learning_rate": 0.00045040131677545484, + "loss": 0.2203, + "step": 147820 + }, + { + "epoch": 6.12, + "grad_norm": 0.435546875, + "learning_rate": 0.00045039483274750313, + "loss": 0.1937, + "step": 147830 + }, + { + "epoch": 6.12, + "grad_norm": 0.341796875, + "learning_rate": 0.00045038834834243, + "loss": 0.2281, + "step": 147840 + }, + { + "epoch": 6.12, + "grad_norm": 0.87109375, + "learning_rate": 0.0004503818635602477, + "loss": 0.2138, + "step": 147850 + }, + { + "epoch": 6.12, + "grad_norm": 0.62890625, + "learning_rate": 0.0004503753784009683, + "loss": 0.2326, + "step": 147860 + }, + { + "epoch": 6.12, + "grad_norm": 0.91796875, + "learning_rate": 0.00045036889286460406, + "loss": 0.2078, + "step": 147870 + }, + { + "epoch": 6.13, + "grad_norm": 0.7734375, + "learning_rate": 0.00045036240695116726, + "loss": 0.2287, + "step": 147880 + }, + { + "epoch": 6.13, + "grad_norm": 0.765625, + "learning_rate": 0.00045035592066067, + "loss": 0.1844, + "step": 147890 + }, + { + "epoch": 6.13, + "grad_norm": 0.416015625, + "learning_rate": 0.00045034943399312454, + "loss": 0.1806, + "step": 147900 + }, + { + "epoch": 6.13, + "grad_norm": 0.60546875, + "learning_rate": 0.0004503429469485432, + "loss": 0.2097, + "step": 147910 + }, + { + "epoch": 6.13, + "grad_norm": 0.7421875, + "learning_rate": 0.0004503364595269379, + "loss": 0.1441, + "step": 147920 + }, + { + "epoch": 6.13, + "grad_norm": 0.7265625, + "learning_rate": 0.0004503299717283211, + "loss": 0.218, + "step": 147930 + }, + { + "epoch": 6.13, + "grad_norm": 0.890625, + "learning_rate": 0.0004503234835527049, + "loss": 0.2218, + "step": 147940 + }, + { + "epoch": 6.13, + "grad_norm": 1.125, + "learning_rate": 0.00045031699500010155, + "loss": 0.2079, + "step": 147950 + }, + { + "epoch": 6.13, + "grad_norm": 0.345703125, + "learning_rate": 0.00045031050607052326, + "loss": 0.2087, + "step": 147960 + }, + { + "epoch": 6.13, + "grad_norm": 0.6796875, + "learning_rate": 0.0004503040167639822, + "loss": 0.1567, + "step": 147970 + }, + { + "epoch": 6.13, + "grad_norm": 0.55078125, + "learning_rate": 0.0004502975270804906, + "loss": 0.2003, + "step": 147980 + }, + { + "epoch": 6.13, + "grad_norm": 0.5234375, + "learning_rate": 0.0004502910370200607, + "loss": 0.2071, + "step": 147990 + }, + { + "epoch": 6.13, + "grad_norm": 0.5703125, + "learning_rate": 0.00045028454658270473, + "loss": 0.1915, + "step": 148000 + }, + { + "epoch": 6.13, + "grad_norm": 1.1171875, + "learning_rate": 0.0004502780557684348, + "loss": 0.1965, + "step": 148010 + }, + { + "epoch": 6.13, + "grad_norm": 0.6640625, + "learning_rate": 0.0004502715645772633, + "loss": 0.1972, + "step": 148020 + }, + { + "epoch": 6.13, + "grad_norm": 0.3984375, + "learning_rate": 0.0004502650730092023, + "loss": 0.1811, + "step": 148030 + }, + { + "epoch": 6.13, + "grad_norm": 1.078125, + "learning_rate": 0.000450258581064264, + "loss": 0.2526, + "step": 148040 + }, + { + "epoch": 6.13, + "grad_norm": 0.5234375, + "learning_rate": 0.00045025208874246075, + "loss": 0.1639, + "step": 148050 + }, + { + "epoch": 6.13, + "grad_norm": 0.6796875, + "learning_rate": 0.00045024559604380464, + "loss": 0.2595, + "step": 148060 + }, + { + "epoch": 6.13, + "grad_norm": 0.79296875, + "learning_rate": 0.000450239102968308, + "loss": 0.2274, + "step": 148070 + }, + { + "epoch": 6.13, + "grad_norm": 0.65234375, + "learning_rate": 0.000450232609515983, + "loss": 0.2289, + "step": 148080 + }, + { + "epoch": 6.13, + "grad_norm": 0.2578125, + "learning_rate": 0.0004502261156868418, + "loss": 0.1981, + "step": 148090 + }, + { + "epoch": 6.13, + "grad_norm": 1.140625, + "learning_rate": 0.00045021962148089666, + "loss": 0.1797, + "step": 148100 + }, + { + "epoch": 6.13, + "grad_norm": 0.640625, + "learning_rate": 0.0004502131268981599, + "loss": 0.1691, + "step": 148110 + }, + { + "epoch": 6.14, + "grad_norm": 0.421875, + "learning_rate": 0.0004502066319386436, + "loss": 0.2056, + "step": 148120 + }, + { + "epoch": 6.14, + "grad_norm": 1.65625, + "learning_rate": 0.00045020013660236006, + "loss": 0.2052, + "step": 148130 + }, + { + "epoch": 6.14, + "grad_norm": 0.73046875, + "learning_rate": 0.0004501936408893215, + "loss": 0.2523, + "step": 148140 + }, + { + "epoch": 6.14, + "grad_norm": 0.5, + "learning_rate": 0.00045018714479954004, + "loss": 0.193, + "step": 148150 + }, + { + "epoch": 6.14, + "grad_norm": 0.57421875, + "learning_rate": 0.000450180648333028, + "loss": 0.201, + "step": 148160 + }, + { + "epoch": 6.14, + "grad_norm": 1.4765625, + "learning_rate": 0.00045017415148979767, + "loss": 0.1983, + "step": 148170 + }, + { + "epoch": 6.14, + "grad_norm": 0.78515625, + "learning_rate": 0.0004501676542698612, + "loss": 0.2127, + "step": 148180 + }, + { + "epoch": 6.14, + "grad_norm": 0.6328125, + "learning_rate": 0.0004501611566732308, + "loss": 0.2018, + "step": 148190 + }, + { + "epoch": 6.14, + "grad_norm": 1.140625, + "learning_rate": 0.0004501546586999187, + "loss": 0.1963, + "step": 148200 + }, + { + "epoch": 6.14, + "grad_norm": 0.3671875, + "learning_rate": 0.0004501481603499371, + "loss": 0.2085, + "step": 148210 + }, + { + "epoch": 6.14, + "grad_norm": 0.50390625, + "learning_rate": 0.0004501416616232984, + "loss": 0.1512, + "step": 148220 + }, + { + "epoch": 6.14, + "grad_norm": 0.5078125, + "learning_rate": 0.00045013516252001464, + "loss": 0.1924, + "step": 148230 + }, + { + "epoch": 6.14, + "grad_norm": 0.8671875, + "learning_rate": 0.0004501286630400981, + "loss": 0.2402, + "step": 148240 + }, + { + "epoch": 6.14, + "grad_norm": 0.5, + "learning_rate": 0.000450122163183561, + "loss": 0.238, + "step": 148250 + }, + { + "epoch": 6.14, + "grad_norm": 0.9921875, + "learning_rate": 0.0004501156629504157, + "loss": 0.2108, + "step": 148260 + }, + { + "epoch": 6.14, + "grad_norm": 0.498046875, + "learning_rate": 0.00045010916234067424, + "loss": 0.2448, + "step": 148270 + }, + { + "epoch": 6.14, + "grad_norm": 0.921875, + "learning_rate": 0.00045010266135434897, + "loss": 0.2114, + "step": 148280 + }, + { + "epoch": 6.14, + "grad_norm": 0.765625, + "learning_rate": 0.00045009615999145214, + "loss": 0.2154, + "step": 148290 + }, + { + "epoch": 6.14, + "grad_norm": 0.8203125, + "learning_rate": 0.0004500896582519959, + "loss": 0.2113, + "step": 148300 + }, + { + "epoch": 6.14, + "grad_norm": 0.474609375, + "learning_rate": 0.0004500831561359926, + "loss": 0.2018, + "step": 148310 + }, + { + "epoch": 6.14, + "grad_norm": 0.5625, + "learning_rate": 0.00045007665364345435, + "loss": 0.1839, + "step": 148320 + }, + { + "epoch": 6.14, + "grad_norm": 0.86328125, + "learning_rate": 0.00045007015077439346, + "loss": 0.1994, + "step": 148330 + }, + { + "epoch": 6.14, + "grad_norm": 0.55078125, + "learning_rate": 0.0004500636475288222, + "loss": 0.2139, + "step": 148340 + }, + { + "epoch": 6.14, + "grad_norm": 1.203125, + "learning_rate": 0.0004500571439067527, + "loss": 0.2179, + "step": 148350 + }, + { + "epoch": 6.15, + "grad_norm": 0.0, + "learning_rate": 0.00045005063990819725, + "loss": 0.2079, + "step": 148360 + }, + { + "epoch": 6.15, + "grad_norm": 1.234375, + "learning_rate": 0.0004500441355331681, + "loss": 0.1896, + "step": 148370 + }, + { + "epoch": 6.15, + "grad_norm": 0.73046875, + "learning_rate": 0.00045003763078167746, + "loss": 0.2322, + "step": 148380 + }, + { + "epoch": 6.15, + "grad_norm": 0.73046875, + "learning_rate": 0.00045003112565373773, + "loss": 0.215, + "step": 148390 + }, + { + "epoch": 6.15, + "grad_norm": 0.416015625, + "learning_rate": 0.00045002462014936083, + "loss": 0.2037, + "step": 148400 + }, + { + "epoch": 6.15, + "grad_norm": 0.75390625, + "learning_rate": 0.00045001811426855935, + "loss": 0.1801, + "step": 148410 + }, + { + "epoch": 6.15, + "grad_norm": 0.578125, + "learning_rate": 0.0004500116080113453, + "loss": 0.2011, + "step": 148420 + }, + { + "epoch": 6.15, + "grad_norm": 2.3125, + "learning_rate": 0.0004500051013777311, + "loss": 0.162, + "step": 148430 + }, + { + "epoch": 6.15, + "grad_norm": 0.6328125, + "learning_rate": 0.0004499985943677288, + "loss": 0.242, + "step": 148440 + }, + { + "epoch": 6.15, + "grad_norm": 0.8359375, + "learning_rate": 0.00044999208698135086, + "loss": 0.2214, + "step": 148450 + }, + { + "epoch": 6.15, + "grad_norm": 0.72265625, + "learning_rate": 0.0004499855792186093, + "loss": 0.1997, + "step": 148460 + }, + { + "epoch": 6.15, + "grad_norm": 1.1015625, + "learning_rate": 0.0004499790710795165, + "loss": 0.1515, + "step": 148470 + }, + { + "epoch": 6.15, + "grad_norm": 1.2109375, + "learning_rate": 0.0004499725625640847, + "loss": 0.2126, + "step": 148480 + }, + { + "epoch": 6.15, + "grad_norm": 1.453125, + "learning_rate": 0.0004499660536723261, + "loss": 0.2064, + "step": 148490 + }, + { + "epoch": 6.15, + "grad_norm": 0.97265625, + "learning_rate": 0.000449959544404253, + "loss": 0.2425, + "step": 148500 + }, + { + "epoch": 6.15, + "grad_norm": 2.0625, + "learning_rate": 0.00044995303475987765, + "loss": 0.2209, + "step": 148510 + }, + { + "epoch": 6.15, + "grad_norm": 0.58984375, + "learning_rate": 0.00044994652473921226, + "loss": 0.1575, + "step": 148520 + }, + { + "epoch": 6.15, + "grad_norm": 1.484375, + "learning_rate": 0.0004499400143422691, + "loss": 0.2126, + "step": 148530 + }, + { + "epoch": 6.15, + "grad_norm": 1.46875, + "learning_rate": 0.0004499335035690605, + "loss": 0.276, + "step": 148540 + }, + { + "epoch": 6.15, + "grad_norm": 0.83203125, + "learning_rate": 0.00044992699241959856, + "loss": 0.2298, + "step": 148550 + }, + { + "epoch": 6.15, + "grad_norm": 1.625, + "learning_rate": 0.0004499204808938956, + "loss": 0.2053, + "step": 148560 + }, + { + "epoch": 6.15, + "grad_norm": 0.9296875, + "learning_rate": 0.0004499139689919639, + "loss": 0.1732, + "step": 148570 + }, + { + "epoch": 6.15, + "grad_norm": 0.76953125, + "learning_rate": 0.00044990745671381573, + "loss": 0.2549, + "step": 148580 + }, + { + "epoch": 6.15, + "grad_norm": 0.8984375, + "learning_rate": 0.00044990094405946324, + "loss": 0.2059, + "step": 148590 + }, + { + "epoch": 6.15, + "grad_norm": 0.8125, + "learning_rate": 0.00044989443102891883, + "loss": 0.1765, + "step": 148600 + }, + { + "epoch": 6.16, + "grad_norm": 0.55078125, + "learning_rate": 0.00044988791762219464, + "loss": 0.1859, + "step": 148610 + }, + { + "epoch": 6.16, + "grad_norm": 0.404296875, + "learning_rate": 0.000449881403839303, + "loss": 0.171, + "step": 148620 + }, + { + "epoch": 6.16, + "grad_norm": 0.357421875, + "learning_rate": 0.0004498748896802561, + "loss": 0.2494, + "step": 148630 + }, + { + "epoch": 6.16, + "grad_norm": 0.8203125, + "learning_rate": 0.00044986837514506635, + "loss": 0.2173, + "step": 148640 + }, + { + "epoch": 6.16, + "grad_norm": 0.59375, + "learning_rate": 0.00044986186023374586, + "loss": 0.2223, + "step": 148650 + }, + { + "epoch": 6.16, + "grad_norm": 1.0, + "learning_rate": 0.00044985534494630685, + "loss": 0.1938, + "step": 148660 + }, + { + "epoch": 6.16, + "grad_norm": 0.2412109375, + "learning_rate": 0.0004498488292827618, + "loss": 0.2349, + "step": 148670 + }, + { + "epoch": 6.16, + "grad_norm": 0.55078125, + "learning_rate": 0.00044984231324312273, + "loss": 0.1754, + "step": 148680 + }, + { + "epoch": 6.16, + "grad_norm": 0.62890625, + "learning_rate": 0.00044983579682740207, + "loss": 0.1642, + "step": 148690 + }, + { + "epoch": 6.16, + "grad_norm": 1.0390625, + "learning_rate": 0.00044982928003561195, + "loss": 0.2006, + "step": 148700 + }, + { + "epoch": 6.16, + "grad_norm": 0.8046875, + "learning_rate": 0.0004498227628677648, + "loss": 0.2203, + "step": 148710 + }, + { + "epoch": 6.16, + "grad_norm": 1.015625, + "learning_rate": 0.00044981624532387267, + "loss": 0.2443, + "step": 148720 + }, + { + "epoch": 6.16, + "grad_norm": 0.51953125, + "learning_rate": 0.000449809727403948, + "loss": 0.2436, + "step": 148730 + }, + { + "epoch": 6.16, + "grad_norm": 1.53125, + "learning_rate": 0.000449803209108003, + "loss": 0.2559, + "step": 148740 + }, + { + "epoch": 6.16, + "grad_norm": 1.0546875, + "learning_rate": 0.00044979669043605, + "loss": 0.2694, + "step": 148750 + }, + { + "epoch": 6.16, + "grad_norm": 0.73828125, + "learning_rate": 0.00044979017138810113, + "loss": 0.2072, + "step": 148760 + }, + { + "epoch": 6.16, + "grad_norm": 0.50390625, + "learning_rate": 0.00044978365196416884, + "loss": 0.2046, + "step": 148770 + }, + { + "epoch": 6.16, + "grad_norm": 1.3515625, + "learning_rate": 0.00044977713216426515, + "loss": 0.2192, + "step": 148780 + }, + { + "epoch": 6.16, + "grad_norm": 0.89453125, + "learning_rate": 0.0004497706119884025, + "loss": 0.2412, + "step": 148790 + }, + { + "epoch": 6.16, + "grad_norm": 0.68359375, + "learning_rate": 0.0004497640914365931, + "loss": 0.19, + "step": 148800 + }, + { + "epoch": 6.16, + "grad_norm": 0.890625, + "learning_rate": 0.0004497575705088494, + "loss": 0.183, + "step": 148810 + }, + { + "epoch": 6.16, + "grad_norm": 1.46875, + "learning_rate": 0.0004497510492051834, + "loss": 0.2381, + "step": 148820 + }, + { + "epoch": 6.16, + "grad_norm": 0.28125, + "learning_rate": 0.0004497445275256076, + "loss": 0.1663, + "step": 148830 + }, + { + "epoch": 6.16, + "grad_norm": 0.3515625, + "learning_rate": 0.00044973800547013404, + "loss": 0.219, + "step": 148840 + }, + { + "epoch": 6.17, + "grad_norm": 0.6796875, + "learning_rate": 0.0004497314830387752, + "loss": 0.1791, + "step": 148850 + }, + { + "epoch": 6.17, + "grad_norm": 0.80078125, + "learning_rate": 0.0004497249602315433, + "loss": 0.1978, + "step": 148860 + }, + { + "epoch": 6.17, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004497184370484505, + "loss": 0.1851, + "step": 148870 + }, + { + "epoch": 6.17, + "grad_norm": 0.265625, + "learning_rate": 0.00044971191348950923, + "loss": 0.203, + "step": 148880 + }, + { + "epoch": 6.17, + "grad_norm": 0.2109375, + "learning_rate": 0.00044970538955473174, + "loss": 0.1828, + "step": 148890 + }, + { + "epoch": 6.17, + "grad_norm": 0.78125, + "learning_rate": 0.00044969886524413025, + "loss": 0.1747, + "step": 148900 + }, + { + "epoch": 6.17, + "grad_norm": 0.384765625, + "learning_rate": 0.000449692340557717, + "loss": 0.2232, + "step": 148910 + }, + { + "epoch": 6.17, + "grad_norm": 1.3984375, + "learning_rate": 0.00044968581549550436, + "loss": 0.2051, + "step": 148920 + }, + { + "epoch": 6.17, + "grad_norm": 0.98828125, + "learning_rate": 0.00044967929005750463, + "loss": 0.217, + "step": 148930 + }, + { + "epoch": 6.17, + "grad_norm": 0.5703125, + "learning_rate": 0.00044967276424373, + "loss": 0.2474, + "step": 148940 + }, + { + "epoch": 6.17, + "grad_norm": 0.3515625, + "learning_rate": 0.00044966623805419283, + "loss": 0.165, + "step": 148950 + }, + { + "epoch": 6.17, + "grad_norm": 0.6953125, + "learning_rate": 0.00044965971148890527, + "loss": 0.2385, + "step": 148960 + }, + { + "epoch": 6.17, + "grad_norm": 0.0, + "learning_rate": 0.0004496531845478798, + "loss": 0.2195, + "step": 148970 + }, + { + "epoch": 6.17, + "grad_norm": 1.4453125, + "learning_rate": 0.0004496466572311285, + "loss": 0.2734, + "step": 148980 + }, + { + "epoch": 6.17, + "grad_norm": 0.306640625, + "learning_rate": 0.0004496401295386638, + "loss": 0.2376, + "step": 148990 + }, + { + "epoch": 6.17, + "grad_norm": 0.73828125, + "learning_rate": 0.00044963360147049793, + "loss": 0.2008, + "step": 149000 + }, + { + "epoch": 6.17, + "grad_norm": 0.55078125, + "learning_rate": 0.00044962707302664324, + "loss": 0.2096, + "step": 149010 + }, + { + "epoch": 6.17, + "grad_norm": 0.484375, + "learning_rate": 0.00044962054420711185, + "loss": 0.1943, + "step": 149020 + }, + { + "epoch": 6.17, + "grad_norm": 0.609375, + "learning_rate": 0.0004496140150119162, + "loss": 0.2216, + "step": 149030 + }, + { + "epoch": 6.17, + "grad_norm": 0.25, + "learning_rate": 0.00044960748544106856, + "loss": 0.2002, + "step": 149040 + }, + { + "epoch": 6.17, + "grad_norm": 0.91015625, + "learning_rate": 0.00044960095549458117, + "loss": 0.2189, + "step": 149050 + }, + { + "epoch": 6.17, + "grad_norm": 0.578125, + "learning_rate": 0.00044959442517246634, + "loss": 0.2137, + "step": 149060 + }, + { + "epoch": 6.17, + "grad_norm": 0.5, + "learning_rate": 0.00044958789447473635, + "loss": 0.2531, + "step": 149070 + }, + { + "epoch": 6.17, + "grad_norm": 0.294921875, + "learning_rate": 0.0004495813634014035, + "loss": 0.211, + "step": 149080 + }, + { + "epoch": 6.18, + "grad_norm": 0.21484375, + "learning_rate": 0.00044957483195248005, + "loss": 0.1658, + "step": 149090 + }, + { + "epoch": 6.18, + "grad_norm": 0.41796875, + "learning_rate": 0.0004495683001279783, + "loss": 0.1827, + "step": 149100 + }, + { + "epoch": 6.18, + "grad_norm": 0.70703125, + "learning_rate": 0.0004495617679279107, + "loss": 0.2045, + "step": 149110 + }, + { + "epoch": 6.18, + "grad_norm": 2.71875, + "learning_rate": 0.0004495552353522893, + "loss": 0.2141, + "step": 149120 + }, + { + "epoch": 6.18, + "grad_norm": 0.451171875, + "learning_rate": 0.00044954870240112644, + "loss": 0.2016, + "step": 149130 + }, + { + "epoch": 6.18, + "grad_norm": 1.3828125, + "learning_rate": 0.00044954216907443456, + "loss": 0.1599, + "step": 149140 + }, + { + "epoch": 6.18, + "grad_norm": 1.0, + "learning_rate": 0.00044953563537222585, + "loss": 0.2113, + "step": 149150 + }, + { + "epoch": 6.18, + "grad_norm": 0.68359375, + "learning_rate": 0.0004495291012945126, + "loss": 0.2359, + "step": 149160 + }, + { + "epoch": 6.18, + "grad_norm": 0.45703125, + "learning_rate": 0.0004495225668413071, + "loss": 0.2168, + "step": 149170 + }, + { + "epoch": 6.18, + "grad_norm": 0.515625, + "learning_rate": 0.00044951603201262173, + "loss": 0.1872, + "step": 149180 + }, + { + "epoch": 6.18, + "grad_norm": 0.34375, + "learning_rate": 0.0004495094968084688, + "loss": 0.1688, + "step": 149190 + }, + { + "epoch": 6.18, + "grad_norm": 0.61328125, + "learning_rate": 0.00044950296122886035, + "loss": 0.2167, + "step": 149200 + }, + { + "epoch": 6.18, + "grad_norm": 1.0, + "learning_rate": 0.00044949642527380903, + "loss": 0.1843, + "step": 149210 + }, + { + "epoch": 6.18, + "grad_norm": 0.8984375, + "learning_rate": 0.00044948988894332695, + "loss": 0.2336, + "step": 149220 + }, + { + "epoch": 6.18, + "grad_norm": 0.5546875, + "learning_rate": 0.00044948335223742643, + "loss": 0.1766, + "step": 149230 + }, + { + "epoch": 6.18, + "grad_norm": 0.609375, + "learning_rate": 0.0004494768151561197, + "loss": 0.1688, + "step": 149240 + }, + { + "epoch": 6.18, + "grad_norm": 0.83203125, + "learning_rate": 0.0004494702776994193, + "loss": 0.2094, + "step": 149250 + }, + { + "epoch": 6.18, + "grad_norm": 0.890625, + "learning_rate": 0.0004494637398673373, + "loss": 0.2118, + "step": 149260 + }, + { + "epoch": 6.18, + "grad_norm": 0.546875, + "learning_rate": 0.00044945720165988606, + "loss": 0.1921, + "step": 149270 + }, + { + "epoch": 6.18, + "grad_norm": 0.4375, + "learning_rate": 0.00044945066307707795, + "loss": 0.1655, + "step": 149280 + }, + { + "epoch": 6.18, + "grad_norm": 0.77734375, + "learning_rate": 0.0004494441241189252, + "loss": 0.1793, + "step": 149290 + }, + { + "epoch": 6.18, + "grad_norm": 0.470703125, + "learning_rate": 0.0004494375847854402, + "loss": 0.1969, + "step": 149300 + }, + { + "epoch": 6.18, + "grad_norm": 1.0078125, + "learning_rate": 0.0004494310450766351, + "loss": 0.1864, + "step": 149310 + }, + { + "epoch": 6.18, + "grad_norm": 0.37890625, + "learning_rate": 0.00044942450499252243, + "loss": 0.1848, + "step": 149320 + }, + { + "epoch": 6.19, + "grad_norm": 0.640625, + "learning_rate": 0.00044941796453311426, + "loss": 0.2255, + "step": 149330 + }, + { + "epoch": 6.19, + "grad_norm": 0.380859375, + "learning_rate": 0.00044941142369842306, + "loss": 0.1799, + "step": 149340 + }, + { + "epoch": 6.19, + "grad_norm": 0.6484375, + "learning_rate": 0.00044940488248846114, + "loss": 0.1835, + "step": 149350 + }, + { + "epoch": 6.19, + "grad_norm": 0.478515625, + "learning_rate": 0.00044939834090324075, + "loss": 0.2181, + "step": 149360 + }, + { + "epoch": 6.19, + "grad_norm": 0.96484375, + "learning_rate": 0.00044939179894277413, + "loss": 0.2384, + "step": 149370 + }, + { + "epoch": 6.19, + "grad_norm": 0.7578125, + "learning_rate": 0.0004493852566070738, + "loss": 0.1819, + "step": 149380 + }, + { + "epoch": 6.19, + "grad_norm": 1.09375, + "learning_rate": 0.00044937871389615194, + "loss": 0.1857, + "step": 149390 + }, + { + "epoch": 6.19, + "grad_norm": 0.470703125, + "learning_rate": 0.0004493721708100208, + "loss": 0.2034, + "step": 149400 + }, + { + "epoch": 6.19, + "grad_norm": 2.28125, + "learning_rate": 0.0004493656273486928, + "loss": 0.2274, + "step": 149410 + }, + { + "epoch": 6.19, + "grad_norm": 0.56640625, + "learning_rate": 0.00044935908351218017, + "loss": 0.2297, + "step": 149420 + }, + { + "epoch": 6.19, + "grad_norm": 1.984375, + "learning_rate": 0.00044935253930049535, + "loss": 0.1842, + "step": 149430 + }, + { + "epoch": 6.19, + "grad_norm": 0.283203125, + "learning_rate": 0.00044934599471365056, + "loss": 0.1599, + "step": 149440 + }, + { + "epoch": 6.19, + "grad_norm": 1.9140625, + "learning_rate": 0.0004493394497516581, + "loss": 0.197, + "step": 149450 + }, + { + "epoch": 6.19, + "grad_norm": 1.25, + "learning_rate": 0.00044933290441453034, + "loss": 0.2319, + "step": 149460 + }, + { + "epoch": 6.19, + "grad_norm": 0.47265625, + "learning_rate": 0.0004493263587022796, + "loss": 0.2096, + "step": 149470 + }, + { + "epoch": 6.19, + "grad_norm": 0.93359375, + "learning_rate": 0.00044931981261491817, + "loss": 0.1939, + "step": 149480 + }, + { + "epoch": 6.19, + "grad_norm": 1.1875, + "learning_rate": 0.0004493132661524584, + "loss": 0.193, + "step": 149490 + }, + { + "epoch": 6.19, + "grad_norm": 1.09375, + "learning_rate": 0.0004493067193149125, + "loss": 0.2328, + "step": 149500 + }, + { + "epoch": 6.19, + "grad_norm": 0.51953125, + "learning_rate": 0.0004493001721022929, + "loss": 0.2321, + "step": 149510 + }, + { + "epoch": 6.19, + "grad_norm": 1.8359375, + "learning_rate": 0.000449293624514612, + "loss": 0.2116, + "step": 149520 + }, + { + "epoch": 6.19, + "grad_norm": 0.81640625, + "learning_rate": 0.00044928707655188195, + "loss": 0.2278, + "step": 149530 + }, + { + "epoch": 6.19, + "grad_norm": 0.51953125, + "learning_rate": 0.0004492805282141151, + "loss": 0.1852, + "step": 149540 + }, + { + "epoch": 6.19, + "grad_norm": 0.412109375, + "learning_rate": 0.0004492739795013239, + "loss": 0.143, + "step": 149550 + }, + { + "epoch": 6.19, + "grad_norm": 0.3984375, + "learning_rate": 0.00044926743041352046, + "loss": 0.2115, + "step": 149560 + }, + { + "epoch": 6.2, + "grad_norm": 1.1796875, + "learning_rate": 0.00044926088095071737, + "loss": 0.1908, + "step": 149570 + }, + { + "epoch": 6.2, + "grad_norm": 0.68359375, + "learning_rate": 0.0004492543311129268, + "loss": 0.221, + "step": 149580 + }, + { + "epoch": 6.2, + "grad_norm": 0.9296875, + "learning_rate": 0.00044924778090016105, + "loss": 0.2466, + "step": 149590 + }, + { + "epoch": 6.2, + "grad_norm": 2.03125, + "learning_rate": 0.0004492412303124325, + "loss": 0.23, + "step": 149600 + }, + { + "epoch": 6.2, + "grad_norm": 0.482421875, + "learning_rate": 0.0004492346793497535, + "loss": 0.2115, + "step": 149610 + }, + { + "epoch": 6.2, + "grad_norm": 0.3203125, + "learning_rate": 0.00044922812801213635, + "loss": 0.1849, + "step": 149620 + }, + { + "epoch": 6.2, + "grad_norm": 0.828125, + "learning_rate": 0.00044922157629959336, + "loss": 0.1798, + "step": 149630 + }, + { + "epoch": 6.2, + "grad_norm": 0.443359375, + "learning_rate": 0.0004492150242121369, + "loss": 0.1421, + "step": 149640 + }, + { + "epoch": 6.2, + "grad_norm": 0.0, + "learning_rate": 0.00044920847174977925, + "loss": 0.1771, + "step": 149650 + }, + { + "epoch": 6.2, + "grad_norm": 0.92578125, + "learning_rate": 0.00044920191891253275, + "loss": 0.1822, + "step": 149660 + }, + { + "epoch": 6.2, + "grad_norm": 0.462890625, + "learning_rate": 0.00044919536570040973, + "loss": 0.1631, + "step": 149670 + }, + { + "epoch": 6.2, + "grad_norm": 0.3828125, + "learning_rate": 0.0004491888121134227, + "loss": 0.1848, + "step": 149680 + }, + { + "epoch": 6.2, + "grad_norm": 0.765625, + "learning_rate": 0.0004491822581515837, + "loss": 0.1954, + "step": 149690 + }, + { + "epoch": 6.2, + "grad_norm": 1.2421875, + "learning_rate": 0.00044917570381490524, + "loss": 0.2259, + "step": 149700 + }, + { + "epoch": 6.2, + "grad_norm": 0.462890625, + "learning_rate": 0.00044916914910339957, + "loss": 0.2031, + "step": 149710 + }, + { + "epoch": 6.2, + "grad_norm": 0.474609375, + "learning_rate": 0.0004491625940170791, + "loss": 0.1561, + "step": 149720 + }, + { + "epoch": 6.2, + "grad_norm": 1.1171875, + "learning_rate": 0.00044915603855595615, + "loss": 0.1851, + "step": 149730 + }, + { + "epoch": 6.2, + "grad_norm": 0.890625, + "learning_rate": 0.000449149482720043, + "loss": 0.1721, + "step": 149740 + }, + { + "epoch": 6.2, + "grad_norm": 0.46484375, + "learning_rate": 0.00044914292650935207, + "loss": 0.1529, + "step": 149750 + }, + { + "epoch": 6.2, + "grad_norm": 1.578125, + "learning_rate": 0.0004491363699238956, + "loss": 0.1713, + "step": 149760 + }, + { + "epoch": 6.2, + "grad_norm": 2.484375, + "learning_rate": 0.0004491298129636861, + "loss": 0.1826, + "step": 149770 + }, + { + "epoch": 6.2, + "grad_norm": 1.4765625, + "learning_rate": 0.00044912325562873574, + "loss": 0.1962, + "step": 149780 + }, + { + "epoch": 6.2, + "grad_norm": 0.98046875, + "learning_rate": 0.0004491166979190569, + "loss": 0.1835, + "step": 149790 + }, + { + "epoch": 6.2, + "grad_norm": 0.94140625, + "learning_rate": 0.00044911013983466193, + "loss": 0.2303, + "step": 149800 + }, + { + "epoch": 6.21, + "grad_norm": 0.8359375, + "learning_rate": 0.0004491035813755632, + "loss": 0.2617, + "step": 149810 + }, + { + "epoch": 6.21, + "grad_norm": 0.546875, + "learning_rate": 0.00044909702254177305, + "loss": 0.184, + "step": 149820 + }, + { + "epoch": 6.21, + "grad_norm": 0.89453125, + "learning_rate": 0.0004490904633333037, + "loss": 0.241, + "step": 149830 + }, + { + "epoch": 6.21, + "grad_norm": 0.443359375, + "learning_rate": 0.0004490839037501677, + "loss": 0.2085, + "step": 149840 + }, + { + "epoch": 6.21, + "grad_norm": 0.6171875, + "learning_rate": 0.0004490773437923773, + "loss": 0.197, + "step": 149850 + }, + { + "epoch": 6.21, + "grad_norm": 0.9609375, + "learning_rate": 0.0004490707834599448, + "loss": 0.1759, + "step": 149860 + }, + { + "epoch": 6.21, + "grad_norm": 0.796875, + "learning_rate": 0.0004490642227528826, + "loss": 0.1968, + "step": 149870 + }, + { + "epoch": 6.21, + "grad_norm": 0.8203125, + "learning_rate": 0.000449057661671203, + "loss": 0.1868, + "step": 149880 + }, + { + "epoch": 6.21, + "grad_norm": 0.46875, + "learning_rate": 0.0004490511002149184, + "loss": 0.1668, + "step": 149890 + }, + { + "epoch": 6.21, + "grad_norm": 0.4921875, + "learning_rate": 0.00044904453838404113, + "loss": 0.199, + "step": 149900 + }, + { + "epoch": 6.21, + "grad_norm": 0.36328125, + "learning_rate": 0.00044903797617858355, + "loss": 0.213, + "step": 149910 + }, + { + "epoch": 6.21, + "grad_norm": 0.87109375, + "learning_rate": 0.0004490314135985579, + "loss": 0.2263, + "step": 149920 + }, + { + "epoch": 6.21, + "grad_norm": 1.578125, + "learning_rate": 0.00044902485064397677, + "loss": 0.2432, + "step": 149930 + }, + { + "epoch": 6.21, + "grad_norm": 0.59765625, + "learning_rate": 0.0004490182873148523, + "loss": 0.1481, + "step": 149940 + }, + { + "epoch": 6.21, + "grad_norm": 1.015625, + "learning_rate": 0.00044901172361119687, + "loss": 0.209, + "step": 149950 + }, + { + "epoch": 6.21, + "grad_norm": 0.486328125, + "learning_rate": 0.00044900515953302285, + "loss": 0.1906, + "step": 149960 + }, + { + "epoch": 6.21, + "grad_norm": 1.59375, + "learning_rate": 0.0004489985950803427, + "loss": 0.1701, + "step": 149970 + }, + { + "epoch": 6.21, + "grad_norm": 0.6328125, + "learning_rate": 0.00044899203025316857, + "loss": 0.2231, + "step": 149980 + }, + { + "epoch": 6.21, + "grad_norm": 0.400390625, + "learning_rate": 0.00044898546505151303, + "loss": 0.1802, + "step": 149990 + }, + { + "epoch": 6.21, + "grad_norm": 0.8515625, + "learning_rate": 0.00044897889947538827, + "loss": 0.2123, + "step": 150000 + }, + { + "epoch": 6.21, + "grad_norm": 1.1171875, + "learning_rate": 0.0004489723335248067, + "loss": 0.2073, + "step": 150010 + }, + { + "epoch": 6.21, + "grad_norm": 1.09375, + "learning_rate": 0.00044896576719978076, + "loss": 0.2543, + "step": 150020 + }, + { + "epoch": 6.21, + "grad_norm": 1.453125, + "learning_rate": 0.0004489592005003227, + "loss": 0.2283, + "step": 150030 + }, + { + "epoch": 6.21, + "grad_norm": 1.1640625, + "learning_rate": 0.0004489526334264449, + "loss": 0.1785, + "step": 150040 + }, + { + "epoch": 6.22, + "grad_norm": 0.828125, + "learning_rate": 0.0004489460659781597, + "loss": 0.1677, + "step": 150050 + }, + { + "epoch": 6.22, + "grad_norm": 0.50390625, + "learning_rate": 0.0004489394981554795, + "loss": 0.1914, + "step": 150060 + }, + { + "epoch": 6.22, + "grad_norm": 0.6171875, + "learning_rate": 0.00044893292995841673, + "loss": 0.2257, + "step": 150070 + }, + { + "epoch": 6.22, + "grad_norm": 0.546875, + "learning_rate": 0.00044892636138698364, + "loss": 0.2391, + "step": 150080 + }, + { + "epoch": 6.22, + "grad_norm": 0.95703125, + "learning_rate": 0.0004489197924411925, + "loss": 0.2285, + "step": 150090 + }, + { + "epoch": 6.22, + "grad_norm": 0.46875, + "learning_rate": 0.0004489132231210559, + "loss": 0.2337, + "step": 150100 + }, + { + "epoch": 6.22, + "grad_norm": 0.71875, + "learning_rate": 0.0004489066534265861, + "loss": 0.222, + "step": 150110 + }, + { + "epoch": 6.22, + "grad_norm": 0.7734375, + "learning_rate": 0.0004489000833577954, + "loss": 0.2121, + "step": 150120 + }, + { + "epoch": 6.22, + "grad_norm": 0.41796875, + "learning_rate": 0.00044889351291469624, + "loss": 0.2094, + "step": 150130 + }, + { + "epoch": 6.22, + "grad_norm": 0.98046875, + "learning_rate": 0.000448886942097301, + "loss": 0.1791, + "step": 150140 + }, + { + "epoch": 6.22, + "grad_norm": 1.0390625, + "learning_rate": 0.000448880370905622, + "loss": 0.2398, + "step": 150150 + }, + { + "epoch": 6.22, + "grad_norm": 1.2265625, + "learning_rate": 0.0004488737993396716, + "loss": 0.1814, + "step": 150160 + }, + { + "epoch": 6.22, + "grad_norm": 0.8671875, + "learning_rate": 0.00044886722739946225, + "loss": 0.2434, + "step": 150170 + }, + { + "epoch": 6.22, + "grad_norm": 0.58203125, + "learning_rate": 0.0004488606550850062, + "loss": 0.2157, + "step": 150180 + }, + { + "epoch": 6.22, + "grad_norm": 0.447265625, + "learning_rate": 0.00044885408239631584, + "loss": 0.1681, + "step": 150190 + }, + { + "epoch": 6.22, + "grad_norm": 0.328125, + "learning_rate": 0.0004488475093334036, + "loss": 0.1895, + "step": 150200 + }, + { + "epoch": 6.22, + "grad_norm": 0.8125, + "learning_rate": 0.00044884093589628185, + "loss": 0.2053, + "step": 150210 + }, + { + "epoch": 6.22, + "grad_norm": 0.51171875, + "learning_rate": 0.0004488343620849629, + "loss": 0.1977, + "step": 150220 + }, + { + "epoch": 6.22, + "grad_norm": 1.1484375, + "learning_rate": 0.00044882778789945917, + "loss": 0.2179, + "step": 150230 + }, + { + "epoch": 6.22, + "grad_norm": 0.7734375, + "learning_rate": 0.000448821213339783, + "loss": 0.2109, + "step": 150240 + }, + { + "epoch": 6.22, + "grad_norm": 0.412109375, + "learning_rate": 0.00044881463840594683, + "loss": 0.1969, + "step": 150250 + }, + { + "epoch": 6.22, + "grad_norm": 0.671875, + "learning_rate": 0.0004488080630979629, + "loss": 0.2253, + "step": 150260 + }, + { + "epoch": 6.22, + "grad_norm": 1.0, + "learning_rate": 0.0004488014874158437, + "loss": 0.2053, + "step": 150270 + }, + { + "epoch": 6.22, + "grad_norm": 0.73828125, + "learning_rate": 0.0004487949113596016, + "loss": 0.244, + "step": 150280 + }, + { + "epoch": 6.22, + "grad_norm": 0.404296875, + "learning_rate": 0.00044878833492924887, + "loss": 0.1863, + "step": 150290 + }, + { + "epoch": 6.23, + "grad_norm": 0.267578125, + "learning_rate": 0.00044878175812479806, + "loss": 0.2051, + "step": 150300 + }, + { + "epoch": 6.23, + "grad_norm": 0.58203125, + "learning_rate": 0.00044877518094626134, + "loss": 0.1466, + "step": 150310 + }, + { + "epoch": 6.23, + "grad_norm": 0.62890625, + "learning_rate": 0.00044876860339365123, + "loss": 0.2236, + "step": 150320 + }, + { + "epoch": 6.23, + "grad_norm": 0.416015625, + "learning_rate": 0.0004487620254669801, + "loss": 0.1822, + "step": 150330 + }, + { + "epoch": 6.23, + "grad_norm": 0.61328125, + "learning_rate": 0.00044875544716626025, + "loss": 0.2213, + "step": 150340 + }, + { + "epoch": 6.23, + "grad_norm": 1.078125, + "learning_rate": 0.0004487488684915042, + "loss": 0.1752, + "step": 150350 + }, + { + "epoch": 6.23, + "grad_norm": 0.7265625, + "learning_rate": 0.0004487422894427242, + "loss": 0.1715, + "step": 150360 + }, + { + "epoch": 6.23, + "grad_norm": 0.0, + "learning_rate": 0.0004487357100199327, + "loss": 0.1678, + "step": 150370 + }, + { + "epoch": 6.23, + "grad_norm": 2.0, + "learning_rate": 0.000448729130223142, + "loss": 0.2207, + "step": 150380 + }, + { + "epoch": 6.23, + "grad_norm": 1.21875, + "learning_rate": 0.0004487225500523646, + "loss": 0.1894, + "step": 150390 + }, + { + "epoch": 6.23, + "grad_norm": 0.8125, + "learning_rate": 0.00044871596950761273, + "loss": 0.2157, + "step": 150400 + }, + { + "epoch": 6.23, + "grad_norm": 0.578125, + "learning_rate": 0.000448709388588899, + "loss": 0.2017, + "step": 150410 + }, + { + "epoch": 6.23, + "grad_norm": 0.6875, + "learning_rate": 0.00044870280729623555, + "loss": 0.1972, + "step": 150420 + }, + { + "epoch": 6.23, + "grad_norm": 1.015625, + "learning_rate": 0.0004486962256296349, + "loss": 0.2085, + "step": 150430 + }, + { + "epoch": 6.23, + "grad_norm": 0.6171875, + "learning_rate": 0.0004486896435891094, + "loss": 0.1998, + "step": 150440 + }, + { + "epoch": 6.23, + "grad_norm": 0.359375, + "learning_rate": 0.0004486830611746715, + "loss": 0.27, + "step": 150450 + }, + { + "epoch": 6.23, + "grad_norm": 0.51171875, + "learning_rate": 0.0004486764783863335, + "loss": 0.2076, + "step": 150460 + }, + { + "epoch": 6.23, + "grad_norm": 1.6484375, + "learning_rate": 0.0004486698952241078, + "loss": 0.203, + "step": 150470 + }, + { + "epoch": 6.23, + "grad_norm": 0.71875, + "learning_rate": 0.00044866331168800686, + "loss": 0.2045, + "step": 150480 + }, + { + "epoch": 6.23, + "grad_norm": 1.5859375, + "learning_rate": 0.000448656727778043, + "loss": 0.253, + "step": 150490 + }, + { + "epoch": 6.23, + "grad_norm": 0.6484375, + "learning_rate": 0.00044865014349422863, + "loss": 0.2289, + "step": 150500 + }, + { + "epoch": 6.23, + "grad_norm": 0.5546875, + "learning_rate": 0.0004486435588365761, + "loss": 0.2335, + "step": 150510 + }, + { + "epoch": 6.23, + "grad_norm": 0.72265625, + "learning_rate": 0.0004486369738050979, + "loss": 0.2047, + "step": 150520 + }, + { + "epoch": 6.23, + "grad_norm": 1.3515625, + "learning_rate": 0.0004486303883998063, + "loss": 0.1818, + "step": 150530 + }, + { + "epoch": 6.24, + "grad_norm": 0.34765625, + "learning_rate": 0.00044862380262071386, + "loss": 0.235, + "step": 150540 + }, + { + "epoch": 6.24, + "grad_norm": 0.70703125, + "learning_rate": 0.00044861721646783276, + "loss": 0.2171, + "step": 150550 + }, + { + "epoch": 6.24, + "grad_norm": 0.796875, + "learning_rate": 0.0004486106299411756, + "loss": 0.221, + "step": 150560 + }, + { + "epoch": 6.24, + "grad_norm": 0.671875, + "learning_rate": 0.0004486040430407546, + "loss": 0.2203, + "step": 150570 + }, + { + "epoch": 6.24, + "grad_norm": 0.72265625, + "learning_rate": 0.00044859745576658237, + "loss": 0.2091, + "step": 150580 + }, + { + "epoch": 6.24, + "grad_norm": 0.4375, + "learning_rate": 0.00044859086811867104, + "loss": 0.1903, + "step": 150590 + }, + { + "epoch": 6.24, + "grad_norm": 0.671875, + "learning_rate": 0.0004485842800970332, + "loss": 0.1935, + "step": 150600 + }, + { + "epoch": 6.24, + "grad_norm": 1.1875, + "learning_rate": 0.00044857769170168115, + "loss": 0.2245, + "step": 150610 + }, + { + "epoch": 6.24, + "grad_norm": 0.953125, + "learning_rate": 0.0004485711029326274, + "loss": 0.2059, + "step": 150620 + }, + { + "epoch": 6.24, + "grad_norm": 0.490234375, + "learning_rate": 0.00044856451378988426, + "loss": 0.1944, + "step": 150630 + }, + { + "epoch": 6.24, + "grad_norm": 1.0234375, + "learning_rate": 0.0004485579242734641, + "loss": 0.2052, + "step": 150640 + }, + { + "epoch": 6.24, + "grad_norm": 0.625, + "learning_rate": 0.00044855133438337943, + "loss": 0.151, + "step": 150650 + }, + { + "epoch": 6.24, + "grad_norm": 3.0, + "learning_rate": 0.0004485447441196425, + "loss": 0.2046, + "step": 150660 + }, + { + "epoch": 6.24, + "grad_norm": 1.0234375, + "learning_rate": 0.0004485381534822659, + "loss": 0.195, + "step": 150670 + }, + { + "epoch": 6.24, + "grad_norm": 0.55859375, + "learning_rate": 0.0004485315624712619, + "loss": 0.1786, + "step": 150680 + }, + { + "epoch": 6.24, + "grad_norm": 0.1513671875, + "learning_rate": 0.0004485249710866429, + "loss": 0.256, + "step": 150690 + }, + { + "epoch": 6.24, + "grad_norm": 0.494140625, + "learning_rate": 0.00044851837932842145, + "loss": 0.2347, + "step": 150700 + }, + { + "epoch": 6.24, + "grad_norm": 1.359375, + "learning_rate": 0.0004485117871966098, + "loss": 0.2208, + "step": 150710 + }, + { + "epoch": 6.24, + "grad_norm": 0.5, + "learning_rate": 0.0004485051946912203, + "loss": 0.1675, + "step": 150720 + }, + { + "epoch": 6.24, + "grad_norm": 0.828125, + "learning_rate": 0.0004484986018122656, + "loss": 0.203, + "step": 150730 + }, + { + "epoch": 6.24, + "grad_norm": 0.6953125, + "learning_rate": 0.0004484920085597579, + "loss": 0.1956, + "step": 150740 + }, + { + "epoch": 6.24, + "grad_norm": 0.0016632080078125, + "learning_rate": 0.00044848541493370966, + "loss": 0.166, + "step": 150750 + }, + { + "epoch": 6.24, + "grad_norm": 0.0, + "learning_rate": 0.0004484788209341333, + "loss": 0.178, + "step": 150760 + }, + { + "epoch": 6.24, + "grad_norm": 0.71484375, + "learning_rate": 0.0004484722265610413, + "loss": 0.2365, + "step": 150770 + }, + { + "epoch": 6.25, + "grad_norm": 0.408203125, + "learning_rate": 0.00044846563181444597, + "loss": 0.2221, + "step": 150780 + }, + { + "epoch": 6.25, + "grad_norm": 0.7421875, + "learning_rate": 0.00044845903669435975, + "loss": 0.2248, + "step": 150790 + }, + { + "epoch": 6.25, + "grad_norm": 0.71875, + "learning_rate": 0.00044845244120079504, + "loss": 0.2142, + "step": 150800 + }, + { + "epoch": 6.25, + "grad_norm": 0.33203125, + "learning_rate": 0.0004484458453337643, + "loss": 0.1947, + "step": 150810 + }, + { + "epoch": 6.25, + "grad_norm": 1.3984375, + "learning_rate": 0.0004484392490932799, + "loss": 0.1771, + "step": 150820 + }, + { + "epoch": 6.25, + "grad_norm": 0.5234375, + "learning_rate": 0.0004484326524793542, + "loss": 0.2175, + "step": 150830 + }, + { + "epoch": 6.25, + "grad_norm": 0.66796875, + "learning_rate": 0.00044842605549199975, + "loss": 0.2316, + "step": 150840 + }, + { + "epoch": 6.25, + "grad_norm": 0.87109375, + "learning_rate": 0.00044841945813122884, + "loss": 0.2206, + "step": 150850 + }, + { + "epoch": 6.25, + "grad_norm": 0.73828125, + "learning_rate": 0.000448412860397054, + "loss": 0.2015, + "step": 150860 + }, + { + "epoch": 6.25, + "grad_norm": 0.66796875, + "learning_rate": 0.00044840626228948754, + "loss": 0.2206, + "step": 150870 + }, + { + "epoch": 6.25, + "grad_norm": 0.3671875, + "learning_rate": 0.0004483996638085419, + "loss": 0.1905, + "step": 150880 + }, + { + "epoch": 6.25, + "grad_norm": 0.462890625, + "learning_rate": 0.0004483930649542296, + "loss": 0.2125, + "step": 150890 + }, + { + "epoch": 6.25, + "grad_norm": 0.75390625, + "learning_rate": 0.00044838646572656295, + "loss": 0.2412, + "step": 150900 + }, + { + "epoch": 6.25, + "grad_norm": 0.64453125, + "learning_rate": 0.00044837986612555436, + "loss": 0.2246, + "step": 150910 + }, + { + "epoch": 6.25, + "grad_norm": 1.4375, + "learning_rate": 0.00044837326615121626, + "loss": 0.2109, + "step": 150920 + }, + { + "epoch": 6.25, + "grad_norm": 0.4765625, + "learning_rate": 0.0004483666658035611, + "loss": 0.1893, + "step": 150930 + }, + { + "epoch": 6.25, + "grad_norm": 0.8046875, + "learning_rate": 0.0004483600650826014, + "loss": 0.2106, + "step": 150940 + }, + { + "epoch": 6.25, + "grad_norm": 0.462890625, + "learning_rate": 0.00044835346398834945, + "loss": 0.2346, + "step": 150950 + }, + { + "epoch": 6.25, + "grad_norm": 0.59375, + "learning_rate": 0.0004483468625208176, + "loss": 0.1935, + "step": 150960 + }, + { + "epoch": 6.25, + "grad_norm": 1.359375, + "learning_rate": 0.0004483402606800184, + "loss": 0.2191, + "step": 150970 + }, + { + "epoch": 6.25, + "grad_norm": 0.44140625, + "learning_rate": 0.0004483336584659643, + "loss": 0.1991, + "step": 150980 + }, + { + "epoch": 6.25, + "grad_norm": 0.439453125, + "learning_rate": 0.0004483270558786676, + "loss": 0.1852, + "step": 150990 + }, + { + "epoch": 6.25, + "grad_norm": 0.4765625, + "learning_rate": 0.0004483204529181409, + "loss": 0.2044, + "step": 151000 + }, + { + "epoch": 6.25, + "grad_norm": 0.984375, + "learning_rate": 0.0004483138495843965, + "loss": 0.1672, + "step": 151010 + }, + { + "epoch": 6.26, + "grad_norm": 0.6640625, + "learning_rate": 0.0004483072458774468, + "loss": 0.2276, + "step": 151020 + }, + { + "epoch": 6.26, + "grad_norm": 0.53515625, + "learning_rate": 0.0004483006417973043, + "loss": 0.1782, + "step": 151030 + }, + { + "epoch": 6.26, + "grad_norm": 0.5703125, + "learning_rate": 0.00044829403734398144, + "loss": 0.2444, + "step": 151040 + }, + { + "epoch": 6.26, + "grad_norm": 1.46875, + "learning_rate": 0.0004482874325174906, + "loss": 0.2087, + "step": 151050 + }, + { + "epoch": 6.26, + "grad_norm": 0.3671875, + "learning_rate": 0.0004482808273178442, + "loss": 0.2433, + "step": 151060 + }, + { + "epoch": 6.26, + "grad_norm": 0.5078125, + "learning_rate": 0.00044827422174505474, + "loss": 0.1968, + "step": 151070 + }, + { + "epoch": 6.26, + "grad_norm": 0.90625, + "learning_rate": 0.0004482676157991346, + "loss": 0.2177, + "step": 151080 + }, + { + "epoch": 6.26, + "grad_norm": 0.625, + "learning_rate": 0.0004482610094800961, + "loss": 0.2373, + "step": 151090 + }, + { + "epoch": 6.26, + "grad_norm": 0.451171875, + "learning_rate": 0.00044825440278795195, + "loss": 0.2194, + "step": 151100 + }, + { + "epoch": 6.26, + "grad_norm": 0.76171875, + "learning_rate": 0.00044824779572271437, + "loss": 0.2069, + "step": 151110 + }, + { + "epoch": 6.26, + "grad_norm": 2.328125, + "learning_rate": 0.00044824118828439577, + "loss": 0.2173, + "step": 151120 + }, + { + "epoch": 6.26, + "grad_norm": 0.76171875, + "learning_rate": 0.00044823458047300874, + "loss": 0.1965, + "step": 151130 + }, + { + "epoch": 6.26, + "grad_norm": 0.52734375, + "learning_rate": 0.0004482279722885656, + "loss": 0.202, + "step": 151140 + }, + { + "epoch": 6.26, + "grad_norm": 0.5546875, + "learning_rate": 0.00044822136373107884, + "loss": 0.2482, + "step": 151150 + }, + { + "epoch": 6.26, + "grad_norm": 0.6796875, + "learning_rate": 0.000448214754800561, + "loss": 0.1803, + "step": 151160 + }, + { + "epoch": 6.26, + "grad_norm": 0.734375, + "learning_rate": 0.0004482081454970242, + "loss": 0.2256, + "step": 151170 + }, + { + "epoch": 6.26, + "grad_norm": 1.2890625, + "learning_rate": 0.00044820153582048115, + "loss": 0.2481, + "step": 151180 + }, + { + "epoch": 6.26, + "grad_norm": 0.84375, + "learning_rate": 0.00044819492577094424, + "loss": 0.2086, + "step": 151190 + }, + { + "epoch": 6.26, + "grad_norm": 0.4140625, + "learning_rate": 0.00044818831534842584, + "loss": 0.174, + "step": 151200 + }, + { + "epoch": 6.26, + "grad_norm": 0.357421875, + "learning_rate": 0.0004481817045529385, + "loss": 0.2132, + "step": 151210 + }, + { + "epoch": 6.26, + "grad_norm": 0.92578125, + "learning_rate": 0.0004481750933844945, + "loss": 0.2157, + "step": 151220 + }, + { + "epoch": 6.26, + "grad_norm": 0.58203125, + "learning_rate": 0.0004481684818431064, + "loss": 0.2395, + "step": 151230 + }, + { + "epoch": 6.26, + "grad_norm": 3.140625, + "learning_rate": 0.0004481618699287867, + "loss": 0.2322, + "step": 151240 + }, + { + "epoch": 6.26, + "grad_norm": 0.6796875, + "learning_rate": 0.00044815525764154774, + "loss": 0.2035, + "step": 151250 + }, + { + "epoch": 6.27, + "grad_norm": 0.85546875, + "learning_rate": 0.0004481486449814019, + "loss": 0.2066, + "step": 151260 + }, + { + "epoch": 6.27, + "grad_norm": 1.03125, + "learning_rate": 0.00044814203194836177, + "loss": 0.2372, + "step": 151270 + }, + { + "epoch": 6.27, + "grad_norm": 0.52734375, + "learning_rate": 0.0004481354185424397, + "loss": 0.2028, + "step": 151280 + }, + { + "epoch": 6.27, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004481288047636482, + "loss": 0.1921, + "step": 151290 + }, + { + "epoch": 6.27, + "grad_norm": 0.6171875, + "learning_rate": 0.00044812219061199967, + "loss": 0.1819, + "step": 151300 + }, + { + "epoch": 6.27, + "grad_norm": 0.82421875, + "learning_rate": 0.0004481155760875066, + "loss": 0.2251, + "step": 151310 + }, + { + "epoch": 6.27, + "grad_norm": 0.64453125, + "learning_rate": 0.0004481089611901814, + "loss": 0.2528, + "step": 151320 + }, + { + "epoch": 6.27, + "grad_norm": 0.7265625, + "learning_rate": 0.0004481023459200365, + "loss": 0.2287, + "step": 151330 + }, + { + "epoch": 6.27, + "grad_norm": 1.078125, + "learning_rate": 0.0004480957302770844, + "loss": 0.2066, + "step": 151340 + }, + { + "epoch": 6.27, + "grad_norm": 0.98828125, + "learning_rate": 0.00044808911426133746, + "loss": 0.1932, + "step": 151350 + }, + { + "epoch": 6.27, + "grad_norm": 0.7109375, + "learning_rate": 0.0004480824978728083, + "loss": 0.1821, + "step": 151360 + }, + { + "epoch": 6.27, + "grad_norm": 0.443359375, + "learning_rate": 0.0004480758811115092, + "loss": 0.1965, + "step": 151370 + }, + { + "epoch": 6.27, + "grad_norm": 1.0625, + "learning_rate": 0.00044806926397745274, + "loss": 0.222, + "step": 151380 + }, + { + "epoch": 6.27, + "grad_norm": 0.62109375, + "learning_rate": 0.0004480626464706513, + "loss": 0.2071, + "step": 151390 + }, + { + "epoch": 6.27, + "grad_norm": 0.66796875, + "learning_rate": 0.00044805602859111733, + "loss": 0.2221, + "step": 151400 + }, + { + "epoch": 6.27, + "grad_norm": 0.9140625, + "learning_rate": 0.0004480494103388633, + "loss": 0.2125, + "step": 151410 + }, + { + "epoch": 6.27, + "grad_norm": 0.46484375, + "learning_rate": 0.00044804279171390174, + "loss": 0.1873, + "step": 151420 + }, + { + "epoch": 6.27, + "grad_norm": 0.515625, + "learning_rate": 0.0004480361727162449, + "loss": 0.2082, + "step": 151430 + }, + { + "epoch": 6.27, + "grad_norm": 0.453125, + "learning_rate": 0.00044802955334590546, + "loss": 0.1665, + "step": 151440 + }, + { + "epoch": 6.27, + "grad_norm": 1.5859375, + "learning_rate": 0.00044802293360289574, + "loss": 0.2425, + "step": 151450 + }, + { + "epoch": 6.27, + "grad_norm": 0.494140625, + "learning_rate": 0.0004480163134872283, + "loss": 0.1804, + "step": 151460 + }, + { + "epoch": 6.27, + "grad_norm": 1.375, + "learning_rate": 0.0004480096929989155, + "loss": 0.1988, + "step": 151470 + }, + { + "epoch": 6.27, + "grad_norm": 0.0, + "learning_rate": 0.00044800307213796985, + "loss": 0.2481, + "step": 151480 + }, + { + "epoch": 6.27, + "grad_norm": 0.0, + "learning_rate": 0.00044799645090440375, + "loss": 0.2219, + "step": 151490 + }, + { + "epoch": 6.28, + "grad_norm": 0.76171875, + "learning_rate": 0.00044798982929822984, + "loss": 0.1684, + "step": 151500 + }, + { + "epoch": 6.28, + "grad_norm": 0.62109375, + "learning_rate": 0.00044798320731946037, + "loss": 0.2068, + "step": 151510 + }, + { + "epoch": 6.28, + "grad_norm": 0.0, + "learning_rate": 0.0004479765849681079, + "loss": 0.2306, + "step": 151520 + }, + { + "epoch": 6.28, + "grad_norm": 0.69921875, + "learning_rate": 0.0004479699622441848, + "loss": 0.2529, + "step": 151530 + }, + { + "epoch": 6.28, + "grad_norm": 0.57421875, + "learning_rate": 0.0004479633391477037, + "loss": 0.2169, + "step": 151540 + }, + { + "epoch": 6.28, + "grad_norm": 0.890625, + "learning_rate": 0.00044795671567867697, + "loss": 0.2042, + "step": 151550 + }, + { + "epoch": 6.28, + "grad_norm": 0.55859375, + "learning_rate": 0.00044795009183711703, + "loss": 0.2359, + "step": 151560 + }, + { + "epoch": 6.28, + "grad_norm": 0.73828125, + "learning_rate": 0.00044794346762303645, + "loss": 0.1882, + "step": 151570 + }, + { + "epoch": 6.28, + "grad_norm": 0.51953125, + "learning_rate": 0.00044793684303644763, + "loss": 0.2217, + "step": 151580 + }, + { + "epoch": 6.28, + "grad_norm": 1.328125, + "learning_rate": 0.000447930218077363, + "loss": 0.1918, + "step": 151590 + }, + { + "epoch": 6.28, + "grad_norm": 1.09375, + "learning_rate": 0.0004479235927457951, + "loss": 0.1478, + "step": 151600 + }, + { + "epoch": 6.28, + "grad_norm": 0.9375, + "learning_rate": 0.0004479169670417564, + "loss": 0.1782, + "step": 151610 + }, + { + "epoch": 6.28, + "grad_norm": 0.515625, + "learning_rate": 0.00044791034096525927, + "loss": 0.2065, + "step": 151620 + }, + { + "epoch": 6.28, + "grad_norm": 1.296875, + "learning_rate": 0.00044790371451631627, + "loss": 0.2354, + "step": 151630 + }, + { + "epoch": 6.28, + "grad_norm": 0.50390625, + "learning_rate": 0.0004478970876949399, + "loss": 0.2055, + "step": 151640 + }, + { + "epoch": 6.28, + "grad_norm": 1.0, + "learning_rate": 0.0004478904605011426, + "loss": 0.1576, + "step": 151650 + }, + { + "epoch": 6.28, + "grad_norm": 0.59765625, + "learning_rate": 0.00044788383293493673, + "loss": 0.1925, + "step": 151660 + }, + { + "epoch": 6.28, + "grad_norm": 1.0703125, + "learning_rate": 0.00044787720499633497, + "loss": 0.2204, + "step": 151670 + }, + { + "epoch": 6.28, + "grad_norm": 1.203125, + "learning_rate": 0.0004478705766853496, + "loss": 0.1956, + "step": 151680 + }, + { + "epoch": 6.28, + "grad_norm": 1.203125, + "learning_rate": 0.0004478639480019932, + "loss": 0.2101, + "step": 151690 + }, + { + "epoch": 6.28, + "grad_norm": 1.1015625, + "learning_rate": 0.00044785731894627826, + "loss": 0.2091, + "step": 151700 + }, + { + "epoch": 6.28, + "grad_norm": 0.5703125, + "learning_rate": 0.0004478506895182171, + "loss": 0.1688, + "step": 151710 + }, + { + "epoch": 6.28, + "grad_norm": 0.796875, + "learning_rate": 0.0004478440597178224, + "loss": 0.1761, + "step": 151720 + }, + { + "epoch": 6.28, + "grad_norm": 0.765625, + "learning_rate": 0.0004478374295451065, + "loss": 0.2197, + "step": 151730 + }, + { + "epoch": 6.29, + "grad_norm": 1.15625, + "learning_rate": 0.00044783079900008194, + "loss": 0.2229, + "step": 151740 + }, + { + "epoch": 6.29, + "grad_norm": 0.61328125, + "learning_rate": 0.0004478241680827612, + "loss": 0.1909, + "step": 151750 + }, + { + "epoch": 6.29, + "grad_norm": 0.875, + "learning_rate": 0.0004478175367931567, + "loss": 0.192, + "step": 151760 + }, + { + "epoch": 6.29, + "grad_norm": 0.408203125, + "learning_rate": 0.000447810905131281, + "loss": 0.1789, + "step": 151770 + }, + { + "epoch": 6.29, + "grad_norm": 0.494140625, + "learning_rate": 0.00044780427309714646, + "loss": 0.1794, + "step": 151780 + }, + { + "epoch": 6.29, + "grad_norm": 0.0, + "learning_rate": 0.0004477976406907657, + "loss": 0.2009, + "step": 151790 + }, + { + "epoch": 6.29, + "grad_norm": 0.546875, + "learning_rate": 0.0004477910079121511, + "loss": 0.1768, + "step": 151800 + }, + { + "epoch": 6.29, + "grad_norm": 0.8046875, + "learning_rate": 0.0004477843747613153, + "loss": 0.1982, + "step": 151810 + }, + { + "epoch": 6.29, + "grad_norm": 0.5703125, + "learning_rate": 0.00044777774123827053, + "loss": 0.2249, + "step": 151820 + }, + { + "epoch": 6.29, + "grad_norm": 2.03125, + "learning_rate": 0.00044777110734302945, + "loss": 0.2337, + "step": 151830 + }, + { + "epoch": 6.29, + "grad_norm": 0.59765625, + "learning_rate": 0.00044776447307560453, + "loss": 0.1868, + "step": 151840 + }, + { + "epoch": 6.29, + "grad_norm": 1.2421875, + "learning_rate": 0.0004477578384360082, + "loss": 0.2038, + "step": 151850 + }, + { + "epoch": 6.29, + "grad_norm": 0.6484375, + "learning_rate": 0.000447751203424253, + "loss": 0.1949, + "step": 151860 + }, + { + "epoch": 6.29, + "grad_norm": 0.5390625, + "learning_rate": 0.00044774456804035133, + "loss": 0.1894, + "step": 151870 + }, + { + "epoch": 6.29, + "grad_norm": 1.234375, + "learning_rate": 0.0004477379322843158, + "loss": 0.1823, + "step": 151880 + }, + { + "epoch": 6.29, + "grad_norm": 0.59375, + "learning_rate": 0.00044773129615615883, + "loss": 0.2172, + "step": 151890 + }, + { + "epoch": 6.29, + "grad_norm": 0.57421875, + "learning_rate": 0.00044772465965589294, + "loss": 0.2071, + "step": 151900 + }, + { + "epoch": 6.29, + "grad_norm": 0.5234375, + "learning_rate": 0.00044771802278353054, + "loss": 0.186, + "step": 151910 + }, + { + "epoch": 6.29, + "grad_norm": 1.0546875, + "learning_rate": 0.0004477113855390842, + "loss": 0.2282, + "step": 151920 + }, + { + "epoch": 6.29, + "grad_norm": 0.4140625, + "learning_rate": 0.0004477047479225663, + "loss": 0.1975, + "step": 151930 + }, + { + "epoch": 6.29, + "grad_norm": 1.8515625, + "learning_rate": 0.0004476981099339895, + "loss": 0.1843, + "step": 151940 + }, + { + "epoch": 6.29, + "grad_norm": 0.435546875, + "learning_rate": 0.0004476914715733662, + "loss": 0.2241, + "step": 151950 + }, + { + "epoch": 6.29, + "grad_norm": 1.125, + "learning_rate": 0.00044768483284070895, + "loss": 0.1495, + "step": 151960 + }, + { + "epoch": 6.29, + "grad_norm": 0.2353515625, + "learning_rate": 0.0004476781937360301, + "loss": 0.2125, + "step": 151970 + }, + { + "epoch": 6.29, + "grad_norm": 0.828125, + "learning_rate": 0.00044767155425934227, + "loss": 0.186, + "step": 151980 + }, + { + "epoch": 6.3, + "grad_norm": 0.69921875, + "learning_rate": 0.00044766491441065794, + "loss": 0.1468, + "step": 151990 + }, + { + "epoch": 6.3, + "grad_norm": 0.66015625, + "learning_rate": 0.00044765827418998954, + "loss": 0.2213, + "step": 152000 + }, + { + "epoch": 6.3, + "grad_norm": 0.455078125, + "learning_rate": 0.0004476516335973497, + "loss": 0.2045, + "step": 152010 + }, + { + "epoch": 6.3, + "grad_norm": 0.4765625, + "learning_rate": 0.0004476449926327507, + "loss": 0.1973, + "step": 152020 + }, + { + "epoch": 6.3, + "grad_norm": 2.0, + "learning_rate": 0.0004476383512962052, + "loss": 0.1972, + "step": 152030 + }, + { + "epoch": 6.3, + "grad_norm": 0.65625, + "learning_rate": 0.0004476317095877258, + "loss": 0.2515, + "step": 152040 + }, + { + "epoch": 6.3, + "grad_norm": 0.6171875, + "learning_rate": 0.0004476250675073248, + "loss": 0.2004, + "step": 152050 + }, + { + "epoch": 6.3, + "grad_norm": 0.7578125, + "learning_rate": 0.00044761842505501473, + "loss": 0.224, + "step": 152060 + }, + { + "epoch": 6.3, + "grad_norm": 1.3046875, + "learning_rate": 0.0004476117822308081, + "loss": 0.213, + "step": 152070 + }, + { + "epoch": 6.3, + "grad_norm": 0.7109375, + "learning_rate": 0.00044760513903471745, + "loss": 0.2297, + "step": 152080 + }, + { + "epoch": 6.3, + "grad_norm": 2.265625, + "learning_rate": 0.0004475984954667553, + "loss": 0.2457, + "step": 152090 + }, + { + "epoch": 6.3, + "grad_norm": 0.5859375, + "learning_rate": 0.0004475918515269341, + "loss": 0.1795, + "step": 152100 + }, + { + "epoch": 6.3, + "grad_norm": 0.4921875, + "learning_rate": 0.0004475852072152664, + "loss": 0.1838, + "step": 152110 + }, + { + "epoch": 6.3, + "grad_norm": 0.462890625, + "learning_rate": 0.0004475785625317647, + "loss": 0.2003, + "step": 152120 + }, + { + "epoch": 6.3, + "grad_norm": 0.435546875, + "learning_rate": 0.0004475719174764414, + "loss": 0.2045, + "step": 152130 + }, + { + "epoch": 6.3, + "grad_norm": 0.64453125, + "learning_rate": 0.00044756527204930905, + "loss": 0.2094, + "step": 152140 + }, + { + "epoch": 6.3, + "grad_norm": 0.640625, + "learning_rate": 0.00044755862625038036, + "loss": 0.2216, + "step": 152150 + }, + { + "epoch": 6.3, + "grad_norm": 1.203125, + "learning_rate": 0.00044755198007966757, + "loss": 0.1368, + "step": 152160 + }, + { + "epoch": 6.3, + "grad_norm": 1.0703125, + "learning_rate": 0.0004475453335371833, + "loss": 0.2213, + "step": 152170 + }, + { + "epoch": 6.3, + "grad_norm": 0.70703125, + "learning_rate": 0.00044753868662294, + "loss": 0.209, + "step": 152180 + }, + { + "epoch": 6.3, + "grad_norm": 1.296875, + "learning_rate": 0.0004475320393369503, + "loss": 0.1751, + "step": 152190 + }, + { + "epoch": 6.3, + "grad_norm": 0.58984375, + "learning_rate": 0.0004475253916792266, + "loss": 0.1946, + "step": 152200 + }, + { + "epoch": 6.3, + "grad_norm": 0.458984375, + "learning_rate": 0.0004475187436497814, + "loss": 0.226, + "step": 152210 + }, + { + "epoch": 6.3, + "grad_norm": 0.7265625, + "learning_rate": 0.00044751209524862735, + "loss": 0.1933, + "step": 152220 + }, + { + "epoch": 6.31, + "grad_norm": 0.3515625, + "learning_rate": 0.0004475054464757767, + "loss": 0.1801, + "step": 152230 + }, + { + "epoch": 6.31, + "grad_norm": 0.73046875, + "learning_rate": 0.0004474987973312423, + "loss": 0.1642, + "step": 152240 + }, + { + "epoch": 6.31, + "grad_norm": 0.88671875, + "learning_rate": 0.00044749214781503643, + "loss": 0.2292, + "step": 152250 + }, + { + "epoch": 6.31, + "grad_norm": 0.75, + "learning_rate": 0.0004474854979271717, + "loss": 0.1755, + "step": 152260 + }, + { + "epoch": 6.31, + "grad_norm": 0.330078125, + "learning_rate": 0.0004474788476676606, + "loss": 0.1557, + "step": 152270 + }, + { + "epoch": 6.31, + "grad_norm": 0.671875, + "learning_rate": 0.00044747219703651555, + "loss": 0.1783, + "step": 152280 + }, + { + "epoch": 6.31, + "grad_norm": 0.80859375, + "learning_rate": 0.0004474655460337492, + "loss": 0.1665, + "step": 152290 + }, + { + "epoch": 6.31, + "grad_norm": 0.33203125, + "learning_rate": 0.000447458894659374, + "loss": 0.1837, + "step": 152300 + }, + { + "epoch": 6.31, + "grad_norm": 0.5703125, + "learning_rate": 0.0004474522429134025, + "loss": 0.1971, + "step": 152310 + }, + { + "epoch": 6.31, + "grad_norm": 0.87890625, + "learning_rate": 0.00044744559079584723, + "loss": 0.1608, + "step": 152320 + }, + { + "epoch": 6.31, + "grad_norm": 0.376953125, + "learning_rate": 0.0004474389383067206, + "loss": 0.1447, + "step": 152330 + }, + { + "epoch": 6.31, + "grad_norm": 0.515625, + "learning_rate": 0.00044743228544603524, + "loss": 0.223, + "step": 152340 + }, + { + "epoch": 6.31, + "grad_norm": 0.40234375, + "learning_rate": 0.00044742563221380365, + "loss": 0.155, + "step": 152350 + }, + { + "epoch": 6.31, + "grad_norm": 0.69140625, + "learning_rate": 0.0004474189786100384, + "loss": 0.1923, + "step": 152360 + }, + { + "epoch": 6.31, + "grad_norm": 0.53515625, + "learning_rate": 0.0004474123246347519, + "loss": 0.2206, + "step": 152370 + }, + { + "epoch": 6.31, + "grad_norm": 0.828125, + "learning_rate": 0.00044740567028795665, + "loss": 0.1785, + "step": 152380 + }, + { + "epoch": 6.31, + "grad_norm": 0.6953125, + "learning_rate": 0.0004473990155696653, + "loss": 0.2406, + "step": 152390 + }, + { + "epoch": 6.31, + "grad_norm": 0.4921875, + "learning_rate": 0.0004473923604798904, + "loss": 0.2173, + "step": 152400 + }, + { + "epoch": 6.31, + "grad_norm": 0.5703125, + "learning_rate": 0.00044738570501864434, + "loss": 0.2276, + "step": 152410 + }, + { + "epoch": 6.31, + "grad_norm": 0.7109375, + "learning_rate": 0.00044737904918593964, + "loss": 0.2164, + "step": 152420 + }, + { + "epoch": 6.31, + "grad_norm": 0.5078125, + "learning_rate": 0.000447372392981789, + "loss": 0.2466, + "step": 152430 + }, + { + "epoch": 6.31, + "grad_norm": 0.78515625, + "learning_rate": 0.00044736573640620476, + "loss": 0.2208, + "step": 152440 + }, + { + "epoch": 6.31, + "grad_norm": 0.7578125, + "learning_rate": 0.0004473590794591995, + "loss": 0.1849, + "step": 152450 + }, + { + "epoch": 6.31, + "grad_norm": 0.453125, + "learning_rate": 0.00044735242214078574, + "loss": 0.2017, + "step": 152460 + }, + { + "epoch": 6.32, + "grad_norm": 1.234375, + "learning_rate": 0.00044734576445097613, + "loss": 0.2249, + "step": 152470 + }, + { + "epoch": 6.32, + "grad_norm": 0.7578125, + "learning_rate": 0.0004473391063897831, + "loss": 0.2026, + "step": 152480 + }, + { + "epoch": 6.32, + "grad_norm": 0.87890625, + "learning_rate": 0.0004473324479572191, + "loss": 0.1573, + "step": 152490 + }, + { + "epoch": 6.32, + "grad_norm": 0.75, + "learning_rate": 0.0004473257891532968, + "loss": 0.2193, + "step": 152500 + }, + { + "epoch": 6.32, + "grad_norm": 0.703125, + "learning_rate": 0.00044731912997802867, + "loss": 0.1875, + "step": 152510 + }, + { + "epoch": 6.32, + "grad_norm": 0.875, + "learning_rate": 0.0004473124704314272, + "loss": 0.1946, + "step": 152520 + }, + { + "epoch": 6.32, + "grad_norm": 1.5, + "learning_rate": 0.000447305810513505, + "loss": 0.2048, + "step": 152530 + }, + { + "epoch": 6.32, + "grad_norm": 0.6328125, + "learning_rate": 0.0004472991502242746, + "loss": 0.2146, + "step": 152540 + }, + { + "epoch": 6.32, + "grad_norm": 0.734375, + "learning_rate": 0.00044729248956374844, + "loss": 0.2449, + "step": 152550 + }, + { + "epoch": 6.32, + "grad_norm": 0.52734375, + "learning_rate": 0.00044728582853193914, + "loss": 0.1776, + "step": 152560 + }, + { + "epoch": 6.32, + "grad_norm": 1.796875, + "learning_rate": 0.0004472791671288593, + "loss": 0.2177, + "step": 152570 + }, + { + "epoch": 6.32, + "grad_norm": 1.1796875, + "learning_rate": 0.0004472725053545212, + "loss": 0.2195, + "step": 152580 + }, + { + "epoch": 6.32, + "grad_norm": 0.55078125, + "learning_rate": 0.0004472658432089377, + "loss": 0.1815, + "step": 152590 + }, + { + "epoch": 6.32, + "grad_norm": 1.28125, + "learning_rate": 0.00044725918069212113, + "loss": 0.1649, + "step": 152600 + }, + { + "epoch": 6.32, + "grad_norm": 1.03125, + "learning_rate": 0.00044725251780408415, + "loss": 0.2286, + "step": 152610 + }, + { + "epoch": 6.32, + "grad_norm": 0.67578125, + "learning_rate": 0.0004472458545448391, + "loss": 0.2023, + "step": 152620 + }, + { + "epoch": 6.32, + "grad_norm": 0.72265625, + "learning_rate": 0.0004472391909143988, + "loss": 0.1837, + "step": 152630 + }, + { + "epoch": 6.32, + "grad_norm": 0.875, + "learning_rate": 0.00044723252691277555, + "loss": 0.1946, + "step": 152640 + }, + { + "epoch": 6.32, + "grad_norm": 1.2265625, + "learning_rate": 0.00044722586253998197, + "loss": 0.189, + "step": 152650 + }, + { + "epoch": 6.32, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004472191977960307, + "loss": 0.2011, + "step": 152660 + }, + { + "epoch": 6.32, + "grad_norm": 0.384765625, + "learning_rate": 0.00044721253268093414, + "loss": 0.2712, + "step": 152670 + }, + { + "epoch": 6.32, + "grad_norm": 1.65625, + "learning_rate": 0.00044720586719470493, + "loss": 0.1893, + "step": 152680 + }, + { + "epoch": 6.32, + "grad_norm": 0.953125, + "learning_rate": 0.00044719920133735555, + "loss": 0.2144, + "step": 152690 + }, + { + "epoch": 6.32, + "grad_norm": 0.51171875, + "learning_rate": 0.00044719253510889855, + "loss": 0.193, + "step": 152700 + }, + { + "epoch": 6.33, + "grad_norm": 0.5078125, + "learning_rate": 0.0004471858685093465, + "loss": 0.215, + "step": 152710 + }, + { + "epoch": 6.33, + "grad_norm": 1.015625, + "learning_rate": 0.00044717920153871196, + "loss": 0.1945, + "step": 152720 + }, + { + "epoch": 6.33, + "grad_norm": 0.1943359375, + "learning_rate": 0.00044717253419700745, + "loss": 0.1996, + "step": 152730 + }, + { + "epoch": 6.33, + "grad_norm": 0.859375, + "learning_rate": 0.0004471658664842455, + "loss": 0.2364, + "step": 152740 + }, + { + "epoch": 6.33, + "grad_norm": 0.484375, + "learning_rate": 0.0004471591984004387, + "loss": 0.2218, + "step": 152750 + }, + { + "epoch": 6.33, + "grad_norm": 0.494140625, + "learning_rate": 0.00044715252994559953, + "loss": 0.1998, + "step": 152760 + }, + { + "epoch": 6.33, + "grad_norm": 0.7265625, + "learning_rate": 0.00044714586111974065, + "loss": 0.2081, + "step": 152770 + }, + { + "epoch": 6.33, + "grad_norm": 0.404296875, + "learning_rate": 0.00044713919192287455, + "loss": 0.233, + "step": 152780 + }, + { + "epoch": 6.33, + "grad_norm": 0.28515625, + "learning_rate": 0.0004471325223550138, + "loss": 0.1615, + "step": 152790 + }, + { + "epoch": 6.33, + "grad_norm": 0.482421875, + "learning_rate": 0.0004471258524161709, + "loss": 0.2051, + "step": 152800 + }, + { + "epoch": 6.33, + "grad_norm": 0.56640625, + "learning_rate": 0.00044711918210635837, + "loss": 0.1966, + "step": 152810 + }, + { + "epoch": 6.33, + "grad_norm": 0.380859375, + "learning_rate": 0.0004471125114255889, + "loss": 0.1933, + "step": 152820 + }, + { + "epoch": 6.33, + "grad_norm": 0.78515625, + "learning_rate": 0.0004471058403738749, + "loss": 0.2135, + "step": 152830 + }, + { + "epoch": 6.33, + "grad_norm": 0.59375, + "learning_rate": 0.00044709916895122916, + "loss": 0.1841, + "step": 152840 + }, + { + "epoch": 6.33, + "grad_norm": 0.515625, + "learning_rate": 0.00044709249715766395, + "loss": 0.1886, + "step": 152850 + }, + { + "epoch": 6.33, + "grad_norm": 1.0625, + "learning_rate": 0.00044708582499319193, + "loss": 0.201, + "step": 152860 + }, + { + "epoch": 6.33, + "grad_norm": 0.69140625, + "learning_rate": 0.00044707915245782574, + "loss": 0.2541, + "step": 152870 + }, + { + "epoch": 6.33, + "grad_norm": 0.83984375, + "learning_rate": 0.0004470724795515778, + "loss": 0.2947, + "step": 152880 + }, + { + "epoch": 6.33, + "grad_norm": 0.92578125, + "learning_rate": 0.0004470658062744607, + "loss": 0.26, + "step": 152890 + }, + { + "epoch": 6.33, + "grad_norm": 0.578125, + "learning_rate": 0.00044705913262648713, + "loss": 0.2188, + "step": 152900 + }, + { + "epoch": 6.33, + "grad_norm": 0.55859375, + "learning_rate": 0.0004470524586076695, + "loss": 0.1955, + "step": 152910 + }, + { + "epoch": 6.33, + "grad_norm": 0.64453125, + "learning_rate": 0.00044704578421802046, + "loss": 0.216, + "step": 152920 + }, + { + "epoch": 6.33, + "grad_norm": 0.9140625, + "learning_rate": 0.0004470391094575525, + "loss": 0.1748, + "step": 152930 + }, + { + "epoch": 6.33, + "grad_norm": 0.490234375, + "learning_rate": 0.0004470324343262782, + "loss": 0.2114, + "step": 152940 + }, + { + "epoch": 6.34, + "grad_norm": 0.64453125, + "learning_rate": 0.00044702575882421016, + "loss": 0.1845, + "step": 152950 + }, + { + "epoch": 6.34, + "grad_norm": 0.55078125, + "learning_rate": 0.00044701908295136095, + "loss": 0.167, + "step": 152960 + }, + { + "epoch": 6.34, + "grad_norm": 1.5078125, + "learning_rate": 0.000447012406707743, + "loss": 0.2088, + "step": 152970 + }, + { + "epoch": 6.34, + "grad_norm": 0.427734375, + "learning_rate": 0.0004470057300933691, + "loss": 0.1939, + "step": 152980 + }, + { + "epoch": 6.34, + "grad_norm": 0.69921875, + "learning_rate": 0.00044699905310825164, + "loss": 0.216, + "step": 152990 + }, + { + "epoch": 6.34, + "grad_norm": 0.498046875, + "learning_rate": 0.00044699237575240326, + "loss": 0.1721, + "step": 153000 + }, + { + "epoch": 6.34, + "grad_norm": 0.85546875, + "learning_rate": 0.00044698569802583646, + "loss": 0.2129, + "step": 153010 + }, + { + "epoch": 6.34, + "grad_norm": 1.109375, + "learning_rate": 0.00044697901992856386, + "loss": 0.2057, + "step": 153020 + }, + { + "epoch": 6.34, + "grad_norm": 0.1787109375, + "learning_rate": 0.00044697234146059804, + "loss": 0.2286, + "step": 153030 + }, + { + "epoch": 6.34, + "grad_norm": 0.0, + "learning_rate": 0.00044696566262195146, + "loss": 0.2118, + "step": 153040 + }, + { + "epoch": 6.34, + "grad_norm": 0.9921875, + "learning_rate": 0.00044695898341263687, + "loss": 0.1993, + "step": 153050 + }, + { + "epoch": 6.34, + "grad_norm": 0.8671875, + "learning_rate": 0.00044695230383266673, + "loss": 0.1999, + "step": 153060 + }, + { + "epoch": 6.34, + "grad_norm": 0.5625, + "learning_rate": 0.00044694562388205363, + "loss": 0.2062, + "step": 153070 + }, + { + "epoch": 6.34, + "grad_norm": 0.5234375, + "learning_rate": 0.00044693894356081013, + "loss": 0.213, + "step": 153080 + }, + { + "epoch": 6.34, + "grad_norm": 0.8984375, + "learning_rate": 0.0004469322628689487, + "loss": 0.2359, + "step": 153090 + }, + { + "epoch": 6.34, + "grad_norm": 0.89453125, + "learning_rate": 0.00044692558180648215, + "loss": 0.148, + "step": 153100 + }, + { + "epoch": 6.34, + "grad_norm": 0.53515625, + "learning_rate": 0.0004469189003734229, + "loss": 0.1762, + "step": 153110 + }, + { + "epoch": 6.34, + "grad_norm": 1.4140625, + "learning_rate": 0.00044691221856978347, + "loss": 0.2247, + "step": 153120 + }, + { + "epoch": 6.34, + "grad_norm": 1.0234375, + "learning_rate": 0.0004469055363955766, + "loss": 0.207, + "step": 153130 + }, + { + "epoch": 6.34, + "grad_norm": 0.83203125, + "learning_rate": 0.00044689885385081475, + "loss": 0.2181, + "step": 153140 + }, + { + "epoch": 6.34, + "grad_norm": 0.6484375, + "learning_rate": 0.0004468921709355105, + "loss": 0.186, + "step": 153150 + }, + { + "epoch": 6.34, + "grad_norm": 0.41796875, + "learning_rate": 0.00044688548764967645, + "loss": 0.1725, + "step": 153160 + }, + { + "epoch": 6.34, + "grad_norm": 0.3828125, + "learning_rate": 0.00044687880399332515, + "loss": 0.1546, + "step": 153170 + }, + { + "epoch": 6.34, + "grad_norm": 1.3046875, + "learning_rate": 0.0004468721199664693, + "loss": 0.2076, + "step": 153180 + }, + { + "epoch": 6.35, + "grad_norm": 1.25, + "learning_rate": 0.0004468654355691213, + "loss": 0.2621, + "step": 153190 + }, + { + "epoch": 6.35, + "grad_norm": 0.57421875, + "learning_rate": 0.0004468587508012938, + "loss": 0.1679, + "step": 153200 + }, + { + "epoch": 6.35, + "grad_norm": 0.41015625, + "learning_rate": 0.0004468520656629994, + "loss": 0.2031, + "step": 153210 + }, + { + "epoch": 6.35, + "grad_norm": 1.171875, + "learning_rate": 0.00044684538015425066, + "loss": 0.2199, + "step": 153220 + }, + { + "epoch": 6.35, + "grad_norm": 0.50390625, + "learning_rate": 0.0004468386942750602, + "loss": 0.208, + "step": 153230 + }, + { + "epoch": 6.35, + "grad_norm": 0.51953125, + "learning_rate": 0.00044683200802544054, + "loss": 0.1993, + "step": 153240 + }, + { + "epoch": 6.35, + "grad_norm": 1.9609375, + "learning_rate": 0.00044682532140540433, + "loss": 0.2408, + "step": 153250 + }, + { + "epoch": 6.35, + "grad_norm": 0.361328125, + "learning_rate": 0.0004468186344149641, + "loss": 0.2132, + "step": 153260 + }, + { + "epoch": 6.35, + "grad_norm": 0.43359375, + "learning_rate": 0.00044681194705413247, + "loss": 0.2532, + "step": 153270 + }, + { + "epoch": 6.35, + "grad_norm": 0.6484375, + "learning_rate": 0.00044680525932292207, + "loss": 0.2348, + "step": 153280 + }, + { + "epoch": 6.35, + "grad_norm": 0.390625, + "learning_rate": 0.00044679857122134533, + "loss": 0.2108, + "step": 153290 + }, + { + "epoch": 6.35, + "grad_norm": 0.2099609375, + "learning_rate": 0.00044679188274941495, + "loss": 0.1584, + "step": 153300 + }, + { + "epoch": 6.35, + "grad_norm": 1.15625, + "learning_rate": 0.00044678519390714343, + "loss": 0.1982, + "step": 153310 + }, + { + "epoch": 6.35, + "grad_norm": 0.203125, + "learning_rate": 0.00044677850469454364, + "loss": 0.216, + "step": 153320 + }, + { + "epoch": 6.35, + "grad_norm": 0.4453125, + "learning_rate": 0.00044677181511162777, + "loss": 0.193, + "step": 153330 + }, + { + "epoch": 6.35, + "grad_norm": 0.73828125, + "learning_rate": 0.0004467651251584086, + "loss": 0.1393, + "step": 153340 + }, + { + "epoch": 6.35, + "grad_norm": 0.453125, + "learning_rate": 0.00044675843483489873, + "loss": 0.1792, + "step": 153350 + }, + { + "epoch": 6.35, + "grad_norm": 1.78125, + "learning_rate": 0.0004467517441411109, + "loss": 0.2177, + "step": 153360 + }, + { + "epoch": 6.35, + "grad_norm": 0.62109375, + "learning_rate": 0.0004467450530770573, + "loss": 0.1739, + "step": 153370 + }, + { + "epoch": 6.35, + "grad_norm": 1.140625, + "learning_rate": 0.0004467383616427508, + "loss": 0.1989, + "step": 153380 + }, + { + "epoch": 6.35, + "grad_norm": 1.5, + "learning_rate": 0.00044673166983820407, + "loss": 0.2041, + "step": 153390 + }, + { + "epoch": 6.35, + "grad_norm": 0.6171875, + "learning_rate": 0.00044672497766342956, + "loss": 0.228, + "step": 153400 + }, + { + "epoch": 6.35, + "grad_norm": 0.52734375, + "learning_rate": 0.0004467182851184398, + "loss": 0.2265, + "step": 153410 + }, + { + "epoch": 6.35, + "grad_norm": 0.482421875, + "learning_rate": 0.0004467115922032475, + "loss": 0.201, + "step": 153420 + }, + { + "epoch": 6.36, + "grad_norm": 1.1640625, + "learning_rate": 0.00044670489891786524, + "loss": 0.2207, + "step": 153430 + }, + { + "epoch": 6.36, + "grad_norm": 0.353515625, + "learning_rate": 0.00044669820526230567, + "loss": 0.226, + "step": 153440 + }, + { + "epoch": 6.36, + "grad_norm": 0.859375, + "learning_rate": 0.00044669151123658126, + "loss": 0.222, + "step": 153450 + }, + { + "epoch": 6.36, + "grad_norm": 0.828125, + "learning_rate": 0.0004466848168407046, + "loss": 0.1672, + "step": 153460 + }, + { + "epoch": 6.36, + "grad_norm": 1.65625, + "learning_rate": 0.0004466781220746884, + "loss": 0.2157, + "step": 153470 + }, + { + "epoch": 6.36, + "grad_norm": 0.67578125, + "learning_rate": 0.0004466714269385452, + "loss": 0.1933, + "step": 153480 + }, + { + "epoch": 6.36, + "grad_norm": 1.5, + "learning_rate": 0.0004466647314322877, + "loss": 0.1872, + "step": 153490 + }, + { + "epoch": 6.36, + "grad_norm": 0.75390625, + "learning_rate": 0.0004466580355559283, + "loss": 0.2174, + "step": 153500 + }, + { + "epoch": 6.36, + "grad_norm": 0.75390625, + "learning_rate": 0.00044665133930947977, + "loss": 0.1804, + "step": 153510 + }, + { + "epoch": 6.36, + "grad_norm": 0.494140625, + "learning_rate": 0.0004466446426929547, + "loss": 0.1931, + "step": 153520 + }, + { + "epoch": 6.36, + "grad_norm": 0.796875, + "learning_rate": 0.00044663794570636565, + "loss": 0.2047, + "step": 153530 + }, + { + "epoch": 6.36, + "grad_norm": 0.376953125, + "learning_rate": 0.00044663124834972513, + "loss": 0.2312, + "step": 153540 + }, + { + "epoch": 6.36, + "grad_norm": 2.296875, + "learning_rate": 0.00044662455062304587, + "loss": 0.2207, + "step": 153550 + }, + { + "epoch": 6.36, + "grad_norm": 1.1171875, + "learning_rate": 0.00044661785252634044, + "loss": 0.1931, + "step": 153560 + }, + { + "epoch": 6.36, + "grad_norm": 1.25, + "learning_rate": 0.0004466111540596215, + "loss": 0.2121, + "step": 153570 + }, + { + "epoch": 6.36, + "grad_norm": 0.53125, + "learning_rate": 0.0004466044552229015, + "loss": 0.2034, + "step": 153580 + }, + { + "epoch": 6.36, + "grad_norm": 0.427734375, + "learning_rate": 0.0004465977560161932, + "loss": 0.1631, + "step": 153590 + }, + { + "epoch": 6.36, + "grad_norm": 0.9609375, + "learning_rate": 0.00044659105643950913, + "loss": 0.193, + "step": 153600 + }, + { + "epoch": 6.36, + "grad_norm": 0.7421875, + "learning_rate": 0.000446584356492862, + "loss": 0.1808, + "step": 153610 + }, + { + "epoch": 6.36, + "grad_norm": 0.92578125, + "learning_rate": 0.00044657765617626423, + "loss": 0.2146, + "step": 153620 + }, + { + "epoch": 6.36, + "grad_norm": 0.6875, + "learning_rate": 0.0004465709554897286, + "loss": 0.2095, + "step": 153630 + }, + { + "epoch": 6.36, + "grad_norm": 0.671875, + "learning_rate": 0.00044656425443326763, + "loss": 0.1964, + "step": 153640 + }, + { + "epoch": 6.36, + "grad_norm": 0.80859375, + "learning_rate": 0.00044655755300689406, + "loss": 0.204, + "step": 153650 + }, + { + "epoch": 6.36, + "grad_norm": 1.828125, + "learning_rate": 0.00044655085121062026, + "loss": 0.2215, + "step": 153660 + }, + { + "epoch": 6.36, + "grad_norm": 1.109375, + "learning_rate": 0.00044654414904445907, + "loss": 0.2583, + "step": 153670 + }, + { + "epoch": 6.37, + "grad_norm": 0.75390625, + "learning_rate": 0.000446537446508423, + "loss": 0.2013, + "step": 153680 + }, + { + "epoch": 6.37, + "grad_norm": 0.0, + "learning_rate": 0.00044653074360252466, + "loss": 0.199, + "step": 153690 + }, + { + "epoch": 6.37, + "grad_norm": 0.640625, + "learning_rate": 0.00044652404032677676, + "loss": 0.1834, + "step": 153700 + }, + { + "epoch": 6.37, + "grad_norm": 0.15625, + "learning_rate": 0.0004465173366811918, + "loss": 0.234, + "step": 153710 + }, + { + "epoch": 6.37, + "grad_norm": 0.62890625, + "learning_rate": 0.00044651063266578234, + "loss": 0.2081, + "step": 153720 + }, + { + "epoch": 6.37, + "grad_norm": 0.251953125, + "learning_rate": 0.0004465039282805612, + "loss": 0.2048, + "step": 153730 + }, + { + "epoch": 6.37, + "grad_norm": 0.5546875, + "learning_rate": 0.0004464972235255408, + "loss": 0.2637, + "step": 153740 + }, + { + "epoch": 6.37, + "grad_norm": 0.828125, + "learning_rate": 0.000446490518400734, + "loss": 0.2147, + "step": 153750 + }, + { + "epoch": 6.37, + "grad_norm": 0.5859375, + "learning_rate": 0.0004464838129061531, + "loss": 0.1716, + "step": 153760 + }, + { + "epoch": 6.37, + "grad_norm": 0.58203125, + "learning_rate": 0.0004464771070418109, + "loss": 0.1735, + "step": 153770 + }, + { + "epoch": 6.37, + "grad_norm": 0.875, + "learning_rate": 0.00044647040080772005, + "loss": 0.1909, + "step": 153780 + }, + { + "epoch": 6.37, + "grad_norm": 1.7578125, + "learning_rate": 0.0004464636942038931, + "loss": 0.2078, + "step": 153790 + }, + { + "epoch": 6.37, + "grad_norm": 4.3125, + "learning_rate": 0.0004464569872303428, + "loss": 0.1855, + "step": 153800 + }, + { + "epoch": 6.37, + "grad_norm": 0.84375, + "learning_rate": 0.00044645027988708154, + "loss": 0.1924, + "step": 153810 + }, + { + "epoch": 6.37, + "grad_norm": 0.578125, + "learning_rate": 0.00044644357217412214, + "loss": 0.1983, + "step": 153820 + }, + { + "epoch": 6.37, + "grad_norm": 0.197265625, + "learning_rate": 0.0004464368640914771, + "loss": 0.1827, + "step": 153830 + }, + { + "epoch": 6.37, + "grad_norm": 0.8125, + "learning_rate": 0.0004464301556391591, + "loss": 0.2254, + "step": 153840 + }, + { + "epoch": 6.37, + "grad_norm": 0.72265625, + "learning_rate": 0.00044642344681718074, + "loss": 0.1767, + "step": 153850 + }, + { + "epoch": 6.37, + "grad_norm": 0.53125, + "learning_rate": 0.0004464167376255547, + "loss": 0.2504, + "step": 153860 + }, + { + "epoch": 6.37, + "grad_norm": 1.546875, + "learning_rate": 0.0004464100280642935, + "loss": 0.2086, + "step": 153870 + }, + { + "epoch": 6.37, + "grad_norm": 0.82421875, + "learning_rate": 0.00044640331813340997, + "loss": 0.1967, + "step": 153880 + }, + { + "epoch": 6.37, + "grad_norm": 1.8671875, + "learning_rate": 0.0004463966078329166, + "loss": 0.2161, + "step": 153890 + }, + { + "epoch": 6.37, + "grad_norm": 0.357421875, + "learning_rate": 0.0004463898971628259, + "loss": 0.2454, + "step": 153900 + }, + { + "epoch": 6.37, + "grad_norm": 0.83203125, + "learning_rate": 0.00044638318612315065, + "loss": 0.2235, + "step": 153910 + }, + { + "epoch": 6.38, + "grad_norm": 0.64453125, + "learning_rate": 0.0004463764747139035, + "loss": 0.2127, + "step": 153920 + }, + { + "epoch": 6.38, + "grad_norm": 0.1689453125, + "learning_rate": 0.000446369762935097, + "loss": 0.222, + "step": 153930 + }, + { + "epoch": 6.38, + "grad_norm": 0.77734375, + "learning_rate": 0.00044636305078674386, + "loss": 0.2378, + "step": 153940 + }, + { + "epoch": 6.38, + "grad_norm": 0.5703125, + "learning_rate": 0.00044635633826885663, + "loss": 0.1584, + "step": 153950 + }, + { + "epoch": 6.38, + "grad_norm": 0.447265625, + "learning_rate": 0.000446349625381448, + "loss": 0.246, + "step": 153960 + }, + { + "epoch": 6.38, + "grad_norm": 2.109375, + "learning_rate": 0.00044634291212453054, + "loss": 0.2031, + "step": 153970 + }, + { + "epoch": 6.38, + "grad_norm": 0.6484375, + "learning_rate": 0.0004463361984981169, + "loss": 0.2445, + "step": 153980 + }, + { + "epoch": 6.38, + "grad_norm": 0.859375, + "learning_rate": 0.00044632948450221984, + "loss": 0.2399, + "step": 153990 + }, + { + "epoch": 6.38, + "grad_norm": 0.474609375, + "learning_rate": 0.00044632277013685183, + "loss": 0.2447, + "step": 154000 + }, + { + "epoch": 6.38, + "grad_norm": 0.609375, + "learning_rate": 0.00044631605540202556, + "loss": 0.193, + "step": 154010 + }, + { + "epoch": 6.38, + "grad_norm": 0.6875, + "learning_rate": 0.00044630934029775367, + "loss": 0.2441, + "step": 154020 + }, + { + "epoch": 6.38, + "grad_norm": 0.796875, + "learning_rate": 0.0004463026248240488, + "loss": 0.1503, + "step": 154030 + }, + { + "epoch": 6.38, + "grad_norm": 0.357421875, + "learning_rate": 0.00044629590898092366, + "loss": 0.1482, + "step": 154040 + }, + { + "epoch": 6.38, + "grad_norm": 0.458984375, + "learning_rate": 0.00044628919276839076, + "loss": 0.205, + "step": 154050 + }, + { + "epoch": 6.38, + "grad_norm": 0.65234375, + "learning_rate": 0.00044628247618646276, + "loss": 0.1957, + "step": 154060 + }, + { + "epoch": 6.38, + "grad_norm": 0.8203125, + "learning_rate": 0.00044627575923515233, + "loss": 0.2322, + "step": 154070 + }, + { + "epoch": 6.38, + "grad_norm": 0.3125, + "learning_rate": 0.0004462690419144722, + "loss": 0.1691, + "step": 154080 + }, + { + "epoch": 6.38, + "grad_norm": 0.6640625, + "learning_rate": 0.0004462623242244349, + "loss": 0.2143, + "step": 154090 + }, + { + "epoch": 6.38, + "grad_norm": 0.80078125, + "learning_rate": 0.00044625560616505305, + "loss": 0.1335, + "step": 154100 + }, + { + "epoch": 6.38, + "grad_norm": 0.6328125, + "learning_rate": 0.00044624888773633935, + "loss": 0.1769, + "step": 154110 + }, + { + "epoch": 6.38, + "grad_norm": 0.55859375, + "learning_rate": 0.0004462421689383065, + "loss": 0.2441, + "step": 154120 + }, + { + "epoch": 6.38, + "grad_norm": 1.5859375, + "learning_rate": 0.00044623544977096707, + "loss": 0.2208, + "step": 154130 + }, + { + "epoch": 6.38, + "grad_norm": 0.71875, + "learning_rate": 0.0004462287302343337, + "loss": 0.1486, + "step": 154140 + }, + { + "epoch": 6.38, + "grad_norm": 0.373046875, + "learning_rate": 0.000446222010328419, + "loss": 0.2741, + "step": 154150 + }, + { + "epoch": 6.39, + "grad_norm": 1.0859375, + "learning_rate": 0.00044621529005323574, + "loss": 0.213, + "step": 154160 + }, + { + "epoch": 6.39, + "grad_norm": 0.59765625, + "learning_rate": 0.00044620856940879645, + "loss": 0.2147, + "step": 154170 + }, + { + "epoch": 6.39, + "grad_norm": 1.2265625, + "learning_rate": 0.00044620184839511384, + "loss": 0.1912, + "step": 154180 + }, + { + "epoch": 6.39, + "grad_norm": 0.341796875, + "learning_rate": 0.00044619512701220054, + "loss": 0.2302, + "step": 154190 + }, + { + "epoch": 6.39, + "grad_norm": 0.48828125, + "learning_rate": 0.0004461884052600692, + "loss": 0.1955, + "step": 154200 + }, + { + "epoch": 6.39, + "grad_norm": 0.345703125, + "learning_rate": 0.0004461816831387324, + "loss": 0.1922, + "step": 154210 + }, + { + "epoch": 6.39, + "grad_norm": 0.455078125, + "learning_rate": 0.00044617496064820294, + "loss": 0.2251, + "step": 154220 + }, + { + "epoch": 6.39, + "grad_norm": 0.46875, + "learning_rate": 0.00044616823778849336, + "loss": 0.2142, + "step": 154230 + }, + { + "epoch": 6.39, + "grad_norm": 0.94921875, + "learning_rate": 0.0004461615145596164, + "loss": 0.2365, + "step": 154240 + }, + { + "epoch": 6.39, + "grad_norm": 0.60546875, + "learning_rate": 0.0004461547909615846, + "loss": 0.2243, + "step": 154250 + }, + { + "epoch": 6.39, + "grad_norm": 0.4296875, + "learning_rate": 0.00044614806699441067, + "loss": 0.2255, + "step": 154260 + }, + { + "epoch": 6.39, + "grad_norm": 0.82421875, + "learning_rate": 0.00044614134265810723, + "loss": 0.2121, + "step": 154270 + }, + { + "epoch": 6.39, + "grad_norm": 1.1171875, + "learning_rate": 0.000446134617952687, + "loss": 0.2288, + "step": 154280 + }, + { + "epoch": 6.39, + "grad_norm": 0.9453125, + "learning_rate": 0.00044612789287816257, + "loss": 0.1971, + "step": 154290 + }, + { + "epoch": 6.39, + "grad_norm": 0.328125, + "learning_rate": 0.00044612116743454665, + "loss": 0.2107, + "step": 154300 + }, + { + "epoch": 6.39, + "grad_norm": 0.54296875, + "learning_rate": 0.0004461144416218519, + "loss": 0.2168, + "step": 154310 + }, + { + "epoch": 6.39, + "grad_norm": 0.5703125, + "learning_rate": 0.00044610771544009085, + "loss": 0.2174, + "step": 154320 + }, + { + "epoch": 6.39, + "grad_norm": 0.52734375, + "learning_rate": 0.0004461009888892764, + "loss": 0.1862, + "step": 154330 + }, + { + "epoch": 6.39, + "grad_norm": 0.91796875, + "learning_rate": 0.00044609426196942094, + "loss": 0.2227, + "step": 154340 + }, + { + "epoch": 6.39, + "grad_norm": 0.66796875, + "learning_rate": 0.0004460875346805373, + "loss": 0.237, + "step": 154350 + }, + { + "epoch": 6.39, + "grad_norm": 0.419921875, + "learning_rate": 0.0004460808070226381, + "loss": 0.1858, + "step": 154360 + }, + { + "epoch": 6.39, + "grad_norm": 0.447265625, + "learning_rate": 0.00044607407899573603, + "loss": 0.2095, + "step": 154370 + }, + { + "epoch": 6.39, + "grad_norm": 0.5234375, + "learning_rate": 0.0004460673505998437, + "loss": 0.1917, + "step": 154380 + }, + { + "epoch": 6.39, + "grad_norm": 1.1328125, + "learning_rate": 0.0004460606218349738, + "loss": 0.2101, + "step": 154390 + }, + { + "epoch": 6.4, + "grad_norm": 1.3046875, + "learning_rate": 0.00044605389270113894, + "loss": 0.2197, + "step": 154400 + }, + { + "epoch": 6.4, + "grad_norm": 0.466796875, + "learning_rate": 0.00044604716319835184, + "loss": 0.257, + "step": 154410 + }, + { + "epoch": 6.4, + "grad_norm": 1.046875, + "learning_rate": 0.00044604043332662516, + "loss": 0.1689, + "step": 154420 + }, + { + "epoch": 6.4, + "grad_norm": 1.0, + "learning_rate": 0.0004460337030859715, + "loss": 0.1624, + "step": 154430 + }, + { + "epoch": 6.4, + "grad_norm": 0.412109375, + "learning_rate": 0.0004460269724764037, + "loss": 0.2149, + "step": 154440 + }, + { + "epoch": 6.4, + "grad_norm": 0.828125, + "learning_rate": 0.00044602024149793416, + "loss": 0.1936, + "step": 154450 + }, + { + "epoch": 6.4, + "grad_norm": 0.75, + "learning_rate": 0.0004460135101505758, + "loss": 0.1764, + "step": 154460 + }, + { + "epoch": 6.4, + "grad_norm": 0.53125, + "learning_rate": 0.0004460067784343411, + "loss": 0.1824, + "step": 154470 + }, + { + "epoch": 6.4, + "grad_norm": 1.7109375, + "learning_rate": 0.00044600004634924296, + "loss": 0.1992, + "step": 154480 + }, + { + "epoch": 6.4, + "grad_norm": 0.84375, + "learning_rate": 0.00044599331389529376, + "loss": 0.2309, + "step": 154490 + }, + { + "epoch": 6.4, + "grad_norm": 1.125, + "learning_rate": 0.00044598658107250635, + "loss": 0.1911, + "step": 154500 + }, + { + "epoch": 6.4, + "grad_norm": 0.369140625, + "learning_rate": 0.00044597984788089336, + "loss": 0.2111, + "step": 154510 + }, + { + "epoch": 6.4, + "grad_norm": 0.7421875, + "learning_rate": 0.0004459731143204675, + "loss": 0.1743, + "step": 154520 + }, + { + "epoch": 6.4, + "grad_norm": 0.486328125, + "learning_rate": 0.0004459663803912413, + "loss": 0.245, + "step": 154530 + }, + { + "epoch": 6.4, + "grad_norm": 0.41015625, + "learning_rate": 0.0004459596460932276, + "loss": 0.2195, + "step": 154540 + }, + { + "epoch": 6.4, + "grad_norm": 0.68359375, + "learning_rate": 0.000445952911426439, + "loss": 0.2287, + "step": 154550 + }, + { + "epoch": 6.4, + "grad_norm": 0.7578125, + "learning_rate": 0.0004459461763908882, + "loss": 0.2477, + "step": 154560 + }, + { + "epoch": 6.4, + "grad_norm": 2.109375, + "learning_rate": 0.00044593944098658786, + "loss": 0.1628, + "step": 154570 + }, + { + "epoch": 6.4, + "grad_norm": 0.318359375, + "learning_rate": 0.0004459327052135506, + "loss": 0.2003, + "step": 154580 + }, + { + "epoch": 6.4, + "grad_norm": 0.390625, + "learning_rate": 0.0004459259690717892, + "loss": 0.2057, + "step": 154590 + }, + { + "epoch": 6.4, + "grad_norm": 0.40234375, + "learning_rate": 0.00044591923256131626, + "loss": 0.2261, + "step": 154600 + }, + { + "epoch": 6.4, + "grad_norm": 0.65234375, + "learning_rate": 0.0004459124956821445, + "loss": 0.1928, + "step": 154610 + }, + { + "epoch": 6.4, + "grad_norm": 0.51953125, + "learning_rate": 0.00044590575843428653, + "loss": 0.2059, + "step": 154620 + }, + { + "epoch": 6.4, + "grad_norm": 0.408203125, + "learning_rate": 0.0004458990208177551, + "loss": 0.1805, + "step": 154630 + }, + { + "epoch": 6.41, + "grad_norm": 0.91796875, + "learning_rate": 0.00044589228283256296, + "loss": 0.1908, + "step": 154640 + }, + { + "epoch": 6.41, + "grad_norm": 0.66796875, + "learning_rate": 0.0004458855444787226, + "loss": 0.2023, + "step": 154650 + }, + { + "epoch": 6.41, + "grad_norm": 0.427734375, + "learning_rate": 0.0004458788057562468, + "loss": 0.2064, + "step": 154660 + }, + { + "epoch": 6.41, + "grad_norm": 0.91015625, + "learning_rate": 0.0004458720666651482, + "loss": 0.2758, + "step": 154670 + }, + { + "epoch": 6.41, + "grad_norm": 1.8046875, + "learning_rate": 0.00044586532720543965, + "loss": 0.2095, + "step": 154680 + }, + { + "epoch": 6.41, + "grad_norm": 0.58984375, + "learning_rate": 0.0004458585873771336, + "loss": 0.1251, + "step": 154690 + }, + { + "epoch": 6.41, + "grad_norm": 1.0390625, + "learning_rate": 0.00044585184718024293, + "loss": 0.1506, + "step": 154700 + }, + { + "epoch": 6.41, + "grad_norm": 0.890625, + "learning_rate": 0.00044584510661478015, + "loss": 0.2311, + "step": 154710 + }, + { + "epoch": 6.41, + "grad_norm": 0.62890625, + "learning_rate": 0.0004458383656807581, + "loss": 0.2114, + "step": 154720 + }, + { + "epoch": 6.41, + "grad_norm": 1.25, + "learning_rate": 0.00044583162437818934, + "loss": 0.1784, + "step": 154730 + }, + { + "epoch": 6.41, + "grad_norm": 1.1875, + "learning_rate": 0.00044582488270708667, + "loss": 0.2257, + "step": 154740 + }, + { + "epoch": 6.41, + "grad_norm": 0.71875, + "learning_rate": 0.0004458181406674626, + "loss": 0.1962, + "step": 154750 + }, + { + "epoch": 6.41, + "grad_norm": 0.2314453125, + "learning_rate": 0.00044581139825933003, + "loss": 0.2021, + "step": 154760 + }, + { + "epoch": 6.41, + "grad_norm": 1.421875, + "learning_rate": 0.00044580465548270154, + "loss": 0.1545, + "step": 154770 + }, + { + "epoch": 6.41, + "grad_norm": 0.4921875, + "learning_rate": 0.0004457979123375898, + "loss": 0.1689, + "step": 154780 + }, + { + "epoch": 6.41, + "grad_norm": 1.09375, + "learning_rate": 0.00044579116882400763, + "loss": 0.2534, + "step": 154790 + }, + { + "epoch": 6.41, + "grad_norm": 0.7421875, + "learning_rate": 0.0004457844249419676, + "loss": 0.1742, + "step": 154800 + }, + { + "epoch": 6.41, + "grad_norm": 0.404296875, + "learning_rate": 0.00044577768069148236, + "loss": 0.2021, + "step": 154810 + }, + { + "epoch": 6.41, + "grad_norm": 0.40234375, + "learning_rate": 0.00044577093607256463, + "loss": 0.2448, + "step": 154820 + }, + { + "epoch": 6.41, + "grad_norm": 0.61328125, + "learning_rate": 0.00044576419108522725, + "loss": 0.2518, + "step": 154830 + }, + { + "epoch": 6.41, + "grad_norm": 2.0625, + "learning_rate": 0.00044575744572948275, + "loss": 0.1987, + "step": 154840 + }, + { + "epoch": 6.41, + "grad_norm": 0.40234375, + "learning_rate": 0.00044575070000534386, + "loss": 0.2341, + "step": 154850 + }, + { + "epoch": 6.41, + "grad_norm": 0.546875, + "learning_rate": 0.0004457439539128233, + "loss": 0.1973, + "step": 154860 + }, + { + "epoch": 6.41, + "grad_norm": 1.09375, + "learning_rate": 0.0004457372074519338, + "loss": 0.1911, + "step": 154870 + }, + { + "epoch": 6.42, + "grad_norm": 0.58984375, + "learning_rate": 0.00044573046062268797, + "loss": 0.1856, + "step": 154880 + }, + { + "epoch": 6.42, + "grad_norm": 0.83203125, + "learning_rate": 0.00044572371342509854, + "loss": 0.1991, + "step": 154890 + }, + { + "epoch": 6.42, + "grad_norm": 0.640625, + "learning_rate": 0.0004457169658591782, + "loss": 0.1766, + "step": 154900 + }, + { + "epoch": 6.42, + "grad_norm": 0.2890625, + "learning_rate": 0.0004457102179249397, + "loss": 0.1987, + "step": 154910 + }, + { + "epoch": 6.42, + "grad_norm": 0.984375, + "learning_rate": 0.00044570346962239575, + "loss": 0.2123, + "step": 154920 + }, + { + "epoch": 6.42, + "grad_norm": 0.134765625, + "learning_rate": 0.000445696720951559, + "loss": 0.1995, + "step": 154930 + }, + { + "epoch": 6.42, + "grad_norm": 2.109375, + "learning_rate": 0.0004456899719124421, + "loss": 0.2275, + "step": 154940 + }, + { + "epoch": 6.42, + "grad_norm": 0.34765625, + "learning_rate": 0.0004456832225050578, + "loss": 0.1539, + "step": 154950 + }, + { + "epoch": 6.42, + "grad_norm": 0.55859375, + "learning_rate": 0.00044567647272941886, + "loss": 0.2273, + "step": 154960 + }, + { + "epoch": 6.42, + "grad_norm": 1.0390625, + "learning_rate": 0.0004456697225855379, + "loss": 0.1919, + "step": 154970 + }, + { + "epoch": 6.42, + "grad_norm": 0.66796875, + "learning_rate": 0.00044566297207342766, + "loss": 0.2422, + "step": 154980 + }, + { + "epoch": 6.42, + "grad_norm": 0.60546875, + "learning_rate": 0.0004456562211931008, + "loss": 0.181, + "step": 154990 + }, + { + "epoch": 6.42, + "grad_norm": 0.9921875, + "learning_rate": 0.00044564946994457013, + "loss": 0.2042, + "step": 155000 + }, + { + "epoch": 6.42, + "grad_norm": 1.21875, + "learning_rate": 0.0004456427183278482, + "loss": 0.2494, + "step": 155010 + }, + { + "epoch": 6.42, + "grad_norm": 1.109375, + "learning_rate": 0.0004456359663429479, + "loss": 0.2042, + "step": 155020 + }, + { + "epoch": 6.42, + "grad_norm": 0.55859375, + "learning_rate": 0.00044562921398988176, + "loss": 0.2492, + "step": 155030 + }, + { + "epoch": 6.42, + "grad_norm": 0.50390625, + "learning_rate": 0.00044562246126866254, + "loss": 0.1761, + "step": 155040 + }, + { + "epoch": 6.42, + "grad_norm": 0.46484375, + "learning_rate": 0.0004456157081793031, + "loss": 0.2255, + "step": 155050 + }, + { + "epoch": 6.42, + "grad_norm": 0.97265625, + "learning_rate": 0.0004456089547218159, + "loss": 0.1774, + "step": 155060 + }, + { + "epoch": 6.42, + "grad_norm": 0.44921875, + "learning_rate": 0.00044560220089621386, + "loss": 0.1883, + "step": 155070 + }, + { + "epoch": 6.42, + "grad_norm": 0.67578125, + "learning_rate": 0.00044559544670250953, + "loss": 0.164, + "step": 155080 + }, + { + "epoch": 6.42, + "grad_norm": 0.41015625, + "learning_rate": 0.00044558869214071574, + "loss": 0.1931, + "step": 155090 + }, + { + "epoch": 6.42, + "grad_norm": 1.2265625, + "learning_rate": 0.00044558193721084513, + "loss": 0.1692, + "step": 155100 + }, + { + "epoch": 6.42, + "grad_norm": 0.1728515625, + "learning_rate": 0.0004455751819129104, + "loss": 0.1984, + "step": 155110 + }, + { + "epoch": 6.43, + "grad_norm": 0.5234375, + "learning_rate": 0.0004455684262469244, + "loss": 0.2106, + "step": 155120 + }, + { + "epoch": 6.43, + "grad_norm": 0.7890625, + "learning_rate": 0.00044556167021289964, + "loss": 0.2189, + "step": 155130 + }, + { + "epoch": 6.43, + "grad_norm": 0.890625, + "learning_rate": 0.000445554913810849, + "loss": 0.2269, + "step": 155140 + }, + { + "epoch": 6.43, + "grad_norm": 1.296875, + "learning_rate": 0.0004455481570407851, + "loss": 0.2228, + "step": 155150 + }, + { + "epoch": 6.43, + "grad_norm": 0.6484375, + "learning_rate": 0.0004455413999027207, + "loss": 0.2103, + "step": 155160 + }, + { + "epoch": 6.43, + "grad_norm": 1.03125, + "learning_rate": 0.00044553464239666845, + "loss": 0.219, + "step": 155170 + }, + { + "epoch": 6.43, + "grad_norm": 0.7734375, + "learning_rate": 0.00044552788452264114, + "loss": 0.2709, + "step": 155180 + }, + { + "epoch": 6.43, + "grad_norm": 0.474609375, + "learning_rate": 0.0004455211262806515, + "loss": 0.2106, + "step": 155190 + }, + { + "epoch": 6.43, + "grad_norm": 0.578125, + "learning_rate": 0.00044551436767071217, + "loss": 0.2049, + "step": 155200 + }, + { + "epoch": 6.43, + "grad_norm": 0.92578125, + "learning_rate": 0.00044550760869283603, + "loss": 0.2118, + "step": 155210 + }, + { + "epoch": 6.43, + "grad_norm": 0.609375, + "learning_rate": 0.0004455008493470356, + "loss": 0.2062, + "step": 155220 + }, + { + "epoch": 6.43, + "grad_norm": 1.171875, + "learning_rate": 0.0004454940896333236, + "loss": 0.229, + "step": 155230 + }, + { + "epoch": 6.43, + "grad_norm": 0.859375, + "learning_rate": 0.000445487329551713, + "loss": 0.2066, + "step": 155240 + }, + { + "epoch": 6.43, + "grad_norm": 0.255859375, + "learning_rate": 0.00044548056910221625, + "loss": 0.183, + "step": 155250 + }, + { + "epoch": 6.43, + "grad_norm": 1.3046875, + "learning_rate": 0.00044547380828484617, + "loss": 0.2321, + "step": 155260 + }, + { + "epoch": 6.43, + "grad_norm": 0.5859375, + "learning_rate": 0.00044546704709961556, + "loss": 0.1822, + "step": 155270 + }, + { + "epoch": 6.43, + "grad_norm": 0.8359375, + "learning_rate": 0.00044546028554653705, + "loss": 0.2105, + "step": 155280 + }, + { + "epoch": 6.43, + "grad_norm": 0.7109375, + "learning_rate": 0.00044545352362562334, + "loss": 0.2394, + "step": 155290 + }, + { + "epoch": 6.43, + "grad_norm": 0.330078125, + "learning_rate": 0.00044544676133688726, + "loss": 0.1851, + "step": 155300 + }, + { + "epoch": 6.43, + "grad_norm": 1.34375, + "learning_rate": 0.00044543999868034144, + "loss": 0.238, + "step": 155310 + }, + { + "epoch": 6.43, + "grad_norm": 0.2333984375, + "learning_rate": 0.0004454332356559987, + "loss": 0.1847, + "step": 155320 + }, + { + "epoch": 6.43, + "grad_norm": 0.78125, + "learning_rate": 0.0004454264722638717, + "loss": 0.1876, + "step": 155330 + }, + { + "epoch": 6.43, + "grad_norm": 0.546875, + "learning_rate": 0.00044541970850397317, + "loss": 0.2084, + "step": 155340 + }, + { + "epoch": 6.43, + "grad_norm": 0.28515625, + "learning_rate": 0.0004454129443763159, + "loss": 0.1959, + "step": 155350 + }, + { + "epoch": 6.43, + "grad_norm": 0.67578125, + "learning_rate": 0.0004454061798809125, + "loss": 0.242, + "step": 155360 + }, + { + "epoch": 6.44, + "grad_norm": 0.97265625, + "learning_rate": 0.0004453994150177758, + "loss": 0.2103, + "step": 155370 + }, + { + "epoch": 6.44, + "grad_norm": 1.3828125, + "learning_rate": 0.0004453926497869185, + "loss": 0.2224, + "step": 155380 + }, + { + "epoch": 6.44, + "grad_norm": 0.46875, + "learning_rate": 0.00044538588418835336, + "loss": 0.1405, + "step": 155390 + }, + { + "epoch": 6.44, + "grad_norm": 0.78125, + "learning_rate": 0.0004453791182220931, + "loss": 0.2285, + "step": 155400 + }, + { + "epoch": 6.44, + "grad_norm": 1.59375, + "learning_rate": 0.0004453723518881504, + "loss": 0.1985, + "step": 155410 + }, + { + "epoch": 6.44, + "grad_norm": 1.0625, + "learning_rate": 0.00044536558518653804, + "loss": 0.1969, + "step": 155420 + }, + { + "epoch": 6.44, + "grad_norm": 1.3515625, + "learning_rate": 0.0004453588181172687, + "loss": 0.2393, + "step": 155430 + }, + { + "epoch": 6.44, + "grad_norm": 0.40234375, + "learning_rate": 0.00044535205068035524, + "loss": 0.2209, + "step": 155440 + }, + { + "epoch": 6.44, + "grad_norm": 1.0234375, + "learning_rate": 0.0004453452828758102, + "loss": 0.1682, + "step": 155450 + }, + { + "epoch": 6.44, + "grad_norm": 0.734375, + "learning_rate": 0.0004453385147036465, + "loss": 0.1674, + "step": 155460 + }, + { + "epoch": 6.44, + "grad_norm": 0.78515625, + "learning_rate": 0.0004453317461638768, + "loss": 0.1858, + "step": 155470 + }, + { + "epoch": 6.44, + "grad_norm": 0.87890625, + "learning_rate": 0.0004453249772565139, + "loss": 0.2182, + "step": 155480 + }, + { + "epoch": 6.44, + "grad_norm": 0.89453125, + "learning_rate": 0.00044531820798157055, + "loss": 0.1718, + "step": 155490 + }, + { + "epoch": 6.44, + "grad_norm": 0.921875, + "learning_rate": 0.00044531143833905927, + "loss": 0.2385, + "step": 155500 + }, + { + "epoch": 6.44, + "grad_norm": 1.546875, + "learning_rate": 0.00044530466832899305, + "loss": 0.1798, + "step": 155510 + }, + { + "epoch": 6.44, + "grad_norm": 0.9296875, + "learning_rate": 0.00044529789795138446, + "loss": 0.2237, + "step": 155520 + }, + { + "epoch": 6.44, + "grad_norm": 2.25, + "learning_rate": 0.00044529112720624635, + "loss": 0.2099, + "step": 155530 + }, + { + "epoch": 6.44, + "grad_norm": 0.412109375, + "learning_rate": 0.0004452843560935914, + "loss": 0.174, + "step": 155540 + }, + { + "epoch": 6.44, + "grad_norm": 0.859375, + "learning_rate": 0.0004452775846134325, + "loss": 0.175, + "step": 155550 + }, + { + "epoch": 6.44, + "grad_norm": 0.859375, + "learning_rate": 0.0004452708127657822, + "loss": 0.2236, + "step": 155560 + }, + { + "epoch": 6.44, + "grad_norm": 1.1015625, + "learning_rate": 0.0004452640405506533, + "loss": 0.2405, + "step": 155570 + }, + { + "epoch": 6.44, + "grad_norm": 0.3515625, + "learning_rate": 0.00044525726796805856, + "loss": 0.206, + "step": 155580 + }, + { + "epoch": 6.44, + "grad_norm": 0.62109375, + "learning_rate": 0.00044525049501801075, + "loss": 0.2506, + "step": 155590 + }, + { + "epoch": 6.44, + "grad_norm": 0.4296875, + "learning_rate": 0.0004452437217005226, + "loss": 0.2203, + "step": 155600 + }, + { + "epoch": 6.45, + "grad_norm": 0.4765625, + "learning_rate": 0.0004452369480156068, + "loss": 0.1667, + "step": 155610 + }, + { + "epoch": 6.45, + "grad_norm": 0.71875, + "learning_rate": 0.00044523017396327615, + "loss": 0.1939, + "step": 155620 + }, + { + "epoch": 6.45, + "grad_norm": 0.349609375, + "learning_rate": 0.0004452233995435434, + "loss": 0.195, + "step": 155630 + }, + { + "epoch": 6.45, + "grad_norm": 0.58984375, + "learning_rate": 0.00044521662475642136, + "loss": 0.1864, + "step": 155640 + }, + { + "epoch": 6.45, + "grad_norm": 0.69921875, + "learning_rate": 0.0004452098496019227, + "loss": 0.1776, + "step": 155650 + }, + { + "epoch": 6.45, + "grad_norm": 0.7265625, + "learning_rate": 0.0004452030740800601, + "loss": 0.1859, + "step": 155660 + }, + { + "epoch": 6.45, + "grad_norm": 0.39453125, + "learning_rate": 0.0004451962981908465, + "loss": 0.1476, + "step": 155670 + }, + { + "epoch": 6.45, + "grad_norm": 1.0234375, + "learning_rate": 0.0004451895219342945, + "loss": 0.2056, + "step": 155680 + }, + { + "epoch": 6.45, + "grad_norm": 0.48046875, + "learning_rate": 0.0004451827453104169, + "loss": 0.2062, + "step": 155690 + }, + { + "epoch": 6.45, + "grad_norm": 1.015625, + "learning_rate": 0.0004451759683192264, + "loss": 0.1924, + "step": 155700 + }, + { + "epoch": 6.45, + "grad_norm": 0.6796875, + "learning_rate": 0.00044516919096073586, + "loss": 0.2132, + "step": 155710 + }, + { + "epoch": 6.45, + "grad_norm": 0.5703125, + "learning_rate": 0.000445162413234958, + "loss": 0.2109, + "step": 155720 + }, + { + "epoch": 6.45, + "grad_norm": 0.578125, + "learning_rate": 0.0004451556351419055, + "loss": 0.1871, + "step": 155730 + }, + { + "epoch": 6.45, + "grad_norm": 1.765625, + "learning_rate": 0.0004451488566815912, + "loss": 0.1943, + "step": 155740 + }, + { + "epoch": 6.45, + "grad_norm": 0.4296875, + "learning_rate": 0.00044514207785402785, + "loss": 0.2119, + "step": 155750 + }, + { + "epoch": 6.45, + "grad_norm": 0.859375, + "learning_rate": 0.00044513529865922814, + "loss": 0.1669, + "step": 155760 + }, + { + "epoch": 6.45, + "grad_norm": 0.59765625, + "learning_rate": 0.0004451285190972049, + "loss": 0.1482, + "step": 155770 + }, + { + "epoch": 6.45, + "grad_norm": 0.625, + "learning_rate": 0.00044512173916797085, + "loss": 0.2124, + "step": 155780 + }, + { + "epoch": 6.45, + "grad_norm": 0.5, + "learning_rate": 0.00044511495887153874, + "loss": 0.2193, + "step": 155790 + }, + { + "epoch": 6.45, + "grad_norm": 0.443359375, + "learning_rate": 0.0004451081782079214, + "loss": 0.1931, + "step": 155800 + }, + { + "epoch": 6.45, + "grad_norm": 0.353515625, + "learning_rate": 0.00044510139717713145, + "loss": 0.1973, + "step": 155810 + }, + { + "epoch": 6.45, + "grad_norm": 1.640625, + "learning_rate": 0.00044509461577918176, + "loss": 0.2029, + "step": 155820 + }, + { + "epoch": 6.45, + "grad_norm": 1.125, + "learning_rate": 0.00044508783401408516, + "loss": 0.1853, + "step": 155830 + }, + { + "epoch": 6.45, + "grad_norm": 0.181640625, + "learning_rate": 0.00044508105188185423, + "loss": 0.2004, + "step": 155840 + }, + { + "epoch": 6.46, + "grad_norm": 1.15625, + "learning_rate": 0.0004450742693825018, + "loss": 0.2191, + "step": 155850 + }, + { + "epoch": 6.46, + "grad_norm": 0.703125, + "learning_rate": 0.0004450674865160408, + "loss": 0.2083, + "step": 155860 + }, + { + "epoch": 6.46, + "grad_norm": 1.0859375, + "learning_rate": 0.00044506070328248373, + "loss": 0.1848, + "step": 155870 + }, + { + "epoch": 6.46, + "grad_norm": 0.625, + "learning_rate": 0.00044505391968184353, + "loss": 0.1877, + "step": 155880 + }, + { + "epoch": 6.46, + "grad_norm": 1.125, + "learning_rate": 0.000445047135714133, + "loss": 0.2597, + "step": 155890 + }, + { + "epoch": 6.46, + "grad_norm": 0.59765625, + "learning_rate": 0.00044504035137936474, + "loss": 0.2332, + "step": 155900 + }, + { + "epoch": 6.46, + "grad_norm": 2.203125, + "learning_rate": 0.00044503356667755157, + "loss": 0.1763, + "step": 155910 + }, + { + "epoch": 6.46, + "grad_norm": 0.5703125, + "learning_rate": 0.0004450267816087063, + "loss": 0.2401, + "step": 155920 + }, + { + "epoch": 6.46, + "grad_norm": 0.85546875, + "learning_rate": 0.0004450199961728417, + "loss": 0.2084, + "step": 155930 + }, + { + "epoch": 6.46, + "grad_norm": 0.7890625, + "learning_rate": 0.00044501321036997054, + "loss": 0.1908, + "step": 155940 + }, + { + "epoch": 6.46, + "grad_norm": 1.2421875, + "learning_rate": 0.00044500642420010556, + "loss": 0.1631, + "step": 155950 + }, + { + "epoch": 6.46, + "grad_norm": 0.39453125, + "learning_rate": 0.00044499963766325956, + "loss": 0.2547, + "step": 155960 + }, + { + "epoch": 6.46, + "grad_norm": 0.53515625, + "learning_rate": 0.00044499285075944527, + "loss": 0.2119, + "step": 155970 + }, + { + "epoch": 6.46, + "grad_norm": 0.455078125, + "learning_rate": 0.00044498606348867555, + "loss": 0.2189, + "step": 155980 + }, + { + "epoch": 6.46, + "grad_norm": 0.408203125, + "learning_rate": 0.0004449792758509631, + "loss": 0.1899, + "step": 155990 + }, + { + "epoch": 6.46, + "grad_norm": 0.92578125, + "learning_rate": 0.0004449724878463207, + "loss": 0.193, + "step": 156000 + }, + { + "epoch": 6.46, + "grad_norm": 0.59375, + "learning_rate": 0.00044496569947476117, + "loss": 0.2181, + "step": 156010 + }, + { + "epoch": 6.46, + "grad_norm": 1.9296875, + "learning_rate": 0.00044495891073629716, + "loss": 0.1668, + "step": 156020 + }, + { + "epoch": 6.46, + "grad_norm": 0.236328125, + "learning_rate": 0.0004449521216309416, + "loss": 0.1301, + "step": 156030 + }, + { + "epoch": 6.46, + "grad_norm": 0.859375, + "learning_rate": 0.0004449453321587072, + "loss": 0.2623, + "step": 156040 + }, + { + "epoch": 6.46, + "grad_norm": 0.8359375, + "learning_rate": 0.00044493854231960673, + "loss": 0.2312, + "step": 156050 + }, + { + "epoch": 6.46, + "grad_norm": 0.9296875, + "learning_rate": 0.000444931752113653, + "loss": 0.2768, + "step": 156060 + }, + { + "epoch": 6.46, + "grad_norm": 0.71484375, + "learning_rate": 0.00044492496154085873, + "loss": 0.2153, + "step": 156070 + }, + { + "epoch": 6.46, + "grad_norm": 0.71484375, + "learning_rate": 0.0004449181706012367, + "loss": 0.2031, + "step": 156080 + }, + { + "epoch": 6.47, + "grad_norm": 0.64453125, + "learning_rate": 0.00044491137929479976, + "loss": 0.1768, + "step": 156090 + }, + { + "epoch": 6.47, + "grad_norm": 0.6328125, + "learning_rate": 0.0004449045876215606, + "loss": 0.2219, + "step": 156100 + }, + { + "epoch": 6.47, + "grad_norm": 0.447265625, + "learning_rate": 0.0004448977955815321, + "loss": 0.1654, + "step": 156110 + }, + { + "epoch": 6.47, + "grad_norm": 0.75, + "learning_rate": 0.000444891003174727, + "loss": 0.2246, + "step": 156120 + }, + { + "epoch": 6.47, + "grad_norm": 0.88671875, + "learning_rate": 0.00044488421040115813, + "loss": 0.2173, + "step": 156130 + }, + { + "epoch": 6.47, + "grad_norm": 0.9296875, + "learning_rate": 0.0004448774172608381, + "loss": 0.2745, + "step": 156140 + }, + { + "epoch": 6.47, + "grad_norm": 0.86328125, + "learning_rate": 0.00044487062375377995, + "loss": 0.1925, + "step": 156150 + }, + { + "epoch": 6.47, + "grad_norm": 0.65625, + "learning_rate": 0.00044486382987999625, + "loss": 0.2279, + "step": 156160 + }, + { + "epoch": 6.47, + "grad_norm": 1.3125, + "learning_rate": 0.00044485703563949987, + "loss": 0.1849, + "step": 156170 + }, + { + "epoch": 6.47, + "grad_norm": 0.484375, + "learning_rate": 0.0004448502410323036, + "loss": 0.1797, + "step": 156180 + }, + { + "epoch": 6.47, + "grad_norm": 1.1640625, + "learning_rate": 0.0004448434460584202, + "loss": 0.1603, + "step": 156190 + }, + { + "epoch": 6.47, + "grad_norm": 1.03125, + "learning_rate": 0.0004448366507178625, + "loss": 0.2049, + "step": 156200 + }, + { + "epoch": 6.47, + "grad_norm": 0.546875, + "learning_rate": 0.0004448298550106433, + "loss": 0.2077, + "step": 156210 + }, + { + "epoch": 6.47, + "grad_norm": 0.99609375, + "learning_rate": 0.0004448230589367753, + "loss": 0.2026, + "step": 156220 + }, + { + "epoch": 6.47, + "grad_norm": 0.4609375, + "learning_rate": 0.0004448162624962713, + "loss": 0.176, + "step": 156230 + }, + { + "epoch": 6.47, + "grad_norm": 1.3515625, + "learning_rate": 0.0004448094656891442, + "loss": 0.2185, + "step": 156240 + }, + { + "epoch": 6.47, + "grad_norm": 0.328125, + "learning_rate": 0.0004448026685154067, + "loss": 0.1621, + "step": 156250 + }, + { + "epoch": 6.47, + "grad_norm": 0.1689453125, + "learning_rate": 0.00044479587097507163, + "loss": 0.2041, + "step": 156260 + }, + { + "epoch": 6.47, + "grad_norm": 1.1171875, + "learning_rate": 0.00044478907306815175, + "loss": 0.2424, + "step": 156270 + }, + { + "epoch": 6.47, + "grad_norm": 0.337890625, + "learning_rate": 0.0004447822747946599, + "loss": 0.2387, + "step": 156280 + }, + { + "epoch": 6.47, + "grad_norm": 0.26171875, + "learning_rate": 0.00044477547615460886, + "loss": 0.1346, + "step": 156290 + }, + { + "epoch": 6.47, + "grad_norm": 1.03125, + "learning_rate": 0.00044476867714801134, + "loss": 0.2287, + "step": 156300 + }, + { + "epoch": 6.47, + "grad_norm": 0.54296875, + "learning_rate": 0.0004447618777748802, + "loss": 0.1983, + "step": 156310 + }, + { + "epoch": 6.47, + "grad_norm": 1.078125, + "learning_rate": 0.0004447550780352283, + "loss": 0.2115, + "step": 156320 + }, + { + "epoch": 6.48, + "grad_norm": 0.828125, + "learning_rate": 0.00044474827792906834, + "loss": 0.198, + "step": 156330 + }, + { + "epoch": 6.48, + "grad_norm": 0.439453125, + "learning_rate": 0.0004447414774564131, + "loss": 0.1725, + "step": 156340 + }, + { + "epoch": 6.48, + "grad_norm": 0.41015625, + "learning_rate": 0.00044473467661727557, + "loss": 0.2134, + "step": 156350 + }, + { + "epoch": 6.48, + "grad_norm": 0.94921875, + "learning_rate": 0.00044472787541166835, + "loss": 0.2244, + "step": 156360 + }, + { + "epoch": 6.48, + "grad_norm": 0.6171875, + "learning_rate": 0.00044472107383960427, + "loss": 0.2151, + "step": 156370 + }, + { + "epoch": 6.48, + "grad_norm": 0.640625, + "learning_rate": 0.0004447142719010961, + "loss": 0.2153, + "step": 156380 + }, + { + "epoch": 6.48, + "grad_norm": 0.8046875, + "learning_rate": 0.0004447074695961568, + "loss": 0.2183, + "step": 156390 + }, + { + "epoch": 6.48, + "grad_norm": 0.890625, + "learning_rate": 0.00044470066692479905, + "loss": 0.2174, + "step": 156400 + }, + { + "epoch": 6.48, + "grad_norm": 0.5078125, + "learning_rate": 0.00044469386388703566, + "loss": 0.2252, + "step": 156410 + }, + { + "epoch": 6.48, + "grad_norm": 0.64453125, + "learning_rate": 0.00044468706048287937, + "loss": 0.1599, + "step": 156420 + }, + { + "epoch": 6.48, + "grad_norm": 0.462890625, + "learning_rate": 0.00044468025671234315, + "loss": 0.1215, + "step": 156430 + }, + { + "epoch": 6.48, + "grad_norm": 1.0234375, + "learning_rate": 0.00044467345257543967, + "loss": 0.19, + "step": 156440 + }, + { + "epoch": 6.48, + "grad_norm": 0.61328125, + "learning_rate": 0.00044466664807218183, + "loss": 0.2391, + "step": 156450 + }, + { + "epoch": 6.48, + "grad_norm": 0.46484375, + "learning_rate": 0.0004446598432025823, + "loss": 0.2048, + "step": 156460 + }, + { + "epoch": 6.48, + "grad_norm": 0.3046875, + "learning_rate": 0.000444653037966654, + "loss": 0.2016, + "step": 156470 + }, + { + "epoch": 6.48, + "grad_norm": 0.73828125, + "learning_rate": 0.0004446462323644097, + "loss": 0.2237, + "step": 156480 + }, + { + "epoch": 6.48, + "grad_norm": 0.57421875, + "learning_rate": 0.0004446394263958622, + "loss": 0.2352, + "step": 156490 + }, + { + "epoch": 6.48, + "grad_norm": 0.359375, + "learning_rate": 0.0004446326200610243, + "loss": 0.2472, + "step": 156500 + }, + { + "epoch": 6.48, + "grad_norm": 1.1796875, + "learning_rate": 0.0004446258133599089, + "loss": 0.1921, + "step": 156510 + }, + { + "epoch": 6.48, + "grad_norm": 0.87890625, + "learning_rate": 0.0004446190062925287, + "loss": 0.1613, + "step": 156520 + }, + { + "epoch": 6.48, + "grad_norm": 1.90625, + "learning_rate": 0.00044461219885889657, + "loss": 0.1967, + "step": 156530 + }, + { + "epoch": 6.48, + "grad_norm": 0.48046875, + "learning_rate": 0.00044460539105902527, + "loss": 0.2233, + "step": 156540 + }, + { + "epoch": 6.48, + "grad_norm": 0.38671875, + "learning_rate": 0.00044459858289292765, + "loss": 0.1412, + "step": 156550 + }, + { + "epoch": 6.48, + "grad_norm": 0.47265625, + "learning_rate": 0.00044459177436061647, + "loss": 0.189, + "step": 156560 + }, + { + "epoch": 6.49, + "grad_norm": 0.4609375, + "learning_rate": 0.0004445849654621046, + "loss": 0.1668, + "step": 156570 + }, + { + "epoch": 6.49, + "grad_norm": 0.74609375, + "learning_rate": 0.00044457815619740494, + "loss": 0.1524, + "step": 156580 + }, + { + "epoch": 6.49, + "grad_norm": 0.57421875, + "learning_rate": 0.0004445713465665301, + "loss": 0.2226, + "step": 156590 + }, + { + "epoch": 6.49, + "grad_norm": 0.68359375, + "learning_rate": 0.00044456453656949304, + "loss": 0.198, + "step": 156600 + }, + { + "epoch": 6.49, + "grad_norm": 0.625, + "learning_rate": 0.00044455772620630645, + "loss": 0.1403, + "step": 156610 + }, + { + "epoch": 6.49, + "grad_norm": 1.28125, + "learning_rate": 0.00044455091547698333, + "loss": 0.2057, + "step": 156620 + }, + { + "epoch": 6.49, + "grad_norm": 0.83203125, + "learning_rate": 0.00044454410438153635, + "loss": 0.1755, + "step": 156630 + }, + { + "epoch": 6.49, + "grad_norm": 0.6484375, + "learning_rate": 0.00044453729291997837, + "loss": 0.1563, + "step": 156640 + }, + { + "epoch": 6.49, + "grad_norm": 1.234375, + "learning_rate": 0.00044453048109232223, + "loss": 0.1638, + "step": 156650 + }, + { + "epoch": 6.49, + "grad_norm": 0.40234375, + "learning_rate": 0.0004445236688985808, + "loss": 0.2125, + "step": 156660 + }, + { + "epoch": 6.49, + "grad_norm": 0.4296875, + "learning_rate": 0.0004445168563387667, + "loss": 0.2561, + "step": 156670 + }, + { + "epoch": 6.49, + "grad_norm": 0.0, + "learning_rate": 0.0004445100434128929, + "loss": 0.2083, + "step": 156680 + }, + { + "epoch": 6.49, + "grad_norm": 0.7890625, + "learning_rate": 0.00044450323012097225, + "loss": 0.1889, + "step": 156690 + }, + { + "epoch": 6.49, + "grad_norm": 0.7109375, + "learning_rate": 0.0004444964164630175, + "loss": 0.21, + "step": 156700 + }, + { + "epoch": 6.49, + "grad_norm": 0.921875, + "learning_rate": 0.0004444896024390416, + "loss": 0.2174, + "step": 156710 + }, + { + "epoch": 6.49, + "grad_norm": 0.6796875, + "learning_rate": 0.00044448278804905717, + "loss": 0.1929, + "step": 156720 + }, + { + "epoch": 6.49, + "grad_norm": 0.36328125, + "learning_rate": 0.0004444759732930771, + "loss": 0.1695, + "step": 156730 + }, + { + "epoch": 6.49, + "grad_norm": 0.85546875, + "learning_rate": 0.00044446915817111435, + "loss": 0.2394, + "step": 156740 + }, + { + "epoch": 6.49, + "grad_norm": 0.82421875, + "learning_rate": 0.00044446234268318165, + "loss": 0.1988, + "step": 156750 + }, + { + "epoch": 6.49, + "grad_norm": 0.69921875, + "learning_rate": 0.0004444555268292917, + "loss": 0.2264, + "step": 156760 + }, + { + "epoch": 6.49, + "grad_norm": 0.71875, + "learning_rate": 0.00044444871060945757, + "loss": 0.2291, + "step": 156770 + }, + { + "epoch": 6.49, + "grad_norm": 1.0, + "learning_rate": 0.0004444418940236919, + "loss": 0.1762, + "step": 156780 + }, + { + "epoch": 6.49, + "grad_norm": 0.578125, + "learning_rate": 0.0004444350770720076, + "loss": 0.2188, + "step": 156790 + }, + { + "epoch": 6.49, + "grad_norm": 0.4140625, + "learning_rate": 0.0004444282597544175, + "loss": 0.2216, + "step": 156800 + }, + { + "epoch": 6.5, + "grad_norm": 0.796875, + "learning_rate": 0.0004444214420709344, + "loss": 0.2085, + "step": 156810 + }, + { + "epoch": 6.5, + "grad_norm": 0.6640625, + "learning_rate": 0.0004444146240215711, + "loss": 0.1881, + "step": 156820 + }, + { + "epoch": 6.5, + "grad_norm": 0.40234375, + "learning_rate": 0.00044440780560634055, + "loss": 0.1581, + "step": 156830 + }, + { + "epoch": 6.5, + "grad_norm": 0.69140625, + "learning_rate": 0.0004444009868252554, + "loss": 0.2258, + "step": 156840 + }, + { + "epoch": 6.5, + "grad_norm": 0.79296875, + "learning_rate": 0.0004443941676783286, + "loss": 0.2169, + "step": 156850 + }, + { + "epoch": 6.5, + "grad_norm": 0.52734375, + "learning_rate": 0.00044438734816557306, + "loss": 0.2448, + "step": 156860 + }, + { + "epoch": 6.5, + "grad_norm": 0.490234375, + "learning_rate": 0.00044438052828700146, + "loss": 0.2144, + "step": 156870 + }, + { + "epoch": 6.5, + "grad_norm": 0.75390625, + "learning_rate": 0.0004443737080426268, + "loss": 0.1967, + "step": 156880 + }, + { + "epoch": 6.5, + "grad_norm": 0.71484375, + "learning_rate": 0.0004443668874324617, + "loss": 0.2305, + "step": 156890 + }, + { + "epoch": 6.5, + "grad_norm": 0.40625, + "learning_rate": 0.0004443600664565191, + "loss": 0.2164, + "step": 156900 + }, + { + "epoch": 6.5, + "grad_norm": 0.8671875, + "learning_rate": 0.0004443532451148119, + "loss": 0.1776, + "step": 156910 + }, + { + "epoch": 6.5, + "grad_norm": 2.609375, + "learning_rate": 0.0004443464234073529, + "loss": 0.2342, + "step": 156920 + }, + { + "epoch": 6.5, + "grad_norm": 0.78515625, + "learning_rate": 0.00044433960133415486, + "loss": 0.1994, + "step": 156930 + }, + { + "epoch": 6.5, + "grad_norm": 0.5390625, + "learning_rate": 0.0004443327788952307, + "loss": 0.2228, + "step": 156940 + }, + { + "epoch": 6.5, + "grad_norm": 1.2734375, + "learning_rate": 0.0004443259560905932, + "loss": 0.2157, + "step": 156950 + }, + { + "epoch": 6.5, + "grad_norm": 0.337890625, + "learning_rate": 0.0004443191329202553, + "loss": 0.1703, + "step": 156960 + }, + { + "epoch": 6.5, + "grad_norm": 0.66796875, + "learning_rate": 0.00044431230938422975, + "loss": 0.2235, + "step": 156970 + }, + { + "epoch": 6.5, + "grad_norm": 2.09375, + "learning_rate": 0.0004443054854825294, + "loss": 0.1879, + "step": 156980 + }, + { + "epoch": 6.5, + "grad_norm": 1.0234375, + "learning_rate": 0.00044429866121516707, + "loss": 0.2217, + "step": 156990 + }, + { + "epoch": 6.5, + "grad_norm": 0.6953125, + "learning_rate": 0.0004442918365821558, + "loss": 0.2284, + "step": 157000 + }, + { + "epoch": 6.5, + "grad_norm": 0.52734375, + "learning_rate": 0.00044428501158350807, + "loss": 0.1975, + "step": 157010 + }, + { + "epoch": 6.5, + "grad_norm": 0.408203125, + "learning_rate": 0.00044427818621923707, + "loss": 0.2262, + "step": 157020 + }, + { + "epoch": 6.5, + "grad_norm": 0.6875, + "learning_rate": 0.0004442713604893555, + "loss": 0.2118, + "step": 157030 + }, + { + "epoch": 6.5, + "grad_norm": 0.72265625, + "learning_rate": 0.00044426453439387614, + "loss": 0.1871, + "step": 157040 + }, + { + "epoch": 6.5, + "grad_norm": 0.640625, + "learning_rate": 0.00044425770793281197, + "loss": 0.1787, + "step": 157050 + }, + { + "epoch": 6.51, + "grad_norm": 0.34765625, + "learning_rate": 0.0004442508811061757, + "loss": 0.1613, + "step": 157060 + }, + { + "epoch": 6.51, + "grad_norm": 0.7265625, + "learning_rate": 0.00044424405391398027, + "loss": 0.2045, + "step": 157070 + }, + { + "epoch": 6.51, + "grad_norm": 0.78125, + "learning_rate": 0.00044423722635623854, + "loss": 0.1968, + "step": 157080 + }, + { + "epoch": 6.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0004442303984329633, + "loss": 0.2474, + "step": 157090 + }, + { + "epoch": 6.51, + "grad_norm": 0.64453125, + "learning_rate": 0.00044422357014416744, + "loss": 0.2557, + "step": 157100 + }, + { + "epoch": 6.51, + "grad_norm": 0.609375, + "learning_rate": 0.0004442167414898638, + "loss": 0.1799, + "step": 157110 + }, + { + "epoch": 6.51, + "grad_norm": 0.76171875, + "learning_rate": 0.0004442099124700652, + "loss": 0.1982, + "step": 157120 + }, + { + "epoch": 6.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0004442030830847845, + "loss": 0.2036, + "step": 157130 + }, + { + "epoch": 6.51, + "grad_norm": 1.0390625, + "learning_rate": 0.00044419625333403463, + "loss": 0.2101, + "step": 157140 + }, + { + "epoch": 6.51, + "grad_norm": 0.458984375, + "learning_rate": 0.0004441894232178284, + "loss": 0.2092, + "step": 157150 + }, + { + "epoch": 6.51, + "grad_norm": 0.63671875, + "learning_rate": 0.00044418259273617855, + "loss": 0.1534, + "step": 157160 + }, + { + "epoch": 6.51, + "grad_norm": 0.828125, + "learning_rate": 0.00044417576188909814, + "loss": 0.1815, + "step": 157170 + }, + { + "epoch": 6.51, + "grad_norm": 0.60546875, + "learning_rate": 0.0004441689306765998, + "loss": 0.22, + "step": 157180 + }, + { + "epoch": 6.51, + "grad_norm": 0.5390625, + "learning_rate": 0.0004441620990986966, + "loss": 0.1978, + "step": 157190 + }, + { + "epoch": 6.51, + "grad_norm": 0.5859375, + "learning_rate": 0.0004441552671554012, + "loss": 0.2653, + "step": 157200 + }, + { + "epoch": 6.51, + "grad_norm": 0.828125, + "learning_rate": 0.0004441484348467266, + "loss": 0.1877, + "step": 157210 + }, + { + "epoch": 6.51, + "grad_norm": 1.203125, + "learning_rate": 0.0004441416021726857, + "loss": 0.1963, + "step": 157220 + }, + { + "epoch": 6.51, + "grad_norm": 0.40234375, + "learning_rate": 0.0004441347691332912, + "loss": 0.219, + "step": 157230 + }, + { + "epoch": 6.51, + "grad_norm": 0.9609375, + "learning_rate": 0.00044412793572855605, + "loss": 0.2199, + "step": 157240 + }, + { + "epoch": 6.51, + "grad_norm": 0.609375, + "learning_rate": 0.000444121101958493, + "loss": 0.2087, + "step": 157250 + }, + { + "epoch": 6.51, + "grad_norm": 0.95703125, + "learning_rate": 0.00044411426782311504, + "loss": 0.1934, + "step": 157260 + }, + { + "epoch": 6.51, + "grad_norm": 0.455078125, + "learning_rate": 0.000444107433322435, + "loss": 0.1492, + "step": 157270 + }, + { + "epoch": 6.51, + "grad_norm": 0.470703125, + "learning_rate": 0.00044410059845646576, + "loss": 0.2037, + "step": 157280 + }, + { + "epoch": 6.51, + "grad_norm": 0.9453125, + "learning_rate": 0.00044409376322522017, + "loss": 0.2411, + "step": 157290 + }, + { + "epoch": 6.52, + "grad_norm": 0.41796875, + "learning_rate": 0.00044408692762871104, + "loss": 0.1795, + "step": 157300 + }, + { + "epoch": 6.52, + "grad_norm": 1.8984375, + "learning_rate": 0.0004440800916669513, + "loss": 0.2023, + "step": 157310 + }, + { + "epoch": 6.52, + "grad_norm": 0.546875, + "learning_rate": 0.0004440732553399537, + "loss": 0.2007, + "step": 157320 + }, + { + "epoch": 6.52, + "grad_norm": 0.95703125, + "learning_rate": 0.0004440664186477313, + "loss": 0.1835, + "step": 157330 + }, + { + "epoch": 6.52, + "grad_norm": 0.53125, + "learning_rate": 0.0004440595815902968, + "loss": 0.2261, + "step": 157340 + }, + { + "epoch": 6.52, + "grad_norm": 0.6484375, + "learning_rate": 0.00044405274416766316, + "loss": 0.1728, + "step": 157350 + }, + { + "epoch": 6.52, + "grad_norm": 0.76171875, + "learning_rate": 0.00044404590637984315, + "loss": 0.2045, + "step": 157360 + }, + { + "epoch": 6.52, + "grad_norm": 0.640625, + "learning_rate": 0.00044403906822684976, + "loss": 0.1568, + "step": 157370 + }, + { + "epoch": 6.52, + "grad_norm": 0.7734375, + "learning_rate": 0.0004440322297086957, + "loss": 0.1599, + "step": 157380 + }, + { + "epoch": 6.52, + "grad_norm": 1.3359375, + "learning_rate": 0.00044402539082539404, + "loss": 0.224, + "step": 157390 + }, + { + "epoch": 6.52, + "grad_norm": 0.388671875, + "learning_rate": 0.0004440185515769576, + "loss": 0.212, + "step": 157400 + }, + { + "epoch": 6.52, + "grad_norm": 0.734375, + "learning_rate": 0.00044401171196339906, + "loss": 0.2136, + "step": 157410 + }, + { + "epoch": 6.52, + "grad_norm": 0.62890625, + "learning_rate": 0.0004440048719847315, + "loss": 0.2316, + "step": 157420 + }, + { + "epoch": 6.52, + "grad_norm": 0.62109375, + "learning_rate": 0.00044399803164096776, + "loss": 0.1642, + "step": 157430 + }, + { + "epoch": 6.52, + "grad_norm": 1.0234375, + "learning_rate": 0.0004439911909321206, + "loss": 0.199, + "step": 157440 + }, + { + "epoch": 6.52, + "grad_norm": 0.52734375, + "learning_rate": 0.00044398434985820303, + "loss": 0.2067, + "step": 157450 + }, + { + "epoch": 6.52, + "grad_norm": 0.365234375, + "learning_rate": 0.0004439775084192278, + "loss": 0.1551, + "step": 157460 + }, + { + "epoch": 6.52, + "grad_norm": 1.59375, + "learning_rate": 0.0004439706666152079, + "loss": 0.2614, + "step": 157470 + }, + { + "epoch": 6.52, + "grad_norm": 1.0703125, + "learning_rate": 0.00044396382444615606, + "loss": 0.2009, + "step": 157480 + }, + { + "epoch": 6.52, + "grad_norm": 0.298828125, + "learning_rate": 0.00044395698191208535, + "loss": 0.1795, + "step": 157490 + }, + { + "epoch": 6.52, + "grad_norm": 0.80859375, + "learning_rate": 0.0004439501390130085, + "loss": 0.2199, + "step": 157500 + }, + { + "epoch": 6.52, + "grad_norm": 0.86328125, + "learning_rate": 0.00044394329574893844, + "loss": 0.2321, + "step": 157510 + }, + { + "epoch": 6.52, + "grad_norm": 0.421875, + "learning_rate": 0.000443936452119888, + "loss": 0.1796, + "step": 157520 + }, + { + "epoch": 6.52, + "grad_norm": 0.431640625, + "learning_rate": 0.0004439296081258701, + "loss": 0.2178, + "step": 157530 + }, + { + "epoch": 6.53, + "grad_norm": 0.8828125, + "learning_rate": 0.00044392276376689773, + "loss": 0.1966, + "step": 157540 + }, + { + "epoch": 6.53, + "grad_norm": 1.1015625, + "learning_rate": 0.00044391591904298356, + "loss": 0.2164, + "step": 157550 + }, + { + "epoch": 6.53, + "grad_norm": 0.578125, + "learning_rate": 0.0004439090739541406, + "loss": 0.174, + "step": 157560 + }, + { + "epoch": 6.53, + "grad_norm": 1.203125, + "learning_rate": 0.00044390222850038174, + "loss": 0.2126, + "step": 157570 + }, + { + "epoch": 6.53, + "grad_norm": 0.87890625, + "learning_rate": 0.0004438953826817198, + "loss": 0.1856, + "step": 157580 + }, + { + "epoch": 6.53, + "grad_norm": 0.5390625, + "learning_rate": 0.0004438885364981677, + "loss": 0.1598, + "step": 157590 + }, + { + "epoch": 6.53, + "grad_norm": 0.5, + "learning_rate": 0.00044388168994973827, + "loss": 0.2389, + "step": 157600 + }, + { + "epoch": 6.53, + "grad_norm": 1.0859375, + "learning_rate": 0.00044387484303644444, + "loss": 0.2364, + "step": 157610 + }, + { + "epoch": 6.53, + "grad_norm": 0.7421875, + "learning_rate": 0.0004438679957582992, + "loss": 0.2406, + "step": 157620 + }, + { + "epoch": 6.53, + "grad_norm": 0.458984375, + "learning_rate": 0.0004438611481153152, + "loss": 0.1723, + "step": 157630 + }, + { + "epoch": 6.53, + "grad_norm": 1.4375, + "learning_rate": 0.0004438543001075055, + "loss": 0.1816, + "step": 157640 + }, + { + "epoch": 6.53, + "grad_norm": 1.609375, + "learning_rate": 0.00044384745173488293, + "loss": 0.1884, + "step": 157650 + }, + { + "epoch": 6.53, + "grad_norm": 1.3515625, + "learning_rate": 0.0004438406029974604, + "loss": 0.2247, + "step": 157660 + }, + { + "epoch": 6.53, + "grad_norm": 0.8203125, + "learning_rate": 0.0004438337538952508, + "loss": 0.2003, + "step": 157670 + }, + { + "epoch": 6.53, + "grad_norm": 1.1328125, + "learning_rate": 0.00044382690442826695, + "loss": 0.1858, + "step": 157680 + }, + { + "epoch": 6.53, + "grad_norm": 0.3828125, + "learning_rate": 0.00044382005459652185, + "loss": 0.2003, + "step": 157690 + }, + { + "epoch": 6.53, + "grad_norm": 0.578125, + "learning_rate": 0.0004438132044000283, + "loss": 0.2334, + "step": 157700 + }, + { + "epoch": 6.53, + "grad_norm": 1.3125, + "learning_rate": 0.0004438063538387993, + "loss": 0.2126, + "step": 157710 + }, + { + "epoch": 6.53, + "grad_norm": 0.921875, + "learning_rate": 0.00044379950291284763, + "loss": 0.1639, + "step": 157720 + }, + { + "epoch": 6.53, + "grad_norm": 0.21484375, + "learning_rate": 0.0004437926516221862, + "loss": 0.163, + "step": 157730 + }, + { + "epoch": 6.53, + "grad_norm": 0.89453125, + "learning_rate": 0.00044378579996682795, + "loss": 0.2041, + "step": 157740 + }, + { + "epoch": 6.53, + "grad_norm": 0.5703125, + "learning_rate": 0.00044377894794678577, + "loss": 0.2244, + "step": 157750 + }, + { + "epoch": 6.53, + "grad_norm": 0.73046875, + "learning_rate": 0.0004437720955620725, + "loss": 0.1841, + "step": 157760 + }, + { + "epoch": 6.53, + "grad_norm": 0.6640625, + "learning_rate": 0.0004437652428127011, + "loss": 0.1732, + "step": 157770 + }, + { + "epoch": 6.54, + "grad_norm": 0.66796875, + "learning_rate": 0.00044375838969868444, + "loss": 0.2053, + "step": 157780 + }, + { + "epoch": 6.54, + "grad_norm": 0.38671875, + "learning_rate": 0.0004437515362200354, + "loss": 0.1877, + "step": 157790 + }, + { + "epoch": 6.54, + "grad_norm": 1.515625, + "learning_rate": 0.0004437446823767669, + "loss": 0.192, + "step": 157800 + }, + { + "epoch": 6.54, + "grad_norm": 0.94921875, + "learning_rate": 0.0004437378281688918, + "loss": 0.2113, + "step": 157810 + }, + { + "epoch": 6.54, + "grad_norm": 0.7265625, + "learning_rate": 0.0004437309735964231, + "loss": 0.1711, + "step": 157820 + }, + { + "epoch": 6.54, + "grad_norm": 0.361328125, + "learning_rate": 0.00044372411865937356, + "loss": 0.1849, + "step": 157830 + }, + { + "epoch": 6.54, + "grad_norm": 0.95703125, + "learning_rate": 0.00044371726335775617, + "loss": 0.2292, + "step": 157840 + }, + { + "epoch": 6.54, + "grad_norm": 0.98046875, + "learning_rate": 0.00044371040769158377, + "loss": 0.2446, + "step": 157850 + }, + { + "epoch": 6.54, + "grad_norm": 0.515625, + "learning_rate": 0.0004437035516608694, + "loss": 0.1641, + "step": 157860 + }, + { + "epoch": 6.54, + "grad_norm": 1.0625, + "learning_rate": 0.00044369669526562577, + "loss": 0.2831, + "step": 157870 + }, + { + "epoch": 6.54, + "grad_norm": 0.7890625, + "learning_rate": 0.00044368983850586596, + "loss": 0.2173, + "step": 157880 + }, + { + "epoch": 6.54, + "grad_norm": 1.0078125, + "learning_rate": 0.0004436829813816027, + "loss": 0.2006, + "step": 157890 + }, + { + "epoch": 6.54, + "grad_norm": 0.53515625, + "learning_rate": 0.00044367612389284905, + "loss": 0.1874, + "step": 157900 + }, + { + "epoch": 6.54, + "grad_norm": 0.5234375, + "learning_rate": 0.00044366926603961785, + "loss": 0.2119, + "step": 157910 + }, + { + "epoch": 6.54, + "grad_norm": 0.95703125, + "learning_rate": 0.00044366240782192195, + "loss": 0.1655, + "step": 157920 + }, + { + "epoch": 6.54, + "grad_norm": 0.6484375, + "learning_rate": 0.00044365554923977437, + "loss": 0.1734, + "step": 157930 + }, + { + "epoch": 6.54, + "grad_norm": 0.52734375, + "learning_rate": 0.0004436486902931879, + "loss": 0.2267, + "step": 157940 + }, + { + "epoch": 6.54, + "grad_norm": 0.9765625, + "learning_rate": 0.0004436418309821756, + "loss": 0.1765, + "step": 157950 + }, + { + "epoch": 6.54, + "grad_norm": 0.4453125, + "learning_rate": 0.00044363497130675024, + "loss": 0.2023, + "step": 157960 + }, + { + "epoch": 6.54, + "grad_norm": 0.6875, + "learning_rate": 0.0004436281112669247, + "loss": 0.1909, + "step": 157970 + }, + { + "epoch": 6.54, + "grad_norm": 1.4453125, + "learning_rate": 0.00044362125086271206, + "loss": 0.2214, + "step": 157980 + }, + { + "epoch": 6.54, + "grad_norm": 0.54296875, + "learning_rate": 0.0004436143900941251, + "loss": 0.1884, + "step": 157990 + }, + { + "epoch": 6.54, + "grad_norm": 0.7890625, + "learning_rate": 0.0004436075289611768, + "loss": 0.2153, + "step": 158000 + }, + { + "epoch": 6.54, + "grad_norm": 0.357421875, + "learning_rate": 0.00044360066746388, + "loss": 0.1818, + "step": 158010 + }, + { + "epoch": 6.55, + "grad_norm": 0.41796875, + "learning_rate": 0.00044359380560224766, + "loss": 0.2346, + "step": 158020 + }, + { + "epoch": 6.55, + "grad_norm": 0.67578125, + "learning_rate": 0.0004435869433762927, + "loss": 0.1714, + "step": 158030 + }, + { + "epoch": 6.55, + "grad_norm": 1.8828125, + "learning_rate": 0.00044358008078602794, + "loss": 0.233, + "step": 158040 + }, + { + "epoch": 6.55, + "grad_norm": 0.61328125, + "learning_rate": 0.00044357321783146646, + "loss": 0.2037, + "step": 158050 + }, + { + "epoch": 6.55, + "grad_norm": 0.59765625, + "learning_rate": 0.0004435663545126211, + "loss": 0.1593, + "step": 158060 + }, + { + "epoch": 6.55, + "grad_norm": 0.5, + "learning_rate": 0.00044355949082950475, + "loss": 0.2248, + "step": 158070 + }, + { + "epoch": 6.55, + "grad_norm": 0.419921875, + "learning_rate": 0.00044355262678213026, + "loss": 0.2842, + "step": 158080 + }, + { + "epoch": 6.55, + "grad_norm": 0.80078125, + "learning_rate": 0.0004435457623705107, + "loss": 0.2216, + "step": 158090 + }, + { + "epoch": 6.55, + "grad_norm": 0.375, + "learning_rate": 0.0004435388975946589, + "loss": 0.1752, + "step": 158100 + }, + { + "epoch": 6.55, + "grad_norm": 0.703125, + "learning_rate": 0.00044353203245458785, + "loss": 0.2228, + "step": 158110 + }, + { + "epoch": 6.55, + "grad_norm": 0.71875, + "learning_rate": 0.0004435251669503103, + "loss": 0.1865, + "step": 158120 + }, + { + "epoch": 6.55, + "grad_norm": 0.6796875, + "learning_rate": 0.00044351830108183937, + "loss": 0.2248, + "step": 158130 + }, + { + "epoch": 6.55, + "grad_norm": 0.384765625, + "learning_rate": 0.00044351143484918786, + "loss": 0.1893, + "step": 158140 + }, + { + "epoch": 6.55, + "grad_norm": 0.625, + "learning_rate": 0.0004435045682523687, + "loss": 0.2102, + "step": 158150 + }, + { + "epoch": 6.55, + "grad_norm": 0.578125, + "learning_rate": 0.00044349770129139496, + "loss": 0.1842, + "step": 158160 + }, + { + "epoch": 6.55, + "grad_norm": 0.76171875, + "learning_rate": 0.00044349083396627933, + "loss": 0.178, + "step": 158170 + }, + { + "epoch": 6.55, + "grad_norm": 0.57421875, + "learning_rate": 0.0004434839662770349, + "loss": 0.2051, + "step": 158180 + }, + { + "epoch": 6.55, + "grad_norm": 1.3046875, + "learning_rate": 0.00044347709822367454, + "loss": 0.1826, + "step": 158190 + }, + { + "epoch": 6.55, + "grad_norm": 1.4296875, + "learning_rate": 0.0004434702298062111, + "loss": 0.1929, + "step": 158200 + }, + { + "epoch": 6.55, + "grad_norm": 0.45703125, + "learning_rate": 0.00044346336102465766, + "loss": 0.2315, + "step": 158210 + }, + { + "epoch": 6.55, + "grad_norm": 0.57421875, + "learning_rate": 0.00044345649187902704, + "loss": 0.1781, + "step": 158220 + }, + { + "epoch": 6.55, + "grad_norm": 1.328125, + "learning_rate": 0.0004434496223693322, + "loss": 0.2006, + "step": 158230 + }, + { + "epoch": 6.55, + "grad_norm": 0.423828125, + "learning_rate": 0.000443442752495586, + "loss": 0.1847, + "step": 158240 + }, + { + "epoch": 6.55, + "grad_norm": 0.8046875, + "learning_rate": 0.00044343588225780154, + "loss": 0.1776, + "step": 158250 + }, + { + "epoch": 6.56, + "grad_norm": 0.6171875, + "learning_rate": 0.0004434290116559916, + "loss": 0.2465, + "step": 158260 + }, + { + "epoch": 6.56, + "grad_norm": 0.7265625, + "learning_rate": 0.00044342214069016905, + "loss": 0.1992, + "step": 158270 + }, + { + "epoch": 6.56, + "grad_norm": 1.4375, + "learning_rate": 0.000443415269360347, + "loss": 0.186, + "step": 158280 + }, + { + "epoch": 6.56, + "grad_norm": 0.78515625, + "learning_rate": 0.0004434083976665383, + "loss": 0.1997, + "step": 158290 + }, + { + "epoch": 6.56, + "grad_norm": 0.625, + "learning_rate": 0.0004434015256087559, + "loss": 0.237, + "step": 158300 + }, + { + "epoch": 6.56, + "grad_norm": 0.6875, + "learning_rate": 0.0004433946531870128, + "loss": 0.2453, + "step": 158310 + }, + { + "epoch": 6.56, + "grad_norm": 0.142578125, + "learning_rate": 0.00044338778040132164, + "loss": 0.2228, + "step": 158320 + }, + { + "epoch": 6.56, + "grad_norm": 1.8046875, + "learning_rate": 0.00044338090725169565, + "loss": 0.2435, + "step": 158330 + }, + { + "epoch": 6.56, + "grad_norm": 1.0625, + "learning_rate": 0.00044337403373814776, + "loss": 0.2009, + "step": 158340 + }, + { + "epoch": 6.56, + "grad_norm": 0.84375, + "learning_rate": 0.0004433671598606908, + "loss": 0.1661, + "step": 158350 + }, + { + "epoch": 6.56, + "grad_norm": 1.015625, + "learning_rate": 0.0004433602856193376, + "loss": 0.2079, + "step": 158360 + }, + { + "epoch": 6.56, + "grad_norm": 0.69140625, + "learning_rate": 0.0004433534110141013, + "loss": 0.2127, + "step": 158370 + }, + { + "epoch": 6.56, + "grad_norm": 0.271484375, + "learning_rate": 0.00044334653604499475, + "loss": 0.2405, + "step": 158380 + }, + { + "epoch": 6.56, + "grad_norm": 0.6640625, + "learning_rate": 0.000443339660712031, + "loss": 0.2045, + "step": 158390 + }, + { + "epoch": 6.56, + "grad_norm": 0.6640625, + "learning_rate": 0.0004433327850152228, + "loss": 0.2183, + "step": 158400 + }, + { + "epoch": 6.56, + "grad_norm": 0.55859375, + "learning_rate": 0.00044332590895458315, + "loss": 0.1842, + "step": 158410 + }, + { + "epoch": 6.56, + "grad_norm": 0.70703125, + "learning_rate": 0.000443319032530125, + "loss": 0.2473, + "step": 158420 + }, + { + "epoch": 6.56, + "grad_norm": 0.49609375, + "learning_rate": 0.0004433121557418614, + "loss": 0.1922, + "step": 158430 + }, + { + "epoch": 6.56, + "grad_norm": 0.5, + "learning_rate": 0.00044330527858980515, + "loss": 0.2105, + "step": 158440 + }, + { + "epoch": 6.56, + "grad_norm": 0.72265625, + "learning_rate": 0.00044329840107396924, + "loss": 0.1869, + "step": 158450 + }, + { + "epoch": 6.56, + "grad_norm": 0.490234375, + "learning_rate": 0.0004432915231943666, + "loss": 0.2301, + "step": 158460 + }, + { + "epoch": 6.56, + "grad_norm": 1.5, + "learning_rate": 0.0004432846449510102, + "loss": 0.2392, + "step": 158470 + }, + { + "epoch": 6.56, + "grad_norm": 0.51171875, + "learning_rate": 0.000443277766343913, + "loss": 0.2393, + "step": 158480 + }, + { + "epoch": 6.56, + "grad_norm": 0.33203125, + "learning_rate": 0.00044327088737308783, + "loss": 0.1861, + "step": 158490 + }, + { + "epoch": 6.57, + "grad_norm": 0.55859375, + "learning_rate": 0.0004432640080385478, + "loss": 0.1878, + "step": 158500 + }, + { + "epoch": 6.57, + "grad_norm": 0.357421875, + "learning_rate": 0.0004432571283403058, + "loss": 0.2011, + "step": 158510 + }, + { + "epoch": 6.57, + "grad_norm": 0.5390625, + "learning_rate": 0.00044325024827837465, + "loss": 0.2189, + "step": 158520 + }, + { + "epoch": 6.57, + "grad_norm": 0.58984375, + "learning_rate": 0.0004432433678527675, + "loss": 0.2287, + "step": 158530 + }, + { + "epoch": 6.57, + "grad_norm": 0.21875, + "learning_rate": 0.0004432364870634972, + "loss": 0.1968, + "step": 158540 + }, + { + "epoch": 6.57, + "grad_norm": 1.203125, + "learning_rate": 0.00044322960591057666, + "loss": 0.1985, + "step": 158550 + }, + { + "epoch": 6.57, + "grad_norm": 0.5703125, + "learning_rate": 0.00044322272439401884, + "loss": 0.21, + "step": 158560 + }, + { + "epoch": 6.57, + "grad_norm": 0.390625, + "learning_rate": 0.00044321584251383675, + "loss": 0.1949, + "step": 158570 + }, + { + "epoch": 6.57, + "grad_norm": 0.359375, + "learning_rate": 0.00044320896027004333, + "loss": 0.1532, + "step": 158580 + }, + { + "epoch": 6.57, + "grad_norm": 0.73046875, + "learning_rate": 0.0004432020776626515, + "loss": 0.1903, + "step": 158590 + }, + { + "epoch": 6.57, + "grad_norm": 0.3515625, + "learning_rate": 0.0004431951946916742, + "loss": 0.2115, + "step": 158600 + }, + { + "epoch": 6.57, + "grad_norm": 0.921875, + "learning_rate": 0.00044318831135712445, + "loss": 0.2309, + "step": 158610 + }, + { + "epoch": 6.57, + "grad_norm": 0.8125, + "learning_rate": 0.0004431814276590151, + "loss": 0.2113, + "step": 158620 + }, + { + "epoch": 6.57, + "grad_norm": 0.1689453125, + "learning_rate": 0.0004431745435973592, + "loss": 0.1924, + "step": 158630 + }, + { + "epoch": 6.57, + "grad_norm": 0.609375, + "learning_rate": 0.00044316765917216966, + "loss": 0.2367, + "step": 158640 + }, + { + "epoch": 6.57, + "grad_norm": 0.94921875, + "learning_rate": 0.0004431607743834595, + "loss": 0.1821, + "step": 158650 + }, + { + "epoch": 6.57, + "grad_norm": 0.25390625, + "learning_rate": 0.00044315388923124154, + "loss": 0.1519, + "step": 158660 + }, + { + "epoch": 6.57, + "grad_norm": 0.68359375, + "learning_rate": 0.0004431470037155288, + "loss": 0.247, + "step": 158670 + }, + { + "epoch": 6.57, + "grad_norm": 0.2294921875, + "learning_rate": 0.00044314011783633435, + "loss": 0.1739, + "step": 158680 + }, + { + "epoch": 6.57, + "grad_norm": 0.765625, + "learning_rate": 0.000443133231593671, + "loss": 0.2153, + "step": 158690 + }, + { + "epoch": 6.57, + "grad_norm": 0.546875, + "learning_rate": 0.0004431263449875518, + "loss": 0.2264, + "step": 158700 + }, + { + "epoch": 6.57, + "grad_norm": 0.8046875, + "learning_rate": 0.0004431194580179897, + "loss": 0.1966, + "step": 158710 + }, + { + "epoch": 6.57, + "grad_norm": 0.40625, + "learning_rate": 0.00044311257068499753, + "loss": 0.1882, + "step": 158720 + }, + { + "epoch": 6.57, + "grad_norm": 0.546875, + "learning_rate": 0.00044310568298858844, + "loss": 0.2079, + "step": 158730 + }, + { + "epoch": 6.57, + "grad_norm": 0.99609375, + "learning_rate": 0.0004430987949287753, + "loss": 0.1938, + "step": 158740 + }, + { + "epoch": 6.58, + "grad_norm": 0.69921875, + "learning_rate": 0.0004430919065055711, + "loss": 0.2137, + "step": 158750 + }, + { + "epoch": 6.58, + "grad_norm": 1.6484375, + "learning_rate": 0.00044308501771898873, + "loss": 0.1959, + "step": 158760 + }, + { + "epoch": 6.58, + "grad_norm": 0.482421875, + "learning_rate": 0.0004430781285690412, + "loss": 0.14, + "step": 158770 + }, + { + "epoch": 6.58, + "grad_norm": 0.98046875, + "learning_rate": 0.0004430712390557415, + "loss": 0.1904, + "step": 158780 + }, + { + "epoch": 6.58, + "grad_norm": 0.8828125, + "learning_rate": 0.00044306434917910264, + "loss": 0.1708, + "step": 158790 + }, + { + "epoch": 6.58, + "grad_norm": 0.55078125, + "learning_rate": 0.00044305745893913746, + "loss": 0.2292, + "step": 158800 + }, + { + "epoch": 6.58, + "grad_norm": 0.75390625, + "learning_rate": 0.000443050568335859, + "loss": 0.2356, + "step": 158810 + }, + { + "epoch": 6.58, + "grad_norm": 0.90625, + "learning_rate": 0.00044304367736928025, + "loss": 0.2138, + "step": 158820 + }, + { + "epoch": 6.58, + "grad_norm": 0.58984375, + "learning_rate": 0.0004430367860394141, + "loss": 0.1758, + "step": 158830 + }, + { + "epoch": 6.58, + "grad_norm": 0.67578125, + "learning_rate": 0.0004430298943462736, + "loss": 0.2498, + "step": 158840 + }, + { + "epoch": 6.58, + "grad_norm": 0.96875, + "learning_rate": 0.0004430230022898717, + "loss": 0.2254, + "step": 158850 + }, + { + "epoch": 6.58, + "grad_norm": 1.0546875, + "learning_rate": 0.0004430161098702213, + "loss": 0.1787, + "step": 158860 + }, + { + "epoch": 6.58, + "grad_norm": 0.98828125, + "learning_rate": 0.0004430092170873354, + "loss": 0.1687, + "step": 158870 + }, + { + "epoch": 6.58, + "grad_norm": 0.6171875, + "learning_rate": 0.0004430023239412271, + "loss": 0.1387, + "step": 158880 + }, + { + "epoch": 6.58, + "grad_norm": 0.4921875, + "learning_rate": 0.0004429954304319092, + "loss": 0.2586, + "step": 158890 + }, + { + "epoch": 6.58, + "grad_norm": 0.578125, + "learning_rate": 0.00044298853655939474, + "loss": 0.1841, + "step": 158900 + }, + { + "epoch": 6.58, + "grad_norm": 0.376953125, + "learning_rate": 0.00044298164232369673, + "loss": 0.1526, + "step": 158910 + }, + { + "epoch": 6.58, + "grad_norm": 0.7890625, + "learning_rate": 0.0004429747477248281, + "loss": 0.2, + "step": 158920 + }, + { + "epoch": 6.58, + "grad_norm": 0.466796875, + "learning_rate": 0.00044296785276280184, + "loss": 0.2224, + "step": 158930 + }, + { + "epoch": 6.58, + "grad_norm": 0.82421875, + "learning_rate": 0.00044296095743763086, + "loss": 0.1761, + "step": 158940 + }, + { + "epoch": 6.58, + "grad_norm": 1.0, + "learning_rate": 0.00044295406174932827, + "loss": 0.2099, + "step": 158950 + }, + { + "epoch": 6.58, + "grad_norm": 0.466796875, + "learning_rate": 0.00044294716569790696, + "loss": 0.2307, + "step": 158960 + }, + { + "epoch": 6.58, + "grad_norm": 0.6015625, + "learning_rate": 0.0004429402692833799, + "loss": 0.1771, + "step": 158970 + }, + { + "epoch": 6.58, + "grad_norm": 0.44921875, + "learning_rate": 0.00044293337250576, + "loss": 0.2365, + "step": 158980 + }, + { + "epoch": 6.59, + "grad_norm": 0.69921875, + "learning_rate": 0.0004429264753650605, + "loss": 0.2277, + "step": 158990 + }, + { + "epoch": 6.59, + "grad_norm": 0.5703125, + "learning_rate": 0.0004429195778612941, + "loss": 0.217, + "step": 159000 + }, + { + "epoch": 6.59, + "grad_norm": 0.53125, + "learning_rate": 0.00044291267999447393, + "loss": 0.1751, + "step": 159010 + }, + { + "epoch": 6.59, + "grad_norm": 0.87109375, + "learning_rate": 0.00044290578176461295, + "loss": 0.1936, + "step": 159020 + }, + { + "epoch": 6.59, + "grad_norm": 0.57421875, + "learning_rate": 0.0004428988831717241, + "loss": 0.2197, + "step": 159030 + }, + { + "epoch": 6.59, + "grad_norm": 0.62109375, + "learning_rate": 0.00044289198421582033, + "loss": 0.2248, + "step": 159040 + }, + { + "epoch": 6.59, + "grad_norm": 1.59375, + "learning_rate": 0.0004428850848969147, + "loss": 0.16, + "step": 159050 + }, + { + "epoch": 6.59, + "grad_norm": 0.51953125, + "learning_rate": 0.0004428781852150202, + "loss": 0.2043, + "step": 159060 + }, + { + "epoch": 6.59, + "grad_norm": 0.5390625, + "learning_rate": 0.0004428712851701498, + "loss": 0.2055, + "step": 159070 + }, + { + "epoch": 6.59, + "grad_norm": 0.76953125, + "learning_rate": 0.00044286438476231636, + "loss": 0.203, + "step": 159080 + }, + { + "epoch": 6.59, + "grad_norm": 0.50390625, + "learning_rate": 0.00044285748399153316, + "loss": 0.1524, + "step": 159090 + }, + { + "epoch": 6.59, + "grad_norm": 0.71484375, + "learning_rate": 0.00044285058285781285, + "loss": 0.1924, + "step": 159100 + }, + { + "epoch": 6.59, + "grad_norm": 0.80859375, + "learning_rate": 0.0004428436813611686, + "loss": 0.2365, + "step": 159110 + }, + { + "epoch": 6.59, + "grad_norm": 1.0625, + "learning_rate": 0.0004428367795016134, + "loss": 0.2038, + "step": 159120 + }, + { + "epoch": 6.59, + "grad_norm": 0.9453125, + "learning_rate": 0.0004428298772791602, + "loss": 0.2199, + "step": 159130 + }, + { + "epoch": 6.59, + "grad_norm": 0.56640625, + "learning_rate": 0.000442822974693822, + "loss": 0.1903, + "step": 159140 + }, + { + "epoch": 6.59, + "grad_norm": 0.6875, + "learning_rate": 0.0004428160717456118, + "loss": 0.1761, + "step": 159150 + }, + { + "epoch": 6.59, + "grad_norm": 0.390625, + "learning_rate": 0.00044280916843454247, + "loss": 0.2051, + "step": 159160 + }, + { + "epoch": 6.59, + "grad_norm": 0.484375, + "learning_rate": 0.00044280226476062717, + "loss": 0.2056, + "step": 159170 + }, + { + "epoch": 6.59, + "grad_norm": 0.51953125, + "learning_rate": 0.0004427953607238789, + "loss": 0.2301, + "step": 159180 + }, + { + "epoch": 6.59, + "grad_norm": 0.419921875, + "learning_rate": 0.00044278845632431054, + "loss": 0.1816, + "step": 159190 + }, + { + "epoch": 6.59, + "grad_norm": 0.8125, + "learning_rate": 0.00044278155156193507, + "loss": 0.211, + "step": 159200 + }, + { + "epoch": 6.59, + "grad_norm": 0.51171875, + "learning_rate": 0.00044277464643676555, + "loss": 0.2561, + "step": 159210 + }, + { + "epoch": 6.59, + "grad_norm": 0.8671875, + "learning_rate": 0.000442767740948815, + "loss": 0.2033, + "step": 159220 + }, + { + "epoch": 6.6, + "grad_norm": 0.5078125, + "learning_rate": 0.0004427608350980964, + "loss": 0.2046, + "step": 159230 + }, + { + "epoch": 6.6, + "grad_norm": 0.486328125, + "learning_rate": 0.00044275392888462264, + "loss": 0.2391, + "step": 159240 + }, + { + "epoch": 6.6, + "grad_norm": 0.28515625, + "learning_rate": 0.00044274702230840687, + "loss": 0.1882, + "step": 159250 + }, + { + "epoch": 6.6, + "grad_norm": 0.546875, + "learning_rate": 0.00044274011536946204, + "loss": 0.2317, + "step": 159260 + }, + { + "epoch": 6.6, + "grad_norm": 1.09375, + "learning_rate": 0.00044273320806780106, + "loss": 0.1245, + "step": 159270 + }, + { + "epoch": 6.6, + "grad_norm": 0.66796875, + "learning_rate": 0.000442726300403437, + "loss": 0.2565, + "step": 159280 + }, + { + "epoch": 6.6, + "grad_norm": 0.640625, + "learning_rate": 0.00044271939237638285, + "loss": 0.223, + "step": 159290 + }, + { + "epoch": 6.6, + "grad_norm": 0.396484375, + "learning_rate": 0.00044271248398665164, + "loss": 0.2274, + "step": 159300 + }, + { + "epoch": 6.6, + "grad_norm": 1.03125, + "learning_rate": 0.0004427055752342564, + "loss": 0.2238, + "step": 159310 + }, + { + "epoch": 6.6, + "grad_norm": 1.703125, + "learning_rate": 0.0004426986661192101, + "loss": 0.1907, + "step": 159320 + }, + { + "epoch": 6.6, + "grad_norm": 0.8203125, + "learning_rate": 0.0004426917566415256, + "loss": 0.1746, + "step": 159330 + }, + { + "epoch": 6.6, + "grad_norm": 0.72265625, + "learning_rate": 0.000442684846801216, + "loss": 0.2317, + "step": 159340 + }, + { + "epoch": 6.6, + "grad_norm": 0.306640625, + "learning_rate": 0.00044267793659829446, + "loss": 0.2176, + "step": 159350 + }, + { + "epoch": 6.6, + "grad_norm": 0.6796875, + "learning_rate": 0.00044267102603277376, + "loss": 0.2543, + "step": 159360 + }, + { + "epoch": 6.6, + "grad_norm": 1.1328125, + "learning_rate": 0.000442664115104667, + "loss": 0.165, + "step": 159370 + }, + { + "epoch": 6.6, + "grad_norm": 0.5625, + "learning_rate": 0.0004426572038139872, + "loss": 0.1659, + "step": 159380 + }, + { + "epoch": 6.6, + "grad_norm": 2.046875, + "learning_rate": 0.00044265029216074736, + "loss": 0.2384, + "step": 159390 + }, + { + "epoch": 6.6, + "grad_norm": 0.5390625, + "learning_rate": 0.00044264338014496054, + "loss": 0.2052, + "step": 159400 + }, + { + "epoch": 6.6, + "grad_norm": 1.2265625, + "learning_rate": 0.00044263646776663955, + "loss": 0.16, + "step": 159410 + }, + { + "epoch": 6.6, + "grad_norm": 0.984375, + "learning_rate": 0.0004426295550257976, + "loss": 0.2113, + "step": 159420 + }, + { + "epoch": 6.6, + "grad_norm": 0.546875, + "learning_rate": 0.0004426226419224476, + "loss": 0.1874, + "step": 159430 + }, + { + "epoch": 6.6, + "grad_norm": 0.796875, + "learning_rate": 0.00044261572845660265, + "loss": 0.191, + "step": 159440 + }, + { + "epoch": 6.6, + "grad_norm": 0.46484375, + "learning_rate": 0.00044260881462827563, + "loss": 0.1852, + "step": 159450 + }, + { + "epoch": 6.6, + "grad_norm": 0.94921875, + "learning_rate": 0.00044260190043747966, + "loss": 0.2006, + "step": 159460 + }, + { + "epoch": 6.61, + "grad_norm": 0.412109375, + "learning_rate": 0.0004425949858842277, + "loss": 0.2137, + "step": 159470 + }, + { + "epoch": 6.61, + "grad_norm": 0.875, + "learning_rate": 0.0004425880709685328, + "loss": 0.1888, + "step": 159480 + }, + { + "epoch": 6.61, + "grad_norm": 0.74609375, + "learning_rate": 0.00044258115569040794, + "loss": 0.1933, + "step": 159490 + }, + { + "epoch": 6.61, + "grad_norm": 0.59765625, + "learning_rate": 0.00044257424004986605, + "loss": 0.1954, + "step": 159500 + }, + { + "epoch": 6.61, + "grad_norm": 0.333984375, + "learning_rate": 0.00044256732404692037, + "loss": 0.2257, + "step": 159510 + }, + { + "epoch": 6.61, + "grad_norm": 0.79296875, + "learning_rate": 0.00044256040768158374, + "loss": 0.2225, + "step": 159520 + }, + { + "epoch": 6.61, + "grad_norm": 0.26953125, + "learning_rate": 0.0004425534909538692, + "loss": 0.1848, + "step": 159530 + }, + { + "epoch": 6.61, + "grad_norm": 0.703125, + "learning_rate": 0.00044254657386378975, + "loss": 0.2149, + "step": 159540 + }, + { + "epoch": 6.61, + "grad_norm": 0.8125, + "learning_rate": 0.0004425396564113585, + "loss": 0.2098, + "step": 159550 + }, + { + "epoch": 6.61, + "grad_norm": 0.51953125, + "learning_rate": 0.0004425327385965884, + "loss": 0.174, + "step": 159560 + }, + { + "epoch": 6.61, + "grad_norm": 0.81640625, + "learning_rate": 0.0004425258204194924, + "loss": 0.1996, + "step": 159570 + }, + { + "epoch": 6.61, + "grad_norm": 0.51171875, + "learning_rate": 0.0004425189018800837, + "loss": 0.1516, + "step": 159580 + }, + { + "epoch": 6.61, + "grad_norm": 0.98046875, + "learning_rate": 0.0004425119829783752, + "loss": 0.1596, + "step": 159590 + }, + { + "epoch": 6.61, + "grad_norm": 0.7734375, + "learning_rate": 0.00044250506371437996, + "loss": 0.1857, + "step": 159600 + }, + { + "epoch": 6.61, + "grad_norm": 0.57421875, + "learning_rate": 0.00044249814408811085, + "loss": 0.2033, + "step": 159610 + }, + { + "epoch": 6.61, + "grad_norm": 0.40234375, + "learning_rate": 0.0004424912240995811, + "loss": 0.2086, + "step": 159620 + }, + { + "epoch": 6.61, + "grad_norm": 0.78125, + "learning_rate": 0.00044248430374880366, + "loss": 0.232, + "step": 159630 + }, + { + "epoch": 6.61, + "grad_norm": 0.6796875, + "learning_rate": 0.00044247738303579155, + "loss": 0.2624, + "step": 159640 + }, + { + "epoch": 6.61, + "grad_norm": 0.93359375, + "learning_rate": 0.00044247046196055784, + "loss": 0.2061, + "step": 159650 + }, + { + "epoch": 6.61, + "grad_norm": 0.365234375, + "learning_rate": 0.00044246354052311545, + "loss": 0.2086, + "step": 159660 + }, + { + "epoch": 6.61, + "grad_norm": 0.640625, + "learning_rate": 0.00044245661872347743, + "loss": 0.2174, + "step": 159670 + }, + { + "epoch": 6.61, + "grad_norm": 0.416015625, + "learning_rate": 0.0004424496965616569, + "loss": 0.147, + "step": 159680 + }, + { + "epoch": 6.61, + "grad_norm": 0.7265625, + "learning_rate": 0.00044244277403766673, + "loss": 0.2037, + "step": 159690 + }, + { + "epoch": 6.61, + "grad_norm": 0.361328125, + "learning_rate": 0.00044243585115152017, + "loss": 0.2179, + "step": 159700 + }, + { + "epoch": 6.62, + "grad_norm": 0.96875, + "learning_rate": 0.0004424289279032301, + "loss": 0.2512, + "step": 159710 + }, + { + "epoch": 6.62, + "grad_norm": 0.66796875, + "learning_rate": 0.0004424220042928094, + "loss": 0.1983, + "step": 159720 + }, + { + "epoch": 6.62, + "grad_norm": 0.5390625, + "learning_rate": 0.00044241508032027147, + "loss": 0.1822, + "step": 159730 + }, + { + "epoch": 6.62, + "grad_norm": 0.380859375, + "learning_rate": 0.0004424081559856291, + "loss": 0.1916, + "step": 159740 + }, + { + "epoch": 6.62, + "grad_norm": 0.5703125, + "learning_rate": 0.0004424012312888953, + "loss": 0.2049, + "step": 159750 + }, + { + "epoch": 6.62, + "grad_norm": 0.71484375, + "learning_rate": 0.00044239430623008315, + "loss": 0.1878, + "step": 159760 + }, + { + "epoch": 6.62, + "grad_norm": 0.75390625, + "learning_rate": 0.0004423873808092057, + "loss": 0.2013, + "step": 159770 + }, + { + "epoch": 6.62, + "grad_norm": 0.287109375, + "learning_rate": 0.000442380455026276, + "loss": 0.1784, + "step": 159780 + }, + { + "epoch": 6.62, + "grad_norm": 1.515625, + "learning_rate": 0.0004423735288813071, + "loss": 0.2154, + "step": 159790 + }, + { + "epoch": 6.62, + "grad_norm": 1.15625, + "learning_rate": 0.0004423666023743119, + "loss": 0.2352, + "step": 159800 + }, + { + "epoch": 6.62, + "grad_norm": 0.875, + "learning_rate": 0.0004423596755053036, + "loss": 0.2422, + "step": 159810 + }, + { + "epoch": 6.62, + "grad_norm": 0.8046875, + "learning_rate": 0.00044235274827429514, + "loss": 0.1705, + "step": 159820 + }, + { + "epoch": 6.62, + "grad_norm": 0.2041015625, + "learning_rate": 0.0004423458206812996, + "loss": 0.2022, + "step": 159830 + }, + { + "epoch": 6.62, + "grad_norm": 0.9765625, + "learning_rate": 0.00044233889272633, + "loss": 0.2019, + "step": 159840 + }, + { + "epoch": 6.62, + "grad_norm": 0.56640625, + "learning_rate": 0.00044233196440939925, + "loss": 0.2169, + "step": 159850 + }, + { + "epoch": 6.62, + "grad_norm": 0.671875, + "learning_rate": 0.0004423250357305206, + "loss": 0.2333, + "step": 159860 + }, + { + "epoch": 6.62, + "grad_norm": 0.5859375, + "learning_rate": 0.00044231810668970707, + "loss": 0.1942, + "step": 159870 + }, + { + "epoch": 6.62, + "grad_norm": 0.4609375, + "learning_rate": 0.0004423111772869716, + "loss": 0.1912, + "step": 159880 + }, + { + "epoch": 6.62, + "grad_norm": 0.66796875, + "learning_rate": 0.0004423042475223272, + "loss": 0.1942, + "step": 159890 + }, + { + "epoch": 6.62, + "grad_norm": 0.421875, + "learning_rate": 0.00044229731739578703, + "loss": 0.1599, + "step": 159900 + }, + { + "epoch": 6.62, + "grad_norm": 0.98046875, + "learning_rate": 0.00044229038690736413, + "loss": 0.2075, + "step": 159910 + }, + { + "epoch": 6.62, + "grad_norm": 0.85546875, + "learning_rate": 0.00044228345605707136, + "loss": 0.1596, + "step": 159920 + }, + { + "epoch": 6.62, + "grad_norm": 0.8046875, + "learning_rate": 0.000442276524844922, + "loss": 0.2201, + "step": 159930 + }, + { + "epoch": 6.62, + "grad_norm": 0.78515625, + "learning_rate": 0.0004422695932709289, + "loss": 0.226, + "step": 159940 + }, + { + "epoch": 6.63, + "grad_norm": 0.90625, + "learning_rate": 0.0004422626613351052, + "loss": 0.2842, + "step": 159950 + }, + { + "epoch": 6.63, + "grad_norm": 0.6796875, + "learning_rate": 0.000442255729037464, + "loss": 0.2453, + "step": 159960 + }, + { + "epoch": 6.63, + "grad_norm": 0.59765625, + "learning_rate": 0.0004422487963780182, + "loss": 0.2224, + "step": 159970 + }, + { + "epoch": 6.63, + "grad_norm": 0.84765625, + "learning_rate": 0.00044224186335678093, + "loss": 0.1978, + "step": 159980 + }, + { + "epoch": 6.63, + "grad_norm": 1.6953125, + "learning_rate": 0.0004422349299737653, + "loss": 0.2436, + "step": 159990 + }, + { + "epoch": 6.63, + "grad_norm": 0.9609375, + "learning_rate": 0.00044222799622898424, + "loss": 0.1963, + "step": 160000 + }, + { + "epoch": 6.63, + "grad_norm": 0.361328125, + "learning_rate": 0.0004422210621224508, + "loss": 0.1643, + "step": 160010 + }, + { + "epoch": 6.63, + "grad_norm": 0.29296875, + "learning_rate": 0.0004422141276541782, + "loss": 0.1956, + "step": 160020 + }, + { + "epoch": 6.63, + "grad_norm": 0.9296875, + "learning_rate": 0.00044220719282417933, + "loss": 0.2224, + "step": 160030 + }, + { + "epoch": 6.63, + "grad_norm": 0.458984375, + "learning_rate": 0.0004422002576324673, + "loss": 0.2148, + "step": 160040 + }, + { + "epoch": 6.63, + "grad_norm": 0.80859375, + "learning_rate": 0.00044219332207905506, + "loss": 0.1913, + "step": 160050 + }, + { + "epoch": 6.63, + "grad_norm": 0.9375, + "learning_rate": 0.0004421863861639558, + "loss": 0.1958, + "step": 160060 + }, + { + "epoch": 6.63, + "grad_norm": 0.6875, + "learning_rate": 0.00044217944988718253, + "loss": 0.2255, + "step": 160070 + }, + { + "epoch": 6.63, + "grad_norm": 0.4921875, + "learning_rate": 0.00044217251324874825, + "loss": 0.1669, + "step": 160080 + }, + { + "epoch": 6.63, + "grad_norm": 1.265625, + "learning_rate": 0.00044216557624866603, + "loss": 0.194, + "step": 160090 + }, + { + "epoch": 6.63, + "grad_norm": 0.57421875, + "learning_rate": 0.00044215863888694894, + "loss": 0.2193, + "step": 160100 + }, + { + "epoch": 6.63, + "grad_norm": 0.4375, + "learning_rate": 0.00044215170116361014, + "loss": 0.2037, + "step": 160110 + }, + { + "epoch": 6.63, + "grad_norm": 0.671875, + "learning_rate": 0.00044214476307866254, + "loss": 0.1855, + "step": 160120 + }, + { + "epoch": 6.63, + "grad_norm": 0.240234375, + "learning_rate": 0.0004421378246321192, + "loss": 0.1563, + "step": 160130 + }, + { + "epoch": 6.63, + "grad_norm": 1.109375, + "learning_rate": 0.0004421308858239933, + "loss": 0.2698, + "step": 160140 + }, + { + "epoch": 6.63, + "grad_norm": 0.55859375, + "learning_rate": 0.0004421239466542978, + "loss": 0.2313, + "step": 160150 + }, + { + "epoch": 6.63, + "grad_norm": 0.609375, + "learning_rate": 0.00044211700712304576, + "loss": 0.1995, + "step": 160160 + }, + { + "epoch": 6.63, + "grad_norm": 0.44140625, + "learning_rate": 0.0004421100672302503, + "loss": 0.1881, + "step": 160170 + }, + { + "epoch": 6.63, + "grad_norm": 0.82421875, + "learning_rate": 0.00044210312697592436, + "loss": 0.2222, + "step": 160180 + }, + { + "epoch": 6.64, + "grad_norm": 0.85546875, + "learning_rate": 0.00044209618636008107, + "loss": 0.1837, + "step": 160190 + }, + { + "epoch": 6.64, + "grad_norm": 0.66015625, + "learning_rate": 0.0004420892453827336, + "loss": 0.1295, + "step": 160200 + }, + { + "epoch": 6.64, + "grad_norm": 2.78125, + "learning_rate": 0.0004420823040438948, + "loss": 0.2801, + "step": 160210 + }, + { + "epoch": 6.64, + "grad_norm": 0.376953125, + "learning_rate": 0.000442075362343578, + "loss": 0.227, + "step": 160220 + }, + { + "epoch": 6.64, + "grad_norm": 0.890625, + "learning_rate": 0.000442068420281796, + "loss": 0.1865, + "step": 160230 + }, + { + "epoch": 6.64, + "grad_norm": 0.80859375, + "learning_rate": 0.000442061477858562, + "loss": 0.1942, + "step": 160240 + }, + { + "epoch": 6.64, + "grad_norm": 0.0, + "learning_rate": 0.00044205453507388905, + "loss": 0.2555, + "step": 160250 + }, + { + "epoch": 6.64, + "grad_norm": 0.60546875, + "learning_rate": 0.0004420475919277902, + "loss": 0.205, + "step": 160260 + }, + { + "epoch": 6.64, + "grad_norm": 0.92578125, + "learning_rate": 0.0004420406484202785, + "loss": 0.2043, + "step": 160270 + }, + { + "epoch": 6.64, + "grad_norm": 0.8359375, + "learning_rate": 0.00044203370455136705, + "loss": 0.1812, + "step": 160280 + }, + { + "epoch": 6.64, + "grad_norm": 0.65625, + "learning_rate": 0.00044202676032106895, + "loss": 0.1813, + "step": 160290 + }, + { + "epoch": 6.64, + "grad_norm": 0.33984375, + "learning_rate": 0.00044201981572939715, + "loss": 0.1979, + "step": 160300 + }, + { + "epoch": 6.64, + "grad_norm": 0.515625, + "learning_rate": 0.0004420128707763649, + "loss": 0.1509, + "step": 160310 + }, + { + "epoch": 6.64, + "grad_norm": 0.54296875, + "learning_rate": 0.000442005925461985, + "loss": 0.1951, + "step": 160320 + }, + { + "epoch": 6.64, + "grad_norm": 1.5, + "learning_rate": 0.0004419989797862708, + "loss": 0.2002, + "step": 160330 + }, + { + "epoch": 6.64, + "grad_norm": 0.84765625, + "learning_rate": 0.0004419920337492352, + "loss": 0.1872, + "step": 160340 + }, + { + "epoch": 6.64, + "grad_norm": 0.578125, + "learning_rate": 0.0004419850873508914, + "loss": 0.2449, + "step": 160350 + }, + { + "epoch": 6.64, + "grad_norm": 0.51171875, + "learning_rate": 0.0004419781405912523, + "loss": 0.24, + "step": 160360 + }, + { + "epoch": 6.64, + "grad_norm": 0.48828125, + "learning_rate": 0.00044197119347033116, + "loss": 0.2409, + "step": 160370 + }, + { + "epoch": 6.64, + "grad_norm": 0.0, + "learning_rate": 0.00044196424598814087, + "loss": 0.2273, + "step": 160380 + }, + { + "epoch": 6.64, + "grad_norm": 1.15625, + "learning_rate": 0.0004419572981446948, + "loss": 0.2208, + "step": 160390 + }, + { + "epoch": 6.64, + "grad_norm": 0.69921875, + "learning_rate": 0.0004419503499400056, + "loss": 0.1714, + "step": 160400 + }, + { + "epoch": 6.64, + "grad_norm": 0.65234375, + "learning_rate": 0.00044194340137408667, + "loss": 0.2112, + "step": 160410 + }, + { + "epoch": 6.64, + "grad_norm": 0.79296875, + "learning_rate": 0.000441936452446951, + "loss": 0.189, + "step": 160420 + }, + { + "epoch": 6.64, + "grad_norm": 0.97265625, + "learning_rate": 0.0004419295031586116, + "loss": 0.2448, + "step": 160430 + }, + { + "epoch": 6.65, + "grad_norm": 1.8203125, + "learning_rate": 0.0004419225535090816, + "loss": 0.1976, + "step": 160440 + }, + { + "epoch": 6.65, + "grad_norm": 0.40625, + "learning_rate": 0.00044191560349837413, + "loss": 0.1961, + "step": 160450 + }, + { + "epoch": 6.65, + "grad_norm": 0.8828125, + "learning_rate": 0.0004419086531265022, + "loss": 0.2101, + "step": 160460 + }, + { + "epoch": 6.65, + "grad_norm": 0.5234375, + "learning_rate": 0.00044190170239347885, + "loss": 0.1758, + "step": 160470 + }, + { + "epoch": 6.65, + "grad_norm": 0.87109375, + "learning_rate": 0.0004418947512993173, + "loss": 0.2103, + "step": 160480 + }, + { + "epoch": 6.65, + "grad_norm": 0.51171875, + "learning_rate": 0.00044188779984403055, + "loss": 0.2009, + "step": 160490 + }, + { + "epoch": 6.65, + "grad_norm": 0.515625, + "learning_rate": 0.00044188084802763164, + "loss": 0.155, + "step": 160500 + }, + { + "epoch": 6.65, + "grad_norm": 0.80078125, + "learning_rate": 0.00044187389585013373, + "loss": 0.226, + "step": 160510 + }, + { + "epoch": 6.65, + "grad_norm": 0.6328125, + "learning_rate": 0.00044186694331154987, + "loss": 0.1407, + "step": 160520 + }, + { + "epoch": 6.65, + "grad_norm": 0.443359375, + "learning_rate": 0.0004418599904118931, + "loss": 0.2032, + "step": 160530 + }, + { + "epoch": 6.65, + "grad_norm": 1.203125, + "learning_rate": 0.00044185303715117653, + "loss": 0.2187, + "step": 160540 + }, + { + "epoch": 6.65, + "grad_norm": 0.6640625, + "learning_rate": 0.00044184608352941334, + "loss": 0.2064, + "step": 160550 + }, + { + "epoch": 6.65, + "grad_norm": 1.0078125, + "learning_rate": 0.00044183912954661646, + "loss": 0.1666, + "step": 160560 + }, + { + "epoch": 6.65, + "grad_norm": 0.71484375, + "learning_rate": 0.0004418321752027991, + "loss": 0.204, + "step": 160570 + }, + { + "epoch": 6.65, + "grad_norm": 2.15625, + "learning_rate": 0.00044182522049797434, + "loss": 0.1737, + "step": 160580 + }, + { + "epoch": 6.65, + "grad_norm": 0.60546875, + "learning_rate": 0.0004418182654321552, + "loss": 0.1795, + "step": 160590 + }, + { + "epoch": 6.65, + "grad_norm": 0.8203125, + "learning_rate": 0.0004418113100053548, + "loss": 0.2264, + "step": 160600 + }, + { + "epoch": 6.65, + "grad_norm": 0.0, + "learning_rate": 0.00044180435421758616, + "loss": 0.1832, + "step": 160610 + }, + { + "epoch": 6.65, + "grad_norm": 1.0703125, + "learning_rate": 0.00044179739806886255, + "loss": 0.1938, + "step": 160620 + }, + { + "epoch": 6.65, + "grad_norm": 0.2109375, + "learning_rate": 0.0004417904415591969, + "loss": 0.1918, + "step": 160630 + }, + { + "epoch": 6.65, + "grad_norm": 0.58984375, + "learning_rate": 0.0004417834846886023, + "loss": 0.1746, + "step": 160640 + }, + { + "epoch": 6.65, + "grad_norm": 0.80859375, + "learning_rate": 0.00044177652745709195, + "loss": 0.2319, + "step": 160650 + }, + { + "epoch": 6.65, + "grad_norm": 0.5859375, + "learning_rate": 0.0004417695698646789, + "loss": 0.2751, + "step": 160660 + }, + { + "epoch": 6.65, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004417626119113762, + "loss": 0.1731, + "step": 160670 + }, + { + "epoch": 6.66, + "grad_norm": 1.3125, + "learning_rate": 0.00044175565359719693, + "loss": 0.1954, + "step": 160680 + }, + { + "epoch": 6.66, + "grad_norm": 0.462890625, + "learning_rate": 0.0004417486949221543, + "loss": 0.2076, + "step": 160690 + }, + { + "epoch": 6.66, + "grad_norm": 0.71484375, + "learning_rate": 0.0004417417358862613, + "loss": 0.2222, + "step": 160700 + }, + { + "epoch": 6.66, + "grad_norm": 1.0234375, + "learning_rate": 0.0004417347764895311, + "loss": 0.2498, + "step": 160710 + }, + { + "epoch": 6.66, + "grad_norm": 1.40625, + "learning_rate": 0.0004417278167319767, + "loss": 0.1895, + "step": 160720 + }, + { + "epoch": 6.66, + "grad_norm": 0.470703125, + "learning_rate": 0.00044172085661361125, + "loss": 0.2361, + "step": 160730 + }, + { + "epoch": 6.66, + "grad_norm": 0.59765625, + "learning_rate": 0.0004417138961344479, + "loss": 0.1821, + "step": 160740 + }, + { + "epoch": 6.66, + "grad_norm": 0.87109375, + "learning_rate": 0.00044170693529449966, + "loss": 0.1717, + "step": 160750 + }, + { + "epoch": 6.66, + "grad_norm": 0.765625, + "learning_rate": 0.0004416999740937797, + "loss": 0.211, + "step": 160760 + }, + { + "epoch": 6.66, + "grad_norm": 0.578125, + "learning_rate": 0.00044169301253230104, + "loss": 0.2291, + "step": 160770 + }, + { + "epoch": 6.66, + "grad_norm": 0.427734375, + "learning_rate": 0.0004416860506100769, + "loss": 0.2049, + "step": 160780 + }, + { + "epoch": 6.66, + "grad_norm": 0.515625, + "learning_rate": 0.0004416790883271203, + "loss": 0.2206, + "step": 160790 + }, + { + "epoch": 6.66, + "grad_norm": 1.3046875, + "learning_rate": 0.0004416721256834443, + "loss": 0.2477, + "step": 160800 + }, + { + "epoch": 6.66, + "grad_norm": 0.515625, + "learning_rate": 0.0004416651626790621, + "loss": 0.199, + "step": 160810 + }, + { + "epoch": 6.66, + "grad_norm": 0.84765625, + "learning_rate": 0.00044165819931398675, + "loss": 0.1971, + "step": 160820 + }, + { + "epoch": 6.66, + "grad_norm": 0.6875, + "learning_rate": 0.00044165123558823137, + "loss": 0.2123, + "step": 160830 + }, + { + "epoch": 6.66, + "grad_norm": 0.380859375, + "learning_rate": 0.0004416442715018091, + "loss": 0.199, + "step": 160840 + }, + { + "epoch": 6.66, + "grad_norm": 3.453125, + "learning_rate": 0.0004416373070547329, + "loss": 0.1978, + "step": 160850 + }, + { + "epoch": 6.66, + "grad_norm": 0.5546875, + "learning_rate": 0.00044163034224701614, + "loss": 0.1989, + "step": 160860 + }, + { + "epoch": 6.66, + "grad_norm": 1.3046875, + "learning_rate": 0.00044162337707867166, + "loss": 0.1994, + "step": 160870 + }, + { + "epoch": 6.66, + "grad_norm": 0.47265625, + "learning_rate": 0.0004416164115497127, + "loss": 0.2318, + "step": 160880 + }, + { + "epoch": 6.66, + "grad_norm": 0.76953125, + "learning_rate": 0.00044160944566015237, + "loss": 0.2162, + "step": 160890 + }, + { + "epoch": 6.66, + "grad_norm": 0.224609375, + "learning_rate": 0.00044160247941000375, + "loss": 0.2287, + "step": 160900 + }, + { + "epoch": 6.66, + "grad_norm": 0.67578125, + "learning_rate": 0.0004415955127992799, + "loss": 0.2418, + "step": 160910 + }, + { + "epoch": 6.67, + "grad_norm": 0.73046875, + "learning_rate": 0.000441588545827994, + "loss": 0.1794, + "step": 160920 + }, + { + "epoch": 6.67, + "grad_norm": 0.75, + "learning_rate": 0.0004415815784961592, + "loss": 0.1744, + "step": 160930 + }, + { + "epoch": 6.67, + "grad_norm": 0.8671875, + "learning_rate": 0.0004415746108037886, + "loss": 0.1793, + "step": 160940 + }, + { + "epoch": 6.67, + "grad_norm": 1.1015625, + "learning_rate": 0.0004415676427508951, + "loss": 0.1715, + "step": 160950 + }, + { + "epoch": 6.67, + "grad_norm": 0.8671875, + "learning_rate": 0.0004415606743374921, + "loss": 0.1559, + "step": 160960 + }, + { + "epoch": 6.67, + "grad_norm": 0.7265625, + "learning_rate": 0.00044155370556359265, + "loss": 0.2175, + "step": 160970 + }, + { + "epoch": 6.67, + "grad_norm": 0.73828125, + "learning_rate": 0.00044154673642920973, + "loss": 0.202, + "step": 160980 + }, + { + "epoch": 6.67, + "grad_norm": 0.83984375, + "learning_rate": 0.0004415397669343565, + "loss": 0.1869, + "step": 160990 + }, + { + "epoch": 6.67, + "grad_norm": 1.75, + "learning_rate": 0.0004415327970790462, + "loss": 0.2116, + "step": 161000 + }, + { + "epoch": 6.67, + "grad_norm": 0.466796875, + "learning_rate": 0.00044152582686329177, + "loss": 0.1924, + "step": 161010 + }, + { + "epoch": 6.67, + "grad_norm": 1.2109375, + "learning_rate": 0.00044151885628710655, + "loss": 0.2374, + "step": 161020 + }, + { + "epoch": 6.67, + "grad_norm": 0.92578125, + "learning_rate": 0.0004415118853505035, + "loss": 0.1638, + "step": 161030 + }, + { + "epoch": 6.67, + "grad_norm": 0.455078125, + "learning_rate": 0.0004415049140534956, + "loss": 0.2104, + "step": 161040 + }, + { + "epoch": 6.67, + "grad_norm": 0.69921875, + "learning_rate": 0.00044149794239609633, + "loss": 0.1685, + "step": 161050 + }, + { + "epoch": 6.67, + "grad_norm": 1.1484375, + "learning_rate": 0.0004414909703783185, + "loss": 0.1769, + "step": 161060 + }, + { + "epoch": 6.67, + "grad_norm": 0.84375, + "learning_rate": 0.00044148399800017534, + "loss": 0.2525, + "step": 161070 + }, + { + "epoch": 6.67, + "grad_norm": 0.60546875, + "learning_rate": 0.00044147702526167997, + "loss": 0.1312, + "step": 161080 + }, + { + "epoch": 6.67, + "grad_norm": 0.7421875, + "learning_rate": 0.00044147005216284556, + "loss": 0.192, + "step": 161090 + }, + { + "epoch": 6.67, + "grad_norm": 0.9140625, + "learning_rate": 0.0004414630787036852, + "loss": 0.2106, + "step": 161100 + }, + { + "epoch": 6.67, + "grad_norm": 0.6875, + "learning_rate": 0.00044145610488421195, + "loss": 0.2419, + "step": 161110 + }, + { + "epoch": 6.67, + "grad_norm": 0.91015625, + "learning_rate": 0.00044144913070443903, + "loss": 0.2362, + "step": 161120 + }, + { + "epoch": 6.67, + "grad_norm": 0.400390625, + "learning_rate": 0.00044144215616437956, + "loss": 0.2432, + "step": 161130 + }, + { + "epoch": 6.67, + "grad_norm": 1.03125, + "learning_rate": 0.00044143518126404653, + "loss": 0.1312, + "step": 161140 + }, + { + "epoch": 6.67, + "grad_norm": 0.404296875, + "learning_rate": 0.00044142820600345323, + "loss": 0.2079, + "step": 161150 + }, + { + "epoch": 6.68, + "grad_norm": 1.875, + "learning_rate": 0.0004414212303826127, + "loss": 0.1821, + "step": 161160 + }, + { + "epoch": 6.68, + "grad_norm": 0.54296875, + "learning_rate": 0.00044141425440153803, + "loss": 0.2096, + "step": 161170 + }, + { + "epoch": 6.68, + "grad_norm": 0.51953125, + "learning_rate": 0.0004414072780602425, + "loss": 0.236, + "step": 161180 + }, + { + "epoch": 6.68, + "grad_norm": 2.234375, + "learning_rate": 0.0004414003013587391, + "loss": 0.2258, + "step": 161190 + }, + { + "epoch": 6.68, + "grad_norm": 0.73828125, + "learning_rate": 0.00044139332429704093, + "loss": 0.2072, + "step": 161200 + }, + { + "epoch": 6.68, + "grad_norm": 0.73046875, + "learning_rate": 0.00044138634687516134, + "loss": 0.3223, + "step": 161210 + }, + { + "epoch": 6.68, + "grad_norm": 0.78515625, + "learning_rate": 0.00044137936909311316, + "loss": 0.1863, + "step": 161220 + }, + { + "epoch": 6.68, + "grad_norm": 1.328125, + "learning_rate": 0.00044137239095090976, + "loss": 0.1715, + "step": 161230 + }, + { + "epoch": 6.68, + "grad_norm": 0.6484375, + "learning_rate": 0.0004413654124485642, + "loss": 0.209, + "step": 161240 + }, + { + "epoch": 6.68, + "grad_norm": 0.32421875, + "learning_rate": 0.00044135843358608954, + "loss": 0.2068, + "step": 161250 + }, + { + "epoch": 6.68, + "grad_norm": 0.423828125, + "learning_rate": 0.000441351454363499, + "loss": 0.201, + "step": 161260 + }, + { + "epoch": 6.68, + "grad_norm": 0.7265625, + "learning_rate": 0.00044134447478080574, + "loss": 0.1717, + "step": 161270 + }, + { + "epoch": 6.68, + "grad_norm": 0.79296875, + "learning_rate": 0.00044133749483802275, + "loss": 0.1677, + "step": 161280 + }, + { + "epoch": 6.68, + "grad_norm": 0.56640625, + "learning_rate": 0.0004413305145351633, + "loss": 0.1769, + "step": 161290 + }, + { + "epoch": 6.68, + "grad_norm": 0.72265625, + "learning_rate": 0.00044132353387224055, + "loss": 0.2011, + "step": 161300 + }, + { + "epoch": 6.68, + "grad_norm": 0.55859375, + "learning_rate": 0.00044131655284926753, + "loss": 0.2095, + "step": 161310 + }, + { + "epoch": 6.68, + "grad_norm": 0.88671875, + "learning_rate": 0.00044130957146625737, + "loss": 0.1794, + "step": 161320 + }, + { + "epoch": 6.68, + "grad_norm": 0.51953125, + "learning_rate": 0.00044130258972322324, + "loss": 0.1667, + "step": 161330 + }, + { + "epoch": 6.68, + "grad_norm": 0.76953125, + "learning_rate": 0.00044129560762017837, + "loss": 0.2297, + "step": 161340 + }, + { + "epoch": 6.68, + "grad_norm": 0.73046875, + "learning_rate": 0.00044128862515713577, + "loss": 0.1958, + "step": 161350 + }, + { + "epoch": 6.68, + "grad_norm": 0.5546875, + "learning_rate": 0.00044128164233410866, + "loss": 0.2601, + "step": 161360 + }, + { + "epoch": 6.68, + "grad_norm": 0.92578125, + "learning_rate": 0.0004412746591511101, + "loss": 0.2048, + "step": 161370 + }, + { + "epoch": 6.68, + "grad_norm": 0.396484375, + "learning_rate": 0.0004412676756081534, + "loss": 0.2021, + "step": 161380 + }, + { + "epoch": 6.68, + "grad_norm": 1.09375, + "learning_rate": 0.0004412606917052515, + "loss": 0.2027, + "step": 161390 + }, + { + "epoch": 6.69, + "grad_norm": 1.4140625, + "learning_rate": 0.00044125370744241765, + "loss": 0.1938, + "step": 161400 + }, + { + "epoch": 6.69, + "grad_norm": 1.7109375, + "learning_rate": 0.000441246722819665, + "loss": 0.2313, + "step": 161410 + }, + { + "epoch": 6.69, + "grad_norm": 0.5625, + "learning_rate": 0.00044123973783700667, + "loss": 0.2132, + "step": 161420 + }, + { + "epoch": 6.69, + "grad_norm": 1.1484375, + "learning_rate": 0.0004412327524944558, + "loss": 0.2144, + "step": 161430 + }, + { + "epoch": 6.69, + "grad_norm": 0.6875, + "learning_rate": 0.00044122576679202555, + "loss": 0.2359, + "step": 161440 + }, + { + "epoch": 6.69, + "grad_norm": 1.8671875, + "learning_rate": 0.000441218780729729, + "loss": 0.2737, + "step": 161450 + }, + { + "epoch": 6.69, + "grad_norm": 0.53515625, + "learning_rate": 0.0004412117943075794, + "loss": 0.2225, + "step": 161460 + }, + { + "epoch": 6.69, + "grad_norm": 0.42578125, + "learning_rate": 0.0004412048075255899, + "loss": 0.1877, + "step": 161470 + }, + { + "epoch": 6.69, + "grad_norm": 0.431640625, + "learning_rate": 0.00044119782038377355, + "loss": 0.1816, + "step": 161480 + }, + { + "epoch": 6.69, + "grad_norm": 0.6953125, + "learning_rate": 0.0004411908328821435, + "loss": 0.2197, + "step": 161490 + }, + { + "epoch": 6.69, + "grad_norm": 0.5, + "learning_rate": 0.00044118384502071306, + "loss": 0.2, + "step": 161500 + }, + { + "epoch": 6.69, + "grad_norm": 0.5234375, + "learning_rate": 0.00044117685679949517, + "loss": 0.1828, + "step": 161510 + }, + { + "epoch": 6.69, + "grad_norm": 0.65625, + "learning_rate": 0.00044116986821850316, + "loss": 0.1377, + "step": 161520 + }, + { + "epoch": 6.69, + "grad_norm": 1.578125, + "learning_rate": 0.00044116287927775, + "loss": 0.214, + "step": 161530 + }, + { + "epoch": 6.69, + "grad_norm": 1.8671875, + "learning_rate": 0.00044115588997724906, + "loss": 0.2383, + "step": 161540 + }, + { + "epoch": 6.69, + "grad_norm": 0.462890625, + "learning_rate": 0.0004411489003170133, + "loss": 0.266, + "step": 161550 + }, + { + "epoch": 6.69, + "grad_norm": 0.416015625, + "learning_rate": 0.000441141910297056, + "loss": 0.1909, + "step": 161560 + }, + { + "epoch": 6.69, + "grad_norm": 0.96484375, + "learning_rate": 0.0004411349199173902, + "loss": 0.1838, + "step": 161570 + }, + { + "epoch": 6.69, + "grad_norm": 1.03125, + "learning_rate": 0.0004411279291780292, + "loss": 0.2009, + "step": 161580 + }, + { + "epoch": 6.69, + "grad_norm": 0.275390625, + "learning_rate": 0.000441120938078986, + "loss": 0.2427, + "step": 161590 + }, + { + "epoch": 6.69, + "grad_norm": 0.77734375, + "learning_rate": 0.00044111394662027395, + "loss": 0.1858, + "step": 161600 + }, + { + "epoch": 6.69, + "grad_norm": 1.7421875, + "learning_rate": 0.00044110695480190597, + "loss": 0.2209, + "step": 161610 + }, + { + "epoch": 6.69, + "grad_norm": 0.5390625, + "learning_rate": 0.0004410999626238954, + "loss": 0.1949, + "step": 161620 + }, + { + "epoch": 6.69, + "grad_norm": 0.390625, + "learning_rate": 0.00044109297008625533, + "loss": 0.173, + "step": 161630 + }, + { + "epoch": 6.7, + "grad_norm": 1.21875, + "learning_rate": 0.0004410859771889989, + "loss": 0.2396, + "step": 161640 + }, + { + "epoch": 6.7, + "grad_norm": 0.70703125, + "learning_rate": 0.0004410789839321393, + "loss": 0.1877, + "step": 161650 + }, + { + "epoch": 6.7, + "grad_norm": 0.466796875, + "learning_rate": 0.0004410719903156897, + "loss": 0.1911, + "step": 161660 + }, + { + "epoch": 6.7, + "grad_norm": 0.78515625, + "learning_rate": 0.00044106499633966324, + "loss": 0.1791, + "step": 161670 + }, + { + "epoch": 6.7, + "grad_norm": 0.57421875, + "learning_rate": 0.00044105800200407315, + "loss": 0.1777, + "step": 161680 + }, + { + "epoch": 6.7, + "grad_norm": 1.2265625, + "learning_rate": 0.00044105100730893246, + "loss": 0.2022, + "step": 161690 + }, + { + "epoch": 6.7, + "grad_norm": 0.2041015625, + "learning_rate": 0.00044104401225425444, + "loss": 0.2203, + "step": 161700 + }, + { + "epoch": 6.7, + "grad_norm": 1.90625, + "learning_rate": 0.0004410370168400522, + "loss": 0.2262, + "step": 161710 + }, + { + "epoch": 6.7, + "grad_norm": 1.0390625, + "learning_rate": 0.00044103002106633896, + "loss": 0.1704, + "step": 161720 + }, + { + "epoch": 6.7, + "grad_norm": 0.484375, + "learning_rate": 0.0004410230249331278, + "loss": 0.2037, + "step": 161730 + }, + { + "epoch": 6.7, + "grad_norm": 1.2734375, + "learning_rate": 0.000441016028440432, + "loss": 0.2142, + "step": 161740 + }, + { + "epoch": 6.7, + "grad_norm": 0.609375, + "learning_rate": 0.0004410090315882646, + "loss": 0.1626, + "step": 161750 + }, + { + "epoch": 6.7, + "grad_norm": 0.703125, + "learning_rate": 0.00044100203437663887, + "loss": 0.2213, + "step": 161760 + }, + { + "epoch": 6.7, + "grad_norm": 0.98828125, + "learning_rate": 0.00044099503680556783, + "loss": 0.2336, + "step": 161770 + }, + { + "epoch": 6.7, + "grad_norm": 1.2890625, + "learning_rate": 0.0004409880388750649, + "loss": 0.2363, + "step": 161780 + }, + { + "epoch": 6.7, + "grad_norm": 0.56640625, + "learning_rate": 0.0004409810405851431, + "loss": 0.189, + "step": 161790 + }, + { + "epoch": 6.7, + "grad_norm": 0.322265625, + "learning_rate": 0.0004409740419358155, + "loss": 0.2011, + "step": 161800 + }, + { + "epoch": 6.7, + "grad_norm": 0.62890625, + "learning_rate": 0.0004409670429270954, + "loss": 0.2259, + "step": 161810 + }, + { + "epoch": 6.7, + "grad_norm": 0.87890625, + "learning_rate": 0.00044096004355899604, + "loss": 0.2393, + "step": 161820 + }, + { + "epoch": 6.7, + "grad_norm": 1.0625, + "learning_rate": 0.0004409530438315305, + "loss": 0.219, + "step": 161830 + }, + { + "epoch": 6.7, + "grad_norm": 0.74609375, + "learning_rate": 0.0004409460437447118, + "loss": 0.1696, + "step": 161840 + }, + { + "epoch": 6.7, + "grad_norm": 0.53515625, + "learning_rate": 0.0004409390432985534, + "loss": 0.2388, + "step": 161850 + }, + { + "epoch": 6.7, + "grad_norm": 0.41015625, + "learning_rate": 0.0004409320424930683, + "loss": 0.1623, + "step": 161860 + }, + { + "epoch": 6.7, + "grad_norm": 0.59375, + "learning_rate": 0.00044092504132826974, + "loss": 0.2186, + "step": 161870 + }, + { + "epoch": 6.71, + "grad_norm": 0.482421875, + "learning_rate": 0.00044091803980417075, + "loss": 0.2373, + "step": 161880 + }, + { + "epoch": 6.71, + "grad_norm": 0.5703125, + "learning_rate": 0.0004409110379207848, + "loss": 0.1743, + "step": 161890 + }, + { + "epoch": 6.71, + "grad_norm": 1.1171875, + "learning_rate": 0.00044090403567812483, + "loss": 0.2232, + "step": 161900 + }, + { + "epoch": 6.71, + "grad_norm": 0.3046875, + "learning_rate": 0.00044089703307620404, + "loss": 0.1741, + "step": 161910 + }, + { + "epoch": 6.71, + "grad_norm": 0.4375, + "learning_rate": 0.0004408900301150357, + "loss": 0.2137, + "step": 161920 + }, + { + "epoch": 6.71, + "grad_norm": 0.85546875, + "learning_rate": 0.0004408830267946329, + "loss": 0.1677, + "step": 161930 + }, + { + "epoch": 6.71, + "grad_norm": 0.5859375, + "learning_rate": 0.00044087602311500886, + "loss": 0.1551, + "step": 161940 + }, + { + "epoch": 6.71, + "grad_norm": 0.41015625, + "learning_rate": 0.00044086901907617676, + "loss": 0.2087, + "step": 161950 + }, + { + "epoch": 6.71, + "grad_norm": 0.8046875, + "learning_rate": 0.0004408620146781498, + "loss": 0.2187, + "step": 161960 + }, + { + "epoch": 6.71, + "grad_norm": 0.671875, + "learning_rate": 0.0004408550099209411, + "loss": 0.181, + "step": 161970 + }, + { + "epoch": 6.71, + "grad_norm": 0.76171875, + "learning_rate": 0.00044084800480456395, + "loss": 0.2012, + "step": 161980 + }, + { + "epoch": 6.71, + "grad_norm": 0.734375, + "learning_rate": 0.00044084099932903143, + "loss": 0.2112, + "step": 161990 + }, + { + "epoch": 6.71, + "grad_norm": 0.9140625, + "learning_rate": 0.00044083399349435673, + "loss": 0.1884, + "step": 162000 + }, + { + "epoch": 6.71, + "grad_norm": 1.1015625, + "learning_rate": 0.00044082698730055306, + "loss": 0.2168, + "step": 162010 + }, + { + "epoch": 6.71, + "grad_norm": 0.37890625, + "learning_rate": 0.0004408199807476336, + "loss": 0.1992, + "step": 162020 + }, + { + "epoch": 6.71, + "grad_norm": 0.61328125, + "learning_rate": 0.0004408129738356116, + "loss": 0.2297, + "step": 162030 + }, + { + "epoch": 6.71, + "grad_norm": 0.53125, + "learning_rate": 0.00044080596656450016, + "loss": 0.2053, + "step": 162040 + }, + { + "epoch": 6.71, + "grad_norm": 0.2578125, + "learning_rate": 0.00044079895893431255, + "loss": 0.2142, + "step": 162050 + }, + { + "epoch": 6.71, + "grad_norm": 0.5, + "learning_rate": 0.00044079195094506185, + "loss": 0.2048, + "step": 162060 + }, + { + "epoch": 6.71, + "grad_norm": 0.67578125, + "learning_rate": 0.0004407849425967613, + "loss": 0.1862, + "step": 162070 + }, + { + "epoch": 6.71, + "grad_norm": 0.51953125, + "learning_rate": 0.0004407779338894241, + "loss": 0.1956, + "step": 162080 + }, + { + "epoch": 6.71, + "grad_norm": 1.6328125, + "learning_rate": 0.00044077092482306334, + "loss": 0.2659, + "step": 162090 + }, + { + "epoch": 6.71, + "grad_norm": 1.0078125, + "learning_rate": 0.00044076391539769243, + "loss": 0.1632, + "step": 162100 + }, + { + "epoch": 6.71, + "grad_norm": 0.380859375, + "learning_rate": 0.0004407569056133244, + "loss": 0.2243, + "step": 162110 + }, + { + "epoch": 6.71, + "grad_norm": 0.98828125, + "learning_rate": 0.00044074989546997247, + "loss": 0.1706, + "step": 162120 + }, + { + "epoch": 6.72, + "grad_norm": 0.265625, + "learning_rate": 0.0004407428849676498, + "loss": 0.1589, + "step": 162130 + }, + { + "epoch": 6.72, + "grad_norm": 0.8984375, + "learning_rate": 0.0004407358741063697, + "loss": 0.1814, + "step": 162140 + }, + { + "epoch": 6.72, + "grad_norm": 0.51171875, + "learning_rate": 0.0004407288628861452, + "loss": 0.1683, + "step": 162150 + }, + { + "epoch": 6.72, + "grad_norm": 1.78125, + "learning_rate": 0.0004407218513069896, + "loss": 0.1421, + "step": 162160 + }, + { + "epoch": 6.72, + "grad_norm": 0.625, + "learning_rate": 0.0004407148393689161, + "loss": 0.2183, + "step": 162170 + }, + { + "epoch": 6.72, + "grad_norm": 1.3125, + "learning_rate": 0.0004407078270719378, + "loss": 0.2384, + "step": 162180 + }, + { + "epoch": 6.72, + "grad_norm": 1.296875, + "learning_rate": 0.0004407008144160681, + "loss": 0.211, + "step": 162190 + }, + { + "epoch": 6.72, + "grad_norm": 0.6171875, + "learning_rate": 0.0004406938014013199, + "loss": 0.2237, + "step": 162200 + }, + { + "epoch": 6.72, + "grad_norm": 0.28515625, + "learning_rate": 0.00044068678802770665, + "loss": 0.1453, + "step": 162210 + }, + { + "epoch": 6.72, + "grad_norm": 0.7890625, + "learning_rate": 0.0004406797742952414, + "loss": 0.2438, + "step": 162220 + }, + { + "epoch": 6.72, + "grad_norm": 1.2421875, + "learning_rate": 0.00044067276020393753, + "loss": 0.2199, + "step": 162230 + }, + { + "epoch": 6.72, + "grad_norm": 0.625, + "learning_rate": 0.000440665745753808, + "loss": 0.252, + "step": 162240 + }, + { + "epoch": 6.72, + "grad_norm": 0.59765625, + "learning_rate": 0.0004406587309448661, + "loss": 0.1718, + "step": 162250 + }, + { + "epoch": 6.72, + "grad_norm": 1.1640625, + "learning_rate": 0.00044065171577712515, + "loss": 0.2455, + "step": 162260 + }, + { + "epoch": 6.72, + "grad_norm": 0.82421875, + "learning_rate": 0.0004406447002505982, + "loss": 0.1672, + "step": 162270 + }, + { + "epoch": 6.72, + "grad_norm": 1.046875, + "learning_rate": 0.0004406376843652986, + "loss": 0.2188, + "step": 162280 + }, + { + "epoch": 6.72, + "grad_norm": 1.203125, + "learning_rate": 0.00044063066812123934, + "loss": 0.2032, + "step": 162290 + }, + { + "epoch": 6.72, + "grad_norm": 0.5390625, + "learning_rate": 0.0004406236515184339, + "loss": 0.1995, + "step": 162300 + }, + { + "epoch": 6.72, + "grad_norm": 0.7421875, + "learning_rate": 0.0004406166345568952, + "loss": 0.1778, + "step": 162310 + }, + { + "epoch": 6.72, + "grad_norm": 0.6171875, + "learning_rate": 0.0004406096172366366, + "loss": 0.1405, + "step": 162320 + }, + { + "epoch": 6.72, + "grad_norm": 0.5546875, + "learning_rate": 0.00044060259955767137, + "loss": 0.2062, + "step": 162330 + }, + { + "epoch": 6.72, + "grad_norm": 1.15625, + "learning_rate": 0.00044059558152001254, + "loss": 0.2081, + "step": 162340 + }, + { + "epoch": 6.72, + "grad_norm": 1.0078125, + "learning_rate": 0.00044058856312367346, + "loss": 0.2229, + "step": 162350 + }, + { + "epoch": 6.72, + "grad_norm": 1.109375, + "learning_rate": 0.0004405815443686673, + "loss": 0.2147, + "step": 162360 + }, + { + "epoch": 6.73, + "grad_norm": 1.078125, + "learning_rate": 0.00044057452525500713, + "loss": 0.2334, + "step": 162370 + }, + { + "epoch": 6.73, + "grad_norm": 0.57421875, + "learning_rate": 0.0004405675057827064, + "loss": 0.2421, + "step": 162380 + }, + { + "epoch": 6.73, + "grad_norm": 0.7421875, + "learning_rate": 0.00044056048595177816, + "loss": 0.2001, + "step": 162390 + }, + { + "epoch": 6.73, + "grad_norm": 0.4296875, + "learning_rate": 0.00044055346576223563, + "loss": 0.1687, + "step": 162400 + }, + { + "epoch": 6.73, + "grad_norm": 0.41015625, + "learning_rate": 0.00044054644521409215, + "loss": 0.2377, + "step": 162410 + }, + { + "epoch": 6.73, + "grad_norm": 0.81640625, + "learning_rate": 0.00044053942430736083, + "loss": 0.2129, + "step": 162420 + }, + { + "epoch": 6.73, + "grad_norm": 1.0703125, + "learning_rate": 0.0004405324030420548, + "loss": 0.1477, + "step": 162430 + }, + { + "epoch": 6.73, + "grad_norm": 0.52734375, + "learning_rate": 0.00044052538141818744, + "loss": 0.2258, + "step": 162440 + }, + { + "epoch": 6.73, + "grad_norm": 1.2734375, + "learning_rate": 0.0004405183594357718, + "loss": 0.2041, + "step": 162450 + }, + { + "epoch": 6.73, + "grad_norm": 0.5703125, + "learning_rate": 0.0004405113370948213, + "loss": 0.257, + "step": 162460 + }, + { + "epoch": 6.73, + "grad_norm": 0.259765625, + "learning_rate": 0.000440504314395349, + "loss": 0.1769, + "step": 162470 + }, + { + "epoch": 6.73, + "grad_norm": 0.6953125, + "learning_rate": 0.00044049729133736816, + "loss": 0.2185, + "step": 162480 + }, + { + "epoch": 6.73, + "grad_norm": 0.79296875, + "learning_rate": 0.00044049026792089194, + "loss": 0.1988, + "step": 162490 + }, + { + "epoch": 6.73, + "grad_norm": 0.2734375, + "learning_rate": 0.0004404832441459336, + "loss": 0.1776, + "step": 162500 + }, + { + "epoch": 6.73, + "grad_norm": 0.75, + "learning_rate": 0.0004404762200125064, + "loss": 0.2036, + "step": 162510 + }, + { + "epoch": 6.73, + "grad_norm": 0.83203125, + "learning_rate": 0.00044046919552062354, + "loss": 0.1867, + "step": 162520 + }, + { + "epoch": 6.73, + "grad_norm": 0.5078125, + "learning_rate": 0.0004404621706702981, + "loss": 0.1895, + "step": 162530 + }, + { + "epoch": 6.73, + "grad_norm": 0.482421875, + "learning_rate": 0.00044045514546154366, + "loss": 0.2014, + "step": 162540 + }, + { + "epoch": 6.73, + "grad_norm": 1.109375, + "learning_rate": 0.000440448119894373, + "loss": 0.1818, + "step": 162550 + }, + { + "epoch": 6.73, + "grad_norm": 1.203125, + "learning_rate": 0.0004404410939687996, + "loss": 0.2454, + "step": 162560 + }, + { + "epoch": 6.73, + "grad_norm": 0.52734375, + "learning_rate": 0.00044043406768483665, + "loss": 0.2194, + "step": 162570 + }, + { + "epoch": 6.73, + "grad_norm": 0.5859375, + "learning_rate": 0.0004404270410424974, + "loss": 0.2012, + "step": 162580 + }, + { + "epoch": 6.73, + "grad_norm": 0.17578125, + "learning_rate": 0.00044042001404179487, + "loss": 0.2086, + "step": 162590 + }, + { + "epoch": 6.73, + "grad_norm": 1.25, + "learning_rate": 0.00044041298668274256, + "loss": 0.2112, + "step": 162600 + }, + { + "epoch": 6.74, + "grad_norm": 0.6640625, + "learning_rate": 0.0004404059589653536, + "loss": 0.2308, + "step": 162610 + }, + { + "epoch": 6.74, + "grad_norm": 0.51953125, + "learning_rate": 0.00044039893088964114, + "loss": 0.1835, + "step": 162620 + }, + { + "epoch": 6.74, + "grad_norm": 1.6015625, + "learning_rate": 0.00044039190245561836, + "loss": 0.2787, + "step": 162630 + }, + { + "epoch": 6.74, + "grad_norm": 0.416015625, + "learning_rate": 0.0004403848736632987, + "loss": 0.2102, + "step": 162640 + }, + { + "epoch": 6.74, + "grad_norm": 0.455078125, + "learning_rate": 0.0004403778445126952, + "loss": 0.196, + "step": 162650 + }, + { + "epoch": 6.74, + "grad_norm": 0.68359375, + "learning_rate": 0.00044037081500382125, + "loss": 0.1967, + "step": 162660 + }, + { + "epoch": 6.74, + "grad_norm": 0.5078125, + "learning_rate": 0.00044036378513668986, + "loss": 0.2053, + "step": 162670 + }, + { + "epoch": 6.74, + "grad_norm": 0.7109375, + "learning_rate": 0.00044035675491131443, + "loss": 0.2241, + "step": 162680 + }, + { + "epoch": 6.74, + "grad_norm": 0.55859375, + "learning_rate": 0.0004403497243277082, + "loss": 0.1654, + "step": 162690 + }, + { + "epoch": 6.74, + "grad_norm": 0.7265625, + "learning_rate": 0.0004403426933858843, + "loss": 0.236, + "step": 162700 + }, + { + "epoch": 6.74, + "grad_norm": 0.26171875, + "learning_rate": 0.000440335662085856, + "loss": 0.1957, + "step": 162710 + }, + { + "epoch": 6.74, + "grad_norm": 0.78515625, + "learning_rate": 0.00044032863042763656, + "loss": 0.2019, + "step": 162720 + }, + { + "epoch": 6.74, + "grad_norm": 0.8203125, + "learning_rate": 0.0004403215984112392, + "loss": 0.1838, + "step": 162730 + }, + { + "epoch": 6.74, + "grad_norm": 0.91015625, + "learning_rate": 0.0004403145660366771, + "loss": 0.2222, + "step": 162740 + }, + { + "epoch": 6.74, + "grad_norm": 1.9140625, + "learning_rate": 0.00044030753330396357, + "loss": 0.2147, + "step": 162750 + }, + { + "epoch": 6.74, + "grad_norm": 2.625, + "learning_rate": 0.0004403005002131118, + "loss": 0.1816, + "step": 162760 + }, + { + "epoch": 6.74, + "grad_norm": 0.57421875, + "learning_rate": 0.00044029346676413507, + "loss": 0.1956, + "step": 162770 + }, + { + "epoch": 6.74, + "grad_norm": 1.0234375, + "learning_rate": 0.00044028643295704654, + "loss": 0.2514, + "step": 162780 + }, + { + "epoch": 6.74, + "grad_norm": 1.2109375, + "learning_rate": 0.0004402793987918595, + "loss": 0.2044, + "step": 162790 + }, + { + "epoch": 6.74, + "grad_norm": 0.84765625, + "learning_rate": 0.0004402723642685872, + "loss": 0.2293, + "step": 162800 + }, + { + "epoch": 6.74, + "grad_norm": 0.9609375, + "learning_rate": 0.00044026532938724286, + "loss": 0.2001, + "step": 162810 + }, + { + "epoch": 6.74, + "grad_norm": 1.53125, + "learning_rate": 0.00044025829414783967, + "loss": 0.2614, + "step": 162820 + }, + { + "epoch": 6.74, + "grad_norm": 0.8671875, + "learning_rate": 0.000440251258550391, + "loss": 0.1953, + "step": 162830 + }, + { + "epoch": 6.74, + "grad_norm": 1.2109375, + "learning_rate": 0.0004402442225949099, + "loss": 0.2252, + "step": 162840 + }, + { + "epoch": 6.75, + "grad_norm": 1.3984375, + "learning_rate": 0.00044023718628140985, + "loss": 0.1941, + "step": 162850 + }, + { + "epoch": 6.75, + "grad_norm": 0.466796875, + "learning_rate": 0.0004402301496099039, + "loss": 0.2423, + "step": 162860 + }, + { + "epoch": 6.75, + "grad_norm": 0.73828125, + "learning_rate": 0.00044022311258040534, + "loss": 0.1638, + "step": 162870 + }, + { + "epoch": 6.75, + "grad_norm": 0.53125, + "learning_rate": 0.0004402160751929274, + "loss": 0.1942, + "step": 162880 + }, + { + "epoch": 6.75, + "grad_norm": 0.66015625, + "learning_rate": 0.0004402090374474834, + "loss": 0.1466, + "step": 162890 + }, + { + "epoch": 6.75, + "grad_norm": 0.8125, + "learning_rate": 0.00044020199934408647, + "loss": 0.2304, + "step": 162900 + }, + { + "epoch": 6.75, + "grad_norm": 1.6328125, + "learning_rate": 0.00044019496088275, + "loss": 0.2246, + "step": 162910 + }, + { + "epoch": 6.75, + "grad_norm": 1.0859375, + "learning_rate": 0.00044018792206348707, + "loss": 0.1817, + "step": 162920 + }, + { + "epoch": 6.75, + "grad_norm": 0.41015625, + "learning_rate": 0.000440180882886311, + "loss": 0.222, + "step": 162930 + }, + { + "epoch": 6.75, + "grad_norm": 1.1484375, + "learning_rate": 0.0004401738433512351, + "loss": 0.2596, + "step": 162940 + }, + { + "epoch": 6.75, + "grad_norm": 0.85546875, + "learning_rate": 0.0004401668034582726, + "loss": 0.1939, + "step": 162950 + }, + { + "epoch": 6.75, + "grad_norm": 0.34375, + "learning_rate": 0.0004401597632074367, + "loss": 0.2047, + "step": 162960 + }, + { + "epoch": 6.75, + "grad_norm": 0.458984375, + "learning_rate": 0.0004401527225987406, + "loss": 0.1834, + "step": 162970 + }, + { + "epoch": 6.75, + "grad_norm": 0.1611328125, + "learning_rate": 0.0004401456816321976, + "loss": 0.2019, + "step": 162980 + }, + { + "epoch": 6.75, + "grad_norm": 0.78125, + "learning_rate": 0.000440138640307821, + "loss": 0.2144, + "step": 162990 + }, + { + "epoch": 6.75, + "grad_norm": 1.2734375, + "learning_rate": 0.000440131598625624, + "loss": 0.2417, + "step": 163000 + }, + { + "epoch": 6.75, + "grad_norm": 0.5, + "learning_rate": 0.0004401245565856199, + "loss": 0.1717, + "step": 163010 + }, + { + "epoch": 6.75, + "grad_norm": 0.2890625, + "learning_rate": 0.0004401175141878219, + "loss": 0.1737, + "step": 163020 + }, + { + "epoch": 6.75, + "grad_norm": 1.03125, + "learning_rate": 0.00044011047143224325, + "loss": 0.2383, + "step": 163030 + }, + { + "epoch": 6.75, + "grad_norm": 0.3828125, + "learning_rate": 0.0004401034283188973, + "loss": 0.2064, + "step": 163040 + }, + { + "epoch": 6.75, + "grad_norm": 0.50390625, + "learning_rate": 0.00044009638484779714, + "loss": 0.1477, + "step": 163050 + }, + { + "epoch": 6.75, + "grad_norm": 0.875, + "learning_rate": 0.0004400893410189561, + "loss": 0.2007, + "step": 163060 + }, + { + "epoch": 6.75, + "grad_norm": 0.357421875, + "learning_rate": 0.0004400822968323875, + "loss": 0.1864, + "step": 163070 + }, + { + "epoch": 6.75, + "grad_norm": 0.578125, + "learning_rate": 0.00044007525228810453, + "loss": 0.1967, + "step": 163080 + }, + { + "epoch": 6.76, + "grad_norm": 1.3046875, + "learning_rate": 0.00044006820738612043, + "loss": 0.2202, + "step": 163090 + }, + { + "epoch": 6.76, + "grad_norm": 0.83984375, + "learning_rate": 0.00044006116212644855, + "loss": 0.1988, + "step": 163100 + }, + { + "epoch": 6.76, + "grad_norm": 0.66796875, + "learning_rate": 0.000440054116509102, + "loss": 0.2292, + "step": 163110 + }, + { + "epoch": 6.76, + "grad_norm": 0.59375, + "learning_rate": 0.0004400470705340942, + "loss": 0.2222, + "step": 163120 + }, + { + "epoch": 6.76, + "grad_norm": 0.53125, + "learning_rate": 0.0004400400242014383, + "loss": 0.2113, + "step": 163130 + }, + { + "epoch": 6.76, + "grad_norm": 0.9453125, + "learning_rate": 0.00044003297751114763, + "loss": 0.2203, + "step": 163140 + }, + { + "epoch": 6.76, + "grad_norm": 0.56640625, + "learning_rate": 0.0004400259304632354, + "loss": 0.2345, + "step": 163150 + }, + { + "epoch": 6.76, + "grad_norm": 0.453125, + "learning_rate": 0.00044001888305771487, + "loss": 0.1738, + "step": 163160 + }, + { + "epoch": 6.76, + "grad_norm": 0.97265625, + "learning_rate": 0.00044001183529459936, + "loss": 0.2042, + "step": 163170 + }, + { + "epoch": 6.76, + "grad_norm": 0.86328125, + "learning_rate": 0.00044000478717390215, + "loss": 0.2121, + "step": 163180 + }, + { + "epoch": 6.76, + "grad_norm": 0.359375, + "learning_rate": 0.0004399977386956363, + "loss": 0.2017, + "step": 163190 + }, + { + "epoch": 6.76, + "grad_norm": 0.92578125, + "learning_rate": 0.00043999068985981534, + "loss": 0.1866, + "step": 163200 + }, + { + "epoch": 6.76, + "grad_norm": 1.3984375, + "learning_rate": 0.00043998364066645236, + "loss": 0.2332, + "step": 163210 + }, + { + "epoch": 6.76, + "grad_norm": 0.9453125, + "learning_rate": 0.0004399765911155607, + "loss": 0.1846, + "step": 163220 + }, + { + "epoch": 6.76, + "grad_norm": 0.890625, + "learning_rate": 0.0004399695412071536, + "loss": 0.1803, + "step": 163230 + }, + { + "epoch": 6.76, + "grad_norm": 1.5390625, + "learning_rate": 0.00043996249094124443, + "loss": 0.2439, + "step": 163240 + }, + { + "epoch": 6.76, + "grad_norm": 0.5390625, + "learning_rate": 0.00043995544031784627, + "loss": 0.1915, + "step": 163250 + }, + { + "epoch": 6.76, + "grad_norm": 0.625, + "learning_rate": 0.0004399483893369725, + "loss": 0.173, + "step": 163260 + }, + { + "epoch": 6.76, + "grad_norm": 0.275390625, + "learning_rate": 0.00043994133799863636, + "loss": 0.2015, + "step": 163270 + }, + { + "epoch": 6.76, + "grad_norm": 0.890625, + "learning_rate": 0.00043993428630285117, + "loss": 0.2079, + "step": 163280 + }, + { + "epoch": 6.76, + "grad_norm": 1.015625, + "learning_rate": 0.00043992723424963017, + "loss": 0.1901, + "step": 163290 + }, + { + "epoch": 6.76, + "grad_norm": 0.63671875, + "learning_rate": 0.00043992018183898663, + "loss": 0.1629, + "step": 163300 + }, + { + "epoch": 6.76, + "grad_norm": 0.455078125, + "learning_rate": 0.00043991312907093374, + "loss": 0.2481, + "step": 163310 + }, + { + "epoch": 6.76, + "grad_norm": 0.8203125, + "learning_rate": 0.0004399060759454849, + "loss": 0.1929, + "step": 163320 + }, + { + "epoch": 6.77, + "grad_norm": 2.734375, + "learning_rate": 0.0004398990224626533, + "loss": 0.1658, + "step": 163330 + }, + { + "epoch": 6.77, + "grad_norm": 0.91015625, + "learning_rate": 0.00043989196862245235, + "loss": 0.2573, + "step": 163340 + }, + { + "epoch": 6.77, + "grad_norm": 0.875, + "learning_rate": 0.00043988491442489516, + "loss": 0.1975, + "step": 163350 + }, + { + "epoch": 6.77, + "grad_norm": 0.74609375, + "learning_rate": 0.00043987785986999506, + "loss": 0.2262, + "step": 163360 + }, + { + "epoch": 6.77, + "grad_norm": 0.9140625, + "learning_rate": 0.00043987080495776535, + "loss": 0.1979, + "step": 163370 + }, + { + "epoch": 6.77, + "grad_norm": 1.921875, + "learning_rate": 0.00043986374968821925, + "loss": 0.201, + "step": 163380 + }, + { + "epoch": 6.77, + "grad_norm": 0.73828125, + "learning_rate": 0.0004398566940613701, + "loss": 0.2137, + "step": 163390 + }, + { + "epoch": 6.77, + "grad_norm": 0.984375, + "learning_rate": 0.0004398496380772311, + "loss": 0.1717, + "step": 163400 + }, + { + "epoch": 6.77, + "grad_norm": 0.53515625, + "learning_rate": 0.0004398425817358157, + "loss": 0.1543, + "step": 163410 + }, + { + "epoch": 6.77, + "grad_norm": 0.671875, + "learning_rate": 0.000439835525037137, + "loss": 0.2109, + "step": 163420 + }, + { + "epoch": 6.77, + "grad_norm": 0.8515625, + "learning_rate": 0.00043982846798120833, + "loss": 0.2134, + "step": 163430 + }, + { + "epoch": 6.77, + "grad_norm": 1.4140625, + "learning_rate": 0.00043982141056804296, + "loss": 0.2406, + "step": 163440 + }, + { + "epoch": 6.77, + "grad_norm": 0.7734375, + "learning_rate": 0.0004398143527976542, + "loss": 0.2386, + "step": 163450 + }, + { + "epoch": 6.77, + "grad_norm": 1.28125, + "learning_rate": 0.0004398072946700554, + "loss": 0.2193, + "step": 163460 + }, + { + "epoch": 6.77, + "grad_norm": 0.55078125, + "learning_rate": 0.0004398002361852598, + "loss": 0.2153, + "step": 163470 + }, + { + "epoch": 6.77, + "grad_norm": 0.703125, + "learning_rate": 0.0004397931773432805, + "loss": 0.172, + "step": 163480 + }, + { + "epoch": 6.77, + "grad_norm": 0.55859375, + "learning_rate": 0.00043978611814413104, + "loss": 0.1628, + "step": 163490 + }, + { + "epoch": 6.77, + "grad_norm": 0.76171875, + "learning_rate": 0.0004397790585878246, + "loss": 0.2593, + "step": 163500 + }, + { + "epoch": 6.77, + "grad_norm": 0.765625, + "learning_rate": 0.0004397719986743744, + "loss": 0.187, + "step": 163510 + }, + { + "epoch": 6.77, + "grad_norm": 0.8828125, + "learning_rate": 0.00043976493840379384, + "loss": 0.2078, + "step": 163520 + }, + { + "epoch": 6.77, + "grad_norm": 0.578125, + "learning_rate": 0.00043975787777609615, + "loss": 0.187, + "step": 163530 + }, + { + "epoch": 6.77, + "grad_norm": 1.921875, + "learning_rate": 0.00043975081679129463, + "loss": 0.2655, + "step": 163540 + }, + { + "epoch": 6.77, + "grad_norm": 0.75390625, + "learning_rate": 0.0004397437554494026, + "loss": 0.1852, + "step": 163550 + }, + { + "epoch": 6.77, + "grad_norm": 0.671875, + "learning_rate": 0.0004397366937504332, + "loss": 0.1512, + "step": 163560 + }, + { + "epoch": 6.78, + "grad_norm": 0.98046875, + "learning_rate": 0.00043972963169439994, + "loss": 0.1846, + "step": 163570 + }, + { + "epoch": 6.78, + "grad_norm": 1.21875, + "learning_rate": 0.0004397225692813159, + "loss": 0.2045, + "step": 163580 + }, + { + "epoch": 6.78, + "grad_norm": 0.421875, + "learning_rate": 0.0004397155065111946, + "loss": 0.1684, + "step": 163590 + }, + { + "epoch": 6.78, + "grad_norm": 0.68359375, + "learning_rate": 0.0004397084433840491, + "loss": 0.1609, + "step": 163600 + }, + { + "epoch": 6.78, + "grad_norm": 0.734375, + "learning_rate": 0.00043970137989989287, + "loss": 0.2003, + "step": 163610 + }, + { + "epoch": 6.78, + "grad_norm": 1.015625, + "learning_rate": 0.0004396943160587391, + "loss": 0.2383, + "step": 163620 + }, + { + "epoch": 6.78, + "grad_norm": 1.0859375, + "learning_rate": 0.0004396872518606011, + "loss": 0.2085, + "step": 163630 + }, + { + "epoch": 6.78, + "grad_norm": 0.7890625, + "learning_rate": 0.0004396801873054922, + "loss": 0.2184, + "step": 163640 + }, + { + "epoch": 6.78, + "grad_norm": 1.546875, + "learning_rate": 0.00043967312239342565, + "loss": 0.2131, + "step": 163650 + }, + { + "epoch": 6.78, + "grad_norm": 0.82421875, + "learning_rate": 0.0004396660571244148, + "loss": 0.2023, + "step": 163660 + }, + { + "epoch": 6.78, + "grad_norm": 0.56640625, + "learning_rate": 0.0004396589914984729, + "loss": 0.2136, + "step": 163670 + }, + { + "epoch": 6.78, + "grad_norm": 0.51171875, + "learning_rate": 0.0004396519255156133, + "loss": 0.1747, + "step": 163680 + }, + { + "epoch": 6.78, + "grad_norm": 0.2119140625, + "learning_rate": 0.00043964485917584917, + "loss": 0.1905, + "step": 163690 + }, + { + "epoch": 6.78, + "grad_norm": 0.66015625, + "learning_rate": 0.00043963779247919397, + "loss": 0.1728, + "step": 163700 + }, + { + "epoch": 6.78, + "grad_norm": 0.4921875, + "learning_rate": 0.00043963072542566083, + "loss": 0.2515, + "step": 163710 + }, + { + "epoch": 6.78, + "grad_norm": 0.50390625, + "learning_rate": 0.00043962365801526314, + "loss": 0.1779, + "step": 163720 + }, + { + "epoch": 6.78, + "grad_norm": 0.294921875, + "learning_rate": 0.0004396165902480143, + "loss": 0.2418, + "step": 163730 + }, + { + "epoch": 6.78, + "grad_norm": 1.21875, + "learning_rate": 0.00043960952212392747, + "loss": 0.195, + "step": 163740 + }, + { + "epoch": 6.78, + "grad_norm": 0.455078125, + "learning_rate": 0.00043960245364301604, + "loss": 0.1838, + "step": 163750 + }, + { + "epoch": 6.78, + "grad_norm": 0.69921875, + "learning_rate": 0.0004395953848052932, + "loss": 0.2534, + "step": 163760 + }, + { + "epoch": 6.78, + "grad_norm": 3.0625, + "learning_rate": 0.00043958831561077236, + "loss": 0.2398, + "step": 163770 + }, + { + "epoch": 6.78, + "grad_norm": 0.87890625, + "learning_rate": 0.00043958124605946683, + "loss": 0.2025, + "step": 163780 + }, + { + "epoch": 6.78, + "grad_norm": 1.390625, + "learning_rate": 0.00043957417615138973, + "loss": 0.1786, + "step": 163790 + }, + { + "epoch": 6.78, + "grad_norm": 0.97265625, + "learning_rate": 0.0004395671058865546, + "loss": 0.2233, + "step": 163800 + }, + { + "epoch": 6.78, + "grad_norm": 0.28515625, + "learning_rate": 0.0004395600352649746, + "loss": 0.18, + "step": 163810 + }, + { + "epoch": 6.79, + "grad_norm": 0.5859375, + "learning_rate": 0.0004395529642866631, + "loss": 0.1776, + "step": 163820 + }, + { + "epoch": 6.79, + "grad_norm": 0.93359375, + "learning_rate": 0.0004395458929516335, + "loss": 0.1391, + "step": 163830 + }, + { + "epoch": 6.79, + "grad_norm": 0.46875, + "learning_rate": 0.00043953882125989885, + "loss": 0.2422, + "step": 163840 + }, + { + "epoch": 6.79, + "grad_norm": 0.451171875, + "learning_rate": 0.00043953174921147265, + "loss": 0.1466, + "step": 163850 + }, + { + "epoch": 6.79, + "grad_norm": 0.6171875, + "learning_rate": 0.0004395246768063682, + "loss": 0.1863, + "step": 163860 + }, + { + "epoch": 6.79, + "grad_norm": 0.7421875, + "learning_rate": 0.00043951760404459874, + "loss": 0.1707, + "step": 163870 + }, + { + "epoch": 6.79, + "grad_norm": 1.421875, + "learning_rate": 0.00043951053092617764, + "loss": 0.2218, + "step": 163880 + }, + { + "epoch": 6.79, + "grad_norm": 1.2890625, + "learning_rate": 0.0004395034574511182, + "loss": 0.2281, + "step": 163890 + }, + { + "epoch": 6.79, + "grad_norm": 1.0859375, + "learning_rate": 0.0004394963836194337, + "loss": 0.2164, + "step": 163900 + }, + { + "epoch": 6.79, + "grad_norm": 1.1171875, + "learning_rate": 0.0004394893094311374, + "loss": 0.2047, + "step": 163910 + }, + { + "epoch": 6.79, + "grad_norm": 0.35546875, + "learning_rate": 0.00043948223488624277, + "loss": 0.2312, + "step": 163920 + }, + { + "epoch": 6.79, + "grad_norm": 0.63671875, + "learning_rate": 0.000439475159984763, + "loss": 0.2296, + "step": 163930 + }, + { + "epoch": 6.79, + "grad_norm": 1.0703125, + "learning_rate": 0.0004394680847267115, + "loss": 0.2125, + "step": 163940 + }, + { + "epoch": 6.79, + "grad_norm": 0.357421875, + "learning_rate": 0.0004394610091121014, + "loss": 0.1713, + "step": 163950 + }, + { + "epoch": 6.79, + "grad_norm": 0.345703125, + "learning_rate": 0.00043945393314094626, + "loss": 0.1677, + "step": 163960 + }, + { + "epoch": 6.79, + "grad_norm": 0.73828125, + "learning_rate": 0.00043944685681325925, + "loss": 0.2557, + "step": 163970 + }, + { + "epoch": 6.79, + "grad_norm": 0.63671875, + "learning_rate": 0.0004394397801290537, + "loss": 0.2219, + "step": 163980 + }, + { + "epoch": 6.79, + "grad_norm": 0.373046875, + "learning_rate": 0.00043943270308834293, + "loss": 0.1847, + "step": 163990 + }, + { + "epoch": 6.79, + "grad_norm": 0.64453125, + "learning_rate": 0.0004394256256911403, + "loss": 0.2168, + "step": 164000 + }, + { + "epoch": 6.79, + "grad_norm": 0.80859375, + "learning_rate": 0.00043941854793745904, + "loss": 0.1999, + "step": 164010 + }, + { + "epoch": 6.79, + "grad_norm": 0.44921875, + "learning_rate": 0.0004394114698273126, + "loss": 0.1758, + "step": 164020 + }, + { + "epoch": 6.79, + "grad_norm": 0.427734375, + "learning_rate": 0.0004394043913607142, + "loss": 0.1943, + "step": 164030 + }, + { + "epoch": 6.79, + "grad_norm": 0.34375, + "learning_rate": 0.00043939731253767717, + "loss": 0.1877, + "step": 164040 + }, + { + "epoch": 6.79, + "grad_norm": 0.416015625, + "learning_rate": 0.0004393902333582148, + "loss": 0.2258, + "step": 164050 + }, + { + "epoch": 6.8, + "grad_norm": 0.482421875, + "learning_rate": 0.0004393831538223406, + "loss": 0.165, + "step": 164060 + }, + { + "epoch": 6.8, + "grad_norm": 0.55859375, + "learning_rate": 0.0004393760739300677, + "loss": 0.1837, + "step": 164070 + }, + { + "epoch": 6.8, + "grad_norm": 0.291015625, + "learning_rate": 0.00043936899368140944, + "loss": 0.1905, + "step": 164080 + }, + { + "epoch": 6.8, + "grad_norm": 1.6171875, + "learning_rate": 0.0004393619130763792, + "loss": 0.17, + "step": 164090 + }, + { + "epoch": 6.8, + "grad_norm": 0.474609375, + "learning_rate": 0.0004393548321149903, + "loss": 0.1737, + "step": 164100 + }, + { + "epoch": 6.8, + "grad_norm": 0.5, + "learning_rate": 0.0004393477507972561, + "loss": 0.2169, + "step": 164110 + }, + { + "epoch": 6.8, + "grad_norm": 0.66015625, + "learning_rate": 0.0004393406691231898, + "loss": 0.2142, + "step": 164120 + }, + { + "epoch": 6.8, + "grad_norm": 0.4609375, + "learning_rate": 0.0004393335870928048, + "loss": 0.1723, + "step": 164130 + }, + { + "epoch": 6.8, + "grad_norm": 0.5546875, + "learning_rate": 0.0004393265047061145, + "loss": 0.2635, + "step": 164140 + }, + { + "epoch": 6.8, + "grad_norm": 0.53125, + "learning_rate": 0.0004393194219631321, + "loss": 0.1246, + "step": 164150 + }, + { + "epoch": 6.8, + "grad_norm": 0.88671875, + "learning_rate": 0.00043931233886387107, + "loss": 0.2095, + "step": 164160 + }, + { + "epoch": 6.8, + "grad_norm": 0.6640625, + "learning_rate": 0.00043930525540834467, + "loss": 0.2058, + "step": 164170 + }, + { + "epoch": 6.8, + "grad_norm": 1.0703125, + "learning_rate": 0.00043929817159656613, + "loss": 0.1847, + "step": 164180 + }, + { + "epoch": 6.8, + "grad_norm": 0.83984375, + "learning_rate": 0.00043929108742854896, + "loss": 0.1986, + "step": 164190 + }, + { + "epoch": 6.8, + "grad_norm": 0.6171875, + "learning_rate": 0.0004392840029043063, + "loss": 0.2024, + "step": 164200 + }, + { + "epoch": 6.8, + "grad_norm": 1.0234375, + "learning_rate": 0.00043927691802385165, + "loss": 0.19, + "step": 164210 + }, + { + "epoch": 6.8, + "grad_norm": 0.349609375, + "learning_rate": 0.00043926983278719836, + "loss": 0.1699, + "step": 164220 + }, + { + "epoch": 6.8, + "grad_norm": 0.59765625, + "learning_rate": 0.00043926274719435955, + "loss": 0.1742, + "step": 164230 + }, + { + "epoch": 6.8, + "grad_norm": 0.75390625, + "learning_rate": 0.0004392556612453488, + "loss": 0.1421, + "step": 164240 + }, + { + "epoch": 6.8, + "grad_norm": 1.609375, + "learning_rate": 0.0004392485749401792, + "loss": 0.1859, + "step": 164250 + }, + { + "epoch": 6.8, + "grad_norm": 0.95703125, + "learning_rate": 0.0004392414882788643, + "loss": 0.2178, + "step": 164260 + }, + { + "epoch": 6.8, + "grad_norm": 0.94921875, + "learning_rate": 0.0004392344012614174, + "loss": 0.2228, + "step": 164270 + }, + { + "epoch": 6.8, + "grad_norm": 0.322265625, + "learning_rate": 0.00043922731388785173, + "loss": 0.16, + "step": 164280 + }, + { + "epoch": 6.8, + "grad_norm": 0.9140625, + "learning_rate": 0.0004392202261581807, + "loss": 0.2159, + "step": 164290 + }, + { + "epoch": 6.81, + "grad_norm": 0.74609375, + "learning_rate": 0.0004392131380724177, + "loss": 0.1982, + "step": 164300 + }, + { + "epoch": 6.81, + "grad_norm": 0.6171875, + "learning_rate": 0.0004392060496305759, + "loss": 0.1816, + "step": 164310 + }, + { + "epoch": 6.81, + "grad_norm": 0.80859375, + "learning_rate": 0.0004391989608326688, + "loss": 0.2052, + "step": 164320 + }, + { + "epoch": 6.81, + "grad_norm": 0.380859375, + "learning_rate": 0.00043919187167870964, + "loss": 0.1634, + "step": 164330 + }, + { + "epoch": 6.81, + "grad_norm": 0.60546875, + "learning_rate": 0.0004391847821687119, + "loss": 0.2298, + "step": 164340 + }, + { + "epoch": 6.81, + "grad_norm": 1.078125, + "learning_rate": 0.00043917769230268877, + "loss": 0.2419, + "step": 164350 + }, + { + "epoch": 6.81, + "grad_norm": 0.90234375, + "learning_rate": 0.0004391706020806537, + "loss": 0.187, + "step": 164360 + }, + { + "epoch": 6.81, + "grad_norm": 1.109375, + "learning_rate": 0.0004391635115026199, + "loss": 0.236, + "step": 164370 + }, + { + "epoch": 6.81, + "grad_norm": 0.162109375, + "learning_rate": 0.0004391564205686009, + "loss": 0.1151, + "step": 164380 + }, + { + "epoch": 6.81, + "grad_norm": 0.83203125, + "learning_rate": 0.00043914932927860986, + "loss": 0.2122, + "step": 164390 + }, + { + "epoch": 6.81, + "grad_norm": 0.953125, + "learning_rate": 0.00043914223763266024, + "loss": 0.1719, + "step": 164400 + }, + { + "epoch": 6.81, + "grad_norm": 1.3203125, + "learning_rate": 0.0004391351456307654, + "loss": 0.1445, + "step": 164410 + }, + { + "epoch": 6.81, + "grad_norm": 1.4375, + "learning_rate": 0.0004391280532729386, + "loss": 0.2117, + "step": 164420 + }, + { + "epoch": 6.81, + "grad_norm": 0.345703125, + "learning_rate": 0.0004391209605591932, + "loss": 0.2197, + "step": 164430 + }, + { + "epoch": 6.81, + "grad_norm": 0.65234375, + "learning_rate": 0.00043911386748954264, + "loss": 0.232, + "step": 164440 + }, + { + "epoch": 6.81, + "grad_norm": 0.79296875, + "learning_rate": 0.0004391067740640001, + "loss": 0.1531, + "step": 164450 + }, + { + "epoch": 6.81, + "grad_norm": 0.8984375, + "learning_rate": 0.00043909968028257916, + "loss": 0.2124, + "step": 164460 + }, + { + "epoch": 6.81, + "grad_norm": 1.9140625, + "learning_rate": 0.000439092586145293, + "loss": 0.2198, + "step": 164470 + }, + { + "epoch": 6.81, + "grad_norm": 0.81640625, + "learning_rate": 0.00043908549165215494, + "loss": 0.2027, + "step": 164480 + }, + { + "epoch": 6.81, + "grad_norm": 0.9375, + "learning_rate": 0.00043907839680317845, + "loss": 0.1788, + "step": 164490 + }, + { + "epoch": 6.81, + "grad_norm": 1.6015625, + "learning_rate": 0.00043907130159837683, + "loss": 0.2268, + "step": 164500 + }, + { + "epoch": 6.81, + "grad_norm": 0.287109375, + "learning_rate": 0.00043906420603776353, + "loss": 0.1917, + "step": 164510 + }, + { + "epoch": 6.81, + "grad_norm": 0.60546875, + "learning_rate": 0.0004390571101213517, + "loss": 0.2237, + "step": 164520 + }, + { + "epoch": 6.81, + "grad_norm": 0.82421875, + "learning_rate": 0.0004390500138491548, + "loss": 0.1972, + "step": 164530 + }, + { + "epoch": 6.82, + "grad_norm": 1.40625, + "learning_rate": 0.00043904291722118627, + "loss": 0.2021, + "step": 164540 + }, + { + "epoch": 6.82, + "grad_norm": 0.6171875, + "learning_rate": 0.00043903582023745933, + "loss": 0.2224, + "step": 164550 + }, + { + "epoch": 6.82, + "grad_norm": 0.5703125, + "learning_rate": 0.00043902872289798745, + "loss": 0.1665, + "step": 164560 + }, + { + "epoch": 6.82, + "grad_norm": 1.484375, + "learning_rate": 0.00043902162520278385, + "loss": 0.1874, + "step": 164570 + }, + { + "epoch": 6.82, + "grad_norm": 0.439453125, + "learning_rate": 0.000439014527151862, + "loss": 0.1762, + "step": 164580 + }, + { + "epoch": 6.82, + "grad_norm": 0.7890625, + "learning_rate": 0.00043900742874523527, + "loss": 0.2465, + "step": 164590 + }, + { + "epoch": 6.82, + "grad_norm": 0.51171875, + "learning_rate": 0.00043900032998291685, + "loss": 0.209, + "step": 164600 + }, + { + "epoch": 6.82, + "grad_norm": 1.0859375, + "learning_rate": 0.00043899323086492027, + "loss": 0.1663, + "step": 164610 + }, + { + "epoch": 6.82, + "grad_norm": 0.75390625, + "learning_rate": 0.00043898613139125886, + "loss": 0.225, + "step": 164620 + }, + { + "epoch": 6.82, + "grad_norm": 0.66796875, + "learning_rate": 0.00043897903156194597, + "loss": 0.2307, + "step": 164630 + }, + { + "epoch": 6.82, + "grad_norm": 0.78515625, + "learning_rate": 0.000438971931376995, + "loss": 0.2128, + "step": 164640 + }, + { + "epoch": 6.82, + "grad_norm": 1.3359375, + "learning_rate": 0.00043896483083641916, + "loss": 0.2013, + "step": 164650 + }, + { + "epoch": 6.82, + "grad_norm": 0.3046875, + "learning_rate": 0.000438957729940232, + "loss": 0.1915, + "step": 164660 + }, + { + "epoch": 6.82, + "grad_norm": 0.39453125, + "learning_rate": 0.0004389506286884467, + "loss": 0.2, + "step": 164670 + }, + { + "epoch": 6.82, + "grad_norm": 0.97265625, + "learning_rate": 0.00043894352708107677, + "loss": 0.2286, + "step": 164680 + }, + { + "epoch": 6.82, + "grad_norm": 0.37890625, + "learning_rate": 0.0004389364251181355, + "loss": 0.2185, + "step": 164690 + }, + { + "epoch": 6.82, + "grad_norm": 0.92578125, + "learning_rate": 0.0004389293227996363, + "loss": 0.2504, + "step": 164700 + }, + { + "epoch": 6.82, + "grad_norm": 1.6171875, + "learning_rate": 0.00043892222012559257, + "loss": 0.2055, + "step": 164710 + }, + { + "epoch": 6.82, + "grad_norm": 1.0546875, + "learning_rate": 0.00043891511709601757, + "loss": 0.1708, + "step": 164720 + }, + { + "epoch": 6.82, + "grad_norm": 0.73046875, + "learning_rate": 0.0004389080137109247, + "loss": 0.1925, + "step": 164730 + }, + { + "epoch": 6.82, + "grad_norm": 0.7265625, + "learning_rate": 0.0004389009099703274, + "loss": 0.2101, + "step": 164740 + }, + { + "epoch": 6.82, + "grad_norm": 1.7578125, + "learning_rate": 0.000438893805874239, + "loss": 0.2106, + "step": 164750 + }, + { + "epoch": 6.82, + "grad_norm": 0.376953125, + "learning_rate": 0.00043888670142267286, + "loss": 0.1527, + "step": 164760 + }, + { + "epoch": 6.82, + "grad_norm": 0.52734375, + "learning_rate": 0.00043887959661564224, + "loss": 0.1704, + "step": 164770 + }, + { + "epoch": 6.83, + "grad_norm": 1.8828125, + "learning_rate": 0.00043887249145316065, + "loss": 0.1887, + "step": 164780 + }, + { + "epoch": 6.83, + "grad_norm": 0.66796875, + "learning_rate": 0.0004388653859352415, + "loss": 0.2237, + "step": 164790 + }, + { + "epoch": 6.83, + "grad_norm": 0.41015625, + "learning_rate": 0.0004388582800618981, + "loss": 0.2178, + "step": 164800 + }, + { + "epoch": 6.83, + "grad_norm": 0.55078125, + "learning_rate": 0.0004388511738331437, + "loss": 0.2173, + "step": 164810 + }, + { + "epoch": 6.83, + "grad_norm": 1.7734375, + "learning_rate": 0.00043884406724899183, + "loss": 0.1847, + "step": 164820 + }, + { + "epoch": 6.83, + "grad_norm": 0.470703125, + "learning_rate": 0.0004388369603094559, + "loss": 0.1924, + "step": 164830 + }, + { + "epoch": 6.83, + "grad_norm": 1.0078125, + "learning_rate": 0.00043882985301454914, + "loss": 0.1997, + "step": 164840 + }, + { + "epoch": 6.83, + "grad_norm": 0.609375, + "learning_rate": 0.00043882274536428495, + "loss": 0.1805, + "step": 164850 + }, + { + "epoch": 6.83, + "grad_norm": 0.8359375, + "learning_rate": 0.0004388156373586768, + "loss": 0.2388, + "step": 164860 + }, + { + "epoch": 6.83, + "grad_norm": 0.56640625, + "learning_rate": 0.000438808528997738, + "loss": 0.1916, + "step": 164870 + }, + { + "epoch": 6.83, + "grad_norm": 0.72265625, + "learning_rate": 0.000438801420281482, + "loss": 0.1769, + "step": 164880 + }, + { + "epoch": 6.83, + "grad_norm": 0.83984375, + "learning_rate": 0.000438794311209922, + "loss": 0.2295, + "step": 164890 + }, + { + "epoch": 6.83, + "grad_norm": 0.34375, + "learning_rate": 0.00043878720178307153, + "loss": 0.2226, + "step": 164900 + }, + { + "epoch": 6.83, + "grad_norm": 1.2421875, + "learning_rate": 0.0004387800920009439, + "loss": 0.2229, + "step": 164910 + }, + { + "epoch": 6.83, + "grad_norm": 1.265625, + "learning_rate": 0.00043877298186355263, + "loss": 0.2116, + "step": 164920 + }, + { + "epoch": 6.83, + "grad_norm": 0.7109375, + "learning_rate": 0.00043876587137091095, + "loss": 0.179, + "step": 164930 + }, + { + "epoch": 6.83, + "grad_norm": 0.73828125, + "learning_rate": 0.0004387587605230322, + "loss": 0.2277, + "step": 164940 + }, + { + "epoch": 6.83, + "grad_norm": 0.8203125, + "learning_rate": 0.00043875164931992994, + "loss": 0.1977, + "step": 164950 + }, + { + "epoch": 6.83, + "grad_norm": 0.87109375, + "learning_rate": 0.00043874453776161743, + "loss": 0.2373, + "step": 164960 + }, + { + "epoch": 6.83, + "grad_norm": 0.9453125, + "learning_rate": 0.0004387374258481081, + "loss": 0.1725, + "step": 164970 + }, + { + "epoch": 6.83, + "grad_norm": 0.890625, + "learning_rate": 0.0004387303135794153, + "loss": 0.2434, + "step": 164980 + }, + { + "epoch": 6.83, + "grad_norm": 0.453125, + "learning_rate": 0.0004387232009555524, + "loss": 0.1856, + "step": 164990 + }, + { + "epoch": 6.83, + "grad_norm": 0.78125, + "learning_rate": 0.0004387160879765328, + "loss": 0.1634, + "step": 165000 + }, + { + "epoch": 6.83, + "grad_norm": 1.1328125, + "learning_rate": 0.00043870897464237, + "loss": 0.1696, + "step": 165010 + }, + { + "epoch": 6.84, + "grad_norm": 0.6640625, + "learning_rate": 0.0004387018609530773, + "loss": 0.2212, + "step": 165020 + }, + { + "epoch": 6.84, + "grad_norm": 0.60546875, + "learning_rate": 0.000438694746908668, + "loss": 0.1779, + "step": 165030 + }, + { + "epoch": 6.84, + "grad_norm": 0.73828125, + "learning_rate": 0.00043868763250915556, + "loss": 0.2238, + "step": 165040 + }, + { + "epoch": 6.84, + "grad_norm": 0.68359375, + "learning_rate": 0.0004386805177545534, + "loss": 0.1985, + "step": 165050 + }, + { + "epoch": 6.84, + "grad_norm": 0.6328125, + "learning_rate": 0.0004386734026448749, + "loss": 0.2221, + "step": 165060 + }, + { + "epoch": 6.84, + "grad_norm": 0.78515625, + "learning_rate": 0.0004386662871801334, + "loss": 0.1806, + "step": 165070 + }, + { + "epoch": 6.84, + "grad_norm": 0.65625, + "learning_rate": 0.0004386591713603423, + "loss": 0.217, + "step": 165080 + }, + { + "epoch": 6.84, + "grad_norm": 0.65625, + "learning_rate": 0.0004386520551855151, + "loss": 0.1998, + "step": 165090 + }, + { + "epoch": 6.84, + "grad_norm": 0.9453125, + "learning_rate": 0.000438644938655665, + "loss": 0.205, + "step": 165100 + }, + { + "epoch": 6.84, + "grad_norm": 1.375, + "learning_rate": 0.00043863782177080556, + "loss": 0.2371, + "step": 165110 + }, + { + "epoch": 6.84, + "grad_norm": 0.283203125, + "learning_rate": 0.0004386307045309501, + "loss": 0.2027, + "step": 165120 + }, + { + "epoch": 6.84, + "grad_norm": 0.3125, + "learning_rate": 0.00043862358693611204, + "loss": 0.2407, + "step": 165130 + }, + { + "epoch": 6.84, + "grad_norm": 0.478515625, + "learning_rate": 0.00043861646898630467, + "loss": 0.2084, + "step": 165140 + }, + { + "epoch": 6.84, + "grad_norm": 1.8046875, + "learning_rate": 0.00043860935068154155, + "loss": 0.2332, + "step": 165150 + }, + { + "epoch": 6.84, + "grad_norm": 0.5, + "learning_rate": 0.000438602232021836, + "loss": 0.1941, + "step": 165160 + }, + { + "epoch": 6.84, + "grad_norm": 0.0, + "learning_rate": 0.0004385951130072014, + "loss": 0.148, + "step": 165170 + }, + { + "epoch": 6.84, + "grad_norm": 0.390625, + "learning_rate": 0.0004385879936376512, + "loss": 0.2542, + "step": 165180 + }, + { + "epoch": 6.84, + "grad_norm": 0.80859375, + "learning_rate": 0.00043858087391319874, + "loss": 0.2528, + "step": 165190 + }, + { + "epoch": 6.84, + "grad_norm": 0.466796875, + "learning_rate": 0.00043857375383385747, + "loss": 0.2388, + "step": 165200 + }, + { + "epoch": 6.84, + "grad_norm": 1.8359375, + "learning_rate": 0.00043856663339964067, + "loss": 0.2329, + "step": 165210 + }, + { + "epoch": 6.84, + "grad_norm": 0.57421875, + "learning_rate": 0.00043855951261056195, + "loss": 0.2485, + "step": 165220 + }, + { + "epoch": 6.84, + "grad_norm": 0.859375, + "learning_rate": 0.00043855239146663443, + "loss": 0.2125, + "step": 165230 + }, + { + "epoch": 6.84, + "grad_norm": 0.7421875, + "learning_rate": 0.0004385452699678718, + "loss": 0.209, + "step": 165240 + }, + { + "epoch": 6.84, + "grad_norm": 0.59375, + "learning_rate": 0.00043853814811428726, + "loss": 0.2354, + "step": 165250 + }, + { + "epoch": 6.85, + "grad_norm": 1.1328125, + "learning_rate": 0.0004385310259058943, + "loss": 0.2616, + "step": 165260 + }, + { + "epoch": 6.85, + "grad_norm": 0.69921875, + "learning_rate": 0.00043852390334270637, + "loss": 0.2047, + "step": 165270 + }, + { + "epoch": 6.85, + "grad_norm": 0.546875, + "learning_rate": 0.00043851678042473673, + "loss": 0.1727, + "step": 165280 + }, + { + "epoch": 6.85, + "grad_norm": 0.72265625, + "learning_rate": 0.00043850965715199895, + "loss": 0.1822, + "step": 165290 + }, + { + "epoch": 6.85, + "grad_norm": 0.5546875, + "learning_rate": 0.0004385025335245063, + "loss": 0.1836, + "step": 165300 + }, + { + "epoch": 6.85, + "grad_norm": 0.65234375, + "learning_rate": 0.0004384954095422723, + "loss": 0.1968, + "step": 165310 + }, + { + "epoch": 6.85, + "grad_norm": 0.4609375, + "learning_rate": 0.0004384882852053103, + "loss": 0.189, + "step": 165320 + }, + { + "epoch": 6.85, + "grad_norm": 1.015625, + "learning_rate": 0.0004384811605136336, + "loss": 0.1956, + "step": 165330 + }, + { + "epoch": 6.85, + "grad_norm": 0.85546875, + "learning_rate": 0.0004384740354672557, + "loss": 0.2088, + "step": 165340 + }, + { + "epoch": 6.85, + "grad_norm": 0.59375, + "learning_rate": 0.0004384669100661901, + "loss": 0.201, + "step": 165350 + }, + { + "epoch": 6.85, + "grad_norm": 0.80078125, + "learning_rate": 0.00043845978431045007, + "loss": 0.1726, + "step": 165360 + }, + { + "epoch": 6.85, + "grad_norm": 1.25, + "learning_rate": 0.00043845265820004913, + "loss": 0.191, + "step": 165370 + }, + { + "epoch": 6.85, + "grad_norm": 1.09375, + "learning_rate": 0.0004384455317350007, + "loss": 0.2134, + "step": 165380 + }, + { + "epoch": 6.85, + "grad_norm": 0.54296875, + "learning_rate": 0.000438438404915318, + "loss": 0.2078, + "step": 165390 + }, + { + "epoch": 6.85, + "grad_norm": 0.81640625, + "learning_rate": 0.0004384312777410147, + "loss": 0.1313, + "step": 165400 + }, + { + "epoch": 6.85, + "grad_norm": 0.474609375, + "learning_rate": 0.0004384241502121039, + "loss": 0.1952, + "step": 165410 + }, + { + "epoch": 6.85, + "grad_norm": 0.5546875, + "learning_rate": 0.0004384170223285994, + "loss": 0.1927, + "step": 165420 + }, + { + "epoch": 6.85, + "grad_norm": 0.498046875, + "learning_rate": 0.00043840989409051425, + "loss": 0.1879, + "step": 165430 + }, + { + "epoch": 6.85, + "grad_norm": 0.419921875, + "learning_rate": 0.00043840276549786205, + "loss": 0.2189, + "step": 165440 + }, + { + "epoch": 6.85, + "grad_norm": 0.400390625, + "learning_rate": 0.00043839563655065627, + "loss": 0.2142, + "step": 165450 + }, + { + "epoch": 6.85, + "grad_norm": 0.388671875, + "learning_rate": 0.0004383885072489102, + "loss": 0.1575, + "step": 165460 + }, + { + "epoch": 6.85, + "grad_norm": 0.50390625, + "learning_rate": 0.0004383813775926373, + "loss": 0.1857, + "step": 165470 + }, + { + "epoch": 6.85, + "grad_norm": 0.5078125, + "learning_rate": 0.000438374247581851, + "loss": 0.2204, + "step": 165480 + }, + { + "epoch": 6.85, + "grad_norm": 2.328125, + "learning_rate": 0.00043836711721656474, + "loss": 0.2559, + "step": 165490 + }, + { + "epoch": 6.85, + "grad_norm": 0.9765625, + "learning_rate": 0.0004383599864967919, + "loss": 0.1896, + "step": 165500 + }, + { + "epoch": 6.86, + "grad_norm": 0.86328125, + "learning_rate": 0.0004383528554225459, + "loss": 0.209, + "step": 165510 + }, + { + "epoch": 6.86, + "grad_norm": 0.7578125, + "learning_rate": 0.0004383457239938401, + "loss": 0.1635, + "step": 165520 + }, + { + "epoch": 6.86, + "grad_norm": 2.109375, + "learning_rate": 0.00043833859221068807, + "loss": 0.2118, + "step": 165530 + }, + { + "epoch": 6.86, + "grad_norm": 1.5546875, + "learning_rate": 0.00043833146007310307, + "loss": 0.1899, + "step": 165540 + }, + { + "epoch": 6.86, + "grad_norm": 0.3359375, + "learning_rate": 0.0004383243275810986, + "loss": 0.1988, + "step": 165550 + }, + { + "epoch": 6.86, + "grad_norm": 0.7578125, + "learning_rate": 0.00043831719473468825, + "loss": 0.1822, + "step": 165560 + }, + { + "epoch": 6.86, + "grad_norm": 0.498046875, + "learning_rate": 0.0004383100615338851, + "loss": 0.2204, + "step": 165570 + }, + { + "epoch": 6.86, + "grad_norm": 0.67578125, + "learning_rate": 0.00043830292797870284, + "loss": 0.1558, + "step": 165580 + }, + { + "epoch": 6.86, + "grad_norm": 0.97265625, + "learning_rate": 0.0004382957940691548, + "loss": 0.2565, + "step": 165590 + }, + { + "epoch": 6.86, + "grad_norm": 0.640625, + "learning_rate": 0.0004382886598052543, + "loss": 0.1623, + "step": 165600 + }, + { + "epoch": 6.86, + "grad_norm": 0.82421875, + "learning_rate": 0.000438281525187015, + "loss": 0.2479, + "step": 165610 + }, + { + "epoch": 6.86, + "grad_norm": 0.54296875, + "learning_rate": 0.00043827439021445016, + "loss": 0.1663, + "step": 165620 + }, + { + "epoch": 6.86, + "grad_norm": 0.400390625, + "learning_rate": 0.00043826725488757323, + "loss": 0.2339, + "step": 165630 + }, + { + "epoch": 6.86, + "grad_norm": 0.91796875, + "learning_rate": 0.0004382601192063976, + "loss": 0.1856, + "step": 165640 + }, + { + "epoch": 6.86, + "grad_norm": 0.625, + "learning_rate": 0.00043825298317093695, + "loss": 0.2559, + "step": 165650 + }, + { + "epoch": 6.86, + "grad_norm": 0.80078125, + "learning_rate": 0.0004382458467812044, + "loss": 0.1872, + "step": 165660 + }, + { + "epoch": 6.86, + "grad_norm": 0.65625, + "learning_rate": 0.0004382387100372135, + "loss": 0.1871, + "step": 165670 + }, + { + "epoch": 6.86, + "grad_norm": 0.8359375, + "learning_rate": 0.00043823157293897765, + "loss": 0.2174, + "step": 165680 + }, + { + "epoch": 6.86, + "grad_norm": 0.85546875, + "learning_rate": 0.0004382244354865103, + "loss": 0.1886, + "step": 165690 + }, + { + "epoch": 6.86, + "grad_norm": 0.310546875, + "learning_rate": 0.00043821729767982493, + "loss": 0.1789, + "step": 165700 + }, + { + "epoch": 6.86, + "grad_norm": 1.0625, + "learning_rate": 0.00043821015951893487, + "loss": 0.1643, + "step": 165710 + }, + { + "epoch": 6.86, + "grad_norm": 2.796875, + "learning_rate": 0.0004382030210038537, + "loss": 0.2283, + "step": 165720 + }, + { + "epoch": 6.86, + "grad_norm": 0.59375, + "learning_rate": 0.0004381958821345947, + "loss": 0.1957, + "step": 165730 + }, + { + "epoch": 6.86, + "grad_norm": 0.4296875, + "learning_rate": 0.0004381887429111714, + "loss": 0.2035, + "step": 165740 + }, + { + "epoch": 6.87, + "grad_norm": 1.984375, + "learning_rate": 0.0004381816033335972, + "loss": 0.1852, + "step": 165750 + }, + { + "epoch": 6.87, + "grad_norm": 0.61328125, + "learning_rate": 0.0004381744634018856, + "loss": 0.22, + "step": 165760 + }, + { + "epoch": 6.87, + "grad_norm": 0.8203125, + "learning_rate": 0.0004381673231160499, + "loss": 0.2237, + "step": 165770 + }, + { + "epoch": 6.87, + "grad_norm": 1.0703125, + "learning_rate": 0.00043816018247610366, + "loss": 0.1744, + "step": 165780 + }, + { + "epoch": 6.87, + "grad_norm": 0.56640625, + "learning_rate": 0.00043815304148206024, + "loss": 0.223, + "step": 165790 + }, + { + "epoch": 6.87, + "grad_norm": 0.7265625, + "learning_rate": 0.0004381459001339332, + "loss": 0.159, + "step": 165800 + }, + { + "epoch": 6.87, + "grad_norm": 0.84765625, + "learning_rate": 0.00043813875843173584, + "loss": 0.201, + "step": 165810 + }, + { + "epoch": 6.87, + "grad_norm": 1.484375, + "learning_rate": 0.00043813161637548166, + "loss": 0.2478, + "step": 165820 + }, + { + "epoch": 6.87, + "grad_norm": 0.466796875, + "learning_rate": 0.00043812447396518407, + "loss": 0.2494, + "step": 165830 + }, + { + "epoch": 6.87, + "grad_norm": 0.408203125, + "learning_rate": 0.00043811733120085653, + "loss": 0.2147, + "step": 165840 + }, + { + "epoch": 6.87, + "grad_norm": 0.361328125, + "learning_rate": 0.0004381101880825126, + "loss": 0.2256, + "step": 165850 + }, + { + "epoch": 6.87, + "grad_norm": 1.453125, + "learning_rate": 0.00043810304461016546, + "loss": 0.2021, + "step": 165860 + }, + { + "epoch": 6.87, + "grad_norm": 0.671875, + "learning_rate": 0.00043809590078382877, + "loss": 0.2621, + "step": 165870 + }, + { + "epoch": 6.87, + "grad_norm": 0.5546875, + "learning_rate": 0.0004380887566035159, + "loss": 0.2219, + "step": 165880 + }, + { + "epoch": 6.87, + "grad_norm": 0.73828125, + "learning_rate": 0.0004380816120692403, + "loss": 0.211, + "step": 165890 + }, + { + "epoch": 6.87, + "grad_norm": 0.78515625, + "learning_rate": 0.0004380744671810154, + "loss": 0.2399, + "step": 165900 + }, + { + "epoch": 6.87, + "grad_norm": 0.8515625, + "learning_rate": 0.0004380673219388547, + "loss": 0.1596, + "step": 165910 + }, + { + "epoch": 6.87, + "grad_norm": 0.96484375, + "learning_rate": 0.00043806017634277165, + "loss": 0.1826, + "step": 165920 + }, + { + "epoch": 6.87, + "grad_norm": 0.55078125, + "learning_rate": 0.00043805303039277954, + "loss": 0.1658, + "step": 165930 + }, + { + "epoch": 6.87, + "grad_norm": 1.203125, + "learning_rate": 0.000438045884088892, + "loss": 0.1923, + "step": 165940 + }, + { + "epoch": 6.87, + "grad_norm": 0.875, + "learning_rate": 0.00043803873743112243, + "loss": 0.2176, + "step": 165950 + }, + { + "epoch": 6.87, + "grad_norm": 0.412109375, + "learning_rate": 0.0004380315904194842, + "loss": 0.2008, + "step": 165960 + }, + { + "epoch": 6.87, + "grad_norm": 0.3671875, + "learning_rate": 0.0004380244430539909, + "loss": 0.234, + "step": 165970 + }, + { + "epoch": 6.87, + "grad_norm": 2.578125, + "learning_rate": 0.00043801729533465584, + "loss": 0.2256, + "step": 165980 + }, + { + "epoch": 6.88, + "grad_norm": 0.5, + "learning_rate": 0.00043801014726149257, + "loss": 0.1925, + "step": 165990 + }, + { + "epoch": 6.88, + "grad_norm": 1.1953125, + "learning_rate": 0.00043800299883451445, + "loss": 0.241, + "step": 166000 + }, + { + "epoch": 6.88, + "grad_norm": 0.77734375, + "learning_rate": 0.0004379958500537351, + "loss": 0.2574, + "step": 166010 + }, + { + "epoch": 6.88, + "grad_norm": 0.875, + "learning_rate": 0.00043798870091916775, + "loss": 0.1822, + "step": 166020 + }, + { + "epoch": 6.88, + "grad_norm": 1.296875, + "learning_rate": 0.00043798155143082607, + "loss": 0.1991, + "step": 166030 + }, + { + "epoch": 6.88, + "grad_norm": 0.79296875, + "learning_rate": 0.0004379744015887233, + "loss": 0.2165, + "step": 166040 + }, + { + "epoch": 6.88, + "grad_norm": 1.3984375, + "learning_rate": 0.0004379672513928731, + "loss": 0.2139, + "step": 166050 + }, + { + "epoch": 6.88, + "grad_norm": 0.546875, + "learning_rate": 0.00043796010084328873, + "loss": 0.1649, + "step": 166060 + }, + { + "epoch": 6.88, + "grad_norm": 0.80859375, + "learning_rate": 0.00043795294993998385, + "loss": 0.2413, + "step": 166070 + }, + { + "epoch": 6.88, + "grad_norm": 0.671875, + "learning_rate": 0.0004379457986829718, + "loss": 0.2051, + "step": 166080 + }, + { + "epoch": 6.88, + "grad_norm": 2.453125, + "learning_rate": 0.000437938647072266, + "loss": 0.2616, + "step": 166090 + }, + { + "epoch": 6.88, + "grad_norm": 0.20703125, + "learning_rate": 0.00043793149510788, + "loss": 0.1691, + "step": 166100 + }, + { + "epoch": 6.88, + "grad_norm": 0.2578125, + "learning_rate": 0.0004379243427898272, + "loss": 0.2379, + "step": 166110 + }, + { + "epoch": 6.88, + "grad_norm": 0.384765625, + "learning_rate": 0.0004379171901181211, + "loss": 0.1848, + "step": 166120 + }, + { + "epoch": 6.88, + "grad_norm": 0.46484375, + "learning_rate": 0.0004379100370927751, + "loss": 0.2186, + "step": 166130 + }, + { + "epoch": 6.88, + "grad_norm": 0.515625, + "learning_rate": 0.00043790288371380276, + "loss": 0.1613, + "step": 166140 + }, + { + "epoch": 6.88, + "grad_norm": 1.234375, + "learning_rate": 0.00043789572998121745, + "loss": 0.1769, + "step": 166150 + }, + { + "epoch": 6.88, + "grad_norm": 0.4609375, + "learning_rate": 0.0004378885758950327, + "loss": 0.2132, + "step": 166160 + }, + { + "epoch": 6.88, + "grad_norm": 5.125, + "learning_rate": 0.0004378814214552619, + "loss": 0.1786, + "step": 166170 + }, + { + "epoch": 6.88, + "grad_norm": 0.734375, + "learning_rate": 0.00043787426666191856, + "loss": 0.2339, + "step": 166180 + }, + { + "epoch": 6.88, + "grad_norm": 0.765625, + "learning_rate": 0.0004378671115150162, + "loss": 0.2138, + "step": 166190 + }, + { + "epoch": 6.88, + "grad_norm": 0.421875, + "learning_rate": 0.0004378599560145682, + "loss": 0.1899, + "step": 166200 + }, + { + "epoch": 6.88, + "grad_norm": 0.6953125, + "learning_rate": 0.000437852800160588, + "loss": 0.1977, + "step": 166210 + }, + { + "epoch": 6.88, + "grad_norm": 0.60546875, + "learning_rate": 0.0004378456439530891, + "loss": 0.2682, + "step": 166220 + }, + { + "epoch": 6.89, + "grad_norm": 0.68359375, + "learning_rate": 0.00043783848739208504, + "loss": 0.2134, + "step": 166230 + }, + { + "epoch": 6.89, + "grad_norm": 0.56640625, + "learning_rate": 0.0004378313304775892, + "loss": 0.1988, + "step": 166240 + }, + { + "epoch": 6.89, + "grad_norm": 0.3125, + "learning_rate": 0.00043782417320961507, + "loss": 0.2083, + "step": 166250 + }, + { + "epoch": 6.89, + "grad_norm": 0.25390625, + "learning_rate": 0.0004378170155881762, + "loss": 0.1569, + "step": 166260 + }, + { + "epoch": 6.89, + "grad_norm": 1.0859375, + "learning_rate": 0.00043780985761328594, + "loss": 0.1929, + "step": 166270 + }, + { + "epoch": 6.89, + "grad_norm": 0.59375, + "learning_rate": 0.0004378026992849578, + "loss": 0.2099, + "step": 166280 + }, + { + "epoch": 6.89, + "grad_norm": 1.234375, + "learning_rate": 0.0004377955406032053, + "loss": 0.1926, + "step": 166290 + }, + { + "epoch": 6.89, + "grad_norm": 0.53125, + "learning_rate": 0.00043778838156804186, + "loss": 0.1768, + "step": 166300 + }, + { + "epoch": 6.89, + "grad_norm": 0.423828125, + "learning_rate": 0.0004377812221794809, + "loss": 0.2527, + "step": 166310 + }, + { + "epoch": 6.89, + "grad_norm": 0.220703125, + "learning_rate": 0.00043777406243753603, + "loss": 0.2092, + "step": 166320 + }, + { + "epoch": 6.89, + "grad_norm": 0.6328125, + "learning_rate": 0.0004377669023422206, + "loss": 0.237, + "step": 166330 + }, + { + "epoch": 6.89, + "grad_norm": 0.9140625, + "learning_rate": 0.00043775974189354824, + "loss": 0.2208, + "step": 166340 + }, + { + "epoch": 6.89, + "grad_norm": 0.69140625, + "learning_rate": 0.0004377525810915323, + "loss": 0.1783, + "step": 166350 + }, + { + "epoch": 6.89, + "grad_norm": 0.0, + "learning_rate": 0.00043774541993618616, + "loss": 0.2444, + "step": 166360 + }, + { + "epoch": 6.89, + "grad_norm": 0.3203125, + "learning_rate": 0.0004377382584275235, + "loss": 0.1478, + "step": 166370 + }, + { + "epoch": 6.89, + "grad_norm": 0.62890625, + "learning_rate": 0.00043773109656555765, + "loss": 0.2238, + "step": 166380 + }, + { + "epoch": 6.89, + "grad_norm": 0.8515625, + "learning_rate": 0.0004377239343503022, + "loss": 0.2019, + "step": 166390 + }, + { + "epoch": 6.89, + "grad_norm": 0.486328125, + "learning_rate": 0.0004377167717817706, + "loss": 0.2187, + "step": 166400 + }, + { + "epoch": 6.89, + "grad_norm": 0.45703125, + "learning_rate": 0.0004377096088599763, + "loss": 0.1828, + "step": 166410 + }, + { + "epoch": 6.89, + "grad_norm": 0.1904296875, + "learning_rate": 0.00043770244558493277, + "loss": 0.2059, + "step": 166420 + }, + { + "epoch": 6.89, + "grad_norm": 0.28515625, + "learning_rate": 0.0004376952819566535, + "loss": 0.2403, + "step": 166430 + }, + { + "epoch": 6.89, + "grad_norm": 0.63671875, + "learning_rate": 0.00043768811797515193, + "loss": 0.2642, + "step": 166440 + }, + { + "epoch": 6.89, + "grad_norm": 1.0234375, + "learning_rate": 0.0004376809536404417, + "loss": 0.2108, + "step": 166450 + }, + { + "epoch": 6.89, + "grad_norm": 0.3203125, + "learning_rate": 0.00043767378895253614, + "loss": 0.1627, + "step": 166460 + }, + { + "epoch": 6.9, + "grad_norm": 0.53125, + "learning_rate": 0.00043766662391144873, + "loss": 0.1887, + "step": 166470 + }, + { + "epoch": 6.9, + "grad_norm": 0.515625, + "learning_rate": 0.00043765945851719304, + "loss": 0.2147, + "step": 166480 + }, + { + "epoch": 6.9, + "grad_norm": 0.62109375, + "learning_rate": 0.0004376522927697825, + "loss": 0.153, + "step": 166490 + }, + { + "epoch": 6.9, + "grad_norm": 1.234375, + "learning_rate": 0.0004376451266692306, + "loss": 0.2144, + "step": 166500 + }, + { + "epoch": 6.9, + "grad_norm": 1.078125, + "learning_rate": 0.0004376379602155509, + "loss": 0.2004, + "step": 166510 + }, + { + "epoch": 6.9, + "grad_norm": 1.53125, + "learning_rate": 0.00043763079340875677, + "loss": 0.2247, + "step": 166520 + }, + { + "epoch": 6.9, + "grad_norm": 0.74609375, + "learning_rate": 0.00043762362624886174, + "loss": 0.2087, + "step": 166530 + }, + { + "epoch": 6.9, + "grad_norm": 0.37109375, + "learning_rate": 0.00043761645873587935, + "loss": 0.165, + "step": 166540 + }, + { + "epoch": 6.9, + "grad_norm": 0.6015625, + "learning_rate": 0.00043760929086982306, + "loss": 0.259, + "step": 166550 + }, + { + "epoch": 6.9, + "grad_norm": 0.51953125, + "learning_rate": 0.00043760212265070626, + "loss": 0.2345, + "step": 166560 + }, + { + "epoch": 6.9, + "grad_norm": 1.5703125, + "learning_rate": 0.00043759495407854254, + "loss": 0.193, + "step": 166570 + }, + { + "epoch": 6.9, + "grad_norm": 0.47265625, + "learning_rate": 0.0004375877851533454, + "loss": 0.162, + "step": 166580 + }, + { + "epoch": 6.9, + "grad_norm": 0.58984375, + "learning_rate": 0.0004375806158751283, + "loss": 0.2265, + "step": 166590 + }, + { + "epoch": 6.9, + "grad_norm": 1.3125, + "learning_rate": 0.00043757344624390475, + "loss": 0.2068, + "step": 166600 + }, + { + "epoch": 6.9, + "grad_norm": 0.66015625, + "learning_rate": 0.00043756627625968827, + "loss": 0.25, + "step": 166610 + }, + { + "epoch": 6.9, + "grad_norm": 0.796875, + "learning_rate": 0.00043755910592249226, + "loss": 0.1993, + "step": 166620 + }, + { + "epoch": 6.9, + "grad_norm": 0.78515625, + "learning_rate": 0.00043755193523233027, + "loss": 0.1613, + "step": 166630 + }, + { + "epoch": 6.9, + "grad_norm": 0.1689453125, + "learning_rate": 0.0004375447641892158, + "loss": 0.2136, + "step": 166640 + }, + { + "epoch": 6.9, + "grad_norm": 0.423828125, + "learning_rate": 0.00043753759279316237, + "loss": 0.1894, + "step": 166650 + }, + { + "epoch": 6.9, + "grad_norm": 0.462890625, + "learning_rate": 0.0004375304210441834, + "loss": 0.198, + "step": 166660 + }, + { + "epoch": 6.9, + "grad_norm": 1.28125, + "learning_rate": 0.00043752324894229244, + "loss": 0.2122, + "step": 166670 + }, + { + "epoch": 6.9, + "grad_norm": 1.1328125, + "learning_rate": 0.0004375160764875029, + "loss": 0.1805, + "step": 166680 + }, + { + "epoch": 6.9, + "grad_norm": 1.53125, + "learning_rate": 0.0004375089036798285, + "loss": 0.234, + "step": 166690 + }, + { + "epoch": 6.9, + "grad_norm": 0.40625, + "learning_rate": 0.00043750173051928257, + "loss": 0.2001, + "step": 166700 + }, + { + "epoch": 6.91, + "grad_norm": 0.70703125, + "learning_rate": 0.00043749455700587846, + "loss": 0.2474, + "step": 166710 + }, + { + "epoch": 6.91, + "grad_norm": 0.26171875, + "learning_rate": 0.00043748738313963, + "loss": 0.2356, + "step": 166720 + }, + { + "epoch": 6.91, + "grad_norm": 0.7265625, + "learning_rate": 0.00043748020892055053, + "loss": 0.2477, + "step": 166730 + }, + { + "epoch": 6.91, + "grad_norm": 0.76953125, + "learning_rate": 0.0004374730343486535, + "loss": 0.2709, + "step": 166740 + }, + { + "epoch": 6.91, + "grad_norm": 0.55078125, + "learning_rate": 0.00043746585942395247, + "loss": 0.1439, + "step": 166750 + }, + { + "epoch": 6.91, + "grad_norm": 0.9609375, + "learning_rate": 0.00043745868414646094, + "loss": 0.2148, + "step": 166760 + }, + { + "epoch": 6.91, + "grad_norm": 0.88671875, + "learning_rate": 0.0004374515085161924, + "loss": 0.1874, + "step": 166770 + }, + { + "epoch": 6.91, + "grad_norm": 1.03125, + "learning_rate": 0.0004374443325331604, + "loss": 0.2418, + "step": 166780 + }, + { + "epoch": 6.91, + "grad_norm": 1.1171875, + "learning_rate": 0.00043743715619737843, + "loss": 0.209, + "step": 166790 + }, + { + "epoch": 6.91, + "grad_norm": 0.71875, + "learning_rate": 0.00043742997950885996, + "loss": 0.1892, + "step": 166800 + }, + { + "epoch": 6.91, + "grad_norm": 0.349609375, + "learning_rate": 0.0004374228024676185, + "loss": 0.2012, + "step": 166810 + }, + { + "epoch": 6.91, + "grad_norm": 0.8984375, + "learning_rate": 0.0004374156250736675, + "loss": 0.1873, + "step": 166820 + }, + { + "epoch": 6.91, + "grad_norm": 0.412109375, + "learning_rate": 0.00043740844732702056, + "loss": 0.2175, + "step": 166830 + }, + { + "epoch": 6.91, + "grad_norm": 0.640625, + "learning_rate": 0.0004374012692276912, + "loss": 0.2099, + "step": 166840 + }, + { + "epoch": 6.91, + "grad_norm": 0.283203125, + "learning_rate": 0.0004373940907756929, + "loss": 0.2147, + "step": 166850 + }, + { + "epoch": 6.91, + "grad_norm": 0.498046875, + "learning_rate": 0.0004373869119710391, + "loss": 0.1659, + "step": 166860 + }, + { + "epoch": 6.91, + "grad_norm": 1.4765625, + "learning_rate": 0.00043737973281374344, + "loss": 0.2216, + "step": 166870 + }, + { + "epoch": 6.91, + "grad_norm": 0.2099609375, + "learning_rate": 0.0004373725533038193, + "loss": 0.2076, + "step": 166880 + }, + { + "epoch": 6.91, + "grad_norm": 0.64453125, + "learning_rate": 0.00043736537344128023, + "loss": 0.2274, + "step": 166890 + }, + { + "epoch": 6.91, + "grad_norm": 0.419921875, + "learning_rate": 0.0004373581932261398, + "loss": 0.2487, + "step": 166900 + }, + { + "epoch": 6.91, + "grad_norm": 0.349609375, + "learning_rate": 0.0004373510126584115, + "loss": 0.2328, + "step": 166910 + }, + { + "epoch": 6.91, + "grad_norm": 0.77734375, + "learning_rate": 0.0004373438317381088, + "loss": 0.1934, + "step": 166920 + }, + { + "epoch": 6.91, + "grad_norm": 0.703125, + "learning_rate": 0.0004373366504652453, + "loss": 0.2154, + "step": 166930 + }, + { + "epoch": 6.91, + "grad_norm": 1.4296875, + "learning_rate": 0.0004373294688398344, + "loss": 0.2048, + "step": 166940 + }, + { + "epoch": 6.92, + "grad_norm": 0.373046875, + "learning_rate": 0.0004373222868618897, + "loss": 0.18, + "step": 166950 + }, + { + "epoch": 6.92, + "grad_norm": 0.376953125, + "learning_rate": 0.0004373151045314246, + "loss": 0.1683, + "step": 166960 + }, + { + "epoch": 6.92, + "grad_norm": 2.484375, + "learning_rate": 0.0004373079218484528, + "loss": 0.2376, + "step": 166970 + }, + { + "epoch": 6.92, + "grad_norm": 0.5078125, + "learning_rate": 0.00043730073881298757, + "loss": 0.208, + "step": 166980 + }, + { + "epoch": 6.92, + "grad_norm": 0.29296875, + "learning_rate": 0.0004372935554250427, + "loss": 0.2238, + "step": 166990 + }, + { + "epoch": 6.92, + "grad_norm": 1.703125, + "learning_rate": 0.0004372863716846316, + "loss": 0.1962, + "step": 167000 + }, + { + "epoch": 6.92, + "grad_norm": 0.55859375, + "learning_rate": 0.00043727918759176775, + "loss": 0.2044, + "step": 167010 + }, + { + "epoch": 6.92, + "grad_norm": 0.52734375, + "learning_rate": 0.00043727200314646464, + "loss": 0.2153, + "step": 167020 + }, + { + "epoch": 6.92, + "grad_norm": 0.91796875, + "learning_rate": 0.00043726481834873587, + "loss": 0.1967, + "step": 167030 + }, + { + "epoch": 6.92, + "grad_norm": 0.78125, + "learning_rate": 0.00043725763319859504, + "loss": 0.2141, + "step": 167040 + }, + { + "epoch": 6.92, + "grad_norm": 0.50390625, + "learning_rate": 0.0004372504476960555, + "loss": 0.1748, + "step": 167050 + }, + { + "epoch": 6.92, + "grad_norm": 0.68359375, + "learning_rate": 0.0004372432618411308, + "loss": 0.1924, + "step": 167060 + }, + { + "epoch": 6.92, + "grad_norm": 0.373046875, + "learning_rate": 0.0004372360756338345, + "loss": 0.2029, + "step": 167070 + }, + { + "epoch": 6.92, + "grad_norm": 0.8828125, + "learning_rate": 0.0004372288890741801, + "loss": 0.1707, + "step": 167080 + }, + { + "epoch": 6.92, + "grad_norm": 0.7578125, + "learning_rate": 0.00043722170216218127, + "loss": 0.1722, + "step": 167090 + }, + { + "epoch": 6.92, + "grad_norm": 0.4140625, + "learning_rate": 0.0004372145148978513, + "loss": 0.2264, + "step": 167100 + }, + { + "epoch": 6.92, + "grad_norm": 0.58984375, + "learning_rate": 0.0004372073272812038, + "loss": 0.2155, + "step": 167110 + }, + { + "epoch": 6.92, + "grad_norm": 0.57421875, + "learning_rate": 0.00043720013931225244, + "loss": 0.1982, + "step": 167120 + }, + { + "epoch": 6.92, + "grad_norm": 1.046875, + "learning_rate": 0.00043719295099101064, + "loss": 0.2152, + "step": 167130 + }, + { + "epoch": 6.92, + "grad_norm": 1.359375, + "learning_rate": 0.00043718576231749183, + "loss": 0.1956, + "step": 167140 + }, + { + "epoch": 6.92, + "grad_norm": 0.8203125, + "learning_rate": 0.00043717857329170966, + "loss": 0.1698, + "step": 167150 + }, + { + "epoch": 6.92, + "grad_norm": 0.74609375, + "learning_rate": 0.00043717138391367763, + "loss": 0.1943, + "step": 167160 + }, + { + "epoch": 6.92, + "grad_norm": 0.5234375, + "learning_rate": 0.00043716419418340935, + "loss": 0.1927, + "step": 167170 + }, + { + "epoch": 6.92, + "grad_norm": 1.546875, + "learning_rate": 0.0004371570041009182, + "loss": 0.1748, + "step": 167180 + }, + { + "epoch": 6.92, + "grad_norm": 0.7421875, + "learning_rate": 0.00043714981366621775, + "loss": 0.1776, + "step": 167190 + }, + { + "epoch": 6.93, + "grad_norm": 0.87109375, + "learning_rate": 0.00043714262287932154, + "loss": 0.1764, + "step": 167200 + }, + { + "epoch": 6.93, + "grad_norm": 0.2099609375, + "learning_rate": 0.0004371354317402432, + "loss": 0.1992, + "step": 167210 + }, + { + "epoch": 6.93, + "grad_norm": 0.72265625, + "learning_rate": 0.00043712824024899616, + "loss": 0.2323, + "step": 167220 + }, + { + "epoch": 6.93, + "grad_norm": 0.67578125, + "learning_rate": 0.000437121048405594, + "loss": 0.1756, + "step": 167230 + }, + { + "epoch": 6.93, + "grad_norm": 0.6015625, + "learning_rate": 0.00043711385621005017, + "loss": 0.1588, + "step": 167240 + }, + { + "epoch": 6.93, + "grad_norm": 1.984375, + "learning_rate": 0.00043710666366237835, + "loss": 0.2068, + "step": 167250 + }, + { + "epoch": 6.93, + "grad_norm": 0.640625, + "learning_rate": 0.0004370994707625919, + "loss": 0.1908, + "step": 167260 + }, + { + "epoch": 6.93, + "grad_norm": 0.72265625, + "learning_rate": 0.00043709227751070456, + "loss": 0.2465, + "step": 167270 + }, + { + "epoch": 6.93, + "grad_norm": 1.015625, + "learning_rate": 0.00043708508390672963, + "loss": 0.2194, + "step": 167280 + }, + { + "epoch": 6.93, + "grad_norm": 1.0, + "learning_rate": 0.0004370778899506809, + "loss": 0.216, + "step": 167290 + }, + { + "epoch": 6.93, + "grad_norm": 1.046875, + "learning_rate": 0.0004370706956425717, + "loss": 0.217, + "step": 167300 + }, + { + "epoch": 6.93, + "grad_norm": 0.67578125, + "learning_rate": 0.0004370635009824157, + "loss": 0.2429, + "step": 167310 + }, + { + "epoch": 6.93, + "grad_norm": 0.77734375, + "learning_rate": 0.0004370563059702264, + "loss": 0.2228, + "step": 167320 + }, + { + "epoch": 6.93, + "grad_norm": 1.1640625, + "learning_rate": 0.00043704911060601725, + "loss": 0.2323, + "step": 167330 + }, + { + "epoch": 6.93, + "grad_norm": 0.61328125, + "learning_rate": 0.000437041914889802, + "loss": 0.2132, + "step": 167340 + }, + { + "epoch": 6.93, + "grad_norm": 1.8125, + "learning_rate": 0.000437034718821594, + "loss": 0.2076, + "step": 167350 + }, + { + "epoch": 6.93, + "grad_norm": 0.279296875, + "learning_rate": 0.00043702752240140684, + "loss": 0.1805, + "step": 167360 + }, + { + "epoch": 6.93, + "grad_norm": 1.9765625, + "learning_rate": 0.00043702032562925407, + "loss": 0.1969, + "step": 167370 + }, + { + "epoch": 6.93, + "grad_norm": 2.25, + "learning_rate": 0.00043701312850514927, + "loss": 0.2099, + "step": 167380 + }, + { + "epoch": 6.93, + "grad_norm": 0.91015625, + "learning_rate": 0.00043700593102910605, + "loss": 0.2526, + "step": 167390 + }, + { + "epoch": 6.93, + "grad_norm": 0.80859375, + "learning_rate": 0.0004369987332011377, + "loss": 0.1817, + "step": 167400 + }, + { + "epoch": 6.93, + "grad_norm": 0.6796875, + "learning_rate": 0.000436991535021258, + "loss": 0.2068, + "step": 167410 + }, + { + "epoch": 6.93, + "grad_norm": 0.90234375, + "learning_rate": 0.00043698433648948045, + "loss": 0.1572, + "step": 167420 + }, + { + "epoch": 6.93, + "grad_norm": 0.83984375, + "learning_rate": 0.0004369771376058185, + "loss": 0.2034, + "step": 167430 + }, + { + "epoch": 6.94, + "grad_norm": 0.330078125, + "learning_rate": 0.0004369699383702859, + "loss": 0.1901, + "step": 167440 + }, + { + "epoch": 6.94, + "grad_norm": 0.453125, + "learning_rate": 0.00043696273878289597, + "loss": 0.1952, + "step": 167450 + }, + { + "epoch": 6.94, + "grad_norm": 0.94140625, + "learning_rate": 0.00043695553884366234, + "loss": 0.1921, + "step": 167460 + }, + { + "epoch": 6.94, + "grad_norm": 0.68359375, + "learning_rate": 0.0004369483385525986, + "loss": 0.185, + "step": 167470 + }, + { + "epoch": 6.94, + "grad_norm": 0.49609375, + "learning_rate": 0.00043694113790971825, + "loss": 0.2455, + "step": 167480 + }, + { + "epoch": 6.94, + "grad_norm": 0.0, + "learning_rate": 0.0004369339369150349, + "loss": 0.1962, + "step": 167490 + }, + { + "epoch": 6.94, + "grad_norm": 0.85546875, + "learning_rate": 0.0004369267355685621, + "loss": 0.2715, + "step": 167500 + }, + { + "epoch": 6.94, + "grad_norm": 0.68359375, + "learning_rate": 0.00043691953387031327, + "loss": 0.2284, + "step": 167510 + }, + { + "epoch": 6.94, + "grad_norm": 0.458984375, + "learning_rate": 0.0004369123318203021, + "loss": 0.2288, + "step": 167520 + }, + { + "epoch": 6.94, + "grad_norm": 0.734375, + "learning_rate": 0.0004369051294185422, + "loss": 0.1736, + "step": 167530 + }, + { + "epoch": 6.94, + "grad_norm": 0.470703125, + "learning_rate": 0.0004368979266650469, + "loss": 0.2305, + "step": 167540 + }, + { + "epoch": 6.94, + "grad_norm": 0.80859375, + "learning_rate": 0.00043689072355982994, + "loss": 0.2367, + "step": 167550 + }, + { + "epoch": 6.94, + "grad_norm": 0.87109375, + "learning_rate": 0.00043688352010290486, + "loss": 0.2412, + "step": 167560 + }, + { + "epoch": 6.94, + "grad_norm": 0.392578125, + "learning_rate": 0.00043687631629428514, + "loss": 0.2226, + "step": 167570 + }, + { + "epoch": 6.94, + "grad_norm": 0.310546875, + "learning_rate": 0.0004368691121339844, + "loss": 0.1872, + "step": 167580 + }, + { + "epoch": 6.94, + "grad_norm": 1.0546875, + "learning_rate": 0.00043686190762201615, + "loss": 0.2231, + "step": 167590 + }, + { + "epoch": 6.94, + "grad_norm": 0.57421875, + "learning_rate": 0.00043685470275839393, + "loss": 0.2054, + "step": 167600 + }, + { + "epoch": 6.94, + "grad_norm": 0.4375, + "learning_rate": 0.00043684749754313134, + "loss": 0.2221, + "step": 167610 + }, + { + "epoch": 6.94, + "grad_norm": 1.3125, + "learning_rate": 0.00043684029197624204, + "loss": 0.2506, + "step": 167620 + }, + { + "epoch": 6.94, + "grad_norm": 0.76171875, + "learning_rate": 0.0004368330860577394, + "loss": 0.2438, + "step": 167630 + }, + { + "epoch": 6.94, + "grad_norm": 1.1328125, + "learning_rate": 0.0004368258797876371, + "loss": 0.2541, + "step": 167640 + }, + { + "epoch": 6.94, + "grad_norm": 1.125, + "learning_rate": 0.00043681867316594866, + "loss": 0.2292, + "step": 167650 + }, + { + "epoch": 6.94, + "grad_norm": 0.87890625, + "learning_rate": 0.00043681146619268765, + "loss": 0.1596, + "step": 167660 + }, + { + "epoch": 6.94, + "grad_norm": 0.38671875, + "learning_rate": 0.0004368042588678676, + "loss": 0.2001, + "step": 167670 + }, + { + "epoch": 6.95, + "grad_norm": 0.640625, + "learning_rate": 0.00043679705119150215, + "loss": 0.273, + "step": 167680 + }, + { + "epoch": 6.95, + "grad_norm": 0.9296875, + "learning_rate": 0.00043678984316360484, + "loss": 0.2104, + "step": 167690 + }, + { + "epoch": 6.95, + "grad_norm": 1.125, + "learning_rate": 0.00043678263478418917, + "loss": 0.2552, + "step": 167700 + }, + { + "epoch": 6.95, + "grad_norm": 0.2890625, + "learning_rate": 0.0004367754260532688, + "loss": 0.2176, + "step": 167710 + }, + { + "epoch": 6.95, + "grad_norm": 1.359375, + "learning_rate": 0.0004367682169708572, + "loss": 0.2097, + "step": 167720 + }, + { + "epoch": 6.95, + "grad_norm": 0.439453125, + "learning_rate": 0.00043676100753696804, + "loss": 0.1992, + "step": 167730 + }, + { + "epoch": 6.95, + "grad_norm": 0.8359375, + "learning_rate": 0.0004367537977516148, + "loss": 0.185, + "step": 167740 + }, + { + "epoch": 6.95, + "grad_norm": 1.2421875, + "learning_rate": 0.000436746587614811, + "loss": 0.1849, + "step": 167750 + }, + { + "epoch": 6.95, + "grad_norm": 0.703125, + "learning_rate": 0.00043673937712657043, + "loss": 0.2189, + "step": 167760 + }, + { + "epoch": 6.95, + "grad_norm": 0.58984375, + "learning_rate": 0.0004367321662869065, + "loss": 0.182, + "step": 167770 + }, + { + "epoch": 6.95, + "grad_norm": 0.703125, + "learning_rate": 0.0004367249550958327, + "loss": 0.2052, + "step": 167780 + }, + { + "epoch": 6.95, + "grad_norm": 1.09375, + "learning_rate": 0.00043671774355336275, + "loss": 0.2242, + "step": 167790 + }, + { + "epoch": 6.95, + "grad_norm": 1.28125, + "learning_rate": 0.00043671053165951013, + "loss": 0.2036, + "step": 167800 + }, + { + "epoch": 6.95, + "grad_norm": 0.640625, + "learning_rate": 0.0004367033194142885, + "loss": 0.1531, + "step": 167810 + }, + { + "epoch": 6.95, + "grad_norm": 0.6328125, + "learning_rate": 0.00043669610681771145, + "loss": 0.1888, + "step": 167820 + }, + { + "epoch": 6.95, + "grad_norm": 0.474609375, + "learning_rate": 0.0004366888938697924, + "loss": 0.205, + "step": 167830 + }, + { + "epoch": 6.95, + "grad_norm": 1.0, + "learning_rate": 0.00043668168057054503, + "loss": 0.2135, + "step": 167840 + }, + { + "epoch": 6.95, + "grad_norm": 0.96484375, + "learning_rate": 0.0004366744669199829, + "loss": 0.1773, + "step": 167850 + }, + { + "epoch": 6.95, + "grad_norm": 0.85546875, + "learning_rate": 0.0004366672529181196, + "loss": 0.215, + "step": 167860 + }, + { + "epoch": 6.95, + "grad_norm": 0.57421875, + "learning_rate": 0.0004366600385649686, + "loss": 0.2417, + "step": 167870 + }, + { + "epoch": 6.95, + "grad_norm": 0.76953125, + "learning_rate": 0.00043665282386054374, + "loss": 0.1788, + "step": 167880 + }, + { + "epoch": 6.95, + "grad_norm": 1.15625, + "learning_rate": 0.00043664560880485824, + "loss": 0.2235, + "step": 167890 + }, + { + "epoch": 6.95, + "grad_norm": 0.65234375, + "learning_rate": 0.000436638393397926, + "loss": 0.2325, + "step": 167900 + }, + { + "epoch": 6.95, + "grad_norm": 0.4375, + "learning_rate": 0.00043663117763976037, + "loss": 0.1897, + "step": 167910 + }, + { + "epoch": 6.96, + "grad_norm": 0.46484375, + "learning_rate": 0.000436623961530375, + "loss": 0.1593, + "step": 167920 + }, + { + "epoch": 6.96, + "grad_norm": 0.361328125, + "learning_rate": 0.00043661674506978356, + "loss": 0.1918, + "step": 167930 + }, + { + "epoch": 6.96, + "grad_norm": 0.80078125, + "learning_rate": 0.00043660952825799954, + "loss": 0.2321, + "step": 167940 + }, + { + "epoch": 6.96, + "grad_norm": 1.25, + "learning_rate": 0.0004366023110950365, + "loss": 0.223, + "step": 167950 + }, + { + "epoch": 6.96, + "grad_norm": 1.296875, + "learning_rate": 0.0004365950935809081, + "loss": 0.1825, + "step": 167960 + }, + { + "epoch": 6.96, + "grad_norm": 0.51171875, + "learning_rate": 0.0004365878757156279, + "loss": 0.21, + "step": 167970 + }, + { + "epoch": 6.96, + "grad_norm": 1.546875, + "learning_rate": 0.0004365806574992094, + "loss": 0.2085, + "step": 167980 + }, + { + "epoch": 6.96, + "grad_norm": 0.345703125, + "learning_rate": 0.00043657343893166635, + "loss": 0.1951, + "step": 167990 + }, + { + "epoch": 6.96, + "grad_norm": 0.625, + "learning_rate": 0.00043656622001301214, + "loss": 0.2259, + "step": 168000 + }, + { + "epoch": 6.96, + "grad_norm": 0.53515625, + "learning_rate": 0.0004365590007432605, + "loss": 0.1657, + "step": 168010 + }, + { + "epoch": 6.96, + "grad_norm": 0.369140625, + "learning_rate": 0.000436551781122425, + "loss": 0.2273, + "step": 168020 + }, + { + "epoch": 6.96, + "grad_norm": 0.3125, + "learning_rate": 0.0004365445611505191, + "loss": 0.2021, + "step": 168030 + }, + { + "epoch": 6.96, + "grad_norm": 0.392578125, + "learning_rate": 0.0004365373408275566, + "loss": 0.2027, + "step": 168040 + }, + { + "epoch": 6.96, + "grad_norm": 0.427734375, + "learning_rate": 0.00043653012015355087, + "loss": 0.1715, + "step": 168050 + }, + { + "epoch": 6.96, + "grad_norm": 0.5234375, + "learning_rate": 0.0004365228991285156, + "loss": 0.257, + "step": 168060 + }, + { + "epoch": 6.96, + "grad_norm": 1.203125, + "learning_rate": 0.0004365156777524645, + "loss": 0.248, + "step": 168070 + }, + { + "epoch": 6.96, + "grad_norm": 0.65234375, + "learning_rate": 0.0004365084560254109, + "loss": 0.2309, + "step": 168080 + }, + { + "epoch": 6.96, + "grad_norm": 1.2578125, + "learning_rate": 0.0004365012339473686, + "loss": 0.1346, + "step": 168090 + }, + { + "epoch": 6.96, + "grad_norm": 1.1640625, + "learning_rate": 0.00043649401151835105, + "loss": 0.1866, + "step": 168100 + }, + { + "epoch": 6.96, + "grad_norm": 0.6171875, + "learning_rate": 0.00043648678873837196, + "loss": 0.173, + "step": 168110 + }, + { + "epoch": 6.96, + "grad_norm": 0.9609375, + "learning_rate": 0.00043647956560744487, + "loss": 0.2029, + "step": 168120 + }, + { + "epoch": 6.96, + "grad_norm": 1.03125, + "learning_rate": 0.0004364723421255833, + "loss": 0.214, + "step": 168130 + }, + { + "epoch": 6.96, + "grad_norm": 1.1796875, + "learning_rate": 0.00043646511829280104, + "loss": 0.2418, + "step": 168140 + }, + { + "epoch": 6.96, + "grad_norm": 0.478515625, + "learning_rate": 0.0004364578941091115, + "loss": 0.2066, + "step": 168150 + }, + { + "epoch": 6.97, + "grad_norm": 0.490234375, + "learning_rate": 0.0004364506695745283, + "loss": 0.1792, + "step": 168160 + }, + { + "epoch": 6.97, + "grad_norm": 0.51953125, + "learning_rate": 0.00043644344468906515, + "loss": 0.1955, + "step": 168170 + }, + { + "epoch": 6.97, + "grad_norm": 0.58984375, + "learning_rate": 0.0004364362194527356, + "loss": 0.2189, + "step": 168180 + }, + { + "epoch": 6.97, + "grad_norm": 0.58203125, + "learning_rate": 0.0004364289938655531, + "loss": 0.2317, + "step": 168190 + }, + { + "epoch": 6.97, + "grad_norm": 0.7578125, + "learning_rate": 0.00043642176792753145, + "loss": 0.2128, + "step": 168200 + }, + { + "epoch": 6.97, + "grad_norm": 0.80078125, + "learning_rate": 0.00043641454163868414, + "loss": 0.1968, + "step": 168210 + }, + { + "epoch": 6.97, + "grad_norm": 0.703125, + "learning_rate": 0.00043640731499902484, + "loss": 0.1312, + "step": 168220 + }, + { + "epoch": 6.97, + "grad_norm": 0.55859375, + "learning_rate": 0.00043640008800856705, + "loss": 0.2275, + "step": 168230 + }, + { + "epoch": 6.97, + "grad_norm": 0.41796875, + "learning_rate": 0.00043639286066732443, + "loss": 0.2142, + "step": 168240 + }, + { + "epoch": 6.97, + "grad_norm": 0.76171875, + "learning_rate": 0.0004363856329753106, + "loss": 0.1652, + "step": 168250 + }, + { + "epoch": 6.97, + "grad_norm": 0.390625, + "learning_rate": 0.0004363784049325391, + "loss": 0.2392, + "step": 168260 + }, + { + "epoch": 6.97, + "grad_norm": 0.6640625, + "learning_rate": 0.0004363711765390236, + "loss": 0.171, + "step": 168270 + }, + { + "epoch": 6.97, + "grad_norm": 0.73828125, + "learning_rate": 0.0004363639477947776, + "loss": 0.1825, + "step": 168280 + }, + { + "epoch": 6.97, + "grad_norm": 0.58984375, + "learning_rate": 0.0004363567186998149, + "loss": 0.2419, + "step": 168290 + }, + { + "epoch": 6.97, + "grad_norm": 0.77734375, + "learning_rate": 0.00043634948925414885, + "loss": 0.1956, + "step": 168300 + }, + { + "epoch": 6.97, + "grad_norm": 0.3203125, + "learning_rate": 0.00043634225945779324, + "loss": 0.2287, + "step": 168310 + }, + { + "epoch": 6.97, + "grad_norm": 0.640625, + "learning_rate": 0.0004363350293107617, + "loss": 0.1752, + "step": 168320 + }, + { + "epoch": 6.97, + "grad_norm": 0.9375, + "learning_rate": 0.00043632779881306775, + "loss": 0.2241, + "step": 168330 + }, + { + "epoch": 6.97, + "grad_norm": 0.91015625, + "learning_rate": 0.00043632056796472486, + "loss": 0.2162, + "step": 168340 + }, + { + "epoch": 6.97, + "grad_norm": 0.92578125, + "learning_rate": 0.00043631333676574693, + "loss": 0.2209, + "step": 168350 + }, + { + "epoch": 6.97, + "grad_norm": 0.8046875, + "learning_rate": 0.00043630610521614734, + "loss": 0.1966, + "step": 168360 + }, + { + "epoch": 6.97, + "grad_norm": 0.7890625, + "learning_rate": 0.00043629887331593975, + "loss": 0.1604, + "step": 168370 + }, + { + "epoch": 6.97, + "grad_norm": 0.62890625, + "learning_rate": 0.0004362916410651379, + "loss": 0.1708, + "step": 168380 + }, + { + "epoch": 6.97, + "grad_norm": 0.421875, + "learning_rate": 0.00043628440846375517, + "loss": 0.2107, + "step": 168390 + }, + { + "epoch": 6.98, + "grad_norm": 1.9453125, + "learning_rate": 0.0004362771755118054, + "loss": 0.2193, + "step": 168400 + }, + { + "epoch": 6.98, + "grad_norm": 0.4609375, + "learning_rate": 0.00043626994220930204, + "loss": 0.2341, + "step": 168410 + }, + { + "epoch": 6.98, + "grad_norm": 0.77734375, + "learning_rate": 0.00043626270855625884, + "loss": 0.2158, + "step": 168420 + }, + { + "epoch": 6.98, + "grad_norm": 1.1484375, + "learning_rate": 0.00043625547455268933, + "loss": 0.2051, + "step": 168430 + }, + { + "epoch": 6.98, + "grad_norm": 0.1962890625, + "learning_rate": 0.000436248240198607, + "loss": 0.2024, + "step": 168440 + }, + { + "epoch": 6.98, + "grad_norm": 0.546875, + "learning_rate": 0.0004362410054940258, + "loss": 0.2327, + "step": 168450 + }, + { + "epoch": 6.98, + "grad_norm": 0.1923828125, + "learning_rate": 0.00043623377043895895, + "loss": 0.1923, + "step": 168460 + }, + { + "epoch": 6.98, + "grad_norm": 0.69921875, + "learning_rate": 0.00043622653503342035, + "loss": 0.1638, + "step": 168470 + }, + { + "epoch": 6.98, + "grad_norm": 0.74609375, + "learning_rate": 0.00043621929927742354, + "loss": 0.1544, + "step": 168480 + }, + { + "epoch": 6.98, + "grad_norm": 0.38671875, + "learning_rate": 0.00043621206317098207, + "loss": 0.2362, + "step": 168490 + }, + { + "epoch": 6.98, + "grad_norm": 0.76171875, + "learning_rate": 0.00043620482671410965, + "loss": 0.2167, + "step": 168500 + }, + { + "epoch": 6.98, + "grad_norm": 1.09375, + "learning_rate": 0.0004361975899068198, + "loss": 0.2628, + "step": 168510 + }, + { + "epoch": 6.98, + "grad_norm": 3.109375, + "learning_rate": 0.0004361903527491262, + "loss": 0.1997, + "step": 168520 + }, + { + "epoch": 6.98, + "grad_norm": 0.30859375, + "learning_rate": 0.0004361831152410425, + "loss": 0.2308, + "step": 168530 + }, + { + "epoch": 6.98, + "grad_norm": 0.7421875, + "learning_rate": 0.00043617587738258224, + "loss": 0.2531, + "step": 168540 + }, + { + "epoch": 6.98, + "grad_norm": 1.40625, + "learning_rate": 0.00043616863917375913, + "loss": 0.1965, + "step": 168550 + }, + { + "epoch": 6.98, + "grad_norm": 0.80078125, + "learning_rate": 0.00043616140061458677, + "loss": 0.2366, + "step": 168560 + }, + { + "epoch": 6.98, + "grad_norm": 0.66015625, + "learning_rate": 0.0004361541617050787, + "loss": 0.2022, + "step": 168570 + }, + { + "epoch": 6.98, + "grad_norm": 0.71484375, + "learning_rate": 0.0004361469224452486, + "loss": 0.2031, + "step": 168580 + }, + { + "epoch": 6.98, + "grad_norm": 0.44921875, + "learning_rate": 0.00043613968283511006, + "loss": 0.2626, + "step": 168590 + }, + { + "epoch": 6.98, + "grad_norm": 0.6015625, + "learning_rate": 0.00043613244287467686, + "loss": 0.2007, + "step": 168600 + }, + { + "epoch": 6.98, + "grad_norm": 1.125, + "learning_rate": 0.00043612520256396245, + "loss": 0.1931, + "step": 168610 + }, + { + "epoch": 6.98, + "grad_norm": 0.80859375, + "learning_rate": 0.00043611796190298047, + "loss": 0.2072, + "step": 168620 + }, + { + "epoch": 6.98, + "grad_norm": 1.6484375, + "learning_rate": 0.0004361107208917446, + "loss": 0.2285, + "step": 168630 + }, + { + "epoch": 6.99, + "grad_norm": 0.423828125, + "learning_rate": 0.00043610347953026843, + "loss": 0.1604, + "step": 168640 + }, + { + "epoch": 6.99, + "grad_norm": 0.83203125, + "learning_rate": 0.00043609623781856564, + "loss": 0.2156, + "step": 168650 + }, + { + "epoch": 6.99, + "grad_norm": 0.62890625, + "learning_rate": 0.0004360889957566498, + "loss": 0.1765, + "step": 168660 + }, + { + "epoch": 6.99, + "grad_norm": 0.61328125, + "learning_rate": 0.0004360817533445346, + "loss": 0.178, + "step": 168670 + }, + { + "epoch": 6.99, + "grad_norm": 1.3125, + "learning_rate": 0.00043607451058223366, + "loss": 0.2466, + "step": 168680 + }, + { + "epoch": 6.99, + "grad_norm": 0.55078125, + "learning_rate": 0.00043606726746976053, + "loss": 0.2031, + "step": 168690 + }, + { + "epoch": 6.99, + "grad_norm": 0.61328125, + "learning_rate": 0.00043606002400712896, + "loss": 0.2632, + "step": 168700 + }, + { + "epoch": 6.99, + "grad_norm": 0.75, + "learning_rate": 0.00043605278019435246, + "loss": 0.2416, + "step": 168710 + }, + { + "epoch": 6.99, + "grad_norm": 0.4375, + "learning_rate": 0.0004360455360314447, + "loss": 0.2353, + "step": 168720 + }, + { + "epoch": 6.99, + "grad_norm": 0.431640625, + "learning_rate": 0.0004360382915184194, + "loss": 0.1901, + "step": 168730 + }, + { + "epoch": 6.99, + "grad_norm": 0.7421875, + "learning_rate": 0.00043603104665529, + "loss": 0.2017, + "step": 168740 + }, + { + "epoch": 6.99, + "grad_norm": 1.0546875, + "learning_rate": 0.00043602380144207043, + "loss": 0.2235, + "step": 168750 + }, + { + "epoch": 6.99, + "grad_norm": 0.8359375, + "learning_rate": 0.000436016555878774, + "loss": 0.1853, + "step": 168760 + }, + { + "epoch": 6.99, + "grad_norm": 0.5703125, + "learning_rate": 0.00043600930996541467, + "loss": 0.2288, + "step": 168770 + }, + { + "epoch": 6.99, + "grad_norm": 0.51171875, + "learning_rate": 0.0004360020637020058, + "loss": 0.2394, + "step": 168780 + }, + { + "epoch": 6.99, + "grad_norm": 0.70703125, + "learning_rate": 0.00043599481708856113, + "loss": 0.2021, + "step": 168790 + }, + { + "epoch": 6.99, + "grad_norm": 0.83203125, + "learning_rate": 0.00043598757012509425, + "loss": 0.2258, + "step": 168800 + }, + { + "epoch": 6.99, + "grad_norm": 0.59765625, + "learning_rate": 0.000435980322811619, + "loss": 0.2229, + "step": 168810 + }, + { + "epoch": 6.99, + "grad_norm": 1.0625, + "learning_rate": 0.00043597307514814875, + "loss": 0.1941, + "step": 168820 + }, + { + "epoch": 6.99, + "grad_norm": 0.87109375, + "learning_rate": 0.00043596582713469723, + "loss": 0.217, + "step": 168830 + }, + { + "epoch": 6.99, + "grad_norm": 0.5390625, + "learning_rate": 0.0004359585787712782, + "loss": 0.1767, + "step": 168840 + }, + { + "epoch": 6.99, + "grad_norm": 0.52734375, + "learning_rate": 0.00043595133005790523, + "loss": 0.1419, + "step": 168850 + }, + { + "epoch": 6.99, + "grad_norm": 0.96484375, + "learning_rate": 0.0004359440809945918, + "loss": 0.1329, + "step": 168860 + }, + { + "epoch": 6.99, + "grad_norm": 1.3046875, + "learning_rate": 0.0004359368315813518, + "loss": 0.1853, + "step": 168870 + }, + { + "epoch": 6.99, + "grad_norm": 1.671875, + "learning_rate": 0.00043592958181819865, + "loss": 0.2236, + "step": 168880 + }, + { + "epoch": 7.0, + "grad_norm": 0.2314453125, + "learning_rate": 0.0004359223317051462, + "loss": 0.2161, + "step": 168890 + }, + { + "epoch": 7.0, + "grad_norm": 0.40625, + "learning_rate": 0.00043591508124220805, + "loss": 0.1471, + "step": 168900 + }, + { + "epoch": 7.0, + "grad_norm": 1.0546875, + "learning_rate": 0.0004359078304293977, + "loss": 0.1561, + "step": 168910 + }, + { + "epoch": 7.0, + "grad_norm": 0.3515625, + "learning_rate": 0.0004359005792667289, + "loss": 0.2504, + "step": 168920 + }, + { + "epoch": 7.0, + "grad_norm": 1.0703125, + "learning_rate": 0.0004358933277542153, + "loss": 0.2147, + "step": 168930 + }, + { + "epoch": 7.0, + "grad_norm": 0.7109375, + "learning_rate": 0.00043588607589187055, + "loss": 0.2467, + "step": 168940 + }, + { + "epoch": 7.0, + "grad_norm": 0.5625, + "learning_rate": 0.0004358788236797082, + "loss": 0.2081, + "step": 168950 + }, + { + "epoch": 7.0, + "grad_norm": 2.21875, + "learning_rate": 0.0004358715711177421, + "loss": 0.2514, + "step": 168960 + }, + { + "epoch": 7.0, + "grad_norm": 0.158203125, + "learning_rate": 0.0004358643182059857, + "loss": 0.1524, + "step": 168970 + }, + { + "epoch": 7.0, + "grad_norm": 0.609375, + "learning_rate": 0.00043585706494445275, + "loss": 0.2248, + "step": 168980 + }, + { + "epoch": 7.0, + "grad_norm": 0.66796875, + "learning_rate": 0.00043584981133315683, + "loss": 0.2344, + "step": 168990 + }, + { + "epoch": 7.0, + "grad_norm": 0.66796875, + "learning_rate": 0.00043584255737211166, + "loss": 0.1869, + "step": 169000 + }, + { + "epoch": 7.0, + "grad_norm": 0.75390625, + "learning_rate": 0.0004358353030613309, + "loss": 0.2043, + "step": 169010 + }, + { + "epoch": 7.0, + "grad_norm": 0.6328125, + "learning_rate": 0.00043582804840082813, + "loss": 0.1911, + "step": 169020 + }, + { + "epoch": 7.0, + "grad_norm": 0.8828125, + "learning_rate": 0.0004358207933906171, + "loss": 0.2083, + "step": 169030 + }, + { + "epoch": 7.0, + "grad_norm": 0.302734375, + "learning_rate": 0.00043581353803071136, + "loss": 0.2291, + "step": 169040 + }, + { + "epoch": 7.0, + "grad_norm": 0.185546875, + "learning_rate": 0.00043580628232112455, + "loss": 0.2559, + "step": 169050 + }, + { + "epoch": 7.0, + "grad_norm": 0.33203125, + "learning_rate": 0.0004357990262618704, + "loss": 0.1638, + "step": 169060 + }, + { + "epoch": 7.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00043579176985296267, + "loss": 0.2216, + "step": 169070 + }, + { + "epoch": 7.0, + "grad_norm": 0.322265625, + "learning_rate": 0.00043578451309441476, + "loss": 0.2102, + "step": 169080 + }, + { + "epoch": 7.0, + "grad_norm": 0.85546875, + "learning_rate": 0.0004357772559862405, + "loss": 0.215, + "step": 169090 + }, + { + "epoch": 7.0, + "grad_norm": 0.796875, + "learning_rate": 0.0004357699985284535, + "loss": 0.2094, + "step": 169100 + }, + { + "epoch": 7.0, + "grad_norm": 1.0, + "learning_rate": 0.00043576274072106746, + "loss": 0.1783, + "step": 169110 + }, + { + "epoch": 7.0, + "grad_norm": 0.462890625, + "learning_rate": 0.00043575548256409596, + "loss": 0.1491, + "step": 169120 + }, + { + "epoch": 7.01, + "grad_norm": 1.6953125, + "learning_rate": 0.00043574822405755275, + "loss": 0.2171, + "step": 169130 + }, + { + "epoch": 7.01, + "grad_norm": 0.76953125, + "learning_rate": 0.0004357409652014514, + "loss": 0.2583, + "step": 169140 + }, + { + "epoch": 7.01, + "grad_norm": 0.6640625, + "learning_rate": 0.00043573370599580565, + "loss": 0.1804, + "step": 169150 + }, + { + "epoch": 7.01, + "grad_norm": 1.078125, + "learning_rate": 0.0004357264464406291, + "loss": 0.2085, + "step": 169160 + }, + { + "epoch": 7.01, + "grad_norm": 0.47265625, + "learning_rate": 0.0004357191865359354, + "loss": 0.2034, + "step": 169170 + }, + { + "epoch": 7.01, + "grad_norm": 0.259765625, + "learning_rate": 0.0004357119262817383, + "loss": 0.1766, + "step": 169180 + }, + { + "epoch": 7.01, + "grad_norm": 1.0625, + "learning_rate": 0.00043570466567805134, + "loss": 0.1465, + "step": 169190 + }, + { + "epoch": 7.01, + "grad_norm": 0.40625, + "learning_rate": 0.00043569740472488834, + "loss": 0.2185, + "step": 169200 + }, + { + "epoch": 7.01, + "grad_norm": 0.765625, + "learning_rate": 0.00043569014342226286, + "loss": 0.2336, + "step": 169210 + }, + { + "epoch": 7.01, + "grad_norm": 0.61328125, + "learning_rate": 0.00043568288177018856, + "loss": 0.2121, + "step": 169220 + }, + { + "epoch": 7.01, + "grad_norm": 0.61328125, + "learning_rate": 0.0004356756197686791, + "loss": 0.2053, + "step": 169230 + }, + { + "epoch": 7.01, + "grad_norm": 0.765625, + "learning_rate": 0.00043566835741774823, + "loss": 0.2319, + "step": 169240 + }, + { + "epoch": 7.01, + "grad_norm": 1.2890625, + "learning_rate": 0.00043566109471740954, + "loss": 0.1609, + "step": 169250 + }, + { + "epoch": 7.01, + "grad_norm": 0.59765625, + "learning_rate": 0.0004356538316676767, + "loss": 0.1897, + "step": 169260 + }, + { + "epoch": 7.01, + "grad_norm": 0.322265625, + "learning_rate": 0.00043564656826856343, + "loss": 0.1825, + "step": 169270 + }, + { + "epoch": 7.01, + "grad_norm": 0.96875, + "learning_rate": 0.00043563930452008334, + "loss": 0.258, + "step": 169280 + }, + { + "epoch": 7.01, + "grad_norm": 0.78125, + "learning_rate": 0.0004356320404222501, + "loss": 0.1766, + "step": 169290 + }, + { + "epoch": 7.01, + "grad_norm": 1.4140625, + "learning_rate": 0.00043562477597507745, + "loss": 0.1797, + "step": 169300 + }, + { + "epoch": 7.01, + "grad_norm": 0.94921875, + "learning_rate": 0.000435617511178579, + "loss": 0.233, + "step": 169310 + }, + { + "epoch": 7.01, + "grad_norm": 0.169921875, + "learning_rate": 0.00043561024603276844, + "loss": 0.2032, + "step": 169320 + }, + { + "epoch": 7.01, + "grad_norm": 1.09375, + "learning_rate": 0.0004356029805376595, + "loss": 0.1748, + "step": 169330 + }, + { + "epoch": 7.01, + "grad_norm": 0.3046875, + "learning_rate": 0.00043559571469326577, + "loss": 0.1841, + "step": 169340 + }, + { + "epoch": 7.01, + "grad_norm": 0.400390625, + "learning_rate": 0.0004355884484996009, + "loss": 0.2126, + "step": 169350 + }, + { + "epoch": 7.01, + "grad_norm": 0.671875, + "learning_rate": 0.0004355811819566786, + "loss": 0.2202, + "step": 169360 + }, + { + "epoch": 7.02, + "grad_norm": 1.1015625, + "learning_rate": 0.0004355739150645126, + "loss": 0.2165, + "step": 169370 + }, + { + "epoch": 7.02, + "grad_norm": 0.74609375, + "learning_rate": 0.0004355666478231165, + "loss": 0.2326, + "step": 169380 + }, + { + "epoch": 7.02, + "grad_norm": 0.55859375, + "learning_rate": 0.0004355593802325041, + "loss": 0.2158, + "step": 169390 + }, + { + "epoch": 7.02, + "grad_norm": 0.546875, + "learning_rate": 0.0004355521122926889, + "loss": 0.1802, + "step": 169400 + }, + { + "epoch": 7.02, + "grad_norm": 0.5859375, + "learning_rate": 0.00043554484400368465, + "loss": 0.1999, + "step": 169410 + }, + { + "epoch": 7.02, + "grad_norm": 0.546875, + "learning_rate": 0.0004355375753655051, + "loss": 0.156, + "step": 169420 + }, + { + "epoch": 7.02, + "grad_norm": 0.369140625, + "learning_rate": 0.00043553030637816383, + "loss": 0.1775, + "step": 169430 + }, + { + "epoch": 7.02, + "grad_norm": 1.0078125, + "learning_rate": 0.0004355230370416746, + "loss": 0.1793, + "step": 169440 + }, + { + "epoch": 7.02, + "grad_norm": 0.0, + "learning_rate": 0.000435515767356051, + "loss": 0.2395, + "step": 169450 + }, + { + "epoch": 7.02, + "grad_norm": 0.96484375, + "learning_rate": 0.0004355084973213068, + "loss": 0.2002, + "step": 169460 + }, + { + "epoch": 7.02, + "grad_norm": 0.44921875, + "learning_rate": 0.0004355012269374555, + "loss": 0.2032, + "step": 169470 + }, + { + "epoch": 7.02, + "grad_norm": 0.6484375, + "learning_rate": 0.00043549395620451115, + "loss": 0.1762, + "step": 169480 + }, + { + "epoch": 7.02, + "grad_norm": 0.58984375, + "learning_rate": 0.0004354866851224871, + "loss": 0.2134, + "step": 169490 + }, + { + "epoch": 7.02, + "grad_norm": 0.9765625, + "learning_rate": 0.0004354794136913971, + "loss": 0.1981, + "step": 169500 + }, + { + "epoch": 7.02, + "grad_norm": 0.73046875, + "learning_rate": 0.0004354721419112549, + "loss": 0.1886, + "step": 169510 + }, + { + "epoch": 7.02, + "grad_norm": 0.2890625, + "learning_rate": 0.0004354648697820742, + "loss": 0.2107, + "step": 169520 + }, + { + "epoch": 7.02, + "grad_norm": 0.92578125, + "learning_rate": 0.00043545759730386857, + "loss": 0.2279, + "step": 169530 + }, + { + "epoch": 7.02, + "grad_norm": 0.6328125, + "learning_rate": 0.00043545032447665186, + "loss": 0.1862, + "step": 169540 + }, + { + "epoch": 7.02, + "grad_norm": 1.75, + "learning_rate": 0.0004354430513004376, + "loss": 0.2311, + "step": 169550 + }, + { + "epoch": 7.02, + "grad_norm": 0.81640625, + "learning_rate": 0.0004354357777752396, + "loss": 0.1678, + "step": 169560 + }, + { + "epoch": 7.02, + "grad_norm": 0.63671875, + "learning_rate": 0.00043542850390107144, + "loss": 0.2253, + "step": 169570 + }, + { + "epoch": 7.02, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004354212296779468, + "loss": 0.1781, + "step": 169580 + }, + { + "epoch": 7.02, + "grad_norm": 0.703125, + "learning_rate": 0.00043541395510587953, + "loss": 0.2155, + "step": 169590 + }, + { + "epoch": 7.02, + "grad_norm": 1.484375, + "learning_rate": 0.00043540668018488324, + "loss": 0.1886, + "step": 169600 + }, + { + "epoch": 7.03, + "grad_norm": 0.5234375, + "learning_rate": 0.00043539940491497156, + "loss": 0.2362, + "step": 169610 + }, + { + "epoch": 7.03, + "grad_norm": 1.109375, + "learning_rate": 0.0004353921292961582, + "loss": 0.2119, + "step": 169620 + }, + { + "epoch": 7.03, + "grad_norm": 0.93359375, + "learning_rate": 0.00043538485332845686, + "loss": 0.2053, + "step": 169630 + }, + { + "epoch": 7.03, + "grad_norm": 0.76953125, + "learning_rate": 0.0004353775770118813, + "loss": 0.2167, + "step": 169640 + }, + { + "epoch": 7.03, + "grad_norm": 0.8125, + "learning_rate": 0.0004353703003464451, + "loss": 0.1533, + "step": 169650 + }, + { + "epoch": 7.03, + "grad_norm": 0.71484375, + "learning_rate": 0.000435363023332162, + "loss": 0.2185, + "step": 169660 + }, + { + "epoch": 7.03, + "grad_norm": 0.58984375, + "learning_rate": 0.0004353557459690458, + "loss": 0.2259, + "step": 169670 + }, + { + "epoch": 7.03, + "grad_norm": 0.5859375, + "learning_rate": 0.00043534846825711007, + "loss": 0.1809, + "step": 169680 + }, + { + "epoch": 7.03, + "grad_norm": 0.10009765625, + "learning_rate": 0.00043534119019636853, + "loss": 0.1844, + "step": 169690 + }, + { + "epoch": 7.03, + "grad_norm": 0.9609375, + "learning_rate": 0.0004353339117868349, + "loss": 0.2217, + "step": 169700 + }, + { + "epoch": 7.03, + "grad_norm": 0.47265625, + "learning_rate": 0.0004353266330285228, + "loss": 0.1579, + "step": 169710 + }, + { + "epoch": 7.03, + "grad_norm": 0.47265625, + "learning_rate": 0.00043531935392144606, + "loss": 0.2004, + "step": 169720 + }, + { + "epoch": 7.03, + "grad_norm": 1.328125, + "learning_rate": 0.0004353120744656183, + "loss": 0.1402, + "step": 169730 + }, + { + "epoch": 7.03, + "grad_norm": 0.7109375, + "learning_rate": 0.0004353047946610532, + "loss": 0.1735, + "step": 169740 + }, + { + "epoch": 7.03, + "grad_norm": 0.74609375, + "learning_rate": 0.0004352975145077644, + "loss": 0.2191, + "step": 169750 + }, + { + "epoch": 7.03, + "grad_norm": 0.875, + "learning_rate": 0.00043529023400576584, + "loss": 0.233, + "step": 169760 + }, + { + "epoch": 7.03, + "grad_norm": 0.62109375, + "learning_rate": 0.00043528295315507104, + "loss": 0.2011, + "step": 169770 + }, + { + "epoch": 7.03, + "grad_norm": 1.328125, + "learning_rate": 0.00043527567195569367, + "loss": 0.1829, + "step": 169780 + }, + { + "epoch": 7.03, + "grad_norm": 0.67578125, + "learning_rate": 0.00043526839040764756, + "loss": 0.1734, + "step": 169790 + }, + { + "epoch": 7.03, + "grad_norm": 0.263671875, + "learning_rate": 0.00043526110851094626, + "loss": 0.1467, + "step": 169800 + }, + { + "epoch": 7.03, + "grad_norm": 0.69140625, + "learning_rate": 0.0004352538262656036, + "loss": 0.1989, + "step": 169810 + }, + { + "epoch": 7.03, + "grad_norm": 0.5234375, + "learning_rate": 0.0004352465436716333, + "loss": 0.1969, + "step": 169820 + }, + { + "epoch": 7.03, + "grad_norm": 0.59765625, + "learning_rate": 0.00043523926072904894, + "loss": 0.1948, + "step": 169830 + }, + { + "epoch": 7.03, + "grad_norm": 0.62109375, + "learning_rate": 0.0004352319774378644, + "loss": 0.2131, + "step": 169840 + }, + { + "epoch": 7.04, + "grad_norm": 0.73046875, + "learning_rate": 0.00043522469379809315, + "loss": 0.2136, + "step": 169850 + }, + { + "epoch": 7.04, + "grad_norm": 0.8203125, + "learning_rate": 0.00043521740980974913, + "loss": 0.2002, + "step": 169860 + }, + { + "epoch": 7.04, + "grad_norm": 0.54296875, + "learning_rate": 0.0004352101254728459, + "loss": 0.2017, + "step": 169870 + }, + { + "epoch": 7.04, + "grad_norm": 0.88671875, + "learning_rate": 0.0004352028407873972, + "loss": 0.1922, + "step": 169880 + }, + { + "epoch": 7.04, + "grad_norm": 0.53125, + "learning_rate": 0.00043519555575341674, + "loss": 0.2361, + "step": 169890 + }, + { + "epoch": 7.04, + "grad_norm": 0.57421875, + "learning_rate": 0.0004351882703709183, + "loss": 0.1976, + "step": 169900 + }, + { + "epoch": 7.04, + "grad_norm": 0.5390625, + "learning_rate": 0.0004351809846399155, + "loss": 0.1837, + "step": 169910 + }, + { + "epoch": 7.04, + "grad_norm": 0.73828125, + "learning_rate": 0.0004351736985604221, + "loss": 0.2361, + "step": 169920 + }, + { + "epoch": 7.04, + "grad_norm": 0.439453125, + "learning_rate": 0.00043516641213245177, + "loss": 0.2259, + "step": 169930 + }, + { + "epoch": 7.04, + "grad_norm": 0.49609375, + "learning_rate": 0.0004351591253560183, + "loss": 0.233, + "step": 169940 + }, + { + "epoch": 7.04, + "grad_norm": 0.7578125, + "learning_rate": 0.0004351518382311353, + "loss": 0.1561, + "step": 169950 + }, + { + "epoch": 7.04, + "grad_norm": 0.6875, + "learning_rate": 0.00043514455075781657, + "loss": 0.2105, + "step": 169960 + }, + { + "epoch": 7.04, + "grad_norm": 0.78125, + "learning_rate": 0.00043513726293607574, + "loss": 0.2041, + "step": 169970 + }, + { + "epoch": 7.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00043512997476592664, + "loss": 0.1651, + "step": 169980 + }, + { + "epoch": 7.04, + "grad_norm": 0.4140625, + "learning_rate": 0.00043512268624738284, + "loss": 0.1928, + "step": 169990 + }, + { + "epoch": 7.04, + "grad_norm": 0.67578125, + "learning_rate": 0.0004351153973804582, + "loss": 0.1849, + "step": 170000 + }, + { + "epoch": 7.04, + "grad_norm": 0.609375, + "learning_rate": 0.00043510810816516635, + "loss": 0.1931, + "step": 170010 + }, + { + "epoch": 7.04, + "grad_norm": 0.90234375, + "learning_rate": 0.00043510081860152105, + "loss": 0.1968, + "step": 170020 + }, + { + "epoch": 7.04, + "grad_norm": 0.67578125, + "learning_rate": 0.000435093528689536, + "loss": 0.2402, + "step": 170030 + }, + { + "epoch": 7.04, + "grad_norm": 0.6171875, + "learning_rate": 0.0004350862384292249, + "loss": 0.2186, + "step": 170040 + }, + { + "epoch": 7.04, + "grad_norm": 0.58984375, + "learning_rate": 0.0004350789478206015, + "loss": 0.1864, + "step": 170050 + }, + { + "epoch": 7.04, + "grad_norm": 0.67578125, + "learning_rate": 0.0004350716568636794, + "loss": 0.1974, + "step": 170060 + }, + { + "epoch": 7.04, + "grad_norm": 0.61328125, + "learning_rate": 0.00043506436555847263, + "loss": 0.1943, + "step": 170070 + }, + { + "epoch": 7.04, + "grad_norm": 0.98828125, + "learning_rate": 0.00043505707390499457, + "loss": 0.2309, + "step": 170080 + }, + { + "epoch": 7.05, + "grad_norm": 0.87890625, + "learning_rate": 0.0004350497819032591, + "loss": 0.2022, + "step": 170090 + }, + { + "epoch": 7.05, + "grad_norm": 0.4609375, + "learning_rate": 0.00043504248955328, + "loss": 0.1877, + "step": 170100 + }, + { + "epoch": 7.05, + "grad_norm": 0.96484375, + "learning_rate": 0.0004350351968550708, + "loss": 0.161, + "step": 170110 + }, + { + "epoch": 7.05, + "grad_norm": 0.66796875, + "learning_rate": 0.00043502790380864543, + "loss": 0.1628, + "step": 170120 + }, + { + "epoch": 7.05, + "grad_norm": 1.8828125, + "learning_rate": 0.00043502061041401745, + "loss": 0.2034, + "step": 170130 + }, + { + "epoch": 7.05, + "grad_norm": 0.384765625, + "learning_rate": 0.0004350133166712007, + "loss": 0.1843, + "step": 170140 + }, + { + "epoch": 7.05, + "grad_norm": 1.25, + "learning_rate": 0.0004350060225802089, + "loss": 0.218, + "step": 170150 + }, + { + "epoch": 7.05, + "grad_norm": 0.65234375, + "learning_rate": 0.00043499872814105567, + "loss": 0.1965, + "step": 170160 + }, + { + "epoch": 7.05, + "grad_norm": 0.59765625, + "learning_rate": 0.0004349914333537549, + "loss": 0.263, + "step": 170170 + }, + { + "epoch": 7.05, + "grad_norm": 0.6640625, + "learning_rate": 0.00043498413821832017, + "loss": 0.2237, + "step": 170180 + }, + { + "epoch": 7.05, + "grad_norm": 0.51953125, + "learning_rate": 0.0004349768427347653, + "loss": 0.2122, + "step": 170190 + }, + { + "epoch": 7.05, + "grad_norm": 0.4140625, + "learning_rate": 0.00043496954690310397, + "loss": 0.1817, + "step": 170200 + }, + { + "epoch": 7.05, + "grad_norm": 0.2353515625, + "learning_rate": 0.00043496225072334996, + "loss": 0.1719, + "step": 170210 + }, + { + "epoch": 7.05, + "grad_norm": 0.62109375, + "learning_rate": 0.0004349549541955169, + "loss": 0.2468, + "step": 170220 + }, + { + "epoch": 7.05, + "grad_norm": 1.1953125, + "learning_rate": 0.00043494765731961864, + "loss": 0.1976, + "step": 170230 + }, + { + "epoch": 7.05, + "grad_norm": 1.6015625, + "learning_rate": 0.00043494036009566887, + "loss": 0.1767, + "step": 170240 + }, + { + "epoch": 7.05, + "grad_norm": 0.439453125, + "learning_rate": 0.0004349330625236813, + "loss": 0.1929, + "step": 170250 + }, + { + "epoch": 7.05, + "grad_norm": 0.451171875, + "learning_rate": 0.00043492576460366967, + "loss": 0.2199, + "step": 170260 + }, + { + "epoch": 7.05, + "grad_norm": 0.5703125, + "learning_rate": 0.0004349184663356477, + "loss": 0.21, + "step": 170270 + }, + { + "epoch": 7.05, + "grad_norm": 0.75, + "learning_rate": 0.0004349111677196292, + "loss": 0.2441, + "step": 170280 + }, + { + "epoch": 7.05, + "grad_norm": 0.4453125, + "learning_rate": 0.0004349038687556278, + "loss": 0.2089, + "step": 170290 + }, + { + "epoch": 7.05, + "grad_norm": 2.078125, + "learning_rate": 0.0004348965694436573, + "loss": 0.1836, + "step": 170300 + }, + { + "epoch": 7.05, + "grad_norm": 0.4296875, + "learning_rate": 0.0004348892697837314, + "loss": 0.208, + "step": 170310 + }, + { + "epoch": 7.05, + "grad_norm": 0.314453125, + "learning_rate": 0.0004348819697758639, + "loss": 0.1849, + "step": 170320 + }, + { + "epoch": 7.06, + "grad_norm": 0.80859375, + "learning_rate": 0.0004348746694200686, + "loss": 0.186, + "step": 170330 + }, + { + "epoch": 7.06, + "grad_norm": 0.71875, + "learning_rate": 0.000434867368716359, + "loss": 0.2599, + "step": 170340 + }, + { + "epoch": 7.06, + "grad_norm": 0.216796875, + "learning_rate": 0.00043486006766474893, + "loss": 0.1596, + "step": 170350 + }, + { + "epoch": 7.06, + "grad_norm": 0.7421875, + "learning_rate": 0.00043485276626525226, + "loss": 0.2047, + "step": 170360 + }, + { + "epoch": 7.06, + "grad_norm": 0.6953125, + "learning_rate": 0.00043484546451788266, + "loss": 0.1994, + "step": 170370 + }, + { + "epoch": 7.06, + "grad_norm": 0.90234375, + "learning_rate": 0.00043483816242265383, + "loss": 0.1886, + "step": 170380 + }, + { + "epoch": 7.06, + "grad_norm": 1.6640625, + "learning_rate": 0.00043483085997957956, + "loss": 0.2595, + "step": 170390 + }, + { + "epoch": 7.06, + "grad_norm": 0.55078125, + "learning_rate": 0.00043482355718867354, + "loss": 0.2435, + "step": 170400 + }, + { + "epoch": 7.06, + "grad_norm": 0.59765625, + "learning_rate": 0.00043481625404994957, + "loss": 0.2033, + "step": 170410 + }, + { + "epoch": 7.06, + "grad_norm": 0.41796875, + "learning_rate": 0.0004348089505634214, + "loss": 0.222, + "step": 170420 + }, + { + "epoch": 7.06, + "grad_norm": 0.515625, + "learning_rate": 0.0004348016467291027, + "loss": 0.175, + "step": 170430 + }, + { + "epoch": 7.06, + "grad_norm": 0.98828125, + "learning_rate": 0.0004347943425470072, + "loss": 0.1568, + "step": 170440 + }, + { + "epoch": 7.06, + "grad_norm": 1.3984375, + "learning_rate": 0.0004347870380171488, + "loss": 0.1445, + "step": 170450 + }, + { + "epoch": 7.06, + "grad_norm": 0.63671875, + "learning_rate": 0.0004347797331395411, + "loss": 0.2744, + "step": 170460 + }, + { + "epoch": 7.06, + "grad_norm": 0.64453125, + "learning_rate": 0.00043477242791419794, + "loss": 0.2655, + "step": 170470 + }, + { + "epoch": 7.06, + "grad_norm": 0.6484375, + "learning_rate": 0.000434765122341133, + "loss": 0.195, + "step": 170480 + }, + { + "epoch": 7.06, + "grad_norm": 0.8046875, + "learning_rate": 0.00043475781642036, + "loss": 0.1834, + "step": 170490 + }, + { + "epoch": 7.06, + "grad_norm": 0.4140625, + "learning_rate": 0.0004347505101518928, + "loss": 0.1991, + "step": 170500 + }, + { + "epoch": 7.06, + "grad_norm": 0.671875, + "learning_rate": 0.00043474320353574503, + "loss": 0.1927, + "step": 170510 + }, + { + "epoch": 7.06, + "grad_norm": 0.703125, + "learning_rate": 0.0004347358965719306, + "loss": 0.2103, + "step": 170520 + }, + { + "epoch": 7.06, + "grad_norm": 0.49609375, + "learning_rate": 0.0004347285892604631, + "loss": 0.2304, + "step": 170530 + }, + { + "epoch": 7.06, + "grad_norm": 1.8125, + "learning_rate": 0.0004347212816013564, + "loss": 0.1943, + "step": 170540 + }, + { + "epoch": 7.06, + "grad_norm": 1.3671875, + "learning_rate": 0.00043471397359462407, + "loss": 0.2138, + "step": 170550 + }, + { + "epoch": 7.06, + "grad_norm": 1.375, + "learning_rate": 0.0004347066652402801, + "loss": 0.1893, + "step": 170560 + }, + { + "epoch": 7.06, + "grad_norm": 0.625, + "learning_rate": 0.00043469935653833817, + "loss": 0.2492, + "step": 170570 + }, + { + "epoch": 7.07, + "grad_norm": 1.0, + "learning_rate": 0.00043469204748881185, + "loss": 0.2263, + "step": 170580 + }, + { + "epoch": 7.07, + "grad_norm": 0.578125, + "learning_rate": 0.00043468473809171515, + "loss": 0.1607, + "step": 170590 + }, + { + "epoch": 7.07, + "grad_norm": 0.546875, + "learning_rate": 0.0004346774283470617, + "loss": 0.1933, + "step": 170600 + }, + { + "epoch": 7.07, + "grad_norm": 0.96484375, + "learning_rate": 0.0004346701182548652, + "loss": 0.2467, + "step": 170610 + }, + { + "epoch": 7.07, + "grad_norm": 0.70703125, + "learning_rate": 0.00043466280781513954, + "loss": 0.2447, + "step": 170620 + }, + { + "epoch": 7.07, + "grad_norm": 0.9140625, + "learning_rate": 0.00043465549702789843, + "loss": 0.1429, + "step": 170630 + }, + { + "epoch": 7.07, + "grad_norm": 0.380859375, + "learning_rate": 0.0004346481858931556, + "loss": 0.21, + "step": 170640 + }, + { + "epoch": 7.07, + "grad_norm": 0.2578125, + "learning_rate": 0.0004346408744109249, + "loss": 0.2371, + "step": 170650 + }, + { + "epoch": 7.07, + "grad_norm": 0.76953125, + "learning_rate": 0.00043463356258121986, + "loss": 0.1933, + "step": 170660 + }, + { + "epoch": 7.07, + "grad_norm": 0.7578125, + "learning_rate": 0.0004346262504040545, + "loss": 0.2353, + "step": 170670 + }, + { + "epoch": 7.07, + "grad_norm": 0.8046875, + "learning_rate": 0.00043461893787944246, + "loss": 0.2137, + "step": 170680 + }, + { + "epoch": 7.07, + "grad_norm": 0.58984375, + "learning_rate": 0.0004346116250073975, + "loss": 0.1933, + "step": 170690 + }, + { + "epoch": 7.07, + "grad_norm": 0.6640625, + "learning_rate": 0.0004346043117879334, + "loss": 0.1902, + "step": 170700 + }, + { + "epoch": 7.07, + "grad_norm": 0.8984375, + "learning_rate": 0.00043459699822106395, + "loss": 0.1502, + "step": 170710 + }, + { + "epoch": 7.07, + "grad_norm": 0.42578125, + "learning_rate": 0.0004345896843068029, + "loss": 0.1708, + "step": 170720 + }, + { + "epoch": 7.07, + "grad_norm": 0.5, + "learning_rate": 0.0004345823700451639, + "loss": 0.2009, + "step": 170730 + }, + { + "epoch": 7.07, + "grad_norm": 1.53125, + "learning_rate": 0.0004345750554361609, + "loss": 0.1835, + "step": 170740 + }, + { + "epoch": 7.07, + "grad_norm": 0.4765625, + "learning_rate": 0.00043456774047980753, + "loss": 0.1949, + "step": 170750 + }, + { + "epoch": 7.07, + "grad_norm": 0.65234375, + "learning_rate": 0.0004345604251761176, + "loss": 0.1666, + "step": 170760 + }, + { + "epoch": 7.07, + "grad_norm": 1.0078125, + "learning_rate": 0.000434553109525105, + "loss": 0.2272, + "step": 170770 + }, + { + "epoch": 7.07, + "grad_norm": 0.515625, + "learning_rate": 0.00043454579352678323, + "loss": 0.1836, + "step": 170780 + }, + { + "epoch": 7.07, + "grad_norm": 0.71875, + "learning_rate": 0.0004345384771811662, + "loss": 0.1823, + "step": 170790 + }, + { + "epoch": 7.07, + "grad_norm": 0.8125, + "learning_rate": 0.0004345311604882678, + "loss": 0.1803, + "step": 170800 + }, + { + "epoch": 7.07, + "grad_norm": 1.0078125, + "learning_rate": 0.00043452384344810165, + "loss": 0.2293, + "step": 170810 + }, + { + "epoch": 7.08, + "grad_norm": 0.828125, + "learning_rate": 0.0004345165260606815, + "loss": 0.1959, + "step": 170820 + }, + { + "epoch": 7.08, + "grad_norm": 0.0, + "learning_rate": 0.0004345092083260212, + "loss": 0.175, + "step": 170830 + }, + { + "epoch": 7.08, + "grad_norm": 0.6953125, + "learning_rate": 0.0004345018902441345, + "loss": 0.2244, + "step": 170840 + }, + { + "epoch": 7.08, + "grad_norm": 0.99609375, + "learning_rate": 0.00043449457181503516, + "loss": 0.2443, + "step": 170850 + }, + { + "epoch": 7.08, + "grad_norm": 1.1328125, + "learning_rate": 0.000434487253038737, + "loss": 0.2298, + "step": 170860 + }, + { + "epoch": 7.08, + "grad_norm": 0.92578125, + "learning_rate": 0.0004344799339152537, + "loss": 0.1868, + "step": 170870 + }, + { + "epoch": 7.08, + "grad_norm": 1.3984375, + "learning_rate": 0.0004344726144445991, + "loss": 0.217, + "step": 170880 + }, + { + "epoch": 7.08, + "grad_norm": 0.5078125, + "learning_rate": 0.000434465294626787, + "loss": 0.2025, + "step": 170890 + }, + { + "epoch": 7.08, + "grad_norm": 2.984375, + "learning_rate": 0.0004344579744618311, + "loss": 0.1948, + "step": 170900 + }, + { + "epoch": 7.08, + "grad_norm": 0.8671875, + "learning_rate": 0.0004344506539497451, + "loss": 0.2268, + "step": 170910 + }, + { + "epoch": 7.08, + "grad_norm": 0.9140625, + "learning_rate": 0.00043444333309054306, + "loss": 0.2381, + "step": 170920 + }, + { + "epoch": 7.08, + "grad_norm": 1.15625, + "learning_rate": 0.00043443601188423856, + "loss": 0.2284, + "step": 170930 + }, + { + "epoch": 7.08, + "grad_norm": 1.0703125, + "learning_rate": 0.0004344286903308454, + "loss": 0.2124, + "step": 170940 + }, + { + "epoch": 7.08, + "grad_norm": 0.59765625, + "learning_rate": 0.0004344213684303773, + "loss": 0.1741, + "step": 170950 + }, + { + "epoch": 7.08, + "grad_norm": 0.45703125, + "learning_rate": 0.0004344140461828481, + "loss": 0.1737, + "step": 170960 + }, + { + "epoch": 7.08, + "grad_norm": 0.60546875, + "learning_rate": 0.00043440672358827163, + "loss": 0.2098, + "step": 170970 + }, + { + "epoch": 7.08, + "grad_norm": 0.65625, + "learning_rate": 0.0004343994006466616, + "loss": 0.2087, + "step": 170980 + }, + { + "epoch": 7.08, + "grad_norm": 0.3671875, + "learning_rate": 0.00043439207735803184, + "loss": 0.1536, + "step": 170990 + }, + { + "epoch": 7.08, + "grad_norm": 0.79296875, + "learning_rate": 0.000434384753722396, + "loss": 0.179, + "step": 171000 + }, + { + "epoch": 7.08, + "grad_norm": 0.87890625, + "learning_rate": 0.00043437742973976814, + "loss": 0.2127, + "step": 171010 + }, + { + "epoch": 7.08, + "grad_norm": 0.55859375, + "learning_rate": 0.0004343701054101618, + "loss": 0.2208, + "step": 171020 + }, + { + "epoch": 7.08, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004343627807335907, + "loss": 0.1962, + "step": 171030 + }, + { + "epoch": 7.08, + "grad_norm": 1.8828125, + "learning_rate": 0.0004343554557100689, + "loss": 0.2284, + "step": 171040 + }, + { + "epoch": 7.08, + "grad_norm": 0.59375, + "learning_rate": 0.00043434813033961, + "loss": 0.1813, + "step": 171050 + }, + { + "epoch": 7.09, + "grad_norm": 0.6640625, + "learning_rate": 0.0004343408046222278, + "loss": 0.2895, + "step": 171060 + }, + { + "epoch": 7.09, + "grad_norm": 0.58203125, + "learning_rate": 0.00043433347855793613, + "loss": 0.1952, + "step": 171070 + }, + { + "epoch": 7.09, + "grad_norm": 0.0, + "learning_rate": 0.00043432615214674883, + "loss": 0.2261, + "step": 171080 + }, + { + "epoch": 7.09, + "grad_norm": 0.3203125, + "learning_rate": 0.0004343188253886795, + "loss": 0.1887, + "step": 171090 + }, + { + "epoch": 7.09, + "grad_norm": 0.361328125, + "learning_rate": 0.00043431149828374217, + "loss": 0.2263, + "step": 171100 + }, + { + "epoch": 7.09, + "grad_norm": 0.65625, + "learning_rate": 0.0004343041708319504, + "loss": 0.1928, + "step": 171110 + }, + { + "epoch": 7.09, + "grad_norm": 0.53125, + "learning_rate": 0.0004342968430333181, + "loss": 0.1834, + "step": 171120 + }, + { + "epoch": 7.09, + "grad_norm": 0.7734375, + "learning_rate": 0.00043428951488785905, + "loss": 0.1815, + "step": 171130 + }, + { + "epoch": 7.09, + "grad_norm": 1.390625, + "learning_rate": 0.0004342821863955871, + "loss": 0.1949, + "step": 171140 + }, + { + "epoch": 7.09, + "grad_norm": 0.5859375, + "learning_rate": 0.0004342748575565159, + "loss": 0.1997, + "step": 171150 + }, + { + "epoch": 7.09, + "grad_norm": 0.796875, + "learning_rate": 0.0004342675283706593, + "loss": 0.2171, + "step": 171160 + }, + { + "epoch": 7.09, + "grad_norm": 0.392578125, + "learning_rate": 0.0004342601988380311, + "loss": 0.1794, + "step": 171170 + }, + { + "epoch": 7.09, + "grad_norm": 0.6484375, + "learning_rate": 0.00043425286895864526, + "loss": 0.2096, + "step": 171180 + }, + { + "epoch": 7.09, + "grad_norm": 0.87109375, + "learning_rate": 0.0004342455387325153, + "loss": 0.2169, + "step": 171190 + }, + { + "epoch": 7.09, + "grad_norm": 0.59375, + "learning_rate": 0.00043423820815965513, + "loss": 0.1766, + "step": 171200 + }, + { + "epoch": 7.09, + "grad_norm": 0.466796875, + "learning_rate": 0.0004342308772400786, + "loss": 0.1918, + "step": 171210 + }, + { + "epoch": 7.09, + "grad_norm": 1.328125, + "learning_rate": 0.00043422354597379945, + "loss": 0.2078, + "step": 171220 + }, + { + "epoch": 7.09, + "grad_norm": 1.90625, + "learning_rate": 0.00043421621436083135, + "loss": 0.183, + "step": 171230 + }, + { + "epoch": 7.09, + "grad_norm": 0.609375, + "learning_rate": 0.0004342088824011884, + "loss": 0.2028, + "step": 171240 + }, + { + "epoch": 7.09, + "grad_norm": 1.078125, + "learning_rate": 0.00043420155009488414, + "loss": 0.1794, + "step": 171250 + }, + { + "epoch": 7.09, + "grad_norm": 0.52734375, + "learning_rate": 0.00043419421744193246, + "loss": 0.174, + "step": 171260 + }, + { + "epoch": 7.09, + "grad_norm": 1.328125, + "learning_rate": 0.00043418688444234714, + "loss": 0.2182, + "step": 171270 + }, + { + "epoch": 7.09, + "grad_norm": 0.484375, + "learning_rate": 0.00043417955109614206, + "loss": 0.156, + "step": 171280 + }, + { + "epoch": 7.09, + "grad_norm": 0.828125, + "learning_rate": 0.0004341722174033309, + "loss": 0.2277, + "step": 171290 + }, + { + "epoch": 7.1, + "grad_norm": 0.6953125, + "learning_rate": 0.0004341648833639276, + "loss": 0.2338, + "step": 171300 + }, + { + "epoch": 7.1, + "grad_norm": 0.72265625, + "learning_rate": 0.00043415754897794574, + "loss": 0.1862, + "step": 171310 + }, + { + "epoch": 7.1, + "grad_norm": 0.56640625, + "learning_rate": 0.00043415021424539937, + "loss": 0.1915, + "step": 171320 + }, + { + "epoch": 7.1, + "grad_norm": 0.486328125, + "learning_rate": 0.0004341428791663021, + "loss": 0.2059, + "step": 171330 + }, + { + "epoch": 7.1, + "grad_norm": 0.41796875, + "learning_rate": 0.0004341355437406678, + "loss": 0.2243, + "step": 171340 + }, + { + "epoch": 7.1, + "grad_norm": 0.53125, + "learning_rate": 0.0004341282079685104, + "loss": 0.183, + "step": 171350 + }, + { + "epoch": 7.1, + "grad_norm": 0.96875, + "learning_rate": 0.0004341208718498435, + "loss": 0.195, + "step": 171360 + }, + { + "epoch": 7.1, + "grad_norm": 0.77734375, + "learning_rate": 0.00043411353538468106, + "loss": 0.1653, + "step": 171370 + }, + { + "epoch": 7.1, + "grad_norm": 0.85546875, + "learning_rate": 0.0004341061985730368, + "loss": 0.1927, + "step": 171380 + }, + { + "epoch": 7.1, + "grad_norm": 0.6640625, + "learning_rate": 0.0004340988614149246, + "loss": 0.2483, + "step": 171390 + }, + { + "epoch": 7.1, + "grad_norm": 0.384765625, + "learning_rate": 0.00043409152391035824, + "loss": 0.2686, + "step": 171400 + }, + { + "epoch": 7.1, + "grad_norm": 0.4921875, + "learning_rate": 0.0004340841860593514, + "loss": 0.2353, + "step": 171410 + }, + { + "epoch": 7.1, + "grad_norm": 1.046875, + "learning_rate": 0.0004340768478619181, + "loss": 0.1824, + "step": 171420 + }, + { + "epoch": 7.1, + "grad_norm": 1.125, + "learning_rate": 0.00043406950931807196, + "loss": 0.2357, + "step": 171430 + }, + { + "epoch": 7.1, + "grad_norm": 0.54296875, + "learning_rate": 0.00043406217042782705, + "loss": 0.184, + "step": 171440 + }, + { + "epoch": 7.1, + "grad_norm": 0.7734375, + "learning_rate": 0.00043405483119119683, + "loss": 0.1736, + "step": 171450 + }, + { + "epoch": 7.1, + "grad_norm": 0.3984375, + "learning_rate": 0.00043404749160819535, + "loss": 0.2105, + "step": 171460 + }, + { + "epoch": 7.1, + "grad_norm": 0.57421875, + "learning_rate": 0.00043404015167883644, + "loss": 0.2237, + "step": 171470 + }, + { + "epoch": 7.1, + "grad_norm": 0.859375, + "learning_rate": 0.00043403281140313375, + "loss": 0.2047, + "step": 171480 + }, + { + "epoch": 7.1, + "grad_norm": 0.36328125, + "learning_rate": 0.0004340254707811012, + "loss": 0.2277, + "step": 171490 + }, + { + "epoch": 7.1, + "grad_norm": 0.8671875, + "learning_rate": 0.0004340181298127526, + "loss": 0.217, + "step": 171500 + }, + { + "epoch": 7.1, + "grad_norm": 1.609375, + "learning_rate": 0.00043401078849810175, + "loss": 0.236, + "step": 171510 + }, + { + "epoch": 7.1, + "grad_norm": 1.0859375, + "learning_rate": 0.0004340034468371625, + "loss": 0.222, + "step": 171520 + }, + { + "epoch": 7.1, + "grad_norm": 0.92578125, + "learning_rate": 0.0004339961048299486, + "loss": 0.2042, + "step": 171530 + }, + { + "epoch": 7.11, + "grad_norm": 1.3828125, + "learning_rate": 0.0004339887624764739, + "loss": 0.2124, + "step": 171540 + }, + { + "epoch": 7.11, + "grad_norm": 1.1640625, + "learning_rate": 0.0004339814197767522, + "loss": 0.1649, + "step": 171550 + }, + { + "epoch": 7.11, + "grad_norm": 0.76953125, + "learning_rate": 0.00043397407673079734, + "loss": 0.1897, + "step": 171560 + }, + { + "epoch": 7.11, + "grad_norm": 0.30078125, + "learning_rate": 0.00043396673333862326, + "loss": 0.2091, + "step": 171570 + }, + { + "epoch": 7.11, + "grad_norm": 0.99609375, + "learning_rate": 0.0004339593896002435, + "loss": 0.1932, + "step": 171580 + }, + { + "epoch": 7.11, + "grad_norm": 0.6328125, + "learning_rate": 0.0004339520455156721, + "loss": 0.2216, + "step": 171590 + }, + { + "epoch": 7.11, + "grad_norm": 0.8828125, + "learning_rate": 0.0004339447010849228, + "loss": 0.1838, + "step": 171600 + }, + { + "epoch": 7.11, + "grad_norm": 0.419921875, + "learning_rate": 0.0004339373563080095, + "loss": 0.2025, + "step": 171610 + }, + { + "epoch": 7.11, + "grad_norm": 0.365234375, + "learning_rate": 0.0004339300111849458, + "loss": 0.2516, + "step": 171620 + }, + { + "epoch": 7.11, + "grad_norm": 1.1640625, + "learning_rate": 0.00043392266571574576, + "loss": 0.1863, + "step": 171630 + }, + { + "epoch": 7.11, + "grad_norm": 0.34765625, + "learning_rate": 0.0004339153199004232, + "loss": 0.2472, + "step": 171640 + }, + { + "epoch": 7.11, + "grad_norm": 0.69921875, + "learning_rate": 0.0004339079737389918, + "loss": 0.1954, + "step": 171650 + }, + { + "epoch": 7.11, + "grad_norm": 0.94921875, + "learning_rate": 0.00043390062723146547, + "loss": 0.2021, + "step": 171660 + }, + { + "epoch": 7.11, + "grad_norm": 0.6484375, + "learning_rate": 0.000433893280377858, + "loss": 0.1811, + "step": 171670 + }, + { + "epoch": 7.11, + "grad_norm": 0.8046875, + "learning_rate": 0.0004338859331781833, + "loss": 0.2162, + "step": 171680 + }, + { + "epoch": 7.11, + "grad_norm": 0.462890625, + "learning_rate": 0.0004338785856324551, + "loss": 0.1805, + "step": 171690 + }, + { + "epoch": 7.11, + "grad_norm": 0.91015625, + "learning_rate": 0.00043387123774068717, + "loss": 0.1817, + "step": 171700 + }, + { + "epoch": 7.11, + "grad_norm": 0.6796875, + "learning_rate": 0.0004338638895028935, + "loss": 0.1676, + "step": 171710 + }, + { + "epoch": 7.11, + "grad_norm": 0.376953125, + "learning_rate": 0.0004338565409190879, + "loss": 0.2077, + "step": 171720 + }, + { + "epoch": 7.11, + "grad_norm": 0.74609375, + "learning_rate": 0.00043384919198928405, + "loss": 0.1503, + "step": 171730 + }, + { + "epoch": 7.11, + "grad_norm": 0.55859375, + "learning_rate": 0.0004338418427134959, + "loss": 0.1699, + "step": 171740 + }, + { + "epoch": 7.11, + "grad_norm": 0.66796875, + "learning_rate": 0.0004338344930917373, + "loss": 0.197, + "step": 171750 + }, + { + "epoch": 7.11, + "grad_norm": 0.83984375, + "learning_rate": 0.00043382714312402193, + "loss": 0.198, + "step": 171760 + }, + { + "epoch": 7.11, + "grad_norm": 0.52734375, + "learning_rate": 0.00043381979281036386, + "loss": 0.1631, + "step": 171770 + }, + { + "epoch": 7.12, + "grad_norm": 0.8125, + "learning_rate": 0.00043381244215077677, + "loss": 0.2317, + "step": 171780 + }, + { + "epoch": 7.12, + "grad_norm": 0.71875, + "learning_rate": 0.00043380509114527443, + "loss": 0.1951, + "step": 171790 + }, + { + "epoch": 7.12, + "grad_norm": 1.4453125, + "learning_rate": 0.00043379773979387084, + "loss": 0.1831, + "step": 171800 + }, + { + "epoch": 7.12, + "grad_norm": 0.98828125, + "learning_rate": 0.00043379038809657965, + "loss": 0.2449, + "step": 171810 + }, + { + "epoch": 7.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00043378303605341484, + "loss": 0.2057, + "step": 171820 + }, + { + "epoch": 7.12, + "grad_norm": 1.6796875, + "learning_rate": 0.00043377568366439025, + "loss": 0.1993, + "step": 171830 + }, + { + "epoch": 7.12, + "grad_norm": 1.3984375, + "learning_rate": 0.0004337683309295197, + "loss": 0.1989, + "step": 171840 + }, + { + "epoch": 7.12, + "grad_norm": 1.1015625, + "learning_rate": 0.00043376097784881694, + "loss": 0.2416, + "step": 171850 + }, + { + "epoch": 7.12, + "grad_norm": 0.427734375, + "learning_rate": 0.0004337536244222958, + "loss": 0.1648, + "step": 171860 + }, + { + "epoch": 7.12, + "grad_norm": 1.8984375, + "learning_rate": 0.0004337462706499703, + "loss": 0.1633, + "step": 171870 + }, + { + "epoch": 7.12, + "grad_norm": 0.72265625, + "learning_rate": 0.0004337389165318541, + "loss": 0.1952, + "step": 171880 + }, + { + "epoch": 7.12, + "grad_norm": 0.8515625, + "learning_rate": 0.00043373156206796116, + "loss": 0.1484, + "step": 171890 + }, + { + "epoch": 7.12, + "grad_norm": 0.55078125, + "learning_rate": 0.0004337242072583052, + "loss": 0.1872, + "step": 171900 + }, + { + "epoch": 7.12, + "grad_norm": 0.92578125, + "learning_rate": 0.00043371685210290013, + "loss": 0.2079, + "step": 171910 + }, + { + "epoch": 7.12, + "grad_norm": 0.48828125, + "learning_rate": 0.0004337094966017597, + "loss": 0.213, + "step": 171920 + }, + { + "epoch": 7.12, + "grad_norm": 1.2734375, + "learning_rate": 0.000433702140754898, + "loss": 0.1702, + "step": 171930 + }, + { + "epoch": 7.12, + "grad_norm": 0.263671875, + "learning_rate": 0.00043369478456232866, + "loss": 0.2103, + "step": 171940 + }, + { + "epoch": 7.12, + "grad_norm": 1.140625, + "learning_rate": 0.00043368742802406545, + "loss": 0.1912, + "step": 171950 + }, + { + "epoch": 7.12, + "grad_norm": 0.012451171875, + "learning_rate": 0.00043368007114012253, + "loss": 0.1706, + "step": 171960 + }, + { + "epoch": 7.12, + "grad_norm": 0.6015625, + "learning_rate": 0.00043367271391051346, + "loss": 0.2489, + "step": 171970 + }, + { + "epoch": 7.12, + "grad_norm": 0.33984375, + "learning_rate": 0.0004336653563352522, + "loss": 0.2208, + "step": 171980 + }, + { + "epoch": 7.12, + "grad_norm": 0.97265625, + "learning_rate": 0.0004336579984143525, + "loss": 0.2099, + "step": 171990 + }, + { + "epoch": 7.12, + "grad_norm": 0.302734375, + "learning_rate": 0.0004336506401478283, + "loss": 0.2394, + "step": 172000 + }, + { + "epoch": 7.12, + "grad_norm": 0.51953125, + "learning_rate": 0.0004336432815356934, + "loss": 0.2285, + "step": 172010 + }, + { + "epoch": 7.13, + "grad_norm": 0.6640625, + "learning_rate": 0.0004336359225779618, + "loss": 0.2076, + "step": 172020 + }, + { + "epoch": 7.13, + "grad_norm": 0.9921875, + "learning_rate": 0.00043362856327464717, + "loss": 0.2211, + "step": 172030 + }, + { + "epoch": 7.13, + "grad_norm": 0.67578125, + "learning_rate": 0.0004336212036257634, + "loss": 0.2146, + "step": 172040 + }, + { + "epoch": 7.13, + "grad_norm": 1.15625, + "learning_rate": 0.0004336138436313243, + "loss": 0.1684, + "step": 172050 + }, + { + "epoch": 7.13, + "grad_norm": 0.65625, + "learning_rate": 0.0004336064832913439, + "loss": 0.1876, + "step": 172060 + }, + { + "epoch": 7.13, + "grad_norm": 0.9453125, + "learning_rate": 0.00043359912260583586, + "loss": 0.1567, + "step": 172070 + }, + { + "epoch": 7.13, + "grad_norm": 0.5546875, + "learning_rate": 0.00043359176157481405, + "loss": 0.2345, + "step": 172080 + }, + { + "epoch": 7.13, + "grad_norm": 0.2353515625, + "learning_rate": 0.0004335844001982924, + "loss": 0.1641, + "step": 172090 + }, + { + "epoch": 7.13, + "grad_norm": 0.8515625, + "learning_rate": 0.00043357703847628475, + "loss": 0.2365, + "step": 172100 + }, + { + "epoch": 7.13, + "grad_norm": 0.451171875, + "learning_rate": 0.00043356967640880497, + "loss": 0.209, + "step": 172110 + }, + { + "epoch": 7.13, + "grad_norm": 1.4140625, + "learning_rate": 0.00043356231399586687, + "loss": 0.2124, + "step": 172120 + }, + { + "epoch": 7.13, + "grad_norm": 1.1796875, + "learning_rate": 0.0004335549512374843, + "loss": 0.2058, + "step": 172130 + }, + { + "epoch": 7.13, + "grad_norm": 0.5234375, + "learning_rate": 0.0004335475881336711, + "loss": 0.2062, + "step": 172140 + }, + { + "epoch": 7.13, + "grad_norm": 0.412109375, + "learning_rate": 0.0004335402246844412, + "loss": 0.167, + "step": 172150 + }, + { + "epoch": 7.13, + "grad_norm": 0.96875, + "learning_rate": 0.0004335328608898084, + "loss": 0.2352, + "step": 172160 + }, + { + "epoch": 7.13, + "grad_norm": 1.5859375, + "learning_rate": 0.0004335254967497866, + "loss": 0.2353, + "step": 172170 + }, + { + "epoch": 7.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00043351813226438963, + "loss": 0.1644, + "step": 172180 + }, + { + "epoch": 7.13, + "grad_norm": 0.6875, + "learning_rate": 0.0004335107674336313, + "loss": 0.2008, + "step": 172190 + }, + { + "epoch": 7.13, + "grad_norm": 0.87109375, + "learning_rate": 0.0004335034022575256, + "loss": 0.2302, + "step": 172200 + }, + { + "epoch": 7.13, + "grad_norm": 1.171875, + "learning_rate": 0.0004334960367360863, + "loss": 0.2107, + "step": 172210 + }, + { + "epoch": 7.13, + "grad_norm": 0.7265625, + "learning_rate": 0.0004334886708693272, + "loss": 0.2107, + "step": 172220 + }, + { + "epoch": 7.13, + "grad_norm": 1.5234375, + "learning_rate": 0.0004334813046572623, + "loss": 0.1705, + "step": 172230 + }, + { + "epoch": 7.13, + "grad_norm": 0.8359375, + "learning_rate": 0.0004334739380999054, + "loss": 0.167, + "step": 172240 + }, + { + "epoch": 7.13, + "grad_norm": 1.015625, + "learning_rate": 0.0004334665711972704, + "loss": 0.1953, + "step": 172250 + }, + { + "epoch": 7.13, + "grad_norm": 1.015625, + "learning_rate": 0.00043345920394937103, + "loss": 0.1698, + "step": 172260 + }, + { + "epoch": 7.14, + "grad_norm": 0.9453125, + "learning_rate": 0.0004334518363562213, + "loss": 0.2039, + "step": 172270 + }, + { + "epoch": 7.14, + "grad_norm": 0.328125, + "learning_rate": 0.000433444468417835, + "loss": 0.249, + "step": 172280 + }, + { + "epoch": 7.14, + "grad_norm": 0.8828125, + "learning_rate": 0.0004334371001342261, + "loss": 0.1889, + "step": 172290 + }, + { + "epoch": 7.14, + "grad_norm": 1.3046875, + "learning_rate": 0.0004334297315054083, + "loss": 0.2212, + "step": 172300 + }, + { + "epoch": 7.14, + "grad_norm": 0.75, + "learning_rate": 0.00043342236253139557, + "loss": 0.1852, + "step": 172310 + }, + { + "epoch": 7.14, + "grad_norm": 0.83203125, + "learning_rate": 0.0004334149932122018, + "loss": 0.2069, + "step": 172320 + }, + { + "epoch": 7.14, + "grad_norm": 1.140625, + "learning_rate": 0.0004334076235478408, + "loss": 0.2024, + "step": 172330 + }, + { + "epoch": 7.14, + "grad_norm": 0.90625, + "learning_rate": 0.0004334002535383265, + "loss": 0.2448, + "step": 172340 + }, + { + "epoch": 7.14, + "grad_norm": 0.458984375, + "learning_rate": 0.0004333928831836726, + "loss": 0.1999, + "step": 172350 + }, + { + "epoch": 7.14, + "grad_norm": 0.59765625, + "learning_rate": 0.00043338551248389327, + "loss": 0.2145, + "step": 172360 + }, + { + "epoch": 7.14, + "grad_norm": 0.625, + "learning_rate": 0.0004333781414390021, + "loss": 0.2035, + "step": 172370 + }, + { + "epoch": 7.14, + "grad_norm": 0.578125, + "learning_rate": 0.00043337077004901303, + "loss": 0.2078, + "step": 172380 + }, + { + "epoch": 7.14, + "grad_norm": 0.80078125, + "learning_rate": 0.0004333633983139401, + "loss": 0.1867, + "step": 172390 + }, + { + "epoch": 7.14, + "grad_norm": 0.9609375, + "learning_rate": 0.000433356026233797, + "loss": 0.23, + "step": 172400 + }, + { + "epoch": 7.14, + "grad_norm": 0.64453125, + "learning_rate": 0.00043334865380859766, + "loss": 0.2088, + "step": 172410 + }, + { + "epoch": 7.14, + "grad_norm": 0.40625, + "learning_rate": 0.0004333412810383559, + "loss": 0.1801, + "step": 172420 + }, + { + "epoch": 7.14, + "grad_norm": 0.609375, + "learning_rate": 0.00043333390792308566, + "loss": 0.2108, + "step": 172430 + }, + { + "epoch": 7.14, + "grad_norm": 1.015625, + "learning_rate": 0.0004333265344628009, + "loss": 0.1882, + "step": 172440 + }, + { + "epoch": 7.14, + "grad_norm": 0.169921875, + "learning_rate": 0.00043331916065751533, + "loss": 0.1893, + "step": 172450 + }, + { + "epoch": 7.14, + "grad_norm": 0.7734375, + "learning_rate": 0.0004333117865072429, + "loss": 0.196, + "step": 172460 + }, + { + "epoch": 7.14, + "grad_norm": 0.60546875, + "learning_rate": 0.00043330441201199746, + "loss": 0.1982, + "step": 172470 + }, + { + "epoch": 7.14, + "grad_norm": 1.2265625, + "learning_rate": 0.0004332970371717929, + "loss": 0.1747, + "step": 172480 + }, + { + "epoch": 7.14, + "grad_norm": 0.83984375, + "learning_rate": 0.0004332896619866432, + "loss": 0.1899, + "step": 172490 + }, + { + "epoch": 7.14, + "grad_norm": 0.6796875, + "learning_rate": 0.0004332822864565621, + "loss": 0.2498, + "step": 172500 + }, + { + "epoch": 7.15, + "grad_norm": 1.703125, + "learning_rate": 0.00043327491058156353, + "loss": 0.196, + "step": 172510 + }, + { + "epoch": 7.15, + "grad_norm": 1.046875, + "learning_rate": 0.00043326753436166135, + "loss": 0.187, + "step": 172520 + }, + { + "epoch": 7.15, + "grad_norm": 0.2578125, + "learning_rate": 0.0004332601577968695, + "loss": 0.1751, + "step": 172530 + }, + { + "epoch": 7.15, + "grad_norm": 0.453125, + "learning_rate": 0.00043325278088720176, + "loss": 0.1854, + "step": 172540 + }, + { + "epoch": 7.15, + "grad_norm": 1.2578125, + "learning_rate": 0.00043324540363267217, + "loss": 0.2157, + "step": 172550 + }, + { + "epoch": 7.15, + "grad_norm": 0.6875, + "learning_rate": 0.0004332380260332944, + "loss": 0.2119, + "step": 172560 + }, + { + "epoch": 7.15, + "grad_norm": 0.8828125, + "learning_rate": 0.00043323064808908256, + "loss": 0.162, + "step": 172570 + }, + { + "epoch": 7.15, + "grad_norm": 0.49609375, + "learning_rate": 0.00043322326980005046, + "loss": 0.2304, + "step": 172580 + }, + { + "epoch": 7.15, + "grad_norm": 0.828125, + "learning_rate": 0.0004332158911662119, + "loss": 0.2304, + "step": 172590 + }, + { + "epoch": 7.15, + "grad_norm": 0.60546875, + "learning_rate": 0.00043320851218758074, + "loss": 0.2003, + "step": 172600 + }, + { + "epoch": 7.15, + "grad_norm": 0.5859375, + "learning_rate": 0.00043320113286417103, + "loss": 0.2, + "step": 172610 + }, + { + "epoch": 7.15, + "grad_norm": 0.8359375, + "learning_rate": 0.0004331937531959965, + "loss": 0.2147, + "step": 172620 + }, + { + "epoch": 7.15, + "grad_norm": 0.6484375, + "learning_rate": 0.0004331863731830712, + "loss": 0.2057, + "step": 172630 + }, + { + "epoch": 7.15, + "grad_norm": 0.5625, + "learning_rate": 0.00043317899282540885, + "loss": 0.2231, + "step": 172640 + }, + { + "epoch": 7.15, + "grad_norm": 1.0859375, + "learning_rate": 0.0004331716121230235, + "loss": 0.2055, + "step": 172650 + }, + { + "epoch": 7.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00043316423107592883, + "loss": 0.2262, + "step": 172660 + }, + { + "epoch": 7.15, + "grad_norm": 0.4375, + "learning_rate": 0.00043315684968413894, + "loss": 0.1905, + "step": 172670 + }, + { + "epoch": 7.15, + "grad_norm": 1.1484375, + "learning_rate": 0.0004331494679476676, + "loss": 0.258, + "step": 172680 + }, + { + "epoch": 7.15, + "grad_norm": 3.484375, + "learning_rate": 0.0004331420858665288, + "loss": 0.2307, + "step": 172690 + }, + { + "epoch": 7.15, + "grad_norm": 0.640625, + "learning_rate": 0.00043313470344073633, + "loss": 0.2146, + "step": 172700 + }, + { + "epoch": 7.15, + "grad_norm": 1.25, + "learning_rate": 0.00043312732067030417, + "loss": 0.2089, + "step": 172710 + }, + { + "epoch": 7.15, + "grad_norm": 0.66796875, + "learning_rate": 0.00043311993755524615, + "loss": 0.24, + "step": 172720 + }, + { + "epoch": 7.15, + "grad_norm": 0.9296875, + "learning_rate": 0.00043311255409557615, + "loss": 0.1981, + "step": 172730 + }, + { + "epoch": 7.15, + "grad_norm": 1.1640625, + "learning_rate": 0.00043310517029130816, + "loss": 0.2006, + "step": 172740 + }, + { + "epoch": 7.16, + "grad_norm": 0.22265625, + "learning_rate": 0.000433097786142456, + "loss": 0.2014, + "step": 172750 + }, + { + "epoch": 7.16, + "grad_norm": 0.4453125, + "learning_rate": 0.0004330904016490335, + "loss": 0.2112, + "step": 172760 + }, + { + "epoch": 7.16, + "grad_norm": 0.296875, + "learning_rate": 0.0004330830168110547, + "loss": 0.1551, + "step": 172770 + }, + { + "epoch": 7.16, + "grad_norm": 0.251953125, + "learning_rate": 0.0004330756316285335, + "loss": 0.2198, + "step": 172780 + }, + { + "epoch": 7.16, + "grad_norm": 0.94140625, + "learning_rate": 0.00043306824610148353, + "loss": 0.2092, + "step": 172790 + }, + { + "epoch": 7.16, + "grad_norm": 0.43359375, + "learning_rate": 0.00043306086022991907, + "loss": 0.199, + "step": 172800 + }, + { + "epoch": 7.16, + "grad_norm": 0.390625, + "learning_rate": 0.00043305347401385375, + "loss": 0.2137, + "step": 172810 + }, + { + "epoch": 7.16, + "grad_norm": 1.4921875, + "learning_rate": 0.0004330460874533016, + "loss": 0.206, + "step": 172820 + }, + { + "epoch": 7.16, + "grad_norm": 0.7109375, + "learning_rate": 0.00043303870054827646, + "loss": 0.21, + "step": 172830 + }, + { + "epoch": 7.16, + "grad_norm": 0.69140625, + "learning_rate": 0.0004330313132987923, + "loss": 0.1589, + "step": 172840 + }, + { + "epoch": 7.16, + "grad_norm": 0.4765625, + "learning_rate": 0.0004330239257048629, + "loss": 0.2564, + "step": 172850 + }, + { + "epoch": 7.16, + "grad_norm": 0.5859375, + "learning_rate": 0.00043301653776650227, + "loss": 0.2358, + "step": 172860 + }, + { + "epoch": 7.16, + "grad_norm": 0.54296875, + "learning_rate": 0.0004330091494837243, + "loss": 0.2152, + "step": 172870 + }, + { + "epoch": 7.16, + "grad_norm": 0.625, + "learning_rate": 0.00043300176085654294, + "loss": 0.1708, + "step": 172880 + }, + { + "epoch": 7.16, + "grad_norm": 0.71484375, + "learning_rate": 0.00043299437188497195, + "loss": 0.213, + "step": 172890 + }, + { + "epoch": 7.16, + "grad_norm": 0.94921875, + "learning_rate": 0.0004329869825690252, + "loss": 0.1882, + "step": 172900 + }, + { + "epoch": 7.16, + "grad_norm": 0.333984375, + "learning_rate": 0.0004329795929087169, + "loss": 0.1772, + "step": 172910 + }, + { + "epoch": 7.16, + "grad_norm": 0.83984375, + "learning_rate": 0.0004329722029040607, + "loss": 0.1845, + "step": 172920 + }, + { + "epoch": 7.16, + "grad_norm": 0.8046875, + "learning_rate": 0.0004329648125550706, + "loss": 0.241, + "step": 172930 + }, + { + "epoch": 7.16, + "grad_norm": 1.796875, + "learning_rate": 0.00043295742186176035, + "loss": 0.1581, + "step": 172940 + }, + { + "epoch": 7.16, + "grad_norm": 0.89453125, + "learning_rate": 0.00043295003082414416, + "loss": 0.1818, + "step": 172950 + }, + { + "epoch": 7.16, + "grad_norm": 0.61328125, + "learning_rate": 0.00043294263944223567, + "loss": 0.1768, + "step": 172960 + }, + { + "epoch": 7.16, + "grad_norm": 1.59375, + "learning_rate": 0.0004329352477160489, + "loss": 0.2086, + "step": 172970 + }, + { + "epoch": 7.16, + "grad_norm": 0.5078125, + "learning_rate": 0.0004329278556455978, + "loss": 0.1806, + "step": 172980 + }, + { + "epoch": 7.17, + "grad_norm": 1.796875, + "learning_rate": 0.0004329204632308962, + "loss": 0.1362, + "step": 172990 + }, + { + "epoch": 7.17, + "grad_norm": 1.171875, + "learning_rate": 0.000432913070471958, + "loss": 0.207, + "step": 173000 + }, + { + "epoch": 7.17, + "grad_norm": 1.0859375, + "learning_rate": 0.00043290567736879715, + "loss": 0.2271, + "step": 173010 + }, + { + "epoch": 7.17, + "grad_norm": 0.34375, + "learning_rate": 0.00043289828392142763, + "loss": 0.2394, + "step": 173020 + }, + { + "epoch": 7.17, + "grad_norm": 0.73828125, + "learning_rate": 0.0004328908901298633, + "loss": 0.1802, + "step": 173030 + }, + { + "epoch": 7.17, + "grad_norm": 1.3515625, + "learning_rate": 0.000432883495994118, + "loss": 0.1929, + "step": 173040 + }, + { + "epoch": 7.17, + "grad_norm": 1.03125, + "learning_rate": 0.00043287610151420574, + "loss": 0.2319, + "step": 173050 + }, + { + "epoch": 7.17, + "grad_norm": 0.97265625, + "learning_rate": 0.00043286870669014037, + "loss": 0.2163, + "step": 173060 + }, + { + "epoch": 7.17, + "grad_norm": 0.84375, + "learning_rate": 0.00043286131152193584, + "loss": 0.1613, + "step": 173070 + }, + { + "epoch": 7.17, + "grad_norm": 1.0546875, + "learning_rate": 0.00043285391600960624, + "loss": 0.2439, + "step": 173080 + }, + { + "epoch": 7.17, + "grad_norm": 1.1015625, + "learning_rate": 0.0004328465201531652, + "loss": 0.2635, + "step": 173090 + }, + { + "epoch": 7.17, + "grad_norm": 0.357421875, + "learning_rate": 0.00043283912395262667, + "loss": 0.2133, + "step": 173100 + }, + { + "epoch": 7.17, + "grad_norm": 0.58984375, + "learning_rate": 0.0004328317274080047, + "loss": 0.1945, + "step": 173110 + }, + { + "epoch": 7.17, + "grad_norm": 0.62109375, + "learning_rate": 0.00043282433051931325, + "loss": 0.2627, + "step": 173120 + }, + { + "epoch": 7.17, + "grad_norm": 0.65234375, + "learning_rate": 0.000432816933286566, + "loss": 0.2176, + "step": 173130 + }, + { + "epoch": 7.17, + "grad_norm": 0.8125, + "learning_rate": 0.00043280953570977714, + "loss": 0.2393, + "step": 173140 + }, + { + "epoch": 7.17, + "grad_norm": 1.0, + "learning_rate": 0.00043280213778896045, + "loss": 0.2419, + "step": 173150 + }, + { + "epoch": 7.17, + "grad_norm": 0.8203125, + "learning_rate": 0.0004327947395241299, + "loss": 0.2065, + "step": 173160 + }, + { + "epoch": 7.17, + "grad_norm": 1.2578125, + "learning_rate": 0.00043278734091529937, + "loss": 0.1943, + "step": 173170 + }, + { + "epoch": 7.17, + "grad_norm": 0.62890625, + "learning_rate": 0.0004327799419624828, + "loss": 0.1923, + "step": 173180 + }, + { + "epoch": 7.17, + "grad_norm": 1.0234375, + "learning_rate": 0.0004327725426656941, + "loss": 0.202, + "step": 173190 + }, + { + "epoch": 7.17, + "grad_norm": 0.83984375, + "learning_rate": 0.00043276514302494724, + "loss": 0.1966, + "step": 173200 + }, + { + "epoch": 7.17, + "grad_norm": 0.40625, + "learning_rate": 0.0004327577430402561, + "loss": 0.1579, + "step": 173210 + }, + { + "epoch": 7.17, + "grad_norm": 0.91015625, + "learning_rate": 0.0004327503427116346, + "loss": 0.2411, + "step": 173220 + }, + { + "epoch": 7.18, + "grad_norm": 0.8515625, + "learning_rate": 0.00043274294203909675, + "loss": 0.2366, + "step": 173230 + }, + { + "epoch": 7.18, + "grad_norm": 0.439453125, + "learning_rate": 0.00043273554102265637, + "loss": 0.1986, + "step": 173240 + }, + { + "epoch": 7.18, + "grad_norm": 0.6640625, + "learning_rate": 0.0004327281396623275, + "loss": 0.2404, + "step": 173250 + }, + { + "epoch": 7.18, + "grad_norm": 0.2060546875, + "learning_rate": 0.00043272073795812396, + "loss": 0.2761, + "step": 173260 + }, + { + "epoch": 7.18, + "grad_norm": 1.21875, + "learning_rate": 0.0004327133359100597, + "loss": 0.1928, + "step": 173270 + }, + { + "epoch": 7.18, + "grad_norm": 0.7734375, + "learning_rate": 0.0004327059335181487, + "loss": 0.2589, + "step": 173280 + }, + { + "epoch": 7.18, + "grad_norm": 1.1796875, + "learning_rate": 0.00043269853078240487, + "loss": 0.1785, + "step": 173290 + }, + { + "epoch": 7.18, + "grad_norm": 1.3828125, + "learning_rate": 0.00043269112770284214, + "loss": 0.2149, + "step": 173300 + }, + { + "epoch": 7.18, + "grad_norm": 0.80859375, + "learning_rate": 0.0004326837242794744, + "loss": 0.2434, + "step": 173310 + }, + { + "epoch": 7.18, + "grad_norm": 0.484375, + "learning_rate": 0.00043267632051231564, + "loss": 0.1737, + "step": 173320 + }, + { + "epoch": 7.18, + "grad_norm": 0.73828125, + "learning_rate": 0.0004326689164013798, + "loss": 0.1711, + "step": 173330 + }, + { + "epoch": 7.18, + "grad_norm": 1.03125, + "learning_rate": 0.0004326615119466807, + "loss": 0.2425, + "step": 173340 + }, + { + "epoch": 7.18, + "grad_norm": 0.3671875, + "learning_rate": 0.00043265410714823243, + "loss": 0.1821, + "step": 173350 + }, + { + "epoch": 7.18, + "grad_norm": 0.515625, + "learning_rate": 0.00043264670200604885, + "loss": 0.1802, + "step": 173360 + }, + { + "epoch": 7.18, + "grad_norm": 0.1865234375, + "learning_rate": 0.0004326392965201439, + "loss": 0.2207, + "step": 173370 + }, + { + "epoch": 7.18, + "grad_norm": 0.91796875, + "learning_rate": 0.00043263189069053153, + "loss": 0.1738, + "step": 173380 + }, + { + "epoch": 7.18, + "grad_norm": 1.078125, + "learning_rate": 0.00043262448451722565, + "loss": 0.2434, + "step": 173390 + }, + { + "epoch": 7.18, + "grad_norm": 0.4140625, + "learning_rate": 0.0004326170780002402, + "loss": 0.1739, + "step": 173400 + }, + { + "epoch": 7.18, + "grad_norm": 0.90234375, + "learning_rate": 0.00043260967113958914, + "loss": 0.1852, + "step": 173410 + }, + { + "epoch": 7.18, + "grad_norm": 0.00162506103515625, + "learning_rate": 0.0004326022639352864, + "loss": 0.2118, + "step": 173420 + }, + { + "epoch": 7.18, + "grad_norm": 0.296875, + "learning_rate": 0.00043259485638734587, + "loss": 0.1839, + "step": 173430 + }, + { + "epoch": 7.18, + "grad_norm": 0.9375, + "learning_rate": 0.00043258744849578165, + "loss": 0.1606, + "step": 173440 + }, + { + "epoch": 7.18, + "grad_norm": 0.484375, + "learning_rate": 0.00043258004026060747, + "loss": 0.2408, + "step": 173450 + }, + { + "epoch": 7.18, + "grad_norm": 0.390625, + "learning_rate": 0.0004325726316818374, + "loss": 0.1841, + "step": 173460 + }, + { + "epoch": 7.19, + "grad_norm": 1.796875, + "learning_rate": 0.0004325652227594854, + "loss": 0.198, + "step": 173470 + }, + { + "epoch": 7.19, + "grad_norm": 0.7265625, + "learning_rate": 0.0004325578134935653, + "loss": 0.1964, + "step": 173480 + }, + { + "epoch": 7.19, + "grad_norm": 0.5546875, + "learning_rate": 0.0004325504038840911, + "loss": 0.1869, + "step": 173490 + }, + { + "epoch": 7.19, + "grad_norm": 1.1015625, + "learning_rate": 0.0004325429939310768, + "loss": 0.1959, + "step": 173500 + }, + { + "epoch": 7.19, + "grad_norm": 0.5546875, + "learning_rate": 0.00043253558363453627, + "loss": 0.1498, + "step": 173510 + }, + { + "epoch": 7.19, + "grad_norm": 0.4921875, + "learning_rate": 0.00043252817299448354, + "loss": 0.1972, + "step": 173520 + }, + { + "epoch": 7.19, + "grad_norm": 0.97265625, + "learning_rate": 0.0004325207620109325, + "loss": 0.1713, + "step": 173530 + }, + { + "epoch": 7.19, + "grad_norm": 0.7109375, + "learning_rate": 0.000432513350683897, + "loss": 0.1855, + "step": 173540 + }, + { + "epoch": 7.19, + "grad_norm": 1.0078125, + "learning_rate": 0.00043250593901339115, + "loss": 0.1994, + "step": 173550 + }, + { + "epoch": 7.19, + "grad_norm": 0.447265625, + "learning_rate": 0.0004324985269994288, + "loss": 0.2116, + "step": 173560 + }, + { + "epoch": 7.19, + "grad_norm": 0.83984375, + "learning_rate": 0.00043249111464202397, + "loss": 0.1969, + "step": 173570 + }, + { + "epoch": 7.19, + "grad_norm": 0.73828125, + "learning_rate": 0.00043248370194119056, + "loss": 0.2643, + "step": 173580 + }, + { + "epoch": 7.19, + "grad_norm": 0.2333984375, + "learning_rate": 0.0004324762888969425, + "loss": 0.2547, + "step": 173590 + }, + { + "epoch": 7.19, + "grad_norm": 0.5234375, + "learning_rate": 0.0004324688755092938, + "loss": 0.2473, + "step": 173600 + }, + { + "epoch": 7.19, + "grad_norm": 0.57421875, + "learning_rate": 0.0004324614617782584, + "loss": 0.2184, + "step": 173610 + }, + { + "epoch": 7.19, + "grad_norm": 1.2734375, + "learning_rate": 0.00043245404770385023, + "loss": 0.2071, + "step": 173620 + }, + { + "epoch": 7.19, + "grad_norm": 1.1796875, + "learning_rate": 0.00043244663328608315, + "loss": 0.2369, + "step": 173630 + }, + { + "epoch": 7.19, + "grad_norm": 0.49609375, + "learning_rate": 0.0004324392185249713, + "loss": 0.2243, + "step": 173640 + }, + { + "epoch": 7.19, + "grad_norm": 1.3359375, + "learning_rate": 0.00043243180342052856, + "loss": 0.1992, + "step": 173650 + }, + { + "epoch": 7.19, + "grad_norm": 0.5703125, + "learning_rate": 0.00043242438797276876, + "loss": 0.222, + "step": 173660 + }, + { + "epoch": 7.19, + "grad_norm": 0.921875, + "learning_rate": 0.00043241697218170604, + "loss": 0.1524, + "step": 173670 + }, + { + "epoch": 7.19, + "grad_norm": 1.0, + "learning_rate": 0.00043240955604735435, + "loss": 0.1748, + "step": 173680 + }, + { + "epoch": 7.19, + "grad_norm": 0.9140625, + "learning_rate": 0.0004324021395697274, + "loss": 0.1553, + "step": 173690 + }, + { + "epoch": 7.19, + "grad_norm": 0.55859375, + "learning_rate": 0.00043239472274883946, + "loss": 0.252, + "step": 173700 + }, + { + "epoch": 7.2, + "grad_norm": 0.59765625, + "learning_rate": 0.0004323873055847043, + "loss": 0.2632, + "step": 173710 + }, + { + "epoch": 7.2, + "grad_norm": 0.384765625, + "learning_rate": 0.000432379888077336, + "loss": 0.1919, + "step": 173720 + }, + { + "epoch": 7.2, + "grad_norm": 0.87890625, + "learning_rate": 0.0004323724702267483, + "loss": 0.1975, + "step": 173730 + }, + { + "epoch": 7.2, + "grad_norm": 1.0625, + "learning_rate": 0.0004323650520329554, + "loss": 0.2028, + "step": 173740 + }, + { + "epoch": 7.2, + "grad_norm": 0.69140625, + "learning_rate": 0.00043235763349597114, + "loss": 0.2073, + "step": 173750 + }, + { + "epoch": 7.2, + "grad_norm": 0.81640625, + "learning_rate": 0.00043235021461580946, + "loss": 0.1845, + "step": 173760 + }, + { + "epoch": 7.2, + "grad_norm": 0.640625, + "learning_rate": 0.00043234279539248446, + "loss": 0.2239, + "step": 173770 + }, + { + "epoch": 7.2, + "grad_norm": 1.078125, + "learning_rate": 0.00043233537582601, + "loss": 0.2441, + "step": 173780 + }, + { + "epoch": 7.2, + "grad_norm": 0.875, + "learning_rate": 0.0004323279559164, + "loss": 0.2138, + "step": 173790 + }, + { + "epoch": 7.2, + "grad_norm": 0.6328125, + "learning_rate": 0.0004323205356636685, + "loss": 0.2157, + "step": 173800 + }, + { + "epoch": 7.2, + "grad_norm": 1.1015625, + "learning_rate": 0.00043231311506782943, + "loss": 0.2575, + "step": 173810 + }, + { + "epoch": 7.2, + "grad_norm": 0.5859375, + "learning_rate": 0.00043230569412889674, + "loss": 0.1644, + "step": 173820 + }, + { + "epoch": 7.2, + "grad_norm": 0.95703125, + "learning_rate": 0.0004322982728468845, + "loss": 0.1918, + "step": 173830 + }, + { + "epoch": 7.2, + "grad_norm": 0.78515625, + "learning_rate": 0.0004322908512218066, + "loss": 0.2329, + "step": 173840 + }, + { + "epoch": 7.2, + "grad_norm": 0.447265625, + "learning_rate": 0.0004322834292536769, + "loss": 0.2299, + "step": 173850 + }, + { + "epoch": 7.2, + "grad_norm": 0.66796875, + "learning_rate": 0.00043227600694250955, + "loss": 0.203, + "step": 173860 + }, + { + "epoch": 7.2, + "grad_norm": 0.6171875, + "learning_rate": 0.00043226858428831844, + "loss": 0.2041, + "step": 173870 + }, + { + "epoch": 7.2, + "grad_norm": 1.203125, + "learning_rate": 0.00043226116129111753, + "loss": 0.2452, + "step": 173880 + }, + { + "epoch": 7.2, + "grad_norm": 0.59765625, + "learning_rate": 0.00043225373795092076, + "loss": 0.193, + "step": 173890 + }, + { + "epoch": 7.2, + "grad_norm": 0.984375, + "learning_rate": 0.0004322463142677422, + "loss": 0.1981, + "step": 173900 + }, + { + "epoch": 7.2, + "grad_norm": 0.94921875, + "learning_rate": 0.0004322388902415957, + "loss": 0.2432, + "step": 173910 + }, + { + "epoch": 7.2, + "grad_norm": 0.76171875, + "learning_rate": 0.0004322314658724953, + "loss": 0.193, + "step": 173920 + }, + { + "epoch": 7.2, + "grad_norm": 0.734375, + "learning_rate": 0.00043222404116045497, + "loss": 0.1726, + "step": 173930 + }, + { + "epoch": 7.2, + "grad_norm": 0.53515625, + "learning_rate": 0.0004322166161054887, + "loss": 0.2491, + "step": 173940 + }, + { + "epoch": 7.2, + "grad_norm": 0.48828125, + "learning_rate": 0.0004322091907076103, + "loss": 0.2229, + "step": 173950 + }, + { + "epoch": 7.21, + "grad_norm": 0.53515625, + "learning_rate": 0.0004322017649668341, + "loss": 0.2127, + "step": 173960 + }, + { + "epoch": 7.21, + "grad_norm": 0.390625, + "learning_rate": 0.0004321943388831737, + "loss": 0.2295, + "step": 173970 + }, + { + "epoch": 7.21, + "grad_norm": 0.458984375, + "learning_rate": 0.00043218691245664327, + "loss": 0.2198, + "step": 173980 + }, + { + "epoch": 7.21, + "grad_norm": 0.83203125, + "learning_rate": 0.0004321794856872567, + "loss": 0.2168, + "step": 173990 + }, + { + "epoch": 7.21, + "grad_norm": 0.66796875, + "learning_rate": 0.0004321720585750281, + "loss": 0.2417, + "step": 174000 + }, + { + "epoch": 7.21, + "grad_norm": 0.96484375, + "learning_rate": 0.00043216463111997135, + "loss": 0.2044, + "step": 174010 + }, + { + "epoch": 7.21, + "grad_norm": 0.6796875, + "learning_rate": 0.0004321572033221004, + "loss": 0.2227, + "step": 174020 + }, + { + "epoch": 7.21, + "grad_norm": 0.828125, + "learning_rate": 0.0004321497751814293, + "loss": 0.2314, + "step": 174030 + }, + { + "epoch": 7.21, + "grad_norm": 0.98828125, + "learning_rate": 0.0004321423466979719, + "loss": 0.2061, + "step": 174040 + }, + { + "epoch": 7.21, + "grad_norm": 0.40234375, + "learning_rate": 0.00043213491787174235, + "loss": 0.2665, + "step": 174050 + }, + { + "epoch": 7.21, + "grad_norm": 0.75390625, + "learning_rate": 0.0004321274887027545, + "loss": 0.1865, + "step": 174060 + }, + { + "epoch": 7.21, + "grad_norm": 0.50390625, + "learning_rate": 0.0004321200591910225, + "loss": 0.1852, + "step": 174070 + }, + { + "epoch": 7.21, + "grad_norm": 0.482421875, + "learning_rate": 0.0004321126293365601, + "loss": 0.2037, + "step": 174080 + }, + { + "epoch": 7.21, + "grad_norm": 0.7890625, + "learning_rate": 0.0004321051991393815, + "loss": 0.2456, + "step": 174090 + }, + { + "epoch": 7.21, + "grad_norm": 0.6796875, + "learning_rate": 0.00043209776859950044, + "loss": 0.3129, + "step": 174100 + }, + { + "epoch": 7.21, + "grad_norm": 0.62890625, + "learning_rate": 0.0004320903377169311, + "loss": 0.1557, + "step": 174110 + }, + { + "epoch": 7.21, + "grad_norm": 0.7578125, + "learning_rate": 0.00043208290649168746, + "loss": 0.1916, + "step": 174120 + }, + { + "epoch": 7.21, + "grad_norm": 0.396484375, + "learning_rate": 0.00043207547492378343, + "loss": 0.1797, + "step": 174130 + }, + { + "epoch": 7.21, + "grad_norm": 0.7109375, + "learning_rate": 0.000432068043013233, + "loss": 0.1485, + "step": 174140 + }, + { + "epoch": 7.21, + "grad_norm": 0.7890625, + "learning_rate": 0.00043206061076005023, + "loss": 0.195, + "step": 174150 + }, + { + "epoch": 7.21, + "grad_norm": 1.0390625, + "learning_rate": 0.00043205317816424893, + "loss": 0.2358, + "step": 174160 + }, + { + "epoch": 7.21, + "grad_norm": 0.6328125, + "learning_rate": 0.0004320457452258433, + "loss": 0.1715, + "step": 174170 + }, + { + "epoch": 7.21, + "grad_norm": 0.5, + "learning_rate": 0.00043203831194484727, + "loss": 0.1435, + "step": 174180 + }, + { + "epoch": 7.21, + "grad_norm": 1.046875, + "learning_rate": 0.00043203087832127473, + "loss": 0.2186, + "step": 174190 + }, + { + "epoch": 7.22, + "grad_norm": 0.388671875, + "learning_rate": 0.0004320234443551397, + "loss": 0.1849, + "step": 174200 + }, + { + "epoch": 7.22, + "grad_norm": 0.73046875, + "learning_rate": 0.0004320160100464563, + "loss": 0.2358, + "step": 174210 + }, + { + "epoch": 7.22, + "grad_norm": 0.291015625, + "learning_rate": 0.0004320085753952384, + "loss": 0.2006, + "step": 174220 + }, + { + "epoch": 7.22, + "grad_norm": 0.2578125, + "learning_rate": 0.00043200114040149994, + "loss": 0.1767, + "step": 174230 + }, + { + "epoch": 7.22, + "grad_norm": 0.58984375, + "learning_rate": 0.00043199370506525513, + "loss": 0.2093, + "step": 174240 + }, + { + "epoch": 7.22, + "grad_norm": 1.1015625, + "learning_rate": 0.00043198626938651774, + "loss": 0.1837, + "step": 174250 + }, + { + "epoch": 7.22, + "grad_norm": 0.671875, + "learning_rate": 0.00043197883336530186, + "loss": 0.1613, + "step": 174260 + }, + { + "epoch": 7.22, + "grad_norm": 0.53125, + "learning_rate": 0.0004319713970016214, + "loss": 0.2824, + "step": 174270 + }, + { + "epoch": 7.22, + "grad_norm": 0.62890625, + "learning_rate": 0.0004319639602954905, + "loss": 0.1589, + "step": 174280 + }, + { + "epoch": 7.22, + "grad_norm": 0.36328125, + "learning_rate": 0.00043195652324692305, + "loss": 0.2281, + "step": 174290 + }, + { + "epoch": 7.22, + "grad_norm": 0.82421875, + "learning_rate": 0.0004319490858559331, + "loss": 0.2087, + "step": 174300 + }, + { + "epoch": 7.22, + "grad_norm": 0.458984375, + "learning_rate": 0.00043194164812253457, + "loss": 0.1995, + "step": 174310 + }, + { + "epoch": 7.22, + "grad_norm": 0.84765625, + "learning_rate": 0.0004319342100467415, + "loss": 0.1978, + "step": 174320 + }, + { + "epoch": 7.22, + "grad_norm": 0.8515625, + "learning_rate": 0.0004319267716285679, + "loss": 0.1561, + "step": 174330 + }, + { + "epoch": 7.22, + "grad_norm": 0.703125, + "learning_rate": 0.0004319193328680278, + "loss": 0.2034, + "step": 174340 + }, + { + "epoch": 7.22, + "grad_norm": 1.4453125, + "learning_rate": 0.0004319118937651352, + "loss": 0.2085, + "step": 174350 + }, + { + "epoch": 7.22, + "grad_norm": 0.84375, + "learning_rate": 0.000431904454319904, + "loss": 0.1614, + "step": 174360 + }, + { + "epoch": 7.22, + "grad_norm": 0.53125, + "learning_rate": 0.0004318970145323482, + "loss": 0.1903, + "step": 174370 + }, + { + "epoch": 7.22, + "grad_norm": 0.353515625, + "learning_rate": 0.00043188957440248193, + "loss": 0.2041, + "step": 174380 + }, + { + "epoch": 7.22, + "grad_norm": 0.609375, + "learning_rate": 0.00043188213393031917, + "loss": 0.1486, + "step": 174390 + }, + { + "epoch": 7.22, + "grad_norm": 0.369140625, + "learning_rate": 0.0004318746931158738, + "loss": 0.2069, + "step": 174400 + }, + { + "epoch": 7.22, + "grad_norm": 0.353515625, + "learning_rate": 0.0004318672519591599, + "loss": 0.2216, + "step": 174410 + }, + { + "epoch": 7.22, + "grad_norm": 0.734375, + "learning_rate": 0.0004318598104601915, + "loss": 0.2092, + "step": 174420 + }, + { + "epoch": 7.22, + "grad_norm": 0.3359375, + "learning_rate": 0.0004318523686189826, + "loss": 0.2062, + "step": 174430 + }, + { + "epoch": 7.23, + "grad_norm": 1.0703125, + "learning_rate": 0.00043184492643554717, + "loss": 0.2518, + "step": 174440 + }, + { + "epoch": 7.23, + "grad_norm": 0.470703125, + "learning_rate": 0.0004318374839098992, + "loss": 0.203, + "step": 174450 + }, + { + "epoch": 7.23, + "grad_norm": 0.73828125, + "learning_rate": 0.0004318300410420527, + "loss": 0.1842, + "step": 174460 + }, + { + "epoch": 7.23, + "grad_norm": 0.255859375, + "learning_rate": 0.00043182259783202173, + "loss": 0.2013, + "step": 174470 + }, + { + "epoch": 7.23, + "grad_norm": 1.78125, + "learning_rate": 0.0004318151542798203, + "loss": 0.2063, + "step": 174480 + }, + { + "epoch": 7.23, + "grad_norm": 0.86328125, + "learning_rate": 0.00043180771038546235, + "loss": 0.2103, + "step": 174490 + }, + { + "epoch": 7.23, + "grad_norm": 0.7578125, + "learning_rate": 0.0004318002661489619, + "loss": 0.199, + "step": 174500 + }, + { + "epoch": 7.23, + "grad_norm": 0.1953125, + "learning_rate": 0.000431792821570333, + "loss": 0.1928, + "step": 174510 + }, + { + "epoch": 7.23, + "grad_norm": 1.1171875, + "learning_rate": 0.0004317853766495897, + "loss": 0.1817, + "step": 174520 + }, + { + "epoch": 7.23, + "grad_norm": 0.890625, + "learning_rate": 0.0004317779313867459, + "loss": 0.195, + "step": 174530 + }, + { + "epoch": 7.23, + "grad_norm": 1.28125, + "learning_rate": 0.00043177048578181565, + "loss": 0.2041, + "step": 174540 + }, + { + "epoch": 7.23, + "grad_norm": 1.2421875, + "learning_rate": 0.00043176303983481296, + "loss": 0.2424, + "step": 174550 + }, + { + "epoch": 7.23, + "grad_norm": 0.453125, + "learning_rate": 0.0004317555935457519, + "loss": 0.2436, + "step": 174560 + }, + { + "epoch": 7.23, + "grad_norm": 1.25, + "learning_rate": 0.00043174814691464644, + "loss": 0.2207, + "step": 174570 + }, + { + "epoch": 7.23, + "grad_norm": 0.353515625, + "learning_rate": 0.0004317406999415106, + "loss": 0.237, + "step": 174580 + }, + { + "epoch": 7.23, + "grad_norm": 0.80078125, + "learning_rate": 0.0004317332526263583, + "loss": 0.211, + "step": 174590 + }, + { + "epoch": 7.23, + "grad_norm": 0.376953125, + "learning_rate": 0.00043172580496920375, + "loss": 0.1598, + "step": 174600 + }, + { + "epoch": 7.23, + "grad_norm": 1.0625, + "learning_rate": 0.0004317183569700608, + "loss": 0.2094, + "step": 174610 + }, + { + "epoch": 7.23, + "grad_norm": 1.3671875, + "learning_rate": 0.0004317109086289436, + "loss": 0.2036, + "step": 174620 + }, + { + "epoch": 7.23, + "grad_norm": 0.8046875, + "learning_rate": 0.00043170345994586603, + "loss": 0.256, + "step": 174630 + }, + { + "epoch": 7.23, + "grad_norm": 1.0703125, + "learning_rate": 0.0004316960109208422, + "loss": 0.2265, + "step": 174640 + }, + { + "epoch": 7.23, + "grad_norm": 0.671875, + "learning_rate": 0.000431688561553886, + "loss": 0.1447, + "step": 174650 + }, + { + "epoch": 7.23, + "grad_norm": 0.546875, + "learning_rate": 0.00043168111184501166, + "loss": 0.2164, + "step": 174660 + }, + { + "epoch": 7.23, + "grad_norm": 0.53125, + "learning_rate": 0.00043167366179423306, + "loss": 0.1951, + "step": 174670 + }, + { + "epoch": 7.24, + "grad_norm": 1.421875, + "learning_rate": 0.00043166621140156426, + "loss": 0.1898, + "step": 174680 + }, + { + "epoch": 7.24, + "grad_norm": 0.3515625, + "learning_rate": 0.00043165876066701925, + "loss": 0.1552, + "step": 174690 + }, + { + "epoch": 7.24, + "grad_norm": 1.28125, + "learning_rate": 0.00043165130959061206, + "loss": 0.1624, + "step": 174700 + }, + { + "epoch": 7.24, + "grad_norm": 0.486328125, + "learning_rate": 0.00043164385817235675, + "loss": 0.2143, + "step": 174710 + }, + { + "epoch": 7.24, + "grad_norm": 1.046875, + "learning_rate": 0.00043163640641226734, + "loss": 0.1808, + "step": 174720 + }, + { + "epoch": 7.24, + "grad_norm": 0.37109375, + "learning_rate": 0.00043162895431035777, + "loss": 0.2211, + "step": 174730 + }, + { + "epoch": 7.24, + "grad_norm": 1.3203125, + "learning_rate": 0.00043162150186664214, + "loss": 0.1945, + "step": 174740 + }, + { + "epoch": 7.24, + "grad_norm": 0.291015625, + "learning_rate": 0.0004316140490811344, + "loss": 0.2096, + "step": 174750 + }, + { + "epoch": 7.24, + "grad_norm": 0.40234375, + "learning_rate": 0.00043160659595384873, + "loss": 0.2715, + "step": 174760 + }, + { + "epoch": 7.24, + "grad_norm": 0.41015625, + "learning_rate": 0.00043159914248479904, + "loss": 0.2701, + "step": 174770 + }, + { + "epoch": 7.24, + "grad_norm": 0.52734375, + "learning_rate": 0.00043159168867399933, + "loss": 0.206, + "step": 174780 + }, + { + "epoch": 7.24, + "grad_norm": 0.73046875, + "learning_rate": 0.00043158423452146366, + "loss": 0.1825, + "step": 174790 + }, + { + "epoch": 7.24, + "grad_norm": 0.76953125, + "learning_rate": 0.0004315767800272061, + "loss": 0.2175, + "step": 174800 + }, + { + "epoch": 7.24, + "grad_norm": 0.177734375, + "learning_rate": 0.0004315693251912407, + "loss": 0.2175, + "step": 174810 + }, + { + "epoch": 7.24, + "grad_norm": 1.046875, + "learning_rate": 0.00043156187001358137, + "loss": 0.1883, + "step": 174820 + }, + { + "epoch": 7.24, + "grad_norm": 0.392578125, + "learning_rate": 0.00043155441449424227, + "loss": 0.1993, + "step": 174830 + }, + { + "epoch": 7.24, + "grad_norm": 0.73046875, + "learning_rate": 0.0004315469586332373, + "loss": 0.2031, + "step": 174840 + }, + { + "epoch": 7.24, + "grad_norm": 0.97265625, + "learning_rate": 0.0004315395024305806, + "loss": 0.1795, + "step": 174850 + }, + { + "epoch": 7.24, + "grad_norm": 0.90625, + "learning_rate": 0.00043153204588628613, + "loss": 0.1842, + "step": 174860 + }, + { + "epoch": 7.24, + "grad_norm": 1.1796875, + "learning_rate": 0.0004315245890003679, + "loss": 0.2057, + "step": 174870 + }, + { + "epoch": 7.24, + "grad_norm": 1.265625, + "learning_rate": 0.0004315171317728401, + "loss": 0.2196, + "step": 174880 + }, + { + "epoch": 7.24, + "grad_norm": 1.140625, + "learning_rate": 0.0004315096742037167, + "loss": 0.1407, + "step": 174890 + }, + { + "epoch": 7.24, + "grad_norm": 0.67578125, + "learning_rate": 0.00043150221629301155, + "loss": 0.1896, + "step": 174900 + }, + { + "epoch": 7.24, + "grad_norm": 0.671875, + "learning_rate": 0.0004314947580407389, + "loss": 0.1816, + "step": 174910 + }, + { + "epoch": 7.25, + "grad_norm": 0.625, + "learning_rate": 0.0004314872994469128, + "loss": 0.1792, + "step": 174920 + }, + { + "epoch": 7.25, + "grad_norm": 0.87890625, + "learning_rate": 0.0004314798405115471, + "loss": 0.1812, + "step": 174930 + }, + { + "epoch": 7.25, + "grad_norm": 0.24609375, + "learning_rate": 0.00043147238123465595, + "loss": 0.1652, + "step": 174940 + }, + { + "epoch": 7.25, + "grad_norm": 0.44921875, + "learning_rate": 0.0004314649216162533, + "loss": 0.214, + "step": 174950 + }, + { + "epoch": 7.25, + "grad_norm": 0.51953125, + "learning_rate": 0.0004314574616563534, + "loss": 0.1627, + "step": 174960 + }, + { + "epoch": 7.25, + "grad_norm": 2.09375, + "learning_rate": 0.0004314500013549701, + "loss": 0.2346, + "step": 174970 + }, + { + "epoch": 7.25, + "grad_norm": 0.5546875, + "learning_rate": 0.00043144254071211753, + "loss": 0.193, + "step": 174980 + }, + { + "epoch": 7.25, + "grad_norm": 0.87109375, + "learning_rate": 0.0004314350797278096, + "loss": 0.2197, + "step": 174990 + }, + { + "epoch": 7.25, + "grad_norm": 1.265625, + "learning_rate": 0.00043142761840206053, + "loss": 0.1877, + "step": 175000 + }, + { + "epoch": 7.25, + "grad_norm": 0.66796875, + "learning_rate": 0.0004314201567348842, + "loss": 0.2098, + "step": 175010 + }, + { + "epoch": 7.25, + "grad_norm": 0.408203125, + "learning_rate": 0.00043141269472629477, + "loss": 0.1615, + "step": 175020 + }, + { + "epoch": 7.25, + "grad_norm": 0.6328125, + "learning_rate": 0.00043140523237630623, + "loss": 0.2145, + "step": 175030 + }, + { + "epoch": 7.25, + "grad_norm": 1.2265625, + "learning_rate": 0.0004313977696849327, + "loss": 0.2328, + "step": 175040 + }, + { + "epoch": 7.25, + "grad_norm": 0.578125, + "learning_rate": 0.00043139030665218805, + "loss": 0.1624, + "step": 175050 + }, + { + "epoch": 7.25, + "grad_norm": 0.408203125, + "learning_rate": 0.00043138284327808653, + "loss": 0.2033, + "step": 175060 + }, + { + "epoch": 7.25, + "grad_norm": 1.0078125, + "learning_rate": 0.000431375379562642, + "loss": 0.191, + "step": 175070 + }, + { + "epoch": 7.25, + "grad_norm": 0.478515625, + "learning_rate": 0.00043136791550586864, + "loss": 0.199, + "step": 175080 + }, + { + "epoch": 7.25, + "grad_norm": 0.453125, + "learning_rate": 0.00043136045110778046, + "loss": 0.2432, + "step": 175090 + }, + { + "epoch": 7.25, + "grad_norm": 0.52734375, + "learning_rate": 0.00043135298636839145, + "loss": 0.1674, + "step": 175100 + }, + { + "epoch": 7.25, + "grad_norm": 0.59375, + "learning_rate": 0.0004313455212877157, + "loss": 0.1963, + "step": 175110 + }, + { + "epoch": 7.25, + "grad_norm": 1.078125, + "learning_rate": 0.0004313380558657673, + "loss": 0.1761, + "step": 175120 + }, + { + "epoch": 7.25, + "grad_norm": 0.91796875, + "learning_rate": 0.00043133059010256025, + "loss": 0.1998, + "step": 175130 + }, + { + "epoch": 7.25, + "grad_norm": 0.5859375, + "learning_rate": 0.0004313231239981086, + "loss": 0.2076, + "step": 175140 + }, + { + "epoch": 7.25, + "grad_norm": 1.453125, + "learning_rate": 0.0004313156575524264, + "loss": 0.1739, + "step": 175150 + }, + { + "epoch": 7.26, + "grad_norm": 1.546875, + "learning_rate": 0.00043130819076552776, + "loss": 0.2002, + "step": 175160 + }, + { + "epoch": 7.26, + "grad_norm": 0.34375, + "learning_rate": 0.0004313007236374267, + "loss": 0.1583, + "step": 175170 + }, + { + "epoch": 7.26, + "grad_norm": 0.5390625, + "learning_rate": 0.0004312932561681372, + "loss": 0.2089, + "step": 175180 + }, + { + "epoch": 7.26, + "grad_norm": 0.67578125, + "learning_rate": 0.0004312857883576734, + "loss": 0.2048, + "step": 175190 + }, + { + "epoch": 7.26, + "grad_norm": 0.65625, + "learning_rate": 0.0004312783202060493, + "loss": 0.2202, + "step": 175200 + }, + { + "epoch": 7.26, + "grad_norm": 0.78125, + "learning_rate": 0.00043127085171327893, + "loss": 0.2299, + "step": 175210 + }, + { + "epoch": 7.26, + "grad_norm": 0.3359375, + "learning_rate": 0.0004312633828793765, + "loss": 0.2487, + "step": 175220 + }, + { + "epoch": 7.26, + "grad_norm": 0.47265625, + "learning_rate": 0.0004312559137043559, + "loss": 0.1912, + "step": 175230 + }, + { + "epoch": 7.26, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004312484441882313, + "loss": 0.2175, + "step": 175240 + }, + { + "epoch": 7.26, + "grad_norm": 1.0390625, + "learning_rate": 0.00043124097433101664, + "loss": 0.2102, + "step": 175250 + }, + { + "epoch": 7.26, + "grad_norm": 0.71484375, + "learning_rate": 0.000431233504132726, + "loss": 0.1737, + "step": 175260 + }, + { + "epoch": 7.26, + "grad_norm": 0.953125, + "learning_rate": 0.0004312260335933735, + "loss": 0.2455, + "step": 175270 + }, + { + "epoch": 7.26, + "grad_norm": 0.376953125, + "learning_rate": 0.0004312185627129733, + "loss": 0.1652, + "step": 175280 + }, + { + "epoch": 7.26, + "grad_norm": 0.9921875, + "learning_rate": 0.00043121109149153925, + "loss": 0.2123, + "step": 175290 + }, + { + "epoch": 7.26, + "grad_norm": 0.7578125, + "learning_rate": 0.0004312036199290854, + "loss": 0.1873, + "step": 175300 + }, + { + "epoch": 7.26, + "grad_norm": 0.76953125, + "learning_rate": 0.00043119614802562605, + "loss": 0.247, + "step": 175310 + }, + { + "epoch": 7.26, + "grad_norm": 0.376953125, + "learning_rate": 0.00043118867578117504, + "loss": 0.2048, + "step": 175320 + }, + { + "epoch": 7.26, + "grad_norm": 0.62109375, + "learning_rate": 0.00043118120319574657, + "loss": 0.1863, + "step": 175330 + }, + { + "epoch": 7.26, + "grad_norm": 0.92578125, + "learning_rate": 0.00043117373026935456, + "loss": 0.1949, + "step": 175340 + }, + { + "epoch": 7.26, + "grad_norm": 0.578125, + "learning_rate": 0.00043116625700201323, + "loss": 0.232, + "step": 175350 + }, + { + "epoch": 7.26, + "grad_norm": 3.046875, + "learning_rate": 0.00043115878339373655, + "loss": 0.1997, + "step": 175360 + }, + { + "epoch": 7.26, + "grad_norm": 0.392578125, + "learning_rate": 0.0004311513094445386, + "loss": 0.2088, + "step": 175370 + }, + { + "epoch": 7.26, + "grad_norm": 0.703125, + "learning_rate": 0.0004311438351544334, + "loss": 0.1854, + "step": 175380 + }, + { + "epoch": 7.26, + "grad_norm": 0.5703125, + "learning_rate": 0.0004311363605234351, + "loss": 0.1814, + "step": 175390 + }, + { + "epoch": 7.27, + "grad_norm": 0.7265625, + "learning_rate": 0.0004311288855515577, + "loss": 0.1978, + "step": 175400 + }, + { + "epoch": 7.27, + "grad_norm": 0.318359375, + "learning_rate": 0.0004311214102388153, + "loss": 0.1878, + "step": 175410 + }, + { + "epoch": 7.27, + "grad_norm": 0.5859375, + "learning_rate": 0.00043111393458522204, + "loss": 0.2102, + "step": 175420 + }, + { + "epoch": 7.27, + "grad_norm": 0.80078125, + "learning_rate": 0.0004311064585907919, + "loss": 0.1851, + "step": 175430 + }, + { + "epoch": 7.27, + "grad_norm": 2.625, + "learning_rate": 0.0004310989822555389, + "loss": 0.1719, + "step": 175440 + }, + { + "epoch": 7.27, + "grad_norm": 0.546875, + "learning_rate": 0.0004310915055794772, + "loss": 0.193, + "step": 175450 + }, + { + "epoch": 7.27, + "grad_norm": 0.8125, + "learning_rate": 0.00043108402856262084, + "loss": 0.2438, + "step": 175460 + }, + { + "epoch": 7.27, + "grad_norm": 1.140625, + "learning_rate": 0.0004310765512049839, + "loss": 0.2135, + "step": 175470 + }, + { + "epoch": 7.27, + "grad_norm": 0.59375, + "learning_rate": 0.0004310690735065804, + "loss": 0.2492, + "step": 175480 + }, + { + "epoch": 7.27, + "grad_norm": 0.341796875, + "learning_rate": 0.0004310615954674245, + "loss": 0.1998, + "step": 175490 + }, + { + "epoch": 7.27, + "grad_norm": 0.61328125, + "learning_rate": 0.00043105411708753026, + "loss": 0.2077, + "step": 175500 + }, + { + "epoch": 7.27, + "grad_norm": 0.78125, + "learning_rate": 0.0004310466383669117, + "loss": 0.215, + "step": 175510 + }, + { + "epoch": 7.27, + "grad_norm": 1.0234375, + "learning_rate": 0.0004310391593055829, + "loss": 0.2003, + "step": 175520 + }, + { + "epoch": 7.27, + "grad_norm": 0.72265625, + "learning_rate": 0.00043103167990355795, + "loss": 0.2219, + "step": 175530 + }, + { + "epoch": 7.27, + "grad_norm": 1.9609375, + "learning_rate": 0.00043102420016085096, + "loss": 0.1531, + "step": 175540 + }, + { + "epoch": 7.27, + "grad_norm": 0.671875, + "learning_rate": 0.00043101672007747594, + "loss": 0.1883, + "step": 175550 + }, + { + "epoch": 7.27, + "grad_norm": 0.921875, + "learning_rate": 0.0004310092396534471, + "loss": 0.2787, + "step": 175560 + }, + { + "epoch": 7.27, + "grad_norm": 1.3046875, + "learning_rate": 0.0004310017588887783, + "loss": 0.1909, + "step": 175570 + }, + { + "epoch": 7.27, + "grad_norm": 0.69921875, + "learning_rate": 0.00043099427778348374, + "loss": 0.189, + "step": 175580 + }, + { + "epoch": 7.27, + "grad_norm": 0.5, + "learning_rate": 0.00043098679633757754, + "loss": 0.223, + "step": 175590 + }, + { + "epoch": 7.27, + "grad_norm": 0.416015625, + "learning_rate": 0.0004309793145510737, + "loss": 0.1673, + "step": 175600 + }, + { + "epoch": 7.27, + "grad_norm": 0.60546875, + "learning_rate": 0.0004309718324239864, + "loss": 0.1657, + "step": 175610 + }, + { + "epoch": 7.27, + "grad_norm": 1.265625, + "learning_rate": 0.0004309643499563296, + "loss": 0.1725, + "step": 175620 + }, + { + "epoch": 7.27, + "grad_norm": 0.62890625, + "learning_rate": 0.00043095686714811744, + "loss": 0.2324, + "step": 175630 + }, + { + "epoch": 7.27, + "grad_norm": 0.466796875, + "learning_rate": 0.000430949383999364, + "loss": 0.164, + "step": 175640 + }, + { + "epoch": 7.28, + "grad_norm": 0.8984375, + "learning_rate": 0.00043094190051008334, + "loss": 0.2372, + "step": 175650 + }, + { + "epoch": 7.28, + "grad_norm": 0.57421875, + "learning_rate": 0.0004309344166802896, + "loss": 0.1753, + "step": 175660 + }, + { + "epoch": 7.28, + "grad_norm": 0.8828125, + "learning_rate": 0.00043092693250999683, + "loss": 0.2608, + "step": 175670 + }, + { + "epoch": 7.28, + "grad_norm": 0.47265625, + "learning_rate": 0.00043091944799921914, + "loss": 0.1477, + "step": 175680 + }, + { + "epoch": 7.28, + "grad_norm": 0.42578125, + "learning_rate": 0.0004309119631479704, + "loss": 0.1952, + "step": 175690 + }, + { + "epoch": 7.28, + "grad_norm": 0.4921875, + "learning_rate": 0.0004309044779562651, + "loss": 0.1375, + "step": 175700 + }, + { + "epoch": 7.28, + "grad_norm": 0.625, + "learning_rate": 0.00043089699242411706, + "loss": 0.2203, + "step": 175710 + }, + { + "epoch": 7.28, + "grad_norm": 0.875, + "learning_rate": 0.00043088950655154035, + "loss": 0.1675, + "step": 175720 + }, + { + "epoch": 7.28, + "grad_norm": 0.7578125, + "learning_rate": 0.00043088202033854915, + "loss": 0.2309, + "step": 175730 + }, + { + "epoch": 7.28, + "grad_norm": 0.96484375, + "learning_rate": 0.00043087453378515755, + "loss": 0.1994, + "step": 175740 + }, + { + "epoch": 7.28, + "grad_norm": 0.58203125, + "learning_rate": 0.00043086704689137965, + "loss": 0.2007, + "step": 175750 + }, + { + "epoch": 7.28, + "grad_norm": 0.78125, + "learning_rate": 0.0004308595596572294, + "loss": 0.2056, + "step": 175760 + }, + { + "epoch": 7.28, + "grad_norm": 0.68359375, + "learning_rate": 0.00043085207208272105, + "loss": 0.2592, + "step": 175770 + }, + { + "epoch": 7.28, + "grad_norm": 0.97265625, + "learning_rate": 0.0004308445841678686, + "loss": 0.2011, + "step": 175780 + }, + { + "epoch": 7.28, + "grad_norm": 1.453125, + "learning_rate": 0.00043083709591268615, + "loss": 0.2389, + "step": 175790 + }, + { + "epoch": 7.28, + "grad_norm": 1.265625, + "learning_rate": 0.0004308296073171879, + "loss": 0.2246, + "step": 175800 + }, + { + "epoch": 7.28, + "grad_norm": 2.34375, + "learning_rate": 0.00043082211838138773, + "loss": 0.226, + "step": 175810 + }, + { + "epoch": 7.28, + "grad_norm": 1.0546875, + "learning_rate": 0.0004308146291053, + "loss": 0.1872, + "step": 175820 + }, + { + "epoch": 7.28, + "grad_norm": 0.462890625, + "learning_rate": 0.0004308071394889386, + "loss": 0.1489, + "step": 175830 + }, + { + "epoch": 7.28, + "grad_norm": 0.62890625, + "learning_rate": 0.00043079964953231763, + "loss": 0.209, + "step": 175840 + }, + { + "epoch": 7.28, + "grad_norm": 0.7890625, + "learning_rate": 0.00043079215923545137, + "loss": 0.2171, + "step": 175850 + }, + { + "epoch": 7.28, + "grad_norm": 0.4296875, + "learning_rate": 0.00043078466859835374, + "loss": 0.2348, + "step": 175860 + }, + { + "epoch": 7.28, + "grad_norm": 1.3671875, + "learning_rate": 0.00043077717762103883, + "loss": 0.2348, + "step": 175870 + }, + { + "epoch": 7.28, + "grad_norm": 0.83984375, + "learning_rate": 0.00043076968630352086, + "loss": 0.2196, + "step": 175880 + }, + { + "epoch": 7.29, + "grad_norm": 0.4921875, + "learning_rate": 0.0004307621946458138, + "loss": 0.2053, + "step": 175890 + }, + { + "epoch": 7.29, + "grad_norm": 1.140625, + "learning_rate": 0.0004307547026479318, + "loss": 0.2287, + "step": 175900 + }, + { + "epoch": 7.29, + "grad_norm": 0.69140625, + "learning_rate": 0.00043074721030988907, + "loss": 0.1811, + "step": 175910 + }, + { + "epoch": 7.29, + "grad_norm": 0.71875, + "learning_rate": 0.0004307397176316996, + "loss": 0.2272, + "step": 175920 + }, + { + "epoch": 7.29, + "grad_norm": 0.578125, + "learning_rate": 0.0004307322246133774, + "loss": 0.245, + "step": 175930 + }, + { + "epoch": 7.29, + "grad_norm": 0.58984375, + "learning_rate": 0.0004307247312549367, + "loss": 0.2209, + "step": 175940 + }, + { + "epoch": 7.29, + "grad_norm": 0.625, + "learning_rate": 0.00043071723755639166, + "loss": 0.1626, + "step": 175950 + }, + { + "epoch": 7.29, + "grad_norm": 0.50390625, + "learning_rate": 0.0004307097435177562, + "loss": 0.2245, + "step": 175960 + }, + { + "epoch": 7.29, + "grad_norm": 1.6171875, + "learning_rate": 0.00043070224913904456, + "loss": 0.2151, + "step": 175970 + }, + { + "epoch": 7.29, + "grad_norm": 1.203125, + "learning_rate": 0.0004306947544202708, + "loss": 0.1756, + "step": 175980 + }, + { + "epoch": 7.29, + "grad_norm": 0.67578125, + "learning_rate": 0.00043068725936144906, + "loss": 0.2138, + "step": 175990 + }, + { + "epoch": 7.29, + "grad_norm": 0.703125, + "learning_rate": 0.00043067976396259333, + "loss": 0.2501, + "step": 176000 + }, + { + "epoch": 7.29, + "grad_norm": 0.65625, + "learning_rate": 0.00043067226822371787, + "loss": 0.1814, + "step": 176010 + }, + { + "epoch": 7.29, + "grad_norm": 0.64453125, + "learning_rate": 0.0004306647721448366, + "loss": 0.1694, + "step": 176020 + }, + { + "epoch": 7.29, + "grad_norm": 0.5625, + "learning_rate": 0.00043065727572596385, + "loss": 0.1597, + "step": 176030 + }, + { + "epoch": 7.29, + "grad_norm": 1.328125, + "learning_rate": 0.0004306497789671136, + "loss": 0.2517, + "step": 176040 + }, + { + "epoch": 7.29, + "grad_norm": 0.55859375, + "learning_rate": 0.0004306422818682999, + "loss": 0.1765, + "step": 176050 + }, + { + "epoch": 7.29, + "grad_norm": 0.72265625, + "learning_rate": 0.000430634784429537, + "loss": 0.2434, + "step": 176060 + }, + { + "epoch": 7.29, + "grad_norm": 0.46484375, + "learning_rate": 0.00043062728665083897, + "loss": 0.2016, + "step": 176070 + }, + { + "epoch": 7.29, + "grad_norm": 0.32421875, + "learning_rate": 0.00043061978853221986, + "loss": 0.2343, + "step": 176080 + }, + { + "epoch": 7.29, + "grad_norm": 0.640625, + "learning_rate": 0.0004306122900736938, + "loss": 0.226, + "step": 176090 + }, + { + "epoch": 7.29, + "grad_norm": 0.51171875, + "learning_rate": 0.0004306047912752749, + "loss": 0.1349, + "step": 176100 + }, + { + "epoch": 7.29, + "grad_norm": 0.66015625, + "learning_rate": 0.00043059729213697727, + "loss": 0.2053, + "step": 176110 + }, + { + "epoch": 7.29, + "grad_norm": 0.80078125, + "learning_rate": 0.00043058979265881515, + "loss": 0.2125, + "step": 176120 + }, + { + "epoch": 7.3, + "grad_norm": 0.625, + "learning_rate": 0.00043058229284080243, + "loss": 0.1539, + "step": 176130 + }, + { + "epoch": 7.3, + "grad_norm": 0.9296875, + "learning_rate": 0.00043057479268295343, + "loss": 0.1978, + "step": 176140 + }, + { + "epoch": 7.3, + "grad_norm": 0.72265625, + "learning_rate": 0.00043056729218528207, + "loss": 0.2142, + "step": 176150 + }, + { + "epoch": 7.3, + "grad_norm": 0.8046875, + "learning_rate": 0.00043055979134780267, + "loss": 0.176, + "step": 176160 + }, + { + "epoch": 7.3, + "grad_norm": 0.3125, + "learning_rate": 0.0004305522901705291, + "loss": 0.2236, + "step": 176170 + }, + { + "epoch": 7.3, + "grad_norm": 0.609375, + "learning_rate": 0.00043054478865347574, + "loss": 0.1734, + "step": 176180 + }, + { + "epoch": 7.3, + "grad_norm": 1.125, + "learning_rate": 0.00043053728679665656, + "loss": 0.2183, + "step": 176190 + }, + { + "epoch": 7.3, + "grad_norm": 0.71484375, + "learning_rate": 0.00043052978460008566, + "loss": 0.2513, + "step": 176200 + }, + { + "epoch": 7.3, + "grad_norm": 1.265625, + "learning_rate": 0.00043052228206377725, + "loss": 0.2082, + "step": 176210 + }, + { + "epoch": 7.3, + "grad_norm": 0.400390625, + "learning_rate": 0.0004305147791877454, + "loss": 0.2114, + "step": 176220 + }, + { + "epoch": 7.3, + "grad_norm": 0.61328125, + "learning_rate": 0.00043050727597200415, + "loss": 0.1879, + "step": 176230 + }, + { + "epoch": 7.3, + "grad_norm": 0.291015625, + "learning_rate": 0.0004304997724165678, + "loss": 0.1899, + "step": 176240 + }, + { + "epoch": 7.3, + "grad_norm": 0.609375, + "learning_rate": 0.0004304922685214503, + "loss": 0.1766, + "step": 176250 + }, + { + "epoch": 7.3, + "grad_norm": 0.7421875, + "learning_rate": 0.0004304847642866659, + "loss": 0.1593, + "step": 176260 + }, + { + "epoch": 7.3, + "grad_norm": 0.80078125, + "learning_rate": 0.00043047725971222863, + "loss": 0.2212, + "step": 176270 + }, + { + "epoch": 7.3, + "grad_norm": 1.1328125, + "learning_rate": 0.00043046975479815264, + "loss": 0.2273, + "step": 176280 + }, + { + "epoch": 7.3, + "grad_norm": 0.53125, + "learning_rate": 0.0004304622495444521, + "loss": 0.236, + "step": 176290 + }, + { + "epoch": 7.3, + "grad_norm": 0.376953125, + "learning_rate": 0.000430454743951141, + "loss": 0.2307, + "step": 176300 + }, + { + "epoch": 7.3, + "grad_norm": 1.84375, + "learning_rate": 0.00043044723801823374, + "loss": 0.1974, + "step": 176310 + }, + { + "epoch": 7.3, + "grad_norm": 0.87109375, + "learning_rate": 0.0004304397317457441, + "loss": 0.1987, + "step": 176320 + }, + { + "epoch": 7.3, + "grad_norm": 1.0625, + "learning_rate": 0.0004304322251336864, + "loss": 0.2146, + "step": 176330 + }, + { + "epoch": 7.3, + "grad_norm": 0.439453125, + "learning_rate": 0.00043042471818207473, + "loss": 0.2323, + "step": 176340 + }, + { + "epoch": 7.3, + "grad_norm": 1.3203125, + "learning_rate": 0.0004304172108909233, + "loss": 0.2191, + "step": 176350 + }, + { + "epoch": 7.3, + "grad_norm": 0.55078125, + "learning_rate": 0.0004304097032602461, + "loss": 0.2369, + "step": 176360 + }, + { + "epoch": 7.31, + "grad_norm": 1.125, + "learning_rate": 0.0004304021952900574, + "loss": 0.2184, + "step": 176370 + }, + { + "epoch": 7.31, + "grad_norm": 0.6015625, + "learning_rate": 0.0004303946869803711, + "loss": 0.2028, + "step": 176380 + }, + { + "epoch": 7.31, + "grad_norm": 0.84765625, + "learning_rate": 0.0004303871783312016, + "loss": 0.1714, + "step": 176390 + }, + { + "epoch": 7.31, + "grad_norm": 1.046875, + "learning_rate": 0.0004303796693425629, + "loss": 0.1925, + "step": 176400 + }, + { + "epoch": 7.31, + "grad_norm": 0.53125, + "learning_rate": 0.0004303721600144691, + "loss": 0.1829, + "step": 176410 + }, + { + "epoch": 7.31, + "grad_norm": 1.375, + "learning_rate": 0.0004303646503469344, + "loss": 0.2273, + "step": 176420 + }, + { + "epoch": 7.31, + "grad_norm": 1.265625, + "learning_rate": 0.00043035714033997285, + "loss": 0.2138, + "step": 176430 + }, + { + "epoch": 7.31, + "grad_norm": 0.400390625, + "learning_rate": 0.00043034962999359875, + "loss": 0.2122, + "step": 176440 + }, + { + "epoch": 7.31, + "grad_norm": 0.92578125, + "learning_rate": 0.000430342119307826, + "loss": 0.1821, + "step": 176450 + }, + { + "epoch": 7.31, + "grad_norm": 1.1640625, + "learning_rate": 0.00043033460828266896, + "loss": 0.2102, + "step": 176460 + }, + { + "epoch": 7.31, + "grad_norm": 0.50390625, + "learning_rate": 0.00043032709691814163, + "loss": 0.1934, + "step": 176470 + }, + { + "epoch": 7.31, + "grad_norm": 1.03125, + "learning_rate": 0.00043031958521425815, + "loss": 0.2045, + "step": 176480 + }, + { + "epoch": 7.31, + "grad_norm": 0.74609375, + "learning_rate": 0.00043031207317103273, + "loss": 0.1973, + "step": 176490 + }, + { + "epoch": 7.31, + "grad_norm": 0.5703125, + "learning_rate": 0.0004303045607884794, + "loss": 0.187, + "step": 176500 + }, + { + "epoch": 7.31, + "grad_norm": 0.62890625, + "learning_rate": 0.0004302970480666124, + "loss": 0.2193, + "step": 176510 + }, + { + "epoch": 7.31, + "grad_norm": 0.640625, + "learning_rate": 0.00043028953500544575, + "loss": 0.2134, + "step": 176520 + }, + { + "epoch": 7.31, + "grad_norm": 0.828125, + "learning_rate": 0.00043028202160499377, + "loss": 0.1964, + "step": 176530 + }, + { + "epoch": 7.31, + "grad_norm": 1.25, + "learning_rate": 0.0004302745078652704, + "loss": 0.2042, + "step": 176540 + }, + { + "epoch": 7.31, + "grad_norm": 0.99609375, + "learning_rate": 0.00043026699378628997, + "loss": 0.2083, + "step": 176550 + }, + { + "epoch": 7.31, + "grad_norm": 0.3515625, + "learning_rate": 0.0004302594793680665, + "loss": 0.1938, + "step": 176560 + }, + { + "epoch": 7.31, + "grad_norm": 0.87890625, + "learning_rate": 0.00043025196461061413, + "loss": 0.1832, + "step": 176570 + }, + { + "epoch": 7.31, + "grad_norm": 0.376953125, + "learning_rate": 0.00043024444951394704, + "loss": 0.2251, + "step": 176580 + }, + { + "epoch": 7.31, + "grad_norm": 0.5234375, + "learning_rate": 0.0004302369340780794, + "loss": 0.215, + "step": 176590 + }, + { + "epoch": 7.31, + "grad_norm": 0.6328125, + "learning_rate": 0.0004302294183030252, + "loss": 0.1612, + "step": 176600 + }, + { + "epoch": 7.32, + "grad_norm": 0.3828125, + "learning_rate": 0.0004302219021887988, + "loss": 0.2005, + "step": 176610 + }, + { + "epoch": 7.32, + "grad_norm": 0.470703125, + "learning_rate": 0.0004302143857354142, + "loss": 0.1948, + "step": 176620 + }, + { + "epoch": 7.32, + "grad_norm": 1.0625, + "learning_rate": 0.0004302068689428856, + "loss": 0.1944, + "step": 176630 + }, + { + "epoch": 7.32, + "grad_norm": 2.125, + "learning_rate": 0.00043019935181122716, + "loss": 0.2081, + "step": 176640 + }, + { + "epoch": 7.32, + "grad_norm": 0.6953125, + "learning_rate": 0.00043019183434045297, + "loss": 0.1902, + "step": 176650 + }, + { + "epoch": 7.32, + "grad_norm": 0.8359375, + "learning_rate": 0.0004301843165305772, + "loss": 0.2264, + "step": 176660 + }, + { + "epoch": 7.32, + "grad_norm": 0.8828125, + "learning_rate": 0.00043017679838161407, + "loss": 0.2213, + "step": 176670 + }, + { + "epoch": 7.32, + "grad_norm": 0.7578125, + "learning_rate": 0.00043016927989357755, + "loss": 0.196, + "step": 176680 + }, + { + "epoch": 7.32, + "grad_norm": 1.2265625, + "learning_rate": 0.00043016176106648195, + "loss": 0.1906, + "step": 176690 + }, + { + "epoch": 7.32, + "grad_norm": 1.4609375, + "learning_rate": 0.00043015424190034137, + "loss": 0.2234, + "step": 176700 + }, + { + "epoch": 7.32, + "grad_norm": 0.0012054443359375, + "learning_rate": 0.00043014672239517, + "loss": 0.132, + "step": 176710 + }, + { + "epoch": 7.32, + "grad_norm": 0.796875, + "learning_rate": 0.0004301392025509819, + "loss": 0.2325, + "step": 176720 + }, + { + "epoch": 7.32, + "grad_norm": 0.6640625, + "learning_rate": 0.00043013168236779133, + "loss": 0.1902, + "step": 176730 + }, + { + "epoch": 7.32, + "grad_norm": 0.8984375, + "learning_rate": 0.0004301241618456123, + "loss": 0.1944, + "step": 176740 + }, + { + "epoch": 7.32, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004301166409844591, + "loss": 0.2404, + "step": 176750 + }, + { + "epoch": 7.32, + "grad_norm": 0.119140625, + "learning_rate": 0.0004301091197843459, + "loss": 0.1749, + "step": 176760 + }, + { + "epoch": 7.32, + "grad_norm": 0.78515625, + "learning_rate": 0.0004301015982452867, + "loss": 0.1974, + "step": 176770 + }, + { + "epoch": 7.32, + "grad_norm": 0.87109375, + "learning_rate": 0.0004300940763672957, + "loss": 0.1915, + "step": 176780 + }, + { + "epoch": 7.32, + "grad_norm": 0.6328125, + "learning_rate": 0.00043008655415038724, + "loss": 0.1922, + "step": 176790 + }, + { + "epoch": 7.32, + "grad_norm": 0.98046875, + "learning_rate": 0.00043007903159457525, + "loss": 0.1751, + "step": 176800 + }, + { + "epoch": 7.32, + "grad_norm": 1.2734375, + "learning_rate": 0.0004300715086998739, + "loss": 0.2095, + "step": 176810 + }, + { + "epoch": 7.32, + "grad_norm": 0.21875, + "learning_rate": 0.00043006398546629755, + "loss": 0.2183, + "step": 176820 + }, + { + "epoch": 7.32, + "grad_norm": 0.71875, + "learning_rate": 0.0004300564618938602, + "loss": 0.1722, + "step": 176830 + }, + { + "epoch": 7.32, + "grad_norm": 0.75, + "learning_rate": 0.00043004893798257593, + "loss": 0.195, + "step": 176840 + }, + { + "epoch": 7.33, + "grad_norm": 0.82421875, + "learning_rate": 0.0004300414137324591, + "loss": 0.2567, + "step": 176850 + }, + { + "epoch": 7.33, + "grad_norm": 0.51953125, + "learning_rate": 0.0004300338891435237, + "loss": 0.1738, + "step": 176860 + }, + { + "epoch": 7.33, + "grad_norm": 0.38671875, + "learning_rate": 0.000430026364215784, + "loss": 0.1984, + "step": 176870 + }, + { + "epoch": 7.33, + "grad_norm": 0.46875, + "learning_rate": 0.0004300188389492542, + "loss": 0.1745, + "step": 176880 + }, + { + "epoch": 7.33, + "grad_norm": 0.66796875, + "learning_rate": 0.0004300113133439483, + "loss": 0.2076, + "step": 176890 + }, + { + "epoch": 7.33, + "grad_norm": 0.63671875, + "learning_rate": 0.0004300037873998806, + "loss": 0.2292, + "step": 176900 + }, + { + "epoch": 7.33, + "grad_norm": 0.796875, + "learning_rate": 0.0004299962611170651, + "loss": 0.2015, + "step": 176910 + }, + { + "epoch": 7.33, + "grad_norm": 1.21875, + "learning_rate": 0.00042998873449551613, + "loss": 0.2359, + "step": 176920 + }, + { + "epoch": 7.33, + "grad_norm": 0.625, + "learning_rate": 0.00042998120753524785, + "loss": 0.1966, + "step": 176930 + }, + { + "epoch": 7.33, + "grad_norm": 0.81640625, + "learning_rate": 0.0004299736802362743, + "loss": 0.152, + "step": 176940 + }, + { + "epoch": 7.33, + "grad_norm": 0.5390625, + "learning_rate": 0.0004299661525986098, + "loss": 0.151, + "step": 176950 + }, + { + "epoch": 7.33, + "grad_norm": 0.7109375, + "learning_rate": 0.0004299586246222684, + "loss": 0.2236, + "step": 176960 + }, + { + "epoch": 7.33, + "grad_norm": 0.8125, + "learning_rate": 0.0004299510963072643, + "loss": 0.1141, + "step": 176970 + }, + { + "epoch": 7.33, + "grad_norm": 0.8984375, + "learning_rate": 0.00042994356765361163, + "loss": 0.222, + "step": 176980 + }, + { + "epoch": 7.33, + "grad_norm": 0.474609375, + "learning_rate": 0.00042993603866132464, + "loss": 0.1786, + "step": 176990 + }, + { + "epoch": 7.33, + "grad_norm": 0.39453125, + "learning_rate": 0.00042992850933041746, + "loss": 0.1996, + "step": 177000 + }, + { + "epoch": 7.33, + "grad_norm": 0.58203125, + "learning_rate": 0.00042992097966090424, + "loss": 0.1846, + "step": 177010 + }, + { + "epoch": 7.33, + "grad_norm": 0.6640625, + "learning_rate": 0.0004299134496527992, + "loss": 0.2283, + "step": 177020 + }, + { + "epoch": 7.33, + "grad_norm": 0.7421875, + "learning_rate": 0.00042990591930611643, + "loss": 0.1886, + "step": 177030 + }, + { + "epoch": 7.33, + "grad_norm": 0.63671875, + "learning_rate": 0.00042989838862087014, + "loss": 0.1205, + "step": 177040 + }, + { + "epoch": 7.33, + "grad_norm": 0.5234375, + "learning_rate": 0.00042989085759707456, + "loss": 0.1823, + "step": 177050 + }, + { + "epoch": 7.33, + "grad_norm": 0.59375, + "learning_rate": 0.0004298833262347438, + "loss": 0.2019, + "step": 177060 + }, + { + "epoch": 7.33, + "grad_norm": 1.8828125, + "learning_rate": 0.00042987579453389205, + "loss": 0.2228, + "step": 177070 + }, + { + "epoch": 7.33, + "grad_norm": 0.89453125, + "learning_rate": 0.0004298682624945334, + "loss": 0.1935, + "step": 177080 + }, + { + "epoch": 7.34, + "grad_norm": 0.6953125, + "learning_rate": 0.0004298607301166822, + "loss": 0.2488, + "step": 177090 + }, + { + "epoch": 7.34, + "grad_norm": 0.5234375, + "learning_rate": 0.00042985319740035247, + "loss": 0.1819, + "step": 177100 + }, + { + "epoch": 7.34, + "grad_norm": 0.62109375, + "learning_rate": 0.0004298456643455585, + "loss": 0.2029, + "step": 177110 + }, + { + "epoch": 7.34, + "grad_norm": 0.54296875, + "learning_rate": 0.0004298381309523144, + "loss": 0.2118, + "step": 177120 + }, + { + "epoch": 7.34, + "grad_norm": 1.2421875, + "learning_rate": 0.00042983059722063435, + "loss": 0.2005, + "step": 177130 + }, + { + "epoch": 7.34, + "grad_norm": 1.0078125, + "learning_rate": 0.0004298230631505325, + "loss": 0.208, + "step": 177140 + }, + { + "epoch": 7.34, + "grad_norm": 0.4609375, + "learning_rate": 0.0004298155287420231, + "loss": 0.2013, + "step": 177150 + }, + { + "epoch": 7.34, + "grad_norm": 0.41015625, + "learning_rate": 0.00042980799399512025, + "loss": 0.2173, + "step": 177160 + }, + { + "epoch": 7.34, + "grad_norm": 0.76953125, + "learning_rate": 0.00042980045890983823, + "loss": 0.2235, + "step": 177170 + }, + { + "epoch": 7.34, + "grad_norm": 1.1796875, + "learning_rate": 0.00042979292348619113, + "loss": 0.1869, + "step": 177180 + }, + { + "epoch": 7.34, + "grad_norm": 0.828125, + "learning_rate": 0.00042978538772419316, + "loss": 0.2765, + "step": 177190 + }, + { + "epoch": 7.34, + "grad_norm": 0.28515625, + "learning_rate": 0.0004297778516238585, + "loss": 0.2393, + "step": 177200 + }, + { + "epoch": 7.34, + "grad_norm": 0.81640625, + "learning_rate": 0.0004297703151852014, + "loss": 0.207, + "step": 177210 + }, + { + "epoch": 7.34, + "grad_norm": 0.70703125, + "learning_rate": 0.00042976277840823596, + "loss": 0.191, + "step": 177220 + }, + { + "epoch": 7.34, + "grad_norm": 0.859375, + "learning_rate": 0.0004297552412929764, + "loss": 0.2254, + "step": 177230 + }, + { + "epoch": 7.34, + "grad_norm": 0.67578125, + "learning_rate": 0.0004297477038394368, + "loss": 0.237, + "step": 177240 + }, + { + "epoch": 7.34, + "grad_norm": 0.69140625, + "learning_rate": 0.00042974016604763144, + "loss": 0.1683, + "step": 177250 + }, + { + "epoch": 7.34, + "grad_norm": 0.53125, + "learning_rate": 0.0004297326279175746, + "loss": 0.2277, + "step": 177260 + }, + { + "epoch": 7.34, + "grad_norm": 0.765625, + "learning_rate": 0.0004297250894492803, + "loss": 0.1567, + "step": 177270 + }, + { + "epoch": 7.34, + "grad_norm": 0.32421875, + "learning_rate": 0.0004297175506427628, + "loss": 0.1954, + "step": 177280 + }, + { + "epoch": 7.34, + "grad_norm": 1.0859375, + "learning_rate": 0.0004297100114980362, + "loss": 0.2325, + "step": 177290 + }, + { + "epoch": 7.34, + "grad_norm": 0.2294921875, + "learning_rate": 0.00042970247201511494, + "loss": 0.2125, + "step": 177300 + }, + { + "epoch": 7.34, + "grad_norm": 0.65625, + "learning_rate": 0.0004296949321940129, + "loss": 0.2086, + "step": 177310 + }, + { + "epoch": 7.34, + "grad_norm": 1.328125, + "learning_rate": 0.00042968739203474445, + "loss": 0.1625, + "step": 177320 + }, + { + "epoch": 7.34, + "grad_norm": 0.470703125, + "learning_rate": 0.0004296798515373237, + "loss": 0.1756, + "step": 177330 + }, + { + "epoch": 7.35, + "grad_norm": 1.0703125, + "learning_rate": 0.00042967231070176486, + "loss": 0.2234, + "step": 177340 + }, + { + "epoch": 7.35, + "grad_norm": 0.93359375, + "learning_rate": 0.0004296647695280822, + "loss": 0.2511, + "step": 177350 + }, + { + "epoch": 7.35, + "grad_norm": 0.87890625, + "learning_rate": 0.00042965722801628983, + "loss": 0.1935, + "step": 177360 + }, + { + "epoch": 7.35, + "grad_norm": 0.5234375, + "learning_rate": 0.0004296496861664019, + "loss": 0.224, + "step": 177370 + }, + { + "epoch": 7.35, + "grad_norm": 0.63671875, + "learning_rate": 0.0004296421439784327, + "loss": 0.1642, + "step": 177380 + }, + { + "epoch": 7.35, + "grad_norm": 0.421875, + "learning_rate": 0.00042963460145239643, + "loss": 0.1832, + "step": 177390 + }, + { + "epoch": 7.35, + "grad_norm": 1.4296875, + "learning_rate": 0.0004296270585883072, + "loss": 0.2044, + "step": 177400 + }, + { + "epoch": 7.35, + "grad_norm": 0.333984375, + "learning_rate": 0.00042961951538617916, + "loss": 0.1907, + "step": 177410 + }, + { + "epoch": 7.35, + "grad_norm": 0.796875, + "learning_rate": 0.00042961197184602666, + "loss": 0.2078, + "step": 177420 + }, + { + "epoch": 7.35, + "grad_norm": 0.8125, + "learning_rate": 0.00042960442796786394, + "loss": 0.2515, + "step": 177430 + }, + { + "epoch": 7.35, + "grad_norm": 0.8125, + "learning_rate": 0.0004295968837517049, + "loss": 0.1965, + "step": 177440 + }, + { + "epoch": 7.35, + "grad_norm": 1.140625, + "learning_rate": 0.000429589339197564, + "loss": 0.2285, + "step": 177450 + }, + { + "epoch": 7.35, + "grad_norm": 0.828125, + "learning_rate": 0.00042958179430545535, + "loss": 0.2252, + "step": 177460 + }, + { + "epoch": 7.35, + "grad_norm": 1.8125, + "learning_rate": 0.0004295742490753932, + "loss": 0.193, + "step": 177470 + }, + { + "epoch": 7.35, + "grad_norm": 0.52734375, + "learning_rate": 0.00042956670350739165, + "loss": 0.1992, + "step": 177480 + }, + { + "epoch": 7.35, + "grad_norm": 0.69921875, + "learning_rate": 0.0004295591576014649, + "loss": 0.1523, + "step": 177490 + }, + { + "epoch": 7.35, + "grad_norm": 0.3984375, + "learning_rate": 0.0004295516113576273, + "loss": 0.1825, + "step": 177500 + }, + { + "epoch": 7.35, + "grad_norm": 1.046875, + "learning_rate": 0.00042954406477589295, + "loss": 0.2087, + "step": 177510 + }, + { + "epoch": 7.35, + "grad_norm": 0.7109375, + "learning_rate": 0.000429536517856276, + "loss": 0.185, + "step": 177520 + }, + { + "epoch": 7.35, + "grad_norm": 0.30078125, + "learning_rate": 0.00042952897059879083, + "loss": 0.2602, + "step": 177530 + }, + { + "epoch": 7.35, + "grad_norm": 0.640625, + "learning_rate": 0.0004295214230034514, + "loss": 0.173, + "step": 177540 + }, + { + "epoch": 7.35, + "grad_norm": 0.25390625, + "learning_rate": 0.0004295138750702721, + "loss": 0.1846, + "step": 177550 + }, + { + "epoch": 7.35, + "grad_norm": 2.046875, + "learning_rate": 0.000429506326799267, + "loss": 0.1767, + "step": 177560 + }, + { + "epoch": 7.35, + "grad_norm": 0.73828125, + "learning_rate": 0.00042949877819045045, + "loss": 0.2098, + "step": 177570 + }, + { + "epoch": 7.36, + "grad_norm": 0.455078125, + "learning_rate": 0.0004294912292438366, + "loss": 0.1839, + "step": 177580 + }, + { + "epoch": 7.36, + "grad_norm": 0.423828125, + "learning_rate": 0.0004294836799594396, + "loss": 0.1926, + "step": 177590 + }, + { + "epoch": 7.36, + "grad_norm": 0.78515625, + "learning_rate": 0.0004294761303372737, + "loss": 0.2103, + "step": 177600 + }, + { + "epoch": 7.36, + "grad_norm": 0.75, + "learning_rate": 0.0004294685803773531, + "loss": 0.1814, + "step": 177610 + }, + { + "epoch": 7.36, + "grad_norm": 0.60546875, + "learning_rate": 0.00042946103007969206, + "loss": 0.2371, + "step": 177620 + }, + { + "epoch": 7.36, + "grad_norm": 0.5625, + "learning_rate": 0.0004294534794443047, + "loss": 0.2406, + "step": 177630 + }, + { + "epoch": 7.36, + "grad_norm": 0.9296875, + "learning_rate": 0.00042944592847120533, + "loss": 0.2293, + "step": 177640 + }, + { + "epoch": 7.36, + "grad_norm": 0.984375, + "learning_rate": 0.000429438377160408, + "loss": 0.2061, + "step": 177650 + }, + { + "epoch": 7.36, + "grad_norm": 0.515625, + "learning_rate": 0.00042943082551192706, + "loss": 0.2039, + "step": 177660 + }, + { + "epoch": 7.36, + "grad_norm": 0.52734375, + "learning_rate": 0.0004294232735257767, + "loss": 0.2711, + "step": 177670 + }, + { + "epoch": 7.36, + "grad_norm": 0.47265625, + "learning_rate": 0.00042941572120197113, + "loss": 0.2434, + "step": 177680 + }, + { + "epoch": 7.36, + "grad_norm": 0.609375, + "learning_rate": 0.0004294081685405246, + "loss": 0.1856, + "step": 177690 + }, + { + "epoch": 7.36, + "grad_norm": 0.0, + "learning_rate": 0.00042940061554145117, + "loss": 0.2047, + "step": 177700 + }, + { + "epoch": 7.36, + "grad_norm": 0.95703125, + "learning_rate": 0.00042939306220476513, + "loss": 0.2269, + "step": 177710 + }, + { + "epoch": 7.36, + "grad_norm": 1.3671875, + "learning_rate": 0.0004293855085304808, + "loss": 0.183, + "step": 177720 + }, + { + "epoch": 7.36, + "grad_norm": 0.64453125, + "learning_rate": 0.00042937795451861225, + "loss": 0.1804, + "step": 177730 + }, + { + "epoch": 7.36, + "grad_norm": 1.7109375, + "learning_rate": 0.00042937040016917383, + "loss": 0.2416, + "step": 177740 + }, + { + "epoch": 7.36, + "grad_norm": 0.7421875, + "learning_rate": 0.00042936284548217963, + "loss": 0.196, + "step": 177750 + }, + { + "epoch": 7.36, + "grad_norm": 0.8984375, + "learning_rate": 0.00042935529045764396, + "loss": 0.1828, + "step": 177760 + }, + { + "epoch": 7.36, + "grad_norm": 0.44921875, + "learning_rate": 0.000429347735095581, + "loss": 0.189, + "step": 177770 + }, + { + "epoch": 7.36, + "grad_norm": 0.921875, + "learning_rate": 0.00042934017939600495, + "loss": 0.2014, + "step": 177780 + }, + { + "epoch": 7.36, + "grad_norm": 0.32421875, + "learning_rate": 0.0004293326233589301, + "loss": 0.1514, + "step": 177790 + }, + { + "epoch": 7.36, + "grad_norm": 0.78125, + "learning_rate": 0.00042932506698437057, + "loss": 0.2286, + "step": 177800 + }, + { + "epoch": 7.36, + "grad_norm": 0.396484375, + "learning_rate": 0.0004293175102723406, + "loss": 0.2165, + "step": 177810 + }, + { + "epoch": 7.37, + "grad_norm": 0.7734375, + "learning_rate": 0.0004293099532228545, + "loss": 0.2092, + "step": 177820 + }, + { + "epoch": 7.37, + "grad_norm": 1.3046875, + "learning_rate": 0.00042930239583592643, + "loss": 0.2093, + "step": 177830 + }, + { + "epoch": 7.37, + "grad_norm": 0.94140625, + "learning_rate": 0.00042929483811157053, + "loss": 0.2208, + "step": 177840 + }, + { + "epoch": 7.37, + "grad_norm": 0.52734375, + "learning_rate": 0.0004292872800498012, + "loss": 0.2162, + "step": 177850 + }, + { + "epoch": 7.37, + "grad_norm": 0.7578125, + "learning_rate": 0.00042927972165063256, + "loss": 0.2686, + "step": 177860 + }, + { + "epoch": 7.37, + "grad_norm": 1.0078125, + "learning_rate": 0.00042927216291407884, + "loss": 0.1617, + "step": 177870 + }, + { + "epoch": 7.37, + "grad_norm": 1.921875, + "learning_rate": 0.00042926460384015426, + "loss": 0.2461, + "step": 177880 + }, + { + "epoch": 7.37, + "grad_norm": 0.91796875, + "learning_rate": 0.000429257044428873, + "loss": 0.2058, + "step": 177890 + }, + { + "epoch": 7.37, + "grad_norm": 1.2578125, + "learning_rate": 0.0004292494846802494, + "loss": 0.2556, + "step": 177900 + }, + { + "epoch": 7.37, + "grad_norm": 1.3828125, + "learning_rate": 0.0004292419245942977, + "loss": 0.2146, + "step": 177910 + }, + { + "epoch": 7.37, + "grad_norm": 0.9453125, + "learning_rate": 0.0004292343641710319, + "loss": 0.1858, + "step": 177920 + }, + { + "epoch": 7.37, + "grad_norm": 0.46875, + "learning_rate": 0.0004292268034104665, + "loss": 0.1771, + "step": 177930 + }, + { + "epoch": 7.37, + "grad_norm": 0.09423828125, + "learning_rate": 0.00042921924231261555, + "loss": 0.2196, + "step": 177940 + }, + { + "epoch": 7.37, + "grad_norm": 1.125, + "learning_rate": 0.0004292116808774934, + "loss": 0.2175, + "step": 177950 + }, + { + "epoch": 7.37, + "grad_norm": 0.9765625, + "learning_rate": 0.00042920411910511415, + "loss": 0.2248, + "step": 177960 + }, + { + "epoch": 7.37, + "grad_norm": 0.80859375, + "learning_rate": 0.0004291965569954921, + "loss": 0.2376, + "step": 177970 + }, + { + "epoch": 7.37, + "grad_norm": 0.921875, + "learning_rate": 0.0004291889945486415, + "loss": 0.1868, + "step": 177980 + }, + { + "epoch": 7.37, + "grad_norm": 1.9609375, + "learning_rate": 0.00042918143176457656, + "loss": 0.2168, + "step": 177990 + }, + { + "epoch": 7.37, + "grad_norm": 0.703125, + "learning_rate": 0.00042917386864331156, + "loss": 0.1565, + "step": 178000 + }, + { + "epoch": 7.37, + "grad_norm": 0.71484375, + "learning_rate": 0.00042916630518486065, + "loss": 0.1848, + "step": 178010 + }, + { + "epoch": 7.37, + "grad_norm": 0.60546875, + "learning_rate": 0.0004291587413892381, + "loss": 0.21, + "step": 178020 + }, + { + "epoch": 7.37, + "grad_norm": 0.6796875, + "learning_rate": 0.00042915117725645817, + "loss": 0.2075, + "step": 178030 + }, + { + "epoch": 7.37, + "grad_norm": 0.56640625, + "learning_rate": 0.000429143612786535, + "loss": 0.1766, + "step": 178040 + }, + { + "epoch": 7.37, + "grad_norm": 0.625, + "learning_rate": 0.000429136047979483, + "loss": 0.2278, + "step": 178050 + }, + { + "epoch": 7.38, + "grad_norm": 0.93359375, + "learning_rate": 0.00042912848283531625, + "loss": 0.2, + "step": 178060 + }, + { + "epoch": 7.38, + "grad_norm": 0.92578125, + "learning_rate": 0.00042912091735404907, + "loss": 0.1788, + "step": 178070 + }, + { + "epoch": 7.38, + "grad_norm": 0.7578125, + "learning_rate": 0.00042911335153569564, + "loss": 0.2223, + "step": 178080 + }, + { + "epoch": 7.38, + "grad_norm": 0.5390625, + "learning_rate": 0.00042910578538027025, + "loss": 0.1886, + "step": 178090 + }, + { + "epoch": 7.38, + "grad_norm": 0.68359375, + "learning_rate": 0.00042909821888778706, + "loss": 0.1865, + "step": 178100 + }, + { + "epoch": 7.38, + "grad_norm": 0.5859375, + "learning_rate": 0.0004290906520582604, + "loss": 0.2406, + "step": 178110 + }, + { + "epoch": 7.38, + "grad_norm": 0.73046875, + "learning_rate": 0.0004290830848917044, + "loss": 0.1732, + "step": 178120 + }, + { + "epoch": 7.38, + "grad_norm": 0.625, + "learning_rate": 0.00042907551738813343, + "loss": 0.1857, + "step": 178130 + }, + { + "epoch": 7.38, + "grad_norm": 1.2578125, + "learning_rate": 0.0004290679495475617, + "loss": 0.1922, + "step": 178140 + }, + { + "epoch": 7.38, + "grad_norm": 0.99609375, + "learning_rate": 0.0004290603813700035, + "loss": 0.2299, + "step": 178150 + }, + { + "epoch": 7.38, + "grad_norm": 0.43359375, + "learning_rate": 0.00042905281285547284, + "loss": 0.208, + "step": 178160 + }, + { + "epoch": 7.38, + "grad_norm": 1.5390625, + "learning_rate": 0.00042904524400398414, + "loss": 0.2341, + "step": 178170 + }, + { + "epoch": 7.38, + "grad_norm": 0.765625, + "learning_rate": 0.00042903767481555175, + "loss": 0.1804, + "step": 178180 + }, + { + "epoch": 7.38, + "grad_norm": 0.33203125, + "learning_rate": 0.0004290301052901897, + "loss": 0.2114, + "step": 178190 + }, + { + "epoch": 7.38, + "grad_norm": 0.376953125, + "learning_rate": 0.0004290225354279124, + "loss": 0.1852, + "step": 178200 + }, + { + "epoch": 7.38, + "grad_norm": 0.2001953125, + "learning_rate": 0.0004290149652287339, + "loss": 0.2019, + "step": 178210 + }, + { + "epoch": 7.38, + "grad_norm": 0.8671875, + "learning_rate": 0.0004290073946926686, + "loss": 0.1854, + "step": 178220 + }, + { + "epoch": 7.38, + "grad_norm": 0.50390625, + "learning_rate": 0.0004289998238197308, + "loss": 0.2165, + "step": 178230 + }, + { + "epoch": 7.38, + "grad_norm": 1.5078125, + "learning_rate": 0.0004289922526099346, + "loss": 0.2256, + "step": 178240 + }, + { + "epoch": 7.38, + "grad_norm": 0.412109375, + "learning_rate": 0.0004289846810632943, + "loss": 0.1885, + "step": 178250 + }, + { + "epoch": 7.38, + "grad_norm": 0.6796875, + "learning_rate": 0.0004289771091798241, + "loss": 0.2062, + "step": 178260 + }, + { + "epoch": 7.38, + "grad_norm": 0.62890625, + "learning_rate": 0.00042896953695953843, + "loss": 0.1986, + "step": 178270 + }, + { + "epoch": 7.38, + "grad_norm": 3.484375, + "learning_rate": 0.0004289619644024514, + "loss": 0.2573, + "step": 178280 + }, + { + "epoch": 7.38, + "grad_norm": 1.15625, + "learning_rate": 0.0004289543915085773, + "loss": 0.1613, + "step": 178290 + }, + { + "epoch": 7.39, + "grad_norm": 0.73828125, + "learning_rate": 0.00042894681827793026, + "loss": 0.2024, + "step": 178300 + }, + { + "epoch": 7.39, + "grad_norm": 0.5703125, + "learning_rate": 0.0004289392447105247, + "loss": 0.1958, + "step": 178310 + }, + { + "epoch": 7.39, + "grad_norm": 0.33203125, + "learning_rate": 0.0004289316708063748, + "loss": 0.24, + "step": 178320 + }, + { + "epoch": 7.39, + "grad_norm": 0.7421875, + "learning_rate": 0.0004289240965654948, + "loss": 0.1812, + "step": 178330 + }, + { + "epoch": 7.39, + "grad_norm": 0.4453125, + "learning_rate": 0.000428916521987899, + "loss": 0.2238, + "step": 178340 + }, + { + "epoch": 7.39, + "grad_norm": 0.34375, + "learning_rate": 0.0004289089470736016, + "loss": 0.1593, + "step": 178350 + }, + { + "epoch": 7.39, + "grad_norm": 1.015625, + "learning_rate": 0.0004289013718226169, + "loss": 0.1878, + "step": 178360 + }, + { + "epoch": 7.39, + "grad_norm": 1.0703125, + "learning_rate": 0.0004288937962349591, + "loss": 0.2424, + "step": 178370 + }, + { + "epoch": 7.39, + "grad_norm": 1.0390625, + "learning_rate": 0.0004288862203106425, + "loss": 0.2598, + "step": 178380 + }, + { + "epoch": 7.39, + "grad_norm": 0.5546875, + "learning_rate": 0.0004288786440496814, + "loss": 0.2202, + "step": 178390 + }, + { + "epoch": 7.39, + "grad_norm": 0.6171875, + "learning_rate": 0.00042887106745208997, + "loss": 0.2228, + "step": 178400 + }, + { + "epoch": 7.39, + "grad_norm": 1.703125, + "learning_rate": 0.00042886349051788253, + "loss": 0.182, + "step": 178410 + }, + { + "epoch": 7.39, + "grad_norm": 1.09375, + "learning_rate": 0.0004288559132470733, + "loss": 0.1852, + "step": 178420 + }, + { + "epoch": 7.39, + "grad_norm": 0.69140625, + "learning_rate": 0.00042884833563967655, + "loss": 0.2255, + "step": 178430 + }, + { + "epoch": 7.39, + "grad_norm": 1.109375, + "learning_rate": 0.0004288407576957066, + "loss": 0.1987, + "step": 178440 + }, + { + "epoch": 7.39, + "grad_norm": 0.03857421875, + "learning_rate": 0.00042883317941517764, + "loss": 0.1679, + "step": 178450 + }, + { + "epoch": 7.39, + "grad_norm": 1.6640625, + "learning_rate": 0.00042882560079810384, + "loss": 0.2033, + "step": 178460 + }, + { + "epoch": 7.39, + "grad_norm": 2.28125, + "learning_rate": 0.0004288180218444997, + "loss": 0.2227, + "step": 178470 + }, + { + "epoch": 7.39, + "grad_norm": 0.423828125, + "learning_rate": 0.0004288104425543793, + "loss": 0.1868, + "step": 178480 + }, + { + "epoch": 7.39, + "grad_norm": 0.515625, + "learning_rate": 0.00042880286292775697, + "loss": 0.2145, + "step": 178490 + }, + { + "epoch": 7.39, + "grad_norm": 0.8125, + "learning_rate": 0.000428795282964647, + "loss": 0.1913, + "step": 178500 + }, + { + "epoch": 7.39, + "grad_norm": 0.326171875, + "learning_rate": 0.00042878770266506353, + "loss": 0.2139, + "step": 178510 + }, + { + "epoch": 7.39, + "grad_norm": 0.94140625, + "learning_rate": 0.000428780122029021, + "loss": 0.1938, + "step": 178520 + }, + { + "epoch": 7.39, + "grad_norm": 1.203125, + "learning_rate": 0.0004287725410565335, + "loss": 0.1988, + "step": 178530 + }, + { + "epoch": 7.4, + "grad_norm": 0.890625, + "learning_rate": 0.0004287649597476154, + "loss": 0.2136, + "step": 178540 + }, + { + "epoch": 7.4, + "grad_norm": 0.2099609375, + "learning_rate": 0.00042875737810228097, + "loss": 0.229, + "step": 178550 + }, + { + "epoch": 7.4, + "grad_norm": 0.73046875, + "learning_rate": 0.0004287497961205445, + "loss": 0.1621, + "step": 178560 + }, + { + "epoch": 7.4, + "grad_norm": 0.640625, + "learning_rate": 0.0004287422138024202, + "loss": 0.1579, + "step": 178570 + }, + { + "epoch": 7.4, + "grad_norm": 0.65625, + "learning_rate": 0.00042873463114792234, + "loss": 0.1755, + "step": 178580 + }, + { + "epoch": 7.4, + "grad_norm": 0.359375, + "learning_rate": 0.0004287270481570652, + "loss": 0.1977, + "step": 178590 + }, + { + "epoch": 7.4, + "grad_norm": 1.3515625, + "learning_rate": 0.000428719464829863, + "loss": 0.2277, + "step": 178600 + }, + { + "epoch": 7.4, + "grad_norm": 0.5546875, + "learning_rate": 0.0004287118811663302, + "loss": 0.1857, + "step": 178610 + }, + { + "epoch": 7.4, + "grad_norm": 0.388671875, + "learning_rate": 0.00042870429716648084, + "loss": 0.229, + "step": 178620 + }, + { + "epoch": 7.4, + "grad_norm": 0.62890625, + "learning_rate": 0.00042869671283032927, + "loss": 0.1868, + "step": 178630 + }, + { + "epoch": 7.4, + "grad_norm": 2.359375, + "learning_rate": 0.00042868912815788985, + "loss": 0.2226, + "step": 178640 + }, + { + "epoch": 7.4, + "grad_norm": 0.5859375, + "learning_rate": 0.00042868154314917677, + "loss": 0.1904, + "step": 178650 + }, + { + "epoch": 7.4, + "grad_norm": 1.25, + "learning_rate": 0.0004286739578042043, + "loss": 0.2323, + "step": 178660 + }, + { + "epoch": 7.4, + "grad_norm": 1.2890625, + "learning_rate": 0.00042866637212298674, + "loss": 0.2011, + "step": 178670 + }, + { + "epoch": 7.4, + "grad_norm": 0.65625, + "learning_rate": 0.0004286587861055384, + "loss": 0.2225, + "step": 178680 + }, + { + "epoch": 7.4, + "grad_norm": 2.515625, + "learning_rate": 0.0004286511997518735, + "loss": 0.2089, + "step": 178690 + }, + { + "epoch": 7.4, + "grad_norm": 0.2578125, + "learning_rate": 0.00042864361306200636, + "loss": 0.1918, + "step": 178700 + }, + { + "epoch": 7.4, + "grad_norm": 0.2294921875, + "learning_rate": 0.0004286360260359512, + "loss": 0.2416, + "step": 178710 + }, + { + "epoch": 7.4, + "grad_norm": 0.84765625, + "learning_rate": 0.00042862843867372236, + "loss": 0.2364, + "step": 178720 + }, + { + "epoch": 7.4, + "grad_norm": 1.421875, + "learning_rate": 0.000428620850975334, + "loss": 0.2069, + "step": 178730 + }, + { + "epoch": 7.4, + "grad_norm": 1.46875, + "learning_rate": 0.0004286132629408006, + "loss": 0.165, + "step": 178740 + }, + { + "epoch": 7.4, + "grad_norm": 1.109375, + "learning_rate": 0.0004286056745701363, + "loss": 0.1845, + "step": 178750 + }, + { + "epoch": 7.4, + "grad_norm": 0.43359375, + "learning_rate": 0.00042859808586335536, + "loss": 0.2462, + "step": 178760 + }, + { + "epoch": 7.4, + "grad_norm": 0.890625, + "learning_rate": 0.00042859049682047215, + "loss": 0.217, + "step": 178770 + }, + { + "epoch": 7.41, + "grad_norm": 0.54296875, + "learning_rate": 0.00042858290744150084, + "loss": 0.1748, + "step": 178780 + }, + { + "epoch": 7.41, + "grad_norm": 0.92578125, + "learning_rate": 0.00042857531772645587, + "loss": 0.215, + "step": 178790 + }, + { + "epoch": 7.41, + "grad_norm": 0.734375, + "learning_rate": 0.00042856772767535143, + "loss": 0.2381, + "step": 178800 + }, + { + "epoch": 7.41, + "grad_norm": 0.71875, + "learning_rate": 0.0004285601372882018, + "loss": 0.1777, + "step": 178810 + }, + { + "epoch": 7.41, + "grad_norm": 0.50390625, + "learning_rate": 0.0004285525465650212, + "loss": 0.2031, + "step": 178820 + }, + { + "epoch": 7.41, + "grad_norm": 0.6640625, + "learning_rate": 0.0004285449555058241, + "loss": 0.1931, + "step": 178830 + }, + { + "epoch": 7.41, + "grad_norm": 1.0078125, + "learning_rate": 0.0004285373641106246, + "loss": 0.2018, + "step": 178840 + }, + { + "epoch": 7.41, + "grad_norm": 0.953125, + "learning_rate": 0.000428529772379437, + "loss": 0.1635, + "step": 178850 + }, + { + "epoch": 7.41, + "grad_norm": 0.212890625, + "learning_rate": 0.0004285221803122758, + "loss": 0.2354, + "step": 178860 + }, + { + "epoch": 7.41, + "grad_norm": 0.96875, + "learning_rate": 0.00042851458790915507, + "loss": 0.2193, + "step": 178870 + }, + { + "epoch": 7.41, + "grad_norm": 0.71875, + "learning_rate": 0.0004285069951700892, + "loss": 0.2144, + "step": 178880 + }, + { + "epoch": 7.41, + "grad_norm": 0.765625, + "learning_rate": 0.00042849940209509235, + "loss": 0.2257, + "step": 178890 + }, + { + "epoch": 7.41, + "grad_norm": 1.109375, + "learning_rate": 0.00042849180868417895, + "loss": 0.1937, + "step": 178900 + }, + { + "epoch": 7.41, + "grad_norm": 0.36328125, + "learning_rate": 0.0004284842149373633, + "loss": 0.213, + "step": 178910 + }, + { + "epoch": 7.41, + "grad_norm": 0.8828125, + "learning_rate": 0.0004284766208546596, + "loss": 0.1813, + "step": 178920 + }, + { + "epoch": 7.41, + "grad_norm": 0.7109375, + "learning_rate": 0.0004284690264360821, + "loss": 0.2121, + "step": 178930 + }, + { + "epoch": 7.41, + "grad_norm": 0.63671875, + "learning_rate": 0.00042846143168164517, + "loss": 0.2563, + "step": 178940 + }, + { + "epoch": 7.41, + "grad_norm": 0.462890625, + "learning_rate": 0.0004284538365913632, + "loss": 0.1784, + "step": 178950 + }, + { + "epoch": 7.41, + "grad_norm": 0.255859375, + "learning_rate": 0.00042844624116525034, + "loss": 0.2137, + "step": 178960 + }, + { + "epoch": 7.41, + "grad_norm": 0.80078125, + "learning_rate": 0.0004284386454033209, + "loss": 0.1814, + "step": 178970 + }, + { + "epoch": 7.41, + "grad_norm": 0.7734375, + "learning_rate": 0.0004284310493055892, + "loss": 0.1491, + "step": 178980 + }, + { + "epoch": 7.41, + "grad_norm": 0.69140625, + "learning_rate": 0.0004284234528720696, + "loss": 0.2358, + "step": 178990 + }, + { + "epoch": 7.41, + "grad_norm": 0.3828125, + "learning_rate": 0.0004284158561027762, + "loss": 0.1951, + "step": 179000 + }, + { + "epoch": 7.41, + "grad_norm": 0.859375, + "learning_rate": 0.0004284082589977235, + "loss": 0.2331, + "step": 179010 + }, + { + "epoch": 7.41, + "grad_norm": 0.8046875, + "learning_rate": 0.00042840066155692573, + "loss": 0.1747, + "step": 179020 + }, + { + "epoch": 7.42, + "grad_norm": 0.310546875, + "learning_rate": 0.0004283930637803972, + "loss": 0.1842, + "step": 179030 + }, + { + "epoch": 7.42, + "grad_norm": 0.462890625, + "learning_rate": 0.0004283854656681522, + "loss": 0.2319, + "step": 179040 + }, + { + "epoch": 7.42, + "grad_norm": 0.392578125, + "learning_rate": 0.00042837786722020496, + "loss": 0.2297, + "step": 179050 + }, + { + "epoch": 7.42, + "grad_norm": 0.396484375, + "learning_rate": 0.00042837026843656987, + "loss": 0.1898, + "step": 179060 + }, + { + "epoch": 7.42, + "grad_norm": 1.28125, + "learning_rate": 0.0004283626693172612, + "loss": 0.1658, + "step": 179070 + }, + { + "epoch": 7.42, + "grad_norm": 0.197265625, + "learning_rate": 0.0004283550698622932, + "loss": 0.1734, + "step": 179080 + }, + { + "epoch": 7.42, + "grad_norm": 0.75390625, + "learning_rate": 0.00042834747007168024, + "loss": 0.249, + "step": 179090 + }, + { + "epoch": 7.42, + "grad_norm": 1.140625, + "learning_rate": 0.0004283398699454366, + "loss": 0.2092, + "step": 179100 + }, + { + "epoch": 7.42, + "grad_norm": 0.46875, + "learning_rate": 0.00042833226948357664, + "loss": 0.2277, + "step": 179110 + }, + { + "epoch": 7.42, + "grad_norm": 1.7734375, + "learning_rate": 0.0004283246686861145, + "loss": 0.1991, + "step": 179120 + }, + { + "epoch": 7.42, + "grad_norm": 0.451171875, + "learning_rate": 0.00042831706755306465, + "loss": 0.1653, + "step": 179130 + }, + { + "epoch": 7.42, + "grad_norm": 0.451171875, + "learning_rate": 0.00042830946608444137, + "loss": 0.1899, + "step": 179140 + }, + { + "epoch": 7.42, + "grad_norm": 0.8203125, + "learning_rate": 0.0004283018642802589, + "loss": 0.2087, + "step": 179150 + }, + { + "epoch": 7.42, + "grad_norm": 0.7421875, + "learning_rate": 0.0004282942621405316, + "loss": 0.1937, + "step": 179160 + }, + { + "epoch": 7.42, + "grad_norm": 0.7578125, + "learning_rate": 0.0004282866596652737, + "loss": 0.1394, + "step": 179170 + }, + { + "epoch": 7.42, + "grad_norm": 0.81640625, + "learning_rate": 0.00042827905685449957, + "loss": 0.2416, + "step": 179180 + }, + { + "epoch": 7.42, + "grad_norm": 0.4453125, + "learning_rate": 0.0004282714537082236, + "loss": 0.168, + "step": 179190 + }, + { + "epoch": 7.42, + "grad_norm": 0.255859375, + "learning_rate": 0.00042826385022645984, + "loss": 0.1312, + "step": 179200 + }, + { + "epoch": 7.42, + "grad_norm": 0.92578125, + "learning_rate": 0.0004282562464092229, + "loss": 0.182, + "step": 179210 + }, + { + "epoch": 7.42, + "grad_norm": 0.57421875, + "learning_rate": 0.00042824864225652693, + "loss": 0.239, + "step": 179220 + }, + { + "epoch": 7.42, + "grad_norm": 0.296875, + "learning_rate": 0.00042824103776838623, + "loss": 0.2234, + "step": 179230 + }, + { + "epoch": 7.42, + "grad_norm": 1.3359375, + "learning_rate": 0.00042823343294481523, + "loss": 0.2063, + "step": 179240 + }, + { + "epoch": 7.42, + "grad_norm": 0.51953125, + "learning_rate": 0.000428225827785828, + "loss": 0.2952, + "step": 179250 + }, + { + "epoch": 7.42, + "grad_norm": 0.97265625, + "learning_rate": 0.0004282182222914391, + "loss": 0.1795, + "step": 179260 + }, + { + "epoch": 7.43, + "grad_norm": 0.9921875, + "learning_rate": 0.00042821061646166274, + "loss": 0.1748, + "step": 179270 + }, + { + "epoch": 7.43, + "grad_norm": 0.60546875, + "learning_rate": 0.0004282030102965133, + "loss": 0.19, + "step": 179280 + }, + { + "epoch": 7.43, + "grad_norm": 1.640625, + "learning_rate": 0.00042819540379600496, + "loss": 0.2194, + "step": 179290 + }, + { + "epoch": 7.43, + "grad_norm": 0.353515625, + "learning_rate": 0.0004281877969601522, + "loss": 0.2827, + "step": 179300 + }, + { + "epoch": 7.43, + "grad_norm": 0.439453125, + "learning_rate": 0.00042818018978896916, + "loss": 0.2475, + "step": 179310 + }, + { + "epoch": 7.43, + "grad_norm": 0.91015625, + "learning_rate": 0.0004281725822824702, + "loss": 0.2039, + "step": 179320 + }, + { + "epoch": 7.43, + "grad_norm": 0.99609375, + "learning_rate": 0.00042816497444066984, + "loss": 0.1586, + "step": 179330 + }, + { + "epoch": 7.43, + "grad_norm": 0.50390625, + "learning_rate": 0.00042815736626358215, + "loss": 0.2418, + "step": 179340 + }, + { + "epoch": 7.43, + "grad_norm": 2.484375, + "learning_rate": 0.00042814975775122153, + "loss": 0.1776, + "step": 179350 + }, + { + "epoch": 7.43, + "grad_norm": 0.796875, + "learning_rate": 0.00042814214890360235, + "loss": 0.2032, + "step": 179360 + }, + { + "epoch": 7.43, + "grad_norm": 0.6953125, + "learning_rate": 0.00042813453972073885, + "loss": 0.1675, + "step": 179370 + }, + { + "epoch": 7.43, + "grad_norm": 0.2333984375, + "learning_rate": 0.00042812693020264535, + "loss": 0.2203, + "step": 179380 + }, + { + "epoch": 7.43, + "grad_norm": 1.421875, + "learning_rate": 0.0004281193203493363, + "loss": 0.218, + "step": 179390 + }, + { + "epoch": 7.43, + "grad_norm": 0.66015625, + "learning_rate": 0.0004281117101608258, + "loss": 0.1903, + "step": 179400 + }, + { + "epoch": 7.43, + "grad_norm": 0.3984375, + "learning_rate": 0.00042810409963712836, + "loss": 0.1536, + "step": 179410 + }, + { + "epoch": 7.43, + "grad_norm": 0.44140625, + "learning_rate": 0.00042809648877825825, + "loss": 0.156, + "step": 179420 + }, + { + "epoch": 7.43, + "grad_norm": 0.322265625, + "learning_rate": 0.00042808887758422976, + "loss": 0.2174, + "step": 179430 + }, + { + "epoch": 7.43, + "grad_norm": 0.625, + "learning_rate": 0.00042808126605505724, + "loss": 0.2145, + "step": 179440 + }, + { + "epoch": 7.43, + "grad_norm": 0.0, + "learning_rate": 0.00042807365419075507, + "loss": 0.1951, + "step": 179450 + }, + { + "epoch": 7.43, + "grad_norm": 0.5, + "learning_rate": 0.00042806604199133744, + "loss": 0.2128, + "step": 179460 + }, + { + "epoch": 7.43, + "grad_norm": 0.84375, + "learning_rate": 0.0004280584294568187, + "loss": 0.2261, + "step": 179470 + }, + { + "epoch": 7.43, + "grad_norm": 0.34375, + "learning_rate": 0.00042805081658721334, + "loss": 0.2102, + "step": 179480 + }, + { + "epoch": 7.43, + "grad_norm": 0.625, + "learning_rate": 0.0004280432033825355, + "loss": 0.1717, + "step": 179490 + }, + { + "epoch": 7.43, + "grad_norm": 1.1484375, + "learning_rate": 0.0004280355898427996, + "loss": 0.1872, + "step": 179500 + }, + { + "epoch": 7.44, + "grad_norm": 0.80078125, + "learning_rate": 0.00042802797596802, + "loss": 0.2275, + "step": 179510 + }, + { + "epoch": 7.44, + "grad_norm": 0.54296875, + "learning_rate": 0.00042802036175821083, + "loss": 0.1673, + "step": 179520 + }, + { + "epoch": 7.44, + "grad_norm": 1.109375, + "learning_rate": 0.00042801274721338667, + "loss": 0.1588, + "step": 179530 + }, + { + "epoch": 7.44, + "grad_norm": 0.23046875, + "learning_rate": 0.00042800513233356177, + "loss": 0.1477, + "step": 179540 + }, + { + "epoch": 7.44, + "grad_norm": 0.578125, + "learning_rate": 0.00042799751711875033, + "loss": 0.2399, + "step": 179550 + }, + { + "epoch": 7.44, + "grad_norm": 0.98046875, + "learning_rate": 0.0004279899015689669, + "loss": 0.2054, + "step": 179560 + }, + { + "epoch": 7.44, + "grad_norm": 0.5078125, + "learning_rate": 0.0004279822856842256, + "loss": 0.161, + "step": 179570 + }, + { + "epoch": 7.44, + "grad_norm": 0.9375, + "learning_rate": 0.0004279746694645409, + "loss": 0.2312, + "step": 179580 + }, + { + "epoch": 7.44, + "grad_norm": 0.625, + "learning_rate": 0.0004279670529099271, + "loss": 0.2123, + "step": 179590 + }, + { + "epoch": 7.44, + "grad_norm": 0.7890625, + "learning_rate": 0.00042795943602039855, + "loss": 0.1828, + "step": 179600 + }, + { + "epoch": 7.44, + "grad_norm": 0.453125, + "learning_rate": 0.00042795181879596943, + "loss": 0.2394, + "step": 179610 + }, + { + "epoch": 7.44, + "grad_norm": 0.84765625, + "learning_rate": 0.00042794420123665433, + "loss": 0.1313, + "step": 179620 + }, + { + "epoch": 7.44, + "grad_norm": 0.57421875, + "learning_rate": 0.00042793658334246745, + "loss": 0.1931, + "step": 179630 + }, + { + "epoch": 7.44, + "grad_norm": 0.337890625, + "learning_rate": 0.0004279289651134231, + "loss": 0.2081, + "step": 179640 + }, + { + "epoch": 7.44, + "grad_norm": 0.6875, + "learning_rate": 0.0004279213465495357, + "loss": 0.2302, + "step": 179650 + }, + { + "epoch": 7.44, + "grad_norm": 0.71875, + "learning_rate": 0.00042791372765081946, + "loss": 0.2283, + "step": 179660 + }, + { + "epoch": 7.44, + "grad_norm": 0.87109375, + "learning_rate": 0.0004279061084172889, + "loss": 0.237, + "step": 179670 + }, + { + "epoch": 7.44, + "grad_norm": 0.94921875, + "learning_rate": 0.00042789848884895824, + "loss": 0.1709, + "step": 179680 + }, + { + "epoch": 7.44, + "grad_norm": 1.625, + "learning_rate": 0.0004278908689458417, + "loss": 0.1892, + "step": 179690 + }, + { + "epoch": 7.44, + "grad_norm": 1.1640625, + "learning_rate": 0.0004278832487079539, + "loss": 0.2078, + "step": 179700 + }, + { + "epoch": 7.44, + "grad_norm": 0.9609375, + "learning_rate": 0.000427875628135309, + "loss": 0.2058, + "step": 179710 + }, + { + "epoch": 7.44, + "grad_norm": 0.52734375, + "learning_rate": 0.0004278680072279213, + "loss": 0.1875, + "step": 179720 + }, + { + "epoch": 7.44, + "grad_norm": 0.56640625, + "learning_rate": 0.0004278603859858053, + "loss": 0.2008, + "step": 179730 + }, + { + "epoch": 7.44, + "grad_norm": 0.53515625, + "learning_rate": 0.00042785276440897524, + "loss": 0.2229, + "step": 179740 + }, + { + "epoch": 7.45, + "grad_norm": 0.70703125, + "learning_rate": 0.0004278451424974455, + "loss": 0.2199, + "step": 179750 + }, + { + "epoch": 7.45, + "grad_norm": 0.47265625, + "learning_rate": 0.00042783752025123036, + "loss": 0.2318, + "step": 179760 + }, + { + "epoch": 7.45, + "grad_norm": 0.8671875, + "learning_rate": 0.0004278298976703443, + "loss": 0.2341, + "step": 179770 + }, + { + "epoch": 7.45, + "grad_norm": 0.384765625, + "learning_rate": 0.0004278222747548015, + "loss": 0.2134, + "step": 179780 + }, + { + "epoch": 7.45, + "grad_norm": 0.63671875, + "learning_rate": 0.0004278146515046164, + "loss": 0.2245, + "step": 179790 + }, + { + "epoch": 7.45, + "grad_norm": 0.953125, + "learning_rate": 0.0004278070279198033, + "loss": 0.1928, + "step": 179800 + }, + { + "epoch": 7.45, + "grad_norm": 1.0078125, + "learning_rate": 0.00042779940400037664, + "loss": 0.1802, + "step": 179810 + }, + { + "epoch": 7.45, + "grad_norm": 0.369140625, + "learning_rate": 0.0004277917797463506, + "loss": 0.1814, + "step": 179820 + }, + { + "epoch": 7.45, + "grad_norm": 0.8984375, + "learning_rate": 0.00042778415515773975, + "loss": 0.2054, + "step": 179830 + }, + { + "epoch": 7.45, + "grad_norm": 1.0703125, + "learning_rate": 0.0004277765302345583, + "loss": 0.2247, + "step": 179840 + }, + { + "epoch": 7.45, + "grad_norm": 0.435546875, + "learning_rate": 0.0004277689049768205, + "loss": 0.2008, + "step": 179850 + }, + { + "epoch": 7.45, + "grad_norm": 0.85546875, + "learning_rate": 0.00042776127938454103, + "loss": 0.2002, + "step": 179860 + }, + { + "epoch": 7.45, + "grad_norm": 0.77734375, + "learning_rate": 0.0004277536534577339, + "loss": 0.1624, + "step": 179870 + }, + { + "epoch": 7.45, + "grad_norm": 1.4921875, + "learning_rate": 0.00042774602719641353, + "loss": 0.1839, + "step": 179880 + }, + { + "epoch": 7.45, + "grad_norm": 1.1328125, + "learning_rate": 0.00042773840060059446, + "loss": 0.1624, + "step": 179890 + }, + { + "epoch": 7.45, + "grad_norm": 0.97265625, + "learning_rate": 0.0004277307736702908, + "loss": 0.2493, + "step": 179900 + }, + { + "epoch": 7.45, + "grad_norm": 0.796875, + "learning_rate": 0.000427723146405517, + "loss": 0.1809, + "step": 179910 + }, + { + "epoch": 7.45, + "grad_norm": 0.6484375, + "learning_rate": 0.00042771551880628754, + "loss": 0.223, + "step": 179920 + }, + { + "epoch": 7.45, + "grad_norm": 1.078125, + "learning_rate": 0.00042770789087261666, + "loss": 0.2232, + "step": 179930 + }, + { + "epoch": 7.45, + "grad_norm": 1.0390625, + "learning_rate": 0.0004277002626045187, + "loss": 0.2022, + "step": 179940 + }, + { + "epoch": 7.45, + "grad_norm": 0.4140625, + "learning_rate": 0.000427692634002008, + "loss": 0.209, + "step": 179950 + }, + { + "epoch": 7.45, + "grad_norm": 0.5625, + "learning_rate": 0.0004276850050650989, + "loss": 0.2397, + "step": 179960 + }, + { + "epoch": 7.45, + "grad_norm": 0.412109375, + "learning_rate": 0.00042767737579380596, + "loss": 0.2339, + "step": 179970 + }, + { + "epoch": 7.45, + "grad_norm": 1.21875, + "learning_rate": 0.00042766974618814327, + "loss": 0.2032, + "step": 179980 + }, + { + "epoch": 7.46, + "grad_norm": 0.80078125, + "learning_rate": 0.00042766211624812537, + "loss": 0.1439, + "step": 179990 + }, + { + "epoch": 7.46, + "grad_norm": 0.703125, + "learning_rate": 0.0004276544859737665, + "loss": 0.2486, + "step": 180000 + }, + { + "epoch": 7.46, + "grad_norm": 0.37890625, + "learning_rate": 0.00042764685536508107, + "loss": 0.2027, + "step": 180010 + }, + { + "epoch": 7.46, + "grad_norm": 0.6328125, + "learning_rate": 0.00042763922442208346, + "loss": 0.2054, + "step": 180020 + }, + { + "epoch": 7.46, + "grad_norm": 0.373046875, + "learning_rate": 0.00042763159314478805, + "loss": 0.1994, + "step": 180030 + }, + { + "epoch": 7.46, + "grad_norm": 0.470703125, + "learning_rate": 0.00042762396153320914, + "loss": 0.1786, + "step": 180040 + }, + { + "epoch": 7.46, + "grad_norm": 0.44921875, + "learning_rate": 0.00042761632958736106, + "loss": 0.2061, + "step": 180050 + }, + { + "epoch": 7.46, + "grad_norm": 1.1015625, + "learning_rate": 0.0004276086973072583, + "loss": 0.1953, + "step": 180060 + }, + { + "epoch": 7.46, + "grad_norm": 0.84765625, + "learning_rate": 0.00042760106469291514, + "loss": 0.1854, + "step": 180070 + }, + { + "epoch": 7.46, + "grad_norm": 0.59765625, + "learning_rate": 0.00042759343174434593, + "loss": 0.1808, + "step": 180080 + }, + { + "epoch": 7.46, + "grad_norm": 1.0, + "learning_rate": 0.0004275857984615651, + "loss": 0.2264, + "step": 180090 + }, + { + "epoch": 7.46, + "grad_norm": 0.640625, + "learning_rate": 0.00042757816484458695, + "loss": 0.1904, + "step": 180100 + }, + { + "epoch": 7.46, + "grad_norm": 1.8203125, + "learning_rate": 0.0004275705308934259, + "loss": 0.2077, + "step": 180110 + }, + { + "epoch": 7.46, + "grad_norm": 0.7265625, + "learning_rate": 0.00042756289660809624, + "loss": 0.2125, + "step": 180120 + }, + { + "epoch": 7.46, + "grad_norm": 0.765625, + "learning_rate": 0.00042755526198861237, + "loss": 0.1918, + "step": 180130 + }, + { + "epoch": 7.46, + "grad_norm": 0.53515625, + "learning_rate": 0.00042754762703498873, + "loss": 0.2383, + "step": 180140 + }, + { + "epoch": 7.46, + "grad_norm": 0.65234375, + "learning_rate": 0.0004275399917472396, + "loss": 0.1914, + "step": 180150 + }, + { + "epoch": 7.46, + "grad_norm": 1.234375, + "learning_rate": 0.0004275323561253794, + "loss": 0.2018, + "step": 180160 + }, + { + "epoch": 7.46, + "grad_norm": 0.56640625, + "learning_rate": 0.00042752472016942245, + "loss": 0.1992, + "step": 180170 + }, + { + "epoch": 7.46, + "grad_norm": 0.7265625, + "learning_rate": 0.00042751708387938313, + "loss": 0.2595, + "step": 180180 + }, + { + "epoch": 7.46, + "grad_norm": 0.78515625, + "learning_rate": 0.00042750944725527585, + "loss": 0.1495, + "step": 180190 + }, + { + "epoch": 7.46, + "grad_norm": 0.65625, + "learning_rate": 0.000427501810297115, + "loss": 0.2079, + "step": 180200 + }, + { + "epoch": 7.46, + "grad_norm": 0.51171875, + "learning_rate": 0.00042749417300491486, + "loss": 0.1135, + "step": 180210 + }, + { + "epoch": 7.46, + "grad_norm": 0.306640625, + "learning_rate": 0.0004274865353786899, + "loss": 0.1992, + "step": 180220 + }, + { + "epoch": 7.47, + "grad_norm": 0.462890625, + "learning_rate": 0.0004274788974184544, + "loss": 0.2253, + "step": 180230 + }, + { + "epoch": 7.47, + "grad_norm": 2.09375, + "learning_rate": 0.0004274712591242228, + "loss": 0.1864, + "step": 180240 + }, + { + "epoch": 7.47, + "grad_norm": 0.35546875, + "learning_rate": 0.00042746362049600944, + "loss": 0.1677, + "step": 180250 + }, + { + "epoch": 7.47, + "grad_norm": 1.3515625, + "learning_rate": 0.0004274559815338287, + "loss": 0.2097, + "step": 180260 + }, + { + "epoch": 7.47, + "grad_norm": 0.7421875, + "learning_rate": 0.00042744834223769503, + "loss": 0.2324, + "step": 180270 + }, + { + "epoch": 7.47, + "grad_norm": 1.2734375, + "learning_rate": 0.0004274407026076227, + "loss": 0.2471, + "step": 180280 + }, + { + "epoch": 7.47, + "grad_norm": 1.0234375, + "learning_rate": 0.00042743306264362614, + "loss": 0.2164, + "step": 180290 + }, + { + "epoch": 7.47, + "grad_norm": 0.86328125, + "learning_rate": 0.0004274254223457197, + "loss": 0.2082, + "step": 180300 + }, + { + "epoch": 7.47, + "grad_norm": 0.87890625, + "learning_rate": 0.00042741778171391775, + "loss": 0.2405, + "step": 180310 + }, + { + "epoch": 7.47, + "grad_norm": 0.23828125, + "learning_rate": 0.0004274101407482348, + "loss": 0.2347, + "step": 180320 + }, + { + "epoch": 7.47, + "grad_norm": 0.0, + "learning_rate": 0.00042740249944868506, + "loss": 0.1925, + "step": 180330 + }, + { + "epoch": 7.47, + "grad_norm": 2.3125, + "learning_rate": 0.000427394857815283, + "loss": 0.1835, + "step": 180340 + }, + { + "epoch": 7.47, + "grad_norm": 0.71875, + "learning_rate": 0.00042738721584804283, + "loss": 0.2195, + "step": 180350 + }, + { + "epoch": 7.47, + "grad_norm": 0.65234375, + "learning_rate": 0.0004273795735469792, + "loss": 0.1385, + "step": 180360 + }, + { + "epoch": 7.47, + "grad_norm": 0.86328125, + "learning_rate": 0.00042737193091210636, + "loss": 0.2145, + "step": 180370 + }, + { + "epoch": 7.47, + "grad_norm": 0.6875, + "learning_rate": 0.00042736428794343874, + "loss": 0.2323, + "step": 180380 + }, + { + "epoch": 7.47, + "grad_norm": 0.8515625, + "learning_rate": 0.00042735664464099065, + "loss": 0.2156, + "step": 180390 + }, + { + "epoch": 7.47, + "grad_norm": 0.416015625, + "learning_rate": 0.0004273490010047765, + "loss": 0.1908, + "step": 180400 + }, + { + "epoch": 7.47, + "grad_norm": 0.90625, + "learning_rate": 0.0004273413570348107, + "loss": 0.1718, + "step": 180410 + }, + { + "epoch": 7.47, + "grad_norm": 0.58203125, + "learning_rate": 0.00042733371273110754, + "loss": 0.2006, + "step": 180420 + }, + { + "epoch": 7.47, + "grad_norm": 1.0859375, + "learning_rate": 0.0004273260680936816, + "loss": 0.2659, + "step": 180430 + }, + { + "epoch": 7.47, + "grad_norm": 1.046875, + "learning_rate": 0.0004273184231225471, + "loss": 0.1893, + "step": 180440 + }, + { + "epoch": 7.47, + "grad_norm": 0.69921875, + "learning_rate": 0.0004273107778177184, + "loss": 0.2313, + "step": 180450 + }, + { + "epoch": 7.47, + "grad_norm": 0.79296875, + "learning_rate": 0.00042730313217921004, + "loss": 0.1889, + "step": 180460 + }, + { + "epoch": 7.48, + "grad_norm": 1.0078125, + "learning_rate": 0.00042729548620703634, + "loss": 0.2201, + "step": 180470 + }, + { + "epoch": 7.48, + "grad_norm": 1.1953125, + "learning_rate": 0.00042728783990121167, + "loss": 0.1999, + "step": 180480 + }, + { + "epoch": 7.48, + "grad_norm": 0.46875, + "learning_rate": 0.0004272801932617504, + "loss": 0.2361, + "step": 180490 + }, + { + "epoch": 7.48, + "grad_norm": 1.453125, + "learning_rate": 0.00042727254628866694, + "loss": 0.2144, + "step": 180500 + }, + { + "epoch": 7.48, + "grad_norm": 0.9609375, + "learning_rate": 0.0004272648989819758, + "loss": 0.2566, + "step": 180510 + }, + { + "epoch": 7.48, + "grad_norm": 0.359375, + "learning_rate": 0.0004272572513416911, + "loss": 0.1873, + "step": 180520 + }, + { + "epoch": 7.48, + "grad_norm": 0.3359375, + "learning_rate": 0.0004272496033678276, + "loss": 0.1687, + "step": 180530 + }, + { + "epoch": 7.48, + "grad_norm": 0.263671875, + "learning_rate": 0.00042724195506039933, + "loss": 0.176, + "step": 180540 + }, + { + "epoch": 7.48, + "grad_norm": 0.6171875, + "learning_rate": 0.0004272343064194209, + "loss": 0.2051, + "step": 180550 + }, + { + "epoch": 7.48, + "grad_norm": 0.5546875, + "learning_rate": 0.0004272266574449066, + "loss": 0.1648, + "step": 180560 + }, + { + "epoch": 7.48, + "grad_norm": 0.578125, + "learning_rate": 0.00042721900813687085, + "loss": 0.2354, + "step": 180570 + }, + { + "epoch": 7.48, + "grad_norm": 2.765625, + "learning_rate": 0.00042721135849532824, + "loss": 0.1989, + "step": 180580 + }, + { + "epoch": 7.48, + "grad_norm": 0.8359375, + "learning_rate": 0.00042720370852029275, + "loss": 0.1637, + "step": 180590 + }, + { + "epoch": 7.48, + "grad_norm": 0.796875, + "learning_rate": 0.0004271960582117792, + "loss": 0.1766, + "step": 180600 + }, + { + "epoch": 7.48, + "grad_norm": 0.703125, + "learning_rate": 0.0004271884075698017, + "loss": 0.2216, + "step": 180610 + }, + { + "epoch": 7.48, + "grad_norm": 0.94921875, + "learning_rate": 0.0004271807565943748, + "loss": 0.1778, + "step": 180620 + }, + { + "epoch": 7.48, + "grad_norm": 0.220703125, + "learning_rate": 0.0004271731052855128, + "loss": 0.2182, + "step": 180630 + }, + { + "epoch": 7.48, + "grad_norm": 0.3984375, + "learning_rate": 0.0004271654536432302, + "loss": 0.1751, + "step": 180640 + }, + { + "epoch": 7.48, + "grad_norm": 0.439453125, + "learning_rate": 0.0004271578016675414, + "loss": 0.1874, + "step": 180650 + }, + { + "epoch": 7.48, + "grad_norm": 0.79296875, + "learning_rate": 0.00042715014935846066, + "loss": 0.2081, + "step": 180660 + }, + { + "epoch": 7.48, + "grad_norm": 0.447265625, + "learning_rate": 0.00042714249671600246, + "loss": 0.2401, + "step": 180670 + }, + { + "epoch": 7.48, + "grad_norm": 0.796875, + "learning_rate": 0.0004271348437401813, + "loss": 0.2506, + "step": 180680 + }, + { + "epoch": 7.48, + "grad_norm": 0.5703125, + "learning_rate": 0.0004271271904310114, + "loss": 0.2202, + "step": 180690 + }, + { + "epoch": 7.48, + "grad_norm": 0.5, + "learning_rate": 0.0004271195367885073, + "loss": 0.2359, + "step": 180700 + }, + { + "epoch": 7.48, + "grad_norm": 1.015625, + "learning_rate": 0.0004271118828126833, + "loss": 0.2286, + "step": 180710 + }, + { + "epoch": 7.49, + "grad_norm": 0.62109375, + "learning_rate": 0.00042710422850355395, + "loss": 0.1559, + "step": 180720 + }, + { + "epoch": 7.49, + "grad_norm": 1.921875, + "learning_rate": 0.00042709657386113354, + "loss": 0.1744, + "step": 180730 + }, + { + "epoch": 7.49, + "grad_norm": 0.59375, + "learning_rate": 0.0004270889188854365, + "loss": 0.2212, + "step": 180740 + }, + { + "epoch": 7.49, + "grad_norm": 1.0703125, + "learning_rate": 0.00042708126357647727, + "loss": 0.2251, + "step": 180750 + }, + { + "epoch": 7.49, + "grad_norm": 0.84765625, + "learning_rate": 0.0004270736079342702, + "loss": 0.2239, + "step": 180760 + }, + { + "epoch": 7.49, + "grad_norm": 1.0859375, + "learning_rate": 0.0004270659519588297, + "loss": 0.1874, + "step": 180770 + }, + { + "epoch": 7.49, + "grad_norm": 0.6796875, + "learning_rate": 0.00042705829565017016, + "loss": 0.2028, + "step": 180780 + }, + { + "epoch": 7.49, + "grad_norm": 0.921875, + "learning_rate": 0.0004270506390083061, + "loss": 0.1876, + "step": 180790 + }, + { + "epoch": 7.49, + "grad_norm": 0.65234375, + "learning_rate": 0.0004270429820332518, + "loss": 0.2404, + "step": 180800 + }, + { + "epoch": 7.49, + "grad_norm": 0.51953125, + "learning_rate": 0.00042703532472502175, + "loss": 0.2067, + "step": 180810 + }, + { + "epoch": 7.49, + "grad_norm": 0.74609375, + "learning_rate": 0.0004270276670836304, + "loss": 0.207, + "step": 180820 + }, + { + "epoch": 7.49, + "grad_norm": 0.384765625, + "learning_rate": 0.000427020009109092, + "loss": 0.1983, + "step": 180830 + }, + { + "epoch": 7.49, + "grad_norm": 1.125, + "learning_rate": 0.0004270123508014211, + "loss": 0.1807, + "step": 180840 + }, + { + "epoch": 7.49, + "grad_norm": 0.73046875, + "learning_rate": 0.00042700469216063207, + "loss": 0.199, + "step": 180850 + }, + { + "epoch": 7.49, + "grad_norm": 0.78125, + "learning_rate": 0.00042699703318673936, + "loss": 0.1777, + "step": 180860 + }, + { + "epoch": 7.49, + "grad_norm": 0.97265625, + "learning_rate": 0.00042698937387975735, + "loss": 0.2405, + "step": 180870 + }, + { + "epoch": 7.49, + "grad_norm": 1.0234375, + "learning_rate": 0.00042698171423970037, + "loss": 0.1932, + "step": 180880 + }, + { + "epoch": 7.49, + "grad_norm": 1.2734375, + "learning_rate": 0.000426974054266583, + "loss": 0.2054, + "step": 180890 + }, + { + "epoch": 7.49, + "grad_norm": 0.6875, + "learning_rate": 0.0004269663939604196, + "loss": 0.261, + "step": 180900 + }, + { + "epoch": 7.49, + "grad_norm": 0.7421875, + "learning_rate": 0.00042695873332122445, + "loss": 0.2115, + "step": 180910 + }, + { + "epoch": 7.49, + "grad_norm": 0.220703125, + "learning_rate": 0.0004269510723490121, + "loss": 0.236, + "step": 180920 + }, + { + "epoch": 7.49, + "grad_norm": 1.1171875, + "learning_rate": 0.000426943411043797, + "loss": 0.2072, + "step": 180930 + }, + { + "epoch": 7.49, + "grad_norm": 0.9921875, + "learning_rate": 0.00042693574940559344, + "loss": 0.1948, + "step": 180940 + }, + { + "epoch": 7.49, + "grad_norm": 0.28515625, + "learning_rate": 0.0004269280874344159, + "loss": 0.1724, + "step": 180950 + }, + { + "epoch": 7.5, + "grad_norm": 0.70703125, + "learning_rate": 0.00042692042513027887, + "loss": 0.2164, + "step": 180960 + }, + { + "epoch": 7.5, + "grad_norm": 0.8671875, + "learning_rate": 0.00042691276249319667, + "loss": 0.2007, + "step": 180970 + }, + { + "epoch": 7.5, + "grad_norm": 0.609375, + "learning_rate": 0.0004269050995231838, + "loss": 0.1676, + "step": 180980 + }, + { + "epoch": 7.5, + "grad_norm": 0.6171875, + "learning_rate": 0.00042689743622025455, + "loss": 0.1824, + "step": 180990 + }, + { + "epoch": 7.5, + "grad_norm": 0.66015625, + "learning_rate": 0.00042688977258442353, + "loss": 0.1609, + "step": 181000 + }, + { + "epoch": 7.5, + "grad_norm": 0.51953125, + "learning_rate": 0.000426882108615705, + "loss": 0.1669, + "step": 181010 + }, + { + "epoch": 7.5, + "grad_norm": 0.267578125, + "learning_rate": 0.00042687444431411343, + "loss": 0.2152, + "step": 181020 + }, + { + "epoch": 7.5, + "grad_norm": 0.796875, + "learning_rate": 0.0004268667796796633, + "loss": 0.1871, + "step": 181030 + }, + { + "epoch": 7.5, + "grad_norm": 0.5859375, + "learning_rate": 0.0004268591147123689, + "loss": 0.2326, + "step": 181040 + }, + { + "epoch": 7.5, + "grad_norm": 0.515625, + "learning_rate": 0.00042685144941224484, + "loss": 0.2371, + "step": 181050 + }, + { + "epoch": 7.5, + "grad_norm": 0.0, + "learning_rate": 0.00042684378377930545, + "loss": 0.164, + "step": 181060 + }, + { + "epoch": 7.5, + "grad_norm": 0.8359375, + "learning_rate": 0.0004268361178135651, + "loss": 0.2249, + "step": 181070 + }, + { + "epoch": 7.5, + "grad_norm": 0.734375, + "learning_rate": 0.00042682845151503833, + "loss": 0.1919, + "step": 181080 + }, + { + "epoch": 7.5, + "grad_norm": 0.6484375, + "learning_rate": 0.0004268207848837395, + "loss": 0.2277, + "step": 181090 + }, + { + "epoch": 7.5, + "grad_norm": 0.61328125, + "learning_rate": 0.00042681311791968303, + "loss": 0.1811, + "step": 181100 + }, + { + "epoch": 7.5, + "grad_norm": 0.357421875, + "learning_rate": 0.0004268054506228834, + "loss": 0.1807, + "step": 181110 + }, + { + "epoch": 7.5, + "grad_norm": 0.671875, + "learning_rate": 0.0004267977829933549, + "loss": 0.179, + "step": 181120 + }, + { + "epoch": 7.5, + "grad_norm": 0.76953125, + "learning_rate": 0.0004267901150311122, + "loss": 0.2039, + "step": 181130 + }, + { + "epoch": 7.5, + "grad_norm": 0.72265625, + "learning_rate": 0.0004267824467361695, + "loss": 0.2201, + "step": 181140 + }, + { + "epoch": 7.5, + "grad_norm": 0.703125, + "learning_rate": 0.00042677477810854135, + "loss": 0.2389, + "step": 181150 + }, + { + "epoch": 7.5, + "grad_norm": 0.486328125, + "learning_rate": 0.00042676710914824224, + "loss": 0.2052, + "step": 181160 + }, + { + "epoch": 7.5, + "grad_norm": 0.388671875, + "learning_rate": 0.00042675943985528644, + "loss": 0.2109, + "step": 181170 + }, + { + "epoch": 7.5, + "grad_norm": 0.396484375, + "learning_rate": 0.00042675177022968847, + "loss": 0.2052, + "step": 181180 + }, + { + "epoch": 7.5, + "grad_norm": 0.443359375, + "learning_rate": 0.0004267441002714627, + "loss": 0.2077, + "step": 181190 + }, + { + "epoch": 7.51, + "grad_norm": 0.5390625, + "learning_rate": 0.0004267364299806237, + "loss": 0.1804, + "step": 181200 + }, + { + "epoch": 7.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0004267287593571858, + "loss": 0.2103, + "step": 181210 + }, + { + "epoch": 7.51, + "grad_norm": 0.33984375, + "learning_rate": 0.0004267210884011635, + "loss": 0.1958, + "step": 181220 + }, + { + "epoch": 7.51, + "grad_norm": 0.3828125, + "learning_rate": 0.0004267134171125711, + "loss": 0.1368, + "step": 181230 + }, + { + "epoch": 7.51, + "grad_norm": 0.625, + "learning_rate": 0.0004267057454914232, + "loss": 0.2053, + "step": 181240 + }, + { + "epoch": 7.51, + "grad_norm": 0.7421875, + "learning_rate": 0.00042669807353773416, + "loss": 0.2298, + "step": 181250 + }, + { + "epoch": 7.51, + "grad_norm": 1.15625, + "learning_rate": 0.00042669040125151847, + "loss": 0.1757, + "step": 181260 + }, + { + "epoch": 7.51, + "grad_norm": 0.578125, + "learning_rate": 0.00042668272863279045, + "loss": 0.204, + "step": 181270 + }, + { + "epoch": 7.51, + "grad_norm": 1.3828125, + "learning_rate": 0.0004266750556815646, + "loss": 0.2085, + "step": 181280 + }, + { + "epoch": 7.51, + "grad_norm": 0.7421875, + "learning_rate": 0.00042666738239785545, + "loss": 0.264, + "step": 181290 + }, + { + "epoch": 7.51, + "grad_norm": 1.234375, + "learning_rate": 0.00042665970878167736, + "loss": 0.2049, + "step": 181300 + }, + { + "epoch": 7.51, + "grad_norm": 1.0546875, + "learning_rate": 0.0004266520348330447, + "loss": 0.1409, + "step": 181310 + }, + { + "epoch": 7.51, + "grad_norm": 1.4765625, + "learning_rate": 0.00042664436055197207, + "loss": 0.2106, + "step": 181320 + }, + { + "epoch": 7.51, + "grad_norm": 0.71484375, + "learning_rate": 0.0004266366859384738, + "loss": 0.1686, + "step": 181330 + }, + { + "epoch": 7.51, + "grad_norm": 0.7109375, + "learning_rate": 0.0004266290109925644, + "loss": 0.2467, + "step": 181340 + }, + { + "epoch": 7.51, + "grad_norm": 0.73828125, + "learning_rate": 0.0004266213357142582, + "loss": 0.2639, + "step": 181350 + }, + { + "epoch": 7.51, + "grad_norm": 1.296875, + "learning_rate": 0.00042661366010356974, + "loss": 0.1542, + "step": 181360 + }, + { + "epoch": 7.51, + "grad_norm": 0.458984375, + "learning_rate": 0.0004266059841605135, + "loss": 0.1211, + "step": 181370 + }, + { + "epoch": 7.51, + "grad_norm": 0.4375, + "learning_rate": 0.0004265983078851038, + "loss": 0.2651, + "step": 181380 + }, + { + "epoch": 7.51, + "grad_norm": 1.0234375, + "learning_rate": 0.0004265906312773552, + "loss": 0.2435, + "step": 181390 + }, + { + "epoch": 7.51, + "grad_norm": 0.5703125, + "learning_rate": 0.0004265829543372821, + "loss": 0.196, + "step": 181400 + }, + { + "epoch": 7.51, + "grad_norm": 0.55078125, + "learning_rate": 0.00042657527706489897, + "loss": 0.2009, + "step": 181410 + }, + { + "epoch": 7.51, + "grad_norm": 0.408203125, + "learning_rate": 0.00042656759946022016, + "loss": 0.1608, + "step": 181420 + }, + { + "epoch": 7.51, + "grad_norm": 0.9609375, + "learning_rate": 0.0004265599215232603, + "loss": 0.1847, + "step": 181430 + }, + { + "epoch": 7.52, + "grad_norm": 1.015625, + "learning_rate": 0.00042655224325403363, + "loss": 0.2012, + "step": 181440 + }, + { + "epoch": 7.52, + "grad_norm": 0.8359375, + "learning_rate": 0.00042654456465255475, + "loss": 0.1725, + "step": 181450 + }, + { + "epoch": 7.52, + "grad_norm": 0.345703125, + "learning_rate": 0.0004265368857188381, + "loss": 0.1996, + "step": 181460 + }, + { + "epoch": 7.52, + "grad_norm": 0.65234375, + "learning_rate": 0.000426529206452898, + "loss": 0.1832, + "step": 181470 + }, + { + "epoch": 7.52, + "grad_norm": 0.7421875, + "learning_rate": 0.0004265215268547491, + "loss": 0.1908, + "step": 181480 + }, + { + "epoch": 7.52, + "grad_norm": 0.80859375, + "learning_rate": 0.00042651384692440566, + "loss": 0.1847, + "step": 181490 + }, + { + "epoch": 7.52, + "grad_norm": 0.478515625, + "learning_rate": 0.00042650616666188226, + "loss": 0.1921, + "step": 181500 + }, + { + "epoch": 7.52, + "grad_norm": 0.484375, + "learning_rate": 0.0004264984860671933, + "loss": 0.2274, + "step": 181510 + }, + { + "epoch": 7.52, + "grad_norm": 0.94140625, + "learning_rate": 0.00042649080514035333, + "loss": 0.198, + "step": 181520 + }, + { + "epoch": 7.52, + "grad_norm": 0.5078125, + "learning_rate": 0.0004264831238813767, + "loss": 0.1474, + "step": 181530 + }, + { + "epoch": 7.52, + "grad_norm": 0.9453125, + "learning_rate": 0.0004264754422902778, + "loss": 0.2471, + "step": 181540 + }, + { + "epoch": 7.52, + "grad_norm": 0.75, + "learning_rate": 0.00042646776036707126, + "loss": 0.2253, + "step": 181550 + }, + { + "epoch": 7.52, + "grad_norm": 0.91796875, + "learning_rate": 0.0004264600781117714, + "loss": 0.1716, + "step": 181560 + }, + { + "epoch": 7.52, + "grad_norm": 3.34375, + "learning_rate": 0.00042645239552439277, + "loss": 0.1846, + "step": 181570 + }, + { + "epoch": 7.52, + "grad_norm": 0.9921875, + "learning_rate": 0.00042644471260494976, + "loss": 0.1656, + "step": 181580 + }, + { + "epoch": 7.52, + "grad_norm": 0.6171875, + "learning_rate": 0.00042643702935345684, + "loss": 0.2535, + "step": 181590 + }, + { + "epoch": 7.52, + "grad_norm": 0.28515625, + "learning_rate": 0.00042642934576992846, + "loss": 0.2122, + "step": 181600 + }, + { + "epoch": 7.52, + "grad_norm": 0.61328125, + "learning_rate": 0.0004264216618543792, + "loss": 0.1842, + "step": 181610 + }, + { + "epoch": 7.52, + "grad_norm": 0.7734375, + "learning_rate": 0.00042641397760682335, + "loss": 0.163, + "step": 181620 + }, + { + "epoch": 7.52, + "grad_norm": 1.234375, + "learning_rate": 0.0004264062930272755, + "loss": 0.1904, + "step": 181630 + }, + { + "epoch": 7.52, + "grad_norm": 1.71875, + "learning_rate": 0.00042639860811575006, + "loss": 0.2249, + "step": 181640 + }, + { + "epoch": 7.52, + "grad_norm": 0.82421875, + "learning_rate": 0.0004263909228722614, + "loss": 0.2116, + "step": 181650 + }, + { + "epoch": 7.52, + "grad_norm": 0.703125, + "learning_rate": 0.00042638323729682413, + "loss": 0.1961, + "step": 181660 + }, + { + "epoch": 7.52, + "grad_norm": 0.6796875, + "learning_rate": 0.00042637555138945265, + "loss": 0.2157, + "step": 181670 + }, + { + "epoch": 7.53, + "grad_norm": 0.369140625, + "learning_rate": 0.00042636786515016145, + "loss": 0.1902, + "step": 181680 + }, + { + "epoch": 7.53, + "grad_norm": 0.64453125, + "learning_rate": 0.0004263601785789649, + "loss": 0.1788, + "step": 181690 + }, + { + "epoch": 7.53, + "grad_norm": 1.71875, + "learning_rate": 0.00042635249167587765, + "loss": 0.1812, + "step": 181700 + }, + { + "epoch": 7.53, + "grad_norm": 0.94140625, + "learning_rate": 0.000426344804440914, + "loss": 0.2491, + "step": 181710 + }, + { + "epoch": 7.53, + "grad_norm": 0.478515625, + "learning_rate": 0.00042633711687408847, + "loss": 0.2069, + "step": 181720 + }, + { + "epoch": 7.53, + "grad_norm": 0.490234375, + "learning_rate": 0.0004263294289754155, + "loss": 0.2334, + "step": 181730 + }, + { + "epoch": 7.53, + "grad_norm": 0.458984375, + "learning_rate": 0.00042632174074490965, + "loss": 0.2259, + "step": 181740 + }, + { + "epoch": 7.53, + "grad_norm": 0.96875, + "learning_rate": 0.00042631405218258536, + "loss": 0.2461, + "step": 181750 + }, + { + "epoch": 7.53, + "grad_norm": 0.71484375, + "learning_rate": 0.0004263063632884569, + "loss": 0.2209, + "step": 181760 + }, + { + "epoch": 7.53, + "grad_norm": 1.875, + "learning_rate": 0.00042629867406253905, + "loss": 0.1968, + "step": 181770 + }, + { + "epoch": 7.53, + "grad_norm": 0.416015625, + "learning_rate": 0.00042629098450484604, + "loss": 0.1735, + "step": 181780 + }, + { + "epoch": 7.53, + "grad_norm": 0.7890625, + "learning_rate": 0.0004262832946153925, + "loss": 0.222, + "step": 181790 + }, + { + "epoch": 7.53, + "grad_norm": 0.6796875, + "learning_rate": 0.00042627560439419284, + "loss": 0.2182, + "step": 181800 + }, + { + "epoch": 7.53, + "grad_norm": 0.8046875, + "learning_rate": 0.0004262679138412615, + "loss": 0.1302, + "step": 181810 + }, + { + "epoch": 7.53, + "grad_norm": 1.109375, + "learning_rate": 0.00042626022295661294, + "loss": 0.205, + "step": 181820 + }, + { + "epoch": 7.53, + "grad_norm": 0.98828125, + "learning_rate": 0.0004262525317402617, + "loss": 0.1869, + "step": 181830 + }, + { + "epoch": 7.53, + "grad_norm": 0.6328125, + "learning_rate": 0.0004262448401922223, + "loss": 0.1823, + "step": 181840 + }, + { + "epoch": 7.53, + "grad_norm": 1.21875, + "learning_rate": 0.0004262371483125091, + "loss": 0.2143, + "step": 181850 + }, + { + "epoch": 7.53, + "grad_norm": 0.73046875, + "learning_rate": 0.0004262294561011366, + "loss": 0.2012, + "step": 181860 + }, + { + "epoch": 7.53, + "grad_norm": 0.76171875, + "learning_rate": 0.0004262217635581193, + "loss": 0.2015, + "step": 181870 + }, + { + "epoch": 7.53, + "grad_norm": 1.5234375, + "learning_rate": 0.00042621407068347167, + "loss": 0.2221, + "step": 181880 + }, + { + "epoch": 7.53, + "grad_norm": 0.55859375, + "learning_rate": 0.0004262063774772082, + "loss": 0.173, + "step": 181890 + }, + { + "epoch": 7.53, + "grad_norm": 0.58984375, + "learning_rate": 0.00042619868393934334, + "loss": 0.2308, + "step": 181900 + }, + { + "epoch": 7.53, + "grad_norm": 0.7109375, + "learning_rate": 0.0004261909900698916, + "loss": 0.1793, + "step": 181910 + }, + { + "epoch": 7.54, + "grad_norm": 0.99609375, + "learning_rate": 0.00042618329586886745, + "loss": 0.184, + "step": 181920 + }, + { + "epoch": 7.54, + "grad_norm": 0.8203125, + "learning_rate": 0.00042617560133628534, + "loss": 0.2091, + "step": 181930 + }, + { + "epoch": 7.54, + "grad_norm": 0.6171875, + "learning_rate": 0.0004261679064721598, + "loss": 0.2157, + "step": 181940 + }, + { + "epoch": 7.54, + "grad_norm": 0.77734375, + "learning_rate": 0.00042616021127650527, + "loss": 0.2157, + "step": 181950 + }, + { + "epoch": 7.54, + "grad_norm": 0.93359375, + "learning_rate": 0.0004261525157493362, + "loss": 0.1974, + "step": 181960 + }, + { + "epoch": 7.54, + "grad_norm": 0.59765625, + "learning_rate": 0.00042614481989066723, + "loss": 0.2345, + "step": 181970 + }, + { + "epoch": 7.54, + "grad_norm": 0.76171875, + "learning_rate": 0.00042613712370051266, + "loss": 0.1834, + "step": 181980 + }, + { + "epoch": 7.54, + "grad_norm": 0.828125, + "learning_rate": 0.000426129427178887, + "loss": 0.2128, + "step": 181990 + }, + { + "epoch": 7.54, + "grad_norm": 0.640625, + "learning_rate": 0.0004261217303258049, + "loss": 0.1737, + "step": 182000 + }, + { + "epoch": 7.54, + "grad_norm": 0.48046875, + "learning_rate": 0.00042611403314128063, + "loss": 0.2156, + "step": 182010 + }, + { + "epoch": 7.54, + "grad_norm": 0.9375, + "learning_rate": 0.0004261063356253288, + "loss": 0.2091, + "step": 182020 + }, + { + "epoch": 7.54, + "grad_norm": 0.8984375, + "learning_rate": 0.0004260986377779639, + "loss": 0.1886, + "step": 182030 + }, + { + "epoch": 7.54, + "grad_norm": 1.078125, + "learning_rate": 0.00042609093959920027, + "loss": 0.184, + "step": 182040 + }, + { + "epoch": 7.54, + "grad_norm": 1.1796875, + "learning_rate": 0.0004260832410890526, + "loss": 0.213, + "step": 182050 + }, + { + "epoch": 7.54, + "grad_norm": 0.7421875, + "learning_rate": 0.0004260755422475353, + "loss": 0.1719, + "step": 182060 + }, + { + "epoch": 7.54, + "grad_norm": 0.76953125, + "learning_rate": 0.00042606784307466273, + "loss": 0.1862, + "step": 182070 + }, + { + "epoch": 7.54, + "grad_norm": 0.96484375, + "learning_rate": 0.0004260601435704496, + "loss": 0.1704, + "step": 182080 + }, + { + "epoch": 7.54, + "grad_norm": 0.72265625, + "learning_rate": 0.00042605244373491026, + "loss": 0.2313, + "step": 182090 + }, + { + "epoch": 7.54, + "grad_norm": 0.609375, + "learning_rate": 0.0004260447435680592, + "loss": 0.1728, + "step": 182100 + }, + { + "epoch": 7.54, + "grad_norm": 0.91015625, + "learning_rate": 0.00042603704306991096, + "loss": 0.1731, + "step": 182110 + }, + { + "epoch": 7.54, + "grad_norm": 1.015625, + "learning_rate": 0.00042602934224048007, + "loss": 0.1642, + "step": 182120 + }, + { + "epoch": 7.54, + "grad_norm": 1.078125, + "learning_rate": 0.0004260216410797809, + "loss": 0.1997, + "step": 182130 + }, + { + "epoch": 7.54, + "grad_norm": 0.33203125, + "learning_rate": 0.0004260139395878281, + "loss": 0.2151, + "step": 182140 + }, + { + "epoch": 7.54, + "grad_norm": 0.671875, + "learning_rate": 0.000426006237764636, + "loss": 0.2193, + "step": 182150 + }, + { + "epoch": 7.55, + "grad_norm": 0.8515625, + "learning_rate": 0.0004259985356102192, + "loss": 0.2009, + "step": 182160 + }, + { + "epoch": 7.55, + "grad_norm": 1.6484375, + "learning_rate": 0.0004259908331245921, + "loss": 0.2441, + "step": 182170 + }, + { + "epoch": 7.55, + "grad_norm": 1.2734375, + "learning_rate": 0.0004259831303077693, + "loss": 0.193, + "step": 182180 + }, + { + "epoch": 7.55, + "grad_norm": 1.4140625, + "learning_rate": 0.00042597542715976523, + "loss": 0.2215, + "step": 182190 + }, + { + "epoch": 7.55, + "grad_norm": 0.5546875, + "learning_rate": 0.0004259677236805944, + "loss": 0.243, + "step": 182200 + }, + { + "epoch": 7.55, + "grad_norm": 0.5234375, + "learning_rate": 0.0004259600198702714, + "loss": 0.1987, + "step": 182210 + }, + { + "epoch": 7.55, + "grad_norm": 1.5, + "learning_rate": 0.0004259523157288106, + "loss": 0.1872, + "step": 182220 + }, + { + "epoch": 7.55, + "grad_norm": 0.6015625, + "learning_rate": 0.0004259446112562265, + "loss": 0.2255, + "step": 182230 + }, + { + "epoch": 7.55, + "grad_norm": 1.1796875, + "learning_rate": 0.0004259369064525337, + "loss": 0.1959, + "step": 182240 + }, + { + "epoch": 7.55, + "grad_norm": 0.83984375, + "learning_rate": 0.0004259292013177466, + "loss": 0.1937, + "step": 182250 + }, + { + "epoch": 7.55, + "grad_norm": 1.3125, + "learning_rate": 0.0004259214958518798, + "loss": 0.2089, + "step": 182260 + }, + { + "epoch": 7.55, + "grad_norm": 0.5625, + "learning_rate": 0.00042591379005494766, + "loss": 0.2137, + "step": 182270 + }, + { + "epoch": 7.55, + "grad_norm": 0.83203125, + "learning_rate": 0.00042590608392696486, + "loss": 0.173, + "step": 182280 + }, + { + "epoch": 7.55, + "grad_norm": 0.6171875, + "learning_rate": 0.00042589837746794576, + "loss": 0.2001, + "step": 182290 + }, + { + "epoch": 7.55, + "grad_norm": 0.5546875, + "learning_rate": 0.00042589067067790487, + "loss": 0.2439, + "step": 182300 + }, + { + "epoch": 7.55, + "grad_norm": 0.291015625, + "learning_rate": 0.0004258829635568568, + "loss": 0.257, + "step": 182310 + }, + { + "epoch": 7.55, + "grad_norm": 0.90625, + "learning_rate": 0.00042587525610481594, + "loss": 0.2472, + "step": 182320 + }, + { + "epoch": 7.55, + "grad_norm": 0.765625, + "learning_rate": 0.00042586754832179684, + "loss": 0.1983, + "step": 182330 + }, + { + "epoch": 7.55, + "grad_norm": 0.490234375, + "learning_rate": 0.000425859840207814, + "loss": 0.1853, + "step": 182340 + }, + { + "epoch": 7.55, + "grad_norm": 0.2138671875, + "learning_rate": 0.000425852131762882, + "loss": 0.2145, + "step": 182350 + }, + { + "epoch": 7.55, + "grad_norm": 0.60546875, + "learning_rate": 0.0004258444229870152, + "loss": 0.1769, + "step": 182360 + }, + { + "epoch": 7.55, + "grad_norm": 0.64453125, + "learning_rate": 0.0004258367138802283, + "loss": 0.1734, + "step": 182370 + }, + { + "epoch": 7.55, + "grad_norm": 0.2431640625, + "learning_rate": 0.00042582900444253555, + "loss": 0.2284, + "step": 182380 + }, + { + "epoch": 7.55, + "grad_norm": 0.38671875, + "learning_rate": 0.00042582129467395164, + "loss": 0.2242, + "step": 182390 + }, + { + "epoch": 7.55, + "grad_norm": 1.59375, + "learning_rate": 0.0004258135845744911, + "loss": 0.1941, + "step": 182400 + }, + { + "epoch": 7.56, + "grad_norm": 0.71484375, + "learning_rate": 0.0004258058741441683, + "loss": 0.2355, + "step": 182410 + }, + { + "epoch": 7.56, + "grad_norm": 0.431640625, + "learning_rate": 0.0004257981633829979, + "loss": 0.1344, + "step": 182420 + }, + { + "epoch": 7.56, + "grad_norm": 0.474609375, + "learning_rate": 0.0004257904522909943, + "loss": 0.197, + "step": 182430 + }, + { + "epoch": 7.56, + "grad_norm": 1.3046875, + "learning_rate": 0.000425782740868172, + "loss": 0.1913, + "step": 182440 + }, + { + "epoch": 7.56, + "grad_norm": 1.5703125, + "learning_rate": 0.00042577502911454566, + "loss": 0.1992, + "step": 182450 + }, + { + "epoch": 7.56, + "grad_norm": 0.50390625, + "learning_rate": 0.0004257673170301297, + "loss": 0.1982, + "step": 182460 + }, + { + "epoch": 7.56, + "grad_norm": 0.7109375, + "learning_rate": 0.0004257596046149386, + "loss": 0.1446, + "step": 182470 + }, + { + "epoch": 7.56, + "grad_norm": 1.3203125, + "learning_rate": 0.0004257518918689869, + "loss": 0.1648, + "step": 182480 + }, + { + "epoch": 7.56, + "grad_norm": 0.87890625, + "learning_rate": 0.0004257441787922891, + "loss": 0.193, + "step": 182490 + }, + { + "epoch": 7.56, + "grad_norm": 0.80859375, + "learning_rate": 0.0004257364653848598, + "loss": 0.2503, + "step": 182500 + }, + { + "epoch": 7.56, + "grad_norm": 0.87109375, + "learning_rate": 0.0004257287516467134, + "loss": 0.182, + "step": 182510 + }, + { + "epoch": 7.56, + "grad_norm": 0.76171875, + "learning_rate": 0.00042572103757786443, + "loss": 0.2372, + "step": 182520 + }, + { + "epoch": 7.56, + "grad_norm": 0.23046875, + "learning_rate": 0.00042571332317832756, + "loss": 0.2322, + "step": 182530 + }, + { + "epoch": 7.56, + "grad_norm": 1.9765625, + "learning_rate": 0.00042570560844811705, + "loss": 0.2343, + "step": 182540 + }, + { + "epoch": 7.56, + "grad_norm": 0.90234375, + "learning_rate": 0.00042569789338724764, + "loss": 0.2086, + "step": 182550 + }, + { + "epoch": 7.56, + "grad_norm": 0.435546875, + "learning_rate": 0.00042569017799573376, + "loss": 0.1448, + "step": 182560 + }, + { + "epoch": 7.56, + "grad_norm": 1.828125, + "learning_rate": 0.00042568246227358994, + "loss": 0.2161, + "step": 182570 + }, + { + "epoch": 7.56, + "grad_norm": 0.41796875, + "learning_rate": 0.0004256747462208307, + "loss": 0.1745, + "step": 182580 + }, + { + "epoch": 7.56, + "grad_norm": 1.8515625, + "learning_rate": 0.0004256670298374705, + "loss": 0.233, + "step": 182590 + }, + { + "epoch": 7.56, + "grad_norm": 0.58984375, + "learning_rate": 0.000425659313123524, + "loss": 0.2217, + "step": 182600 + }, + { + "epoch": 7.56, + "grad_norm": 0.96875, + "learning_rate": 0.00042565159607900563, + "loss": 0.1913, + "step": 182610 + }, + { + "epoch": 7.56, + "grad_norm": 0.8515625, + "learning_rate": 0.0004256438787039299, + "loss": 0.199, + "step": 182620 + }, + { + "epoch": 7.56, + "grad_norm": 1.109375, + "learning_rate": 0.0004256361609983114, + "loss": 0.1998, + "step": 182630 + }, + { + "epoch": 7.56, + "grad_norm": 0.80078125, + "learning_rate": 0.0004256284429621645, + "loss": 0.2201, + "step": 182640 + }, + { + "epoch": 7.57, + "grad_norm": 0.345703125, + "learning_rate": 0.0004256207245955039, + "loss": 0.2402, + "step": 182650 + }, + { + "epoch": 7.57, + "grad_norm": 0.765625, + "learning_rate": 0.0004256130058983442, + "loss": 0.1696, + "step": 182660 + }, + { + "epoch": 7.57, + "grad_norm": 0.330078125, + "learning_rate": 0.0004256052868706996, + "loss": 0.2006, + "step": 182670 + }, + { + "epoch": 7.57, + "grad_norm": 0.3515625, + "learning_rate": 0.00042559756751258483, + "loss": 0.1947, + "step": 182680 + }, + { + "epoch": 7.57, + "grad_norm": 0.419921875, + "learning_rate": 0.0004255898478240145, + "loss": 0.2172, + "step": 182690 + }, + { + "epoch": 7.57, + "grad_norm": 0.62109375, + "learning_rate": 0.00042558212780500294, + "loss": 0.1842, + "step": 182700 + }, + { + "epoch": 7.57, + "grad_norm": 0.93359375, + "learning_rate": 0.00042557440745556485, + "loss": 0.2106, + "step": 182710 + }, + { + "epoch": 7.57, + "grad_norm": 0.6171875, + "learning_rate": 0.00042556668677571464, + "loss": 0.1823, + "step": 182720 + }, + { + "epoch": 7.57, + "grad_norm": 1.2890625, + "learning_rate": 0.00042555896576546693, + "loss": 0.182, + "step": 182730 + }, + { + "epoch": 7.57, + "grad_norm": 0.57421875, + "learning_rate": 0.00042555124442483614, + "loss": 0.1694, + "step": 182740 + }, + { + "epoch": 7.57, + "grad_norm": 0.9375, + "learning_rate": 0.0004255435227538369, + "loss": 0.2124, + "step": 182750 + }, + { + "epoch": 7.57, + "grad_norm": 1.5859375, + "learning_rate": 0.0004255358007524838, + "loss": 0.223, + "step": 182760 + }, + { + "epoch": 7.57, + "grad_norm": 0.46484375, + "learning_rate": 0.0004255280784207911, + "loss": 0.17, + "step": 182770 + }, + { + "epoch": 7.57, + "grad_norm": 1.0, + "learning_rate": 0.00042552035575877366, + "loss": 0.1673, + "step": 182780 + }, + { + "epoch": 7.57, + "grad_norm": 0.6015625, + "learning_rate": 0.0004255126327664458, + "loss": 0.178, + "step": 182790 + }, + { + "epoch": 7.57, + "grad_norm": 0.64453125, + "learning_rate": 0.00042550490944382206, + "loss": 0.1717, + "step": 182800 + }, + { + "epoch": 7.57, + "grad_norm": 0.435546875, + "learning_rate": 0.0004254971857909171, + "loss": 0.1814, + "step": 182810 + }, + { + "epoch": 7.57, + "grad_norm": 0.5, + "learning_rate": 0.0004254894618077455, + "loss": 0.2017, + "step": 182820 + }, + { + "epoch": 7.57, + "grad_norm": 0.51171875, + "learning_rate": 0.0004254817374943215, + "loss": 0.1957, + "step": 182830 + }, + { + "epoch": 7.57, + "grad_norm": 2.09375, + "learning_rate": 0.0004254740128506599, + "loss": 0.2228, + "step": 182840 + }, + { + "epoch": 7.57, + "grad_norm": 0.48046875, + "learning_rate": 0.00042546628787677515, + "loss": 0.1796, + "step": 182850 + }, + { + "epoch": 7.57, + "grad_norm": 1.25, + "learning_rate": 0.0004254585625726818, + "loss": 0.2443, + "step": 182860 + }, + { + "epoch": 7.57, + "grad_norm": 1.140625, + "learning_rate": 0.00042545083693839436, + "loss": 0.2325, + "step": 182870 + }, + { + "epoch": 7.57, + "grad_norm": 0.5703125, + "learning_rate": 0.0004254431109739274, + "loss": 0.212, + "step": 182880 + }, + { + "epoch": 7.58, + "grad_norm": 0.72265625, + "learning_rate": 0.00042543538467929547, + "loss": 0.2457, + "step": 182890 + }, + { + "epoch": 7.58, + "grad_norm": 1.421875, + "learning_rate": 0.0004254276580545131, + "loss": 0.1799, + "step": 182900 + }, + { + "epoch": 7.58, + "grad_norm": 0.5703125, + "learning_rate": 0.0004254199310995948, + "loss": 0.202, + "step": 182910 + }, + { + "epoch": 7.58, + "grad_norm": 0.0, + "learning_rate": 0.00042541220381455514, + "loss": 0.1707, + "step": 182920 + }, + { + "epoch": 7.58, + "grad_norm": 0.62109375, + "learning_rate": 0.0004254044761994087, + "loss": 0.1908, + "step": 182930 + }, + { + "epoch": 7.58, + "grad_norm": 0.341796875, + "learning_rate": 0.0004253967482541699, + "loss": 0.1894, + "step": 182940 + }, + { + "epoch": 7.58, + "grad_norm": 0.69140625, + "learning_rate": 0.0004253890199788534, + "loss": 0.2058, + "step": 182950 + }, + { + "epoch": 7.58, + "grad_norm": 0.72265625, + "learning_rate": 0.0004253812913734737, + "loss": 0.1851, + "step": 182960 + }, + { + "epoch": 7.58, + "grad_norm": 0.8671875, + "learning_rate": 0.00042537356243804535, + "loss": 0.205, + "step": 182970 + }, + { + "epoch": 7.58, + "grad_norm": 0.6328125, + "learning_rate": 0.0004253658331725829, + "loss": 0.144, + "step": 182980 + }, + { + "epoch": 7.58, + "grad_norm": 0.58203125, + "learning_rate": 0.00042535810357710086, + "loss": 0.2198, + "step": 182990 + }, + { + "epoch": 7.58, + "grad_norm": 0.92578125, + "learning_rate": 0.00042535037365161384, + "loss": 0.2091, + "step": 183000 + }, + { + "epoch": 7.58, + "grad_norm": 0.72265625, + "learning_rate": 0.00042534264339613633, + "loss": 0.1775, + "step": 183010 + }, + { + "epoch": 7.58, + "grad_norm": 0.76171875, + "learning_rate": 0.0004253349128106829, + "loss": 0.2334, + "step": 183020 + }, + { + "epoch": 7.58, + "grad_norm": 0.8359375, + "learning_rate": 0.0004253271818952681, + "loss": 0.1891, + "step": 183030 + }, + { + "epoch": 7.58, + "grad_norm": 1.4609375, + "learning_rate": 0.0004253194506499065, + "loss": 0.1936, + "step": 183040 + }, + { + "epoch": 7.58, + "grad_norm": 0.8203125, + "learning_rate": 0.0004253117190746126, + "loss": 0.2947, + "step": 183050 + }, + { + "epoch": 7.58, + "grad_norm": 0.62890625, + "learning_rate": 0.000425303987169401, + "loss": 0.1832, + "step": 183060 + }, + { + "epoch": 7.58, + "grad_norm": 1.0625, + "learning_rate": 0.0004252962549342863, + "loss": 0.1754, + "step": 183070 + }, + { + "epoch": 7.58, + "grad_norm": 0.85546875, + "learning_rate": 0.0004252885223692828, + "loss": 0.1993, + "step": 183080 + }, + { + "epoch": 7.58, + "grad_norm": 0.478515625, + "learning_rate": 0.0004252807894744053, + "loss": 0.2186, + "step": 183090 + }, + { + "epoch": 7.58, + "grad_norm": 0.5078125, + "learning_rate": 0.0004252730562496684, + "loss": 0.2016, + "step": 183100 + }, + { + "epoch": 7.58, + "grad_norm": 1.09375, + "learning_rate": 0.00042526532269508645, + "loss": 0.2259, + "step": 183110 + }, + { + "epoch": 7.58, + "grad_norm": 0.546875, + "learning_rate": 0.000425257588810674, + "loss": 0.1678, + "step": 183120 + }, + { + "epoch": 7.59, + "grad_norm": 0.80859375, + "learning_rate": 0.00042524985459644585, + "loss": 0.2164, + "step": 183130 + }, + { + "epoch": 7.59, + "grad_norm": 1.03125, + "learning_rate": 0.00042524212005241624, + "loss": 0.2158, + "step": 183140 + }, + { + "epoch": 7.59, + "grad_norm": 0.65234375, + "learning_rate": 0.00042523438517860004, + "loss": 0.2297, + "step": 183150 + }, + { + "epoch": 7.59, + "grad_norm": 0.890625, + "learning_rate": 0.0004252266499750116, + "loss": 0.2236, + "step": 183160 + }, + { + "epoch": 7.59, + "grad_norm": 0.80859375, + "learning_rate": 0.0004252189144416655, + "loss": 0.235, + "step": 183170 + }, + { + "epoch": 7.59, + "grad_norm": 0.953125, + "learning_rate": 0.0004252111785785763, + "loss": 0.2316, + "step": 183180 + }, + { + "epoch": 7.59, + "grad_norm": 0.32421875, + "learning_rate": 0.00042520344238575864, + "loss": 0.2175, + "step": 183190 + }, + { + "epoch": 7.59, + "grad_norm": 0.953125, + "learning_rate": 0.000425195705863227, + "loss": 0.2252, + "step": 183200 + }, + { + "epoch": 7.59, + "grad_norm": 1.0859375, + "learning_rate": 0.00042518796901099595, + "loss": 0.21, + "step": 183210 + }, + { + "epoch": 7.59, + "grad_norm": 0.94140625, + "learning_rate": 0.00042518023182908007, + "loss": 0.2451, + "step": 183220 + }, + { + "epoch": 7.59, + "grad_norm": 0.275390625, + "learning_rate": 0.0004251724943174939, + "loss": 0.1905, + "step": 183230 + }, + { + "epoch": 7.59, + "grad_norm": 0.796875, + "learning_rate": 0.0004251647564762521, + "loss": 0.2029, + "step": 183240 + }, + { + "epoch": 7.59, + "grad_norm": 0.84765625, + "learning_rate": 0.000425157018305369, + "loss": 0.2325, + "step": 183250 + }, + { + "epoch": 7.59, + "grad_norm": 0.77734375, + "learning_rate": 0.0004251492798048594, + "loss": 0.2198, + "step": 183260 + }, + { + "epoch": 7.59, + "grad_norm": 0.5234375, + "learning_rate": 0.0004251415409747378, + "loss": 0.2147, + "step": 183270 + }, + { + "epoch": 7.59, + "grad_norm": 0.62109375, + "learning_rate": 0.0004251338018150186, + "loss": 0.233, + "step": 183280 + }, + { + "epoch": 7.59, + "grad_norm": 0.69921875, + "learning_rate": 0.0004251260623257166, + "loss": 0.2371, + "step": 183290 + }, + { + "epoch": 7.59, + "grad_norm": 0.5703125, + "learning_rate": 0.00042511832250684625, + "loss": 0.2055, + "step": 183300 + }, + { + "epoch": 7.59, + "grad_norm": 0.0, + "learning_rate": 0.00042511058235842215, + "loss": 0.1773, + "step": 183310 + }, + { + "epoch": 7.59, + "grad_norm": 0.59765625, + "learning_rate": 0.0004251028418804588, + "loss": 0.2317, + "step": 183320 + }, + { + "epoch": 7.59, + "grad_norm": 0.921875, + "learning_rate": 0.0004250951010729708, + "loss": 0.1374, + "step": 183330 + }, + { + "epoch": 7.59, + "grad_norm": 0.95703125, + "learning_rate": 0.00042508735993597273, + "loss": 0.1607, + "step": 183340 + }, + { + "epoch": 7.59, + "grad_norm": 0.59375, + "learning_rate": 0.00042507961846947916, + "loss": 0.1673, + "step": 183350 + }, + { + "epoch": 7.59, + "grad_norm": 1.0859375, + "learning_rate": 0.00042507187667350474, + "loss": 0.2673, + "step": 183360 + }, + { + "epoch": 7.6, + "grad_norm": 0.5078125, + "learning_rate": 0.0004250641345480639, + "loss": 0.1449, + "step": 183370 + }, + { + "epoch": 7.6, + "grad_norm": 1.078125, + "learning_rate": 0.0004250563920931712, + "loss": 0.246, + "step": 183380 + }, + { + "epoch": 7.6, + "grad_norm": 0.412109375, + "learning_rate": 0.00042504864930884137, + "loss": 0.1985, + "step": 183390 + }, + { + "epoch": 7.6, + "grad_norm": 0.78515625, + "learning_rate": 0.0004250409061950888, + "loss": 0.2172, + "step": 183400 + }, + { + "epoch": 7.6, + "grad_norm": 1.421875, + "learning_rate": 0.0004250331627519282, + "loss": 0.1675, + "step": 183410 + }, + { + "epoch": 7.6, + "grad_norm": 0.8203125, + "learning_rate": 0.0004250254189793741, + "loss": 0.197, + "step": 183420 + }, + { + "epoch": 7.6, + "grad_norm": 1.4609375, + "learning_rate": 0.000425017674877441, + "loss": 0.2919, + "step": 183430 + }, + { + "epoch": 7.6, + "grad_norm": 0.80859375, + "learning_rate": 0.0004250099304461436, + "loss": 0.217, + "step": 183440 + }, + { + "epoch": 7.6, + "grad_norm": 0.89453125, + "learning_rate": 0.00042500218568549645, + "loss": 0.1809, + "step": 183450 + }, + { + "epoch": 7.6, + "grad_norm": 0.400390625, + "learning_rate": 0.00042499444059551395, + "loss": 0.1797, + "step": 183460 + }, + { + "epoch": 7.6, + "grad_norm": 0.62890625, + "learning_rate": 0.0004249866951762109, + "loss": 0.2079, + "step": 183470 + }, + { + "epoch": 7.6, + "grad_norm": 0.66796875, + "learning_rate": 0.00042497894942760176, + "loss": 0.1328, + "step": 183480 + }, + { + "epoch": 7.6, + "grad_norm": 0.734375, + "learning_rate": 0.0004249712033497012, + "loss": 0.2063, + "step": 183490 + }, + { + "epoch": 7.6, + "grad_norm": 0.73828125, + "learning_rate": 0.0004249634569425237, + "loss": 0.2178, + "step": 183500 + }, + { + "epoch": 7.6, + "grad_norm": 0.6328125, + "learning_rate": 0.0004249557102060838, + "loss": 0.1885, + "step": 183510 + }, + { + "epoch": 7.6, + "grad_norm": 0.98828125, + "learning_rate": 0.00042494796314039617, + "loss": 0.2449, + "step": 183520 + }, + { + "epoch": 7.6, + "grad_norm": 0.9140625, + "learning_rate": 0.0004249402157454754, + "loss": 0.1592, + "step": 183530 + }, + { + "epoch": 7.6, + "grad_norm": 0.2275390625, + "learning_rate": 0.00042493246802133603, + "loss": 0.2251, + "step": 183540 + }, + { + "epoch": 7.6, + "grad_norm": 1.3125, + "learning_rate": 0.00042492471996799264, + "loss": 0.2332, + "step": 183550 + }, + { + "epoch": 7.6, + "grad_norm": 1.078125, + "learning_rate": 0.00042491697158545977, + "loss": 0.2109, + "step": 183560 + }, + { + "epoch": 7.6, + "grad_norm": 1.109375, + "learning_rate": 0.0004249092228737521, + "loss": 0.2252, + "step": 183570 + }, + { + "epoch": 7.6, + "grad_norm": 0.96484375, + "learning_rate": 0.00042490147383288427, + "loss": 0.2229, + "step": 183580 + }, + { + "epoch": 7.6, + "grad_norm": 0.314453125, + "learning_rate": 0.0004248937244628706, + "loss": 0.2003, + "step": 183590 + }, + { + "epoch": 7.6, + "grad_norm": 0.87109375, + "learning_rate": 0.0004248859747637258, + "loss": 0.216, + "step": 183600 + }, + { + "epoch": 7.61, + "grad_norm": 1.375, + "learning_rate": 0.00042487822473546457, + "loss": 0.1661, + "step": 183610 + }, + { + "epoch": 7.61, + "grad_norm": 0.66015625, + "learning_rate": 0.0004248704743781014, + "loss": 0.2086, + "step": 183620 + }, + { + "epoch": 7.61, + "grad_norm": 3.765625, + "learning_rate": 0.0004248627236916509, + "loss": 0.2213, + "step": 183630 + }, + { + "epoch": 7.61, + "grad_norm": 0.494140625, + "learning_rate": 0.00042485497267612753, + "loss": 0.1652, + "step": 183640 + }, + { + "epoch": 7.61, + "grad_norm": 0.7421875, + "learning_rate": 0.0004248472213315461, + "loss": 0.1538, + "step": 183650 + }, + { + "epoch": 7.61, + "grad_norm": 0.98828125, + "learning_rate": 0.0004248394696579211, + "loss": 0.1927, + "step": 183660 + }, + { + "epoch": 7.61, + "grad_norm": 1.390625, + "learning_rate": 0.00042483171765526694, + "loss": 0.1968, + "step": 183670 + }, + { + "epoch": 7.61, + "grad_norm": 0.546875, + "learning_rate": 0.0004248239653235985, + "loss": 0.2471, + "step": 183680 + }, + { + "epoch": 7.61, + "grad_norm": 1.0546875, + "learning_rate": 0.00042481621266293024, + "loss": 0.1697, + "step": 183690 + }, + { + "epoch": 7.61, + "grad_norm": 0.5, + "learning_rate": 0.0004248084596732767, + "loss": 0.2146, + "step": 183700 + }, + { + "epoch": 7.61, + "grad_norm": 1.15625, + "learning_rate": 0.00042480070635465253, + "loss": 0.1784, + "step": 183710 + }, + { + "epoch": 7.61, + "grad_norm": 0.3984375, + "learning_rate": 0.0004247929527070723, + "loss": 0.198, + "step": 183720 + }, + { + "epoch": 7.61, + "grad_norm": 0.90625, + "learning_rate": 0.0004247851987305506, + "loss": 0.2378, + "step": 183730 + }, + { + "epoch": 7.61, + "grad_norm": 0.66796875, + "learning_rate": 0.00042477744442510207, + "loss": 0.2269, + "step": 183740 + }, + { + "epoch": 7.61, + "grad_norm": 0.68359375, + "learning_rate": 0.0004247696897907412, + "loss": 0.1885, + "step": 183750 + }, + { + "epoch": 7.61, + "grad_norm": 0.57421875, + "learning_rate": 0.0004247619348274827, + "loss": 0.1773, + "step": 183760 + }, + { + "epoch": 7.61, + "grad_norm": 1.59375, + "learning_rate": 0.00042475417953534114, + "loss": 0.197, + "step": 183770 + }, + { + "epoch": 7.61, + "grad_norm": 1.03125, + "learning_rate": 0.000424746423914331, + "loss": 0.2204, + "step": 183780 + }, + { + "epoch": 7.61, + "grad_norm": 3.0, + "learning_rate": 0.000424738667964467, + "loss": 0.2329, + "step": 183790 + }, + { + "epoch": 7.61, + "grad_norm": 0.47265625, + "learning_rate": 0.00042473091168576373, + "loss": 0.2295, + "step": 183800 + }, + { + "epoch": 7.61, + "grad_norm": 0.6484375, + "learning_rate": 0.0004247231550782357, + "loss": 0.209, + "step": 183810 + }, + { + "epoch": 7.61, + "grad_norm": 0.419921875, + "learning_rate": 0.0004247153981418976, + "loss": 0.1874, + "step": 183820 + }, + { + "epoch": 7.61, + "grad_norm": 0.5234375, + "learning_rate": 0.00042470764087676395, + "loss": 0.203, + "step": 183830 + }, + { + "epoch": 7.61, + "grad_norm": 0.859375, + "learning_rate": 0.00042469988328284943, + "loss": 0.1651, + "step": 183840 + }, + { + "epoch": 7.62, + "grad_norm": 0.439453125, + "learning_rate": 0.00042469212536016854, + "loss": 0.2281, + "step": 183850 + }, + { + "epoch": 7.62, + "grad_norm": 0.86328125, + "learning_rate": 0.00042468436710873603, + "loss": 0.1761, + "step": 183860 + }, + { + "epoch": 7.62, + "grad_norm": 0.373046875, + "learning_rate": 0.0004246766085285663, + "loss": 0.1656, + "step": 183870 + }, + { + "epoch": 7.62, + "grad_norm": 0.9453125, + "learning_rate": 0.0004246688496196741, + "loss": 0.1907, + "step": 183880 + }, + { + "epoch": 7.62, + "grad_norm": 0.703125, + "learning_rate": 0.000424661090382074, + "loss": 0.1854, + "step": 183890 + }, + { + "epoch": 7.62, + "grad_norm": 0.98046875, + "learning_rate": 0.0004246533308157806, + "loss": 0.1968, + "step": 183900 + }, + { + "epoch": 7.62, + "grad_norm": 1.40625, + "learning_rate": 0.0004246455709208085, + "loss": 0.2008, + "step": 183910 + }, + { + "epoch": 7.62, + "grad_norm": 0.82421875, + "learning_rate": 0.0004246378106971722, + "loss": 0.2142, + "step": 183920 + }, + { + "epoch": 7.62, + "grad_norm": 0.75, + "learning_rate": 0.00042463005014488646, + "loss": 0.1708, + "step": 183930 + }, + { + "epoch": 7.62, + "grad_norm": 0.435546875, + "learning_rate": 0.00042462228926396585, + "loss": 0.23, + "step": 183940 + }, + { + "epoch": 7.62, + "grad_norm": 1.1796875, + "learning_rate": 0.00042461452805442497, + "loss": 0.229, + "step": 183950 + }, + { + "epoch": 7.62, + "grad_norm": 0.828125, + "learning_rate": 0.00042460676651627835, + "loss": 0.2129, + "step": 183960 + }, + { + "epoch": 7.62, + "grad_norm": 4.28125, + "learning_rate": 0.00042459900464954063, + "loss": 0.2351, + "step": 183970 + }, + { + "epoch": 7.62, + "grad_norm": 0.98046875, + "learning_rate": 0.0004245912424542265, + "loss": 0.2123, + "step": 183980 + }, + { + "epoch": 7.62, + "grad_norm": 0.671875, + "learning_rate": 0.00042458347993035053, + "loss": 0.1259, + "step": 183990 + }, + { + "epoch": 7.62, + "grad_norm": 0.65234375, + "learning_rate": 0.0004245757170779272, + "loss": 0.1903, + "step": 184000 + }, + { + "epoch": 7.62, + "grad_norm": 0.8203125, + "learning_rate": 0.0004245679538969713, + "loss": 0.1954, + "step": 184010 + }, + { + "epoch": 7.62, + "grad_norm": 1.0859375, + "learning_rate": 0.0004245601903874973, + "loss": 0.1791, + "step": 184020 + }, + { + "epoch": 7.62, + "grad_norm": 0.5703125, + "learning_rate": 0.00042455242654951996, + "loss": 0.1622, + "step": 184030 + }, + { + "epoch": 7.62, + "grad_norm": 0.546875, + "learning_rate": 0.00042454466238305374, + "loss": 0.2256, + "step": 184040 + }, + { + "epoch": 7.62, + "grad_norm": 0.53125, + "learning_rate": 0.00042453689788811335, + "loss": 0.212, + "step": 184050 + }, + { + "epoch": 7.62, + "grad_norm": 0.8515625, + "learning_rate": 0.0004245291330647133, + "loss": 0.1846, + "step": 184060 + }, + { + "epoch": 7.62, + "grad_norm": 0.515625, + "learning_rate": 0.0004245213679128683, + "loss": 0.1804, + "step": 184070 + }, + { + "epoch": 7.62, + "grad_norm": 1.078125, + "learning_rate": 0.00042451360243259297, + "loss": 0.1931, + "step": 184080 + }, + { + "epoch": 7.62, + "grad_norm": 1.1171875, + "learning_rate": 0.00042450583662390187, + "loss": 0.2146, + "step": 184090 + }, + { + "epoch": 7.63, + "grad_norm": 0.0, + "learning_rate": 0.0004244980704868097, + "loss": 0.206, + "step": 184100 + }, + { + "epoch": 7.63, + "grad_norm": 0.44140625, + "learning_rate": 0.0004244903040213308, + "loss": 0.1776, + "step": 184110 + }, + { + "epoch": 7.63, + "grad_norm": 0.58984375, + "learning_rate": 0.0004244825372274802, + "loss": 0.1772, + "step": 184120 + }, + { + "epoch": 7.63, + "grad_norm": 0.90234375, + "learning_rate": 0.00042447477010527224, + "loss": 0.2251, + "step": 184130 + }, + { + "epoch": 7.63, + "grad_norm": 0.56640625, + "learning_rate": 0.0004244670026547216, + "loss": 0.1665, + "step": 184140 + }, + { + "epoch": 7.63, + "grad_norm": 0.5625, + "learning_rate": 0.00042445923487584287, + "loss": 0.1781, + "step": 184150 + }, + { + "epoch": 7.63, + "grad_norm": 1.1171875, + "learning_rate": 0.00042445146676865075, + "loss": 0.2095, + "step": 184160 + }, + { + "epoch": 7.63, + "grad_norm": 2.046875, + "learning_rate": 0.0004244436983331598, + "loss": 0.2027, + "step": 184170 + }, + { + "epoch": 7.63, + "grad_norm": 0.828125, + "learning_rate": 0.0004244359295693846, + "loss": 0.2509, + "step": 184180 + }, + { + "epoch": 7.63, + "grad_norm": 0.546875, + "learning_rate": 0.00042442816047733983, + "loss": 0.1865, + "step": 184190 + }, + { + "epoch": 7.63, + "grad_norm": 0.62890625, + "learning_rate": 0.0004244203910570401, + "loss": 0.178, + "step": 184200 + }, + { + "epoch": 7.63, + "grad_norm": 0.71875, + "learning_rate": 0.00042441262130850013, + "loss": 0.2149, + "step": 184210 + }, + { + "epoch": 7.63, + "grad_norm": 0.53125, + "learning_rate": 0.0004244048512317343, + "loss": 0.1818, + "step": 184220 + }, + { + "epoch": 7.63, + "grad_norm": 2.46875, + "learning_rate": 0.0004243970808267574, + "loss": 0.1838, + "step": 184230 + }, + { + "epoch": 7.63, + "grad_norm": 0.34375, + "learning_rate": 0.00042438931009358406, + "loss": 0.1651, + "step": 184240 + }, + { + "epoch": 7.63, + "grad_norm": 0.8984375, + "learning_rate": 0.00042438153903222894, + "loss": 0.2036, + "step": 184250 + }, + { + "epoch": 7.63, + "grad_norm": 1.6015625, + "learning_rate": 0.00042437376764270653, + "loss": 0.1802, + "step": 184260 + }, + { + "epoch": 7.63, + "grad_norm": 0.6328125, + "learning_rate": 0.00042436599592503147, + "loss": 0.2219, + "step": 184270 + }, + { + "epoch": 7.63, + "grad_norm": 1.015625, + "learning_rate": 0.0004243582238792185, + "loss": 0.1891, + "step": 184280 + }, + { + "epoch": 7.63, + "grad_norm": 1.2109375, + "learning_rate": 0.00042435045150528215, + "loss": 0.1672, + "step": 184290 + }, + { + "epoch": 7.63, + "grad_norm": 0.6328125, + "learning_rate": 0.0004243426788032371, + "loss": 0.2536, + "step": 184300 + }, + { + "epoch": 7.63, + "grad_norm": 1.5234375, + "learning_rate": 0.00042433490577309797, + "loss": 0.1591, + "step": 184310 + }, + { + "epoch": 7.63, + "grad_norm": 0.42578125, + "learning_rate": 0.0004243271324148793, + "loss": 0.1771, + "step": 184320 + }, + { + "epoch": 7.63, + "grad_norm": 0.63671875, + "learning_rate": 0.0004243193587285959, + "loss": 0.2344, + "step": 184330 + }, + { + "epoch": 7.64, + "grad_norm": 0.5859375, + "learning_rate": 0.0004243115847142622, + "loss": 0.1672, + "step": 184340 + }, + { + "epoch": 7.64, + "grad_norm": 1.578125, + "learning_rate": 0.00042430381037189297, + "loss": 0.2234, + "step": 184350 + }, + { + "epoch": 7.64, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0004242960357015028, + "loss": 0.2023, + "step": 184360 + }, + { + "epoch": 7.64, + "grad_norm": 0.71484375, + "learning_rate": 0.00042428826070310633, + "loss": 0.1749, + "step": 184370 + }, + { + "epoch": 7.64, + "grad_norm": 0.55078125, + "learning_rate": 0.0004242804853767181, + "loss": 0.2023, + "step": 184380 + }, + { + "epoch": 7.64, + "grad_norm": 0.427734375, + "learning_rate": 0.0004242727097223529, + "loss": 0.1615, + "step": 184390 + }, + { + "epoch": 7.64, + "grad_norm": 1.1015625, + "learning_rate": 0.0004242649337400252, + "loss": 0.1906, + "step": 184400 + }, + { + "epoch": 7.64, + "grad_norm": 1.140625, + "learning_rate": 0.0004242571574297497, + "loss": 0.2524, + "step": 184410 + }, + { + "epoch": 7.64, + "grad_norm": 0.79296875, + "learning_rate": 0.00042424938079154117, + "loss": 0.2598, + "step": 184420 + }, + { + "epoch": 7.64, + "grad_norm": 0.703125, + "learning_rate": 0.000424241603825414, + "loss": 0.2269, + "step": 184430 + }, + { + "epoch": 7.64, + "grad_norm": 0.7734375, + "learning_rate": 0.00042423382653138304, + "loss": 0.2097, + "step": 184440 + }, + { + "epoch": 7.64, + "grad_norm": 0.87890625, + "learning_rate": 0.0004242260489094628, + "loss": 0.2164, + "step": 184450 + }, + { + "epoch": 7.64, + "grad_norm": 0.671875, + "learning_rate": 0.0004242182709596679, + "loss": 0.1923, + "step": 184460 + }, + { + "epoch": 7.64, + "grad_norm": 0.9140625, + "learning_rate": 0.00042421049268201314, + "loss": 0.2612, + "step": 184470 + }, + { + "epoch": 7.64, + "grad_norm": 1.2109375, + "learning_rate": 0.0004242027140765129, + "loss": 0.2145, + "step": 184480 + }, + { + "epoch": 7.64, + "grad_norm": 1.0, + "learning_rate": 0.0004241949351431821, + "loss": 0.2053, + "step": 184490 + }, + { + "epoch": 7.64, + "grad_norm": 0.46875, + "learning_rate": 0.00042418715588203517, + "loss": 0.1974, + "step": 184500 + }, + { + "epoch": 7.64, + "grad_norm": 1.171875, + "learning_rate": 0.00042417937629308676, + "loss": 0.2451, + "step": 184510 + }, + { + "epoch": 7.64, + "grad_norm": 1.6796875, + "learning_rate": 0.0004241715963763517, + "loss": 0.2012, + "step": 184520 + }, + { + "epoch": 7.64, + "grad_norm": 1.0546875, + "learning_rate": 0.0004241638161318445, + "loss": 0.2059, + "step": 184530 + }, + { + "epoch": 7.64, + "grad_norm": 0.435546875, + "learning_rate": 0.0004241560355595797, + "loss": 0.1897, + "step": 184540 + }, + { + "epoch": 7.64, + "grad_norm": 0.95703125, + "learning_rate": 0.0004241482546595722, + "loss": 0.1785, + "step": 184550 + }, + { + "epoch": 7.64, + "grad_norm": 0.46484375, + "learning_rate": 0.00042414047343183637, + "loss": 0.1835, + "step": 184560 + }, + { + "epoch": 7.64, + "grad_norm": 0.921875, + "learning_rate": 0.000424132691876387, + "loss": 0.1732, + "step": 184570 + }, + { + "epoch": 7.65, + "grad_norm": 0.48828125, + "learning_rate": 0.0004241249099932387, + "loss": 0.201, + "step": 184580 + }, + { + "epoch": 7.65, + "grad_norm": 0.55859375, + "learning_rate": 0.00042411712778240614, + "loss": 0.2655, + "step": 184590 + }, + { + "epoch": 7.65, + "grad_norm": 1.0234375, + "learning_rate": 0.000424109345243904, + "loss": 0.2404, + "step": 184600 + }, + { + "epoch": 7.65, + "grad_norm": 0.58984375, + "learning_rate": 0.00042410156237774677, + "loss": 0.2467, + "step": 184610 + }, + { + "epoch": 7.65, + "grad_norm": 0.578125, + "learning_rate": 0.00042409377918394925, + "loss": 0.2098, + "step": 184620 + }, + { + "epoch": 7.65, + "grad_norm": 1.2578125, + "learning_rate": 0.00042408599566252605, + "loss": 0.1872, + "step": 184630 + }, + { + "epoch": 7.65, + "grad_norm": 0.640625, + "learning_rate": 0.00042407821181349176, + "loss": 0.2222, + "step": 184640 + }, + { + "epoch": 7.65, + "grad_norm": 1.0625, + "learning_rate": 0.0004240704276368611, + "loss": 0.1948, + "step": 184650 + }, + { + "epoch": 7.65, + "grad_norm": 0.58203125, + "learning_rate": 0.00042406264313264876, + "loss": 0.2054, + "step": 184660 + }, + { + "epoch": 7.65, + "grad_norm": 0.84375, + "learning_rate": 0.0004240548583008692, + "loss": 0.2085, + "step": 184670 + }, + { + "epoch": 7.65, + "grad_norm": 0.94140625, + "learning_rate": 0.00042404707314153736, + "loss": 0.2334, + "step": 184680 + }, + { + "epoch": 7.65, + "grad_norm": 0.6171875, + "learning_rate": 0.0004240392876546676, + "loss": 0.2259, + "step": 184690 + }, + { + "epoch": 7.65, + "grad_norm": 0.251953125, + "learning_rate": 0.00042403150184027475, + "loss": 0.1794, + "step": 184700 + }, + { + "epoch": 7.65, + "grad_norm": 0.51171875, + "learning_rate": 0.00042402371569837333, + "loss": 0.2114, + "step": 184710 + }, + { + "epoch": 7.65, + "grad_norm": 0.3515625, + "learning_rate": 0.0004240159292289781, + "loss": 0.1893, + "step": 184720 + }, + { + "epoch": 7.65, + "grad_norm": 0.90234375, + "learning_rate": 0.00042400814243210373, + "loss": 0.1847, + "step": 184730 + }, + { + "epoch": 7.65, + "grad_norm": 0.79296875, + "learning_rate": 0.00042400035530776477, + "loss": 0.2248, + "step": 184740 + }, + { + "epoch": 7.65, + "grad_norm": 0.50390625, + "learning_rate": 0.00042399256785597593, + "loss": 0.1529, + "step": 184750 + }, + { + "epoch": 7.65, + "grad_norm": 0.85546875, + "learning_rate": 0.0004239847800767519, + "loss": 0.1713, + "step": 184760 + }, + { + "epoch": 7.65, + "grad_norm": 0.64453125, + "learning_rate": 0.0004239769919701073, + "loss": 0.2495, + "step": 184770 + }, + { + "epoch": 7.65, + "grad_norm": 0.314453125, + "learning_rate": 0.00042396920353605683, + "loss": 0.1724, + "step": 184780 + }, + { + "epoch": 7.65, + "grad_norm": 1.3046875, + "learning_rate": 0.00042396141477461503, + "loss": 0.1638, + "step": 184790 + }, + { + "epoch": 7.65, + "grad_norm": 0.79296875, + "learning_rate": 0.00042395362568579666, + "loss": 0.166, + "step": 184800 + }, + { + "epoch": 7.65, + "grad_norm": 0.5625, + "learning_rate": 0.00042394583626961633, + "loss": 0.1687, + "step": 184810 + }, + { + "epoch": 7.66, + "grad_norm": 0.4765625, + "learning_rate": 0.0004239380465260888, + "loss": 0.2116, + "step": 184820 + }, + { + "epoch": 7.66, + "grad_norm": 0.8359375, + "learning_rate": 0.00042393025645522857, + "loss": 0.2352, + "step": 184830 + }, + { + "epoch": 7.66, + "grad_norm": 0.6328125, + "learning_rate": 0.00042392246605705045, + "loss": 0.1477, + "step": 184840 + }, + { + "epoch": 7.66, + "grad_norm": 0.68359375, + "learning_rate": 0.00042391467533156894, + "loss": 0.2501, + "step": 184850 + }, + { + "epoch": 7.66, + "grad_norm": 0.396484375, + "learning_rate": 0.0004239068842787989, + "loss": 0.2147, + "step": 184860 + }, + { + "epoch": 7.66, + "grad_norm": 0.45703125, + "learning_rate": 0.00042389909289875476, + "loss": 0.2105, + "step": 184870 + }, + { + "epoch": 7.66, + "grad_norm": 0.859375, + "learning_rate": 0.00042389130119145136, + "loss": 0.2013, + "step": 184880 + }, + { + "epoch": 7.66, + "grad_norm": 0.7890625, + "learning_rate": 0.0004238835091569034, + "loss": 0.1645, + "step": 184890 + }, + { + "epoch": 7.66, + "grad_norm": 0.62890625, + "learning_rate": 0.00042387571679512536, + "loss": 0.2095, + "step": 184900 + }, + { + "epoch": 7.66, + "grad_norm": 1.6171875, + "learning_rate": 0.00042386792410613203, + "loss": 0.2148, + "step": 184910 + }, + { + "epoch": 7.66, + "grad_norm": 0.408203125, + "learning_rate": 0.00042386013108993803, + "loss": 0.222, + "step": 184920 + }, + { + "epoch": 7.66, + "grad_norm": 0.84375, + "learning_rate": 0.000423852337746558, + "loss": 0.1574, + "step": 184930 + }, + { + "epoch": 7.66, + "grad_norm": 0.482421875, + "learning_rate": 0.0004238445440760067, + "loss": 0.2161, + "step": 184940 + }, + { + "epoch": 7.66, + "grad_norm": 1.1328125, + "learning_rate": 0.0004238367500782987, + "loss": 0.194, + "step": 184950 + }, + { + "epoch": 7.66, + "grad_norm": 0.80078125, + "learning_rate": 0.0004238289557534487, + "loss": 0.227, + "step": 184960 + }, + { + "epoch": 7.66, + "grad_norm": 2.421875, + "learning_rate": 0.00042382116110147147, + "loss": 0.1806, + "step": 184970 + }, + { + "epoch": 7.66, + "grad_norm": 0.8203125, + "learning_rate": 0.00042381336612238153, + "loss": 0.1974, + "step": 184980 + }, + { + "epoch": 7.66, + "grad_norm": 2.09375, + "learning_rate": 0.00042380557081619355, + "loss": 0.1656, + "step": 184990 + }, + { + "epoch": 7.66, + "grad_norm": 0.15625, + "learning_rate": 0.0004237977751829223, + "loss": 0.2178, + "step": 185000 + }, + { + "epoch": 7.66, + "grad_norm": 0.490234375, + "learning_rate": 0.00042378997922258246, + "loss": 0.258, + "step": 185010 + }, + { + "epoch": 7.66, + "grad_norm": 1.5703125, + "learning_rate": 0.0004237821829351886, + "loss": 0.2538, + "step": 185020 + }, + { + "epoch": 7.66, + "grad_norm": 0.58984375, + "learning_rate": 0.00042377438632075536, + "loss": 0.1876, + "step": 185030 + }, + { + "epoch": 7.66, + "grad_norm": 0.98828125, + "learning_rate": 0.0004237665893792976, + "loss": 0.2006, + "step": 185040 + }, + { + "epoch": 7.66, + "grad_norm": 0.96875, + "learning_rate": 0.0004237587921108299, + "loss": 0.1999, + "step": 185050 + }, + { + "epoch": 7.67, + "grad_norm": 1.2109375, + "learning_rate": 0.0004237509945153668, + "loss": 0.2151, + "step": 185060 + }, + { + "epoch": 7.67, + "grad_norm": 0.64453125, + "learning_rate": 0.00042374319659292317, + "loss": 0.1819, + "step": 185070 + }, + { + "epoch": 7.67, + "grad_norm": 0.52734375, + "learning_rate": 0.0004237353983435136, + "loss": 0.2127, + "step": 185080 + }, + { + "epoch": 7.67, + "grad_norm": 1.671875, + "learning_rate": 0.0004237275997671528, + "loss": 0.2398, + "step": 185090 + }, + { + "epoch": 7.67, + "grad_norm": 0.8515625, + "learning_rate": 0.0004237198008638554, + "loss": 0.2218, + "step": 185100 + }, + { + "epoch": 7.67, + "grad_norm": 0.95703125, + "learning_rate": 0.00042371200163363607, + "loss": 0.243, + "step": 185110 + }, + { + "epoch": 7.67, + "grad_norm": 0.57421875, + "learning_rate": 0.0004237042020765095, + "loss": 0.2207, + "step": 185120 + }, + { + "epoch": 7.67, + "grad_norm": 0.5390625, + "learning_rate": 0.0004236964021924904, + "loss": 0.2112, + "step": 185130 + }, + { + "epoch": 7.67, + "grad_norm": 0.765625, + "learning_rate": 0.0004236886019815934, + "loss": 0.1883, + "step": 185140 + }, + { + "epoch": 7.67, + "grad_norm": 1.21875, + "learning_rate": 0.00042368080144383324, + "loss": 0.1616, + "step": 185150 + }, + { + "epoch": 7.67, + "grad_norm": 0.6953125, + "learning_rate": 0.0004236730005792246, + "loss": 0.1935, + "step": 185160 + }, + { + "epoch": 7.67, + "grad_norm": 1.203125, + "learning_rate": 0.00042366519938778215, + "loss": 0.2184, + "step": 185170 + }, + { + "epoch": 7.67, + "grad_norm": 0.00020885467529296875, + "learning_rate": 0.00042365739786952045, + "loss": 0.1977, + "step": 185180 + }, + { + "epoch": 7.67, + "grad_norm": 1.4609375, + "learning_rate": 0.0004236495960244543, + "loss": 0.2078, + "step": 185190 + }, + { + "epoch": 7.67, + "grad_norm": 0.57421875, + "learning_rate": 0.0004236417938525984, + "loss": 0.2117, + "step": 185200 + }, + { + "epoch": 7.67, + "grad_norm": 0.486328125, + "learning_rate": 0.0004236339913539674, + "loss": 0.1858, + "step": 185210 + }, + { + "epoch": 7.67, + "grad_norm": 0.99609375, + "learning_rate": 0.0004236261885285759, + "loss": 0.2402, + "step": 185220 + }, + { + "epoch": 7.67, + "grad_norm": 1.0625, + "learning_rate": 0.00042361838537643883, + "loss": 0.1745, + "step": 185230 + }, + { + "epoch": 7.67, + "grad_norm": 0.9453125, + "learning_rate": 0.00042361058189757054, + "loss": 0.2253, + "step": 185240 + }, + { + "epoch": 7.67, + "grad_norm": 0.220703125, + "learning_rate": 0.000423602778091986, + "loss": 0.2039, + "step": 185250 + }, + { + "epoch": 7.67, + "grad_norm": 0.72265625, + "learning_rate": 0.0004235949739596997, + "loss": 0.2002, + "step": 185260 + }, + { + "epoch": 7.67, + "grad_norm": 0.392578125, + "learning_rate": 0.0004235871695007264, + "loss": 0.2111, + "step": 185270 + }, + { + "epoch": 7.67, + "grad_norm": 0.93359375, + "learning_rate": 0.0004235793647150808, + "loss": 0.2076, + "step": 185280 + }, + { + "epoch": 7.67, + "grad_norm": 0.83203125, + "learning_rate": 0.00042357155960277766, + "loss": 0.1844, + "step": 185290 + }, + { + "epoch": 7.68, + "grad_norm": 0.609375, + "learning_rate": 0.00042356375416383155, + "loss": 0.1455, + "step": 185300 + }, + { + "epoch": 7.68, + "grad_norm": 1.0625, + "learning_rate": 0.00042355594839825716, + "loss": 0.1686, + "step": 185310 + }, + { + "epoch": 7.68, + "grad_norm": 0.54296875, + "learning_rate": 0.0004235481423060692, + "loss": 0.2217, + "step": 185320 + }, + { + "epoch": 7.68, + "grad_norm": 0.61328125, + "learning_rate": 0.0004235403358872825, + "loss": 0.2257, + "step": 185330 + }, + { + "epoch": 7.68, + "grad_norm": 0.6953125, + "learning_rate": 0.00042353252914191156, + "loss": 0.2169, + "step": 185340 + }, + { + "epoch": 7.68, + "grad_norm": 0.345703125, + "learning_rate": 0.0004235247220699711, + "loss": 0.143, + "step": 185350 + }, + { + "epoch": 7.68, + "grad_norm": 0.546875, + "learning_rate": 0.00042351691467147585, + "loss": 0.1945, + "step": 185360 + }, + { + "epoch": 7.68, + "grad_norm": 1.2578125, + "learning_rate": 0.0004235091069464405, + "loss": 0.2213, + "step": 185370 + }, + { + "epoch": 7.68, + "grad_norm": 0.3046875, + "learning_rate": 0.00042350129889487983, + "loss": 0.2232, + "step": 185380 + }, + { + "epoch": 7.68, + "grad_norm": 0.90625, + "learning_rate": 0.0004234934905168084, + "loss": 0.1813, + "step": 185390 + }, + { + "epoch": 7.68, + "grad_norm": 0.953125, + "learning_rate": 0.0004234856818122409, + "loss": 0.1799, + "step": 185400 + }, + { + "epoch": 7.68, + "grad_norm": 0.6328125, + "learning_rate": 0.0004234778727811922, + "loss": 0.241, + "step": 185410 + }, + { + "epoch": 7.68, + "grad_norm": 1.15625, + "learning_rate": 0.0004234700634236768, + "loss": 0.2014, + "step": 185420 + }, + { + "epoch": 7.68, + "grad_norm": 0.859375, + "learning_rate": 0.00042346225373970947, + "loss": 0.2093, + "step": 185430 + }, + { + "epoch": 7.68, + "grad_norm": 0.74609375, + "learning_rate": 0.00042345444372930496, + "loss": 0.1548, + "step": 185440 + }, + { + "epoch": 7.68, + "grad_norm": 0.53125, + "learning_rate": 0.0004234466333924779, + "loss": 0.1763, + "step": 185450 + }, + { + "epoch": 7.68, + "grad_norm": 0.765625, + "learning_rate": 0.000423438822729243, + "loss": 0.2107, + "step": 185460 + }, + { + "epoch": 7.68, + "grad_norm": 0.4609375, + "learning_rate": 0.00042343101173961497, + "loss": 0.1521, + "step": 185470 + }, + { + "epoch": 7.68, + "grad_norm": 1.125, + "learning_rate": 0.0004234232004236085, + "loss": 0.2205, + "step": 185480 + }, + { + "epoch": 7.68, + "grad_norm": 0.453125, + "learning_rate": 0.00042341538878123833, + "loss": 0.1418, + "step": 185490 + }, + { + "epoch": 7.68, + "grad_norm": 0.1728515625, + "learning_rate": 0.000423407576812519, + "loss": 0.1665, + "step": 185500 + }, + { + "epoch": 7.68, + "grad_norm": 0.349609375, + "learning_rate": 0.0004233997645174654, + "loss": 0.1909, + "step": 185510 + }, + { + "epoch": 7.68, + "grad_norm": 0.353515625, + "learning_rate": 0.00042339195189609224, + "loss": 0.1995, + "step": 185520 + }, + { + "epoch": 7.68, + "grad_norm": 0.404296875, + "learning_rate": 0.0004233841389484141, + "loss": 0.2115, + "step": 185530 + }, + { + "epoch": 7.69, + "grad_norm": 1.09375, + "learning_rate": 0.0004233763256744458, + "loss": 0.2027, + "step": 185540 + }, + { + "epoch": 7.69, + "grad_norm": 0.70703125, + "learning_rate": 0.0004233685120742019, + "loss": 0.1788, + "step": 185550 + }, + { + "epoch": 7.69, + "grad_norm": 0.72265625, + "learning_rate": 0.00042336069814769716, + "loss": 0.2224, + "step": 185560 + }, + { + "epoch": 7.69, + "grad_norm": 0.30078125, + "learning_rate": 0.0004233528838949464, + "loss": 0.2217, + "step": 185570 + }, + { + "epoch": 7.69, + "grad_norm": 0.84375, + "learning_rate": 0.00042334506931596415, + "loss": 0.175, + "step": 185580 + }, + { + "epoch": 7.69, + "grad_norm": 0.953125, + "learning_rate": 0.0004233372544107652, + "loss": 0.2133, + "step": 185590 + }, + { + "epoch": 7.69, + "grad_norm": 0.859375, + "learning_rate": 0.00042332943917936433, + "loss": 0.1927, + "step": 185600 + }, + { + "epoch": 7.69, + "grad_norm": 0.66015625, + "learning_rate": 0.0004233216236217761, + "loss": 0.2171, + "step": 185610 + }, + { + "epoch": 7.69, + "grad_norm": 1.140625, + "learning_rate": 0.0004233138077380153, + "loss": 0.1814, + "step": 185620 + }, + { + "epoch": 7.69, + "grad_norm": 0.5859375, + "learning_rate": 0.0004233059915280967, + "loss": 0.2018, + "step": 185630 + }, + { + "epoch": 7.69, + "grad_norm": 0.953125, + "learning_rate": 0.00042329817499203487, + "loss": 0.1898, + "step": 185640 + }, + { + "epoch": 7.69, + "grad_norm": 2.28125, + "learning_rate": 0.0004232903581298446, + "loss": 0.1929, + "step": 185650 + }, + { + "epoch": 7.69, + "grad_norm": 1.0859375, + "learning_rate": 0.0004232825409415406, + "loss": 0.2164, + "step": 185660 + }, + { + "epoch": 7.69, + "grad_norm": 1.2421875, + "learning_rate": 0.0004232747234271376, + "loss": 0.2183, + "step": 185670 + }, + { + "epoch": 7.69, + "grad_norm": 0.640625, + "learning_rate": 0.00042326690558665026, + "loss": 0.1996, + "step": 185680 + }, + { + "epoch": 7.69, + "grad_norm": 0.7265625, + "learning_rate": 0.0004232590874200933, + "loss": 0.2091, + "step": 185690 + }, + { + "epoch": 7.69, + "grad_norm": 1.3515625, + "learning_rate": 0.00042325126892748144, + "loss": 0.1931, + "step": 185700 + }, + { + "epoch": 7.69, + "grad_norm": 1.078125, + "learning_rate": 0.00042324345010882935, + "loss": 0.1964, + "step": 185710 + }, + { + "epoch": 7.69, + "grad_norm": 0.58984375, + "learning_rate": 0.00042323563096415196, + "loss": 0.1653, + "step": 185720 + }, + { + "epoch": 7.69, + "grad_norm": 0.69140625, + "learning_rate": 0.00042322781149346366, + "loss": 0.2271, + "step": 185730 + }, + { + "epoch": 7.69, + "grad_norm": 0.7734375, + "learning_rate": 0.0004232199916967794, + "loss": 0.2046, + "step": 185740 + }, + { + "epoch": 7.69, + "grad_norm": 0.427734375, + "learning_rate": 0.00042321217157411387, + "loss": 0.1849, + "step": 185750 + }, + { + "epoch": 7.69, + "grad_norm": 0.58984375, + "learning_rate": 0.0004232043511254816, + "loss": 0.198, + "step": 185760 + }, + { + "epoch": 7.69, + "grad_norm": 1.0703125, + "learning_rate": 0.00042319653035089756, + "loss": 0.201, + "step": 185770 + }, + { + "epoch": 7.69, + "grad_norm": 0.5625, + "learning_rate": 0.0004231887092503763, + "loss": 0.1817, + "step": 185780 + }, + { + "epoch": 7.7, + "grad_norm": 0.56640625, + "learning_rate": 0.0004231808878239326, + "loss": 0.1802, + "step": 185790 + }, + { + "epoch": 7.7, + "grad_norm": 0.63671875, + "learning_rate": 0.00042317306607158125, + "loss": 0.2254, + "step": 185800 + }, + { + "epoch": 7.7, + "grad_norm": 0.49609375, + "learning_rate": 0.0004231652439933368, + "loss": 0.2388, + "step": 185810 + }, + { + "epoch": 7.7, + "grad_norm": 1.171875, + "learning_rate": 0.00042315742158921413, + "loss": 0.2066, + "step": 185820 + }, + { + "epoch": 7.7, + "grad_norm": 1.03125, + "learning_rate": 0.0004231495988592278, + "loss": 0.2058, + "step": 185830 + }, + { + "epoch": 7.7, + "grad_norm": 0.357421875, + "learning_rate": 0.00042314177580339264, + "loss": 0.1716, + "step": 185840 + }, + { + "epoch": 7.7, + "grad_norm": 0.5, + "learning_rate": 0.0004231339524217234, + "loss": 0.2269, + "step": 185850 + }, + { + "epoch": 7.7, + "grad_norm": 0.59375, + "learning_rate": 0.0004231261287142347, + "loss": 0.1698, + "step": 185860 + }, + { + "epoch": 7.7, + "grad_norm": 1.046875, + "learning_rate": 0.0004231183046809414, + "loss": 0.1794, + "step": 185870 + }, + { + "epoch": 7.7, + "grad_norm": 0.63671875, + "learning_rate": 0.0004231104803218581, + "loss": 0.1887, + "step": 185880 + }, + { + "epoch": 7.7, + "grad_norm": 0.494140625, + "learning_rate": 0.00042310265563699957, + "loss": 0.2548, + "step": 185890 + }, + { + "epoch": 7.7, + "grad_norm": 1.0390625, + "learning_rate": 0.00042309483062638055, + "loss": 0.2241, + "step": 185900 + }, + { + "epoch": 7.7, + "grad_norm": 0.73046875, + "learning_rate": 0.00042308700529001576, + "loss": 0.1967, + "step": 185910 + }, + { + "epoch": 7.7, + "grad_norm": 0.8671875, + "learning_rate": 0.0004230791796279199, + "loss": 0.2048, + "step": 185920 + }, + { + "epoch": 7.7, + "grad_norm": 0.71875, + "learning_rate": 0.0004230713536401077, + "loss": 0.2451, + "step": 185930 + }, + { + "epoch": 7.7, + "grad_norm": 0.671875, + "learning_rate": 0.00042306352732659396, + "loss": 0.1819, + "step": 185940 + }, + { + "epoch": 7.7, + "grad_norm": 0.87890625, + "learning_rate": 0.0004230557006873933, + "loss": 0.1793, + "step": 185950 + }, + { + "epoch": 7.7, + "grad_norm": 1.2421875, + "learning_rate": 0.0004230478737225205, + "loss": 0.2236, + "step": 185960 + }, + { + "epoch": 7.7, + "grad_norm": 0.625, + "learning_rate": 0.0004230400464319903, + "loss": 0.1894, + "step": 185970 + }, + { + "epoch": 7.7, + "grad_norm": 0.4375, + "learning_rate": 0.00042303221881581746, + "loss": 0.1736, + "step": 185980 + }, + { + "epoch": 7.7, + "grad_norm": 0.58984375, + "learning_rate": 0.0004230243908740166, + "loss": 0.1797, + "step": 185990 + }, + { + "epoch": 7.7, + "grad_norm": 0.34765625, + "learning_rate": 0.00042301656260660254, + "loss": 0.1914, + "step": 186000 + }, + { + "epoch": 7.7, + "grad_norm": 0.42578125, + "learning_rate": 0.0004230087340135901, + "loss": 0.1568, + "step": 186010 + }, + { + "epoch": 7.7, + "grad_norm": 0.74609375, + "learning_rate": 0.0004230009050949938, + "loss": 0.1853, + "step": 186020 + }, + { + "epoch": 7.71, + "grad_norm": 0.90625, + "learning_rate": 0.0004229930758508285, + "loss": 0.2368, + "step": 186030 + }, + { + "epoch": 7.71, + "grad_norm": 0.5703125, + "learning_rate": 0.00042298524628110886, + "loss": 0.2113, + "step": 186040 + }, + { + "epoch": 7.71, + "grad_norm": 0.734375, + "learning_rate": 0.0004229774163858497, + "loss": 0.1727, + "step": 186050 + }, + { + "epoch": 7.71, + "grad_norm": 0.41015625, + "learning_rate": 0.00042296958616506574, + "loss": 0.2448, + "step": 186060 + }, + { + "epoch": 7.71, + "grad_norm": 0.9140625, + "learning_rate": 0.0004229617556187717, + "loss": 0.2338, + "step": 186070 + }, + { + "epoch": 7.71, + "grad_norm": 0.734375, + "learning_rate": 0.0004229539247469822, + "loss": 0.18, + "step": 186080 + }, + { + "epoch": 7.71, + "grad_norm": 0.89453125, + "learning_rate": 0.0004229460935497123, + "loss": 0.2221, + "step": 186090 + }, + { + "epoch": 7.71, + "grad_norm": 0.5859375, + "learning_rate": 0.00042293826202697643, + "loss": 0.2206, + "step": 186100 + }, + { + "epoch": 7.71, + "grad_norm": 1.234375, + "learning_rate": 0.0004229304301787894, + "loss": 0.2161, + "step": 186110 + }, + { + "epoch": 7.71, + "grad_norm": 1.140625, + "learning_rate": 0.00042292259800516595, + "loss": 0.2118, + "step": 186120 + }, + { + "epoch": 7.71, + "grad_norm": 0.5390625, + "learning_rate": 0.00042291476550612093, + "loss": 0.1956, + "step": 186130 + }, + { + "epoch": 7.71, + "grad_norm": 0.435546875, + "learning_rate": 0.00042290693268166895, + "loss": 0.2341, + "step": 186140 + }, + { + "epoch": 7.71, + "grad_norm": 0.68359375, + "learning_rate": 0.0004228990995318248, + "loss": 0.1952, + "step": 186150 + }, + { + "epoch": 7.71, + "grad_norm": 0.435546875, + "learning_rate": 0.00042289126605660324, + "loss": 0.2512, + "step": 186160 + }, + { + "epoch": 7.71, + "grad_norm": 0.796875, + "learning_rate": 0.000422883432256019, + "loss": 0.1592, + "step": 186170 + }, + { + "epoch": 7.71, + "grad_norm": 0.4765625, + "learning_rate": 0.0004228755981300867, + "loss": 0.1951, + "step": 186180 + }, + { + "epoch": 7.71, + "grad_norm": 0.0693359375, + "learning_rate": 0.0004228677636788213, + "loss": 0.1894, + "step": 186190 + }, + { + "epoch": 7.71, + "grad_norm": 1.1484375, + "learning_rate": 0.00042285992890223745, + "loss": 0.2339, + "step": 186200 + }, + { + "epoch": 7.71, + "grad_norm": 0.5234375, + "learning_rate": 0.00042285209380034985, + "loss": 0.24, + "step": 186210 + }, + { + "epoch": 7.71, + "grad_norm": 0.365234375, + "learning_rate": 0.0004228442583731732, + "loss": 0.2463, + "step": 186220 + }, + { + "epoch": 7.71, + "grad_norm": 0.5625, + "learning_rate": 0.00042283642262072244, + "loss": 0.1849, + "step": 186230 + }, + { + "epoch": 7.71, + "grad_norm": 1.0859375, + "learning_rate": 0.0004228285865430121, + "loss": 0.241, + "step": 186240 + }, + { + "epoch": 7.71, + "grad_norm": 0.81640625, + "learning_rate": 0.0004228207501400571, + "loss": 0.2694, + "step": 186250 + }, + { + "epoch": 7.71, + "grad_norm": 0.298828125, + "learning_rate": 0.000422812913411872, + "loss": 0.2239, + "step": 186260 + }, + { + "epoch": 7.72, + "grad_norm": 0.76171875, + "learning_rate": 0.00042280507635847174, + "loss": 0.2119, + "step": 186270 + }, + { + "epoch": 7.72, + "grad_norm": 0.48046875, + "learning_rate": 0.000422797238979871, + "loss": 0.1741, + "step": 186280 + }, + { + "epoch": 7.72, + "grad_norm": 0.6875, + "learning_rate": 0.0004227894012760845, + "loss": 0.2412, + "step": 186290 + }, + { + "epoch": 7.72, + "grad_norm": 0.9921875, + "learning_rate": 0.000422781563247127, + "loss": 0.1974, + "step": 186300 + }, + { + "epoch": 7.72, + "grad_norm": 0.7578125, + "learning_rate": 0.0004227737248930133, + "loss": 0.1795, + "step": 186310 + }, + { + "epoch": 7.72, + "grad_norm": 0.60546875, + "learning_rate": 0.00042276588621375805, + "loss": 0.1889, + "step": 186320 + }, + { + "epoch": 7.72, + "grad_norm": 1.1328125, + "learning_rate": 0.0004227580472093761, + "loss": 0.2305, + "step": 186330 + }, + { + "epoch": 7.72, + "grad_norm": 0.435546875, + "learning_rate": 0.0004227502078798821, + "loss": 0.2232, + "step": 186340 + }, + { + "epoch": 7.72, + "grad_norm": 1.3203125, + "learning_rate": 0.00042274236822529096, + "loss": 0.2295, + "step": 186350 + }, + { + "epoch": 7.72, + "grad_norm": 0.55078125, + "learning_rate": 0.0004227345282456173, + "loss": 0.2083, + "step": 186360 + }, + { + "epoch": 7.72, + "grad_norm": 0.78125, + "learning_rate": 0.0004227266879408759, + "loss": 0.2034, + "step": 186370 + }, + { + "epoch": 7.72, + "grad_norm": 1.2734375, + "learning_rate": 0.0004227188473110815, + "loss": 0.224, + "step": 186380 + }, + { + "epoch": 7.72, + "grad_norm": 0.484375, + "learning_rate": 0.00042271100635624885, + "loss": 0.1924, + "step": 186390 + }, + { + "epoch": 7.72, + "grad_norm": 0.92578125, + "learning_rate": 0.00042270316507639284, + "loss": 0.1669, + "step": 186400 + }, + { + "epoch": 7.72, + "grad_norm": 1.1640625, + "learning_rate": 0.00042269532347152805, + "loss": 0.1507, + "step": 186410 + }, + { + "epoch": 7.72, + "grad_norm": 0.79296875, + "learning_rate": 0.0004226874815416694, + "loss": 0.2059, + "step": 186420 + }, + { + "epoch": 7.72, + "grad_norm": 0.392578125, + "learning_rate": 0.0004226796392868314, + "loss": 0.2227, + "step": 186430 + }, + { + "epoch": 7.72, + "grad_norm": 0.5, + "learning_rate": 0.0004226717967070291, + "loss": 0.214, + "step": 186440 + }, + { + "epoch": 7.72, + "grad_norm": 0.61328125, + "learning_rate": 0.00042266395380227706, + "loss": 0.2178, + "step": 186450 + }, + { + "epoch": 7.72, + "grad_norm": 1.328125, + "learning_rate": 0.00042265611057259013, + "loss": 0.2159, + "step": 186460 + }, + { + "epoch": 7.72, + "grad_norm": 0.953125, + "learning_rate": 0.00042264826701798305, + "loss": 0.2181, + "step": 186470 + }, + { + "epoch": 7.72, + "grad_norm": 0.9453125, + "learning_rate": 0.00042264042313847056, + "loss": 0.1769, + "step": 186480 + }, + { + "epoch": 7.72, + "grad_norm": 0.46484375, + "learning_rate": 0.0004226325789340675, + "loss": 0.1647, + "step": 186490 + }, + { + "epoch": 7.72, + "grad_norm": 0.69921875, + "learning_rate": 0.00042262473440478847, + "loss": 0.2133, + "step": 186500 + }, + { + "epoch": 7.73, + "grad_norm": 0.8046875, + "learning_rate": 0.00042261688955064837, + "loss": 0.2279, + "step": 186510 + }, + { + "epoch": 7.73, + "grad_norm": 0.8359375, + "learning_rate": 0.0004226090443716619, + "loss": 0.2156, + "step": 186520 + }, + { + "epoch": 7.73, + "grad_norm": 0.46484375, + "learning_rate": 0.000422601198867844, + "loss": 0.2141, + "step": 186530 + }, + { + "epoch": 7.73, + "grad_norm": 0.7265625, + "learning_rate": 0.0004225933530392091, + "loss": 0.2017, + "step": 186540 + }, + { + "epoch": 7.73, + "grad_norm": 0.5, + "learning_rate": 0.00042258550688577224, + "loss": 0.2076, + "step": 186550 + }, + { + "epoch": 7.73, + "grad_norm": 1.046875, + "learning_rate": 0.0004225776604075481, + "loss": 0.2522, + "step": 186560 + }, + { + "epoch": 7.73, + "grad_norm": 0.6796875, + "learning_rate": 0.0004225698136045514, + "loss": 0.2083, + "step": 186570 + }, + { + "epoch": 7.73, + "grad_norm": 1.171875, + "learning_rate": 0.0004225619664767969, + "loss": 0.2047, + "step": 186580 + }, + { + "epoch": 7.73, + "grad_norm": 0.64453125, + "learning_rate": 0.00042255411902429954, + "loss": 0.2082, + "step": 186590 + }, + { + "epoch": 7.73, + "grad_norm": 1.6953125, + "learning_rate": 0.0004225462712470738, + "loss": 0.1499, + "step": 186600 + }, + { + "epoch": 7.73, + "grad_norm": 0.91796875, + "learning_rate": 0.00042253842314513473, + "loss": 0.2031, + "step": 186610 + }, + { + "epoch": 7.73, + "grad_norm": 0.51953125, + "learning_rate": 0.000422530574718497, + "loss": 0.2136, + "step": 186620 + }, + { + "epoch": 7.73, + "grad_norm": 1.4609375, + "learning_rate": 0.0004225227259671752, + "loss": 0.2354, + "step": 186630 + }, + { + "epoch": 7.73, + "grad_norm": 0.59375, + "learning_rate": 0.0004225148768911844, + "loss": 0.232, + "step": 186640 + }, + { + "epoch": 7.73, + "grad_norm": 0.625, + "learning_rate": 0.0004225070274905393, + "loss": 0.218, + "step": 186650 + }, + { + "epoch": 7.73, + "grad_norm": 0.75, + "learning_rate": 0.0004224991777652544, + "loss": 0.2135, + "step": 186660 + }, + { + "epoch": 7.73, + "grad_norm": 0.96875, + "learning_rate": 0.0004224913277153448, + "loss": 0.1721, + "step": 186670 + }, + { + "epoch": 7.73, + "grad_norm": 0.9609375, + "learning_rate": 0.00042248347734082515, + "loss": 0.1748, + "step": 186680 + }, + { + "epoch": 7.73, + "grad_norm": 0.98046875, + "learning_rate": 0.00042247562664171027, + "loss": 0.182, + "step": 186690 + }, + { + "epoch": 7.73, + "grad_norm": 0.78125, + "learning_rate": 0.00042246777561801474, + "loss": 0.2432, + "step": 186700 + }, + { + "epoch": 7.73, + "grad_norm": 0.51171875, + "learning_rate": 0.00042245992426975354, + "loss": 0.1936, + "step": 186710 + }, + { + "epoch": 7.73, + "grad_norm": 1.46875, + "learning_rate": 0.00042245207259694145, + "loss": 0.2131, + "step": 186720 + }, + { + "epoch": 7.73, + "grad_norm": 0.65234375, + "learning_rate": 0.00042244422059959307, + "loss": 0.2072, + "step": 186730 + }, + { + "epoch": 7.73, + "grad_norm": 0.310546875, + "learning_rate": 0.0004224363682777234, + "loss": 0.2076, + "step": 186740 + }, + { + "epoch": 7.74, + "grad_norm": 0.640625, + "learning_rate": 0.0004224285156313471, + "loss": 0.188, + "step": 186750 + }, + { + "epoch": 7.74, + "grad_norm": 0.375, + "learning_rate": 0.0004224206626604789, + "loss": 0.2016, + "step": 186760 + }, + { + "epoch": 7.74, + "grad_norm": 0.84375, + "learning_rate": 0.0004224128093651337, + "loss": 0.1754, + "step": 186770 + }, + { + "epoch": 7.74, + "grad_norm": 0.373046875, + "learning_rate": 0.0004224049557453261, + "loss": 0.2106, + "step": 186780 + }, + { + "epoch": 7.74, + "grad_norm": 0.7734375, + "learning_rate": 0.0004223971018010711, + "loss": 0.1587, + "step": 186790 + }, + { + "epoch": 7.74, + "grad_norm": 0.96875, + "learning_rate": 0.0004223892475323833, + "loss": 0.2254, + "step": 186800 + }, + { + "epoch": 7.74, + "grad_norm": 0.3828125, + "learning_rate": 0.00042238139293927757, + "loss": 0.1664, + "step": 186810 + }, + { + "epoch": 7.74, + "grad_norm": 0.5390625, + "learning_rate": 0.0004223735380217687, + "loss": 0.1643, + "step": 186820 + }, + { + "epoch": 7.74, + "grad_norm": 0.61328125, + "learning_rate": 0.00042236568277987145, + "loss": 0.2405, + "step": 186830 + }, + { + "epoch": 7.74, + "grad_norm": 1.0234375, + "learning_rate": 0.00042235782721360056, + "loss": 0.2515, + "step": 186840 + }, + { + "epoch": 7.74, + "grad_norm": 0.640625, + "learning_rate": 0.0004223499713229709, + "loss": 0.196, + "step": 186850 + }, + { + "epoch": 7.74, + "grad_norm": 0.63671875, + "learning_rate": 0.0004223421151079972, + "loss": 0.1547, + "step": 186860 + }, + { + "epoch": 7.74, + "grad_norm": 0.494140625, + "learning_rate": 0.0004223342585686942, + "loss": 0.1885, + "step": 186870 + }, + { + "epoch": 7.74, + "grad_norm": 0.88671875, + "learning_rate": 0.00042232640170507675, + "loss": 0.221, + "step": 186880 + }, + { + "epoch": 7.74, + "grad_norm": 0.97265625, + "learning_rate": 0.0004223185445171597, + "loss": 0.1823, + "step": 186890 + }, + { + "epoch": 7.74, + "grad_norm": 0.90234375, + "learning_rate": 0.00042231068700495766, + "loss": 0.2281, + "step": 186900 + }, + { + "epoch": 7.74, + "grad_norm": 0.625, + "learning_rate": 0.0004223028291684855, + "loss": 0.1785, + "step": 186910 + }, + { + "epoch": 7.74, + "grad_norm": 0.72265625, + "learning_rate": 0.00042229497100775806, + "loss": 0.1818, + "step": 186920 + }, + { + "epoch": 7.74, + "grad_norm": 0.85546875, + "learning_rate": 0.00042228711252279016, + "loss": 0.2154, + "step": 186930 + }, + { + "epoch": 7.74, + "grad_norm": 1.09375, + "learning_rate": 0.0004222792537135964, + "loss": 0.2092, + "step": 186940 + }, + { + "epoch": 7.74, + "grad_norm": 0.50390625, + "learning_rate": 0.0004222713945801918, + "loss": 0.228, + "step": 186950 + }, + { + "epoch": 7.74, + "grad_norm": 0.83984375, + "learning_rate": 0.00042226353512259097, + "loss": 0.2107, + "step": 186960 + }, + { + "epoch": 7.74, + "grad_norm": 0.478515625, + "learning_rate": 0.0004222556753408088, + "loss": 0.2065, + "step": 186970 + }, + { + "epoch": 7.74, + "grad_norm": 0.83984375, + "learning_rate": 0.00042224781523486003, + "loss": 0.1984, + "step": 186980 + }, + { + "epoch": 7.75, + "grad_norm": 0.6171875, + "learning_rate": 0.00042223995480475954, + "loss": 0.1951, + "step": 186990 + }, + { + "epoch": 7.75, + "grad_norm": 1.03125, + "learning_rate": 0.0004222320940505219, + "loss": 0.1909, + "step": 187000 + }, + { + "epoch": 7.75, + "grad_norm": 0.953125, + "learning_rate": 0.00042222423297216225, + "loss": 0.239, + "step": 187010 + }, + { + "epoch": 7.75, + "grad_norm": 0.3984375, + "learning_rate": 0.00042221637156969504, + "loss": 0.1551, + "step": 187020 + }, + { + "epoch": 7.75, + "grad_norm": 0.35546875, + "learning_rate": 0.0004222085098431353, + "loss": 0.2113, + "step": 187030 + }, + { + "epoch": 7.75, + "grad_norm": 0.453125, + "learning_rate": 0.0004222006477924977, + "loss": 0.2283, + "step": 187040 + }, + { + "epoch": 7.75, + "grad_norm": 0.671875, + "learning_rate": 0.0004221927854177971, + "loss": 0.1644, + "step": 187050 + }, + { + "epoch": 7.75, + "grad_norm": 0.6640625, + "learning_rate": 0.0004221849227190483, + "loss": 0.174, + "step": 187060 + }, + { + "epoch": 7.75, + "grad_norm": 1.5703125, + "learning_rate": 0.0004221770596962661, + "loss": 0.213, + "step": 187070 + }, + { + "epoch": 7.75, + "grad_norm": 0.8125, + "learning_rate": 0.0004221691963494651, + "loss": 0.1979, + "step": 187080 + }, + { + "epoch": 7.75, + "grad_norm": 0.58203125, + "learning_rate": 0.0004221613326786604, + "loss": 0.1462, + "step": 187090 + }, + { + "epoch": 7.75, + "grad_norm": 1.109375, + "learning_rate": 0.0004221534686838666, + "loss": 0.1764, + "step": 187100 + }, + { + "epoch": 7.75, + "grad_norm": 0.6171875, + "learning_rate": 0.0004221456043650986, + "loss": 0.2059, + "step": 187110 + }, + { + "epoch": 7.75, + "grad_norm": 0.84375, + "learning_rate": 0.0004221377397223712, + "loss": 0.211, + "step": 187120 + }, + { + "epoch": 7.75, + "grad_norm": 0.81640625, + "learning_rate": 0.00042212987475569907, + "loss": 0.1804, + "step": 187130 + }, + { + "epoch": 7.75, + "grad_norm": 0.67578125, + "learning_rate": 0.0004221220094650972, + "loss": 0.2361, + "step": 187140 + }, + { + "epoch": 7.75, + "grad_norm": 1.1953125, + "learning_rate": 0.00042211414385058023, + "loss": 0.1811, + "step": 187150 + }, + { + "epoch": 7.75, + "grad_norm": 0.458984375, + "learning_rate": 0.00042210627791216306, + "loss": 0.2065, + "step": 187160 + }, + { + "epoch": 7.75, + "grad_norm": 0.6484375, + "learning_rate": 0.00042209841164986045, + "loss": 0.1709, + "step": 187170 + }, + { + "epoch": 7.75, + "grad_norm": 0.75, + "learning_rate": 0.00042209054506368724, + "loss": 0.1789, + "step": 187180 + }, + { + "epoch": 7.75, + "grad_norm": 0.80859375, + "learning_rate": 0.0004220826781536582, + "loss": 0.2262, + "step": 187190 + }, + { + "epoch": 7.75, + "grad_norm": 1.7421875, + "learning_rate": 0.0004220748109197881, + "loss": 0.1844, + "step": 187200 + }, + { + "epoch": 7.75, + "grad_norm": 0.1728515625, + "learning_rate": 0.00042206694336209186, + "loss": 0.251, + "step": 187210 + }, + { + "epoch": 7.75, + "grad_norm": 0.71484375, + "learning_rate": 0.0004220590754805841, + "loss": 0.1635, + "step": 187220 + }, + { + "epoch": 7.76, + "grad_norm": 0.59375, + "learning_rate": 0.0004220512072752798, + "loss": 0.2318, + "step": 187230 + }, + { + "epoch": 7.76, + "grad_norm": 0.80859375, + "learning_rate": 0.00042204333874619373, + "loss": 0.1731, + "step": 187240 + }, + { + "epoch": 7.76, + "grad_norm": 0.671875, + "learning_rate": 0.0004220354698933407, + "loss": 0.1779, + "step": 187250 + }, + { + "epoch": 7.76, + "grad_norm": 0.54296875, + "learning_rate": 0.0004220276007167354, + "loss": 0.1582, + "step": 187260 + }, + { + "epoch": 7.76, + "grad_norm": 0.6484375, + "learning_rate": 0.0004220197312163928, + "loss": 0.1844, + "step": 187270 + }, + { + "epoch": 7.76, + "grad_norm": 0.65625, + "learning_rate": 0.0004220118613923276, + "loss": 0.1589, + "step": 187280 + }, + { + "epoch": 7.76, + "grad_norm": 0.74609375, + "learning_rate": 0.00042200399124455476, + "loss": 0.1835, + "step": 187290 + }, + { + "epoch": 7.76, + "grad_norm": 0.6171875, + "learning_rate": 0.0004219961207730889, + "loss": 0.1859, + "step": 187300 + }, + { + "epoch": 7.76, + "grad_norm": 1.578125, + "learning_rate": 0.00042198824997794496, + "loss": 0.2253, + "step": 187310 + }, + { + "epoch": 7.76, + "grad_norm": 1.1171875, + "learning_rate": 0.00042198037885913766, + "loss": 0.1591, + "step": 187320 + }, + { + "epoch": 7.76, + "grad_norm": 0.578125, + "learning_rate": 0.0004219725074166818, + "loss": 0.203, + "step": 187330 + }, + { + "epoch": 7.76, + "grad_norm": 0.9140625, + "learning_rate": 0.0004219646356505923, + "loss": 0.1886, + "step": 187340 + }, + { + "epoch": 7.76, + "grad_norm": 0.6640625, + "learning_rate": 0.000421956763560884, + "loss": 0.1792, + "step": 187350 + }, + { + "epoch": 7.76, + "grad_norm": 0.95703125, + "learning_rate": 0.00042194889114757165, + "loss": 0.2534, + "step": 187360 + }, + { + "epoch": 7.76, + "grad_norm": 2.5625, + "learning_rate": 0.00042194101841066997, + "loss": 0.2219, + "step": 187370 + }, + { + "epoch": 7.76, + "grad_norm": 0.423828125, + "learning_rate": 0.0004219331453501939, + "loss": 0.1965, + "step": 187380 + }, + { + "epoch": 7.76, + "grad_norm": 0.271484375, + "learning_rate": 0.00042192527196615817, + "loss": 0.2274, + "step": 187390 + }, + { + "epoch": 7.76, + "grad_norm": 0.62109375, + "learning_rate": 0.00042191739825857765, + "loss": 0.2447, + "step": 187400 + }, + { + "epoch": 7.76, + "grad_norm": 0.890625, + "learning_rate": 0.0004219095242274672, + "loss": 0.2195, + "step": 187410 + }, + { + "epoch": 7.76, + "grad_norm": 0.435546875, + "learning_rate": 0.00042190164987284155, + "loss": 0.1649, + "step": 187420 + }, + { + "epoch": 7.76, + "grad_norm": 0.6953125, + "learning_rate": 0.0004218937751947156, + "loss": 0.2333, + "step": 187430 + }, + { + "epoch": 7.76, + "grad_norm": 1.4765625, + "learning_rate": 0.0004218859001931041, + "loss": 0.1982, + "step": 187440 + }, + { + "epoch": 7.76, + "grad_norm": 0.66796875, + "learning_rate": 0.0004218780248680219, + "loss": 0.1909, + "step": 187450 + }, + { + "epoch": 7.76, + "grad_norm": 0.66015625, + "learning_rate": 0.00042187014921948385, + "loss": 0.1928, + "step": 187460 + }, + { + "epoch": 7.76, + "grad_norm": 0.71875, + "learning_rate": 0.0004218622732475047, + "loss": 0.1898, + "step": 187470 + }, + { + "epoch": 7.77, + "grad_norm": 0.58984375, + "learning_rate": 0.00042185439695209933, + "loss": 0.183, + "step": 187480 + }, + { + "epoch": 7.77, + "grad_norm": 0.625, + "learning_rate": 0.0004218465203332825, + "loss": 0.1898, + "step": 187490 + }, + { + "epoch": 7.77, + "grad_norm": 0.65625, + "learning_rate": 0.00042183864339106916, + "loss": 0.1724, + "step": 187500 + }, + { + "epoch": 7.77, + "grad_norm": 1.15625, + "learning_rate": 0.000421830766125474, + "loss": 0.2252, + "step": 187510 + }, + { + "epoch": 7.77, + "grad_norm": 0.96875, + "learning_rate": 0.00042182288853651186, + "loss": 0.217, + "step": 187520 + }, + { + "epoch": 7.77, + "grad_norm": 0.5625, + "learning_rate": 0.0004218150106241977, + "loss": 0.2523, + "step": 187530 + }, + { + "epoch": 7.77, + "grad_norm": 0.56640625, + "learning_rate": 0.00042180713238854615, + "loss": 0.251, + "step": 187540 + }, + { + "epoch": 7.77, + "grad_norm": 0.66015625, + "learning_rate": 0.00042179925382957215, + "loss": 0.1868, + "step": 187550 + }, + { + "epoch": 7.77, + "grad_norm": 0.80078125, + "learning_rate": 0.0004217913749472905, + "loss": 0.1779, + "step": 187560 + }, + { + "epoch": 7.77, + "grad_norm": 1.203125, + "learning_rate": 0.000421783495741716, + "loss": 0.1774, + "step": 187570 + }, + { + "epoch": 7.77, + "grad_norm": 1.2421875, + "learning_rate": 0.00042177561621286366, + "loss": 0.215, + "step": 187580 + }, + { + "epoch": 7.77, + "grad_norm": 0.9296875, + "learning_rate": 0.00042176773636074807, + "loss": 0.1913, + "step": 187590 + }, + { + "epoch": 7.77, + "grad_norm": 0.61328125, + "learning_rate": 0.0004217598561853841, + "loss": 0.1555, + "step": 187600 + }, + { + "epoch": 7.77, + "grad_norm": 1.3125, + "learning_rate": 0.0004217519756867867, + "loss": 0.1957, + "step": 187610 + }, + { + "epoch": 7.77, + "grad_norm": 0.73828125, + "learning_rate": 0.00042174409486497063, + "loss": 0.219, + "step": 187620 + }, + { + "epoch": 7.77, + "grad_norm": 0.52734375, + "learning_rate": 0.0004217362137199506, + "loss": 0.1731, + "step": 187630 + }, + { + "epoch": 7.77, + "grad_norm": 1.3671875, + "learning_rate": 0.0004217283322517417, + "loss": 0.2312, + "step": 187640 + }, + { + "epoch": 7.77, + "grad_norm": 1.28125, + "learning_rate": 0.00042172045046035857, + "loss": 0.2451, + "step": 187650 + }, + { + "epoch": 7.77, + "grad_norm": 0.5859375, + "learning_rate": 0.00042171256834581613, + "loss": 0.1737, + "step": 187660 + }, + { + "epoch": 7.77, + "grad_norm": 0.59765625, + "learning_rate": 0.00042170468590812914, + "loss": 0.2278, + "step": 187670 + }, + { + "epoch": 7.77, + "grad_norm": 1.2109375, + "learning_rate": 0.0004216968031473125, + "loss": 0.1996, + "step": 187680 + }, + { + "epoch": 7.77, + "grad_norm": 0.8984375, + "learning_rate": 0.000421688920063381, + "loss": 0.2394, + "step": 187690 + }, + { + "epoch": 7.77, + "grad_norm": 0.455078125, + "learning_rate": 0.0004216810366563495, + "loss": 0.2203, + "step": 187700 + }, + { + "epoch": 7.77, + "grad_norm": 0.94921875, + "learning_rate": 0.00042167315292623283, + "loss": 0.1934, + "step": 187710 + }, + { + "epoch": 7.78, + "grad_norm": 1.0078125, + "learning_rate": 0.0004216652688730458, + "loss": 0.2527, + "step": 187720 + }, + { + "epoch": 7.78, + "grad_norm": 0.58203125, + "learning_rate": 0.00042165738449680335, + "loss": 0.2089, + "step": 187730 + }, + { + "epoch": 7.78, + "grad_norm": 0.67578125, + "learning_rate": 0.00042164949979752017, + "loss": 0.2324, + "step": 187740 + }, + { + "epoch": 7.78, + "grad_norm": 0.48828125, + "learning_rate": 0.0004216416147752112, + "loss": 0.2624, + "step": 187750 + }, + { + "epoch": 7.78, + "grad_norm": 0.796875, + "learning_rate": 0.0004216337294298912, + "loss": 0.1986, + "step": 187760 + }, + { + "epoch": 7.78, + "grad_norm": 1.5625, + "learning_rate": 0.0004216258437615751, + "loss": 0.2585, + "step": 187770 + }, + { + "epoch": 7.78, + "grad_norm": 0.7265625, + "learning_rate": 0.0004216179577702777, + "loss": 0.2224, + "step": 187780 + }, + { + "epoch": 7.78, + "grad_norm": 0.51171875, + "learning_rate": 0.00042161007145601385, + "loss": 0.1872, + "step": 187790 + }, + { + "epoch": 7.78, + "grad_norm": 0.6796875, + "learning_rate": 0.0004216021848187983, + "loss": 0.2094, + "step": 187800 + }, + { + "epoch": 7.78, + "grad_norm": 0.77734375, + "learning_rate": 0.000421594297858646, + "loss": 0.1848, + "step": 187810 + }, + { + "epoch": 7.78, + "grad_norm": 0.51171875, + "learning_rate": 0.0004215864105755718, + "loss": 0.2127, + "step": 187820 + }, + { + "epoch": 7.78, + "grad_norm": 1.515625, + "learning_rate": 0.00042157852296959047, + "loss": 0.2288, + "step": 187830 + }, + { + "epoch": 7.78, + "grad_norm": 0.49609375, + "learning_rate": 0.0004215706350407169, + "loss": 0.2156, + "step": 187840 + }, + { + "epoch": 7.78, + "grad_norm": 3.921875, + "learning_rate": 0.00042156274678896595, + "loss": 0.219, + "step": 187850 + }, + { + "epoch": 7.78, + "grad_norm": 0.80078125, + "learning_rate": 0.0004215548582143524, + "loss": 0.2284, + "step": 187860 + }, + { + "epoch": 7.78, + "grad_norm": 1.046875, + "learning_rate": 0.00042154696931689115, + "loss": 0.1699, + "step": 187870 + }, + { + "epoch": 7.78, + "grad_norm": 1.1171875, + "learning_rate": 0.000421539080096597, + "loss": 0.2227, + "step": 187880 + }, + { + "epoch": 7.78, + "grad_norm": 0.68359375, + "learning_rate": 0.0004215311905534849, + "loss": 0.1456, + "step": 187890 + }, + { + "epoch": 7.78, + "grad_norm": 0.9609375, + "learning_rate": 0.00042152330068756955, + "loss": 0.1826, + "step": 187900 + }, + { + "epoch": 7.78, + "grad_norm": 0.42578125, + "learning_rate": 0.0004215154104988659, + "loss": 0.223, + "step": 187910 + }, + { + "epoch": 7.78, + "grad_norm": 0.388671875, + "learning_rate": 0.0004215075199873887, + "loss": 0.1619, + "step": 187920 + }, + { + "epoch": 7.78, + "grad_norm": 0.51953125, + "learning_rate": 0.000421499629153153, + "loss": 0.2008, + "step": 187930 + }, + { + "epoch": 7.78, + "grad_norm": 0.4296875, + "learning_rate": 0.00042149173799617347, + "loss": 0.2235, + "step": 187940 + }, + { + "epoch": 7.78, + "grad_norm": 0.76953125, + "learning_rate": 0.000421483846516465, + "loss": 0.2366, + "step": 187950 + }, + { + "epoch": 7.79, + "grad_norm": 0.62890625, + "learning_rate": 0.00042147595471404245, + "loss": 0.149, + "step": 187960 + }, + { + "epoch": 7.79, + "grad_norm": 1.015625, + "learning_rate": 0.00042146806258892063, + "loss": 0.203, + "step": 187970 + }, + { + "epoch": 7.79, + "grad_norm": 1.65625, + "learning_rate": 0.0004214601701411145, + "loss": 0.1721, + "step": 187980 + }, + { + "epoch": 7.79, + "grad_norm": 1.21875, + "learning_rate": 0.0004214522773706387, + "loss": 0.2233, + "step": 187990 + }, + { + "epoch": 7.79, + "grad_norm": 1.046875, + "learning_rate": 0.0004214443842775084, + "loss": 0.2063, + "step": 188000 + }, + { + "epoch": 7.79, + "grad_norm": 0.984375, + "learning_rate": 0.00042143649086173827, + "loss": 0.1968, + "step": 188010 + }, + { + "epoch": 7.79, + "grad_norm": 0.6015625, + "learning_rate": 0.00042142859712334307, + "loss": 0.2092, + "step": 188020 + }, + { + "epoch": 7.79, + "grad_norm": 0.62109375, + "learning_rate": 0.0004214207030623378, + "loss": 0.1954, + "step": 188030 + }, + { + "epoch": 7.79, + "grad_norm": 0.53125, + "learning_rate": 0.00042141280867873733, + "loss": 0.2349, + "step": 188040 + }, + { + "epoch": 7.79, + "grad_norm": 0.70703125, + "learning_rate": 0.00042140491397255644, + "loss": 0.2242, + "step": 188050 + }, + { + "epoch": 7.79, + "grad_norm": 1.1640625, + "learning_rate": 0.00042139701894381, + "loss": 0.1754, + "step": 188060 + }, + { + "epoch": 7.79, + "grad_norm": 0.85546875, + "learning_rate": 0.0004213891235925129, + "loss": 0.2397, + "step": 188070 + }, + { + "epoch": 7.79, + "grad_norm": 0.0, + "learning_rate": 0.00042138122791867994, + "loss": 0.1634, + "step": 188080 + }, + { + "epoch": 7.79, + "grad_norm": 0.40234375, + "learning_rate": 0.00042137333192232606, + "loss": 0.1857, + "step": 188090 + }, + { + "epoch": 7.79, + "grad_norm": 0.74609375, + "learning_rate": 0.0004213654356034661, + "loss": 0.216, + "step": 188100 + }, + { + "epoch": 7.79, + "grad_norm": 0.86328125, + "learning_rate": 0.0004213575389621148, + "loss": 0.2096, + "step": 188110 + }, + { + "epoch": 7.79, + "grad_norm": 0.734375, + "learning_rate": 0.00042134964199828717, + "loss": 0.2011, + "step": 188120 + }, + { + "epoch": 7.79, + "grad_norm": 0.87890625, + "learning_rate": 0.00042134174471199806, + "loss": 0.1997, + "step": 188130 + }, + { + "epoch": 7.79, + "grad_norm": 0.70703125, + "learning_rate": 0.0004213338471032623, + "loss": 0.1996, + "step": 188140 + }, + { + "epoch": 7.79, + "grad_norm": 0.35546875, + "learning_rate": 0.00042132594917209466, + "loss": 0.231, + "step": 188150 + }, + { + "epoch": 7.79, + "grad_norm": 0.51953125, + "learning_rate": 0.00042131805091851003, + "loss": 0.1981, + "step": 188160 + }, + { + "epoch": 7.79, + "grad_norm": 0.78515625, + "learning_rate": 0.00042131015234252346, + "loss": 0.1996, + "step": 188170 + }, + { + "epoch": 7.79, + "grad_norm": 0.453125, + "learning_rate": 0.00042130225344414965, + "loss": 0.2072, + "step": 188180 + }, + { + "epoch": 7.79, + "grad_norm": 0.890625, + "learning_rate": 0.0004212943542234035, + "loss": 0.208, + "step": 188190 + }, + { + "epoch": 7.8, + "grad_norm": 0.84765625, + "learning_rate": 0.00042128645468029986, + "loss": 0.2333, + "step": 188200 + }, + { + "epoch": 7.8, + "grad_norm": 0.458984375, + "learning_rate": 0.0004212785548148536, + "loss": 0.2483, + "step": 188210 + }, + { + "epoch": 7.8, + "grad_norm": 0.9609375, + "learning_rate": 0.00042127065462707966, + "loss": 0.2352, + "step": 188220 + }, + { + "epoch": 7.8, + "grad_norm": 0.45703125, + "learning_rate": 0.0004212627541169927, + "loss": 0.2156, + "step": 188230 + }, + { + "epoch": 7.8, + "grad_norm": 0.54296875, + "learning_rate": 0.0004212548532846079, + "loss": 0.2282, + "step": 188240 + }, + { + "epoch": 7.8, + "grad_norm": 0.9765625, + "learning_rate": 0.0004212469521299398, + "loss": 0.2788, + "step": 188250 + }, + { + "epoch": 7.8, + "grad_norm": 0.72265625, + "learning_rate": 0.00042123905065300356, + "loss": 0.2196, + "step": 188260 + }, + { + "epoch": 7.8, + "grad_norm": 1.0625, + "learning_rate": 0.00042123114885381387, + "loss": 0.2479, + "step": 188270 + }, + { + "epoch": 7.8, + "grad_norm": 0.55859375, + "learning_rate": 0.00042122324673238564, + "loss": 0.1997, + "step": 188280 + }, + { + "epoch": 7.8, + "grad_norm": 2.078125, + "learning_rate": 0.00042121534428873376, + "loss": 0.1956, + "step": 188290 + }, + { + "epoch": 7.8, + "grad_norm": 0.49609375, + "learning_rate": 0.00042120744152287316, + "loss": 0.2572, + "step": 188300 + }, + { + "epoch": 7.8, + "grad_norm": 1.2421875, + "learning_rate": 0.0004211995384348185, + "loss": 0.1838, + "step": 188310 + }, + { + "epoch": 7.8, + "grad_norm": 0.67578125, + "learning_rate": 0.0004211916350245849, + "loss": 0.1783, + "step": 188320 + }, + { + "epoch": 7.8, + "grad_norm": 0.546875, + "learning_rate": 0.00042118373129218707, + "loss": 0.2292, + "step": 188330 + }, + { + "epoch": 7.8, + "grad_norm": 1.109375, + "learning_rate": 0.00042117582723764, + "loss": 0.2465, + "step": 188340 + }, + { + "epoch": 7.8, + "grad_norm": 0.6875, + "learning_rate": 0.0004211679228609585, + "loss": 0.2447, + "step": 188350 + }, + { + "epoch": 7.8, + "grad_norm": 0.416015625, + "learning_rate": 0.00042116001816215745, + "loss": 0.1692, + "step": 188360 + }, + { + "epoch": 7.8, + "grad_norm": 0.486328125, + "learning_rate": 0.0004211521131412517, + "loss": 0.1967, + "step": 188370 + }, + { + "epoch": 7.8, + "grad_norm": 0.58984375, + "learning_rate": 0.0004211442077982562, + "loss": 0.206, + "step": 188380 + }, + { + "epoch": 7.8, + "grad_norm": 0.30078125, + "learning_rate": 0.0004211363021331857, + "loss": 0.179, + "step": 188390 + }, + { + "epoch": 7.8, + "grad_norm": 1.046875, + "learning_rate": 0.00042112839614605525, + "loss": 0.2135, + "step": 188400 + }, + { + "epoch": 7.8, + "grad_norm": 0.58984375, + "learning_rate": 0.0004211204898368796, + "loss": 0.178, + "step": 188410 + }, + { + "epoch": 7.8, + "grad_norm": 0.84765625, + "learning_rate": 0.0004211125832056737, + "loss": 0.1911, + "step": 188420 + }, + { + "epoch": 7.8, + "grad_norm": 0.421875, + "learning_rate": 0.0004211046762524523, + "loss": 0.2311, + "step": 188430 + }, + { + "epoch": 7.81, + "grad_norm": 0.5234375, + "learning_rate": 0.0004210967689772305, + "loss": 0.2062, + "step": 188440 + }, + { + "epoch": 7.81, + "grad_norm": 1.2421875, + "learning_rate": 0.000421088861380023, + "loss": 0.2439, + "step": 188450 + }, + { + "epoch": 7.81, + "grad_norm": 1.0390625, + "learning_rate": 0.00042108095346084473, + "loss": 0.2339, + "step": 188460 + }, + { + "epoch": 7.81, + "grad_norm": 1.265625, + "learning_rate": 0.00042107304521971057, + "loss": 0.1713, + "step": 188470 + }, + { + "epoch": 7.81, + "grad_norm": 0.427734375, + "learning_rate": 0.0004210651366566355, + "loss": 0.2135, + "step": 188480 + }, + { + "epoch": 7.81, + "grad_norm": 0.345703125, + "learning_rate": 0.00042105722777163425, + "loss": 0.2365, + "step": 188490 + }, + { + "epoch": 7.81, + "grad_norm": 0.79296875, + "learning_rate": 0.00042104931856472175, + "loss": 0.2106, + "step": 188500 + }, + { + "epoch": 7.81, + "grad_norm": 0.625, + "learning_rate": 0.0004210414090359129, + "loss": 0.2376, + "step": 188510 + }, + { + "epoch": 7.81, + "grad_norm": 0.64453125, + "learning_rate": 0.0004210334991852226, + "loss": 0.1861, + "step": 188520 + }, + { + "epoch": 7.81, + "grad_norm": 0.333984375, + "learning_rate": 0.0004210255890126658, + "loss": 0.2469, + "step": 188530 + }, + { + "epoch": 7.81, + "grad_norm": 0.478515625, + "learning_rate": 0.00042101767851825724, + "loss": 0.2053, + "step": 188540 + }, + { + "epoch": 7.81, + "grad_norm": 0.60546875, + "learning_rate": 0.0004210097677020119, + "loss": 0.1647, + "step": 188550 + }, + { + "epoch": 7.81, + "grad_norm": 1.4921875, + "learning_rate": 0.0004210018565639446, + "loss": 0.2033, + "step": 188560 + }, + { + "epoch": 7.81, + "grad_norm": 0.5703125, + "learning_rate": 0.0004209939451040703, + "loss": 0.1644, + "step": 188570 + }, + { + "epoch": 7.81, + "grad_norm": 0.875, + "learning_rate": 0.0004209860333224039, + "loss": 0.2093, + "step": 188580 + }, + { + "epoch": 7.81, + "grad_norm": 0.6171875, + "learning_rate": 0.0004209781212189602, + "loss": 0.1807, + "step": 188590 + }, + { + "epoch": 7.81, + "grad_norm": 2.265625, + "learning_rate": 0.00042097020879375415, + "loss": 0.1897, + "step": 188600 + }, + { + "epoch": 7.81, + "grad_norm": 0.8984375, + "learning_rate": 0.0004209622960468007, + "loss": 0.213, + "step": 188610 + }, + { + "epoch": 7.81, + "grad_norm": 0.546875, + "learning_rate": 0.0004209543829781145, + "loss": 0.1753, + "step": 188620 + }, + { + "epoch": 7.81, + "grad_norm": 1.3125, + "learning_rate": 0.0004209464695877108, + "loss": 0.2342, + "step": 188630 + }, + { + "epoch": 7.81, + "grad_norm": 0.5546875, + "learning_rate": 0.0004209385558756042, + "loss": 0.1997, + "step": 188640 + }, + { + "epoch": 7.81, + "grad_norm": 0.59765625, + "learning_rate": 0.00042093064184180976, + "loss": 0.2051, + "step": 188650 + }, + { + "epoch": 7.81, + "grad_norm": 0.431640625, + "learning_rate": 0.0004209227274863422, + "loss": 0.2351, + "step": 188660 + }, + { + "epoch": 7.81, + "grad_norm": 0.57421875, + "learning_rate": 0.0004209148128092166, + "loss": 0.2199, + "step": 188670 + }, + { + "epoch": 7.82, + "grad_norm": 0.56640625, + "learning_rate": 0.00042090689781044776, + "loss": 0.209, + "step": 188680 + }, + { + "epoch": 7.82, + "grad_norm": 0.89453125, + "learning_rate": 0.0004208989824900507, + "loss": 0.199, + "step": 188690 + }, + { + "epoch": 7.82, + "grad_norm": 1.53125, + "learning_rate": 0.00042089106684804013, + "loss": 0.224, + "step": 188700 + }, + { + "epoch": 7.82, + "grad_norm": 0.6640625, + "learning_rate": 0.00042088315088443097, + "loss": 0.2296, + "step": 188710 + }, + { + "epoch": 7.82, + "grad_norm": 0.46875, + "learning_rate": 0.0004208752345992383, + "loss": 0.2078, + "step": 188720 + }, + { + "epoch": 7.82, + "grad_norm": 0.640625, + "learning_rate": 0.0004208673179924768, + "loss": 0.1563, + "step": 188730 + }, + { + "epoch": 7.82, + "grad_norm": 0.984375, + "learning_rate": 0.0004208594010641614, + "loss": 0.111, + "step": 188740 + }, + { + "epoch": 7.82, + "grad_norm": 0.64453125, + "learning_rate": 0.00042085148381430717, + "loss": 0.1951, + "step": 188750 + }, + { + "epoch": 7.82, + "grad_norm": 1.0234375, + "learning_rate": 0.00042084356624292886, + "loss": 0.2462, + "step": 188760 + }, + { + "epoch": 7.82, + "grad_norm": 0.7734375, + "learning_rate": 0.00042083564835004145, + "loss": 0.2252, + "step": 188770 + }, + { + "epoch": 7.82, + "grad_norm": 0.890625, + "learning_rate": 0.00042082773013565975, + "loss": 0.1935, + "step": 188780 + }, + { + "epoch": 7.82, + "grad_norm": 0.859375, + "learning_rate": 0.0004208198115997987, + "loss": 0.2368, + "step": 188790 + }, + { + "epoch": 7.82, + "grad_norm": 0.53515625, + "learning_rate": 0.0004208118927424732, + "loss": 0.1958, + "step": 188800 + }, + { + "epoch": 7.82, + "grad_norm": 0.5859375, + "learning_rate": 0.0004208039735636983, + "loss": 0.2221, + "step": 188810 + }, + { + "epoch": 7.82, + "grad_norm": 0.36328125, + "learning_rate": 0.0004207960540634886, + "loss": 0.2073, + "step": 188820 + }, + { + "epoch": 7.82, + "grad_norm": 1.0546875, + "learning_rate": 0.0004207881342418592, + "loss": 0.2737, + "step": 188830 + }, + { + "epoch": 7.82, + "grad_norm": 0.52734375, + "learning_rate": 0.00042078021409882503, + "loss": 0.2446, + "step": 188840 + }, + { + "epoch": 7.82, + "grad_norm": 1.3515625, + "learning_rate": 0.00042077229363440095, + "loss": 0.2136, + "step": 188850 + }, + { + "epoch": 7.82, + "grad_norm": 1.2734375, + "learning_rate": 0.00042076437284860183, + "loss": 0.1542, + "step": 188860 + }, + { + "epoch": 7.82, + "grad_norm": 0.56640625, + "learning_rate": 0.00042075645174144253, + "loss": 0.1732, + "step": 188870 + }, + { + "epoch": 7.82, + "grad_norm": 0.66796875, + "learning_rate": 0.0004207485303129381, + "loss": 0.1871, + "step": 188880 + }, + { + "epoch": 7.82, + "grad_norm": 0.703125, + "learning_rate": 0.00042074060856310335, + "loss": 0.2786, + "step": 188890 + }, + { + "epoch": 7.82, + "grad_norm": 0.58984375, + "learning_rate": 0.0004207326864919533, + "loss": 0.1812, + "step": 188900 + }, + { + "epoch": 7.82, + "grad_norm": 0.390625, + "learning_rate": 0.00042072476409950267, + "loss": 0.2171, + "step": 188910 + }, + { + "epoch": 7.83, + "grad_norm": 1.03125, + "learning_rate": 0.00042071684138576647, + "loss": 0.1989, + "step": 188920 + }, + { + "epoch": 7.83, + "grad_norm": 0.447265625, + "learning_rate": 0.00042070891835075964, + "loss": 0.2317, + "step": 188930 + }, + { + "epoch": 7.83, + "grad_norm": 1.234375, + "learning_rate": 0.0004207009949944971, + "loss": 0.2105, + "step": 188940 + }, + { + "epoch": 7.83, + "grad_norm": 0.734375, + "learning_rate": 0.00042069307131699363, + "loss": 0.1815, + "step": 188950 + }, + { + "epoch": 7.83, + "grad_norm": 0.86328125, + "learning_rate": 0.0004206851473182644, + "loss": 0.1834, + "step": 188960 + }, + { + "epoch": 7.83, + "grad_norm": 1.65625, + "learning_rate": 0.000420677222998324, + "loss": 0.1796, + "step": 188970 + }, + { + "epoch": 7.83, + "grad_norm": 0.56640625, + "learning_rate": 0.00042066929835718746, + "loss": 0.2258, + "step": 188980 + }, + { + "epoch": 7.83, + "grad_norm": 0.439453125, + "learning_rate": 0.0004206613733948699, + "loss": 0.2429, + "step": 188990 + }, + { + "epoch": 7.83, + "grad_norm": 0.9140625, + "learning_rate": 0.00042065344811138594, + "loss": 0.1943, + "step": 189000 + }, + { + "epoch": 7.83, + "grad_norm": 0.828125, + "learning_rate": 0.0004206455225067506, + "loss": 0.2094, + "step": 189010 + }, + { + "epoch": 7.83, + "grad_norm": 2.734375, + "learning_rate": 0.00042063759658097886, + "loss": 0.2239, + "step": 189020 + }, + { + "epoch": 7.83, + "grad_norm": 0.5234375, + "learning_rate": 0.00042062967033408566, + "loss": 0.2098, + "step": 189030 + }, + { + "epoch": 7.83, + "grad_norm": 0.91796875, + "learning_rate": 0.00042062174376608576, + "loss": 0.2177, + "step": 189040 + }, + { + "epoch": 7.83, + "grad_norm": 2.140625, + "learning_rate": 0.0004206138168769942, + "loss": 0.2475, + "step": 189050 + }, + { + "epoch": 7.83, + "grad_norm": 1.53125, + "learning_rate": 0.0004206058896668259, + "loss": 0.211, + "step": 189060 + }, + { + "epoch": 7.83, + "grad_norm": 0.431640625, + "learning_rate": 0.0004205979621355956, + "loss": 0.1813, + "step": 189070 + }, + { + "epoch": 7.83, + "grad_norm": 0.49609375, + "learning_rate": 0.00042059003428331857, + "loss": 0.2064, + "step": 189080 + }, + { + "epoch": 7.83, + "grad_norm": 0.59765625, + "learning_rate": 0.00042058210611000936, + "loss": 0.2343, + "step": 189090 + }, + { + "epoch": 7.83, + "grad_norm": 0.875, + "learning_rate": 0.00042057417761568307, + "loss": 0.2204, + "step": 189100 + }, + { + "epoch": 7.83, + "grad_norm": 1.1953125, + "learning_rate": 0.00042056624880035465, + "loss": 0.2256, + "step": 189110 + }, + { + "epoch": 7.83, + "grad_norm": 0.9296875, + "learning_rate": 0.0004205583196640389, + "loss": 0.162, + "step": 189120 + }, + { + "epoch": 7.83, + "grad_norm": 0.51171875, + "learning_rate": 0.00042055039020675087, + "loss": 0.1318, + "step": 189130 + }, + { + "epoch": 7.83, + "grad_norm": 0.88671875, + "learning_rate": 0.0004205424604285054, + "loss": 0.1736, + "step": 189140 + }, + { + "epoch": 7.83, + "grad_norm": 0.59765625, + "learning_rate": 0.0004205345303293174, + "loss": 0.1828, + "step": 189150 + }, + { + "epoch": 7.83, + "grad_norm": 0.828125, + "learning_rate": 0.00042052659990920186, + "loss": 0.1647, + "step": 189160 + }, + { + "epoch": 7.84, + "grad_norm": 1.2578125, + "learning_rate": 0.00042051866916817366, + "loss": 0.2172, + "step": 189170 + }, + { + "epoch": 7.84, + "grad_norm": 1.15625, + "learning_rate": 0.0004205107381062478, + "loss": 0.1978, + "step": 189180 + }, + { + "epoch": 7.84, + "grad_norm": 1.203125, + "learning_rate": 0.0004205028067234391, + "loss": 0.1996, + "step": 189190 + }, + { + "epoch": 7.84, + "grad_norm": 0.515625, + "learning_rate": 0.00042049487501976253, + "loss": 0.2285, + "step": 189200 + }, + { + "epoch": 7.84, + "grad_norm": 0.388671875, + "learning_rate": 0.00042048694299523303, + "loss": 0.232, + "step": 189210 + }, + { + "epoch": 7.84, + "grad_norm": 0.458984375, + "learning_rate": 0.0004204790106498655, + "loss": 0.2189, + "step": 189220 + }, + { + "epoch": 7.84, + "grad_norm": 1.2734375, + "learning_rate": 0.00042047107798367486, + "loss": 0.2073, + "step": 189230 + }, + { + "epoch": 7.84, + "grad_norm": 0.875, + "learning_rate": 0.0004204631449966761, + "loss": 0.2695, + "step": 189240 + }, + { + "epoch": 7.84, + "grad_norm": 0.53515625, + "learning_rate": 0.0004204552116888841, + "loss": 0.177, + "step": 189250 + }, + { + "epoch": 7.84, + "grad_norm": 0.53125, + "learning_rate": 0.0004204472780603138, + "loss": 0.2052, + "step": 189260 + }, + { + "epoch": 7.84, + "grad_norm": 0.625, + "learning_rate": 0.0004204393441109801, + "loss": 0.217, + "step": 189270 + }, + { + "epoch": 7.84, + "grad_norm": 0.62890625, + "learning_rate": 0.000420431409840898, + "loss": 0.1708, + "step": 189280 + }, + { + "epoch": 7.84, + "grad_norm": 0.53515625, + "learning_rate": 0.0004204234752500824, + "loss": 0.1953, + "step": 189290 + }, + { + "epoch": 7.84, + "grad_norm": 0.87890625, + "learning_rate": 0.0004204155403385481, + "loss": 0.1923, + "step": 189300 + }, + { + "epoch": 7.84, + "grad_norm": 0.0, + "learning_rate": 0.0004204076051063103, + "loss": 0.1422, + "step": 189310 + }, + { + "epoch": 7.84, + "grad_norm": 0.609375, + "learning_rate": 0.0004203996695533838, + "loss": 0.2476, + "step": 189320 + }, + { + "epoch": 7.84, + "grad_norm": 0.421875, + "learning_rate": 0.0004203917336797834, + "loss": 0.2458, + "step": 189330 + }, + { + "epoch": 7.84, + "grad_norm": 0.796875, + "learning_rate": 0.00042038379748552426, + "loss": 0.197, + "step": 189340 + }, + { + "epoch": 7.84, + "grad_norm": 0.80859375, + "learning_rate": 0.0004203758609706212, + "loss": 0.203, + "step": 189350 + }, + { + "epoch": 7.84, + "grad_norm": 0.40234375, + "learning_rate": 0.0004203679241350892, + "loss": 0.1832, + "step": 189360 + }, + { + "epoch": 7.84, + "grad_norm": 0.6953125, + "learning_rate": 0.00042035998697894305, + "loss": 0.1887, + "step": 189370 + }, + { + "epoch": 7.84, + "grad_norm": 0.953125, + "learning_rate": 0.00042035204950219784, + "loss": 0.2224, + "step": 189380 + }, + { + "epoch": 7.84, + "grad_norm": 0.5703125, + "learning_rate": 0.0004203441117048685, + "loss": 0.2162, + "step": 189390 + }, + { + "epoch": 7.84, + "grad_norm": 0.640625, + "learning_rate": 0.00042033617358696993, + "loss": 0.2049, + "step": 189400 + }, + { + "epoch": 7.85, + "grad_norm": 2.203125, + "learning_rate": 0.00042032823514851717, + "loss": 0.1924, + "step": 189410 + }, + { + "epoch": 7.85, + "grad_norm": 0.97265625, + "learning_rate": 0.00042032029638952497, + "loss": 0.1861, + "step": 189420 + }, + { + "epoch": 7.85, + "grad_norm": 0.388671875, + "learning_rate": 0.0004203123573100083, + "loss": 0.1613, + "step": 189430 + }, + { + "epoch": 7.85, + "grad_norm": 1.15625, + "learning_rate": 0.00042030441790998226, + "loss": 0.2218, + "step": 189440 + }, + { + "epoch": 7.85, + "grad_norm": 1.0390625, + "learning_rate": 0.00042029647818946173, + "loss": 0.1712, + "step": 189450 + }, + { + "epoch": 7.85, + "grad_norm": 0.412109375, + "learning_rate": 0.00042028853814846145, + "loss": 0.2446, + "step": 189460 + }, + { + "epoch": 7.85, + "grad_norm": 0.54296875, + "learning_rate": 0.0004202805977869967, + "loss": 0.2158, + "step": 189470 + }, + { + "epoch": 7.85, + "grad_norm": 0.31640625, + "learning_rate": 0.0004202726571050822, + "loss": 0.1896, + "step": 189480 + }, + { + "epoch": 7.85, + "grad_norm": 1.1171875, + "learning_rate": 0.00042026471610273294, + "loss": 0.2173, + "step": 189490 + }, + { + "epoch": 7.85, + "grad_norm": 0.98046875, + "learning_rate": 0.0004202567747799638, + "loss": 0.2102, + "step": 189500 + }, + { + "epoch": 7.85, + "grad_norm": 0.58984375, + "learning_rate": 0.00042024883313678994, + "loss": 0.1803, + "step": 189510 + }, + { + "epoch": 7.85, + "grad_norm": 0.54296875, + "learning_rate": 0.00042024089117322604, + "loss": 0.2023, + "step": 189520 + }, + { + "epoch": 7.85, + "grad_norm": 1.03125, + "learning_rate": 0.0004202329488892873, + "loss": 0.21, + "step": 189530 + }, + { + "epoch": 7.85, + "grad_norm": 0.8046875, + "learning_rate": 0.0004202250062849884, + "loss": 0.2549, + "step": 189540 + }, + { + "epoch": 7.85, + "grad_norm": 0.859375, + "learning_rate": 0.0004202170633603445, + "loss": 0.2001, + "step": 189550 + }, + { + "epoch": 7.85, + "grad_norm": 0.56640625, + "learning_rate": 0.0004202091201153704, + "loss": 0.1922, + "step": 189560 + }, + { + "epoch": 7.85, + "grad_norm": 0.54296875, + "learning_rate": 0.0004202011765500812, + "loss": 0.1852, + "step": 189570 + }, + { + "epoch": 7.85, + "grad_norm": 0.46875, + "learning_rate": 0.0004201932326644917, + "loss": 0.2038, + "step": 189580 + }, + { + "epoch": 7.85, + "grad_norm": 0.70703125, + "learning_rate": 0.0004201852884586169, + "loss": 0.22, + "step": 189590 + }, + { + "epoch": 7.85, + "grad_norm": 1.1953125, + "learning_rate": 0.00042017734393247184, + "loss": 0.1816, + "step": 189600 + }, + { + "epoch": 7.85, + "grad_norm": 1.4609375, + "learning_rate": 0.0004201693990860714, + "loss": 0.2005, + "step": 189610 + }, + { + "epoch": 7.85, + "grad_norm": 0.478515625, + "learning_rate": 0.0004201614539194304, + "loss": 0.1966, + "step": 189620 + }, + { + "epoch": 7.85, + "grad_norm": 0.56640625, + "learning_rate": 0.00042015350843256405, + "loss": 0.2322, + "step": 189630 + }, + { + "epoch": 7.85, + "grad_norm": 0.296875, + "learning_rate": 0.00042014556262548713, + "loss": 0.1739, + "step": 189640 + }, + { + "epoch": 7.86, + "grad_norm": 1.09375, + "learning_rate": 0.0004201376164982146, + "loss": 0.2136, + "step": 189650 + }, + { + "epoch": 7.86, + "grad_norm": 0.96875, + "learning_rate": 0.0004201296700507615, + "loss": 0.1891, + "step": 189660 + }, + { + "epoch": 7.86, + "grad_norm": 1.0078125, + "learning_rate": 0.00042012172328314277, + "loss": 0.2396, + "step": 189670 + }, + { + "epoch": 7.86, + "grad_norm": 1.234375, + "learning_rate": 0.0004201137761953733, + "loss": 0.217, + "step": 189680 + }, + { + "epoch": 7.86, + "grad_norm": 1.4296875, + "learning_rate": 0.00042010582878746806, + "loss": 0.1921, + "step": 189690 + }, + { + "epoch": 7.86, + "grad_norm": 0.498046875, + "learning_rate": 0.0004200978810594419, + "loss": 0.1925, + "step": 189700 + }, + { + "epoch": 7.86, + "grad_norm": 0.96484375, + "learning_rate": 0.00042008993301131004, + "loss": 0.2041, + "step": 189710 + }, + { + "epoch": 7.86, + "grad_norm": 0.89453125, + "learning_rate": 0.00042008198464308727, + "loss": 0.1913, + "step": 189720 + }, + { + "epoch": 7.86, + "grad_norm": 0.45703125, + "learning_rate": 0.00042007403595478856, + "loss": 0.2038, + "step": 189730 + }, + { + "epoch": 7.86, + "grad_norm": 0.76171875, + "learning_rate": 0.00042006608694642887, + "loss": 0.1833, + "step": 189740 + }, + { + "epoch": 7.86, + "grad_norm": 1.2890625, + "learning_rate": 0.0004200581376180232, + "loss": 0.1866, + "step": 189750 + }, + { + "epoch": 7.86, + "grad_norm": 1.703125, + "learning_rate": 0.0004200501879695865, + "loss": 0.256, + "step": 189760 + }, + { + "epoch": 7.86, + "grad_norm": 0.7578125, + "learning_rate": 0.00042004223800113364, + "loss": 0.1806, + "step": 189770 + }, + { + "epoch": 7.86, + "grad_norm": 0.50390625, + "learning_rate": 0.0004200342877126797, + "loss": 0.1717, + "step": 189780 + }, + { + "epoch": 7.86, + "grad_norm": 1.0234375, + "learning_rate": 0.00042002633710423954, + "loss": 0.205, + "step": 189790 + }, + { + "epoch": 7.86, + "grad_norm": 0.169921875, + "learning_rate": 0.0004200183861758282, + "loss": 0.2258, + "step": 189800 + }, + { + "epoch": 7.86, + "grad_norm": 0.486328125, + "learning_rate": 0.0004200104349274607, + "loss": 0.1883, + "step": 189810 + }, + { + "epoch": 7.86, + "grad_norm": 0.8359375, + "learning_rate": 0.0004200024833591518, + "loss": 0.1852, + "step": 189820 + }, + { + "epoch": 7.86, + "grad_norm": 0.8515625, + "learning_rate": 0.0004199945314709167, + "loss": 0.1916, + "step": 189830 + }, + { + "epoch": 7.86, + "grad_norm": 0.78125, + "learning_rate": 0.0004199865792627702, + "loss": 0.2139, + "step": 189840 + }, + { + "epoch": 7.86, + "grad_norm": 1.46875, + "learning_rate": 0.0004199786267347272, + "loss": 0.2176, + "step": 189850 + }, + { + "epoch": 7.86, + "grad_norm": 0.796875, + "learning_rate": 0.00041997067388680295, + "loss": 0.2332, + "step": 189860 + }, + { + "epoch": 7.86, + "grad_norm": 1.09375, + "learning_rate": 0.0004199627207190122, + "loss": 0.1863, + "step": 189870 + }, + { + "epoch": 7.86, + "grad_norm": 1.140625, + "learning_rate": 0.00041995476723136996, + "loss": 0.2091, + "step": 189880 + }, + { + "epoch": 7.87, + "grad_norm": 0.2041015625, + "learning_rate": 0.0004199468134238911, + "loss": 0.1961, + "step": 189890 + }, + { + "epoch": 7.87, + "grad_norm": 0.9453125, + "learning_rate": 0.0004199388592965908, + "loss": 0.1859, + "step": 189900 + }, + { + "epoch": 7.87, + "grad_norm": 0.7265625, + "learning_rate": 0.00041993090484948386, + "loss": 0.1487, + "step": 189910 + }, + { + "epoch": 7.87, + "grad_norm": 0.5078125, + "learning_rate": 0.0004199229500825854, + "loss": 0.1965, + "step": 189920 + }, + { + "epoch": 7.87, + "grad_norm": 0.392578125, + "learning_rate": 0.0004199149949959102, + "loss": 0.189, + "step": 189930 + }, + { + "epoch": 7.87, + "grad_norm": 0.4765625, + "learning_rate": 0.0004199070395894734, + "loss": 0.2095, + "step": 189940 + }, + { + "epoch": 7.87, + "grad_norm": 0.44140625, + "learning_rate": 0.00041989908386328985, + "loss": 0.2088, + "step": 189950 + }, + { + "epoch": 7.87, + "grad_norm": 0.546875, + "learning_rate": 0.0004198911278173746, + "loss": 0.1887, + "step": 189960 + }, + { + "epoch": 7.87, + "grad_norm": 0.75390625, + "learning_rate": 0.0004198831714517426, + "loss": 0.1844, + "step": 189970 + }, + { + "epoch": 7.87, + "grad_norm": 1.3125, + "learning_rate": 0.00041987521476640876, + "loss": 0.2246, + "step": 189980 + }, + { + "epoch": 7.87, + "grad_norm": 0.67578125, + "learning_rate": 0.0004198672577613881, + "loss": 0.2075, + "step": 189990 + }, + { + "epoch": 7.87, + "grad_norm": 0.369140625, + "learning_rate": 0.0004198593004366957, + "loss": 0.1689, + "step": 190000 + }, + { + "epoch": 7.87, + "grad_norm": 0.70703125, + "learning_rate": 0.0004198513427923464, + "loss": 0.217, + "step": 190010 + }, + { + "epoch": 7.87, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004198433848283552, + "loss": 0.1892, + "step": 190020 + }, + { + "epoch": 7.87, + "grad_norm": 0.921875, + "learning_rate": 0.00041983542654473716, + "loss": 0.1796, + "step": 190030 + }, + { + "epoch": 7.87, + "grad_norm": 0.478515625, + "learning_rate": 0.00041982746794150705, + "loss": 0.1759, + "step": 190040 + }, + { + "epoch": 7.87, + "grad_norm": 0.6640625, + "learning_rate": 0.0004198195090186801, + "loss": 0.1934, + "step": 190050 + }, + { + "epoch": 7.87, + "grad_norm": 0.6015625, + "learning_rate": 0.00041981154977627114, + "loss": 0.1751, + "step": 190060 + }, + { + "epoch": 7.87, + "grad_norm": 0.90234375, + "learning_rate": 0.00041980359021429514, + "loss": 0.2213, + "step": 190070 + }, + { + "epoch": 7.87, + "grad_norm": 0.9921875, + "learning_rate": 0.0004197956303327672, + "loss": 0.2154, + "step": 190080 + }, + { + "epoch": 7.87, + "grad_norm": 0.97265625, + "learning_rate": 0.00041978767013170216, + "loss": 0.215, + "step": 190090 + }, + { + "epoch": 7.87, + "grad_norm": 0.2275390625, + "learning_rate": 0.0004197797096111151, + "loss": 0.2241, + "step": 190100 + }, + { + "epoch": 7.87, + "grad_norm": 0.458984375, + "learning_rate": 0.000419771748771021, + "loss": 0.2162, + "step": 190110 + }, + { + "epoch": 7.87, + "grad_norm": 0.87109375, + "learning_rate": 0.0004197637876114347, + "loss": 0.2679, + "step": 190120 + }, + { + "epoch": 7.88, + "grad_norm": 2.40625, + "learning_rate": 0.0004197558261323713, + "loss": 0.1835, + "step": 190130 + }, + { + "epoch": 7.88, + "grad_norm": 1.1328125, + "learning_rate": 0.0004197478643338458, + "loss": 0.203, + "step": 190140 + }, + { + "epoch": 7.88, + "grad_norm": 0.68359375, + "learning_rate": 0.0004197399022158731, + "loss": 0.1648, + "step": 190150 + }, + { + "epoch": 7.88, + "grad_norm": 0.953125, + "learning_rate": 0.0004197319397784683, + "loss": 0.1911, + "step": 190160 + }, + { + "epoch": 7.88, + "grad_norm": 0.6015625, + "learning_rate": 0.0004197239770216463, + "loss": 0.2303, + "step": 190170 + }, + { + "epoch": 7.88, + "grad_norm": 1.28125, + "learning_rate": 0.0004197160139454221, + "loss": 0.1831, + "step": 190180 + }, + { + "epoch": 7.88, + "grad_norm": 0.296875, + "learning_rate": 0.00041970805054981073, + "loss": 0.195, + "step": 190190 + }, + { + "epoch": 7.88, + "grad_norm": 0.76171875, + "learning_rate": 0.0004197000868348271, + "loss": 0.2068, + "step": 190200 + }, + { + "epoch": 7.88, + "grad_norm": 0.212890625, + "learning_rate": 0.00041969212280048624, + "loss": 0.1908, + "step": 190210 + }, + { + "epoch": 7.88, + "grad_norm": 0.6015625, + "learning_rate": 0.00041968415844680307, + "loss": 0.1983, + "step": 190220 + }, + { + "epoch": 7.88, + "grad_norm": 1.390625, + "learning_rate": 0.00041967619377379276, + "loss": 0.2087, + "step": 190230 + }, + { + "epoch": 7.88, + "grad_norm": 0.6328125, + "learning_rate": 0.00041966822878147005, + "loss": 0.2413, + "step": 190240 + }, + { + "epoch": 7.88, + "grad_norm": 0.85546875, + "learning_rate": 0.0004196602634698501, + "loss": 0.1855, + "step": 190250 + }, + { + "epoch": 7.88, + "grad_norm": 0.796875, + "learning_rate": 0.00041965229783894785, + "loss": 0.1773, + "step": 190260 + }, + { + "epoch": 7.88, + "grad_norm": 0.51953125, + "learning_rate": 0.0004196443318887784, + "loss": 0.2323, + "step": 190270 + }, + { + "epoch": 7.88, + "grad_norm": 0.578125, + "learning_rate": 0.00041963636561935655, + "loss": 0.1742, + "step": 190280 + }, + { + "epoch": 7.88, + "grad_norm": 0.3671875, + "learning_rate": 0.0004196283990306974, + "loss": 0.2052, + "step": 190290 + }, + { + "epoch": 7.88, + "grad_norm": 0.80078125, + "learning_rate": 0.0004196204321228159, + "loss": 0.1884, + "step": 190300 + }, + { + "epoch": 7.88, + "grad_norm": 1.34375, + "learning_rate": 0.00041961246489572704, + "loss": 0.1787, + "step": 190310 + }, + { + "epoch": 7.88, + "grad_norm": 0.90625, + "learning_rate": 0.0004196044973494458, + "loss": 0.1576, + "step": 190320 + }, + { + "epoch": 7.88, + "grad_norm": 0.5, + "learning_rate": 0.0004195965294839873, + "loss": 0.2089, + "step": 190330 + }, + { + "epoch": 7.88, + "grad_norm": 0.7265625, + "learning_rate": 0.0004195885612993664, + "loss": 0.2101, + "step": 190340 + }, + { + "epoch": 7.88, + "grad_norm": 0.369140625, + "learning_rate": 0.00041958059279559816, + "loss": 0.2228, + "step": 190350 + }, + { + "epoch": 7.88, + "grad_norm": 0.81640625, + "learning_rate": 0.00041957262397269757, + "loss": 0.269, + "step": 190360 + }, + { + "epoch": 7.89, + "grad_norm": 0.58984375, + "learning_rate": 0.0004195646548306796, + "loss": 0.2131, + "step": 190370 + }, + { + "epoch": 7.89, + "grad_norm": 0.828125, + "learning_rate": 0.00041955668536955925, + "loss": 0.1832, + "step": 190380 + }, + { + "epoch": 7.89, + "grad_norm": 0.859375, + "learning_rate": 0.0004195487155893516, + "loss": 0.2274, + "step": 190390 + }, + { + "epoch": 7.89, + "grad_norm": 0.404296875, + "learning_rate": 0.00041954074549007146, + "loss": 0.2074, + "step": 190400 + }, + { + "epoch": 7.89, + "grad_norm": 0.65234375, + "learning_rate": 0.00041953277507173403, + "loss": 0.1988, + "step": 190410 + }, + { + "epoch": 7.89, + "grad_norm": 0.51953125, + "learning_rate": 0.0004195248043343542, + "loss": 0.1591, + "step": 190420 + }, + { + "epoch": 7.89, + "grad_norm": 0.421875, + "learning_rate": 0.000419516833277947, + "loss": 0.164, + "step": 190430 + }, + { + "epoch": 7.89, + "grad_norm": 0.41796875, + "learning_rate": 0.00041950886190252745, + "loss": 0.2142, + "step": 190440 + }, + { + "epoch": 7.89, + "grad_norm": 0.490234375, + "learning_rate": 0.0004195008902081106, + "loss": 0.1316, + "step": 190450 + }, + { + "epoch": 7.89, + "grad_norm": 2.703125, + "learning_rate": 0.0004194929181947111, + "loss": 0.1979, + "step": 190460 + }, + { + "epoch": 7.89, + "grad_norm": 0.98046875, + "learning_rate": 0.00041948494586234447, + "loss": 0.2864, + "step": 190470 + }, + { + "epoch": 7.89, + "grad_norm": 0.52734375, + "learning_rate": 0.0004194769732110254, + "loss": 0.1579, + "step": 190480 + }, + { + "epoch": 7.89, + "grad_norm": 1.0546875, + "learning_rate": 0.00041946900024076905, + "loss": 0.2155, + "step": 190490 + }, + { + "epoch": 7.89, + "grad_norm": 0.30859375, + "learning_rate": 0.00041946102695159025, + "loss": 0.2073, + "step": 190500 + }, + { + "epoch": 7.89, + "grad_norm": 0.546875, + "learning_rate": 0.0004194530533435041, + "loss": 0.2001, + "step": 190510 + }, + { + "epoch": 7.89, + "grad_norm": 0.703125, + "learning_rate": 0.00041944507941652566, + "loss": 0.1827, + "step": 190520 + }, + { + "epoch": 7.89, + "grad_norm": 0.8828125, + "learning_rate": 0.0004194371051706698, + "loss": 0.1822, + "step": 190530 + }, + { + "epoch": 7.89, + "grad_norm": 0.859375, + "learning_rate": 0.00041942913060595164, + "loss": 0.2223, + "step": 190540 + }, + { + "epoch": 7.89, + "grad_norm": 1.9296875, + "learning_rate": 0.0004194211557223862, + "loss": 0.1737, + "step": 190550 + }, + { + "epoch": 7.89, + "grad_norm": 0.78125, + "learning_rate": 0.00041941318051998843, + "loss": 0.2702, + "step": 190560 + }, + { + "epoch": 7.89, + "grad_norm": 0.294921875, + "learning_rate": 0.0004194052049987733, + "loss": 0.1848, + "step": 190570 + }, + { + "epoch": 7.89, + "grad_norm": 0.52734375, + "learning_rate": 0.0004193972291587559, + "loss": 0.2486, + "step": 190580 + }, + { + "epoch": 7.89, + "grad_norm": 0.81640625, + "learning_rate": 0.0004193892529999512, + "loss": 0.1927, + "step": 190590 + }, + { + "epoch": 7.89, + "grad_norm": 0.8125, + "learning_rate": 0.00041938127652237425, + "loss": 0.2196, + "step": 190600 + }, + { + "epoch": 7.9, + "grad_norm": 0.58984375, + "learning_rate": 0.00041937329972604, + "loss": 0.2479, + "step": 190610 + }, + { + "epoch": 7.9, + "grad_norm": 1.078125, + "learning_rate": 0.0004193653226109635, + "loss": 0.1918, + "step": 190620 + }, + { + "epoch": 7.9, + "grad_norm": 0.6640625, + "learning_rate": 0.0004193573451771597, + "loss": 0.2382, + "step": 190630 + }, + { + "epoch": 7.9, + "grad_norm": 0.85546875, + "learning_rate": 0.0004193493674246437, + "loss": 0.2106, + "step": 190640 + }, + { + "epoch": 7.9, + "grad_norm": 0.62890625, + "learning_rate": 0.0004193413893534305, + "loss": 0.2269, + "step": 190650 + }, + { + "epoch": 7.9, + "grad_norm": 0.298828125, + "learning_rate": 0.0004193334109635351, + "loss": 0.2106, + "step": 190660 + }, + { + "epoch": 7.9, + "grad_norm": 0.419921875, + "learning_rate": 0.00041932543225497244, + "loss": 0.1911, + "step": 190670 + }, + { + "epoch": 7.9, + "grad_norm": 0.83203125, + "learning_rate": 0.00041931745322775773, + "loss": 0.1814, + "step": 190680 + }, + { + "epoch": 7.9, + "grad_norm": 0.63671875, + "learning_rate": 0.0004193094738819057, + "loss": 0.1826, + "step": 190690 + }, + { + "epoch": 7.9, + "grad_norm": 0.63671875, + "learning_rate": 0.00041930149421743164, + "loss": 0.2107, + "step": 190700 + }, + { + "epoch": 7.9, + "grad_norm": 0.72265625, + "learning_rate": 0.0004192935142343504, + "loss": 0.1961, + "step": 190710 + }, + { + "epoch": 7.9, + "grad_norm": 1.0, + "learning_rate": 0.000419285533932677, + "loss": 0.2262, + "step": 190720 + }, + { + "epoch": 7.9, + "grad_norm": 0.220703125, + "learning_rate": 0.0004192775533124266, + "loss": 0.2242, + "step": 190730 + }, + { + "epoch": 7.9, + "grad_norm": 0.6640625, + "learning_rate": 0.00041926957237361405, + "loss": 0.2396, + "step": 190740 + }, + { + "epoch": 7.9, + "grad_norm": 1.15625, + "learning_rate": 0.0004192615911162545, + "loss": 0.1813, + "step": 190750 + }, + { + "epoch": 7.9, + "grad_norm": 0.34375, + "learning_rate": 0.00041925360954036297, + "loss": 0.1999, + "step": 190760 + }, + { + "epoch": 7.9, + "grad_norm": 1.0859375, + "learning_rate": 0.0004192456276459543, + "loss": 0.2079, + "step": 190770 + }, + { + "epoch": 7.9, + "grad_norm": 1.25, + "learning_rate": 0.0004192376454330437, + "loss": 0.1945, + "step": 190780 + }, + { + "epoch": 7.9, + "grad_norm": 0.73046875, + "learning_rate": 0.00041922966290164614, + "loss": 0.2174, + "step": 190790 + }, + { + "epoch": 7.9, + "grad_norm": 1.1640625, + "learning_rate": 0.0004192216800517766, + "loss": 0.1838, + "step": 190800 + }, + { + "epoch": 7.9, + "grad_norm": 0.68359375, + "learning_rate": 0.00041921369688345013, + "loss": 0.201, + "step": 190810 + }, + { + "epoch": 7.9, + "grad_norm": 0.8515625, + "learning_rate": 0.0004192057133966818, + "loss": 0.1919, + "step": 190820 + }, + { + "epoch": 7.9, + "grad_norm": 0.55078125, + "learning_rate": 0.0004191977295914865, + "loss": 0.2296, + "step": 190830 + }, + { + "epoch": 7.9, + "grad_norm": 1.1015625, + "learning_rate": 0.00041918974546787947, + "loss": 0.201, + "step": 190840 + }, + { + "epoch": 7.9, + "grad_norm": 1.484375, + "learning_rate": 0.0004191817610258755, + "loss": 0.2111, + "step": 190850 + }, + { + "epoch": 7.91, + "grad_norm": 1.125, + "learning_rate": 0.00041917377626548983, + "loss": 0.1746, + "step": 190860 + }, + { + "epoch": 7.91, + "grad_norm": 0.4921875, + "learning_rate": 0.00041916579118673726, + "loss": 0.1831, + "step": 190870 + }, + { + "epoch": 7.91, + "grad_norm": 0.263671875, + "learning_rate": 0.000419157805789633, + "loss": 0.1979, + "step": 190880 + }, + { + "epoch": 7.91, + "grad_norm": 0.4296875, + "learning_rate": 0.000419149820074192, + "loss": 0.1915, + "step": 190890 + }, + { + "epoch": 7.91, + "grad_norm": 0.8125, + "learning_rate": 0.00041914183404042927, + "loss": 0.1926, + "step": 190900 + }, + { + "epoch": 7.91, + "grad_norm": 2.90625, + "learning_rate": 0.00041913384768835996, + "loss": 0.199, + "step": 190910 + }, + { + "epoch": 7.91, + "grad_norm": 1.1171875, + "learning_rate": 0.000419125861017999, + "loss": 0.2421, + "step": 190920 + }, + { + "epoch": 7.91, + "grad_norm": 0.65234375, + "learning_rate": 0.0004191178740293614, + "loss": 0.2035, + "step": 190930 + }, + { + "epoch": 7.91, + "grad_norm": 0.51171875, + "learning_rate": 0.00041910988672246225, + "loss": 0.1902, + "step": 190940 + }, + { + "epoch": 7.91, + "grad_norm": 0.51953125, + "learning_rate": 0.0004191018990973166, + "loss": 0.1495, + "step": 190950 + }, + { + "epoch": 7.91, + "grad_norm": 0.58203125, + "learning_rate": 0.0004190939111539393, + "loss": 0.2041, + "step": 190960 + }, + { + "epoch": 7.91, + "grad_norm": 1.125, + "learning_rate": 0.0004190859228923456, + "loss": 0.1581, + "step": 190970 + }, + { + "epoch": 7.91, + "grad_norm": 0.921875, + "learning_rate": 0.0004190779343125504, + "loss": 0.2482, + "step": 190980 + }, + { + "epoch": 7.91, + "grad_norm": 0.51171875, + "learning_rate": 0.0004190699454145689, + "loss": 0.2382, + "step": 190990 + }, + { + "epoch": 7.91, + "grad_norm": 0.921875, + "learning_rate": 0.00041906195619841594, + "loss": 0.1871, + "step": 191000 + }, + { + "epoch": 7.91, + "grad_norm": 0.435546875, + "learning_rate": 0.0004190539666641067, + "loss": 0.1768, + "step": 191010 + }, + { + "epoch": 7.91, + "grad_norm": 0.27734375, + "learning_rate": 0.0004190459768116561, + "loss": 0.2078, + "step": 191020 + }, + { + "epoch": 7.91, + "grad_norm": 0.41015625, + "learning_rate": 0.00041903798664107915, + "loss": 0.1986, + "step": 191030 + }, + { + "epoch": 7.91, + "grad_norm": 0.380859375, + "learning_rate": 0.0004190299961523911, + "loss": 0.1867, + "step": 191040 + }, + { + "epoch": 7.91, + "grad_norm": 1.0703125, + "learning_rate": 0.0004190220053456068, + "loss": 0.1879, + "step": 191050 + }, + { + "epoch": 7.91, + "grad_norm": 0.48046875, + "learning_rate": 0.00041901401422074133, + "loss": 0.1675, + "step": 191060 + }, + { + "epoch": 7.91, + "grad_norm": 1.328125, + "learning_rate": 0.00041900602277780973, + "loss": 0.1978, + "step": 191070 + }, + { + "epoch": 7.91, + "grad_norm": 1.21875, + "learning_rate": 0.00041899803101682703, + "loss": 0.2328, + "step": 191080 + }, + { + "epoch": 7.91, + "grad_norm": 0.62890625, + "learning_rate": 0.0004189900389378083, + "loss": 0.1927, + "step": 191090 + }, + { + "epoch": 7.92, + "grad_norm": 0.470703125, + "learning_rate": 0.00041898204654076867, + "loss": 0.2165, + "step": 191100 + }, + { + "epoch": 7.92, + "grad_norm": 0.3984375, + "learning_rate": 0.0004189740538257229, + "loss": 0.2376, + "step": 191110 + }, + { + "epoch": 7.92, + "grad_norm": 1.15625, + "learning_rate": 0.00041896606079268637, + "loss": 0.164, + "step": 191120 + }, + { + "epoch": 7.92, + "grad_norm": 1.6015625, + "learning_rate": 0.00041895806744167386, + "loss": 0.2521, + "step": 191130 + }, + { + "epoch": 7.92, + "grad_norm": 0.58984375, + "learning_rate": 0.0004189500737727005, + "loss": 0.2214, + "step": 191140 + }, + { + "epoch": 7.92, + "grad_norm": 0.64453125, + "learning_rate": 0.0004189420797857814, + "loss": 0.2204, + "step": 191150 + }, + { + "epoch": 7.92, + "grad_norm": 0.88671875, + "learning_rate": 0.00041893408548093156, + "loss": 0.2433, + "step": 191160 + }, + { + "epoch": 7.92, + "grad_norm": 0.8515625, + "learning_rate": 0.00041892609085816596, + "loss": 0.2074, + "step": 191170 + }, + { + "epoch": 7.92, + "grad_norm": 0.80078125, + "learning_rate": 0.00041891809591749975, + "loss": 0.2262, + "step": 191180 + }, + { + "epoch": 7.92, + "grad_norm": 0.53125, + "learning_rate": 0.00041891010065894785, + "loss": 0.2014, + "step": 191190 + }, + { + "epoch": 7.92, + "grad_norm": 0.470703125, + "learning_rate": 0.00041890210508252544, + "loss": 0.225, + "step": 191200 + }, + { + "epoch": 7.92, + "grad_norm": 0.40234375, + "learning_rate": 0.00041889410918824753, + "loss": 0.2026, + "step": 191210 + }, + { + "epoch": 7.92, + "grad_norm": 1.265625, + "learning_rate": 0.0004188861129761291, + "loss": 0.1806, + "step": 191220 + }, + { + "epoch": 7.92, + "grad_norm": 0.28515625, + "learning_rate": 0.0004188781164461853, + "loss": 0.1835, + "step": 191230 + }, + { + "epoch": 7.92, + "grad_norm": 1.4609375, + "learning_rate": 0.00041887011959843105, + "loss": 0.2236, + "step": 191240 + }, + { + "epoch": 7.92, + "grad_norm": 0.44140625, + "learning_rate": 0.00041886212243288153, + "loss": 0.1469, + "step": 191250 + }, + { + "epoch": 7.92, + "grad_norm": 0.85546875, + "learning_rate": 0.0004188541249495517, + "loss": 0.185, + "step": 191260 + }, + { + "epoch": 7.92, + "grad_norm": 0.828125, + "learning_rate": 0.0004188461271484566, + "loss": 0.1891, + "step": 191270 + }, + { + "epoch": 7.92, + "grad_norm": 0.73828125, + "learning_rate": 0.0004188381290296114, + "loss": 0.2361, + "step": 191280 + }, + { + "epoch": 7.92, + "grad_norm": 0.76171875, + "learning_rate": 0.00041883013059303104, + "loss": 0.181, + "step": 191290 + }, + { + "epoch": 7.92, + "grad_norm": 1.109375, + "learning_rate": 0.00041882213183873064, + "loss": 0.1771, + "step": 191300 + }, + { + "epoch": 7.92, + "grad_norm": 1.25, + "learning_rate": 0.0004188141327667252, + "loss": 0.2119, + "step": 191310 + }, + { + "epoch": 7.92, + "grad_norm": 0.7109375, + "learning_rate": 0.00041880613337702977, + "loss": 0.1855, + "step": 191320 + }, + { + "epoch": 7.92, + "grad_norm": 0.5234375, + "learning_rate": 0.00041879813366965945, + "loss": 0.1699, + "step": 191330 + }, + { + "epoch": 7.93, + "grad_norm": 1.1640625, + "learning_rate": 0.00041879013364462927, + "loss": 0.2044, + "step": 191340 + }, + { + "epoch": 7.93, + "grad_norm": 2.15625, + "learning_rate": 0.00041878213330195425, + "loss": 0.1976, + "step": 191350 + }, + { + "epoch": 7.93, + "grad_norm": 0.765625, + "learning_rate": 0.0004187741326416495, + "loss": 0.2252, + "step": 191360 + }, + { + "epoch": 7.93, + "grad_norm": 1.0, + "learning_rate": 0.00041876613166373004, + "loss": 0.2416, + "step": 191370 + }, + { + "epoch": 7.93, + "grad_norm": 0.32421875, + "learning_rate": 0.000418758130368211, + "loss": 0.146, + "step": 191380 + }, + { + "epoch": 7.93, + "grad_norm": 0.7109375, + "learning_rate": 0.00041875012875510734, + "loss": 0.1756, + "step": 191390 + }, + { + "epoch": 7.93, + "grad_norm": 0.77734375, + "learning_rate": 0.0004187421268244342, + "loss": 0.2243, + "step": 191400 + }, + { + "epoch": 7.93, + "grad_norm": 0.55078125, + "learning_rate": 0.0004187341245762066, + "loss": 0.1921, + "step": 191410 + }, + { + "epoch": 7.93, + "grad_norm": 0.765625, + "learning_rate": 0.0004187261220104396, + "loss": 0.2433, + "step": 191420 + }, + { + "epoch": 7.93, + "grad_norm": 0.5703125, + "learning_rate": 0.0004187181191271482, + "loss": 0.2161, + "step": 191430 + }, + { + "epoch": 7.93, + "grad_norm": 0.4296875, + "learning_rate": 0.00041871011592634755, + "loss": 0.2104, + "step": 191440 + }, + { + "epoch": 7.93, + "grad_norm": 0.828125, + "learning_rate": 0.00041870211240805266, + "loss": 0.2269, + "step": 191450 + }, + { + "epoch": 7.93, + "grad_norm": 0.72265625, + "learning_rate": 0.0004186941085722786, + "loss": 0.254, + "step": 191460 + }, + { + "epoch": 7.93, + "grad_norm": 0.54296875, + "learning_rate": 0.00041868610441904056, + "loss": 0.1876, + "step": 191470 + }, + { + "epoch": 7.93, + "grad_norm": 1.5, + "learning_rate": 0.0004186780999483534, + "loss": 0.164, + "step": 191480 + }, + { + "epoch": 7.93, + "grad_norm": 0.1962890625, + "learning_rate": 0.00041867009516023223, + "loss": 0.2153, + "step": 191490 + }, + { + "epoch": 7.93, + "grad_norm": 1.1484375, + "learning_rate": 0.00041866209005469226, + "loss": 0.2193, + "step": 191500 + }, + { + "epoch": 7.93, + "grad_norm": 0.7109375, + "learning_rate": 0.00041865408463174835, + "loss": 0.1539, + "step": 191510 + }, + { + "epoch": 7.93, + "grad_norm": 0.8359375, + "learning_rate": 0.0004186460788914157, + "loss": 0.1747, + "step": 191520 + }, + { + "epoch": 7.93, + "grad_norm": 0.359375, + "learning_rate": 0.00041863807283370937, + "loss": 0.1837, + "step": 191530 + }, + { + "epoch": 7.93, + "grad_norm": 0.57421875, + "learning_rate": 0.00041863006645864444, + "loss": 0.1672, + "step": 191540 + }, + { + "epoch": 7.93, + "grad_norm": 0.4609375, + "learning_rate": 0.00041862205976623586, + "loss": 0.2033, + "step": 191550 + }, + { + "epoch": 7.93, + "grad_norm": 1.1640625, + "learning_rate": 0.00041861405275649876, + "loss": 0.1988, + "step": 191560 + }, + { + "epoch": 7.93, + "grad_norm": 0.7578125, + "learning_rate": 0.00041860604542944826, + "loss": 0.208, + "step": 191570 + }, + { + "epoch": 7.94, + "grad_norm": 0.54296875, + "learning_rate": 0.0004185980377850994, + "loss": 0.1693, + "step": 191580 + }, + { + "epoch": 7.94, + "grad_norm": 0.6171875, + "learning_rate": 0.00041859002982346714, + "loss": 0.2366, + "step": 191590 + }, + { + "epoch": 7.94, + "grad_norm": 0.66796875, + "learning_rate": 0.00041858202154456683, + "loss": 0.249, + "step": 191600 + }, + { + "epoch": 7.94, + "grad_norm": 0.51171875, + "learning_rate": 0.00041857401294841324, + "loss": 0.1616, + "step": 191610 + }, + { + "epoch": 7.94, + "grad_norm": 1.703125, + "learning_rate": 0.0004185660040350216, + "loss": 0.2067, + "step": 191620 + }, + { + "epoch": 7.94, + "grad_norm": 1.109375, + "learning_rate": 0.00041855799480440694, + "loss": 0.1639, + "step": 191630 + }, + { + "epoch": 7.94, + "grad_norm": 0.77734375, + "learning_rate": 0.00041854998525658427, + "loss": 0.171, + "step": 191640 + }, + { + "epoch": 7.94, + "grad_norm": 0.453125, + "learning_rate": 0.0004185419753915688, + "loss": 0.1832, + "step": 191650 + }, + { + "epoch": 7.94, + "grad_norm": 0.8828125, + "learning_rate": 0.00041853396520937555, + "loss": 0.1951, + "step": 191660 + }, + { + "epoch": 7.94, + "grad_norm": 0.55078125, + "learning_rate": 0.00041852595471001953, + "loss": 0.2177, + "step": 191670 + }, + { + "epoch": 7.94, + "grad_norm": 0.50390625, + "learning_rate": 0.0004185179438935159, + "loss": 0.2013, + "step": 191680 + }, + { + "epoch": 7.94, + "grad_norm": 0.5234375, + "learning_rate": 0.0004185099327598797, + "loss": 0.258, + "step": 191690 + }, + { + "epoch": 7.94, + "grad_norm": 0.498046875, + "learning_rate": 0.00041850192130912594, + "loss": 0.1821, + "step": 191700 + }, + { + "epoch": 7.94, + "grad_norm": 1.328125, + "learning_rate": 0.0004184939095412698, + "loss": 0.2399, + "step": 191710 + }, + { + "epoch": 7.94, + "grad_norm": 1.609375, + "learning_rate": 0.0004184858974563264, + "loss": 0.2082, + "step": 191720 + }, + { + "epoch": 7.94, + "grad_norm": 0.87109375, + "learning_rate": 0.0004184778850543106, + "loss": 0.1597, + "step": 191730 + }, + { + "epoch": 7.94, + "grad_norm": 0.5625, + "learning_rate": 0.00041846987233523766, + "loss": 0.1806, + "step": 191740 + }, + { + "epoch": 7.94, + "grad_norm": 0.90234375, + "learning_rate": 0.0004184618592991226, + "loss": 0.2155, + "step": 191750 + }, + { + "epoch": 7.94, + "grad_norm": 1.2578125, + "learning_rate": 0.0004184538459459806, + "loss": 0.218, + "step": 191760 + }, + { + "epoch": 7.94, + "grad_norm": 1.453125, + "learning_rate": 0.0004184458322758266, + "loss": 0.1696, + "step": 191770 + }, + { + "epoch": 7.94, + "grad_norm": 0.326171875, + "learning_rate": 0.00041843781828867566, + "loss": 0.2115, + "step": 191780 + }, + { + "epoch": 7.94, + "grad_norm": 0.7578125, + "learning_rate": 0.000418429803984543, + "loss": 0.196, + "step": 191790 + }, + { + "epoch": 7.94, + "grad_norm": 0.5390625, + "learning_rate": 0.0004184217893634437, + "loss": 0.193, + "step": 191800 + }, + { + "epoch": 7.94, + "grad_norm": 1.5078125, + "learning_rate": 0.0004184137744253927, + "loss": 0.1948, + "step": 191810 + }, + { + "epoch": 7.95, + "grad_norm": 0.40234375, + "learning_rate": 0.00041840575917040515, + "loss": 0.1837, + "step": 191820 + }, + { + "epoch": 7.95, + "grad_norm": 0.96484375, + "learning_rate": 0.0004183977435984962, + "loss": 0.2227, + "step": 191830 + }, + { + "epoch": 7.95, + "grad_norm": 0.7109375, + "learning_rate": 0.0004183897277096809, + "loss": 0.1926, + "step": 191840 + }, + { + "epoch": 7.95, + "grad_norm": 0.416015625, + "learning_rate": 0.0004183817115039742, + "loss": 0.1631, + "step": 191850 + }, + { + "epoch": 7.95, + "grad_norm": 0.2373046875, + "learning_rate": 0.00041837369498139143, + "loss": 0.216, + "step": 191860 + }, + { + "epoch": 7.95, + "grad_norm": 0.296875, + "learning_rate": 0.00041836567814194746, + "loss": 0.1546, + "step": 191870 + }, + { + "epoch": 7.95, + "grad_norm": 0.6953125, + "learning_rate": 0.00041835766098565754, + "loss": 0.2658, + "step": 191880 + }, + { + "epoch": 7.95, + "grad_norm": 0.71484375, + "learning_rate": 0.0004183496435125367, + "loss": 0.2032, + "step": 191890 + }, + { + "epoch": 7.95, + "grad_norm": 0.69921875, + "learning_rate": 0.00041834162572259995, + "loss": 0.1889, + "step": 191900 + }, + { + "epoch": 7.95, + "grad_norm": 0.5390625, + "learning_rate": 0.00041833360761586236, + "loss": 0.2201, + "step": 191910 + }, + { + "epoch": 7.95, + "grad_norm": 0.734375, + "learning_rate": 0.0004183255891923392, + "loss": 0.2633, + "step": 191920 + }, + { + "epoch": 7.95, + "grad_norm": 0.58984375, + "learning_rate": 0.0004183175704520455, + "loss": 0.216, + "step": 191930 + }, + { + "epoch": 7.95, + "grad_norm": 0.75, + "learning_rate": 0.0004183095513949963, + "loss": 0.2203, + "step": 191940 + }, + { + "epoch": 7.95, + "grad_norm": 0.53125, + "learning_rate": 0.0004183015320212066, + "loss": 0.191, + "step": 191950 + }, + { + "epoch": 7.95, + "grad_norm": 1.28125, + "learning_rate": 0.0004182935123306917, + "loss": 0.2076, + "step": 191960 + }, + { + "epoch": 7.95, + "grad_norm": 0.5703125, + "learning_rate": 0.00041828549232346646, + "loss": 0.1701, + "step": 191970 + }, + { + "epoch": 7.95, + "grad_norm": 0.8359375, + "learning_rate": 0.0004182774719995462, + "loss": 0.2028, + "step": 191980 + }, + { + "epoch": 7.95, + "grad_norm": 0.62109375, + "learning_rate": 0.0004182694513589459, + "loss": 0.1824, + "step": 191990 + }, + { + "epoch": 7.95, + "grad_norm": 0.84375, + "learning_rate": 0.0004182614304016806, + "loss": 0.2164, + "step": 192000 + }, + { + "epoch": 7.95, + "grad_norm": 0.2275390625, + "learning_rate": 0.00041825340912776557, + "loss": 0.1746, + "step": 192010 + }, + { + "epoch": 7.95, + "grad_norm": 1.21875, + "learning_rate": 0.0004182453875372157, + "loss": 0.235, + "step": 192020 + }, + { + "epoch": 7.95, + "grad_norm": 0.62109375, + "learning_rate": 0.00041823736563004616, + "loss": 0.1766, + "step": 192030 + }, + { + "epoch": 7.95, + "grad_norm": 1.359375, + "learning_rate": 0.00041822934340627217, + "loss": 0.2235, + "step": 192040 + }, + { + "epoch": 7.95, + "grad_norm": 0.796875, + "learning_rate": 0.0004182213208659086, + "loss": 0.2378, + "step": 192050 + }, + { + "epoch": 7.96, + "grad_norm": 0.357421875, + "learning_rate": 0.0004182132980089708, + "loss": 0.1959, + "step": 192060 + }, + { + "epoch": 7.96, + "grad_norm": 0.4609375, + "learning_rate": 0.00041820527483547366, + "loss": 0.1936, + "step": 192070 + }, + { + "epoch": 7.96, + "grad_norm": 1.0390625, + "learning_rate": 0.0004181972513454323, + "loss": 0.2154, + "step": 192080 + }, + { + "epoch": 7.96, + "grad_norm": 0.73046875, + "learning_rate": 0.000418189227538862, + "loss": 0.1732, + "step": 192090 + }, + { + "epoch": 7.96, + "grad_norm": 0.4296875, + "learning_rate": 0.0004181812034157777, + "loss": 0.172, + "step": 192100 + }, + { + "epoch": 7.96, + "grad_norm": 0.83203125, + "learning_rate": 0.00041817317897619447, + "loss": 0.2558, + "step": 192110 + }, + { + "epoch": 7.96, + "grad_norm": 0.6640625, + "learning_rate": 0.00041816515422012757, + "loss": 0.2587, + "step": 192120 + }, + { + "epoch": 7.96, + "grad_norm": 0.2333984375, + "learning_rate": 0.000418157129147592, + "loss": 0.199, + "step": 192130 + }, + { + "epoch": 7.96, + "grad_norm": 0.81640625, + "learning_rate": 0.0004181491037586028, + "loss": 0.1896, + "step": 192140 + }, + { + "epoch": 7.96, + "grad_norm": 0.1494140625, + "learning_rate": 0.0004181410780531752, + "loss": 0.2129, + "step": 192150 + }, + { + "epoch": 7.96, + "grad_norm": 0.6640625, + "learning_rate": 0.00041813305203132424, + "loss": 0.1872, + "step": 192160 + }, + { + "epoch": 7.96, + "grad_norm": 0.89453125, + "learning_rate": 0.000418125025693065, + "loss": 0.1552, + "step": 192170 + }, + { + "epoch": 7.96, + "grad_norm": 0.6875, + "learning_rate": 0.00041811699903841266, + "loss": 0.187, + "step": 192180 + }, + { + "epoch": 7.96, + "grad_norm": 0.6640625, + "learning_rate": 0.00041810897206738225, + "loss": 0.2053, + "step": 192190 + }, + { + "epoch": 7.96, + "grad_norm": 1.0234375, + "learning_rate": 0.00041810094477998897, + "loss": 0.166, + "step": 192200 + }, + { + "epoch": 7.96, + "grad_norm": 1.7578125, + "learning_rate": 0.00041809291717624777, + "loss": 0.1862, + "step": 192210 + }, + { + "epoch": 7.96, + "grad_norm": 0.68359375, + "learning_rate": 0.00041808488925617395, + "loss": 0.1904, + "step": 192220 + }, + { + "epoch": 7.96, + "grad_norm": 0.427734375, + "learning_rate": 0.00041807686101978246, + "loss": 0.2326, + "step": 192230 + }, + { + "epoch": 7.96, + "grad_norm": 1.2265625, + "learning_rate": 0.0004180688324670885, + "loss": 0.1963, + "step": 192240 + }, + { + "epoch": 7.96, + "grad_norm": 0.453125, + "learning_rate": 0.0004180608035981071, + "loss": 0.1801, + "step": 192250 + }, + { + "epoch": 7.96, + "grad_norm": 0.361328125, + "learning_rate": 0.0004180527744128534, + "loss": 0.1925, + "step": 192260 + }, + { + "epoch": 7.96, + "grad_norm": 1.2734375, + "learning_rate": 0.0004180447449113426, + "loss": 0.2057, + "step": 192270 + }, + { + "epoch": 7.96, + "grad_norm": 0.51953125, + "learning_rate": 0.00041803671509358975, + "loss": 0.186, + "step": 192280 + }, + { + "epoch": 7.96, + "grad_norm": 1.09375, + "learning_rate": 0.0004180286849596099, + "loss": 0.1662, + "step": 192290 + }, + { + "epoch": 7.97, + "grad_norm": 3.0625, + "learning_rate": 0.00041802065450941825, + "loss": 0.2118, + "step": 192300 + }, + { + "epoch": 7.97, + "grad_norm": 0.5234375, + "learning_rate": 0.0004180126237430298, + "loss": 0.1827, + "step": 192310 + }, + { + "epoch": 7.97, + "grad_norm": 0.9296875, + "learning_rate": 0.0004180045926604598, + "loss": 0.2421, + "step": 192320 + }, + { + "epoch": 7.97, + "grad_norm": 0.55859375, + "learning_rate": 0.00041799656126172326, + "loss": 0.1759, + "step": 192330 + }, + { + "epoch": 7.97, + "grad_norm": 1.1015625, + "learning_rate": 0.00041798852954683536, + "loss": 0.241, + "step": 192340 + }, + { + "epoch": 7.97, + "grad_norm": 0.828125, + "learning_rate": 0.0004179804975158112, + "loss": 0.2204, + "step": 192350 + }, + { + "epoch": 7.97, + "grad_norm": 0.65234375, + "learning_rate": 0.00041797246516866586, + "loss": 0.2007, + "step": 192360 + }, + { + "epoch": 7.97, + "grad_norm": 0.466796875, + "learning_rate": 0.0004179644325054145, + "loss": 0.2007, + "step": 192370 + }, + { + "epoch": 7.97, + "grad_norm": 0.66015625, + "learning_rate": 0.00041795639952607225, + "loss": 0.1831, + "step": 192380 + }, + { + "epoch": 7.97, + "grad_norm": 0.46875, + "learning_rate": 0.0004179483662306541, + "loss": 0.2588, + "step": 192390 + }, + { + "epoch": 7.97, + "grad_norm": 1.25, + "learning_rate": 0.0004179403326191753, + "loss": 0.2323, + "step": 192400 + }, + { + "epoch": 7.97, + "grad_norm": 0.4765625, + "learning_rate": 0.00041793229869165093, + "loss": 0.2522, + "step": 192410 + }, + { + "epoch": 7.97, + "grad_norm": 0.62109375, + "learning_rate": 0.0004179242644480961, + "loss": 0.1905, + "step": 192420 + }, + { + "epoch": 7.97, + "grad_norm": 0.45703125, + "learning_rate": 0.00041791622988852597, + "loss": 0.1816, + "step": 192430 + }, + { + "epoch": 7.97, + "grad_norm": 1.4921875, + "learning_rate": 0.00041790819501295564, + "loss": 0.1632, + "step": 192440 + }, + { + "epoch": 7.97, + "grad_norm": 0.546875, + "learning_rate": 0.00041790015982140017, + "loss": 0.1863, + "step": 192450 + }, + { + "epoch": 7.97, + "grad_norm": 0.47265625, + "learning_rate": 0.0004178921243138748, + "loss": 0.1674, + "step": 192460 + }, + { + "epoch": 7.97, + "grad_norm": 0.7578125, + "learning_rate": 0.0004178840884903945, + "loss": 0.1766, + "step": 192470 + }, + { + "epoch": 7.97, + "grad_norm": 0.8515625, + "learning_rate": 0.0004178760523509745, + "loss": 0.1838, + "step": 192480 + }, + { + "epoch": 7.97, + "grad_norm": 0.6875, + "learning_rate": 0.0004178680158956299, + "loss": 0.1904, + "step": 192490 + }, + { + "epoch": 7.97, + "grad_norm": 0.6015625, + "learning_rate": 0.00041785997912437584, + "loss": 0.1751, + "step": 192500 + }, + { + "epoch": 7.97, + "grad_norm": 1.65625, + "learning_rate": 0.0004178519420372274, + "loss": 0.1918, + "step": 192510 + }, + { + "epoch": 7.97, + "grad_norm": 0.7265625, + "learning_rate": 0.00041784390463419973, + "loss": 0.1474, + "step": 192520 + }, + { + "epoch": 7.97, + "grad_norm": 0.5, + "learning_rate": 0.00041783586691530807, + "loss": 0.1898, + "step": 192530 + }, + { + "epoch": 7.97, + "grad_norm": 0.53125, + "learning_rate": 0.00041782782888056733, + "loss": 0.2115, + "step": 192540 + }, + { + "epoch": 7.98, + "grad_norm": 1.1484375, + "learning_rate": 0.00041781979052999275, + "loss": 0.1813, + "step": 192550 + }, + { + "epoch": 7.98, + "grad_norm": 0.67578125, + "learning_rate": 0.00041781175186359946, + "loss": 0.2451, + "step": 192560 + }, + { + "epoch": 7.98, + "grad_norm": 0.87109375, + "learning_rate": 0.0004178037128814025, + "loss": 0.2411, + "step": 192570 + }, + { + "epoch": 7.98, + "grad_norm": 0.60546875, + "learning_rate": 0.00041779567358341717, + "loss": 0.1991, + "step": 192580 + }, + { + "epoch": 7.98, + "grad_norm": 0.462890625, + "learning_rate": 0.0004177876339696585, + "loss": 0.2264, + "step": 192590 + }, + { + "epoch": 7.98, + "grad_norm": 0.8359375, + "learning_rate": 0.0004177795940401415, + "loss": 0.1929, + "step": 192600 + }, + { + "epoch": 7.98, + "grad_norm": 0.396484375, + "learning_rate": 0.0004177715537948816, + "loss": 0.1702, + "step": 192610 + }, + { + "epoch": 7.98, + "grad_norm": 0.82421875, + "learning_rate": 0.0004177635132338936, + "loss": 0.2153, + "step": 192620 + }, + { + "epoch": 7.98, + "grad_norm": 0.93359375, + "learning_rate": 0.0004177554723571929, + "loss": 0.1919, + "step": 192630 + }, + { + "epoch": 7.98, + "grad_norm": 0.8515625, + "learning_rate": 0.00041774743116479446, + "loss": 0.1881, + "step": 192640 + }, + { + "epoch": 7.98, + "grad_norm": 0.82421875, + "learning_rate": 0.00041773938965671356, + "loss": 0.1973, + "step": 192650 + }, + { + "epoch": 7.98, + "grad_norm": 0.4609375, + "learning_rate": 0.0004177313478329651, + "loss": 0.1812, + "step": 192660 + }, + { + "epoch": 7.98, + "grad_norm": 1.21875, + "learning_rate": 0.00041772330569356443, + "loss": 0.2334, + "step": 192670 + }, + { + "epoch": 7.98, + "grad_norm": 0.9765625, + "learning_rate": 0.0004177152632385266, + "loss": 0.2126, + "step": 192680 + }, + { + "epoch": 7.98, + "grad_norm": 0.69921875, + "learning_rate": 0.00041770722046786675, + "loss": 0.2443, + "step": 192690 + }, + { + "epoch": 7.98, + "grad_norm": 0.65625, + "learning_rate": 0.0004176991773816, + "loss": 0.2481, + "step": 192700 + }, + { + "epoch": 7.98, + "grad_norm": 1.359375, + "learning_rate": 0.00041769113397974156, + "loss": 0.1789, + "step": 192710 + }, + { + "epoch": 7.98, + "grad_norm": 1.1484375, + "learning_rate": 0.0004176830902623065, + "loss": 0.2678, + "step": 192720 + }, + { + "epoch": 7.98, + "grad_norm": 0.55859375, + "learning_rate": 0.00041767504622931, + "loss": 0.2315, + "step": 192730 + }, + { + "epoch": 7.98, + "grad_norm": 0.439453125, + "learning_rate": 0.0004176670018807671, + "loss": 0.1996, + "step": 192740 + }, + { + "epoch": 7.98, + "grad_norm": 0.80078125, + "learning_rate": 0.000417658957216693, + "loss": 0.2154, + "step": 192750 + }, + { + "epoch": 7.98, + "grad_norm": 0.5703125, + "learning_rate": 0.00041765091223710296, + "loss": 0.2143, + "step": 192760 + }, + { + "epoch": 7.98, + "grad_norm": 1.21875, + "learning_rate": 0.0004176428669420119, + "loss": 0.2121, + "step": 192770 + }, + { + "epoch": 7.98, + "grad_norm": 0.6328125, + "learning_rate": 0.00041763482133143516, + "loss": 0.2192, + "step": 192780 + }, + { + "epoch": 7.99, + "grad_norm": 0.85546875, + "learning_rate": 0.0004176267754053877, + "loss": 0.1811, + "step": 192790 + }, + { + "epoch": 7.99, + "grad_norm": 0.828125, + "learning_rate": 0.0004176187291638848, + "loss": 0.2041, + "step": 192800 + }, + { + "epoch": 7.99, + "grad_norm": 0.90234375, + "learning_rate": 0.00041761068260694147, + "loss": 0.1974, + "step": 192810 + }, + { + "epoch": 7.99, + "grad_norm": 0.515625, + "learning_rate": 0.00041760263573457305, + "loss": 0.2134, + "step": 192820 + }, + { + "epoch": 7.99, + "grad_norm": 0.6640625, + "learning_rate": 0.00041759458854679455, + "loss": 0.153, + "step": 192830 + }, + { + "epoch": 7.99, + "grad_norm": 0.404296875, + "learning_rate": 0.0004175865410436211, + "loss": 0.2015, + "step": 192840 + }, + { + "epoch": 7.99, + "grad_norm": 0.54296875, + "learning_rate": 0.00041757849322506793, + "loss": 0.2041, + "step": 192850 + }, + { + "epoch": 7.99, + "grad_norm": 0.6953125, + "learning_rate": 0.00041757044509115005, + "loss": 0.2355, + "step": 192860 + }, + { + "epoch": 7.99, + "grad_norm": 0.52734375, + "learning_rate": 0.0004175623966418827, + "loss": 0.2236, + "step": 192870 + }, + { + "epoch": 7.99, + "grad_norm": 0.97265625, + "learning_rate": 0.00041755434787728107, + "loss": 0.1613, + "step": 192880 + }, + { + "epoch": 7.99, + "grad_norm": 0.3828125, + "learning_rate": 0.00041754629879736015, + "loss": 0.1834, + "step": 192890 + }, + { + "epoch": 7.99, + "grad_norm": 0.89453125, + "learning_rate": 0.00041753824940213535, + "loss": 0.1789, + "step": 192900 + }, + { + "epoch": 7.99, + "grad_norm": 0.93359375, + "learning_rate": 0.0004175301996916215, + "loss": 0.2096, + "step": 192910 + }, + { + "epoch": 7.99, + "grad_norm": 0.330078125, + "learning_rate": 0.000417522149665834, + "loss": 0.2103, + "step": 192920 + }, + { + "epoch": 7.99, + "grad_norm": 1.125, + "learning_rate": 0.0004175140993247879, + "loss": 0.2354, + "step": 192930 + }, + { + "epoch": 7.99, + "grad_norm": 0.322265625, + "learning_rate": 0.0004175060486684984, + "loss": 0.2248, + "step": 192940 + }, + { + "epoch": 7.99, + "grad_norm": 1.15625, + "learning_rate": 0.0004174979976969805, + "loss": 0.2445, + "step": 192950 + }, + { + "epoch": 7.99, + "grad_norm": 0.60546875, + "learning_rate": 0.0004174899464102495, + "loss": 0.2036, + "step": 192960 + }, + { + "epoch": 7.99, + "grad_norm": 0.65625, + "learning_rate": 0.0004174818948083204, + "loss": 0.2133, + "step": 192970 + }, + { + "epoch": 7.99, + "grad_norm": 0.458984375, + "learning_rate": 0.00041747384289120867, + "loss": 0.181, + "step": 192980 + }, + { + "epoch": 7.99, + "grad_norm": 2.5625, + "learning_rate": 0.00041746579065892917, + "loss": 0.2051, + "step": 192990 + }, + { + "epoch": 7.99, + "grad_norm": 0.77734375, + "learning_rate": 0.00041745773811149715, + "loss": 0.1873, + "step": 193000 + }, + { + "epoch": 7.99, + "grad_norm": 0.828125, + "learning_rate": 0.00041744968524892766, + "loss": 0.1764, + "step": 193010 + }, + { + "epoch": 7.99, + "grad_norm": 0.91796875, + "learning_rate": 0.00041744163207123596, + "loss": 0.1604, + "step": 193020 + }, + { + "epoch": 8.0, + "grad_norm": 0.66796875, + "learning_rate": 0.0004174335785784372, + "loss": 0.207, + "step": 193030 + }, + { + "epoch": 8.0, + "grad_norm": 0.75390625, + "learning_rate": 0.00041742552477054665, + "loss": 0.2188, + "step": 193040 + }, + { + "epoch": 8.0, + "grad_norm": 0.890625, + "learning_rate": 0.00041741747064757924, + "loss": 0.2048, + "step": 193050 + }, + { + "epoch": 8.0, + "grad_norm": 0.578125, + "learning_rate": 0.0004174094162095502, + "loss": 0.2015, + "step": 193060 + }, + { + "epoch": 8.0, + "grad_norm": 0.5703125, + "learning_rate": 0.0004174013614564748, + "loss": 0.2668, + "step": 193070 + }, + { + "epoch": 8.0, + "grad_norm": 0.57421875, + "learning_rate": 0.000417393306388368, + "loss": 0.2057, + "step": 193080 + }, + { + "epoch": 8.0, + "grad_norm": 1.0859375, + "learning_rate": 0.0004173852510052453, + "loss": 0.2113, + "step": 193090 + }, + { + "epoch": 8.0, + "grad_norm": 0.75390625, + "learning_rate": 0.00041737719530712136, + "loss": 0.2036, + "step": 193100 + }, + { + "epoch": 8.0, + "grad_norm": 0.88671875, + "learning_rate": 0.00041736913929401177, + "loss": 0.2028, + "step": 193110 + }, + { + "epoch": 8.0, + "grad_norm": 0.609375, + "learning_rate": 0.00041736108296593153, + "loss": 0.2643, + "step": 193120 + }, + { + "epoch": 8.0, + "grad_norm": 0.6015625, + "learning_rate": 0.0004173530263228957, + "loss": 0.2127, + "step": 193130 + }, + { + "epoch": 8.0, + "grad_norm": 0.421875, + "learning_rate": 0.0004173449693649197, + "loss": 0.187, + "step": 193140 + }, + { + "epoch": 8.0, + "grad_norm": 0.6640625, + "learning_rate": 0.0004173369120920185, + "loss": 0.2359, + "step": 193150 + }, + { + "epoch": 8.0, + "grad_norm": 0.59375, + "learning_rate": 0.0004173288545042072, + "loss": 0.1854, + "step": 193160 + }, + { + "epoch": 8.0, + "grad_norm": 0.859375, + "learning_rate": 0.0004173207966015011, + "loss": 0.2468, + "step": 193170 + }, + { + "epoch": 8.0, + "grad_norm": 0.4609375, + "learning_rate": 0.00041731273838391535, + "loss": 0.2067, + "step": 193180 + }, + { + "epoch": 8.0, + "grad_norm": 0.890625, + "learning_rate": 0.0004173046798514651, + "loss": 0.2165, + "step": 193190 + }, + { + "epoch": 8.0, + "grad_norm": 0.5390625, + "learning_rate": 0.0004172966210041656, + "loss": 0.2312, + "step": 193200 + }, + { + "epoch": 8.0, + "grad_norm": 0.6875, + "learning_rate": 0.0004172885618420318, + "loss": 0.2238, + "step": 193210 + }, + { + "epoch": 8.0, + "grad_norm": 0.54296875, + "learning_rate": 0.0004172805023650791, + "loss": 0.1927, + "step": 193220 + }, + { + "epoch": 8.0, + "grad_norm": 0.55859375, + "learning_rate": 0.00041727244257332244, + "loss": 0.2126, + "step": 193230 + }, + { + "epoch": 8.0, + "grad_norm": 0.63671875, + "learning_rate": 0.0004172643824667772, + "loss": 0.2064, + "step": 193240 + }, + { + "epoch": 8.0, + "grad_norm": 1.5859375, + "learning_rate": 0.0004172563220454584, + "loss": 0.1857, + "step": 193250 + }, + { + "epoch": 8.0, + "grad_norm": 0.7265625, + "learning_rate": 0.0004172482613093812, + "loss": 0.1556, + "step": 193260 + }, + { + "epoch": 8.01, + "grad_norm": 1.140625, + "learning_rate": 0.00041724020025856096, + "loss": 0.1888, + "step": 193270 + }, + { + "epoch": 8.01, + "grad_norm": 0.796875, + "learning_rate": 0.0004172321388930127, + "loss": 0.24, + "step": 193280 + }, + { + "epoch": 8.01, + "grad_norm": 0.83984375, + "learning_rate": 0.00041722407721275155, + "loss": 0.1881, + "step": 193290 + }, + { + "epoch": 8.01, + "grad_norm": 2.140625, + "learning_rate": 0.00041721601521779283, + "loss": 0.1963, + "step": 193300 + }, + { + "epoch": 8.01, + "grad_norm": 0.421875, + "learning_rate": 0.00041720795290815153, + "loss": 0.1818, + "step": 193310 + }, + { + "epoch": 8.01, + "grad_norm": 1.046875, + "learning_rate": 0.000417199890283843, + "loss": 0.246, + "step": 193320 + }, + { + "epoch": 8.01, + "grad_norm": 1.3125, + "learning_rate": 0.0004171918273448823, + "loss": 0.1562, + "step": 193330 + }, + { + "epoch": 8.01, + "grad_norm": 0.66796875, + "learning_rate": 0.00041718376409128466, + "loss": 0.2384, + "step": 193340 + }, + { + "epoch": 8.01, + "grad_norm": 0.58203125, + "learning_rate": 0.0004171757005230652, + "loss": 0.2262, + "step": 193350 + }, + { + "epoch": 8.01, + "grad_norm": 0.6953125, + "learning_rate": 0.00041716763664023914, + "loss": 0.161, + "step": 193360 + }, + { + "epoch": 8.01, + "grad_norm": 1.1640625, + "learning_rate": 0.00041715957244282167, + "loss": 0.1988, + "step": 193370 + }, + { + "epoch": 8.01, + "grad_norm": 0.6796875, + "learning_rate": 0.0004171515079308279, + "loss": 0.1365, + "step": 193380 + }, + { + "epoch": 8.01, + "grad_norm": 1.296875, + "learning_rate": 0.000417143443104273, + "loss": 0.1955, + "step": 193390 + }, + { + "epoch": 8.01, + "grad_norm": 0.65234375, + "learning_rate": 0.00041713537796317227, + "loss": 0.1842, + "step": 193400 + }, + { + "epoch": 8.01, + "grad_norm": 0.859375, + "learning_rate": 0.0004171273125075408, + "loss": 0.1738, + "step": 193410 + }, + { + "epoch": 8.01, + "grad_norm": 0.64453125, + "learning_rate": 0.0004171192467373937, + "loss": 0.2511, + "step": 193420 + }, + { + "epoch": 8.01, + "grad_norm": 0.578125, + "learning_rate": 0.0004171111806527463, + "loss": 0.2037, + "step": 193430 + }, + { + "epoch": 8.01, + "grad_norm": 0.77734375, + "learning_rate": 0.0004171031142536136, + "loss": 0.169, + "step": 193440 + }, + { + "epoch": 8.01, + "grad_norm": 0.5546875, + "learning_rate": 0.000417095047540011, + "loss": 0.1991, + "step": 193450 + }, + { + "epoch": 8.01, + "grad_norm": 0.89453125, + "learning_rate": 0.00041708698051195353, + "loss": 0.131, + "step": 193460 + }, + { + "epoch": 8.01, + "grad_norm": 0.91015625, + "learning_rate": 0.0004170789131694564, + "loss": 0.1863, + "step": 193470 + }, + { + "epoch": 8.01, + "grad_norm": 2.125, + "learning_rate": 0.00041707084551253484, + "loss": 0.1906, + "step": 193480 + }, + { + "epoch": 8.01, + "grad_norm": 0.359375, + "learning_rate": 0.0004170627775412039, + "loss": 0.1871, + "step": 193490 + }, + { + "epoch": 8.01, + "grad_norm": 0.65625, + "learning_rate": 0.00041705470925547893, + "loss": 0.1407, + "step": 193500 + }, + { + "epoch": 8.02, + "grad_norm": 1.0703125, + "learning_rate": 0.000417046640655375, + "loss": 0.2045, + "step": 193510 + }, + { + "epoch": 8.02, + "grad_norm": 0.890625, + "learning_rate": 0.0004170385717409074, + "loss": 0.2389, + "step": 193520 + }, + { + "epoch": 8.02, + "grad_norm": 2.203125, + "learning_rate": 0.00041703050251209117, + "loss": 0.2203, + "step": 193530 + }, + { + "epoch": 8.02, + "grad_norm": 0.45703125, + "learning_rate": 0.0004170224329689416, + "loss": 0.2127, + "step": 193540 + }, + { + "epoch": 8.02, + "grad_norm": 0.5703125, + "learning_rate": 0.00041701436311147383, + "loss": 0.2229, + "step": 193550 + }, + { + "epoch": 8.02, + "grad_norm": 0.70703125, + "learning_rate": 0.0004170062929397031, + "loss": 0.2313, + "step": 193560 + }, + { + "epoch": 8.02, + "grad_norm": 0.365234375, + "learning_rate": 0.0004169982224536445, + "loss": 0.2048, + "step": 193570 + }, + { + "epoch": 8.02, + "grad_norm": 0.369140625, + "learning_rate": 0.00041699015165331334, + "loss": 0.192, + "step": 193580 + }, + { + "epoch": 8.02, + "grad_norm": 0.4921875, + "learning_rate": 0.00041698208053872476, + "loss": 0.2018, + "step": 193590 + }, + { + "epoch": 8.02, + "grad_norm": 0.84765625, + "learning_rate": 0.00041697400910989385, + "loss": 0.2113, + "step": 193600 + }, + { + "epoch": 8.02, + "grad_norm": 0.6171875, + "learning_rate": 0.00041696593736683594, + "loss": 0.1982, + "step": 193610 + }, + { + "epoch": 8.02, + "grad_norm": 0.7734375, + "learning_rate": 0.00041695786530956627, + "loss": 0.2225, + "step": 193620 + }, + { + "epoch": 8.02, + "grad_norm": 1.3515625, + "learning_rate": 0.00041694979293809974, + "loss": 0.215, + "step": 193630 + }, + { + "epoch": 8.02, + "grad_norm": 0.64453125, + "learning_rate": 0.0004169417202524518, + "loss": 0.192, + "step": 193640 + }, + { + "epoch": 8.02, + "grad_norm": 1.34375, + "learning_rate": 0.0004169336472526376, + "loss": 0.1857, + "step": 193650 + }, + { + "epoch": 8.02, + "grad_norm": 1.421875, + "learning_rate": 0.00041692557393867226, + "loss": 0.1406, + "step": 193660 + }, + { + "epoch": 8.02, + "grad_norm": 0.7578125, + "learning_rate": 0.00041691750031057107, + "loss": 0.155, + "step": 193670 + }, + { + "epoch": 8.02, + "grad_norm": 2.265625, + "learning_rate": 0.0004169094263683492, + "loss": 0.1897, + "step": 193680 + }, + { + "epoch": 8.02, + "grad_norm": 1.15625, + "learning_rate": 0.00041690135211202173, + "loss": 0.1931, + "step": 193690 + }, + { + "epoch": 8.02, + "grad_norm": 0.87890625, + "learning_rate": 0.000416893277541604, + "loss": 0.2089, + "step": 193700 + }, + { + "epoch": 8.02, + "grad_norm": 0.4296875, + "learning_rate": 0.00041688520265711107, + "loss": 0.1879, + "step": 193710 + }, + { + "epoch": 8.02, + "grad_norm": 0.5390625, + "learning_rate": 0.00041687712745855826, + "loss": 0.1859, + "step": 193720 + }, + { + "epoch": 8.02, + "grad_norm": 0.87109375, + "learning_rate": 0.0004168690519459607, + "loss": 0.2122, + "step": 193730 + }, + { + "epoch": 8.02, + "grad_norm": 1.0703125, + "learning_rate": 0.0004168609761193336, + "loss": 0.2748, + "step": 193740 + }, + { + "epoch": 8.03, + "grad_norm": 0.625, + "learning_rate": 0.0004168528999786922, + "loss": 0.2295, + "step": 193750 + }, + { + "epoch": 8.03, + "grad_norm": 0.875, + "learning_rate": 0.0004168448235240516, + "loss": 0.1853, + "step": 193760 + }, + { + "epoch": 8.03, + "grad_norm": 1.65625, + "learning_rate": 0.00041683674675542715, + "loss": 0.1771, + "step": 193770 + }, + { + "epoch": 8.03, + "grad_norm": 0.6875, + "learning_rate": 0.00041682866967283384, + "loss": 0.2357, + "step": 193780 + }, + { + "epoch": 8.03, + "grad_norm": 0.4765625, + "learning_rate": 0.0004168205922762871, + "loss": 0.2009, + "step": 193790 + }, + { + "epoch": 8.03, + "grad_norm": 0.51171875, + "learning_rate": 0.000416812514565802, + "loss": 0.2196, + "step": 193800 + }, + { + "epoch": 8.03, + "grad_norm": 1.875, + "learning_rate": 0.00041680443654139364, + "loss": 0.204, + "step": 193810 + }, + { + "epoch": 8.03, + "grad_norm": 1.1328125, + "learning_rate": 0.0004167963582030775, + "loss": 0.1861, + "step": 193820 + }, + { + "epoch": 8.03, + "grad_norm": 0.96875, + "learning_rate": 0.00041678827955086864, + "loss": 0.1458, + "step": 193830 + }, + { + "epoch": 8.03, + "grad_norm": 0.43359375, + "learning_rate": 0.00041678020058478213, + "loss": 0.1836, + "step": 193840 + }, + { + "epoch": 8.03, + "grad_norm": 1.03125, + "learning_rate": 0.00041677212130483335, + "loss": 0.1897, + "step": 193850 + }, + { + "epoch": 8.03, + "grad_norm": 0.62109375, + "learning_rate": 0.0004167640417110374, + "loss": 0.1498, + "step": 193860 + }, + { + "epoch": 8.03, + "grad_norm": 0.58984375, + "learning_rate": 0.0004167559618034096, + "loss": 0.2051, + "step": 193870 + }, + { + "epoch": 8.03, + "grad_norm": 0.73046875, + "learning_rate": 0.00041674788158196506, + "loss": 0.2347, + "step": 193880 + }, + { + "epoch": 8.03, + "grad_norm": 0.6875, + "learning_rate": 0.000416739801046719, + "loss": 0.2748, + "step": 193890 + }, + { + "epoch": 8.03, + "grad_norm": 0.9296875, + "learning_rate": 0.0004167317201976867, + "loss": 0.1871, + "step": 193900 + }, + { + "epoch": 8.03, + "grad_norm": 0.0, + "learning_rate": 0.0004167236390348832, + "loss": 0.186, + "step": 193910 + }, + { + "epoch": 8.03, + "grad_norm": 0.6875, + "learning_rate": 0.0004167155575583239, + "loss": 0.2057, + "step": 193920 + }, + { + "epoch": 8.03, + "grad_norm": 1.0703125, + "learning_rate": 0.00041670747576802393, + "loss": 0.1869, + "step": 193930 + }, + { + "epoch": 8.03, + "grad_norm": 1.234375, + "learning_rate": 0.00041669939366399846, + "loss": 0.1795, + "step": 193940 + }, + { + "epoch": 8.03, + "grad_norm": 1.6484375, + "learning_rate": 0.0004166913112462627, + "loss": 0.2078, + "step": 193950 + }, + { + "epoch": 8.03, + "grad_norm": 0.9296875, + "learning_rate": 0.000416683228514832, + "loss": 0.2203, + "step": 193960 + }, + { + "epoch": 8.03, + "grad_norm": 0.392578125, + "learning_rate": 0.00041667514546972137, + "loss": 0.2289, + "step": 193970 + }, + { + "epoch": 8.03, + "grad_norm": 0.46875, + "learning_rate": 0.0004166670621109461, + "loss": 0.2356, + "step": 193980 + }, + { + "epoch": 8.04, + "grad_norm": 1.0859375, + "learning_rate": 0.0004166589784385215, + "loss": 0.1703, + "step": 193990 + }, + { + "epoch": 8.04, + "grad_norm": 0.494140625, + "learning_rate": 0.00041665089445246263, + "loss": 0.1523, + "step": 194000 + }, + { + "epoch": 8.04, + "grad_norm": 2.34375, + "learning_rate": 0.0004166428101527848, + "loss": 0.2329, + "step": 194010 + }, + { + "epoch": 8.04, + "grad_norm": 0.4375, + "learning_rate": 0.0004166347255395032, + "loss": 0.2211, + "step": 194020 + }, + { + "epoch": 8.04, + "grad_norm": 0.392578125, + "learning_rate": 0.00041662664061263305, + "loss": 0.1665, + "step": 194030 + }, + { + "epoch": 8.04, + "grad_norm": 0.6484375, + "learning_rate": 0.0004166185553721896, + "loss": 0.1871, + "step": 194040 + }, + { + "epoch": 8.04, + "grad_norm": 0.80078125, + "learning_rate": 0.00041661046981818797, + "loss": 0.2536, + "step": 194050 + }, + { + "epoch": 8.04, + "grad_norm": 0.98046875, + "learning_rate": 0.00041660238395064343, + "loss": 0.1861, + "step": 194060 + }, + { + "epoch": 8.04, + "grad_norm": 0.79296875, + "learning_rate": 0.00041659429776957116, + "loss": 0.2129, + "step": 194070 + }, + { + "epoch": 8.04, + "grad_norm": 0.7578125, + "learning_rate": 0.0004165862112749864, + "loss": 0.1924, + "step": 194080 + }, + { + "epoch": 8.04, + "grad_norm": 0.2578125, + "learning_rate": 0.00041657812446690447, + "loss": 0.1726, + "step": 194090 + }, + { + "epoch": 8.04, + "grad_norm": 0.75, + "learning_rate": 0.0004165700373453404, + "loss": 0.1964, + "step": 194100 + }, + { + "epoch": 8.04, + "grad_norm": 0.7421875, + "learning_rate": 0.0004165619499103096, + "loss": 0.2235, + "step": 194110 + }, + { + "epoch": 8.04, + "grad_norm": 0.2294921875, + "learning_rate": 0.00041655386216182713, + "loss": 0.1571, + "step": 194120 + }, + { + "epoch": 8.04, + "grad_norm": 0.5625, + "learning_rate": 0.00041654577409990835, + "loss": 0.2309, + "step": 194130 + }, + { + "epoch": 8.04, + "grad_norm": 0.75, + "learning_rate": 0.0004165376857245684, + "loss": 0.1991, + "step": 194140 + }, + { + "epoch": 8.04, + "grad_norm": 0.8359375, + "learning_rate": 0.00041652959703582243, + "loss": 0.2221, + "step": 194150 + }, + { + "epoch": 8.04, + "grad_norm": 0.453125, + "learning_rate": 0.0004165215080336858, + "loss": 0.2197, + "step": 194160 + }, + { + "epoch": 8.04, + "grad_norm": 0.80078125, + "learning_rate": 0.00041651341871817366, + "loss": 0.2578, + "step": 194170 + }, + { + "epoch": 8.04, + "grad_norm": 0.7421875, + "learning_rate": 0.00041650532908930123, + "loss": 0.1997, + "step": 194180 + }, + { + "epoch": 8.04, + "grad_norm": 0.66796875, + "learning_rate": 0.0004164972391470838, + "loss": 0.1673, + "step": 194190 + }, + { + "epoch": 8.04, + "grad_norm": 0.78515625, + "learning_rate": 0.0004164891488915365, + "loss": 0.1776, + "step": 194200 + }, + { + "epoch": 8.04, + "grad_norm": 0.91796875, + "learning_rate": 0.0004164810583226746, + "loss": 0.1979, + "step": 194210 + }, + { + "epoch": 8.04, + "grad_norm": 0.578125, + "learning_rate": 0.00041647296744051337, + "loss": 0.218, + "step": 194220 + }, + { + "epoch": 8.04, + "grad_norm": 1.078125, + "learning_rate": 0.00041646487624506795, + "loss": 0.2187, + "step": 194230 + }, + { + "epoch": 8.05, + "grad_norm": 0.78515625, + "learning_rate": 0.00041645678473635364, + "loss": 0.1754, + "step": 194240 + }, + { + "epoch": 8.05, + "grad_norm": 1.3125, + "learning_rate": 0.0004164486929143856, + "loss": 0.1597, + "step": 194250 + }, + { + "epoch": 8.05, + "grad_norm": 0.474609375, + "learning_rate": 0.00041644060077917914, + "loss": 0.1824, + "step": 194260 + }, + { + "epoch": 8.05, + "grad_norm": 0.9921875, + "learning_rate": 0.0004164325083307494, + "loss": 0.225, + "step": 194270 + }, + { + "epoch": 8.05, + "grad_norm": 0.9296875, + "learning_rate": 0.0004164244155691117, + "loss": 0.2169, + "step": 194280 + }, + { + "epoch": 8.05, + "grad_norm": 1.625, + "learning_rate": 0.0004164163224942812, + "loss": 0.2325, + "step": 194290 + }, + { + "epoch": 8.05, + "grad_norm": 0.498046875, + "learning_rate": 0.0004164082291062732, + "loss": 0.1932, + "step": 194300 + }, + { + "epoch": 8.05, + "grad_norm": 0.466796875, + "learning_rate": 0.0004164001354051028, + "loss": 0.2053, + "step": 194310 + }, + { + "epoch": 8.05, + "grad_norm": 1.390625, + "learning_rate": 0.00041639204139078535, + "loss": 0.2255, + "step": 194320 + }, + { + "epoch": 8.05, + "grad_norm": 1.21875, + "learning_rate": 0.00041638394706333605, + "loss": 0.2767, + "step": 194330 + }, + { + "epoch": 8.05, + "grad_norm": 0.259765625, + "learning_rate": 0.0004163758524227701, + "loss": 0.204, + "step": 194340 + }, + { + "epoch": 8.05, + "grad_norm": 0.80859375, + "learning_rate": 0.0004163677574691028, + "loss": 0.1805, + "step": 194350 + }, + { + "epoch": 8.05, + "grad_norm": 0.62109375, + "learning_rate": 0.0004163596622023493, + "loss": 0.2045, + "step": 194360 + }, + { + "epoch": 8.05, + "grad_norm": 0.5, + "learning_rate": 0.0004163515666225249, + "loss": 0.2174, + "step": 194370 + }, + { + "epoch": 8.05, + "grad_norm": 0.875, + "learning_rate": 0.0004163434707296449, + "loss": 0.1782, + "step": 194380 + }, + { + "epoch": 8.05, + "grad_norm": 0.69140625, + "learning_rate": 0.00041633537452372436, + "loss": 0.2014, + "step": 194390 + }, + { + "epoch": 8.05, + "grad_norm": 0.8359375, + "learning_rate": 0.0004163272780047787, + "loss": 0.1668, + "step": 194400 + }, + { + "epoch": 8.05, + "grad_norm": 0.49609375, + "learning_rate": 0.0004163191811728229, + "loss": 0.2011, + "step": 194410 + }, + { + "epoch": 8.05, + "grad_norm": 0.34375, + "learning_rate": 0.0004163110840278725, + "loss": 0.1922, + "step": 194420 + }, + { + "epoch": 8.05, + "grad_norm": 1.21875, + "learning_rate": 0.00041630298656994256, + "loss": 0.1771, + "step": 194430 + }, + { + "epoch": 8.05, + "grad_norm": 1.171875, + "learning_rate": 0.0004162948887990483, + "loss": 0.1878, + "step": 194440 + }, + { + "epoch": 8.05, + "grad_norm": 1.359375, + "learning_rate": 0.00041628679071520515, + "loss": 0.1757, + "step": 194450 + }, + { + "epoch": 8.05, + "grad_norm": 1.921875, + "learning_rate": 0.00041627869231842813, + "loss": 0.2103, + "step": 194460 + }, + { + "epoch": 8.05, + "grad_norm": 0.404296875, + "learning_rate": 0.00041627059360873253, + "loss": 0.2172, + "step": 194470 + }, + { + "epoch": 8.06, + "grad_norm": 0.337890625, + "learning_rate": 0.00041626249458613375, + "loss": 0.2032, + "step": 194480 + }, + { + "epoch": 8.06, + "grad_norm": 0.263671875, + "learning_rate": 0.0004162543952506468, + "loss": 0.1884, + "step": 194490 + }, + { + "epoch": 8.06, + "grad_norm": 1.5078125, + "learning_rate": 0.00041624629560228706, + "loss": 0.2081, + "step": 194500 + }, + { + "epoch": 8.06, + "grad_norm": 1.078125, + "learning_rate": 0.0004162381956410698, + "loss": 0.2042, + "step": 194510 + }, + { + "epoch": 8.06, + "grad_norm": 1.078125, + "learning_rate": 0.0004162300953670101, + "loss": 0.2524, + "step": 194520 + }, + { + "epoch": 8.06, + "grad_norm": 0.6953125, + "learning_rate": 0.0004162219947801234, + "loss": 0.1802, + "step": 194530 + }, + { + "epoch": 8.06, + "grad_norm": 0.6015625, + "learning_rate": 0.0004162138938804248, + "loss": 0.1657, + "step": 194540 + }, + { + "epoch": 8.06, + "grad_norm": 1.25, + "learning_rate": 0.00041620579266792967, + "loss": 0.1814, + "step": 194550 + }, + { + "epoch": 8.06, + "grad_norm": 0.73828125, + "learning_rate": 0.00041619769114265313, + "loss": 0.1708, + "step": 194560 + }, + { + "epoch": 8.06, + "grad_norm": 1.1953125, + "learning_rate": 0.0004161895893046105, + "loss": 0.175, + "step": 194570 + }, + { + "epoch": 8.06, + "grad_norm": 0.2890625, + "learning_rate": 0.00041618148715381697, + "loss": 0.242, + "step": 194580 + }, + { + "epoch": 8.06, + "grad_norm": 2.78125, + "learning_rate": 0.0004161733846902879, + "loss": 0.1881, + "step": 194590 + }, + { + "epoch": 8.06, + "grad_norm": 2.015625, + "learning_rate": 0.00041616528191403847, + "loss": 0.2078, + "step": 194600 + }, + { + "epoch": 8.06, + "grad_norm": 0.60546875, + "learning_rate": 0.00041615717882508387, + "loss": 0.1998, + "step": 194610 + }, + { + "epoch": 8.06, + "grad_norm": 0.177734375, + "learning_rate": 0.0004161490754234394, + "loss": 0.2234, + "step": 194620 + }, + { + "epoch": 8.06, + "grad_norm": 0.96875, + "learning_rate": 0.00041614097170912035, + "loss": 0.2426, + "step": 194630 + }, + { + "epoch": 8.06, + "grad_norm": 0.427734375, + "learning_rate": 0.00041613286768214195, + "loss": 0.1811, + "step": 194640 + }, + { + "epoch": 8.06, + "grad_norm": 1.03125, + "learning_rate": 0.0004161247633425194, + "loss": 0.1981, + "step": 194650 + }, + { + "epoch": 8.06, + "grad_norm": 0.412109375, + "learning_rate": 0.0004161166586902679, + "loss": 0.1949, + "step": 194660 + }, + { + "epoch": 8.06, + "grad_norm": 0.427734375, + "learning_rate": 0.0004161085537254029, + "loss": 0.202, + "step": 194670 + }, + { + "epoch": 8.06, + "grad_norm": 0.392578125, + "learning_rate": 0.00041610044844793946, + "loss": 0.1622, + "step": 194680 + }, + { + "epoch": 8.06, + "grad_norm": 0.96875, + "learning_rate": 0.000416092342857893, + "loss": 0.232, + "step": 194690 + }, + { + "epoch": 8.06, + "grad_norm": 0.8203125, + "learning_rate": 0.00041608423695527864, + "loss": 0.1795, + "step": 194700 + }, + { + "epoch": 8.06, + "grad_norm": 0.69921875, + "learning_rate": 0.00041607613074011175, + "loss": 0.2069, + "step": 194710 + }, + { + "epoch": 8.07, + "grad_norm": 0.7109375, + "learning_rate": 0.0004160680242124075, + "loss": 0.2229, + "step": 194720 + }, + { + "epoch": 8.07, + "grad_norm": 1.0859375, + "learning_rate": 0.0004160599173721811, + "loss": 0.2312, + "step": 194730 + }, + { + "epoch": 8.07, + "grad_norm": 0.46875, + "learning_rate": 0.0004160518102194478, + "loss": 0.1967, + "step": 194740 + }, + { + "epoch": 8.07, + "grad_norm": 1.0703125, + "learning_rate": 0.0004160437027542231, + "loss": 0.2344, + "step": 194750 + }, + { + "epoch": 8.07, + "grad_norm": 0.67578125, + "learning_rate": 0.00041603559497652197, + "loss": 0.2207, + "step": 194760 + }, + { + "epoch": 8.07, + "grad_norm": 0.72265625, + "learning_rate": 0.0004160274868863598, + "loss": 0.1589, + "step": 194770 + }, + { + "epoch": 8.07, + "grad_norm": 2.234375, + "learning_rate": 0.00041601937848375183, + "loss": 0.1958, + "step": 194780 + }, + { + "epoch": 8.07, + "grad_norm": 0.3203125, + "learning_rate": 0.0004160112697687133, + "loss": 0.1846, + "step": 194790 + }, + { + "epoch": 8.07, + "grad_norm": 0.7265625, + "learning_rate": 0.00041600316074125957, + "loss": 0.1683, + "step": 194800 + }, + { + "epoch": 8.07, + "grad_norm": 1.2421875, + "learning_rate": 0.00041599505140140576, + "loss": 0.2107, + "step": 194810 + }, + { + "epoch": 8.07, + "grad_norm": 0.5625, + "learning_rate": 0.00041598694174916717, + "loss": 0.2375, + "step": 194820 + }, + { + "epoch": 8.07, + "grad_norm": 0.45703125, + "learning_rate": 0.0004159788317845591, + "loss": 0.2421, + "step": 194830 + }, + { + "epoch": 8.07, + "grad_norm": 0.51171875, + "learning_rate": 0.0004159707215075969, + "loss": 0.2128, + "step": 194840 + }, + { + "epoch": 8.07, + "grad_norm": 0.87890625, + "learning_rate": 0.0004159626109182956, + "loss": 0.2642, + "step": 194850 + }, + { + "epoch": 8.07, + "grad_norm": 0.4921875, + "learning_rate": 0.00041595450001667055, + "loss": 0.2229, + "step": 194860 + }, + { + "epoch": 8.07, + "grad_norm": 0.40234375, + "learning_rate": 0.0004159463888027371, + "loss": 0.1909, + "step": 194870 + }, + { + "epoch": 8.07, + "grad_norm": 0.4375, + "learning_rate": 0.0004159382772765105, + "loss": 0.1282, + "step": 194880 + }, + { + "epoch": 8.07, + "grad_norm": 0.96875, + "learning_rate": 0.0004159301654380059, + "loss": 0.1995, + "step": 194890 + }, + { + "epoch": 8.07, + "grad_norm": 0.67578125, + "learning_rate": 0.00041592205328723876, + "loss": 0.2134, + "step": 194900 + }, + { + "epoch": 8.07, + "grad_norm": 1.5, + "learning_rate": 0.0004159139408242242, + "loss": 0.1842, + "step": 194910 + }, + { + "epoch": 8.07, + "grad_norm": 0.6796875, + "learning_rate": 0.00041590582804897747, + "loss": 0.1973, + "step": 194920 + }, + { + "epoch": 8.07, + "grad_norm": 0.5625, + "learning_rate": 0.00041589771496151396, + "loss": 0.2153, + "step": 194930 + }, + { + "epoch": 8.07, + "grad_norm": 0.78515625, + "learning_rate": 0.0004158896015618488, + "loss": 0.2142, + "step": 194940 + }, + { + "epoch": 8.07, + "grad_norm": 0.314453125, + "learning_rate": 0.0004158814878499974, + "loss": 0.2249, + "step": 194950 + }, + { + "epoch": 8.08, + "grad_norm": 0.23046875, + "learning_rate": 0.0004158733738259748, + "loss": 0.1368, + "step": 194960 + }, + { + "epoch": 8.08, + "grad_norm": 0.50390625, + "learning_rate": 0.0004158652594897966, + "loss": 0.1703, + "step": 194970 + }, + { + "epoch": 8.08, + "grad_norm": 0.76953125, + "learning_rate": 0.00041585714484147784, + "loss": 0.2108, + "step": 194980 + }, + { + "epoch": 8.08, + "grad_norm": 0.9140625, + "learning_rate": 0.0004158490298810338, + "loss": 0.1696, + "step": 194990 + }, + { + "epoch": 8.08, + "grad_norm": 1.3828125, + "learning_rate": 0.00041584091460847983, + "loss": 0.2315, + "step": 195000 + }, + { + "epoch": 8.08, + "grad_norm": 0.474609375, + "learning_rate": 0.0004158327990238312, + "loss": 0.2335, + "step": 195010 + }, + { + "epoch": 8.08, + "grad_norm": 0.66015625, + "learning_rate": 0.00041582468312710303, + "loss": 0.2295, + "step": 195020 + }, + { + "epoch": 8.08, + "grad_norm": 0.33984375, + "learning_rate": 0.0004158165669183108, + "loss": 0.2082, + "step": 195030 + }, + { + "epoch": 8.08, + "grad_norm": 1.296875, + "learning_rate": 0.00041580845039746974, + "loss": 0.2086, + "step": 195040 + }, + { + "epoch": 8.08, + "grad_norm": 0.48046875, + "learning_rate": 0.000415800333564595, + "loss": 0.1892, + "step": 195050 + }, + { + "epoch": 8.08, + "grad_norm": 0.89453125, + "learning_rate": 0.000415792216419702, + "loss": 0.1746, + "step": 195060 + }, + { + "epoch": 8.08, + "grad_norm": 0.7265625, + "learning_rate": 0.0004157840989628059, + "loss": 0.2037, + "step": 195070 + }, + { + "epoch": 8.08, + "grad_norm": 0.58984375, + "learning_rate": 0.0004157759811939221, + "loss": 0.2092, + "step": 195080 + }, + { + "epoch": 8.08, + "grad_norm": 0.91796875, + "learning_rate": 0.0004157678631130657, + "loss": 0.2134, + "step": 195090 + }, + { + "epoch": 8.08, + "grad_norm": 1.4609375, + "learning_rate": 0.00041575974472025213, + "loss": 0.1977, + "step": 195100 + }, + { + "epoch": 8.08, + "grad_norm": 0.61328125, + "learning_rate": 0.00041575162601549667, + "loss": 0.1823, + "step": 195110 + }, + { + "epoch": 8.08, + "grad_norm": 1.2421875, + "learning_rate": 0.00041574350699881445, + "loss": 0.2396, + "step": 195120 + }, + { + "epoch": 8.08, + "grad_norm": 0.63671875, + "learning_rate": 0.00041573538767022093, + "loss": 0.2048, + "step": 195130 + }, + { + "epoch": 8.08, + "grad_norm": 0.875, + "learning_rate": 0.00041572726802973134, + "loss": 0.2027, + "step": 195140 + }, + { + "epoch": 8.08, + "grad_norm": 0.62890625, + "learning_rate": 0.00041571914807736086, + "loss": 0.2142, + "step": 195150 + }, + { + "epoch": 8.08, + "grad_norm": 0.578125, + "learning_rate": 0.0004157110278131249, + "loss": 0.2157, + "step": 195160 + }, + { + "epoch": 8.08, + "grad_norm": 0.87890625, + "learning_rate": 0.0004157029072370386, + "loss": 0.206, + "step": 195170 + }, + { + "epoch": 8.08, + "grad_norm": 0.69921875, + "learning_rate": 0.0004156947863491173, + "loss": 0.2244, + "step": 195180 + }, + { + "epoch": 8.08, + "grad_norm": 0.38671875, + "learning_rate": 0.0004156866651493764, + "loss": 0.2073, + "step": 195190 + }, + { + "epoch": 8.09, + "grad_norm": 1.4296875, + "learning_rate": 0.0004156785436378311, + "loss": 0.2328, + "step": 195200 + }, + { + "epoch": 8.09, + "grad_norm": 0.67578125, + "learning_rate": 0.0004156704218144966, + "loss": 0.1571, + "step": 195210 + }, + { + "epoch": 8.09, + "grad_norm": 1.0234375, + "learning_rate": 0.0004156622996793883, + "loss": 0.2414, + "step": 195220 + }, + { + "epoch": 8.09, + "grad_norm": 0.72265625, + "learning_rate": 0.0004156541772325214, + "loss": 0.1793, + "step": 195230 + }, + { + "epoch": 8.09, + "grad_norm": 0.9140625, + "learning_rate": 0.00041564605447391126, + "loss": 0.1753, + "step": 195240 + }, + { + "epoch": 8.09, + "grad_norm": 1.1171875, + "learning_rate": 0.0004156379314035731, + "loss": 0.1457, + "step": 195250 + }, + { + "epoch": 8.09, + "grad_norm": 2.5, + "learning_rate": 0.0004156298080215223, + "loss": 0.1664, + "step": 195260 + }, + { + "epoch": 8.09, + "grad_norm": 1.0546875, + "learning_rate": 0.00041562168432777405, + "loss": 0.1923, + "step": 195270 + }, + { + "epoch": 8.09, + "grad_norm": 0.97265625, + "learning_rate": 0.00041561356032234365, + "loss": 0.1594, + "step": 195280 + }, + { + "epoch": 8.09, + "grad_norm": 0.443359375, + "learning_rate": 0.0004156054360052465, + "loss": 0.1369, + "step": 195290 + }, + { + "epoch": 8.09, + "grad_norm": 0.671875, + "learning_rate": 0.0004155973113764977, + "loss": 0.2097, + "step": 195300 + }, + { + "epoch": 8.09, + "grad_norm": 0.74609375, + "learning_rate": 0.00041558918643611276, + "loss": 0.2275, + "step": 195310 + }, + { + "epoch": 8.09, + "grad_norm": 0.89453125, + "learning_rate": 0.00041558106118410676, + "loss": 0.1926, + "step": 195320 + }, + { + "epoch": 8.09, + "grad_norm": 1.046875, + "learning_rate": 0.0004155729356204951, + "loss": 0.2146, + "step": 195330 + }, + { + "epoch": 8.09, + "grad_norm": 0.71875, + "learning_rate": 0.0004155648097452931, + "loss": 0.222, + "step": 195340 + }, + { + "epoch": 8.09, + "grad_norm": 0.765625, + "learning_rate": 0.000415556683558516, + "loss": 0.2406, + "step": 195350 + }, + { + "epoch": 8.09, + "grad_norm": 1.1796875, + "learning_rate": 0.00041554855706017903, + "loss": 0.2059, + "step": 195360 + }, + { + "epoch": 8.09, + "grad_norm": 0.703125, + "learning_rate": 0.00041554043025029757, + "loss": 0.1635, + "step": 195370 + }, + { + "epoch": 8.09, + "grad_norm": 0.80078125, + "learning_rate": 0.0004155323031288869, + "loss": 0.1915, + "step": 195380 + }, + { + "epoch": 8.09, + "grad_norm": 1.0390625, + "learning_rate": 0.00041552417569596235, + "loss": 0.2744, + "step": 195390 + }, + { + "epoch": 8.09, + "grad_norm": 1.4140625, + "learning_rate": 0.00041551604795153917, + "loss": 0.2137, + "step": 195400 + }, + { + "epoch": 8.09, + "grad_norm": 2.09375, + "learning_rate": 0.0004155079198956326, + "loss": 0.257, + "step": 195410 + }, + { + "epoch": 8.09, + "grad_norm": 0.79296875, + "learning_rate": 0.00041549979152825814, + "loss": 0.1937, + "step": 195420 + }, + { + "epoch": 8.09, + "grad_norm": 0.61328125, + "learning_rate": 0.00041549166284943087, + "loss": 0.2102, + "step": 195430 + }, + { + "epoch": 8.1, + "grad_norm": 0.62109375, + "learning_rate": 0.0004154835338591661, + "loss": 0.1941, + "step": 195440 + }, + { + "epoch": 8.1, + "grad_norm": 1.3671875, + "learning_rate": 0.0004154754045574792, + "loss": 0.1768, + "step": 195450 + }, + { + "epoch": 8.1, + "grad_norm": 0.546875, + "learning_rate": 0.0004154672749443855, + "loss": 0.1729, + "step": 195460 + }, + { + "epoch": 8.1, + "grad_norm": 1.3203125, + "learning_rate": 0.0004154591450199003, + "loss": 0.2326, + "step": 195470 + }, + { + "epoch": 8.1, + "grad_norm": 0.77734375, + "learning_rate": 0.0004154510147840388, + "loss": 0.1502, + "step": 195480 + }, + { + "epoch": 8.1, + "grad_norm": 0.71875, + "learning_rate": 0.00041544288423681633, + "loss": 0.1682, + "step": 195490 + }, + { + "epoch": 8.1, + "grad_norm": 0.984375, + "learning_rate": 0.0004154347533782483, + "loss": 0.2043, + "step": 195500 + }, + { + "epoch": 8.1, + "grad_norm": 1.2109375, + "learning_rate": 0.0004154266222083499, + "loss": 0.2393, + "step": 195510 + }, + { + "epoch": 8.1, + "grad_norm": 1.5859375, + "learning_rate": 0.00041541849072713647, + "loss": 0.1656, + "step": 195520 + }, + { + "epoch": 8.1, + "grad_norm": 0.859375, + "learning_rate": 0.0004154103589346232, + "loss": 0.2225, + "step": 195530 + }, + { + "epoch": 8.1, + "grad_norm": 0.451171875, + "learning_rate": 0.0004154022268308256, + "loss": 0.1754, + "step": 195540 + }, + { + "epoch": 8.1, + "grad_norm": 0.77734375, + "learning_rate": 0.0004153940944157589, + "loss": 0.1865, + "step": 195550 + }, + { + "epoch": 8.1, + "grad_norm": 0.87890625, + "learning_rate": 0.0004153859616894383, + "loss": 0.1271, + "step": 195560 + }, + { + "epoch": 8.1, + "grad_norm": 1.0, + "learning_rate": 0.0004153778286518792, + "loss": 0.1835, + "step": 195570 + }, + { + "epoch": 8.1, + "grad_norm": 0.000507354736328125, + "learning_rate": 0.00041536969530309685, + "loss": 0.177, + "step": 195580 + }, + { + "epoch": 8.1, + "grad_norm": 0.84375, + "learning_rate": 0.0004153615616431067, + "loss": 0.2004, + "step": 195590 + }, + { + "epoch": 8.1, + "grad_norm": 1.21875, + "learning_rate": 0.00041535342767192384, + "loss": 0.2501, + "step": 195600 + }, + { + "epoch": 8.1, + "grad_norm": 0.76171875, + "learning_rate": 0.00041534529338956374, + "loss": 0.2355, + "step": 195610 + }, + { + "epoch": 8.1, + "grad_norm": 0.91015625, + "learning_rate": 0.00041533715879604164, + "loss": 0.2165, + "step": 195620 + }, + { + "epoch": 8.1, + "grad_norm": 1.7421875, + "learning_rate": 0.00041532902389137286, + "loss": 0.1932, + "step": 195630 + }, + { + "epoch": 8.1, + "grad_norm": 1.2890625, + "learning_rate": 0.0004153208886755727, + "loss": 0.2398, + "step": 195640 + }, + { + "epoch": 8.1, + "grad_norm": 0.59765625, + "learning_rate": 0.0004153127531486565, + "loss": 0.2295, + "step": 195650 + }, + { + "epoch": 8.1, + "grad_norm": 0.494140625, + "learning_rate": 0.00041530461731063953, + "loss": 0.2365, + "step": 195660 + }, + { + "epoch": 8.1, + "grad_norm": 1.0390625, + "learning_rate": 0.00041529648116153716, + "loss": 0.1935, + "step": 195670 + }, + { + "epoch": 8.11, + "grad_norm": 0.79296875, + "learning_rate": 0.00041528834470136466, + "loss": 0.2107, + "step": 195680 + }, + { + "epoch": 8.11, + "grad_norm": 1.0, + "learning_rate": 0.0004152802079301373, + "loss": 0.2143, + "step": 195690 + }, + { + "epoch": 8.11, + "grad_norm": 0.28125, + "learning_rate": 0.00041527207084787043, + "loss": 0.2317, + "step": 195700 + }, + { + "epoch": 8.11, + "grad_norm": 1.046875, + "learning_rate": 0.00041526393345457945, + "loss": 0.1865, + "step": 195710 + }, + { + "epoch": 8.11, + "grad_norm": 0.42578125, + "learning_rate": 0.0004152557957502795, + "loss": 0.2166, + "step": 195720 + }, + { + "epoch": 8.11, + "grad_norm": 0.97265625, + "learning_rate": 0.0004152476577349861, + "loss": 0.2129, + "step": 195730 + }, + { + "epoch": 8.11, + "grad_norm": 0.78125, + "learning_rate": 0.00041523951940871436, + "loss": 0.1585, + "step": 195740 + }, + { + "epoch": 8.11, + "grad_norm": 0.8359375, + "learning_rate": 0.00041523138077147973, + "loss": 0.2063, + "step": 195750 + }, + { + "epoch": 8.11, + "grad_norm": 0.52734375, + "learning_rate": 0.00041522324182329744, + "loss": 0.2104, + "step": 195760 + }, + { + "epoch": 8.11, + "grad_norm": 0.5625, + "learning_rate": 0.0004152151025641829, + "loss": 0.2039, + "step": 195770 + }, + { + "epoch": 8.11, + "grad_norm": 0.2080078125, + "learning_rate": 0.0004152069629941513, + "loss": 0.1849, + "step": 195780 + }, + { + "epoch": 8.11, + "grad_norm": 0.58984375, + "learning_rate": 0.0004151988231132181, + "loss": 0.1526, + "step": 195790 + }, + { + "epoch": 8.11, + "grad_norm": 0.5703125, + "learning_rate": 0.0004151906829213986, + "loss": 0.206, + "step": 195800 + }, + { + "epoch": 8.11, + "grad_norm": 0.73828125, + "learning_rate": 0.000415182542418708, + "loss": 0.2432, + "step": 195810 + }, + { + "epoch": 8.11, + "grad_norm": 0.50390625, + "learning_rate": 0.00041517440160516176, + "loss": 0.2051, + "step": 195820 + }, + { + "epoch": 8.11, + "grad_norm": 1.03125, + "learning_rate": 0.0004151662604807751, + "loss": 0.1754, + "step": 195830 + }, + { + "epoch": 8.11, + "grad_norm": 1.125, + "learning_rate": 0.00041515811904556334, + "loss": 0.2155, + "step": 195840 + }, + { + "epoch": 8.11, + "grad_norm": 0.55078125, + "learning_rate": 0.0004151499772995418, + "loss": 0.1969, + "step": 195850 + }, + { + "epoch": 8.11, + "grad_norm": 0.08056640625, + "learning_rate": 0.00041514183524272595, + "loss": 0.1636, + "step": 195860 + }, + { + "epoch": 8.11, + "grad_norm": 0.68359375, + "learning_rate": 0.00041513369287513095, + "loss": 0.1921, + "step": 195870 + }, + { + "epoch": 8.11, + "grad_norm": 0.94921875, + "learning_rate": 0.0004151255501967722, + "loss": 0.1557, + "step": 195880 + }, + { + "epoch": 8.11, + "grad_norm": 1.0859375, + "learning_rate": 0.000415117407207665, + "loss": 0.2808, + "step": 195890 + }, + { + "epoch": 8.11, + "grad_norm": 0.4375, + "learning_rate": 0.00041510926390782467, + "loss": 0.1685, + "step": 195900 + }, + { + "epoch": 8.11, + "grad_norm": 0.70703125, + "learning_rate": 0.0004151011202972664, + "loss": 0.1572, + "step": 195910 + }, + { + "epoch": 8.11, + "grad_norm": 0.55859375, + "learning_rate": 0.00041509297637600586, + "loss": 0.2082, + "step": 195920 + }, + { + "epoch": 8.12, + "grad_norm": 0.5859375, + "learning_rate": 0.000415084832144058, + "loss": 0.1701, + "step": 195930 + }, + { + "epoch": 8.12, + "grad_norm": 0.5703125, + "learning_rate": 0.00041507668760143836, + "loss": 0.2067, + "step": 195940 + }, + { + "epoch": 8.12, + "grad_norm": 1.234375, + "learning_rate": 0.0004150685427481623, + "loss": 0.2599, + "step": 195950 + }, + { + "epoch": 8.12, + "grad_norm": 1.0078125, + "learning_rate": 0.00041506039758424496, + "loss": 0.2048, + "step": 195960 + }, + { + "epoch": 8.12, + "grad_norm": 0.67578125, + "learning_rate": 0.0004150522521097019, + "loss": 0.1604, + "step": 195970 + }, + { + "epoch": 8.12, + "grad_norm": 0.67578125, + "learning_rate": 0.0004150441063245482, + "loss": 0.1998, + "step": 195980 + }, + { + "epoch": 8.12, + "grad_norm": 0.458984375, + "learning_rate": 0.0004150359602287994, + "loss": 0.2078, + "step": 195990 + }, + { + "epoch": 8.12, + "grad_norm": 0.76171875, + "learning_rate": 0.00041502781382247067, + "loss": 0.2249, + "step": 196000 + }, + { + "epoch": 8.12, + "grad_norm": 1.96875, + "learning_rate": 0.0004150196671055774, + "loss": 0.2146, + "step": 196010 + }, + { + "epoch": 8.12, + "grad_norm": 0.63671875, + "learning_rate": 0.00041501152007813503, + "loss": 0.2526, + "step": 196020 + }, + { + "epoch": 8.12, + "grad_norm": 0.71875, + "learning_rate": 0.0004150033727401588, + "loss": 0.2569, + "step": 196030 + }, + { + "epoch": 8.12, + "grad_norm": 1.0234375, + "learning_rate": 0.000414995225091664, + "loss": 0.1891, + "step": 196040 + }, + { + "epoch": 8.12, + "grad_norm": 0.69140625, + "learning_rate": 0.000414987077132666, + "loss": 0.1912, + "step": 196050 + }, + { + "epoch": 8.12, + "grad_norm": 0.7578125, + "learning_rate": 0.0004149789288631801, + "loss": 0.1906, + "step": 196060 + }, + { + "epoch": 8.12, + "grad_norm": 0.80078125, + "learning_rate": 0.0004149707802832218, + "loss": 0.216, + "step": 196070 + }, + { + "epoch": 8.12, + "grad_norm": 1.234375, + "learning_rate": 0.0004149626313928062, + "loss": 0.2378, + "step": 196080 + }, + { + "epoch": 8.12, + "grad_norm": 0.859375, + "learning_rate": 0.00041495448219194874, + "loss": 0.1941, + "step": 196090 + }, + { + "epoch": 8.12, + "grad_norm": 0.6953125, + "learning_rate": 0.0004149463326806648, + "loss": 0.2482, + "step": 196100 + }, + { + "epoch": 8.12, + "grad_norm": 0.796875, + "learning_rate": 0.00041493818285896967, + "loss": 0.1472, + "step": 196110 + }, + { + "epoch": 8.12, + "grad_norm": 0.5546875, + "learning_rate": 0.0004149300327268787, + "loss": 0.2039, + "step": 196120 + }, + { + "epoch": 8.12, + "grad_norm": 1.46875, + "learning_rate": 0.0004149218822844072, + "loss": 0.1763, + "step": 196130 + }, + { + "epoch": 8.12, + "grad_norm": 1.8984375, + "learning_rate": 0.0004149137315315705, + "loss": 0.2315, + "step": 196140 + }, + { + "epoch": 8.12, + "grad_norm": 0.337890625, + "learning_rate": 0.000414905580468384, + "loss": 0.2126, + "step": 196150 + }, + { + "epoch": 8.12, + "grad_norm": 1.4453125, + "learning_rate": 0.000414897429094863, + "loss": 0.1942, + "step": 196160 + }, + { + "epoch": 8.13, + "grad_norm": 0.69140625, + "learning_rate": 0.0004148892774110228, + "loss": 0.2567, + "step": 196170 + }, + { + "epoch": 8.13, + "grad_norm": 1.2890625, + "learning_rate": 0.00041488112541687884, + "loss": 0.2111, + "step": 196180 + }, + { + "epoch": 8.13, + "grad_norm": 1.2421875, + "learning_rate": 0.0004148729731124464, + "loss": 0.1883, + "step": 196190 + }, + { + "epoch": 8.13, + "grad_norm": 0.482421875, + "learning_rate": 0.0004148648204977408, + "loss": 0.1783, + "step": 196200 + }, + { + "epoch": 8.13, + "grad_norm": 0.490234375, + "learning_rate": 0.0004148566675727775, + "loss": 0.231, + "step": 196210 + }, + { + "epoch": 8.13, + "grad_norm": 0.828125, + "learning_rate": 0.00041484851433757166, + "loss": 0.261, + "step": 196220 + }, + { + "epoch": 8.13, + "grad_norm": 0.65234375, + "learning_rate": 0.00041484036079213877, + "loss": 0.2234, + "step": 196230 + }, + { + "epoch": 8.13, + "grad_norm": 1.34375, + "learning_rate": 0.00041483220693649416, + "loss": 0.2073, + "step": 196240 + }, + { + "epoch": 8.13, + "grad_norm": 0.427734375, + "learning_rate": 0.000414824052770653, + "loss": 0.2049, + "step": 196250 + }, + { + "epoch": 8.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00041481589829463086, + "loss": 0.2107, + "step": 196260 + }, + { + "epoch": 8.13, + "grad_norm": 0.1982421875, + "learning_rate": 0.00041480774350844297, + "loss": 0.1694, + "step": 196270 + }, + { + "epoch": 8.13, + "grad_norm": 1.03125, + "learning_rate": 0.0004147995884121047, + "loss": 0.2183, + "step": 196280 + }, + { + "epoch": 8.13, + "grad_norm": 0.9921875, + "learning_rate": 0.0004147914330056315, + "loss": 0.2139, + "step": 196290 + }, + { + "epoch": 8.13, + "grad_norm": 0.80078125, + "learning_rate": 0.0004147832772890385, + "loss": 0.232, + "step": 196300 + }, + { + "epoch": 8.13, + "grad_norm": 0.9765625, + "learning_rate": 0.0004147751212623413, + "loss": 0.2073, + "step": 196310 + }, + { + "epoch": 8.13, + "grad_norm": 0.408203125, + "learning_rate": 0.000414766964925555, + "loss": 0.1817, + "step": 196320 + }, + { + "epoch": 8.13, + "grad_norm": 0.8828125, + "learning_rate": 0.00041475880827869505, + "loss": 0.2319, + "step": 196330 + }, + { + "epoch": 8.13, + "grad_norm": 0.205078125, + "learning_rate": 0.0004147506513217769, + "loss": 0.2056, + "step": 196340 + }, + { + "epoch": 8.13, + "grad_norm": 1.953125, + "learning_rate": 0.0004147424940548158, + "loss": 0.2016, + "step": 196350 + }, + { + "epoch": 8.13, + "grad_norm": 0.53125, + "learning_rate": 0.0004147343364778271, + "loss": 0.1979, + "step": 196360 + }, + { + "epoch": 8.13, + "grad_norm": 1.609375, + "learning_rate": 0.0004147261785908262, + "loss": 0.2177, + "step": 196370 + }, + { + "epoch": 8.13, + "grad_norm": 0.625, + "learning_rate": 0.0004147180203938283, + "loss": 0.2045, + "step": 196380 + }, + { + "epoch": 8.13, + "grad_norm": 0.53515625, + "learning_rate": 0.00041470986188684903, + "loss": 0.1959, + "step": 196390 + }, + { + "epoch": 8.13, + "grad_norm": 0.67578125, + "learning_rate": 0.0004147017030699034, + "loss": 0.1971, + "step": 196400 + }, + { + "epoch": 8.14, + "grad_norm": 0.271484375, + "learning_rate": 0.00041469354394300706, + "loss": 0.1655, + "step": 196410 + }, + { + "epoch": 8.14, + "grad_norm": 0.85546875, + "learning_rate": 0.0004146853845061753, + "loss": 0.1991, + "step": 196420 + }, + { + "epoch": 8.14, + "grad_norm": 0.66796875, + "learning_rate": 0.0004146772247594234, + "loss": 0.1935, + "step": 196430 + }, + { + "epoch": 8.14, + "grad_norm": 0.9140625, + "learning_rate": 0.0004146690647027668, + "loss": 0.2162, + "step": 196440 + }, + { + "epoch": 8.14, + "grad_norm": 0.9765625, + "learning_rate": 0.0004146609043362207, + "loss": 0.2151, + "step": 196450 + }, + { + "epoch": 8.14, + "grad_norm": 1.59375, + "learning_rate": 0.00041465274365980057, + "loss": 0.2095, + "step": 196460 + }, + { + "epoch": 8.14, + "grad_norm": 1.3125, + "learning_rate": 0.0004146445826735218, + "loss": 0.1703, + "step": 196470 + }, + { + "epoch": 8.14, + "grad_norm": 0.828125, + "learning_rate": 0.0004146364213773996, + "loss": 0.174, + "step": 196480 + }, + { + "epoch": 8.14, + "grad_norm": 0.97265625, + "learning_rate": 0.0004146282597714496, + "loss": 0.2413, + "step": 196490 + }, + { + "epoch": 8.14, + "grad_norm": 2.171875, + "learning_rate": 0.0004146200978556869, + "loss": 0.2063, + "step": 196500 + }, + { + "epoch": 8.14, + "grad_norm": 0.453125, + "learning_rate": 0.000414611935630127, + "loss": 0.2111, + "step": 196510 + }, + { + "epoch": 8.14, + "grad_norm": 0.8671875, + "learning_rate": 0.00041460377309478517, + "loss": 0.1523, + "step": 196520 + }, + { + "epoch": 8.14, + "grad_norm": 1.0078125, + "learning_rate": 0.0004145956102496768, + "loss": 0.1671, + "step": 196530 + }, + { + "epoch": 8.14, + "grad_norm": 1.8046875, + "learning_rate": 0.00041458744709481734, + "loss": 0.2131, + "step": 196540 + }, + { + "epoch": 8.14, + "grad_norm": 1.1015625, + "learning_rate": 0.00041457928363022205, + "loss": 0.1895, + "step": 196550 + }, + { + "epoch": 8.14, + "grad_norm": 0.470703125, + "learning_rate": 0.00041457111985590637, + "loss": 0.1819, + "step": 196560 + }, + { + "epoch": 8.14, + "grad_norm": 2.015625, + "learning_rate": 0.0004145629557718855, + "loss": 0.2092, + "step": 196570 + }, + { + "epoch": 8.14, + "grad_norm": 0.1943359375, + "learning_rate": 0.000414554791378175, + "loss": 0.1695, + "step": 196580 + }, + { + "epoch": 8.14, + "grad_norm": 1.3671875, + "learning_rate": 0.0004145466266747901, + "loss": 0.1801, + "step": 196590 + }, + { + "epoch": 8.14, + "grad_norm": 0.341796875, + "learning_rate": 0.00041453846166174626, + "loss": 0.1957, + "step": 196600 + }, + { + "epoch": 8.14, + "grad_norm": 0.56640625, + "learning_rate": 0.00041453029633905874, + "loss": 0.1925, + "step": 196610 + }, + { + "epoch": 8.14, + "grad_norm": 0.38671875, + "learning_rate": 0.0004145221307067431, + "loss": 0.2089, + "step": 196620 + }, + { + "epoch": 8.14, + "grad_norm": 0.98828125, + "learning_rate": 0.00041451396476481446, + "loss": 0.1556, + "step": 196630 + }, + { + "epoch": 8.14, + "grad_norm": 0.78125, + "learning_rate": 0.0004145057985132883, + "loss": 0.2267, + "step": 196640 + }, + { + "epoch": 8.15, + "grad_norm": 0.042236328125, + "learning_rate": 0.00041449763195218004, + "loss": 0.1482, + "step": 196650 + }, + { + "epoch": 8.15, + "grad_norm": 0.6015625, + "learning_rate": 0.000414489465081505, + "loss": 0.1895, + "step": 196660 + }, + { + "epoch": 8.15, + "grad_norm": 0.84765625, + "learning_rate": 0.00041448129790127853, + "loss": 0.1689, + "step": 196670 + }, + { + "epoch": 8.15, + "grad_norm": 0.69921875, + "learning_rate": 0.000414473130411516, + "loss": 0.2038, + "step": 196680 + }, + { + "epoch": 8.15, + "grad_norm": 0.97265625, + "learning_rate": 0.0004144649626122329, + "loss": 0.1823, + "step": 196690 + }, + { + "epoch": 8.15, + "grad_norm": 0.62109375, + "learning_rate": 0.00041445679450344443, + "loss": 0.1613, + "step": 196700 + }, + { + "epoch": 8.15, + "grad_norm": 0.76171875, + "learning_rate": 0.00041444862608516596, + "loss": 0.2023, + "step": 196710 + }, + { + "epoch": 8.15, + "grad_norm": 0.71875, + "learning_rate": 0.00041444045735741303, + "loss": 0.178, + "step": 196720 + }, + { + "epoch": 8.15, + "grad_norm": 0.57421875, + "learning_rate": 0.0004144322883202009, + "loss": 0.186, + "step": 196730 + }, + { + "epoch": 8.15, + "grad_norm": 0.58203125, + "learning_rate": 0.000414424118973545, + "loss": 0.1938, + "step": 196740 + }, + { + "epoch": 8.15, + "grad_norm": 0.71875, + "learning_rate": 0.00041441594931746064, + "loss": 0.1595, + "step": 196750 + }, + { + "epoch": 8.15, + "grad_norm": 0.5234375, + "learning_rate": 0.00041440777935196316, + "loss": 0.2186, + "step": 196760 + }, + { + "epoch": 8.15, + "grad_norm": 0.828125, + "learning_rate": 0.000414399609077068, + "loss": 0.2381, + "step": 196770 + }, + { + "epoch": 8.15, + "grad_norm": 1.0703125, + "learning_rate": 0.0004143914384927906, + "loss": 0.1884, + "step": 196780 + }, + { + "epoch": 8.15, + "grad_norm": 0.8046875, + "learning_rate": 0.00041438326759914623, + "loss": 0.1724, + "step": 196790 + }, + { + "epoch": 8.15, + "grad_norm": 0.478515625, + "learning_rate": 0.00041437509639615026, + "loss": 0.1848, + "step": 196800 + }, + { + "epoch": 8.15, + "grad_norm": 0.466796875, + "learning_rate": 0.00041436692488381824, + "loss": 0.1572, + "step": 196810 + }, + { + "epoch": 8.15, + "grad_norm": 1.2265625, + "learning_rate": 0.0004143587530621653, + "loss": 0.1567, + "step": 196820 + }, + { + "epoch": 8.15, + "grad_norm": 0.703125, + "learning_rate": 0.0004143505809312069, + "loss": 0.2161, + "step": 196830 + }, + { + "epoch": 8.15, + "grad_norm": 0.8046875, + "learning_rate": 0.0004143424084909586, + "loss": 0.1704, + "step": 196840 + }, + { + "epoch": 8.15, + "grad_norm": 0.6796875, + "learning_rate": 0.00041433423574143547, + "loss": 0.1952, + "step": 196850 + }, + { + "epoch": 8.15, + "grad_norm": 0.6015625, + "learning_rate": 0.00041432606268265314, + "loss": 0.2483, + "step": 196860 + }, + { + "epoch": 8.15, + "grad_norm": 0.404296875, + "learning_rate": 0.0004143178893146269, + "loss": 0.1448, + "step": 196870 + }, + { + "epoch": 8.15, + "grad_norm": 0.6484375, + "learning_rate": 0.0004143097156373722, + "loss": 0.1624, + "step": 196880 + }, + { + "epoch": 8.16, + "grad_norm": 0.8046875, + "learning_rate": 0.00041430154165090425, + "loss": 0.2051, + "step": 196890 + }, + { + "epoch": 8.16, + "grad_norm": 0.859375, + "learning_rate": 0.0004142933673552386, + "loss": 0.2096, + "step": 196900 + }, + { + "epoch": 8.16, + "grad_norm": 0.466796875, + "learning_rate": 0.00041428519275039054, + "loss": 0.1923, + "step": 196910 + }, + { + "epoch": 8.16, + "grad_norm": 3.1875, + "learning_rate": 0.0004142770178363756, + "loss": 0.2154, + "step": 196920 + }, + { + "epoch": 8.16, + "grad_norm": 0.5390625, + "learning_rate": 0.0004142688426132089, + "loss": 0.2177, + "step": 196930 + }, + { + "epoch": 8.16, + "grad_norm": 0.66015625, + "learning_rate": 0.00041426066708090604, + "loss": 0.1958, + "step": 196940 + }, + { + "epoch": 8.16, + "grad_norm": 0.703125, + "learning_rate": 0.0004142524912394824, + "loss": 0.1546, + "step": 196950 + }, + { + "epoch": 8.16, + "grad_norm": 0.353515625, + "learning_rate": 0.0004142443150889532, + "loss": 0.2034, + "step": 196960 + }, + { + "epoch": 8.16, + "grad_norm": 0.61328125, + "learning_rate": 0.0004142361386293341, + "loss": 0.1807, + "step": 196970 + }, + { + "epoch": 8.16, + "grad_norm": 0.73046875, + "learning_rate": 0.00041422796186064016, + "loss": 0.2167, + "step": 196980 + }, + { + "epoch": 8.16, + "grad_norm": 0.369140625, + "learning_rate": 0.000414219784782887, + "loss": 0.2728, + "step": 196990 + }, + { + "epoch": 8.16, + "grad_norm": 0.5, + "learning_rate": 0.0004142116073960899, + "loss": 0.1895, + "step": 197000 + }, + { + "epoch": 8.16, + "grad_norm": 1.0390625, + "learning_rate": 0.00041420342970026433, + "loss": 0.2146, + "step": 197010 + }, + { + "epoch": 8.16, + "grad_norm": 0.890625, + "learning_rate": 0.0004141952516954256, + "loss": 0.1762, + "step": 197020 + }, + { + "epoch": 8.16, + "grad_norm": 0.8125, + "learning_rate": 0.00041418707338158916, + "loss": 0.1725, + "step": 197030 + }, + { + "epoch": 8.16, + "grad_norm": 0.62109375, + "learning_rate": 0.0004141788947587704, + "loss": 0.1913, + "step": 197040 + }, + { + "epoch": 8.16, + "grad_norm": 1.75, + "learning_rate": 0.00041417071582698466, + "loss": 0.1981, + "step": 197050 + }, + { + "epoch": 8.16, + "grad_norm": 2.46875, + "learning_rate": 0.00041416253658624737, + "loss": 0.2273, + "step": 197060 + }, + { + "epoch": 8.16, + "grad_norm": 0.373046875, + "learning_rate": 0.00041415435703657394, + "loss": 0.2004, + "step": 197070 + }, + { + "epoch": 8.16, + "grad_norm": 1.375, + "learning_rate": 0.00041414617717797975, + "loss": 0.2538, + "step": 197080 + }, + { + "epoch": 8.16, + "grad_norm": 0.64453125, + "learning_rate": 0.00041413799701048006, + "loss": 0.1833, + "step": 197090 + }, + { + "epoch": 8.16, + "grad_norm": 0.87109375, + "learning_rate": 0.00041412981653409056, + "loss": 0.1936, + "step": 197100 + }, + { + "epoch": 8.16, + "grad_norm": 1.03125, + "learning_rate": 0.00041412163574882635, + "loss": 0.2024, + "step": 197110 + }, + { + "epoch": 8.16, + "grad_norm": 0.69140625, + "learning_rate": 0.00041411345465470296, + "loss": 0.2051, + "step": 197120 + }, + { + "epoch": 8.17, + "grad_norm": 0.578125, + "learning_rate": 0.00041410527325173575, + "loss": 0.1816, + "step": 197130 + }, + { + "epoch": 8.17, + "grad_norm": 0.73046875, + "learning_rate": 0.00041409709153994017, + "loss": 0.2111, + "step": 197140 + }, + { + "epoch": 8.17, + "grad_norm": 0.578125, + "learning_rate": 0.00041408890951933166, + "loss": 0.1749, + "step": 197150 + }, + { + "epoch": 8.17, + "grad_norm": 0.478515625, + "learning_rate": 0.0004140807271899254, + "loss": 0.1911, + "step": 197160 + }, + { + "epoch": 8.17, + "grad_norm": 0.6171875, + "learning_rate": 0.00041407254455173707, + "loss": 0.1694, + "step": 197170 + }, + { + "epoch": 8.17, + "grad_norm": 0.765625, + "learning_rate": 0.0004140643616047818, + "loss": 0.1752, + "step": 197180 + }, + { + "epoch": 8.17, + "grad_norm": 0.9296875, + "learning_rate": 0.00041405617834907514, + "loss": 0.21, + "step": 197190 + }, + { + "epoch": 8.17, + "grad_norm": 0.55859375, + "learning_rate": 0.00041404799478463256, + "loss": 0.1894, + "step": 197200 + }, + { + "epoch": 8.17, + "grad_norm": 0.8515625, + "learning_rate": 0.0004140398109114693, + "loss": 0.233, + "step": 197210 + }, + { + "epoch": 8.17, + "grad_norm": 0.7734375, + "learning_rate": 0.0004140316267296008, + "loss": 0.216, + "step": 197220 + }, + { + "epoch": 8.17, + "grad_norm": 0.5859375, + "learning_rate": 0.0004140234422390426, + "loss": 0.1997, + "step": 197230 + }, + { + "epoch": 8.17, + "grad_norm": 0.4609375, + "learning_rate": 0.0004140152574398099, + "loss": 0.2076, + "step": 197240 + }, + { + "epoch": 8.17, + "grad_norm": 0.58203125, + "learning_rate": 0.00041400707233191827, + "loss": 0.2488, + "step": 197250 + }, + { + "epoch": 8.17, + "grad_norm": 0.546875, + "learning_rate": 0.000413998886915383, + "loss": 0.2534, + "step": 197260 + }, + { + "epoch": 8.17, + "grad_norm": 0.423828125, + "learning_rate": 0.00041399070119021953, + "loss": 0.1961, + "step": 197270 + }, + { + "epoch": 8.17, + "grad_norm": 0.6875, + "learning_rate": 0.00041398251515644324, + "loss": 0.2252, + "step": 197280 + }, + { + "epoch": 8.17, + "grad_norm": 0.75, + "learning_rate": 0.00041397432881406967, + "loss": 0.2079, + "step": 197290 + }, + { + "epoch": 8.17, + "grad_norm": 0.408203125, + "learning_rate": 0.000413966142163114, + "loss": 0.1687, + "step": 197300 + }, + { + "epoch": 8.17, + "grad_norm": 0.84375, + "learning_rate": 0.0004139579552035918, + "loss": 0.223, + "step": 197310 + }, + { + "epoch": 8.17, + "grad_norm": 0.494140625, + "learning_rate": 0.0004139497679355185, + "loss": 0.1891, + "step": 197320 + }, + { + "epoch": 8.17, + "grad_norm": 0.62109375, + "learning_rate": 0.0004139415803589094, + "loss": 0.1888, + "step": 197330 + }, + { + "epoch": 8.17, + "grad_norm": 0.6796875, + "learning_rate": 0.00041393339247377994, + "loss": 0.2057, + "step": 197340 + }, + { + "epoch": 8.17, + "grad_norm": 0.6015625, + "learning_rate": 0.00041392520428014547, + "loss": 0.2193, + "step": 197350 + }, + { + "epoch": 8.17, + "grad_norm": 0.74609375, + "learning_rate": 0.00041391701577802155, + "loss": 0.2181, + "step": 197360 + }, + { + "epoch": 8.18, + "grad_norm": 1.1953125, + "learning_rate": 0.0004139088269674235, + "loss": 0.2078, + "step": 197370 + }, + { + "epoch": 8.18, + "grad_norm": 0.7890625, + "learning_rate": 0.0004139006378483667, + "loss": 0.2108, + "step": 197380 + }, + { + "epoch": 8.18, + "grad_norm": 0.921875, + "learning_rate": 0.0004138924484208667, + "loss": 0.1981, + "step": 197390 + }, + { + "epoch": 8.18, + "grad_norm": 1.109375, + "learning_rate": 0.0004138842586849387, + "loss": 0.1968, + "step": 197400 + }, + { + "epoch": 8.18, + "grad_norm": 0.625, + "learning_rate": 0.00041387606864059823, + "loss": 0.1828, + "step": 197410 + }, + { + "epoch": 8.18, + "grad_norm": 0.70703125, + "learning_rate": 0.0004138678782878608, + "loss": 0.1988, + "step": 197420 + }, + { + "epoch": 8.18, + "grad_norm": 1.1875, + "learning_rate": 0.00041385968762674166, + "loss": 0.208, + "step": 197430 + }, + { + "epoch": 8.18, + "grad_norm": 1.359375, + "learning_rate": 0.0004138514966572562, + "loss": 0.1583, + "step": 197440 + }, + { + "epoch": 8.18, + "grad_norm": 0.9765625, + "learning_rate": 0.00041384330537941994, + "loss": 0.1723, + "step": 197450 + }, + { + "epoch": 8.18, + "grad_norm": 2.40625, + "learning_rate": 0.00041383511379324835, + "loss": 0.2631, + "step": 197460 + }, + { + "epoch": 8.18, + "grad_norm": 0.94921875, + "learning_rate": 0.00041382692189875683, + "loss": 0.1939, + "step": 197470 + }, + { + "epoch": 8.18, + "grad_norm": 0.83203125, + "learning_rate": 0.0004138187296959606, + "loss": 0.1501, + "step": 197480 + }, + { + "epoch": 8.18, + "grad_norm": 0.6484375, + "learning_rate": 0.0004138105371848753, + "loss": 0.2162, + "step": 197490 + }, + { + "epoch": 8.18, + "grad_norm": 0.9609375, + "learning_rate": 0.00041380234436551614, + "loss": 0.251, + "step": 197500 + }, + { + "epoch": 8.18, + "grad_norm": 0.56640625, + "learning_rate": 0.00041379415123789877, + "loss": 0.2084, + "step": 197510 + }, + { + "epoch": 8.18, + "grad_norm": 1.40625, + "learning_rate": 0.00041378595780203843, + "loss": 0.2502, + "step": 197520 + }, + { + "epoch": 8.18, + "grad_norm": 0.74609375, + "learning_rate": 0.00041377776405795064, + "loss": 0.2145, + "step": 197530 + }, + { + "epoch": 8.18, + "grad_norm": 0.365234375, + "learning_rate": 0.00041376957000565073, + "loss": 0.1806, + "step": 197540 + }, + { + "epoch": 8.18, + "grad_norm": 0.66015625, + "learning_rate": 0.00041376137564515426, + "loss": 0.2053, + "step": 197550 + }, + { + "epoch": 8.18, + "grad_norm": 0.91796875, + "learning_rate": 0.00041375318097647653, + "loss": 0.22, + "step": 197560 + }, + { + "epoch": 8.18, + "grad_norm": 1.078125, + "learning_rate": 0.00041374498599963303, + "loss": 0.1881, + "step": 197570 + }, + { + "epoch": 8.18, + "grad_norm": 0.69140625, + "learning_rate": 0.0004137367907146391, + "loss": 0.2034, + "step": 197580 + }, + { + "epoch": 8.18, + "grad_norm": 0.322265625, + "learning_rate": 0.0004137285951215102, + "loss": 0.1633, + "step": 197590 + }, + { + "epoch": 8.18, + "grad_norm": 1.640625, + "learning_rate": 0.00041372039922026185, + "loss": 0.2066, + "step": 197600 + }, + { + "epoch": 8.18, + "grad_norm": 0.26953125, + "learning_rate": 0.0004137122030109093, + "loss": 0.187, + "step": 197610 + }, + { + "epoch": 8.19, + "grad_norm": 1.2109375, + "learning_rate": 0.00041370400649346807, + "loss": 0.1869, + "step": 197620 + }, + { + "epoch": 8.19, + "grad_norm": 1.1484375, + "learning_rate": 0.0004136958096679536, + "loss": 0.128, + "step": 197630 + }, + { + "epoch": 8.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00041368761253438133, + "loss": 0.2432, + "step": 197640 + }, + { + "epoch": 8.19, + "grad_norm": 0.48828125, + "learning_rate": 0.0004136794150927666, + "loss": 0.2284, + "step": 197650 + }, + { + "epoch": 8.19, + "grad_norm": 1.3359375, + "learning_rate": 0.0004136712173431249, + "loss": 0.2422, + "step": 197660 + }, + { + "epoch": 8.19, + "grad_norm": 0.70703125, + "learning_rate": 0.0004136630192854717, + "loss": 0.1879, + "step": 197670 + }, + { + "epoch": 8.19, + "grad_norm": 0.419921875, + "learning_rate": 0.0004136548209198223, + "loss": 0.2181, + "step": 197680 + }, + { + "epoch": 8.19, + "grad_norm": 0.453125, + "learning_rate": 0.0004136466222461922, + "loss": 0.1582, + "step": 197690 + }, + { + "epoch": 8.19, + "grad_norm": 1.671875, + "learning_rate": 0.00041363842326459685, + "loss": 0.2614, + "step": 197700 + }, + { + "epoch": 8.19, + "grad_norm": 0.796875, + "learning_rate": 0.0004136302239750516, + "loss": 0.2801, + "step": 197710 + }, + { + "epoch": 8.19, + "grad_norm": 0.61328125, + "learning_rate": 0.000413622024377572, + "loss": 0.1902, + "step": 197720 + }, + { + "epoch": 8.19, + "grad_norm": 0.6953125, + "learning_rate": 0.00041361382447217343, + "loss": 0.2008, + "step": 197730 + }, + { + "epoch": 8.19, + "grad_norm": 0.8671875, + "learning_rate": 0.0004136056242588713, + "loss": 0.2243, + "step": 197740 + }, + { + "epoch": 8.19, + "grad_norm": 0.8125, + "learning_rate": 0.00041359742373768104, + "loss": 0.1737, + "step": 197750 + }, + { + "epoch": 8.19, + "grad_norm": 0.55078125, + "learning_rate": 0.0004135892229086181, + "loss": 0.1804, + "step": 197760 + }, + { + "epoch": 8.19, + "grad_norm": 0.7109375, + "learning_rate": 0.00041358102177169794, + "loss": 0.1958, + "step": 197770 + }, + { + "epoch": 8.19, + "grad_norm": 0.5234375, + "learning_rate": 0.00041357282032693595, + "loss": 0.1987, + "step": 197780 + }, + { + "epoch": 8.19, + "grad_norm": 0.82421875, + "learning_rate": 0.00041356461857434756, + "loss": 0.2558, + "step": 197790 + }, + { + "epoch": 8.19, + "grad_norm": 0.51171875, + "learning_rate": 0.0004135564165139482, + "loss": 0.1859, + "step": 197800 + }, + { + "epoch": 8.19, + "grad_norm": 0.41015625, + "learning_rate": 0.0004135482141457534, + "loss": 0.178, + "step": 197810 + }, + { + "epoch": 8.19, + "grad_norm": 0.92578125, + "learning_rate": 0.0004135400114697784, + "loss": 0.2045, + "step": 197820 + }, + { + "epoch": 8.19, + "grad_norm": 1.859375, + "learning_rate": 0.0004135318084860389, + "loss": 0.2191, + "step": 197830 + }, + { + "epoch": 8.19, + "grad_norm": 0.53125, + "learning_rate": 0.00041352360519455013, + "loss": 0.1903, + "step": 197840 + }, + { + "epoch": 8.19, + "grad_norm": 1.6484375, + "learning_rate": 0.0004135154015953275, + "loss": 0.236, + "step": 197850 + }, + { + "epoch": 8.2, + "grad_norm": 0.73828125, + "learning_rate": 0.00041350719768838673, + "loss": 0.2044, + "step": 197860 + }, + { + "epoch": 8.2, + "grad_norm": 0.5, + "learning_rate": 0.00041349899347374293, + "loss": 0.173, + "step": 197870 + }, + { + "epoch": 8.2, + "grad_norm": 1.0078125, + "learning_rate": 0.0004134907889514117, + "loss": 0.192, + "step": 197880 + }, + { + "epoch": 8.2, + "grad_norm": 0.9765625, + "learning_rate": 0.00041348258412140853, + "loss": 0.2437, + "step": 197890 + }, + { + "epoch": 8.2, + "grad_norm": 0.5546875, + "learning_rate": 0.0004134743789837487, + "loss": 0.1921, + "step": 197900 + }, + { + "epoch": 8.2, + "grad_norm": 2.453125, + "learning_rate": 0.00041346617353844786, + "loss": 0.2221, + "step": 197910 + }, + { + "epoch": 8.2, + "grad_norm": 0.70703125, + "learning_rate": 0.00041345796778552126, + "loss": 0.2053, + "step": 197920 + }, + { + "epoch": 8.2, + "grad_norm": 0.66796875, + "learning_rate": 0.0004134497617249844, + "loss": 0.2394, + "step": 197930 + }, + { + "epoch": 8.2, + "grad_norm": 0.328125, + "learning_rate": 0.0004134415553568528, + "loss": 0.1668, + "step": 197940 + }, + { + "epoch": 8.2, + "grad_norm": 0.375, + "learning_rate": 0.0004134333486811418, + "loss": 0.1704, + "step": 197950 + }, + { + "epoch": 8.2, + "grad_norm": 1.359375, + "learning_rate": 0.0004134251416978668, + "loss": 0.2072, + "step": 197960 + }, + { + "epoch": 8.2, + "grad_norm": 1.7578125, + "learning_rate": 0.00041341693440704355, + "loss": 0.2209, + "step": 197970 + }, + { + "epoch": 8.2, + "grad_norm": 0.57421875, + "learning_rate": 0.00041340872680868706, + "loss": 0.222, + "step": 197980 + }, + { + "epoch": 8.2, + "grad_norm": 0.875, + "learning_rate": 0.00041340051890281315, + "loss": 0.2159, + "step": 197990 + }, + { + "epoch": 8.2, + "grad_norm": 0.341796875, + "learning_rate": 0.00041339231068943696, + "loss": 0.2018, + "step": 198000 + }, + { + "epoch": 8.2, + "grad_norm": 0.40234375, + "learning_rate": 0.0004133841021685742, + "loss": 0.2018, + "step": 198010 + }, + { + "epoch": 8.2, + "grad_norm": 0.58984375, + "learning_rate": 0.00041337589334024017, + "loss": 0.2021, + "step": 198020 + }, + { + "epoch": 8.2, + "grad_norm": 0.66796875, + "learning_rate": 0.00041336768420445035, + "loss": 0.1871, + "step": 198030 + }, + { + "epoch": 8.2, + "grad_norm": 0.8671875, + "learning_rate": 0.0004133594747612202, + "loss": 0.1687, + "step": 198040 + }, + { + "epoch": 8.2, + "grad_norm": 0.609375, + "learning_rate": 0.0004133512650105652, + "loss": 0.2187, + "step": 198050 + }, + { + "epoch": 8.2, + "grad_norm": 0.92578125, + "learning_rate": 0.0004133430549525007, + "loss": 0.1678, + "step": 198060 + }, + { + "epoch": 8.2, + "grad_norm": 1.0234375, + "learning_rate": 0.0004133348445870422, + "loss": 0.2061, + "step": 198070 + }, + { + "epoch": 8.2, + "grad_norm": 0.92578125, + "learning_rate": 0.00041332663391420514, + "loss": 0.1957, + "step": 198080 + }, + { + "epoch": 8.2, + "grad_norm": 1.296875, + "learning_rate": 0.00041331842293400507, + "loss": 0.2441, + "step": 198090 + }, + { + "epoch": 8.21, + "grad_norm": 0.2890625, + "learning_rate": 0.0004133102116464573, + "loss": 0.2228, + "step": 198100 + }, + { + "epoch": 8.21, + "grad_norm": 0.80078125, + "learning_rate": 0.00041330200005157744, + "loss": 0.1898, + "step": 198110 + }, + { + "epoch": 8.21, + "grad_norm": 0.58984375, + "learning_rate": 0.0004132937881493808, + "loss": 0.2239, + "step": 198120 + }, + { + "epoch": 8.21, + "grad_norm": 0.93359375, + "learning_rate": 0.0004132855759398828, + "loss": 0.1924, + "step": 198130 + }, + { + "epoch": 8.21, + "grad_norm": 1.09375, + "learning_rate": 0.00041327736342309905, + "loss": 0.2187, + "step": 198140 + }, + { + "epoch": 8.21, + "grad_norm": 0.255859375, + "learning_rate": 0.000413269150599045, + "loss": 0.1801, + "step": 198150 + }, + { + "epoch": 8.21, + "grad_norm": 0.2578125, + "learning_rate": 0.00041326093746773596, + "loss": 0.1768, + "step": 198160 + }, + { + "epoch": 8.21, + "grad_norm": 0.81640625, + "learning_rate": 0.00041325272402918754, + "loss": 0.1679, + "step": 198170 + }, + { + "epoch": 8.21, + "grad_norm": 0.8125, + "learning_rate": 0.000413244510283415, + "loss": 0.1932, + "step": 198180 + }, + { + "epoch": 8.21, + "grad_norm": 0.765625, + "learning_rate": 0.000413236296230434, + "loss": 0.2165, + "step": 198190 + }, + { + "epoch": 8.21, + "grad_norm": 0.6796875, + "learning_rate": 0.00041322808187025994, + "loss": 0.2155, + "step": 198200 + }, + { + "epoch": 8.21, + "grad_norm": 1.0546875, + "learning_rate": 0.00041321986720290816, + "loss": 0.1966, + "step": 198210 + }, + { + "epoch": 8.21, + "grad_norm": 0.73828125, + "learning_rate": 0.0004132116522283943, + "loss": 0.2366, + "step": 198220 + }, + { + "epoch": 8.21, + "grad_norm": 1.015625, + "learning_rate": 0.0004132034369467338, + "loss": 0.1997, + "step": 198230 + }, + { + "epoch": 8.21, + "grad_norm": 1.03125, + "learning_rate": 0.0004131952213579419, + "loss": 0.1831, + "step": 198240 + }, + { + "epoch": 8.21, + "grad_norm": 0.55078125, + "learning_rate": 0.0004131870054620343, + "loss": 0.2482, + "step": 198250 + }, + { + "epoch": 8.21, + "grad_norm": 0.625, + "learning_rate": 0.0004131787892590264, + "loss": 0.1967, + "step": 198260 + }, + { + "epoch": 8.21, + "grad_norm": 1.0, + "learning_rate": 0.0004131705727489336, + "loss": 0.2158, + "step": 198270 + }, + { + "epoch": 8.21, + "grad_norm": 1.9609375, + "learning_rate": 0.00041316235593177143, + "loss": 0.2122, + "step": 198280 + }, + { + "epoch": 8.21, + "grad_norm": 1.0703125, + "learning_rate": 0.00041315413880755523, + "loss": 0.2306, + "step": 198290 + }, + { + "epoch": 8.21, + "grad_norm": 1.5078125, + "learning_rate": 0.0004131459213763007, + "loss": 0.2558, + "step": 198300 + }, + { + "epoch": 8.21, + "grad_norm": 0.6015625, + "learning_rate": 0.0004131377036380231, + "loss": 0.2298, + "step": 198310 + }, + { + "epoch": 8.21, + "grad_norm": 0.9453125, + "learning_rate": 0.00041312948559273793, + "loss": 0.2149, + "step": 198320 + }, + { + "epoch": 8.21, + "grad_norm": 0.6953125, + "learning_rate": 0.00041312126724046074, + "loss": 0.1966, + "step": 198330 + }, + { + "epoch": 8.22, + "grad_norm": 0.6171875, + "learning_rate": 0.00041311304858120693, + "loss": 0.2089, + "step": 198340 + }, + { + "epoch": 8.22, + "grad_norm": 1.6875, + "learning_rate": 0.00041310482961499196, + "loss": 0.1666, + "step": 198350 + }, + { + "epoch": 8.22, + "grad_norm": 2.546875, + "learning_rate": 0.00041309661034183133, + "loss": 0.1877, + "step": 198360 + }, + { + "epoch": 8.22, + "grad_norm": 0.80859375, + "learning_rate": 0.00041308839076174044, + "loss": 0.2323, + "step": 198370 + }, + { + "epoch": 8.22, + "grad_norm": 0.99609375, + "learning_rate": 0.00041308017087473485, + "loss": 0.2078, + "step": 198380 + }, + { + "epoch": 8.22, + "grad_norm": 0.87109375, + "learning_rate": 0.00041307195068083005, + "loss": 0.173, + "step": 198390 + }, + { + "epoch": 8.22, + "grad_norm": 2.328125, + "learning_rate": 0.00041306373018004135, + "loss": 0.1522, + "step": 198400 + }, + { + "epoch": 8.22, + "grad_norm": 2.03125, + "learning_rate": 0.00041305550937238434, + "loss": 0.1781, + "step": 198410 + }, + { + "epoch": 8.22, + "grad_norm": 0.66015625, + "learning_rate": 0.00041304728825787454, + "loss": 0.1779, + "step": 198420 + }, + { + "epoch": 8.22, + "grad_norm": 0.890625, + "learning_rate": 0.0004130390668365273, + "loss": 0.2013, + "step": 198430 + }, + { + "epoch": 8.22, + "grad_norm": 0.85546875, + "learning_rate": 0.0004130308451083581, + "loss": 0.1935, + "step": 198440 + }, + { + "epoch": 8.22, + "grad_norm": 0.439453125, + "learning_rate": 0.0004130226230733825, + "loss": 0.1789, + "step": 198450 + }, + { + "epoch": 8.22, + "grad_norm": 0.8984375, + "learning_rate": 0.0004130144007316159, + "loss": 0.185, + "step": 198460 + }, + { + "epoch": 8.22, + "grad_norm": 1.171875, + "learning_rate": 0.00041300617808307384, + "loss": 0.2155, + "step": 198470 + }, + { + "epoch": 8.22, + "grad_norm": 2.546875, + "learning_rate": 0.00041299795512777176, + "loss": 0.2111, + "step": 198480 + }, + { + "epoch": 8.22, + "grad_norm": 0.65234375, + "learning_rate": 0.00041298973186572514, + "loss": 0.228, + "step": 198490 + }, + { + "epoch": 8.22, + "grad_norm": 0.5390625, + "learning_rate": 0.00041298150829694943, + "loss": 0.2085, + "step": 198500 + }, + { + "epoch": 8.22, + "grad_norm": 0.3828125, + "learning_rate": 0.0004129732844214601, + "loss": 0.198, + "step": 198510 + }, + { + "epoch": 8.22, + "grad_norm": 0.4375, + "learning_rate": 0.0004129650602392727, + "loss": 0.2399, + "step": 198520 + }, + { + "epoch": 8.22, + "grad_norm": 0.60546875, + "learning_rate": 0.00041295683575040255, + "loss": 0.1776, + "step": 198530 + }, + { + "epoch": 8.22, + "grad_norm": 0.9921875, + "learning_rate": 0.00041294861095486537, + "loss": 0.1805, + "step": 198540 + }, + { + "epoch": 8.22, + "grad_norm": 1.6953125, + "learning_rate": 0.00041294038585267635, + "loss": 0.1847, + "step": 198550 + }, + { + "epoch": 8.22, + "grad_norm": 0.7578125, + "learning_rate": 0.0004129321604438512, + "loss": 0.1674, + "step": 198560 + }, + { + "epoch": 8.22, + "grad_norm": 0.5078125, + "learning_rate": 0.0004129239347284054, + "loss": 0.1828, + "step": 198570 + }, + { + "epoch": 8.23, + "grad_norm": 0.765625, + "learning_rate": 0.0004129157087063542, + "loss": 0.2086, + "step": 198580 + }, + { + "epoch": 8.23, + "grad_norm": 0.3671875, + "learning_rate": 0.00041290748237771336, + "loss": 0.2405, + "step": 198590 + }, + { + "epoch": 8.23, + "grad_norm": 0.55859375, + "learning_rate": 0.00041289925574249813, + "loss": 0.2366, + "step": 198600 + }, + { + "epoch": 8.23, + "grad_norm": 0.21875, + "learning_rate": 0.00041289102880072415, + "loss": 0.1652, + "step": 198610 + }, + { + "epoch": 8.23, + "grad_norm": 0.8203125, + "learning_rate": 0.0004128828015524068, + "loss": 0.2126, + "step": 198620 + }, + { + "epoch": 8.23, + "grad_norm": 0.57421875, + "learning_rate": 0.00041287457399756164, + "loss": 0.1877, + "step": 198630 + }, + { + "epoch": 8.23, + "grad_norm": 0.2392578125, + "learning_rate": 0.00041286634613620413, + "loss": 0.1482, + "step": 198640 + }, + { + "epoch": 8.23, + "grad_norm": 0.609375, + "learning_rate": 0.0004128581179683497, + "loss": 0.2298, + "step": 198650 + }, + { + "epoch": 8.23, + "grad_norm": 0.294921875, + "learning_rate": 0.00041284988949401394, + "loss": 0.1587, + "step": 198660 + }, + { + "epoch": 8.23, + "grad_norm": 0.7890625, + "learning_rate": 0.0004128416607132123, + "loss": 0.1894, + "step": 198670 + }, + { + "epoch": 8.23, + "grad_norm": 0.9140625, + "learning_rate": 0.0004128334316259601, + "loss": 0.2061, + "step": 198680 + }, + { + "epoch": 8.23, + "grad_norm": 0.4375, + "learning_rate": 0.00041282520223227305, + "loss": 0.1632, + "step": 198690 + }, + { + "epoch": 8.23, + "grad_norm": 0.486328125, + "learning_rate": 0.00041281697253216655, + "loss": 0.1773, + "step": 198700 + }, + { + "epoch": 8.23, + "grad_norm": 0.203125, + "learning_rate": 0.0004128087425256561, + "loss": 0.2757, + "step": 198710 + }, + { + "epoch": 8.23, + "grad_norm": 0.61328125, + "learning_rate": 0.0004128005122127572, + "loss": 0.1916, + "step": 198720 + }, + { + "epoch": 8.23, + "grad_norm": 0.427734375, + "learning_rate": 0.0004127922815934853, + "loss": 0.2007, + "step": 198730 + }, + { + "epoch": 8.23, + "grad_norm": 0.322265625, + "learning_rate": 0.00041278405066785584, + "loss": 0.1704, + "step": 198740 + }, + { + "epoch": 8.23, + "grad_norm": 0.49609375, + "learning_rate": 0.0004127758194358844, + "loss": 0.1741, + "step": 198750 + }, + { + "epoch": 8.23, + "grad_norm": 0.349609375, + "learning_rate": 0.0004127675878975865, + "loss": 0.185, + "step": 198760 + }, + { + "epoch": 8.23, + "grad_norm": 1.3125, + "learning_rate": 0.00041275935605297753, + "loss": 0.2239, + "step": 198770 + }, + { + "epoch": 8.23, + "grad_norm": 0.703125, + "learning_rate": 0.0004127511239020731, + "loss": 0.1425, + "step": 198780 + }, + { + "epoch": 8.23, + "grad_norm": 0.6015625, + "learning_rate": 0.0004127428914448885, + "loss": 0.1827, + "step": 198790 + }, + { + "epoch": 8.23, + "grad_norm": 1.4296875, + "learning_rate": 0.0004127346586814395, + "loss": 0.2152, + "step": 198800 + }, + { + "epoch": 8.23, + "grad_norm": 0.74609375, + "learning_rate": 0.0004127264256117414, + "loss": 0.1466, + "step": 198810 + }, + { + "epoch": 8.24, + "grad_norm": 0.58203125, + "learning_rate": 0.0004127181922358097, + "loss": 0.1837, + "step": 198820 + }, + { + "epoch": 8.24, + "grad_norm": 0.93359375, + "learning_rate": 0.00041270995855366, + "loss": 0.2121, + "step": 198830 + }, + { + "epoch": 8.24, + "grad_norm": 0.5, + "learning_rate": 0.0004127017245653076, + "loss": 0.2029, + "step": 198840 + }, + { + "epoch": 8.24, + "grad_norm": 0.859375, + "learning_rate": 0.00041269349027076827, + "loss": 0.1782, + "step": 198850 + }, + { + "epoch": 8.24, + "grad_norm": 0.71875, + "learning_rate": 0.00041268525567005727, + "loss": 0.1546, + "step": 198860 + }, + { + "epoch": 8.24, + "grad_norm": 0.73828125, + "learning_rate": 0.00041267702076319023, + "loss": 0.1922, + "step": 198870 + }, + { + "epoch": 8.24, + "grad_norm": 0.9765625, + "learning_rate": 0.0004126687855501826, + "loss": 0.1955, + "step": 198880 + }, + { + "epoch": 8.24, + "grad_norm": 0.734375, + "learning_rate": 0.00041266055003104986, + "loss": 0.2182, + "step": 198890 + }, + { + "epoch": 8.24, + "grad_norm": 0.61328125, + "learning_rate": 0.00041265231420580754, + "loss": 0.213, + "step": 198900 + }, + { + "epoch": 8.24, + "grad_norm": 0.58203125, + "learning_rate": 0.00041264407807447114, + "loss": 0.1965, + "step": 198910 + }, + { + "epoch": 8.24, + "grad_norm": 0.95703125, + "learning_rate": 0.0004126358416370561, + "loss": 0.1662, + "step": 198920 + }, + { + "epoch": 8.24, + "grad_norm": 0.68359375, + "learning_rate": 0.00041262760489357804, + "loss": 0.2244, + "step": 198930 + }, + { + "epoch": 8.24, + "grad_norm": 0.48828125, + "learning_rate": 0.00041261936784405234, + "loss": 0.2062, + "step": 198940 + }, + { + "epoch": 8.24, + "grad_norm": 1.28125, + "learning_rate": 0.0004126111304884946, + "loss": 0.179, + "step": 198950 + }, + { + "epoch": 8.24, + "grad_norm": 1.90625, + "learning_rate": 0.00041260289282692023, + "loss": 0.2247, + "step": 198960 + }, + { + "epoch": 8.24, + "grad_norm": 2.078125, + "learning_rate": 0.00041259465485934477, + "loss": 0.211, + "step": 198970 + }, + { + "epoch": 8.24, + "grad_norm": 0.625, + "learning_rate": 0.00041258641658578377, + "loss": 0.2254, + "step": 198980 + }, + { + "epoch": 8.24, + "grad_norm": 1.1484375, + "learning_rate": 0.0004125781780062527, + "loss": 0.2078, + "step": 198990 + }, + { + "epoch": 8.24, + "grad_norm": 0.49609375, + "learning_rate": 0.000412569939120767, + "loss": 0.2526, + "step": 199000 + }, + { + "epoch": 8.24, + "grad_norm": 0.8125, + "learning_rate": 0.0004125616999293422, + "loss": 0.1809, + "step": 199010 + }, + { + "epoch": 8.24, + "grad_norm": 0.1611328125, + "learning_rate": 0.00041255346043199393, + "loss": 0.1997, + "step": 199020 + }, + { + "epoch": 8.24, + "grad_norm": 0.365234375, + "learning_rate": 0.00041254522062873756, + "loss": 0.2236, + "step": 199030 + }, + { + "epoch": 8.24, + "grad_norm": 0.73828125, + "learning_rate": 0.00041253698051958867, + "loss": 0.2162, + "step": 199040 + }, + { + "epoch": 8.24, + "grad_norm": 0.494140625, + "learning_rate": 0.00041252874010456266, + "loss": 0.2078, + "step": 199050 + }, + { + "epoch": 8.25, + "grad_norm": 0.765625, + "learning_rate": 0.0004125204993836752, + "loss": 0.1787, + "step": 199060 + }, + { + "epoch": 8.25, + "grad_norm": 0.267578125, + "learning_rate": 0.0004125122583569417, + "loss": 0.1949, + "step": 199070 + }, + { + "epoch": 8.25, + "grad_norm": 0.53125, + "learning_rate": 0.00041250401702437763, + "loss": 0.2412, + "step": 199080 + }, + { + "epoch": 8.25, + "grad_norm": 1.09375, + "learning_rate": 0.0004124957753859986, + "loss": 0.2375, + "step": 199090 + }, + { + "epoch": 8.25, + "grad_norm": 0.484375, + "learning_rate": 0.00041248753344181997, + "loss": 0.2136, + "step": 199100 + }, + { + "epoch": 8.25, + "grad_norm": 0.93359375, + "learning_rate": 0.00041247929119185746, + "loss": 0.2469, + "step": 199110 + }, + { + "epoch": 8.25, + "grad_norm": 0.62890625, + "learning_rate": 0.0004124710486361264, + "loss": 0.2394, + "step": 199120 + }, + { + "epoch": 8.25, + "grad_norm": 0.6328125, + "learning_rate": 0.0004124628057746424, + "loss": 0.1535, + "step": 199130 + }, + { + "epoch": 8.25, + "grad_norm": 0.73828125, + "learning_rate": 0.00041245456260742097, + "loss": 0.2225, + "step": 199140 + }, + { + "epoch": 8.25, + "grad_norm": 0.83984375, + "learning_rate": 0.0004124463191344776, + "loss": 0.2271, + "step": 199150 + }, + { + "epoch": 8.25, + "grad_norm": 1.3359375, + "learning_rate": 0.00041243807535582775, + "loss": 0.2227, + "step": 199160 + }, + { + "epoch": 8.25, + "grad_norm": 1.0859375, + "learning_rate": 0.000412429831271487, + "loss": 0.2091, + "step": 199170 + }, + { + "epoch": 8.25, + "grad_norm": 0.4609375, + "learning_rate": 0.0004124215868814708, + "loss": 0.2405, + "step": 199180 + }, + { + "epoch": 8.25, + "grad_norm": 0.8046875, + "learning_rate": 0.0004124133421857949, + "loss": 0.17, + "step": 199190 + }, + { + "epoch": 8.25, + "grad_norm": 1.21875, + "learning_rate": 0.0004124050971844744, + "loss": 0.1808, + "step": 199200 + }, + { + "epoch": 8.25, + "grad_norm": 0.8046875, + "learning_rate": 0.0004123968518775251, + "loss": 0.1733, + "step": 199210 + }, + { + "epoch": 8.25, + "grad_norm": 1.0234375, + "learning_rate": 0.00041238860626496257, + "loss": 0.168, + "step": 199220 + }, + { + "epoch": 8.25, + "grad_norm": 0.98828125, + "learning_rate": 0.00041238036034680215, + "loss": 0.1868, + "step": 199230 + }, + { + "epoch": 8.25, + "grad_norm": 0.73828125, + "learning_rate": 0.0004123721141230594, + "loss": 0.198, + "step": 199240 + }, + { + "epoch": 8.25, + "grad_norm": 0.6640625, + "learning_rate": 0.0004123638675937499, + "loss": 0.2257, + "step": 199250 + }, + { + "epoch": 8.25, + "grad_norm": 0.53125, + "learning_rate": 0.0004123556207588891, + "loss": 0.2154, + "step": 199260 + }, + { + "epoch": 8.25, + "grad_norm": 1.046875, + "learning_rate": 0.0004123473736184926, + "loss": 0.2079, + "step": 199270 + }, + { + "epoch": 8.25, + "grad_norm": 1.078125, + "learning_rate": 0.0004123391261725759, + "loss": 0.1716, + "step": 199280 + }, + { + "epoch": 8.25, + "grad_norm": 1.3125, + "learning_rate": 0.0004123308784211544, + "loss": 0.2484, + "step": 199290 + }, + { + "epoch": 8.25, + "grad_norm": 1.65625, + "learning_rate": 0.0004123226303642438, + "loss": 0.1817, + "step": 199300 + }, + { + "epoch": 8.26, + "grad_norm": 0.51171875, + "learning_rate": 0.0004123143820018595, + "loss": 0.2284, + "step": 199310 + }, + { + "epoch": 8.26, + "grad_norm": 0.578125, + "learning_rate": 0.00041230613333401715, + "loss": 0.176, + "step": 199320 + }, + { + "epoch": 8.26, + "grad_norm": 0.671875, + "learning_rate": 0.00041229788436073205, + "loss": 0.1473, + "step": 199330 + }, + { + "epoch": 8.26, + "grad_norm": 1.0859375, + "learning_rate": 0.0004122896350820199, + "loss": 0.2062, + "step": 199340 + }, + { + "epoch": 8.26, + "grad_norm": 0.671875, + "learning_rate": 0.0004122813854978962, + "loss": 0.2039, + "step": 199350 + }, + { + "epoch": 8.26, + "grad_norm": 0.53515625, + "learning_rate": 0.00041227313560837647, + "loss": 0.1959, + "step": 199360 + }, + { + "epoch": 8.26, + "grad_norm": 1.078125, + "learning_rate": 0.0004122648854134762, + "loss": 0.1874, + "step": 199370 + }, + { + "epoch": 8.26, + "grad_norm": 0.70703125, + "learning_rate": 0.000412256634913211, + "loss": 0.1976, + "step": 199380 + }, + { + "epoch": 8.26, + "grad_norm": 1.1328125, + "learning_rate": 0.00041224838410759625, + "loss": 0.2159, + "step": 199390 + }, + { + "epoch": 8.26, + "grad_norm": 0.62890625, + "learning_rate": 0.0004122401329966476, + "loss": 0.2167, + "step": 199400 + }, + { + "epoch": 8.26, + "grad_norm": 0.2099609375, + "learning_rate": 0.0004122318815803805, + "loss": 0.1892, + "step": 199410 + }, + { + "epoch": 8.26, + "grad_norm": 1.4921875, + "learning_rate": 0.0004122236298588106, + "loss": 0.1837, + "step": 199420 + }, + { + "epoch": 8.26, + "grad_norm": 0.416015625, + "learning_rate": 0.0004122153778319533, + "loss": 0.2339, + "step": 199430 + }, + { + "epoch": 8.26, + "grad_norm": 1.015625, + "learning_rate": 0.0004122071254998241, + "loss": 0.1512, + "step": 199440 + }, + { + "epoch": 8.26, + "grad_norm": 1.7578125, + "learning_rate": 0.00041219887286243875, + "loss": 0.1794, + "step": 199450 + }, + { + "epoch": 8.26, + "grad_norm": 0.98046875, + "learning_rate": 0.00041219061991981253, + "loss": 0.2144, + "step": 199460 + }, + { + "epoch": 8.26, + "grad_norm": 0.80859375, + "learning_rate": 0.00041218236667196116, + "loss": 0.1907, + "step": 199470 + }, + { + "epoch": 8.26, + "grad_norm": 0.453125, + "learning_rate": 0.00041217411311890006, + "loss": 0.1151, + "step": 199480 + }, + { + "epoch": 8.26, + "grad_norm": 0.77734375, + "learning_rate": 0.00041216585926064477, + "loss": 0.1778, + "step": 199490 + }, + { + "epoch": 8.26, + "grad_norm": 0.341796875, + "learning_rate": 0.00041215760509721085, + "loss": 0.2362, + "step": 199500 + }, + { + "epoch": 8.26, + "grad_norm": 0.87109375, + "learning_rate": 0.00041214935062861387, + "loss": 0.2303, + "step": 199510 + }, + { + "epoch": 8.26, + "grad_norm": 0.453125, + "learning_rate": 0.00041214109585486927, + "loss": 0.1397, + "step": 199520 + }, + { + "epoch": 8.26, + "grad_norm": 2.390625, + "learning_rate": 0.00041213284077599267, + "loss": 0.2019, + "step": 199530 + }, + { + "epoch": 8.26, + "grad_norm": 1.6015625, + "learning_rate": 0.00041212458539199956, + "loss": 0.2074, + "step": 199540 + }, + { + "epoch": 8.27, + "grad_norm": 0.466796875, + "learning_rate": 0.00041211632970290546, + "loss": 0.1586, + "step": 199550 + }, + { + "epoch": 8.27, + "grad_norm": 0.89453125, + "learning_rate": 0.0004121080737087259, + "loss": 0.2459, + "step": 199560 + }, + { + "epoch": 8.27, + "grad_norm": 1.171875, + "learning_rate": 0.00041209981740947656, + "loss": 0.1822, + "step": 199570 + }, + { + "epoch": 8.27, + "grad_norm": 0.8359375, + "learning_rate": 0.00041209156080517287, + "loss": 0.213, + "step": 199580 + }, + { + "epoch": 8.27, + "grad_norm": 0.890625, + "learning_rate": 0.00041208330389583026, + "loss": 0.1909, + "step": 199590 + }, + { + "epoch": 8.27, + "grad_norm": 0.69921875, + "learning_rate": 0.00041207504668146444, + "loss": 0.1896, + "step": 199600 + }, + { + "epoch": 8.27, + "grad_norm": 0.984375, + "learning_rate": 0.00041206678916209087, + "loss": 0.2122, + "step": 199610 + }, + { + "epoch": 8.27, + "grad_norm": 0.75, + "learning_rate": 0.00041205853133772515, + "loss": 0.2306, + "step": 199620 + }, + { + "epoch": 8.27, + "grad_norm": 1.0703125, + "learning_rate": 0.00041205027320838274, + "loss": 0.212, + "step": 199630 + }, + { + "epoch": 8.27, + "grad_norm": 0.3984375, + "learning_rate": 0.0004120420147740792, + "loss": 0.1722, + "step": 199640 + }, + { + "epoch": 8.27, + "grad_norm": 0.33203125, + "learning_rate": 0.0004120337560348301, + "loss": 0.2193, + "step": 199650 + }, + { + "epoch": 8.27, + "grad_norm": 1.15625, + "learning_rate": 0.00041202549699065094, + "loss": 0.2271, + "step": 199660 + }, + { + "epoch": 8.27, + "grad_norm": 1.2109375, + "learning_rate": 0.0004120172376415574, + "loss": 0.2523, + "step": 199670 + }, + { + "epoch": 8.27, + "grad_norm": 0.53515625, + "learning_rate": 0.00041200897798756474, + "loss": 0.193, + "step": 199680 + }, + { + "epoch": 8.27, + "grad_norm": 0.53515625, + "learning_rate": 0.00041200071802868885, + "loss": 0.2045, + "step": 199690 + }, + { + "epoch": 8.27, + "grad_norm": 1.3671875, + "learning_rate": 0.00041199245776494506, + "loss": 0.2101, + "step": 199700 + }, + { + "epoch": 8.27, + "grad_norm": 0.5234375, + "learning_rate": 0.0004119841971963489, + "loss": 0.1997, + "step": 199710 + }, + { + "epoch": 8.27, + "grad_norm": 0.8671875, + "learning_rate": 0.000411975936322916, + "loss": 0.1833, + "step": 199720 + }, + { + "epoch": 8.27, + "grad_norm": 0.62890625, + "learning_rate": 0.0004119676751446618, + "loss": 0.2227, + "step": 199730 + }, + { + "epoch": 8.27, + "grad_norm": 0.3671875, + "learning_rate": 0.00041195941366160203, + "loss": 0.1794, + "step": 199740 + }, + { + "epoch": 8.27, + "grad_norm": 0.69921875, + "learning_rate": 0.0004119511518737522, + "loss": 0.1724, + "step": 199750 + }, + { + "epoch": 8.27, + "grad_norm": 0.734375, + "learning_rate": 0.0004119428897811276, + "loss": 0.2469, + "step": 199760 + }, + { + "epoch": 8.27, + "grad_norm": 1.4375, + "learning_rate": 0.0004119346273837441, + "loss": 0.181, + "step": 199770 + }, + { + "epoch": 8.27, + "grad_norm": 1.078125, + "learning_rate": 0.00041192636468161717, + "loss": 0.1815, + "step": 199780 + }, + { + "epoch": 8.28, + "grad_norm": 1.015625, + "learning_rate": 0.0004119181016747622, + "loss": 0.1672, + "step": 199790 + }, + { + "epoch": 8.28, + "grad_norm": 0.75390625, + "learning_rate": 0.00041190983836319485, + "loss": 0.2223, + "step": 199800 + }, + { + "epoch": 8.28, + "grad_norm": 0.88671875, + "learning_rate": 0.0004119015747469307, + "loss": 0.2215, + "step": 199810 + }, + { + "epoch": 8.28, + "grad_norm": 0.80078125, + "learning_rate": 0.0004118933108259853, + "loss": 0.2045, + "step": 199820 + }, + { + "epoch": 8.28, + "grad_norm": 0.890625, + "learning_rate": 0.0004118850466003742, + "loss": 0.1782, + "step": 199830 + }, + { + "epoch": 8.28, + "grad_norm": 0.64453125, + "learning_rate": 0.0004118767820701128, + "loss": 0.2112, + "step": 199840 + }, + { + "epoch": 8.28, + "grad_norm": 0.76953125, + "learning_rate": 0.0004118685172352168, + "loss": 0.2417, + "step": 199850 + }, + { + "epoch": 8.28, + "grad_norm": 0.6328125, + "learning_rate": 0.0004118602520957018, + "loss": 0.1927, + "step": 199860 + }, + { + "epoch": 8.28, + "grad_norm": 0.490234375, + "learning_rate": 0.00041185198665158327, + "loss": 0.2159, + "step": 199870 + }, + { + "epoch": 8.28, + "grad_norm": 0.52734375, + "learning_rate": 0.00041184372090287674, + "loss": 0.1884, + "step": 199880 + }, + { + "epoch": 8.28, + "grad_norm": 1.0703125, + "learning_rate": 0.0004118354548495978, + "loss": 0.178, + "step": 199890 + }, + { + "epoch": 8.28, + "grad_norm": 0.640625, + "learning_rate": 0.0004118271884917621, + "loss": 0.1902, + "step": 199900 + }, + { + "epoch": 8.28, + "grad_norm": 1.6171875, + "learning_rate": 0.00041181892182938506, + "loss": 0.2138, + "step": 199910 + }, + { + "epoch": 8.28, + "grad_norm": 0.66796875, + "learning_rate": 0.00041181065486248225, + "loss": 0.2117, + "step": 199920 + }, + { + "epoch": 8.28, + "grad_norm": 0.59765625, + "learning_rate": 0.0004118023875910692, + "loss": 0.2277, + "step": 199930 + }, + { + "epoch": 8.28, + "grad_norm": 0.59765625, + "learning_rate": 0.0004117941200151617, + "loss": 0.2303, + "step": 199940 + }, + { + "epoch": 8.28, + "grad_norm": 0.380859375, + "learning_rate": 0.00041178585213477503, + "loss": 0.209, + "step": 199950 + }, + { + "epoch": 8.28, + "grad_norm": 1.4921875, + "learning_rate": 0.0004117775839499249, + "loss": 0.1865, + "step": 199960 + }, + { + "epoch": 8.28, + "grad_norm": 1.15625, + "learning_rate": 0.0004117693154606268, + "loss": 0.169, + "step": 199970 + }, + { + "epoch": 8.28, + "grad_norm": 0.494140625, + "learning_rate": 0.00041176104666689633, + "loss": 0.1928, + "step": 199980 + }, + { + "epoch": 8.28, + "grad_norm": 1.03125, + "learning_rate": 0.000411752777568749, + "loss": 0.2174, + "step": 199990 + }, + { + "epoch": 8.28, + "grad_norm": 1.0078125, + "learning_rate": 0.00041174450816620044, + "loss": 0.2326, + "step": 200000 + }, + { + "epoch": 8.28, + "eval_runtime": 2823.9495, + "eval_samples_per_second": 34.196, + "eval_steps_per_second": 8.549, + "step": 200000 + }, + { + "epoch": 8.28, + "grad_norm": 1.2421875, + "learning_rate": 0.0004117362384592662, + "loss": 0.2248, + "step": 200010 + }, + { + "epoch": 8.28, + "grad_norm": 1.046875, + "learning_rate": 0.0004117279684479618, + "loss": 0.2123, + "step": 200020 + }, + { + "epoch": 8.29, + "grad_norm": 0.59375, + "learning_rate": 0.0004117196981323029, + "loss": 0.1849, + "step": 200030 + }, + { + "epoch": 8.29, + "grad_norm": 0.66796875, + "learning_rate": 0.00041171142751230495, + "loss": 0.1814, + "step": 200040 + }, + { + "epoch": 8.29, + "grad_norm": 0.60546875, + "learning_rate": 0.00041170315658798354, + "loss": 0.1958, + "step": 200050 + }, + { + "epoch": 8.29, + "grad_norm": 1.015625, + "learning_rate": 0.0004116948853593542, + "loss": 0.2149, + "step": 200060 + }, + { + "epoch": 8.29, + "grad_norm": 0.6171875, + "learning_rate": 0.0004116866138264326, + "loss": 0.1799, + "step": 200070 + }, + { + "epoch": 8.29, + "grad_norm": 0.5625, + "learning_rate": 0.00041167834198923425, + "loss": 0.1772, + "step": 200080 + }, + { + "epoch": 8.29, + "grad_norm": 0.65625, + "learning_rate": 0.0004116700698477748, + "loss": 0.2188, + "step": 200090 + }, + { + "epoch": 8.29, + "grad_norm": 0.71484375, + "learning_rate": 0.00041166179740206963, + "loss": 0.1964, + "step": 200100 + }, + { + "epoch": 8.29, + "grad_norm": 0.2578125, + "learning_rate": 0.00041165352465213445, + "loss": 0.1631, + "step": 200110 + }, + { + "epoch": 8.29, + "grad_norm": 0.30078125, + "learning_rate": 0.0004116452515979848, + "loss": 0.192, + "step": 200120 + }, + { + "epoch": 8.29, + "grad_norm": 1.1953125, + "learning_rate": 0.0004116369782396362, + "loss": 0.2323, + "step": 200130 + }, + { + "epoch": 8.29, + "grad_norm": 0.89453125, + "learning_rate": 0.00041162870457710427, + "loss": 0.2209, + "step": 200140 + }, + { + "epoch": 8.29, + "grad_norm": 1.0625, + "learning_rate": 0.0004116204306104046, + "loss": 0.1911, + "step": 200150 + }, + { + "epoch": 8.29, + "grad_norm": 0.578125, + "learning_rate": 0.00041161215633955274, + "loss": 0.2114, + "step": 200160 + }, + { + "epoch": 8.29, + "grad_norm": 0.546875, + "learning_rate": 0.0004116038817645643, + "loss": 0.208, + "step": 200170 + }, + { + "epoch": 8.29, + "grad_norm": 0.9453125, + "learning_rate": 0.0004115956068854547, + "loss": 0.1927, + "step": 200180 + }, + { + "epoch": 8.29, + "grad_norm": 0.42578125, + "learning_rate": 0.00041158733170223967, + "loss": 0.1788, + "step": 200190 + }, + { + "epoch": 8.29, + "grad_norm": 0.52734375, + "learning_rate": 0.0004115790562149347, + "loss": 0.2159, + "step": 200200 + }, + { + "epoch": 8.29, + "grad_norm": 0.375, + "learning_rate": 0.00041157078042355547, + "loss": 0.1863, + "step": 200210 + }, + { + "epoch": 8.29, + "grad_norm": 0.84375, + "learning_rate": 0.0004115625043281174, + "loss": 0.1751, + "step": 200220 + }, + { + "epoch": 8.29, + "grad_norm": 0.953125, + "learning_rate": 0.00041155422792863615, + "loss": 0.1691, + "step": 200230 + }, + { + "epoch": 8.29, + "grad_norm": 1.0078125, + "learning_rate": 0.0004115459512251274, + "loss": 0.2071, + "step": 200240 + }, + { + "epoch": 8.29, + "grad_norm": 0.388671875, + "learning_rate": 0.00041153767421760645, + "loss": 0.2037, + "step": 200250 + }, + { + "epoch": 8.29, + "grad_norm": 0.294921875, + "learning_rate": 0.00041152939690608915, + "loss": 0.2489, + "step": 200260 + }, + { + "epoch": 8.3, + "grad_norm": 0.90234375, + "learning_rate": 0.000411521119290591, + "loss": 0.2125, + "step": 200270 + }, + { + "epoch": 8.3, + "grad_norm": 0.921875, + "learning_rate": 0.00041151284137112745, + "loss": 0.18, + "step": 200280 + }, + { + "epoch": 8.3, + "grad_norm": 0.5234375, + "learning_rate": 0.0004115045631477142, + "loss": 0.1459, + "step": 200290 + }, + { + "epoch": 8.3, + "grad_norm": 1.0390625, + "learning_rate": 0.0004114962846203668, + "loss": 0.2174, + "step": 200300 + }, + { + "epoch": 8.3, + "grad_norm": 0.2001953125, + "learning_rate": 0.00041148800578910086, + "loss": 0.1665, + "step": 200310 + }, + { + "epoch": 8.3, + "grad_norm": 0.375, + "learning_rate": 0.000411479726653932, + "loss": 0.1825, + "step": 200320 + }, + { + "epoch": 8.3, + "grad_norm": 1.1640625, + "learning_rate": 0.0004114714472148756, + "loss": 0.1434, + "step": 200330 + }, + { + "epoch": 8.3, + "grad_norm": 0.51171875, + "learning_rate": 0.00041146316747194743, + "loss": 0.2116, + "step": 200340 + }, + { + "epoch": 8.3, + "grad_norm": 0.4375, + "learning_rate": 0.00041145488742516304, + "loss": 0.1945, + "step": 200350 + }, + { + "epoch": 8.3, + "grad_norm": 1.0390625, + "learning_rate": 0.000411446607074538, + "loss": 0.1842, + "step": 200360 + }, + { + "epoch": 8.3, + "grad_norm": 1.109375, + "learning_rate": 0.00041143832642008776, + "loss": 0.1961, + "step": 200370 + }, + { + "epoch": 8.3, + "grad_norm": 1.84375, + "learning_rate": 0.00041143004546182816, + "loss": 0.1792, + "step": 200380 + }, + { + "epoch": 8.3, + "grad_norm": 0.890625, + "learning_rate": 0.00041142176419977456, + "loss": 0.1692, + "step": 200390 + }, + { + "epoch": 8.3, + "grad_norm": 0.578125, + "learning_rate": 0.0004114134826339427, + "loss": 0.2225, + "step": 200400 + }, + { + "epoch": 8.3, + "grad_norm": 0.703125, + "learning_rate": 0.000411405200764348, + "loss": 0.1816, + "step": 200410 + }, + { + "epoch": 8.3, + "grad_norm": 1.0703125, + "learning_rate": 0.00041139691859100623, + "loss": 0.2137, + "step": 200420 + }, + { + "epoch": 8.3, + "grad_norm": 0.77734375, + "learning_rate": 0.00041138863611393285, + "loss": 0.1942, + "step": 200430 + }, + { + "epoch": 8.3, + "grad_norm": 1.5, + "learning_rate": 0.0004113803533331435, + "loss": 0.2117, + "step": 200440 + }, + { + "epoch": 8.3, + "grad_norm": 0.0, + "learning_rate": 0.00041137207024865375, + "loss": 0.1776, + "step": 200450 + }, + { + "epoch": 8.3, + "grad_norm": 1.796875, + "learning_rate": 0.0004113637868604792, + "loss": 0.1772, + "step": 200460 + }, + { + "epoch": 8.3, + "grad_norm": 0.59375, + "learning_rate": 0.00041135550316863547, + "loss": 0.2087, + "step": 200470 + }, + { + "epoch": 8.3, + "grad_norm": 0.62890625, + "learning_rate": 0.000411347219173138, + "loss": 0.179, + "step": 200480 + }, + { + "epoch": 8.3, + "grad_norm": 0.74609375, + "learning_rate": 0.0004113389348740025, + "loss": 0.2193, + "step": 200490 + }, + { + "epoch": 8.3, + "grad_norm": 0.392578125, + "learning_rate": 0.0004113306502712447, + "loss": 0.2371, + "step": 200500 + }, + { + "epoch": 8.31, + "grad_norm": 0.99609375, + "learning_rate": 0.0004113223653648799, + "loss": 0.18, + "step": 200510 + }, + { + "epoch": 8.31, + "grad_norm": 0.65234375, + "learning_rate": 0.00041131408015492385, + "loss": 0.1765, + "step": 200520 + }, + { + "epoch": 8.31, + "grad_norm": 0.8203125, + "learning_rate": 0.0004113057946413921, + "loss": 0.2339, + "step": 200530 + }, + { + "epoch": 8.31, + "grad_norm": 0.6328125, + "learning_rate": 0.0004112975088243003, + "loss": 0.1883, + "step": 200540 + }, + { + "epoch": 8.31, + "grad_norm": 0.953125, + "learning_rate": 0.000411289222703664, + "loss": 0.232, + "step": 200550 + }, + { + "epoch": 8.31, + "grad_norm": 0.58203125, + "learning_rate": 0.0004112809362794988, + "loss": 0.2102, + "step": 200560 + }, + { + "epoch": 8.31, + "grad_norm": 0.5078125, + "learning_rate": 0.00041127264955182033, + "loss": 0.1833, + "step": 200570 + }, + { + "epoch": 8.31, + "grad_norm": 0.55078125, + "learning_rate": 0.00041126436252064414, + "loss": 0.2298, + "step": 200580 + }, + { + "epoch": 8.31, + "grad_norm": 0.359375, + "learning_rate": 0.00041125607518598584, + "loss": 0.1817, + "step": 200590 + }, + { + "epoch": 8.31, + "grad_norm": 0.44140625, + "learning_rate": 0.00041124778754786097, + "loss": 0.198, + "step": 200600 + }, + { + "epoch": 8.31, + "grad_norm": 0.55078125, + "learning_rate": 0.00041123949960628515, + "loss": 0.2107, + "step": 200610 + }, + { + "epoch": 8.31, + "grad_norm": 0.62890625, + "learning_rate": 0.00041123121136127404, + "loss": 0.2169, + "step": 200620 + }, + { + "epoch": 8.31, + "grad_norm": 0.58984375, + "learning_rate": 0.0004112229228128432, + "loss": 0.2149, + "step": 200630 + }, + { + "epoch": 8.31, + "grad_norm": 0.69921875, + "learning_rate": 0.00041121463396100833, + "loss": 0.205, + "step": 200640 + }, + { + "epoch": 8.31, + "grad_norm": 1.390625, + "learning_rate": 0.0004112063448057848, + "loss": 0.161, + "step": 200650 + }, + { + "epoch": 8.31, + "grad_norm": 1.125, + "learning_rate": 0.0004111980553471884, + "loss": 0.221, + "step": 200660 + }, + { + "epoch": 8.31, + "grad_norm": 1.015625, + "learning_rate": 0.0004111897655852347, + "loss": 0.2419, + "step": 200670 + }, + { + "epoch": 8.31, + "grad_norm": 0.83984375, + "learning_rate": 0.00041118147551993917, + "loss": 0.2062, + "step": 200680 + }, + { + "epoch": 8.31, + "grad_norm": 0.98828125, + "learning_rate": 0.0004111731851513176, + "loss": 0.2336, + "step": 200690 + }, + { + "epoch": 8.31, + "grad_norm": 0.87109375, + "learning_rate": 0.0004111648944793854, + "loss": 0.2183, + "step": 200700 + }, + { + "epoch": 8.31, + "grad_norm": 0.8828125, + "learning_rate": 0.00041115660350415836, + "loss": 0.1933, + "step": 200710 + }, + { + "epoch": 8.31, + "grad_norm": 0.88671875, + "learning_rate": 0.000411148312225652, + "loss": 0.1838, + "step": 200720 + }, + { + "epoch": 8.31, + "grad_norm": 0.6015625, + "learning_rate": 0.0004111400206438818, + "loss": 0.1783, + "step": 200730 + }, + { + "epoch": 8.31, + "grad_norm": 1.1171875, + "learning_rate": 0.0004111317287588636, + "loss": 0.1483, + "step": 200740 + }, + { + "epoch": 8.32, + "grad_norm": 1.515625, + "learning_rate": 0.00041112343657061283, + "loss": 0.2428, + "step": 200750 + }, + { + "epoch": 8.32, + "grad_norm": 0.431640625, + "learning_rate": 0.0004111151440791452, + "loss": 0.2189, + "step": 200760 + }, + { + "epoch": 8.32, + "grad_norm": 1.3359375, + "learning_rate": 0.0004111068512844763, + "loss": 0.1983, + "step": 200770 + }, + { + "epoch": 8.32, + "grad_norm": 0.71875, + "learning_rate": 0.00041109855818662164, + "loss": 0.2128, + "step": 200780 + }, + { + "epoch": 8.32, + "grad_norm": 1.0546875, + "learning_rate": 0.00041109026478559685, + "loss": 0.2075, + "step": 200790 + }, + { + "epoch": 8.32, + "grad_norm": 0.69921875, + "learning_rate": 0.00041108197108141763, + "loss": 0.1907, + "step": 200800 + }, + { + "epoch": 8.32, + "grad_norm": 0.341796875, + "learning_rate": 0.00041107367707409956, + "loss": 0.254, + "step": 200810 + }, + { + "epoch": 8.32, + "grad_norm": 1.0546875, + "learning_rate": 0.00041106538276365823, + "loss": 0.1646, + "step": 200820 + }, + { + "epoch": 8.32, + "grad_norm": 0.8359375, + "learning_rate": 0.00041105708815010917, + "loss": 0.1818, + "step": 200830 + }, + { + "epoch": 8.32, + "grad_norm": 0.62109375, + "learning_rate": 0.0004110487932334681, + "loss": 0.2606, + "step": 200840 + }, + { + "epoch": 8.32, + "grad_norm": 0.56640625, + "learning_rate": 0.0004110404980137507, + "loss": 0.2093, + "step": 200850 + }, + { + "epoch": 8.32, + "grad_norm": 0.55859375, + "learning_rate": 0.00041103220249097226, + "loss": 0.1738, + "step": 200860 + }, + { + "epoch": 8.32, + "grad_norm": 0.5546875, + "learning_rate": 0.0004110239066651488, + "loss": 0.2086, + "step": 200870 + }, + { + "epoch": 8.32, + "grad_norm": 1.15625, + "learning_rate": 0.0004110156105362956, + "loss": 0.197, + "step": 200880 + }, + { + "epoch": 8.32, + "grad_norm": 1.015625, + "learning_rate": 0.0004110073141044284, + "loss": 0.1854, + "step": 200890 + }, + { + "epoch": 8.32, + "grad_norm": 0.828125, + "learning_rate": 0.000410999017369563, + "loss": 0.2029, + "step": 200900 + }, + { + "epoch": 8.32, + "grad_norm": 0.48046875, + "learning_rate": 0.00041099072033171465, + "loss": 0.1389, + "step": 200910 + }, + { + "epoch": 8.32, + "grad_norm": 0.9140625, + "learning_rate": 0.0004109824229908992, + "loss": 0.2365, + "step": 200920 + }, + { + "epoch": 8.32, + "grad_norm": 0.6953125, + "learning_rate": 0.0004109741253471323, + "loss": 0.2112, + "step": 200930 + }, + { + "epoch": 8.32, + "grad_norm": 0.62109375, + "learning_rate": 0.0004109658274004294, + "loss": 0.204, + "step": 200940 + }, + { + "epoch": 8.32, + "grad_norm": 0.875, + "learning_rate": 0.0004109575291508062, + "loss": 0.1526, + "step": 200950 + }, + { + "epoch": 8.32, + "grad_norm": 0.5703125, + "learning_rate": 0.0004109492305982783, + "loss": 0.1949, + "step": 200960 + }, + { + "epoch": 8.32, + "grad_norm": 1.953125, + "learning_rate": 0.00041094093174286135, + "loss": 0.1599, + "step": 200970 + }, + { + "epoch": 8.32, + "grad_norm": 0.7421875, + "learning_rate": 0.0004109326325845709, + "loss": 0.2214, + "step": 200980 + }, + { + "epoch": 8.32, + "grad_norm": 0.6484375, + "learning_rate": 0.0004109243331234227, + "loss": 0.1737, + "step": 200990 + }, + { + "epoch": 8.33, + "grad_norm": 0.4140625, + "learning_rate": 0.0004109160333594323, + "loss": 0.1984, + "step": 201000 + }, + { + "epoch": 8.33, + "grad_norm": 1.53125, + "learning_rate": 0.00041090773329261523, + "loss": 0.2379, + "step": 201010 + }, + { + "epoch": 8.33, + "grad_norm": 0.703125, + "learning_rate": 0.00041089943292298717, + "loss": 0.1832, + "step": 201020 + }, + { + "epoch": 8.33, + "grad_norm": 0.55078125, + "learning_rate": 0.0004108911322505638, + "loss": 0.1462, + "step": 201030 + }, + { + "epoch": 8.33, + "grad_norm": 0.92578125, + "learning_rate": 0.0004108828312753606, + "loss": 0.1905, + "step": 201040 + }, + { + "epoch": 8.33, + "grad_norm": 0.49609375, + "learning_rate": 0.0004108745299973934, + "loss": 0.218, + "step": 201050 + }, + { + "epoch": 8.33, + "grad_norm": 0.57421875, + "learning_rate": 0.0004108662284166777, + "loss": 0.1459, + "step": 201060 + }, + { + "epoch": 8.33, + "grad_norm": 6.375, + "learning_rate": 0.0004108579265332291, + "loss": 0.2154, + "step": 201070 + }, + { + "epoch": 8.33, + "grad_norm": 1.609375, + "learning_rate": 0.0004108496243470632, + "loss": 0.1896, + "step": 201080 + }, + { + "epoch": 8.33, + "grad_norm": 0.61328125, + "learning_rate": 0.0004108413218581958, + "loss": 0.1971, + "step": 201090 + }, + { + "epoch": 8.33, + "grad_norm": 0.6015625, + "learning_rate": 0.00041083301906664235, + "loss": 0.1903, + "step": 201100 + }, + { + "epoch": 8.33, + "grad_norm": 0.80859375, + "learning_rate": 0.0004108247159724185, + "loss": 0.1844, + "step": 201110 + }, + { + "epoch": 8.33, + "grad_norm": 0.8203125, + "learning_rate": 0.00041081641257553986, + "loss": 0.1493, + "step": 201120 + }, + { + "epoch": 8.33, + "grad_norm": 1.2421875, + "learning_rate": 0.00041080810887602216, + "loss": 0.2177, + "step": 201130 + }, + { + "epoch": 8.33, + "grad_norm": 0.5703125, + "learning_rate": 0.00041079980487388097, + "loss": 0.2208, + "step": 201140 + }, + { + "epoch": 8.33, + "grad_norm": 0.58203125, + "learning_rate": 0.0004107915005691319, + "loss": 0.201, + "step": 201150 + }, + { + "epoch": 8.33, + "grad_norm": 1.0859375, + "learning_rate": 0.0004107831959617906, + "loss": 0.2118, + "step": 201160 + }, + { + "epoch": 8.33, + "grad_norm": 1.28125, + "learning_rate": 0.0004107748910518727, + "loss": 0.2059, + "step": 201170 + }, + { + "epoch": 8.33, + "grad_norm": 0.55859375, + "learning_rate": 0.0004107665858393937, + "loss": 0.2337, + "step": 201180 + }, + { + "epoch": 8.33, + "grad_norm": 1.03125, + "learning_rate": 0.0004107582803243695, + "loss": 0.267, + "step": 201190 + }, + { + "epoch": 8.33, + "grad_norm": 0.06298828125, + "learning_rate": 0.0004107499745068155, + "loss": 0.1614, + "step": 201200 + }, + { + "epoch": 8.33, + "grad_norm": 1.078125, + "learning_rate": 0.00041074166838674743, + "loss": 0.1992, + "step": 201210 + }, + { + "epoch": 8.33, + "grad_norm": 1.0390625, + "learning_rate": 0.0004107333619641809, + "loss": 0.2226, + "step": 201220 + }, + { + "epoch": 8.33, + "grad_norm": 0.302734375, + "learning_rate": 0.0004107250552391315, + "loss": 0.1834, + "step": 201230 + }, + { + "epoch": 8.34, + "grad_norm": 0.69921875, + "learning_rate": 0.000410716748211615, + "loss": 0.227, + "step": 201240 + }, + { + "epoch": 8.34, + "grad_norm": 1.1953125, + "learning_rate": 0.0004107084408816468, + "loss": 0.2018, + "step": 201250 + }, + { + "epoch": 8.34, + "grad_norm": 1.0, + "learning_rate": 0.00041070013324924276, + "loss": 0.2087, + "step": 201260 + }, + { + "epoch": 8.34, + "grad_norm": 1.140625, + "learning_rate": 0.0004106918253144184, + "loss": 0.2347, + "step": 201270 + }, + { + "epoch": 8.34, + "grad_norm": 0.734375, + "learning_rate": 0.00041068351707718935, + "loss": 0.2243, + "step": 201280 + }, + { + "epoch": 8.34, + "grad_norm": 1.0546875, + "learning_rate": 0.0004106752085375713, + "loss": 0.2044, + "step": 201290 + }, + { + "epoch": 8.34, + "grad_norm": 1.390625, + "learning_rate": 0.00041066689969557993, + "loss": 0.1994, + "step": 201300 + }, + { + "epoch": 8.34, + "grad_norm": 0.7734375, + "learning_rate": 0.00041065859055123074, + "loss": 0.193, + "step": 201310 + }, + { + "epoch": 8.34, + "grad_norm": 0.59375, + "learning_rate": 0.00041065028110453937, + "loss": 0.16, + "step": 201320 + }, + { + "epoch": 8.34, + "grad_norm": 0.9296875, + "learning_rate": 0.0004106419713555216, + "loss": 0.1794, + "step": 201330 + }, + { + "epoch": 8.34, + "grad_norm": 0.72265625, + "learning_rate": 0.000410633661304193, + "loss": 0.2002, + "step": 201340 + }, + { + "epoch": 8.34, + "grad_norm": 0.609375, + "learning_rate": 0.0004106253509505692, + "loss": 0.1816, + "step": 201350 + }, + { + "epoch": 8.34, + "grad_norm": 0.8046875, + "learning_rate": 0.0004106170402946657, + "loss": 0.198, + "step": 201360 + }, + { + "epoch": 8.34, + "grad_norm": 1.8203125, + "learning_rate": 0.0004106087293364984, + "loss": 0.2056, + "step": 201370 + }, + { + "epoch": 8.34, + "grad_norm": 0.828125, + "learning_rate": 0.0004106004180760828, + "loss": 0.2227, + "step": 201380 + }, + { + "epoch": 8.34, + "grad_norm": 0.92578125, + "learning_rate": 0.0004105921065134346, + "loss": 0.1828, + "step": 201390 + }, + { + "epoch": 8.34, + "grad_norm": 1.5703125, + "learning_rate": 0.00041058379464856934, + "loss": 0.2158, + "step": 201400 + }, + { + "epoch": 8.34, + "grad_norm": 1.125, + "learning_rate": 0.00041057548248150266, + "loss": 0.229, + "step": 201410 + }, + { + "epoch": 8.34, + "grad_norm": 1.3828125, + "learning_rate": 0.0004105671700122504, + "loss": 0.1901, + "step": 201420 + }, + { + "epoch": 8.34, + "grad_norm": 0.443359375, + "learning_rate": 0.00041055885724082796, + "loss": 0.1585, + "step": 201430 + }, + { + "epoch": 8.34, + "grad_norm": 0.92578125, + "learning_rate": 0.0004105505441672511, + "loss": 0.2136, + "step": 201440 + }, + { + "epoch": 8.34, + "grad_norm": 1.0546875, + "learning_rate": 0.00041054223079153553, + "loss": 0.2395, + "step": 201450 + }, + { + "epoch": 8.34, + "grad_norm": 0.88671875, + "learning_rate": 0.0004105339171136967, + "loss": 0.1836, + "step": 201460 + }, + { + "epoch": 8.34, + "grad_norm": 0.498046875, + "learning_rate": 0.0004105256031337505, + "loss": 0.1442, + "step": 201470 + }, + { + "epoch": 8.35, + "grad_norm": 0.984375, + "learning_rate": 0.00041051728885171235, + "loss": 0.2529, + "step": 201480 + }, + { + "epoch": 8.35, + "grad_norm": 0.6015625, + "learning_rate": 0.00041050897426759804, + "loss": 0.2237, + "step": 201490 + }, + { + "epoch": 8.35, + "grad_norm": 0.78125, + "learning_rate": 0.0004105006593814232, + "loss": 0.184, + "step": 201500 + }, + { + "epoch": 8.35, + "grad_norm": 1.0234375, + "learning_rate": 0.0004104923441932034, + "loss": 0.1606, + "step": 201510 + }, + { + "epoch": 8.35, + "grad_norm": 0.486328125, + "learning_rate": 0.00041048402870295437, + "loss": 0.2106, + "step": 201520 + }, + { + "epoch": 8.35, + "grad_norm": 1.0390625, + "learning_rate": 0.00041047571291069173, + "loss": 0.1885, + "step": 201530 + }, + { + "epoch": 8.35, + "grad_norm": 0.419921875, + "learning_rate": 0.0004104673968164311, + "loss": 0.186, + "step": 201540 + }, + { + "epoch": 8.35, + "grad_norm": 1.4453125, + "learning_rate": 0.0004104590804201882, + "loss": 0.1883, + "step": 201550 + }, + { + "epoch": 8.35, + "grad_norm": 1.078125, + "learning_rate": 0.00041045076372197865, + "loss": 0.2219, + "step": 201560 + }, + { + "epoch": 8.35, + "grad_norm": 0.71484375, + "learning_rate": 0.00041044244672181797, + "loss": 0.2281, + "step": 201570 + }, + { + "epoch": 8.35, + "grad_norm": 0.8359375, + "learning_rate": 0.00041043412941972206, + "loss": 0.2257, + "step": 201580 + }, + { + "epoch": 8.35, + "grad_norm": 0.515625, + "learning_rate": 0.00041042581181570636, + "loss": 0.2439, + "step": 201590 + }, + { + "epoch": 8.35, + "grad_norm": 0.40625, + "learning_rate": 0.00041041749390978666, + "loss": 0.1851, + "step": 201600 + }, + { + "epoch": 8.35, + "grad_norm": 0.6640625, + "learning_rate": 0.00041040917570197855, + "loss": 0.1996, + "step": 201610 + }, + { + "epoch": 8.35, + "grad_norm": 0.447265625, + "learning_rate": 0.0004104008571922977, + "loss": 0.2257, + "step": 201620 + }, + { + "epoch": 8.35, + "grad_norm": 1.0078125, + "learning_rate": 0.0004103925383807597, + "loss": 0.1967, + "step": 201630 + }, + { + "epoch": 8.35, + "grad_norm": 0.275390625, + "learning_rate": 0.0004103842192673804, + "loss": 0.216, + "step": 201640 + }, + { + "epoch": 8.35, + "grad_norm": 0.37109375, + "learning_rate": 0.00041037589985217524, + "loss": 0.1863, + "step": 201650 + }, + { + "epoch": 8.35, + "grad_norm": 0.4375, + "learning_rate": 0.00041036758013515993, + "loss": 0.1626, + "step": 201660 + }, + { + "epoch": 8.35, + "grad_norm": 0.447265625, + "learning_rate": 0.0004103592601163502, + "loss": 0.1887, + "step": 201670 + }, + { + "epoch": 8.35, + "grad_norm": 1.8828125, + "learning_rate": 0.0004103509397957616, + "loss": 0.2099, + "step": 201680 + }, + { + "epoch": 8.35, + "grad_norm": 0.8984375, + "learning_rate": 0.00041034261917340987, + "loss": 0.1801, + "step": 201690 + }, + { + "epoch": 8.35, + "grad_norm": 1.65625, + "learning_rate": 0.00041033429824931066, + "loss": 0.198, + "step": 201700 + }, + { + "epoch": 8.35, + "grad_norm": 0.462890625, + "learning_rate": 0.0004103259770234796, + "loss": 0.1924, + "step": 201710 + }, + { + "epoch": 8.36, + "grad_norm": 0.53125, + "learning_rate": 0.0004103176554959324, + "loss": 0.2194, + "step": 201720 + }, + { + "epoch": 8.36, + "grad_norm": 0.87109375, + "learning_rate": 0.00041030933366668465, + "loss": 0.1794, + "step": 201730 + }, + { + "epoch": 8.36, + "grad_norm": 0.6953125, + "learning_rate": 0.0004103010115357521, + "loss": 0.1899, + "step": 201740 + }, + { + "epoch": 8.36, + "grad_norm": 0.6328125, + "learning_rate": 0.0004102926891031503, + "loss": 0.1875, + "step": 201750 + }, + { + "epoch": 8.36, + "grad_norm": 1.2578125, + "learning_rate": 0.000410284366368895, + "loss": 0.2, + "step": 201760 + }, + { + "epoch": 8.36, + "grad_norm": 1.234375, + "learning_rate": 0.0004102760433330018, + "loss": 0.1696, + "step": 201770 + }, + { + "epoch": 8.36, + "grad_norm": 0.6015625, + "learning_rate": 0.0004102677199954864, + "loss": 0.1628, + "step": 201780 + }, + { + "epoch": 8.36, + "grad_norm": 1.21875, + "learning_rate": 0.00041025939635636445, + "loss": 0.127, + "step": 201790 + }, + { + "epoch": 8.36, + "grad_norm": 3.015625, + "learning_rate": 0.00041025107241565164, + "loss": 0.2257, + "step": 201800 + }, + { + "epoch": 8.36, + "grad_norm": 0.369140625, + "learning_rate": 0.0004102427481733636, + "loss": 0.1432, + "step": 201810 + }, + { + "epoch": 8.36, + "grad_norm": 1.2265625, + "learning_rate": 0.00041023442362951606, + "loss": 0.1666, + "step": 201820 + }, + { + "epoch": 8.36, + "grad_norm": 1.0390625, + "learning_rate": 0.00041022609878412457, + "loss": 0.1976, + "step": 201830 + }, + { + "epoch": 8.36, + "grad_norm": 0.5625, + "learning_rate": 0.0004102177736372049, + "loss": 0.2176, + "step": 201840 + }, + { + "epoch": 8.36, + "grad_norm": 0.34375, + "learning_rate": 0.0004102094481887727, + "loss": 0.1939, + "step": 201850 + }, + { + "epoch": 8.36, + "grad_norm": 0.9453125, + "learning_rate": 0.0004102011224388437, + "loss": 0.1967, + "step": 201860 + }, + { + "epoch": 8.36, + "grad_norm": 0.52734375, + "learning_rate": 0.00041019279638743335, + "loss": 0.2412, + "step": 201870 + }, + { + "epoch": 8.36, + "grad_norm": 0.68359375, + "learning_rate": 0.00041018447003455746, + "loss": 0.2059, + "step": 201880 + }, + { + "epoch": 8.36, + "grad_norm": 0.5625, + "learning_rate": 0.00041017614338023173, + "loss": 0.1988, + "step": 201890 + }, + { + "epoch": 8.36, + "grad_norm": 0.5, + "learning_rate": 0.00041016781642447183, + "loss": 0.1762, + "step": 201900 + }, + { + "epoch": 8.36, + "grad_norm": 0.953125, + "learning_rate": 0.00041015948916729337, + "loss": 0.2454, + "step": 201910 + }, + { + "epoch": 8.36, + "grad_norm": 0.86328125, + "learning_rate": 0.000410151161608712, + "loss": 0.2442, + "step": 201920 + }, + { + "epoch": 8.36, + "grad_norm": 0.8515625, + "learning_rate": 0.0004101428337487435, + "loss": 0.239, + "step": 201930 + }, + { + "epoch": 8.36, + "grad_norm": 0.80859375, + "learning_rate": 0.00041013450558740353, + "loss": 0.1967, + "step": 201940 + }, + { + "epoch": 8.36, + "grad_norm": 0.9296875, + "learning_rate": 0.0004101261771247076, + "loss": 0.2092, + "step": 201950 + }, + { + "epoch": 8.37, + "grad_norm": 0.6171875, + "learning_rate": 0.0004101178483606716, + "loss": 0.2101, + "step": 201960 + }, + { + "epoch": 8.37, + "grad_norm": 0.7578125, + "learning_rate": 0.000410109519295311, + "loss": 0.1802, + "step": 201970 + }, + { + "epoch": 8.37, + "grad_norm": 0.84375, + "learning_rate": 0.00041010118992864167, + "loss": 0.1629, + "step": 201980 + }, + { + "epoch": 8.37, + "grad_norm": 0.8671875, + "learning_rate": 0.000410092860260679, + "loss": 0.2049, + "step": 201990 + }, + { + "epoch": 8.37, + "grad_norm": 0.8046875, + "learning_rate": 0.0004100845302914391, + "loss": 0.2, + "step": 202000 + }, + { + "epoch": 8.37, + "grad_norm": 1.0, + "learning_rate": 0.00041007620002093725, + "loss": 0.2468, + "step": 202010 + }, + { + "epoch": 8.37, + "grad_norm": 0.8515625, + "learning_rate": 0.00041006786944918936, + "loss": 0.1872, + "step": 202020 + }, + { + "epoch": 8.37, + "grad_norm": 0.3984375, + "learning_rate": 0.000410059538576211, + "loss": 0.194, + "step": 202030 + }, + { + "epoch": 8.37, + "grad_norm": 0.64453125, + "learning_rate": 0.00041005120740201785, + "loss": 0.2017, + "step": 202040 + }, + { + "epoch": 8.37, + "grad_norm": 0.55859375, + "learning_rate": 0.0004100428759266256, + "loss": 0.2054, + "step": 202050 + }, + { + "epoch": 8.37, + "grad_norm": 0.95703125, + "learning_rate": 0.00041003454415005004, + "loss": 0.2427, + "step": 202060 + }, + { + "epoch": 8.37, + "grad_norm": 1.03125, + "learning_rate": 0.0004100262120723066, + "loss": 0.2035, + "step": 202070 + }, + { + "epoch": 8.37, + "grad_norm": 0.48046875, + "learning_rate": 0.0004100178796934112, + "loss": 0.181, + "step": 202080 + }, + { + "epoch": 8.37, + "grad_norm": 0.79296875, + "learning_rate": 0.00041000954701337944, + "loss": 0.2003, + "step": 202090 + }, + { + "epoch": 8.37, + "grad_norm": 0.431640625, + "learning_rate": 0.000410001214032227, + "loss": 0.1906, + "step": 202100 + }, + { + "epoch": 8.37, + "grad_norm": 0.447265625, + "learning_rate": 0.00040999288074996957, + "loss": 0.1274, + "step": 202110 + }, + { + "epoch": 8.37, + "grad_norm": 0.5625, + "learning_rate": 0.0004099845471666227, + "loss": 0.218, + "step": 202120 + }, + { + "epoch": 8.37, + "grad_norm": 1.140625, + "learning_rate": 0.0004099762132822022, + "loss": 0.2174, + "step": 202130 + }, + { + "epoch": 8.37, + "grad_norm": 0.5625, + "learning_rate": 0.0004099678790967238, + "loss": 0.2185, + "step": 202140 + }, + { + "epoch": 8.37, + "grad_norm": 0.58203125, + "learning_rate": 0.0004099595446102032, + "loss": 0.2398, + "step": 202150 + }, + { + "epoch": 8.37, + "grad_norm": 0.9296875, + "learning_rate": 0.0004099512098226559, + "loss": 0.2306, + "step": 202160 + }, + { + "epoch": 8.37, + "grad_norm": 0.94140625, + "learning_rate": 0.00040994287473409776, + "loss": 0.2077, + "step": 202170 + }, + { + "epoch": 8.37, + "grad_norm": 0.384765625, + "learning_rate": 0.0004099345393445444, + "loss": 0.2202, + "step": 202180 + }, + { + "epoch": 8.37, + "grad_norm": 1.0078125, + "learning_rate": 0.0004099262036540115, + "loss": 0.178, + "step": 202190 + }, + { + "epoch": 8.38, + "grad_norm": 0.6640625, + "learning_rate": 0.0004099178676625147, + "loss": 0.2265, + "step": 202200 + }, + { + "epoch": 8.38, + "grad_norm": 0.515625, + "learning_rate": 0.0004099095313700698, + "loss": 0.2003, + "step": 202210 + }, + { + "epoch": 8.38, + "grad_norm": 0.283203125, + "learning_rate": 0.0004099011947766924, + "loss": 0.2064, + "step": 202220 + }, + { + "epoch": 8.38, + "grad_norm": 0.796875, + "learning_rate": 0.0004098928578823983, + "loss": 0.1628, + "step": 202230 + }, + { + "epoch": 8.38, + "grad_norm": 0.482421875, + "learning_rate": 0.000409884520687203, + "loss": 0.215, + "step": 202240 + }, + { + "epoch": 8.38, + "grad_norm": 1.2890625, + "learning_rate": 0.0004098761831911223, + "loss": 0.1434, + "step": 202250 + }, + { + "epoch": 8.38, + "grad_norm": 0.7109375, + "learning_rate": 0.000409867845394172, + "loss": 0.2183, + "step": 202260 + }, + { + "epoch": 8.38, + "grad_norm": 0.5234375, + "learning_rate": 0.00040985950729636756, + "loss": 0.204, + "step": 202270 + }, + { + "epoch": 8.38, + "grad_norm": 0.25, + "learning_rate": 0.00040985116889772483, + "loss": 0.2387, + "step": 202280 + }, + { + "epoch": 8.38, + "grad_norm": 0.65234375, + "learning_rate": 0.00040984283019825953, + "loss": 0.1543, + "step": 202290 + }, + { + "epoch": 8.38, + "grad_norm": 0.41796875, + "learning_rate": 0.0004098344911979872, + "loss": 0.2174, + "step": 202300 + }, + { + "epoch": 8.38, + "grad_norm": 0.65234375, + "learning_rate": 0.0004098261518969236, + "loss": 0.1868, + "step": 202310 + }, + { + "epoch": 8.38, + "grad_norm": 0.89453125, + "learning_rate": 0.00040981781229508453, + "loss": 0.1849, + "step": 202320 + }, + { + "epoch": 8.38, + "grad_norm": 0.12109375, + "learning_rate": 0.00040980947239248555, + "loss": 0.2093, + "step": 202330 + }, + { + "epoch": 8.38, + "grad_norm": 0.6015625, + "learning_rate": 0.0004098011321891424, + "loss": 0.1922, + "step": 202340 + }, + { + "epoch": 8.38, + "grad_norm": 2.09375, + "learning_rate": 0.00040979279168507077, + "loss": 0.2488, + "step": 202350 + }, + { + "epoch": 8.38, + "grad_norm": 2.015625, + "learning_rate": 0.0004097844508802864, + "loss": 0.1543, + "step": 202360 + }, + { + "epoch": 8.38, + "grad_norm": 0.859375, + "learning_rate": 0.0004097761097748049, + "loss": 0.1955, + "step": 202370 + }, + { + "epoch": 8.38, + "grad_norm": 1.0546875, + "learning_rate": 0.00040976776836864205, + "loss": 0.232, + "step": 202380 + }, + { + "epoch": 8.38, + "grad_norm": 0.57421875, + "learning_rate": 0.0004097594266618135, + "loss": 0.179, + "step": 202390 + }, + { + "epoch": 8.38, + "grad_norm": 0.96484375, + "learning_rate": 0.00040975108465433495, + "loss": 0.2141, + "step": 202400 + }, + { + "epoch": 8.38, + "grad_norm": 0.46484375, + "learning_rate": 0.0004097427423462221, + "loss": 0.2293, + "step": 202410 + }, + { + "epoch": 8.38, + "grad_norm": 0.8515625, + "learning_rate": 0.0004097343997374907, + "loss": 0.1951, + "step": 202420 + }, + { + "epoch": 8.38, + "grad_norm": 0.5546875, + "learning_rate": 0.0004097260568281564, + "loss": 0.1638, + "step": 202430 + }, + { + "epoch": 8.39, + "grad_norm": 0.48828125, + "learning_rate": 0.0004097177136182349, + "loss": 0.2237, + "step": 202440 + }, + { + "epoch": 8.39, + "grad_norm": 0.2060546875, + "learning_rate": 0.0004097093701077419, + "loss": 0.1897, + "step": 202450 + }, + { + "epoch": 8.39, + "grad_norm": 0.7265625, + "learning_rate": 0.0004097010262966931, + "loss": 0.2073, + "step": 202460 + }, + { + "epoch": 8.39, + "grad_norm": 0.8359375, + "learning_rate": 0.00040969268218510427, + "loss": 0.2077, + "step": 202470 + }, + { + "epoch": 8.39, + "grad_norm": 0.5703125, + "learning_rate": 0.00040968433777299095, + "loss": 0.2335, + "step": 202480 + }, + { + "epoch": 8.39, + "grad_norm": 0.8046875, + "learning_rate": 0.00040967599306036904, + "loss": 0.2172, + "step": 202490 + }, + { + "epoch": 8.39, + "grad_norm": 2.359375, + "learning_rate": 0.0004096676480472542, + "loss": 0.2022, + "step": 202500 + }, + { + "epoch": 8.39, + "grad_norm": 1.046875, + "learning_rate": 0.000409659302733662, + "loss": 0.1854, + "step": 202510 + }, + { + "epoch": 8.39, + "grad_norm": 3.40625, + "learning_rate": 0.0004096509571196083, + "loss": 0.2582, + "step": 202520 + }, + { + "epoch": 8.39, + "grad_norm": 0.8046875, + "learning_rate": 0.0004096426112051087, + "loss": 0.2213, + "step": 202530 + }, + { + "epoch": 8.39, + "grad_norm": 0.6953125, + "learning_rate": 0.00040963426499017897, + "loss": 0.1788, + "step": 202540 + }, + { + "epoch": 8.39, + "grad_norm": 0.75390625, + "learning_rate": 0.00040962591847483476, + "loss": 0.2123, + "step": 202550 + }, + { + "epoch": 8.39, + "grad_norm": 0.1875, + "learning_rate": 0.0004096175716590918, + "loss": 0.1871, + "step": 202560 + }, + { + "epoch": 8.39, + "grad_norm": 0.69140625, + "learning_rate": 0.00040960922454296583, + "loss": 0.2457, + "step": 202570 + }, + { + "epoch": 8.39, + "grad_norm": 0.7421875, + "learning_rate": 0.0004096008771264726, + "loss": 0.2171, + "step": 202580 + }, + { + "epoch": 8.39, + "grad_norm": 1.1171875, + "learning_rate": 0.00040959252940962765, + "loss": 0.1832, + "step": 202590 + }, + { + "epoch": 8.39, + "grad_norm": 0.81640625, + "learning_rate": 0.0004095841813924468, + "loss": 0.1756, + "step": 202600 + }, + { + "epoch": 8.39, + "grad_norm": 0.59765625, + "learning_rate": 0.00040957583307494585, + "loss": 0.1496, + "step": 202610 + }, + { + "epoch": 8.39, + "grad_norm": 0.625, + "learning_rate": 0.0004095674844571403, + "loss": 0.2468, + "step": 202620 + }, + { + "epoch": 8.39, + "grad_norm": 0.703125, + "learning_rate": 0.00040955913553904606, + "loss": 0.1875, + "step": 202630 + }, + { + "epoch": 8.39, + "grad_norm": 1.6171875, + "learning_rate": 0.00040955078632067866, + "loss": 0.1846, + "step": 202640 + }, + { + "epoch": 8.39, + "grad_norm": 0.90625, + "learning_rate": 0.00040954243680205404, + "loss": 0.2357, + "step": 202650 + }, + { + "epoch": 8.39, + "grad_norm": 1.703125, + "learning_rate": 0.0004095340869831877, + "loss": 0.2275, + "step": 202660 + }, + { + "epoch": 8.39, + "grad_norm": 1.40625, + "learning_rate": 0.00040952573686409545, + "loss": 0.1582, + "step": 202670 + }, + { + "epoch": 8.39, + "grad_norm": 0.296875, + "learning_rate": 0.000409517386444793, + "loss": 0.239, + "step": 202680 + }, + { + "epoch": 8.4, + "grad_norm": 0.63671875, + "learning_rate": 0.00040950903572529605, + "loss": 0.1834, + "step": 202690 + }, + { + "epoch": 8.4, + "grad_norm": 0.6953125, + "learning_rate": 0.00040950068470562036, + "loss": 0.2615, + "step": 202700 + }, + { + "epoch": 8.4, + "grad_norm": 0.37890625, + "learning_rate": 0.00040949233338578156, + "loss": 0.1777, + "step": 202710 + }, + { + "epoch": 8.4, + "grad_norm": 0.59765625, + "learning_rate": 0.0004094839817657954, + "loss": 0.195, + "step": 202720 + }, + { + "epoch": 8.4, + "grad_norm": 1.8203125, + "learning_rate": 0.00040947562984567764, + "loss": 0.2365, + "step": 202730 + }, + { + "epoch": 8.4, + "grad_norm": 0.9296875, + "learning_rate": 0.000409467277625444, + "loss": 0.2268, + "step": 202740 + }, + { + "epoch": 8.4, + "grad_norm": 0.828125, + "learning_rate": 0.00040945892510511005, + "loss": 0.2409, + "step": 202750 + }, + { + "epoch": 8.4, + "grad_norm": 0.5, + "learning_rate": 0.00040945057228469175, + "loss": 0.194, + "step": 202760 + }, + { + "epoch": 8.4, + "grad_norm": 0.72265625, + "learning_rate": 0.0004094422191642046, + "loss": 0.2192, + "step": 202770 + }, + { + "epoch": 8.4, + "grad_norm": 0.79296875, + "learning_rate": 0.0004094338657436645, + "loss": 0.1879, + "step": 202780 + }, + { + "epoch": 8.4, + "grad_norm": 1.3984375, + "learning_rate": 0.00040942551202308695, + "loss": 0.2479, + "step": 202790 + }, + { + "epoch": 8.4, + "grad_norm": 0.5625, + "learning_rate": 0.0004094171580024879, + "loss": 0.1949, + "step": 202800 + }, + { + "epoch": 8.4, + "grad_norm": 1.2109375, + "learning_rate": 0.0004094088036818829, + "loss": 0.1884, + "step": 202810 + }, + { + "epoch": 8.4, + "grad_norm": 0.39453125, + "learning_rate": 0.0004094004490612879, + "loss": 0.192, + "step": 202820 + }, + { + "epoch": 8.4, + "grad_norm": 0.3359375, + "learning_rate": 0.0004093920941407183, + "loss": 0.1957, + "step": 202830 + }, + { + "epoch": 8.4, + "grad_norm": 0.50390625, + "learning_rate": 0.0004093837389201901, + "loss": 0.1914, + "step": 202840 + }, + { + "epoch": 8.4, + "grad_norm": 0.99609375, + "learning_rate": 0.0004093753833997189, + "loss": 0.2466, + "step": 202850 + }, + { + "epoch": 8.4, + "grad_norm": 0.6484375, + "learning_rate": 0.00040936702757932045, + "loss": 0.2217, + "step": 202860 + }, + { + "epoch": 8.4, + "grad_norm": 0.66796875, + "learning_rate": 0.0004093586714590104, + "loss": 0.1571, + "step": 202870 + }, + { + "epoch": 8.4, + "grad_norm": 0.52734375, + "learning_rate": 0.00040935031503880456, + "loss": 0.1638, + "step": 202880 + }, + { + "epoch": 8.4, + "grad_norm": 0.5, + "learning_rate": 0.0004093419583187187, + "loss": 0.1434, + "step": 202890 + }, + { + "epoch": 8.4, + "grad_norm": 0.0, + "learning_rate": 0.0004093336012987684, + "loss": 0.1902, + "step": 202900 + }, + { + "epoch": 8.4, + "grad_norm": 0.84375, + "learning_rate": 0.00040932524397896945, + "loss": 0.2566, + "step": 202910 + }, + { + "epoch": 8.4, + "grad_norm": 1.1328125, + "learning_rate": 0.0004093168863593377, + "loss": 0.2317, + "step": 202920 + }, + { + "epoch": 8.41, + "grad_norm": 0.703125, + "learning_rate": 0.0004093085284398887, + "loss": 0.1775, + "step": 202930 + }, + { + "epoch": 8.41, + "grad_norm": 0.90625, + "learning_rate": 0.00040930017022063823, + "loss": 0.1792, + "step": 202940 + }, + { + "epoch": 8.41, + "grad_norm": 0.93359375, + "learning_rate": 0.00040929181170160214, + "loss": 0.2558, + "step": 202950 + }, + { + "epoch": 8.41, + "grad_norm": 0.60546875, + "learning_rate": 0.000409283452882796, + "loss": 0.2535, + "step": 202960 + }, + { + "epoch": 8.41, + "grad_norm": 0.640625, + "learning_rate": 0.0004092750937642356, + "loss": 0.2085, + "step": 202970 + }, + { + "epoch": 8.41, + "grad_norm": 0.9765625, + "learning_rate": 0.00040926673434593663, + "loss": 0.1932, + "step": 202980 + }, + { + "epoch": 8.41, + "grad_norm": 0.890625, + "learning_rate": 0.00040925837462791497, + "loss": 0.2185, + "step": 202990 + }, + { + "epoch": 8.41, + "grad_norm": 1.140625, + "learning_rate": 0.00040925001461018616, + "loss": 0.2045, + "step": 203000 + }, + { + "epoch": 8.41, + "grad_norm": 0.220703125, + "learning_rate": 0.0004092416542927661, + "loss": 0.1938, + "step": 203010 + }, + { + "epoch": 8.41, + "grad_norm": 0.234375, + "learning_rate": 0.00040923329367567043, + "loss": 0.2384, + "step": 203020 + }, + { + "epoch": 8.41, + "grad_norm": 1.1328125, + "learning_rate": 0.00040922493275891484, + "loss": 0.2321, + "step": 203030 + }, + { + "epoch": 8.41, + "grad_norm": 0.84765625, + "learning_rate": 0.00040921657154251515, + "loss": 0.2048, + "step": 203040 + }, + { + "epoch": 8.41, + "grad_norm": 2.171875, + "learning_rate": 0.0004092082100264871, + "loss": 0.2429, + "step": 203050 + }, + { + "epoch": 8.41, + "grad_norm": 1.03125, + "learning_rate": 0.00040919984821084635, + "loss": 0.1979, + "step": 203060 + }, + { + "epoch": 8.41, + "grad_norm": 0.59765625, + "learning_rate": 0.00040919148609560864, + "loss": 0.2117, + "step": 203070 + }, + { + "epoch": 8.41, + "grad_norm": 0.96875, + "learning_rate": 0.0004091831236807898, + "loss": 0.2234, + "step": 203080 + }, + { + "epoch": 8.41, + "grad_norm": 0.39453125, + "learning_rate": 0.0004091747609664055, + "loss": 0.2055, + "step": 203090 + }, + { + "epoch": 8.41, + "grad_norm": 0.765625, + "learning_rate": 0.0004091663979524715, + "loss": 0.1834, + "step": 203100 + }, + { + "epoch": 8.41, + "grad_norm": 0.81640625, + "learning_rate": 0.0004091580346390035, + "loss": 0.1978, + "step": 203110 + }, + { + "epoch": 8.41, + "grad_norm": 0.49609375, + "learning_rate": 0.0004091496710260173, + "loss": 0.1815, + "step": 203120 + }, + { + "epoch": 8.41, + "grad_norm": 0.97265625, + "learning_rate": 0.0004091413071135286, + "loss": 0.2287, + "step": 203130 + }, + { + "epoch": 8.41, + "grad_norm": 0.6328125, + "learning_rate": 0.0004091329429015531, + "loss": 0.2429, + "step": 203140 + }, + { + "epoch": 8.41, + "grad_norm": 0.5078125, + "learning_rate": 0.00040912457839010666, + "loss": 0.219, + "step": 203150 + }, + { + "epoch": 8.41, + "grad_norm": 2.453125, + "learning_rate": 0.00040911621357920494, + "loss": 0.2009, + "step": 203160 + }, + { + "epoch": 8.42, + "grad_norm": 1.0703125, + "learning_rate": 0.0004091078484688636, + "loss": 0.1986, + "step": 203170 + }, + { + "epoch": 8.42, + "grad_norm": 1.28125, + "learning_rate": 0.00040909948305909853, + "loss": 0.2185, + "step": 203180 + }, + { + "epoch": 8.42, + "grad_norm": 0.26171875, + "learning_rate": 0.00040909111734992545, + "loss": 0.2011, + "step": 203190 + }, + { + "epoch": 8.42, + "grad_norm": 0.88671875, + "learning_rate": 0.00040908275134135996, + "loss": 0.2394, + "step": 203200 + }, + { + "epoch": 8.42, + "grad_norm": 0.427734375, + "learning_rate": 0.00040907438503341796, + "loss": 0.217, + "step": 203210 + }, + { + "epoch": 8.42, + "grad_norm": 0.0, + "learning_rate": 0.00040906601842611526, + "loss": 0.1933, + "step": 203220 + }, + { + "epoch": 8.42, + "grad_norm": 0.76953125, + "learning_rate": 0.0004090576515194674, + "loss": 0.2217, + "step": 203230 + }, + { + "epoch": 8.42, + "grad_norm": 0.78125, + "learning_rate": 0.00040904928431349016, + "loss": 0.1638, + "step": 203240 + }, + { + "epoch": 8.42, + "grad_norm": 1.3203125, + "learning_rate": 0.00040904091680819945, + "loss": 0.2147, + "step": 203250 + }, + { + "epoch": 8.42, + "grad_norm": 1.0078125, + "learning_rate": 0.00040903254900361084, + "loss": 0.2449, + "step": 203260 + }, + { + "epoch": 8.42, + "grad_norm": 0.73046875, + "learning_rate": 0.00040902418089974014, + "loss": 0.2372, + "step": 203270 + }, + { + "epoch": 8.42, + "grad_norm": 0.375, + "learning_rate": 0.00040901581249660313, + "loss": 0.2397, + "step": 203280 + }, + { + "epoch": 8.42, + "grad_norm": 0.2080078125, + "learning_rate": 0.0004090074437942155, + "loss": 0.1945, + "step": 203290 + }, + { + "epoch": 8.42, + "grad_norm": 0.859375, + "learning_rate": 0.00040899907479259313, + "loss": 0.1606, + "step": 203300 + }, + { + "epoch": 8.42, + "grad_norm": 0.2333984375, + "learning_rate": 0.0004089907054917515, + "loss": 0.2244, + "step": 203310 + }, + { + "epoch": 8.42, + "grad_norm": 0.431640625, + "learning_rate": 0.0004089823358917067, + "loss": 0.2223, + "step": 203320 + }, + { + "epoch": 8.42, + "grad_norm": 0.859375, + "learning_rate": 0.00040897396599247425, + "loss": 0.1991, + "step": 203330 + }, + { + "epoch": 8.42, + "grad_norm": 0.90234375, + "learning_rate": 0.00040896559579407, + "loss": 0.1796, + "step": 203340 + }, + { + "epoch": 8.42, + "grad_norm": 0.353515625, + "learning_rate": 0.0004089572252965096, + "loss": 0.2149, + "step": 203350 + }, + { + "epoch": 8.42, + "grad_norm": 0.57421875, + "learning_rate": 0.00040894885449980887, + "loss": 0.2258, + "step": 203360 + }, + { + "epoch": 8.42, + "grad_norm": 2.03125, + "learning_rate": 0.0004089404834039836, + "loss": 0.1786, + "step": 203370 + }, + { + "epoch": 8.42, + "grad_norm": 0.65625, + "learning_rate": 0.00040893211200904953, + "loss": 0.25, + "step": 203380 + }, + { + "epoch": 8.42, + "grad_norm": 0.2421875, + "learning_rate": 0.00040892374031502234, + "loss": 0.1839, + "step": 203390 + }, + { + "epoch": 8.42, + "grad_norm": 0.859375, + "learning_rate": 0.00040891536832191776, + "loss": 0.1919, + "step": 203400 + }, + { + "epoch": 8.43, + "grad_norm": 0.84375, + "learning_rate": 0.0004089069960297518, + "loss": 0.1899, + "step": 203410 + }, + { + "epoch": 8.43, + "grad_norm": 0.7109375, + "learning_rate": 0.00040889862343853993, + "loss": 0.176, + "step": 203420 + }, + { + "epoch": 8.43, + "grad_norm": 0.453125, + "learning_rate": 0.000408890250548298, + "loss": 0.1855, + "step": 203430 + }, + { + "epoch": 8.43, + "grad_norm": 0.62890625, + "learning_rate": 0.0004088818773590418, + "loss": 0.1904, + "step": 203440 + }, + { + "epoch": 8.43, + "grad_norm": 0.283203125, + "learning_rate": 0.000408873503870787, + "loss": 0.1711, + "step": 203450 + }, + { + "epoch": 8.43, + "grad_norm": 0.68359375, + "learning_rate": 0.0004088651300835495, + "loss": 0.1681, + "step": 203460 + }, + { + "epoch": 8.43, + "grad_norm": 0.279296875, + "learning_rate": 0.000408856755997345, + "loss": 0.2237, + "step": 203470 + }, + { + "epoch": 8.43, + "grad_norm": 0.875, + "learning_rate": 0.0004088483816121892, + "loss": 0.1966, + "step": 203480 + }, + { + "epoch": 8.43, + "grad_norm": 0.5546875, + "learning_rate": 0.00040884000692809785, + "loss": 0.1845, + "step": 203490 + }, + { + "epoch": 8.43, + "grad_norm": 0.8828125, + "learning_rate": 0.0004088316319450869, + "loss": 0.1541, + "step": 203500 + }, + { + "epoch": 8.43, + "grad_norm": 0.2373046875, + "learning_rate": 0.0004088232566631719, + "loss": 0.1568, + "step": 203510 + }, + { + "epoch": 8.43, + "grad_norm": 0.9609375, + "learning_rate": 0.00040881488108236865, + "loss": 0.204, + "step": 203520 + }, + { + "epoch": 8.43, + "grad_norm": 0.45703125, + "learning_rate": 0.0004088065052026929, + "loss": 0.2013, + "step": 203530 + }, + { + "epoch": 8.43, + "grad_norm": 0.76953125, + "learning_rate": 0.0004087981290241606, + "loss": 0.1787, + "step": 203540 + }, + { + "epoch": 8.43, + "grad_norm": 0.95703125, + "learning_rate": 0.00040878975254678726, + "loss": 0.1802, + "step": 203550 + }, + { + "epoch": 8.43, + "grad_norm": 0.6953125, + "learning_rate": 0.00040878137577058873, + "loss": 0.1897, + "step": 203560 + }, + { + "epoch": 8.43, + "grad_norm": 1.234375, + "learning_rate": 0.00040877299869558083, + "loss": 0.1685, + "step": 203570 + }, + { + "epoch": 8.43, + "grad_norm": 0.63671875, + "learning_rate": 0.0004087646213217794, + "loss": 0.2523, + "step": 203580 + }, + { + "epoch": 8.43, + "grad_norm": 0.94140625, + "learning_rate": 0.00040875624364919997, + "loss": 0.2163, + "step": 203590 + }, + { + "epoch": 8.43, + "grad_norm": 0.72265625, + "learning_rate": 0.0004087478656778585, + "loss": 0.2203, + "step": 203600 + }, + { + "epoch": 8.43, + "grad_norm": 0.70703125, + "learning_rate": 0.00040873948740777064, + "loss": 0.2456, + "step": 203610 + }, + { + "epoch": 8.43, + "grad_norm": 1.0625, + "learning_rate": 0.0004087311088389522, + "loss": 0.2333, + "step": 203620 + }, + { + "epoch": 8.43, + "grad_norm": 0.47265625, + "learning_rate": 0.000408722729971419, + "loss": 0.1624, + "step": 203630 + }, + { + "epoch": 8.43, + "grad_norm": 0.65234375, + "learning_rate": 0.00040871435080518673, + "loss": 0.1795, + "step": 203640 + }, + { + "epoch": 8.44, + "grad_norm": 0.439453125, + "learning_rate": 0.0004087059713402712, + "loss": 0.2101, + "step": 203650 + }, + { + "epoch": 8.44, + "grad_norm": 0.38671875, + "learning_rate": 0.00040869759157668816, + "loss": 0.1574, + "step": 203660 + }, + { + "epoch": 8.44, + "grad_norm": 0.58984375, + "learning_rate": 0.0004086892115144534, + "loss": 0.1993, + "step": 203670 + }, + { + "epoch": 8.44, + "grad_norm": 0.52734375, + "learning_rate": 0.0004086808311535827, + "loss": 0.1744, + "step": 203680 + }, + { + "epoch": 8.44, + "grad_norm": 0.1787109375, + "learning_rate": 0.00040867245049409185, + "loss": 0.1903, + "step": 203690 + }, + { + "epoch": 8.44, + "grad_norm": 1.7578125, + "learning_rate": 0.0004086640695359966, + "loss": 0.1563, + "step": 203700 + }, + { + "epoch": 8.44, + "grad_norm": 0.359375, + "learning_rate": 0.00040865568827931257, + "loss": 0.2022, + "step": 203710 + }, + { + "epoch": 8.44, + "grad_norm": 0.68359375, + "learning_rate": 0.0004086473067240557, + "loss": 0.2382, + "step": 203720 + }, + { + "epoch": 8.44, + "grad_norm": 0.4453125, + "learning_rate": 0.00040863892487024177, + "loss": 0.187, + "step": 203730 + }, + { + "epoch": 8.44, + "grad_norm": 0.8203125, + "learning_rate": 0.0004086305427178865, + "loss": 0.2151, + "step": 203740 + }, + { + "epoch": 8.44, + "grad_norm": 0.15625, + "learning_rate": 0.0004086221602670057, + "loss": 0.1478, + "step": 203750 + }, + { + "epoch": 8.44, + "grad_norm": 0.58984375, + "learning_rate": 0.0004086137775176152, + "loss": 0.215, + "step": 203760 + }, + { + "epoch": 8.44, + "grad_norm": 0.447265625, + "learning_rate": 0.00040860539446973057, + "loss": 0.1783, + "step": 203770 + }, + { + "epoch": 8.44, + "grad_norm": 0.330078125, + "learning_rate": 0.00040859701112336776, + "loss": 0.1232, + "step": 203780 + }, + { + "epoch": 8.44, + "grad_norm": 0.98828125, + "learning_rate": 0.00040858862747854247, + "loss": 0.2426, + "step": 203790 + }, + { + "epoch": 8.44, + "grad_norm": 0.6015625, + "learning_rate": 0.0004085802435352706, + "loss": 0.1875, + "step": 203800 + }, + { + "epoch": 8.44, + "grad_norm": 0.734375, + "learning_rate": 0.00040857185929356777, + "loss": 0.1959, + "step": 203810 + }, + { + "epoch": 8.44, + "grad_norm": 0.28515625, + "learning_rate": 0.0004085634747534498, + "loss": 0.1887, + "step": 203820 + }, + { + "epoch": 8.44, + "grad_norm": 1.1328125, + "learning_rate": 0.00040855508991493255, + "loss": 0.2158, + "step": 203830 + }, + { + "epoch": 8.44, + "grad_norm": 0.72265625, + "learning_rate": 0.00040854670477803175, + "loss": 0.226, + "step": 203840 + }, + { + "epoch": 8.44, + "grad_norm": 0.828125, + "learning_rate": 0.0004085383193427631, + "loss": 0.2122, + "step": 203850 + }, + { + "epoch": 8.44, + "grad_norm": 0.80859375, + "learning_rate": 0.00040852993360914256, + "loss": 0.2243, + "step": 203860 + }, + { + "epoch": 8.44, + "grad_norm": 0.71875, + "learning_rate": 0.0004085215475771857, + "loss": 0.1797, + "step": 203870 + }, + { + "epoch": 8.44, + "grad_norm": 0.63671875, + "learning_rate": 0.00040851316124690845, + "loss": 0.2077, + "step": 203880 + }, + { + "epoch": 8.45, + "grad_norm": 0.4921875, + "learning_rate": 0.0004085047746183266, + "loss": 0.1763, + "step": 203890 + }, + { + "epoch": 8.45, + "grad_norm": 0.33984375, + "learning_rate": 0.0004084963876914558, + "loss": 0.1864, + "step": 203900 + }, + { + "epoch": 8.45, + "grad_norm": 0.86328125, + "learning_rate": 0.000408488000466312, + "loss": 0.224, + "step": 203910 + }, + { + "epoch": 8.45, + "grad_norm": 0.51171875, + "learning_rate": 0.0004084796129429108, + "loss": 0.1597, + "step": 203920 + }, + { + "epoch": 8.45, + "grad_norm": 0.90625, + "learning_rate": 0.00040847122512126813, + "loss": 0.1803, + "step": 203930 + }, + { + "epoch": 8.45, + "grad_norm": 0.6953125, + "learning_rate": 0.00040846283700139973, + "loss": 0.1785, + "step": 203940 + }, + { + "epoch": 8.45, + "grad_norm": 0.54296875, + "learning_rate": 0.00040845444858332136, + "loss": 0.1785, + "step": 203950 + }, + { + "epoch": 8.45, + "grad_norm": 0.96875, + "learning_rate": 0.00040844605986704884, + "loss": 0.196, + "step": 203960 + }, + { + "epoch": 8.45, + "grad_norm": 0.59375, + "learning_rate": 0.00040843767085259797, + "loss": 0.2205, + "step": 203970 + }, + { + "epoch": 8.45, + "grad_norm": 0.30859375, + "learning_rate": 0.0004084292815399845, + "loss": 0.2097, + "step": 203980 + }, + { + "epoch": 8.45, + "grad_norm": 0.72265625, + "learning_rate": 0.0004084208919292242, + "loss": 0.232, + "step": 203990 + }, + { + "epoch": 8.45, + "grad_norm": 1.1796875, + "learning_rate": 0.0004084125020203329, + "loss": 0.2275, + "step": 204000 + }, + { + "epoch": 8.45, + "grad_norm": 0.55859375, + "learning_rate": 0.0004084041118133264, + "loss": 0.1671, + "step": 204010 + }, + { + "epoch": 8.45, + "grad_norm": 0.94921875, + "learning_rate": 0.00040839572130822044, + "loss": 0.2418, + "step": 204020 + }, + { + "epoch": 8.45, + "grad_norm": 0.640625, + "learning_rate": 0.00040838733050503086, + "loss": 0.1233, + "step": 204030 + }, + { + "epoch": 8.45, + "grad_norm": 0.1728515625, + "learning_rate": 0.00040837893940377345, + "loss": 0.2118, + "step": 204040 + }, + { + "epoch": 8.45, + "grad_norm": 0.91015625, + "learning_rate": 0.00040837054800446393, + "loss": 0.2699, + "step": 204050 + }, + { + "epoch": 8.45, + "grad_norm": 0.73046875, + "learning_rate": 0.0004083621563071182, + "loss": 0.1698, + "step": 204060 + }, + { + "epoch": 8.45, + "grad_norm": 0.70703125, + "learning_rate": 0.000408353764311752, + "loss": 0.2238, + "step": 204070 + }, + { + "epoch": 8.45, + "grad_norm": 0.443359375, + "learning_rate": 0.000408345372018381, + "loss": 0.1949, + "step": 204080 + }, + { + "epoch": 8.45, + "grad_norm": 1.53125, + "learning_rate": 0.00040833697942702123, + "loss": 0.2558, + "step": 204090 + }, + { + "epoch": 8.45, + "grad_norm": 1.109375, + "learning_rate": 0.00040832858653768833, + "loss": 0.1914, + "step": 204100 + }, + { + "epoch": 8.45, + "grad_norm": 1.2109375, + "learning_rate": 0.00040832019335039813, + "loss": 0.2002, + "step": 204110 + }, + { + "epoch": 8.45, + "grad_norm": 0.6875, + "learning_rate": 0.0004083117998651664, + "loss": 0.2278, + "step": 204120 + }, + { + "epoch": 8.46, + "grad_norm": 0.58984375, + "learning_rate": 0.000408303406082009, + "loss": 0.1614, + "step": 204130 + }, + { + "epoch": 8.46, + "grad_norm": 0.87109375, + "learning_rate": 0.00040829501200094167, + "loss": 0.209, + "step": 204140 + }, + { + "epoch": 8.46, + "grad_norm": 0.77734375, + "learning_rate": 0.00040828661762198014, + "loss": 0.255, + "step": 204150 + }, + { + "epoch": 8.46, + "grad_norm": 0.7578125, + "learning_rate": 0.0004082782229451404, + "loss": 0.2339, + "step": 204160 + }, + { + "epoch": 8.46, + "grad_norm": 0.78125, + "learning_rate": 0.0004082698279704381, + "loss": 0.211, + "step": 204170 + }, + { + "epoch": 8.46, + "grad_norm": 0.59375, + "learning_rate": 0.00040826143269788906, + "loss": 0.1908, + "step": 204180 + }, + { + "epoch": 8.46, + "grad_norm": 0.50390625, + "learning_rate": 0.00040825303712750904, + "loss": 0.2606, + "step": 204190 + }, + { + "epoch": 8.46, + "grad_norm": 0.8203125, + "learning_rate": 0.00040824464125931403, + "loss": 0.252, + "step": 204200 + }, + { + "epoch": 8.46, + "grad_norm": 0.67578125, + "learning_rate": 0.0004082362450933196, + "loss": 0.2241, + "step": 204210 + }, + { + "epoch": 8.46, + "grad_norm": 0.6875, + "learning_rate": 0.00040822784862954163, + "loss": 0.2163, + "step": 204220 + }, + { + "epoch": 8.46, + "grad_norm": 1.3515625, + "learning_rate": 0.000408219451867996, + "loss": 0.1842, + "step": 204230 + }, + { + "epoch": 8.46, + "grad_norm": 0.69140625, + "learning_rate": 0.0004082110548086984, + "loss": 0.2119, + "step": 204240 + }, + { + "epoch": 8.46, + "grad_norm": 0.267578125, + "learning_rate": 0.00040820265745166476, + "loss": 0.16, + "step": 204250 + }, + { + "epoch": 8.46, + "grad_norm": 0.671875, + "learning_rate": 0.0004081942597969107, + "loss": 0.1749, + "step": 204260 + }, + { + "epoch": 8.46, + "grad_norm": 0.69921875, + "learning_rate": 0.0004081858618444522, + "loss": 0.2266, + "step": 204270 + }, + { + "epoch": 8.46, + "grad_norm": 0.47265625, + "learning_rate": 0.00040817746359430495, + "loss": 0.2103, + "step": 204280 + }, + { + "epoch": 8.46, + "grad_norm": 0.71484375, + "learning_rate": 0.0004081690650464848, + "loss": 0.2027, + "step": 204290 + }, + { + "epoch": 8.46, + "grad_norm": 1.2109375, + "learning_rate": 0.0004081606662010076, + "loss": 0.2339, + "step": 204300 + }, + { + "epoch": 8.46, + "grad_norm": 0.84375, + "learning_rate": 0.00040815226705788905, + "loss": 0.1602, + "step": 204310 + }, + { + "epoch": 8.46, + "grad_norm": 0.81640625, + "learning_rate": 0.000408143867617145, + "loss": 0.1857, + "step": 204320 + }, + { + "epoch": 8.46, + "grad_norm": 0.46484375, + "learning_rate": 0.0004081354678787913, + "loss": 0.1984, + "step": 204330 + }, + { + "epoch": 8.46, + "grad_norm": 1.046875, + "learning_rate": 0.00040812706784284375, + "loss": 0.1931, + "step": 204340 + }, + { + "epoch": 8.46, + "grad_norm": 0.73828125, + "learning_rate": 0.00040811866750931807, + "loss": 0.1444, + "step": 204350 + }, + { + "epoch": 8.46, + "grad_norm": 0.462890625, + "learning_rate": 0.0004081102668782302, + "loss": 0.2292, + "step": 204360 + }, + { + "epoch": 8.46, + "grad_norm": 1.0078125, + "learning_rate": 0.00040810186594959584, + "loss": 0.2067, + "step": 204370 + }, + { + "epoch": 8.47, + "grad_norm": 1.0, + "learning_rate": 0.0004080934647234309, + "loss": 0.2024, + "step": 204380 + }, + { + "epoch": 8.47, + "grad_norm": 0.482421875, + "learning_rate": 0.000408085063199751, + "loss": 0.1898, + "step": 204390 + }, + { + "epoch": 8.47, + "grad_norm": 0.35546875, + "learning_rate": 0.0004080766613785722, + "loss": 0.212, + "step": 204400 + }, + { + "epoch": 8.47, + "grad_norm": 0.58984375, + "learning_rate": 0.0004080682592599102, + "loss": 0.2086, + "step": 204410 + }, + { + "epoch": 8.47, + "grad_norm": 2.125, + "learning_rate": 0.0004080598568437807, + "loss": 0.2284, + "step": 204420 + }, + { + "epoch": 8.47, + "grad_norm": 1.40625, + "learning_rate": 0.00040805145413019974, + "loss": 0.1959, + "step": 204430 + }, + { + "epoch": 8.47, + "grad_norm": 0.9609375, + "learning_rate": 0.00040804305111918297, + "loss": 0.2101, + "step": 204440 + }, + { + "epoch": 8.47, + "grad_norm": 0.625, + "learning_rate": 0.00040803464781074617, + "loss": 0.2232, + "step": 204450 + }, + { + "epoch": 8.47, + "grad_norm": 0.2099609375, + "learning_rate": 0.0004080262442049053, + "loss": 0.1812, + "step": 204460 + }, + { + "epoch": 8.47, + "grad_norm": 0.734375, + "learning_rate": 0.00040801784030167616, + "loss": 0.1837, + "step": 204470 + }, + { + "epoch": 8.47, + "grad_norm": 0.498046875, + "learning_rate": 0.00040800943610107444, + "loss": 0.1712, + "step": 204480 + }, + { + "epoch": 8.47, + "grad_norm": 0.84765625, + "learning_rate": 0.00040800103160311597, + "loss": 0.1835, + "step": 204490 + }, + { + "epoch": 8.47, + "grad_norm": 1.390625, + "learning_rate": 0.0004079926268078167, + "loss": 0.2221, + "step": 204500 + }, + { + "epoch": 8.47, + "grad_norm": 0.703125, + "learning_rate": 0.00040798422171519234, + "loss": 0.2037, + "step": 204510 + }, + { + "epoch": 8.47, + "grad_norm": 0.154296875, + "learning_rate": 0.00040797581632525884, + "loss": 0.1771, + "step": 204520 + }, + { + "epoch": 8.47, + "grad_norm": 0.462890625, + "learning_rate": 0.0004079674106380318, + "loss": 0.2199, + "step": 204530 + }, + { + "epoch": 8.47, + "grad_norm": 1.7109375, + "learning_rate": 0.0004079590046535271, + "loss": 0.1935, + "step": 204540 + }, + { + "epoch": 8.47, + "grad_norm": 1.5625, + "learning_rate": 0.0004079505983717607, + "loss": 0.2136, + "step": 204550 + }, + { + "epoch": 8.47, + "grad_norm": 0.54296875, + "learning_rate": 0.0004079421917927483, + "loss": 0.226, + "step": 204560 + }, + { + "epoch": 8.47, + "grad_norm": 1.234375, + "learning_rate": 0.0004079337849165058, + "loss": 0.2091, + "step": 204570 + }, + { + "epoch": 8.47, + "grad_norm": 0.8046875, + "learning_rate": 0.0004079253777430489, + "loss": 0.213, + "step": 204580 + }, + { + "epoch": 8.47, + "grad_norm": 0.4375, + "learning_rate": 0.00040791697027239355, + "loss": 0.1853, + "step": 204590 + }, + { + "epoch": 8.47, + "grad_norm": 0.9765625, + "learning_rate": 0.0004079085625045555, + "loss": 0.1762, + "step": 204600 + }, + { + "epoch": 8.47, + "grad_norm": 0.75, + "learning_rate": 0.00040790015443955055, + "loss": 0.2464, + "step": 204610 + }, + { + "epoch": 8.48, + "grad_norm": 0.609375, + "learning_rate": 0.00040789174607739465, + "loss": 0.2061, + "step": 204620 + }, + { + "epoch": 8.48, + "grad_norm": 0.2373046875, + "learning_rate": 0.00040788333741810344, + "loss": 0.2096, + "step": 204630 + }, + { + "epoch": 8.48, + "grad_norm": 0.388671875, + "learning_rate": 0.0004078749284616929, + "loss": 0.1898, + "step": 204640 + }, + { + "epoch": 8.48, + "grad_norm": 1.3203125, + "learning_rate": 0.00040786651920817873, + "loss": 0.2097, + "step": 204650 + }, + { + "epoch": 8.48, + "grad_norm": 0.79296875, + "learning_rate": 0.00040785810965757684, + "loss": 0.1899, + "step": 204660 + }, + { + "epoch": 8.48, + "grad_norm": 0.4921875, + "learning_rate": 0.0004078496998099031, + "loss": 0.2398, + "step": 204670 + }, + { + "epoch": 8.48, + "grad_norm": 0.53125, + "learning_rate": 0.00040784128966517324, + "loss": 0.2076, + "step": 204680 + }, + { + "epoch": 8.48, + "grad_norm": 0.44140625, + "learning_rate": 0.0004078328792234031, + "loss": 0.1617, + "step": 204690 + }, + { + "epoch": 8.48, + "grad_norm": 1.109375, + "learning_rate": 0.0004078244684846085, + "loss": 0.2133, + "step": 204700 + }, + { + "epoch": 8.48, + "grad_norm": 1.03125, + "learning_rate": 0.00040781605744880534, + "loss": 0.2223, + "step": 204710 + }, + { + "epoch": 8.48, + "grad_norm": 1.6328125, + "learning_rate": 0.0004078076461160094, + "loss": 0.1931, + "step": 204720 + }, + { + "epoch": 8.48, + "grad_norm": 0.8984375, + "learning_rate": 0.00040779923448623645, + "loss": 0.2206, + "step": 204730 + }, + { + "epoch": 8.48, + "grad_norm": 0.69921875, + "learning_rate": 0.0004077908225595025, + "loss": 0.1972, + "step": 204740 + }, + { + "epoch": 8.48, + "grad_norm": 0.8984375, + "learning_rate": 0.0004077824103358232, + "loss": 0.2476, + "step": 204750 + }, + { + "epoch": 8.48, + "grad_norm": 0.6484375, + "learning_rate": 0.0004077739978152144, + "loss": 0.2178, + "step": 204760 + }, + { + "epoch": 8.48, + "grad_norm": 0.578125, + "learning_rate": 0.000407765584997692, + "loss": 0.2035, + "step": 204770 + }, + { + "epoch": 8.48, + "grad_norm": 0.78515625, + "learning_rate": 0.0004077571718832719, + "loss": 0.2245, + "step": 204780 + }, + { + "epoch": 8.48, + "grad_norm": 0.29296875, + "learning_rate": 0.0004077487584719697, + "loss": 0.2207, + "step": 204790 + }, + { + "epoch": 8.48, + "grad_norm": 0.75390625, + "learning_rate": 0.0004077403447638014, + "loss": 0.194, + "step": 204800 + }, + { + "epoch": 8.48, + "grad_norm": 0.3203125, + "learning_rate": 0.00040773193075878286, + "loss": 0.1664, + "step": 204810 + }, + { + "epoch": 8.48, + "grad_norm": 0.63671875, + "learning_rate": 0.0004077235164569298, + "loss": 0.2129, + "step": 204820 + }, + { + "epoch": 8.48, + "grad_norm": 1.0625, + "learning_rate": 0.00040771510185825816, + "loss": 0.2096, + "step": 204830 + }, + { + "epoch": 8.48, + "grad_norm": 0.80078125, + "learning_rate": 0.00040770668696278367, + "loss": 0.1797, + "step": 204840 + }, + { + "epoch": 8.48, + "grad_norm": 0.5625, + "learning_rate": 0.00040769827177052233, + "loss": 0.1881, + "step": 204850 + }, + { + "epoch": 8.49, + "grad_norm": 0.478515625, + "learning_rate": 0.0004076898562814898, + "loss": 0.2197, + "step": 204860 + }, + { + "epoch": 8.49, + "grad_norm": 0.609375, + "learning_rate": 0.000407681440495702, + "loss": 0.1639, + "step": 204870 + }, + { + "epoch": 8.49, + "grad_norm": 0.41796875, + "learning_rate": 0.00040767302441317477, + "loss": 0.2054, + "step": 204880 + }, + { + "epoch": 8.49, + "grad_norm": 0.8359375, + "learning_rate": 0.0004076646080339239, + "loss": 0.2326, + "step": 204890 + }, + { + "epoch": 8.49, + "grad_norm": 0.5546875, + "learning_rate": 0.0004076561913579653, + "loss": 0.2086, + "step": 204900 + }, + { + "epoch": 8.49, + "grad_norm": 0.27734375, + "learning_rate": 0.00040764777438531474, + "loss": 0.1918, + "step": 204910 + }, + { + "epoch": 8.49, + "grad_norm": 1.2578125, + "learning_rate": 0.0004076393571159881, + "loss": 0.1939, + "step": 204920 + }, + { + "epoch": 8.49, + "grad_norm": 0.75, + "learning_rate": 0.0004076309395500013, + "loss": 0.194, + "step": 204930 + }, + { + "epoch": 8.49, + "grad_norm": 0.8203125, + "learning_rate": 0.00040762252168737, + "loss": 0.1885, + "step": 204940 + }, + { + "epoch": 8.49, + "grad_norm": 0.953125, + "learning_rate": 0.0004076141035281101, + "loss": 0.2139, + "step": 204950 + }, + { + "epoch": 8.49, + "grad_norm": 0.71875, + "learning_rate": 0.0004076056850722376, + "loss": 0.1813, + "step": 204960 + }, + { + "epoch": 8.49, + "grad_norm": 0.70703125, + "learning_rate": 0.0004075972663197681, + "loss": 0.1698, + "step": 204970 + }, + { + "epoch": 8.49, + "grad_norm": 0.62109375, + "learning_rate": 0.0004075888472707177, + "loss": 0.2236, + "step": 204980 + }, + { + "epoch": 8.49, + "grad_norm": 0.62109375, + "learning_rate": 0.00040758042792510203, + "loss": 0.2217, + "step": 204990 + }, + { + "epoch": 8.49, + "grad_norm": 0.486328125, + "learning_rate": 0.000407572008282937, + "loss": 0.2352, + "step": 205000 + }, + { + "epoch": 8.49, + "grad_norm": 0.92578125, + "learning_rate": 0.0004075635883442385, + "loss": 0.2056, + "step": 205010 + }, + { + "epoch": 8.49, + "grad_norm": 1.21875, + "learning_rate": 0.00040755516810902235, + "loss": 0.2119, + "step": 205020 + }, + { + "epoch": 8.49, + "grad_norm": 1.046875, + "learning_rate": 0.0004075467475773043, + "loss": 0.2119, + "step": 205030 + }, + { + "epoch": 8.49, + "grad_norm": 1.640625, + "learning_rate": 0.0004075383267491004, + "loss": 0.2155, + "step": 205040 + }, + { + "epoch": 8.49, + "grad_norm": 1.046875, + "learning_rate": 0.00040752990562442626, + "loss": 0.2241, + "step": 205050 + }, + { + "epoch": 8.49, + "grad_norm": 0.48828125, + "learning_rate": 0.00040752148420329794, + "loss": 0.1795, + "step": 205060 + }, + { + "epoch": 8.49, + "grad_norm": 0.291015625, + "learning_rate": 0.0004075130624857312, + "loss": 0.1934, + "step": 205070 + }, + { + "epoch": 8.49, + "grad_norm": 0.85546875, + "learning_rate": 0.0004075046404717418, + "loss": 0.154, + "step": 205080 + }, + { + "epoch": 8.49, + "grad_norm": 1.03125, + "learning_rate": 0.0004074962181613458, + "loss": 0.2421, + "step": 205090 + }, + { + "epoch": 8.5, + "grad_norm": 0.546875, + "learning_rate": 0.00040748779555455886, + "loss": 0.1795, + "step": 205100 + }, + { + "epoch": 8.5, + "grad_norm": 0.51171875, + "learning_rate": 0.0004074793726513969, + "loss": 0.2186, + "step": 205110 + }, + { + "epoch": 8.5, + "grad_norm": 0.68359375, + "learning_rate": 0.00040747094945187574, + "loss": 0.2137, + "step": 205120 + }, + { + "epoch": 8.5, + "grad_norm": 0.875, + "learning_rate": 0.0004074625259560113, + "loss": 0.1969, + "step": 205130 + }, + { + "epoch": 8.5, + "grad_norm": 0.8359375, + "learning_rate": 0.0004074541021638194, + "loss": 0.2073, + "step": 205140 + }, + { + "epoch": 8.5, + "grad_norm": 0.6328125, + "learning_rate": 0.0004074456780753158, + "loss": 0.1954, + "step": 205150 + }, + { + "epoch": 8.5, + "grad_norm": 0.71875, + "learning_rate": 0.0004074372536905165, + "loss": 0.1819, + "step": 205160 + }, + { + "epoch": 8.5, + "grad_norm": 1.40625, + "learning_rate": 0.00040742882900943737, + "loss": 0.2242, + "step": 205170 + }, + { + "epoch": 8.5, + "grad_norm": 0.828125, + "learning_rate": 0.000407420404032094, + "loss": 0.1952, + "step": 205180 + }, + { + "epoch": 8.5, + "grad_norm": 0.34765625, + "learning_rate": 0.00040741197875850257, + "loss": 0.2246, + "step": 205190 + }, + { + "epoch": 8.5, + "grad_norm": 1.5859375, + "learning_rate": 0.0004074035531886787, + "loss": 0.1914, + "step": 205200 + }, + { + "epoch": 8.5, + "grad_norm": 0.4375, + "learning_rate": 0.0004073951273226384, + "loss": 0.1458, + "step": 205210 + }, + { + "epoch": 8.5, + "grad_norm": 0.63671875, + "learning_rate": 0.0004073867011603975, + "loss": 0.1451, + "step": 205220 + }, + { + "epoch": 8.5, + "grad_norm": 0.80078125, + "learning_rate": 0.00040737827470197174, + "loss": 0.2109, + "step": 205230 + }, + { + "epoch": 8.5, + "grad_norm": 1.09375, + "learning_rate": 0.00040736984794737706, + "loss": 0.213, + "step": 205240 + }, + { + "epoch": 8.5, + "grad_norm": 1.0078125, + "learning_rate": 0.0004073614208966294, + "loss": 0.2241, + "step": 205250 + }, + { + "epoch": 8.5, + "grad_norm": 0.333984375, + "learning_rate": 0.0004073529935497445, + "loss": 0.2165, + "step": 205260 + }, + { + "epoch": 8.5, + "grad_norm": 0.58203125, + "learning_rate": 0.0004073445659067383, + "loss": 0.1564, + "step": 205270 + }, + { + "epoch": 8.5, + "grad_norm": 0.87109375, + "learning_rate": 0.0004073361379676266, + "loss": 0.1876, + "step": 205280 + }, + { + "epoch": 8.5, + "grad_norm": 0.45703125, + "learning_rate": 0.00040732770973242525, + "loss": 0.1925, + "step": 205290 + }, + { + "epoch": 8.5, + "grad_norm": 0.64453125, + "learning_rate": 0.0004073192812011502, + "loss": 0.1722, + "step": 205300 + }, + { + "epoch": 8.5, + "grad_norm": 0.6796875, + "learning_rate": 0.0004073108523738172, + "loss": 0.2274, + "step": 205310 + }, + { + "epoch": 8.5, + "grad_norm": 1.0078125, + "learning_rate": 0.00040730242325044214, + "loss": 0.2182, + "step": 205320 + }, + { + "epoch": 8.5, + "grad_norm": 0.51953125, + "learning_rate": 0.00040729399383104093, + "loss": 0.1528, + "step": 205330 + }, + { + "epoch": 8.51, + "grad_norm": 0.232421875, + "learning_rate": 0.00040728556411562944, + "loss": 0.1895, + "step": 205340 + }, + { + "epoch": 8.51, + "grad_norm": 0.83984375, + "learning_rate": 0.0004072771341042235, + "loss": 0.1709, + "step": 205350 + }, + { + "epoch": 8.51, + "grad_norm": 1.65625, + "learning_rate": 0.00040726870379683897, + "loss": 0.2147, + "step": 205360 + }, + { + "epoch": 8.51, + "grad_norm": 0.388671875, + "learning_rate": 0.0004072602731934917, + "loss": 0.1776, + "step": 205370 + }, + { + "epoch": 8.51, + "grad_norm": 2.3125, + "learning_rate": 0.0004072518422941975, + "loss": 0.2006, + "step": 205380 + }, + { + "epoch": 8.51, + "grad_norm": 1.0703125, + "learning_rate": 0.00040724341109897247, + "loss": 0.2422, + "step": 205390 + }, + { + "epoch": 8.51, + "grad_norm": 1.109375, + "learning_rate": 0.0004072349796078322, + "loss": 0.1969, + "step": 205400 + }, + { + "epoch": 8.51, + "grad_norm": 0.5, + "learning_rate": 0.0004072265478207928, + "loss": 0.2071, + "step": 205410 + }, + { + "epoch": 8.51, + "grad_norm": 0.890625, + "learning_rate": 0.0004072181157378699, + "loss": 0.1998, + "step": 205420 + }, + { + "epoch": 8.51, + "grad_norm": 0.30078125, + "learning_rate": 0.0004072096833590796, + "loss": 0.2381, + "step": 205430 + }, + { + "epoch": 8.51, + "grad_norm": 1.484375, + "learning_rate": 0.0004072012506844376, + "loss": 0.2084, + "step": 205440 + }, + { + "epoch": 8.51, + "grad_norm": 0.5546875, + "learning_rate": 0.00040719281771395976, + "loss": 0.202, + "step": 205450 + }, + { + "epoch": 8.51, + "grad_norm": 0.6640625, + "learning_rate": 0.00040718438444766207, + "loss": 0.2235, + "step": 205460 + }, + { + "epoch": 8.51, + "grad_norm": 0.98828125, + "learning_rate": 0.00040717595088556036, + "loss": 0.201, + "step": 205470 + }, + { + "epoch": 8.51, + "grad_norm": 0.640625, + "learning_rate": 0.0004071675170276704, + "loss": 0.2133, + "step": 205480 + }, + { + "epoch": 8.51, + "grad_norm": 0.482421875, + "learning_rate": 0.00040715908287400826, + "loss": 0.167, + "step": 205490 + }, + { + "epoch": 8.51, + "grad_norm": 0.578125, + "learning_rate": 0.0004071506484245896, + "loss": 0.2197, + "step": 205500 + }, + { + "epoch": 8.51, + "grad_norm": 0.68359375, + "learning_rate": 0.00040714221367943047, + "loss": 0.1648, + "step": 205510 + }, + { + "epoch": 8.51, + "grad_norm": 1.8203125, + "learning_rate": 0.0004071337786385466, + "loss": 0.194, + "step": 205520 + }, + { + "epoch": 8.51, + "grad_norm": 0.412109375, + "learning_rate": 0.00040712534330195397, + "loss": 0.1818, + "step": 205530 + }, + { + "epoch": 8.51, + "grad_norm": 0.890625, + "learning_rate": 0.0004071169076696684, + "loss": 0.1978, + "step": 205540 + }, + { + "epoch": 8.51, + "grad_norm": 0.6484375, + "learning_rate": 0.00040710847174170575, + "loss": 0.1986, + "step": 205550 + }, + { + "epoch": 8.51, + "grad_norm": 0.490234375, + "learning_rate": 0.00040710003551808194, + "loss": 0.2266, + "step": 205560 + }, + { + "epoch": 8.51, + "grad_norm": 0.392578125, + "learning_rate": 0.0004070915989988129, + "loss": 0.2403, + "step": 205570 + }, + { + "epoch": 8.52, + "grad_norm": 0.478515625, + "learning_rate": 0.00040708316218391433, + "loss": 0.2577, + "step": 205580 + }, + { + "epoch": 8.52, + "grad_norm": 1.7109375, + "learning_rate": 0.00040707472507340225, + "loss": 0.2084, + "step": 205590 + }, + { + "epoch": 8.52, + "grad_norm": 0.5078125, + "learning_rate": 0.0004070662876672925, + "loss": 0.2318, + "step": 205600 + }, + { + "epoch": 8.52, + "grad_norm": 2.5625, + "learning_rate": 0.00040705784996560094, + "loss": 0.1984, + "step": 205610 + }, + { + "epoch": 8.52, + "grad_norm": 1.0546875, + "learning_rate": 0.00040704941196834354, + "loss": 0.1989, + "step": 205620 + }, + { + "epoch": 8.52, + "grad_norm": 1.671875, + "learning_rate": 0.00040704097367553604, + "loss": 0.2463, + "step": 205630 + }, + { + "epoch": 8.52, + "grad_norm": 1.1484375, + "learning_rate": 0.0004070325350871944, + "loss": 0.2287, + "step": 205640 + }, + { + "epoch": 8.52, + "grad_norm": 0.76171875, + "learning_rate": 0.0004070240962033345, + "loss": 0.1167, + "step": 205650 + }, + { + "epoch": 8.52, + "grad_norm": 0.5625, + "learning_rate": 0.00040701565702397217, + "loss": 0.1587, + "step": 205660 + }, + { + "epoch": 8.52, + "grad_norm": 0.55859375, + "learning_rate": 0.0004070072175491234, + "loss": 0.2244, + "step": 205670 + }, + { + "epoch": 8.52, + "grad_norm": 0.458984375, + "learning_rate": 0.0004069987777788039, + "loss": 0.2215, + "step": 205680 + }, + { + "epoch": 8.52, + "grad_norm": 0.76953125, + "learning_rate": 0.0004069903377130297, + "loss": 0.2094, + "step": 205690 + }, + { + "epoch": 8.52, + "grad_norm": 0.55859375, + "learning_rate": 0.00040698189735181666, + "loss": 0.24, + "step": 205700 + }, + { + "epoch": 8.52, + "grad_norm": 0.62109375, + "learning_rate": 0.0004069734566951806, + "loss": 0.2183, + "step": 205710 + }, + { + "epoch": 8.52, + "grad_norm": 0.3125, + "learning_rate": 0.0004069650157431375, + "loss": 0.2059, + "step": 205720 + }, + { + "epoch": 8.52, + "grad_norm": 0.16796875, + "learning_rate": 0.0004069565744957032, + "loss": 0.1986, + "step": 205730 + }, + { + "epoch": 8.52, + "grad_norm": 0.99609375, + "learning_rate": 0.00040694813295289356, + "loss": 0.162, + "step": 205740 + }, + { + "epoch": 8.52, + "grad_norm": 0.94140625, + "learning_rate": 0.00040693969111472446, + "loss": 0.1788, + "step": 205750 + }, + { + "epoch": 8.52, + "grad_norm": 0.6328125, + "learning_rate": 0.0004069312489812118, + "loss": 0.1871, + "step": 205760 + }, + { + "epoch": 8.52, + "grad_norm": 0.57421875, + "learning_rate": 0.00040692280655237154, + "loss": 0.2196, + "step": 205770 + }, + { + "epoch": 8.52, + "grad_norm": 0.7734375, + "learning_rate": 0.0004069143638282194, + "loss": 0.1976, + "step": 205780 + }, + { + "epoch": 8.52, + "grad_norm": 0.71875, + "learning_rate": 0.0004069059208087714, + "loss": 0.225, + "step": 205790 + }, + { + "epoch": 8.52, + "grad_norm": 0.68359375, + "learning_rate": 0.00040689747749404347, + "loss": 0.2054, + "step": 205800 + }, + { + "epoch": 8.52, + "grad_norm": 0.515625, + "learning_rate": 0.0004068890338840514, + "loss": 0.192, + "step": 205810 + }, + { + "epoch": 8.53, + "grad_norm": 0.6875, + "learning_rate": 0.0004068805899788111, + "loss": 0.21, + "step": 205820 + }, + { + "epoch": 8.53, + "grad_norm": 0.58203125, + "learning_rate": 0.00040687214577833853, + "loss": 0.2221, + "step": 205830 + }, + { + "epoch": 8.53, + "grad_norm": 0.23828125, + "learning_rate": 0.0004068637012826494, + "loss": 0.1959, + "step": 205840 + }, + { + "epoch": 8.53, + "grad_norm": 0.58984375, + "learning_rate": 0.0004068552564917598, + "loss": 0.201, + "step": 205850 + }, + { + "epoch": 8.53, + "grad_norm": 0.890625, + "learning_rate": 0.0004068468114056856, + "loss": 0.2553, + "step": 205860 + }, + { + "epoch": 8.53, + "grad_norm": 0.89453125, + "learning_rate": 0.0004068383660244425, + "loss": 0.1994, + "step": 205870 + }, + { + "epoch": 8.53, + "grad_norm": 3.0625, + "learning_rate": 0.0004068299203480467, + "loss": 0.2261, + "step": 205880 + }, + { + "epoch": 8.53, + "grad_norm": 0.6484375, + "learning_rate": 0.00040682147437651374, + "loss": 0.2335, + "step": 205890 + }, + { + "epoch": 8.53, + "grad_norm": 1.03125, + "learning_rate": 0.0004068130281098598, + "loss": 0.1715, + "step": 205900 + }, + { + "epoch": 8.53, + "grad_norm": 2.765625, + "learning_rate": 0.0004068045815481007, + "loss": 0.1641, + "step": 205910 + }, + { + "epoch": 8.53, + "grad_norm": 0.59765625, + "learning_rate": 0.00040679613469125225, + "loss": 0.2375, + "step": 205920 + }, + { + "epoch": 8.53, + "grad_norm": 0.62109375, + "learning_rate": 0.00040678768753933046, + "loss": 0.1749, + "step": 205930 + }, + { + "epoch": 8.53, + "grad_norm": 0.8359375, + "learning_rate": 0.00040677924009235113, + "loss": 0.1836, + "step": 205940 + }, + { + "epoch": 8.53, + "grad_norm": 0.6484375, + "learning_rate": 0.0004067707923503302, + "loss": 0.1907, + "step": 205950 + }, + { + "epoch": 8.53, + "grad_norm": 0.62890625, + "learning_rate": 0.00040676234431328363, + "loss": 0.2155, + "step": 205960 + }, + { + "epoch": 8.53, + "grad_norm": 0.87890625, + "learning_rate": 0.00040675389598122724, + "loss": 0.2006, + "step": 205970 + }, + { + "epoch": 8.53, + "grad_norm": 0.96875, + "learning_rate": 0.00040674544735417695, + "loss": 0.1632, + "step": 205980 + }, + { + "epoch": 8.53, + "grad_norm": 0.6328125, + "learning_rate": 0.00040673699843214863, + "loss": 0.1443, + "step": 205990 + }, + { + "epoch": 8.53, + "grad_norm": 1.46875, + "learning_rate": 0.00040672854921515813, + "loss": 0.175, + "step": 206000 + }, + { + "epoch": 8.53, + "grad_norm": 0.73828125, + "learning_rate": 0.00040672009970322155, + "loss": 0.1595, + "step": 206010 + }, + { + "epoch": 8.53, + "grad_norm": 1.5234375, + "learning_rate": 0.0004067116498963546, + "loss": 0.2077, + "step": 206020 + }, + { + "epoch": 8.53, + "grad_norm": 2.578125, + "learning_rate": 0.00040670319979457326, + "loss": 0.2092, + "step": 206030 + }, + { + "epoch": 8.53, + "grad_norm": 0.62109375, + "learning_rate": 0.00040669474939789344, + "loss": 0.2078, + "step": 206040 + }, + { + "epoch": 8.53, + "grad_norm": 0.83203125, + "learning_rate": 0.000406686298706331, + "loss": 0.1886, + "step": 206050 + }, + { + "epoch": 8.53, + "grad_norm": 0.71875, + "learning_rate": 0.0004066778477199019, + "loss": 0.204, + "step": 206060 + }, + { + "epoch": 8.54, + "grad_norm": 0.54296875, + "learning_rate": 0.00040666939643862197, + "loss": 0.177, + "step": 206070 + }, + { + "epoch": 8.54, + "grad_norm": 1.1484375, + "learning_rate": 0.0004066609448625072, + "loss": 0.2114, + "step": 206080 + }, + { + "epoch": 8.54, + "grad_norm": 1.0234375, + "learning_rate": 0.0004066524929915734, + "loss": 0.1952, + "step": 206090 + }, + { + "epoch": 8.54, + "grad_norm": 0.4453125, + "learning_rate": 0.0004066440408258365, + "loss": 0.2096, + "step": 206100 + }, + { + "epoch": 8.54, + "grad_norm": 0.80859375, + "learning_rate": 0.0004066355883653125, + "loss": 0.2015, + "step": 206110 + }, + { + "epoch": 8.54, + "grad_norm": 1.09375, + "learning_rate": 0.00040662713561001726, + "loss": 0.2477, + "step": 206120 + }, + { + "epoch": 8.54, + "grad_norm": 0.5234375, + "learning_rate": 0.0004066186825599666, + "loss": 0.2152, + "step": 206130 + }, + { + "epoch": 8.54, + "grad_norm": 0.60546875, + "learning_rate": 0.00040661022921517653, + "loss": 0.2115, + "step": 206140 + }, + { + "epoch": 8.54, + "grad_norm": 0.51171875, + "learning_rate": 0.00040660177557566286, + "loss": 0.1895, + "step": 206150 + }, + { + "epoch": 8.54, + "grad_norm": 0.9609375, + "learning_rate": 0.0004065933216414416, + "loss": 0.1901, + "step": 206160 + }, + { + "epoch": 8.54, + "grad_norm": 0.921875, + "learning_rate": 0.00040658486741252867, + "loss": 0.1875, + "step": 206170 + }, + { + "epoch": 8.54, + "grad_norm": 0.57421875, + "learning_rate": 0.00040657641288893985, + "loss": 0.1611, + "step": 206180 + }, + { + "epoch": 8.54, + "grad_norm": 0.6015625, + "learning_rate": 0.0004065679580706911, + "loss": 0.2307, + "step": 206190 + }, + { + "epoch": 8.54, + "grad_norm": 0.7109375, + "learning_rate": 0.00040655950295779853, + "loss": 0.1997, + "step": 206200 + }, + { + "epoch": 8.54, + "grad_norm": 0.5234375, + "learning_rate": 0.00040655104755027774, + "loss": 0.1577, + "step": 206210 + }, + { + "epoch": 8.54, + "grad_norm": 0.421875, + "learning_rate": 0.0004065425918481448, + "loss": 0.2074, + "step": 206220 + }, + { + "epoch": 8.54, + "grad_norm": 1.2421875, + "learning_rate": 0.00040653413585141563, + "loss": 0.217, + "step": 206230 + }, + { + "epoch": 8.54, + "grad_norm": 0.625, + "learning_rate": 0.00040652567956010613, + "loss": 0.2236, + "step": 206240 + }, + { + "epoch": 8.54, + "grad_norm": 0.765625, + "learning_rate": 0.0004065172229742322, + "loss": 0.2222, + "step": 206250 + }, + { + "epoch": 8.54, + "grad_norm": 1.390625, + "learning_rate": 0.00040650876609380973, + "loss": 0.2216, + "step": 206260 + }, + { + "epoch": 8.54, + "grad_norm": 1.0859375, + "learning_rate": 0.0004065003089188547, + "loss": 0.2232, + "step": 206270 + }, + { + "epoch": 8.54, + "grad_norm": 0.51171875, + "learning_rate": 0.00040649185144938293, + "loss": 0.2563, + "step": 206280 + }, + { + "epoch": 8.54, + "grad_norm": 0.84375, + "learning_rate": 0.00040648339368541046, + "loss": 0.1726, + "step": 206290 + }, + { + "epoch": 8.54, + "grad_norm": 0.9375, + "learning_rate": 0.0004064749356269531, + "loss": 0.2328, + "step": 206300 + }, + { + "epoch": 8.55, + "grad_norm": 0.5390625, + "learning_rate": 0.00040646647727402685, + "loss": 0.1808, + "step": 206310 + }, + { + "epoch": 8.55, + "grad_norm": 0.95703125, + "learning_rate": 0.00040645801862664754, + "loss": 0.2403, + "step": 206320 + }, + { + "epoch": 8.55, + "grad_norm": 0.8125, + "learning_rate": 0.0004064495596848312, + "loss": 0.1802, + "step": 206330 + }, + { + "epoch": 8.55, + "grad_norm": 0.1650390625, + "learning_rate": 0.0004064411004485936, + "loss": 0.1742, + "step": 206340 + }, + { + "epoch": 8.55, + "grad_norm": 0.55859375, + "learning_rate": 0.00040643264091795085, + "loss": 0.1842, + "step": 206350 + }, + { + "epoch": 8.55, + "grad_norm": 0.361328125, + "learning_rate": 0.00040642418109291867, + "loss": 0.243, + "step": 206360 + }, + { + "epoch": 8.55, + "grad_norm": 1.0234375, + "learning_rate": 0.0004064157209735131, + "loss": 0.1946, + "step": 206370 + }, + { + "epoch": 8.55, + "grad_norm": 0.6796875, + "learning_rate": 0.00040640726055975006, + "loss": 0.2597, + "step": 206380 + }, + { + "epoch": 8.55, + "grad_norm": 2.28125, + "learning_rate": 0.0004063987998516454, + "loss": 0.1754, + "step": 206390 + }, + { + "epoch": 8.55, + "grad_norm": 0.70703125, + "learning_rate": 0.0004063903388492151, + "loss": 0.201, + "step": 206400 + }, + { + "epoch": 8.55, + "grad_norm": 1.8125, + "learning_rate": 0.0004063818775524751, + "loss": 0.2545, + "step": 206410 + }, + { + "epoch": 8.55, + "grad_norm": 0.953125, + "learning_rate": 0.0004063734159614413, + "loss": 0.2285, + "step": 206420 + }, + { + "epoch": 8.55, + "grad_norm": 0.49609375, + "learning_rate": 0.0004063649540761296, + "loss": 0.1899, + "step": 206430 + }, + { + "epoch": 8.55, + "grad_norm": 1.6328125, + "learning_rate": 0.00040635649189655596, + "loss": 0.2237, + "step": 206440 + }, + { + "epoch": 8.55, + "grad_norm": 0.369140625, + "learning_rate": 0.0004063480294227362, + "loss": 0.1816, + "step": 206450 + }, + { + "epoch": 8.55, + "grad_norm": 0.5859375, + "learning_rate": 0.0004063395666546864, + "loss": 0.1747, + "step": 206460 + }, + { + "epoch": 8.55, + "grad_norm": 1.109375, + "learning_rate": 0.0004063311035924224, + "loss": 0.2281, + "step": 206470 + }, + { + "epoch": 8.55, + "grad_norm": 0.92578125, + "learning_rate": 0.0004063226402359602, + "loss": 0.1809, + "step": 206480 + }, + { + "epoch": 8.55, + "grad_norm": 0.458984375, + "learning_rate": 0.00040631417658531566, + "loss": 0.2221, + "step": 206490 + }, + { + "epoch": 8.55, + "grad_norm": 0.71484375, + "learning_rate": 0.00040630571264050464, + "loss": 0.1642, + "step": 206500 + }, + { + "epoch": 8.55, + "grad_norm": 0.625, + "learning_rate": 0.0004062972484015433, + "loss": 0.1699, + "step": 206510 + }, + { + "epoch": 8.55, + "grad_norm": 1.4375, + "learning_rate": 0.00040628878386844724, + "loss": 0.1983, + "step": 206520 + }, + { + "epoch": 8.55, + "grad_norm": 0.36328125, + "learning_rate": 0.0004062803190412327, + "loss": 0.1713, + "step": 206530 + }, + { + "epoch": 8.55, + "grad_norm": 0.67578125, + "learning_rate": 0.0004062718539199154, + "loss": 0.1713, + "step": 206540 + }, + { + "epoch": 8.56, + "grad_norm": 0.455078125, + "learning_rate": 0.0004062633885045114, + "loss": 0.2251, + "step": 206550 + }, + { + "epoch": 8.56, + "grad_norm": 1.4921875, + "learning_rate": 0.0004062549227950365, + "loss": 0.1609, + "step": 206560 + }, + { + "epoch": 8.56, + "grad_norm": 1.5546875, + "learning_rate": 0.0004062464567915067, + "loss": 0.1871, + "step": 206570 + }, + { + "epoch": 8.56, + "grad_norm": 0.1806640625, + "learning_rate": 0.00040623799049393806, + "loss": 0.1901, + "step": 206580 + }, + { + "epoch": 8.56, + "grad_norm": 0.66015625, + "learning_rate": 0.00040622952390234634, + "loss": 0.2098, + "step": 206590 + }, + { + "epoch": 8.56, + "grad_norm": 0.5546875, + "learning_rate": 0.0004062210570167475, + "loss": 0.2429, + "step": 206600 + }, + { + "epoch": 8.56, + "grad_norm": 0.9765625, + "learning_rate": 0.0004062125898371576, + "loss": 0.2098, + "step": 206610 + }, + { + "epoch": 8.56, + "grad_norm": 0.8046875, + "learning_rate": 0.00040620412236359236, + "loss": 0.1762, + "step": 206620 + }, + { + "epoch": 8.56, + "grad_norm": 0.49609375, + "learning_rate": 0.00040619565459606787, + "loss": 0.221, + "step": 206630 + }, + { + "epoch": 8.56, + "grad_norm": 1.09375, + "learning_rate": 0.0004061871865346, + "loss": 0.1886, + "step": 206640 + }, + { + "epoch": 8.56, + "grad_norm": 0.6875, + "learning_rate": 0.00040617871817920473, + "loss": 0.2021, + "step": 206650 + }, + { + "epoch": 8.56, + "grad_norm": 0.66796875, + "learning_rate": 0.000406170249529898, + "loss": 0.1998, + "step": 206660 + }, + { + "epoch": 8.56, + "grad_norm": 0.88671875, + "learning_rate": 0.00040616178058669575, + "loss": 0.1779, + "step": 206670 + }, + { + "epoch": 8.56, + "grad_norm": 0.9375, + "learning_rate": 0.0004061533113496138, + "loss": 0.21, + "step": 206680 + }, + { + "epoch": 8.56, + "grad_norm": 0.1845703125, + "learning_rate": 0.00040614484181866826, + "loss": 0.231, + "step": 206690 + }, + { + "epoch": 8.56, + "grad_norm": 0.88671875, + "learning_rate": 0.0004061363719938749, + "loss": 0.2365, + "step": 206700 + }, + { + "epoch": 8.56, + "grad_norm": 0.90234375, + "learning_rate": 0.0004061279018752498, + "loss": 0.2432, + "step": 206710 + }, + { + "epoch": 8.56, + "grad_norm": 0.78125, + "learning_rate": 0.00040611943146280893, + "loss": 0.2202, + "step": 206720 + }, + { + "epoch": 8.56, + "grad_norm": 0.0, + "learning_rate": 0.00040611096075656805, + "loss": 0.1894, + "step": 206730 + }, + { + "epoch": 8.56, + "grad_norm": 0.4453125, + "learning_rate": 0.0004061024897565433, + "loss": 0.2168, + "step": 206740 + }, + { + "epoch": 8.56, + "grad_norm": 0.333984375, + "learning_rate": 0.0004060940184627504, + "loss": 0.1694, + "step": 206750 + }, + { + "epoch": 8.56, + "grad_norm": 0.9609375, + "learning_rate": 0.0004060855468752055, + "loss": 0.2494, + "step": 206760 + }, + { + "epoch": 8.56, + "grad_norm": 0.58203125, + "learning_rate": 0.00040607707499392445, + "loss": 0.2029, + "step": 206770 + }, + { + "epoch": 8.56, + "grad_norm": 0.53515625, + "learning_rate": 0.00040606860281892313, + "loss": 0.1951, + "step": 206780 + }, + { + "epoch": 8.57, + "grad_norm": 0.85546875, + "learning_rate": 0.0004060601303502176, + "loss": 0.1796, + "step": 206790 + }, + { + "epoch": 8.57, + "grad_norm": 0.63671875, + "learning_rate": 0.00040605165758782376, + "loss": 0.229, + "step": 206800 + }, + { + "epoch": 8.57, + "grad_norm": 0.84765625, + "learning_rate": 0.00040604318453175747, + "loss": 0.1497, + "step": 206810 + }, + { + "epoch": 8.57, + "grad_norm": 0.46875, + "learning_rate": 0.00040603471118203486, + "loss": 0.1941, + "step": 206820 + }, + { + "epoch": 8.57, + "grad_norm": 0.6796875, + "learning_rate": 0.00040602623753867176, + "loss": 0.1792, + "step": 206830 + }, + { + "epoch": 8.57, + "grad_norm": 0.63671875, + "learning_rate": 0.0004060177636016841, + "loss": 0.1994, + "step": 206840 + }, + { + "epoch": 8.57, + "grad_norm": 1.5546875, + "learning_rate": 0.00040600928937108795, + "loss": 0.183, + "step": 206850 + }, + { + "epoch": 8.57, + "grad_norm": 0.84375, + "learning_rate": 0.00040600081484689905, + "loss": 0.2167, + "step": 206860 + }, + { + "epoch": 8.57, + "grad_norm": 1.296875, + "learning_rate": 0.00040599234002913347, + "loss": 0.1803, + "step": 206870 + }, + { + "epoch": 8.57, + "grad_norm": 0.59765625, + "learning_rate": 0.00040598386491780725, + "loss": 0.1748, + "step": 206880 + }, + { + "epoch": 8.57, + "grad_norm": 0.447265625, + "learning_rate": 0.0004059753895129361, + "loss": 0.2077, + "step": 206890 + }, + { + "epoch": 8.57, + "grad_norm": 0.6484375, + "learning_rate": 0.0004059669138145362, + "loss": 0.183, + "step": 206900 + }, + { + "epoch": 8.57, + "grad_norm": 0.81640625, + "learning_rate": 0.0004059584378226234, + "loss": 0.2074, + "step": 206910 + }, + { + "epoch": 8.57, + "grad_norm": 0.69140625, + "learning_rate": 0.0004059499615372137, + "loss": 0.1633, + "step": 206920 + }, + { + "epoch": 8.57, + "grad_norm": 1.03125, + "learning_rate": 0.000405941484958323, + "loss": 0.2156, + "step": 206930 + }, + { + "epoch": 8.57, + "grad_norm": 0.8515625, + "learning_rate": 0.0004059330080859672, + "loss": 0.1945, + "step": 206940 + }, + { + "epoch": 8.57, + "grad_norm": 0.81640625, + "learning_rate": 0.0004059245309201623, + "loss": 0.2089, + "step": 206950 + }, + { + "epoch": 8.57, + "grad_norm": 0.66015625, + "learning_rate": 0.0004059160534609244, + "loss": 0.1541, + "step": 206960 + }, + { + "epoch": 8.57, + "grad_norm": 0.43359375, + "learning_rate": 0.00040590757570826925, + "loss": 0.1944, + "step": 206970 + }, + { + "epoch": 8.57, + "grad_norm": 0.984375, + "learning_rate": 0.0004058990976622129, + "loss": 0.2709, + "step": 206980 + }, + { + "epoch": 8.57, + "grad_norm": 0.69921875, + "learning_rate": 0.0004058906193227713, + "loss": 0.2016, + "step": 206990 + }, + { + "epoch": 8.57, + "grad_norm": 0.4453125, + "learning_rate": 0.0004058821406899603, + "loss": 0.2069, + "step": 207000 + }, + { + "epoch": 8.57, + "grad_norm": 0.373046875, + "learning_rate": 0.000405873661763796, + "loss": 0.2642, + "step": 207010 + }, + { + "epoch": 8.57, + "grad_norm": 0.62890625, + "learning_rate": 0.0004058651825442943, + "loss": 0.1763, + "step": 207020 + }, + { + "epoch": 8.58, + "grad_norm": 1.265625, + "learning_rate": 0.00040585670303147116, + "loss": 0.2306, + "step": 207030 + }, + { + "epoch": 8.58, + "grad_norm": 1.265625, + "learning_rate": 0.00040584822322534256, + "loss": 0.1736, + "step": 207040 + }, + { + "epoch": 8.58, + "grad_norm": 1.859375, + "learning_rate": 0.0004058397431259244, + "loss": 0.2416, + "step": 207050 + }, + { + "epoch": 8.58, + "grad_norm": 0.8359375, + "learning_rate": 0.0004058312627332327, + "loss": 0.1605, + "step": 207060 + }, + { + "epoch": 8.58, + "grad_norm": 0.59375, + "learning_rate": 0.00040582278204728334, + "loss": 0.1994, + "step": 207070 + }, + { + "epoch": 8.58, + "grad_norm": 1.421875, + "learning_rate": 0.0004058143010680924, + "loss": 0.2254, + "step": 207080 + }, + { + "epoch": 8.58, + "grad_norm": 0.99609375, + "learning_rate": 0.00040580581979567576, + "loss": 0.2561, + "step": 207090 + }, + { + "epoch": 8.58, + "grad_norm": 0.70703125, + "learning_rate": 0.00040579733823004933, + "loss": 0.2167, + "step": 207100 + }, + { + "epoch": 8.58, + "grad_norm": 0.166015625, + "learning_rate": 0.0004057888563712292, + "loss": 0.1332, + "step": 207110 + }, + { + "epoch": 8.58, + "grad_norm": 0.474609375, + "learning_rate": 0.0004057803742192312, + "loss": 0.2517, + "step": 207120 + }, + { + "epoch": 8.58, + "grad_norm": 0.640625, + "learning_rate": 0.00040577189177407145, + "loss": 0.163, + "step": 207130 + }, + { + "epoch": 8.58, + "grad_norm": 1.0625, + "learning_rate": 0.00040576340903576583, + "loss": 0.2026, + "step": 207140 + }, + { + "epoch": 8.58, + "grad_norm": 0.8671875, + "learning_rate": 0.00040575492600433016, + "loss": 0.2415, + "step": 207150 + }, + { + "epoch": 8.58, + "grad_norm": 0.7578125, + "learning_rate": 0.0004057464426797806, + "loss": 0.1899, + "step": 207160 + }, + { + "epoch": 8.58, + "grad_norm": 0.3359375, + "learning_rate": 0.0004057379590621331, + "loss": 0.1769, + "step": 207170 + }, + { + "epoch": 8.58, + "grad_norm": 0.640625, + "learning_rate": 0.0004057294751514036, + "loss": 0.1916, + "step": 207180 + }, + { + "epoch": 8.58, + "grad_norm": 0.76171875, + "learning_rate": 0.00040572099094760797, + "loss": 0.1384, + "step": 207190 + }, + { + "epoch": 8.58, + "grad_norm": 2.4375, + "learning_rate": 0.00040571250645076226, + "loss": 0.1739, + "step": 207200 + }, + { + "epoch": 8.58, + "grad_norm": 1.09375, + "learning_rate": 0.00040570402166088246, + "loss": 0.2537, + "step": 207210 + }, + { + "epoch": 8.58, + "grad_norm": 0.78125, + "learning_rate": 0.0004056955365779845, + "loss": 0.1777, + "step": 207220 + }, + { + "epoch": 8.58, + "grad_norm": 0.494140625, + "learning_rate": 0.00040568705120208425, + "loss": 0.23, + "step": 207230 + }, + { + "epoch": 8.58, + "grad_norm": 1.03125, + "learning_rate": 0.00040567856553319794, + "loss": 0.1773, + "step": 207240 + }, + { + "epoch": 8.58, + "grad_norm": 1.2578125, + "learning_rate": 0.0004056700795713413, + "loss": 0.2376, + "step": 207250 + }, + { + "epoch": 8.58, + "grad_norm": 1.0234375, + "learning_rate": 0.00040566159331653043, + "loss": 0.2118, + "step": 207260 + }, + { + "epoch": 8.59, + "grad_norm": 0.435546875, + "learning_rate": 0.0004056531067687812, + "loss": 0.2382, + "step": 207270 + }, + { + "epoch": 8.59, + "grad_norm": 0.703125, + "learning_rate": 0.00040564461992810964, + "loss": 0.1607, + "step": 207280 + }, + { + "epoch": 8.59, + "grad_norm": 0.96484375, + "learning_rate": 0.00040563613279453174, + "loss": 0.1977, + "step": 207290 + }, + { + "epoch": 8.59, + "grad_norm": 1.0078125, + "learning_rate": 0.00040562764536806344, + "loss": 0.1746, + "step": 207300 + }, + { + "epoch": 8.59, + "grad_norm": 0.796875, + "learning_rate": 0.0004056191576487207, + "loss": 0.1977, + "step": 207310 + }, + { + "epoch": 8.59, + "grad_norm": 1.2578125, + "learning_rate": 0.0004056106696365195, + "loss": 0.169, + "step": 207320 + }, + { + "epoch": 8.59, + "grad_norm": 0.80859375, + "learning_rate": 0.00040560218133147585, + "loss": 0.193, + "step": 207330 + }, + { + "epoch": 8.59, + "grad_norm": 0.6328125, + "learning_rate": 0.0004055936927336057, + "loss": 0.1944, + "step": 207340 + }, + { + "epoch": 8.59, + "grad_norm": 0.59375, + "learning_rate": 0.00040558520384292504, + "loss": 0.2211, + "step": 207350 + }, + { + "epoch": 8.59, + "grad_norm": 0.58203125, + "learning_rate": 0.0004055767146594498, + "loss": 0.1609, + "step": 207360 + }, + { + "epoch": 8.59, + "grad_norm": 2.0, + "learning_rate": 0.00040556822518319596, + "loss": 0.2599, + "step": 207370 + }, + { + "epoch": 8.59, + "grad_norm": 0.6484375, + "learning_rate": 0.0004055597354141796, + "loss": 0.1696, + "step": 207380 + }, + { + "epoch": 8.59, + "grad_norm": 0.55078125, + "learning_rate": 0.0004055512453524165, + "loss": 0.2126, + "step": 207390 + }, + { + "epoch": 8.59, + "grad_norm": 0.5859375, + "learning_rate": 0.0004055427549979228, + "loss": 0.2031, + "step": 207400 + }, + { + "epoch": 8.59, + "grad_norm": 1.1953125, + "learning_rate": 0.0004055342643507145, + "loss": 0.1966, + "step": 207410 + }, + { + "epoch": 8.59, + "grad_norm": 1.4609375, + "learning_rate": 0.0004055257734108074, + "loss": 0.1955, + "step": 207420 + }, + { + "epoch": 8.59, + "grad_norm": 0.0, + "learning_rate": 0.0004055172821782177, + "loss": 0.1837, + "step": 207430 + }, + { + "epoch": 8.59, + "grad_norm": 0.55078125, + "learning_rate": 0.0004055087906529612, + "loss": 0.1998, + "step": 207440 + }, + { + "epoch": 8.59, + "grad_norm": 0.58984375, + "learning_rate": 0.000405500298835054, + "loss": 0.2387, + "step": 207450 + }, + { + "epoch": 8.59, + "grad_norm": 0.71484375, + "learning_rate": 0.00040549180672451196, + "loss": 0.2164, + "step": 207460 + }, + { + "epoch": 8.59, + "grad_norm": 0.734375, + "learning_rate": 0.0004054833143213512, + "loss": 0.2037, + "step": 207470 + }, + { + "epoch": 8.59, + "grad_norm": 0.216796875, + "learning_rate": 0.0004054748216255876, + "loss": 0.1778, + "step": 207480 + }, + { + "epoch": 8.59, + "grad_norm": 0.6484375, + "learning_rate": 0.0004054663286372372, + "loss": 0.1955, + "step": 207490 + }, + { + "epoch": 8.59, + "grad_norm": 0.72265625, + "learning_rate": 0.000405457835356316, + "loss": 0.2016, + "step": 207500 + }, + { + "epoch": 8.6, + "grad_norm": 1.3203125, + "learning_rate": 0.0004054493417828399, + "loss": 0.1663, + "step": 207510 + }, + { + "epoch": 8.6, + "grad_norm": 0.26171875, + "learning_rate": 0.00040544084791682483, + "loss": 0.1785, + "step": 207520 + }, + { + "epoch": 8.6, + "grad_norm": 1.1640625, + "learning_rate": 0.000405432353758287, + "loss": 0.203, + "step": 207530 + }, + { + "epoch": 8.6, + "grad_norm": 0.66796875, + "learning_rate": 0.00040542385930724214, + "loss": 0.192, + "step": 207540 + }, + { + "epoch": 8.6, + "grad_norm": 0.59765625, + "learning_rate": 0.00040541536456370643, + "loss": 0.2055, + "step": 207550 + }, + { + "epoch": 8.6, + "grad_norm": 1.0625, + "learning_rate": 0.0004054068695276958, + "loss": 0.2054, + "step": 207560 + }, + { + "epoch": 8.6, + "grad_norm": 0.72265625, + "learning_rate": 0.0004053983741992262, + "loss": 0.221, + "step": 207570 + }, + { + "epoch": 8.6, + "grad_norm": 1.2734375, + "learning_rate": 0.00040538987857831365, + "loss": 0.2552, + "step": 207580 + }, + { + "epoch": 8.6, + "grad_norm": 0.384765625, + "learning_rate": 0.00040538138266497416, + "loss": 0.2179, + "step": 207590 + }, + { + "epoch": 8.6, + "grad_norm": 1.2265625, + "learning_rate": 0.00040537288645922366, + "loss": 0.2491, + "step": 207600 + }, + { + "epoch": 8.6, + "grad_norm": 0.267578125, + "learning_rate": 0.0004053643899610782, + "loss": 0.2084, + "step": 207610 + }, + { + "epoch": 8.6, + "grad_norm": 0.92578125, + "learning_rate": 0.00040535589317055367, + "loss": 0.1719, + "step": 207620 + }, + { + "epoch": 8.6, + "grad_norm": 0.8671875, + "learning_rate": 0.0004053473960876662, + "loss": 0.1729, + "step": 207630 + }, + { + "epoch": 8.6, + "grad_norm": 1.0234375, + "learning_rate": 0.00040533889871243166, + "loss": 0.1896, + "step": 207640 + }, + { + "epoch": 8.6, + "grad_norm": 2.03125, + "learning_rate": 0.00040533040104486607, + "loss": 0.2034, + "step": 207650 + }, + { + "epoch": 8.6, + "grad_norm": 2.875, + "learning_rate": 0.00040532190308498553, + "loss": 0.1808, + "step": 207660 + }, + { + "epoch": 8.6, + "grad_norm": 0.65625, + "learning_rate": 0.0004053134048328058, + "loss": 0.1848, + "step": 207670 + }, + { + "epoch": 8.6, + "grad_norm": 1.328125, + "learning_rate": 0.0004053049062883431, + "loss": 0.1926, + "step": 207680 + }, + { + "epoch": 8.6, + "grad_norm": 0.4140625, + "learning_rate": 0.00040529640745161333, + "loss": 0.2159, + "step": 207690 + }, + { + "epoch": 8.6, + "grad_norm": 0.87109375, + "learning_rate": 0.0004052879083226325, + "loss": 0.2518, + "step": 207700 + }, + { + "epoch": 8.6, + "grad_norm": 1.1796875, + "learning_rate": 0.00040527940890141656, + "loss": 0.1688, + "step": 207710 + }, + { + "epoch": 8.6, + "grad_norm": 0.294921875, + "learning_rate": 0.0004052709091879816, + "loss": 0.1937, + "step": 207720 + }, + { + "epoch": 8.6, + "grad_norm": 1.0, + "learning_rate": 0.00040526240918234354, + "loss": 0.1743, + "step": 207730 + }, + { + "epoch": 8.6, + "grad_norm": 0.68359375, + "learning_rate": 0.00040525390888451843, + "loss": 0.1843, + "step": 207740 + }, + { + "epoch": 8.6, + "grad_norm": 0.466796875, + "learning_rate": 0.0004052454082945221, + "loss": 0.2111, + "step": 207750 + }, + { + "epoch": 8.61, + "grad_norm": 0.62109375, + "learning_rate": 0.00040523690741237073, + "loss": 0.2576, + "step": 207760 + }, + { + "epoch": 8.61, + "grad_norm": 1.265625, + "learning_rate": 0.00040522840623808033, + "loss": 0.2053, + "step": 207770 + }, + { + "epoch": 8.61, + "grad_norm": 0.65234375, + "learning_rate": 0.00040521990477166683, + "loss": 0.1981, + "step": 207780 + }, + { + "epoch": 8.61, + "grad_norm": 0.357421875, + "learning_rate": 0.0004052114030131462, + "loss": 0.2037, + "step": 207790 + }, + { + "epoch": 8.61, + "grad_norm": 0.388671875, + "learning_rate": 0.0004052029009625344, + "loss": 0.221, + "step": 207800 + }, + { + "epoch": 8.61, + "grad_norm": 0.5703125, + "learning_rate": 0.00040519439861984763, + "loss": 0.2062, + "step": 207810 + }, + { + "epoch": 8.61, + "grad_norm": 1.9296875, + "learning_rate": 0.00040518589598510167, + "loss": 0.2508, + "step": 207820 + }, + { + "epoch": 8.61, + "grad_norm": 1.609375, + "learning_rate": 0.00040517739305831257, + "loss": 0.2102, + "step": 207830 + }, + { + "epoch": 8.61, + "grad_norm": 0.89453125, + "learning_rate": 0.0004051688898394965, + "loss": 0.1756, + "step": 207840 + }, + { + "epoch": 8.61, + "grad_norm": 0.37890625, + "learning_rate": 0.0004051603863286693, + "loss": 0.196, + "step": 207850 + }, + { + "epoch": 8.61, + "grad_norm": 0.515625, + "learning_rate": 0.000405151882525847, + "loss": 0.2235, + "step": 207860 + }, + { + "epoch": 8.61, + "grad_norm": 1.0546875, + "learning_rate": 0.0004051433784310455, + "loss": 0.1584, + "step": 207870 + }, + { + "epoch": 8.61, + "grad_norm": 0.625, + "learning_rate": 0.00040513487404428104, + "loss": 0.2274, + "step": 207880 + }, + { + "epoch": 8.61, + "grad_norm": 0.71484375, + "learning_rate": 0.00040512636936556955, + "loss": 0.2079, + "step": 207890 + }, + { + "epoch": 8.61, + "grad_norm": 1.046875, + "learning_rate": 0.0004051178643949269, + "loss": 0.2144, + "step": 207900 + }, + { + "epoch": 8.61, + "grad_norm": 0.60546875, + "learning_rate": 0.00040510935913236915, + "loss": 0.2104, + "step": 207910 + }, + { + "epoch": 8.61, + "grad_norm": 0.6015625, + "learning_rate": 0.0004051008535779124, + "loss": 0.187, + "step": 207920 + }, + { + "epoch": 8.61, + "grad_norm": 0.875, + "learning_rate": 0.0004050923477315726, + "loss": 0.1748, + "step": 207930 + }, + { + "epoch": 8.61, + "grad_norm": 0.81640625, + "learning_rate": 0.0004050838415933657, + "loss": 0.1778, + "step": 207940 + }, + { + "epoch": 8.61, + "grad_norm": 0.5, + "learning_rate": 0.0004050753351633077, + "loss": 0.2464, + "step": 207950 + }, + { + "epoch": 8.61, + "grad_norm": 0.83984375, + "learning_rate": 0.00040506682844141485, + "loss": 0.205, + "step": 207960 + }, + { + "epoch": 8.61, + "grad_norm": 0.53515625, + "learning_rate": 0.00040505832142770284, + "loss": 0.1886, + "step": 207970 + }, + { + "epoch": 8.61, + "grad_norm": 0.63671875, + "learning_rate": 0.0004050498141221879, + "loss": 0.2088, + "step": 207980 + }, + { + "epoch": 8.61, + "grad_norm": 0.6953125, + "learning_rate": 0.0004050413065248858, + "loss": 0.1999, + "step": 207990 + }, + { + "epoch": 8.62, + "grad_norm": 0.83203125, + "learning_rate": 0.0004050327986358128, + "loss": 0.2029, + "step": 208000 + }, + { + "epoch": 8.62, + "grad_norm": 0.46484375, + "learning_rate": 0.00040502429045498486, + "loss": 0.155, + "step": 208010 + }, + { + "epoch": 8.62, + "grad_norm": 0.8125, + "learning_rate": 0.0004050157819824179, + "loss": 0.2516, + "step": 208020 + }, + { + "epoch": 8.62, + "grad_norm": 0.44140625, + "learning_rate": 0.000405007273218128, + "loss": 0.221, + "step": 208030 + }, + { + "epoch": 8.62, + "grad_norm": 0.70703125, + "learning_rate": 0.0004049987641621311, + "loss": 0.1828, + "step": 208040 + }, + { + "epoch": 8.62, + "grad_norm": 1.34375, + "learning_rate": 0.0004049902548144433, + "loss": 0.2107, + "step": 208050 + }, + { + "epoch": 8.62, + "grad_norm": 0.8046875, + "learning_rate": 0.00040498174517508055, + "loss": 0.1829, + "step": 208060 + }, + { + "epoch": 8.62, + "grad_norm": 1.0703125, + "learning_rate": 0.0004049732352440589, + "loss": 0.1609, + "step": 208070 + }, + { + "epoch": 8.62, + "grad_norm": 0.6640625, + "learning_rate": 0.00040496472502139444, + "loss": 0.1947, + "step": 208080 + }, + { + "epoch": 8.62, + "grad_norm": 0.76171875, + "learning_rate": 0.00040495621450710304, + "loss": 0.1753, + "step": 208090 + }, + { + "epoch": 8.62, + "grad_norm": 0.37109375, + "learning_rate": 0.0004049477037012008, + "loss": 0.1935, + "step": 208100 + }, + { + "epoch": 8.62, + "grad_norm": 0.36328125, + "learning_rate": 0.00040493919260370365, + "loss": 0.1789, + "step": 208110 + }, + { + "epoch": 8.62, + "grad_norm": 0.4921875, + "learning_rate": 0.00040493068121462774, + "loss": 0.1814, + "step": 208120 + }, + { + "epoch": 8.62, + "grad_norm": 0.439453125, + "learning_rate": 0.00040492216953398895, + "loss": 0.1839, + "step": 208130 + }, + { + "epoch": 8.62, + "grad_norm": 0.73046875, + "learning_rate": 0.0004049136575618034, + "loss": 0.204, + "step": 208140 + }, + { + "epoch": 8.62, + "grad_norm": 0.77734375, + "learning_rate": 0.0004049051452980871, + "loss": 0.2137, + "step": 208150 + }, + { + "epoch": 8.62, + "grad_norm": 0.373046875, + "learning_rate": 0.000404896632742856, + "loss": 0.2202, + "step": 208160 + }, + { + "epoch": 8.62, + "grad_norm": 0.50390625, + "learning_rate": 0.0004048881198961262, + "loss": 0.1714, + "step": 208170 + }, + { + "epoch": 8.62, + "grad_norm": 1.703125, + "learning_rate": 0.0004048796067579137, + "loss": 0.1857, + "step": 208180 + }, + { + "epoch": 8.62, + "grad_norm": 0.5078125, + "learning_rate": 0.00040487109332823447, + "loss": 0.1608, + "step": 208190 + }, + { + "epoch": 8.62, + "grad_norm": 1.0390625, + "learning_rate": 0.0004048625796071046, + "loss": 0.2077, + "step": 208200 + }, + { + "epoch": 8.62, + "grad_norm": 0.80859375, + "learning_rate": 0.00040485406559454006, + "loss": 0.1795, + "step": 208210 + }, + { + "epoch": 8.62, + "grad_norm": 0.7109375, + "learning_rate": 0.00040484555129055686, + "loss": 0.2242, + "step": 208220 + }, + { + "epoch": 8.62, + "grad_norm": 0.734375, + "learning_rate": 0.00040483703669517106, + "loss": 0.253, + "step": 208230 + }, + { + "epoch": 8.63, + "grad_norm": 1.8671875, + "learning_rate": 0.00040482852180839866, + "loss": 0.2184, + "step": 208240 + }, + { + "epoch": 8.63, + "grad_norm": 0.5625, + "learning_rate": 0.0004048200066302558, + "loss": 0.1583, + "step": 208250 + }, + { + "epoch": 8.63, + "grad_norm": 0.58984375, + "learning_rate": 0.0004048114911607583, + "loss": 0.1983, + "step": 208260 + }, + { + "epoch": 8.63, + "grad_norm": 0.75390625, + "learning_rate": 0.0004048029753999224, + "loss": 0.1619, + "step": 208270 + }, + { + "epoch": 8.63, + "grad_norm": 0.7421875, + "learning_rate": 0.0004047944593477639, + "loss": 0.2313, + "step": 208280 + }, + { + "epoch": 8.63, + "grad_norm": 1.78125, + "learning_rate": 0.00040478594300429906, + "loss": 0.185, + "step": 208290 + }, + { + "epoch": 8.63, + "grad_norm": 0.859375, + "learning_rate": 0.0004047774263695437, + "loss": 0.1884, + "step": 208300 + }, + { + "epoch": 8.63, + "grad_norm": 1.125, + "learning_rate": 0.00040476890944351394, + "loss": 0.1981, + "step": 208310 + }, + { + "epoch": 8.63, + "grad_norm": 0.60546875, + "learning_rate": 0.0004047603922262259, + "loss": 0.2112, + "step": 208320 + }, + { + "epoch": 8.63, + "grad_norm": 0.79296875, + "learning_rate": 0.0004047518747176955, + "loss": 0.2292, + "step": 208330 + }, + { + "epoch": 8.63, + "grad_norm": 0.96484375, + "learning_rate": 0.0004047433569179387, + "loss": 0.1807, + "step": 208340 + }, + { + "epoch": 8.63, + "grad_norm": 0.349609375, + "learning_rate": 0.0004047348388269716, + "loss": 0.1937, + "step": 208350 + }, + { + "epoch": 8.63, + "grad_norm": 0.5859375, + "learning_rate": 0.00040472632044481034, + "loss": 0.1488, + "step": 208360 + }, + { + "epoch": 8.63, + "grad_norm": 0.36328125, + "learning_rate": 0.00040471780177147085, + "loss": 0.2141, + "step": 208370 + }, + { + "epoch": 8.63, + "grad_norm": 0.45703125, + "learning_rate": 0.00040470928280696904, + "loss": 0.1867, + "step": 208380 + }, + { + "epoch": 8.63, + "grad_norm": 0.74609375, + "learning_rate": 0.0004047007635513212, + "loss": 0.2422, + "step": 208390 + }, + { + "epoch": 8.63, + "grad_norm": 0.5234375, + "learning_rate": 0.0004046922440045432, + "loss": 0.1928, + "step": 208400 + }, + { + "epoch": 8.63, + "grad_norm": 0.90234375, + "learning_rate": 0.000404683724166651, + "loss": 0.1746, + "step": 208410 + }, + { + "epoch": 8.63, + "grad_norm": 1.203125, + "learning_rate": 0.0004046752040376609, + "loss": 0.1736, + "step": 208420 + }, + { + "epoch": 8.63, + "grad_norm": 1.4375, + "learning_rate": 0.0004046666836175886, + "loss": 0.2468, + "step": 208430 + }, + { + "epoch": 8.63, + "grad_norm": 0.412109375, + "learning_rate": 0.0004046581629064504, + "loss": 0.2046, + "step": 208440 + }, + { + "epoch": 8.63, + "grad_norm": 0.435546875, + "learning_rate": 0.00040464964190426226, + "loss": 0.2225, + "step": 208450 + }, + { + "epoch": 8.63, + "grad_norm": 0.640625, + "learning_rate": 0.00040464112061104016, + "loss": 0.1628, + "step": 208460 + }, + { + "epoch": 8.63, + "grad_norm": 0.83203125, + "learning_rate": 0.00040463259902680015, + "loss": 0.2034, + "step": 208470 + }, + { + "epoch": 8.64, + "grad_norm": 0.431640625, + "learning_rate": 0.00040462407715155833, + "loss": 0.216, + "step": 208480 + }, + { + "epoch": 8.64, + "grad_norm": 0.333984375, + "learning_rate": 0.00040461555498533063, + "loss": 0.2431, + "step": 208490 + }, + { + "epoch": 8.64, + "grad_norm": 0.296875, + "learning_rate": 0.00040460703252813326, + "loss": 0.1564, + "step": 208500 + }, + { + "epoch": 8.64, + "grad_norm": 0.66796875, + "learning_rate": 0.00040459850977998205, + "loss": 0.2425, + "step": 208510 + }, + { + "epoch": 8.64, + "grad_norm": 1.1015625, + "learning_rate": 0.0004045899867408932, + "loss": 0.2092, + "step": 208520 + }, + { + "epoch": 8.64, + "grad_norm": 0.91015625, + "learning_rate": 0.0004045814634108827, + "loss": 0.1817, + "step": 208530 + }, + { + "epoch": 8.64, + "grad_norm": 0.302734375, + "learning_rate": 0.0004045729397899665, + "loss": 0.196, + "step": 208540 + }, + { + "epoch": 8.64, + "grad_norm": 0.796875, + "learning_rate": 0.0004045644158781608, + "loss": 0.1953, + "step": 208550 + }, + { + "epoch": 8.64, + "grad_norm": 0.6953125, + "learning_rate": 0.00040455589167548146, + "loss": 0.2037, + "step": 208560 + }, + { + "epoch": 8.64, + "grad_norm": 0.81640625, + "learning_rate": 0.00040454736718194474, + "loss": 0.1946, + "step": 208570 + }, + { + "epoch": 8.64, + "grad_norm": 0.63671875, + "learning_rate": 0.00040453884239756655, + "loss": 0.2053, + "step": 208580 + }, + { + "epoch": 8.64, + "grad_norm": 0.2158203125, + "learning_rate": 0.0004045303173223629, + "loss": 0.2057, + "step": 208590 + }, + { + "epoch": 8.64, + "grad_norm": 0.91015625, + "learning_rate": 0.00040452179195634986, + "loss": 0.1899, + "step": 208600 + }, + { + "epoch": 8.64, + "grad_norm": 0.80859375, + "learning_rate": 0.00040451326629954357, + "loss": 0.1673, + "step": 208610 + }, + { + "epoch": 8.64, + "grad_norm": 0.435546875, + "learning_rate": 0.00040450474035195994, + "loss": 0.1542, + "step": 208620 + }, + { + "epoch": 8.64, + "grad_norm": 0.4453125, + "learning_rate": 0.00040449621411361503, + "loss": 0.1669, + "step": 208630 + }, + { + "epoch": 8.64, + "grad_norm": 1.25, + "learning_rate": 0.00040448768758452505, + "loss": 0.1726, + "step": 208640 + }, + { + "epoch": 8.64, + "grad_norm": 1.015625, + "learning_rate": 0.0004044791607647058, + "loss": 0.1897, + "step": 208650 + }, + { + "epoch": 8.64, + "grad_norm": 0.47265625, + "learning_rate": 0.0004044706336541736, + "loss": 0.1874, + "step": 208660 + }, + { + "epoch": 8.64, + "grad_norm": 0.93359375, + "learning_rate": 0.00040446210625294423, + "loss": 0.2182, + "step": 208670 + }, + { + "epoch": 8.64, + "grad_norm": 1.078125, + "learning_rate": 0.00040445357856103386, + "loss": 0.1724, + "step": 208680 + }, + { + "epoch": 8.64, + "grad_norm": 1.1484375, + "learning_rate": 0.0004044450505784586, + "loss": 0.2357, + "step": 208690 + }, + { + "epoch": 8.64, + "grad_norm": 0.1484375, + "learning_rate": 0.0004044365223052344, + "loss": 0.1826, + "step": 208700 + }, + { + "epoch": 8.64, + "grad_norm": 0.82421875, + "learning_rate": 0.00040442799374137734, + "loss": 0.2217, + "step": 208710 + }, + { + "epoch": 8.65, + "grad_norm": 0.3203125, + "learning_rate": 0.00040441946488690343, + "loss": 0.187, + "step": 208720 + }, + { + "epoch": 8.65, + "grad_norm": 0.34765625, + "learning_rate": 0.00040441093574182887, + "loss": 0.1637, + "step": 208730 + }, + { + "epoch": 8.65, + "grad_norm": 0.58984375, + "learning_rate": 0.00040440240630616953, + "loss": 0.2137, + "step": 208740 + }, + { + "epoch": 8.65, + "grad_norm": 0.578125, + "learning_rate": 0.0004043938765799415, + "loss": 0.226, + "step": 208750 + }, + { + "epoch": 8.65, + "grad_norm": 0.72265625, + "learning_rate": 0.0004043853465631609, + "loss": 0.1979, + "step": 208760 + }, + { + "epoch": 8.65, + "grad_norm": 0.546875, + "learning_rate": 0.00040437681625584375, + "loss": 0.1733, + "step": 208770 + }, + { + "epoch": 8.65, + "grad_norm": 0.5859375, + "learning_rate": 0.0004043682856580062, + "loss": 0.16, + "step": 208780 + }, + { + "epoch": 8.65, + "grad_norm": 0.796875, + "learning_rate": 0.0004043597547696641, + "loss": 0.2165, + "step": 208790 + }, + { + "epoch": 8.65, + "grad_norm": 0.431640625, + "learning_rate": 0.0004043512235908336, + "loss": 0.2788, + "step": 208800 + }, + { + "epoch": 8.65, + "grad_norm": 0.9375, + "learning_rate": 0.00040434269212153085, + "loss": 0.2174, + "step": 208810 + }, + { + "epoch": 8.65, + "grad_norm": 0.1767578125, + "learning_rate": 0.0004043341603617718, + "loss": 0.1435, + "step": 208820 + }, + { + "epoch": 8.65, + "grad_norm": 0.6640625, + "learning_rate": 0.00040432562831157245, + "loss": 0.1627, + "step": 208830 + }, + { + "epoch": 8.65, + "grad_norm": 1.8359375, + "learning_rate": 0.00040431709597094903, + "loss": 0.2298, + "step": 208840 + }, + { + "epoch": 8.65, + "grad_norm": 1.09375, + "learning_rate": 0.0004043085633399174, + "loss": 0.2388, + "step": 208850 + }, + { + "epoch": 8.65, + "grad_norm": 1.3125, + "learning_rate": 0.0004043000304184938, + "loss": 0.2204, + "step": 208860 + }, + { + "epoch": 8.65, + "grad_norm": 0.69140625, + "learning_rate": 0.00040429149720669415, + "loss": 0.2412, + "step": 208870 + }, + { + "epoch": 8.65, + "grad_norm": 0.62109375, + "learning_rate": 0.0004042829637045346, + "loss": 0.2302, + "step": 208880 + }, + { + "epoch": 8.65, + "grad_norm": 0.72265625, + "learning_rate": 0.0004042744299120312, + "loss": 0.1815, + "step": 208890 + }, + { + "epoch": 8.65, + "grad_norm": 0.59375, + "learning_rate": 0.00040426589582919994, + "loss": 0.2655, + "step": 208900 + }, + { + "epoch": 8.65, + "grad_norm": 0.205078125, + "learning_rate": 0.00040425736145605686, + "loss": 0.2022, + "step": 208910 + }, + { + "epoch": 8.65, + "grad_norm": 0.4609375, + "learning_rate": 0.0004042488267926182, + "loss": 0.1633, + "step": 208920 + }, + { + "epoch": 8.65, + "grad_norm": 0.70703125, + "learning_rate": 0.0004042402918388999, + "loss": 0.1708, + "step": 208930 + }, + { + "epoch": 8.65, + "grad_norm": 0.458984375, + "learning_rate": 0.00040423175659491797, + "loss": 0.2098, + "step": 208940 + }, + { + "epoch": 8.65, + "grad_norm": 0.75, + "learning_rate": 0.0004042232210606885, + "loss": 0.1957, + "step": 208950 + }, + { + "epoch": 8.66, + "grad_norm": 0.52734375, + "learning_rate": 0.00040421468523622764, + "loss": 0.1866, + "step": 208960 + }, + { + "epoch": 8.66, + "grad_norm": 0.275390625, + "learning_rate": 0.0004042061491215514, + "loss": 0.2003, + "step": 208970 + }, + { + "epoch": 8.66, + "grad_norm": 0.006500244140625, + "learning_rate": 0.00040419761271667576, + "loss": 0.1905, + "step": 208980 + }, + { + "epoch": 8.66, + "grad_norm": 0.7890625, + "learning_rate": 0.00040418907602161696, + "loss": 0.2667, + "step": 208990 + }, + { + "epoch": 8.66, + "grad_norm": 1.03125, + "learning_rate": 0.00040418053903639096, + "loss": 0.2024, + "step": 209000 + }, + { + "epoch": 8.66, + "grad_norm": 0.87109375, + "learning_rate": 0.00040417200176101376, + "loss": 0.188, + "step": 209010 + }, + { + "epoch": 8.66, + "grad_norm": 0.54296875, + "learning_rate": 0.0004041634641955016, + "loss": 0.2149, + "step": 209020 + }, + { + "epoch": 8.66, + "grad_norm": 1.859375, + "learning_rate": 0.00040415492633987036, + "loss": 0.2359, + "step": 209030 + }, + { + "epoch": 8.66, + "grad_norm": 0.86328125, + "learning_rate": 0.0004041463881941363, + "loss": 0.2518, + "step": 209040 + }, + { + "epoch": 8.66, + "grad_norm": 0.61328125, + "learning_rate": 0.00040413784975831536, + "loss": 0.1908, + "step": 209050 + }, + { + "epoch": 8.66, + "grad_norm": 0.421875, + "learning_rate": 0.00040412931103242354, + "loss": 0.2046, + "step": 209060 + }, + { + "epoch": 8.66, + "grad_norm": 1.078125, + "learning_rate": 0.00040412077201647706, + "loss": 0.1802, + "step": 209070 + }, + { + "epoch": 8.66, + "grad_norm": 0.6328125, + "learning_rate": 0.0004041122327104919, + "loss": 0.2513, + "step": 209080 + }, + { + "epoch": 8.66, + "grad_norm": 1.234375, + "learning_rate": 0.00040410369311448427, + "loss": 0.1722, + "step": 209090 + }, + { + "epoch": 8.66, + "grad_norm": 1.296875, + "learning_rate": 0.00040409515322847, + "loss": 0.2282, + "step": 209100 + }, + { + "epoch": 8.66, + "grad_norm": 0.4375, + "learning_rate": 0.00040408661305246535, + "loss": 0.1839, + "step": 209110 + }, + { + "epoch": 8.66, + "grad_norm": 0.828125, + "learning_rate": 0.00040407807258648634, + "loss": 0.1691, + "step": 209120 + }, + { + "epoch": 8.66, + "grad_norm": 0.90625, + "learning_rate": 0.00040406953183054907, + "loss": 0.2174, + "step": 209130 + }, + { + "epoch": 8.66, + "grad_norm": 0.8203125, + "learning_rate": 0.00040406099078466945, + "loss": 0.2036, + "step": 209140 + }, + { + "epoch": 8.66, + "grad_norm": 0.43359375, + "learning_rate": 0.0004040524494488638, + "loss": 0.1911, + "step": 209150 + }, + { + "epoch": 8.66, + "grad_norm": 0.37890625, + "learning_rate": 0.000404043907823148, + "loss": 0.262, + "step": 209160 + }, + { + "epoch": 8.66, + "grad_norm": 0.37890625, + "learning_rate": 0.00040403536590753834, + "loss": 0.1666, + "step": 209170 + }, + { + "epoch": 8.66, + "grad_norm": 0.7109375, + "learning_rate": 0.00040402682370205056, + "loss": 0.1992, + "step": 209180 + }, + { + "epoch": 8.66, + "grad_norm": 0.62890625, + "learning_rate": 0.000404018281206701, + "loss": 0.1958, + "step": 209190 + }, + { + "epoch": 8.67, + "grad_norm": 0.78125, + "learning_rate": 0.0004040097384215058, + "loss": 0.2089, + "step": 209200 + }, + { + "epoch": 8.67, + "grad_norm": 0.66015625, + "learning_rate": 0.00040400119534648077, + "loss": 0.2155, + "step": 209210 + }, + { + "epoch": 8.67, + "grad_norm": 0.45703125, + "learning_rate": 0.00040399265198164217, + "loss": 0.2005, + "step": 209220 + }, + { + "epoch": 8.67, + "grad_norm": 0.37890625, + "learning_rate": 0.000403984108327006, + "loss": 0.1322, + "step": 209230 + }, + { + "epoch": 8.67, + "grad_norm": 1.8515625, + "learning_rate": 0.00040397556438258835, + "loss": 0.2356, + "step": 209240 + }, + { + "epoch": 8.67, + "grad_norm": 0.8671875, + "learning_rate": 0.0004039670201484054, + "loss": 0.2372, + "step": 209250 + }, + { + "epoch": 8.67, + "grad_norm": 0.81640625, + "learning_rate": 0.00040395847562447307, + "loss": 0.2271, + "step": 209260 + }, + { + "epoch": 8.67, + "grad_norm": 0.48046875, + "learning_rate": 0.0004039499308108075, + "loss": 0.2338, + "step": 209270 + }, + { + "epoch": 8.67, + "grad_norm": 0.80859375, + "learning_rate": 0.0004039413857074248, + "loss": 0.1777, + "step": 209280 + }, + { + "epoch": 8.67, + "grad_norm": 1.6875, + "learning_rate": 0.0004039328403143411, + "loss": 0.1926, + "step": 209290 + }, + { + "epoch": 8.67, + "grad_norm": 0.71875, + "learning_rate": 0.00040392429463157234, + "loss": 0.1851, + "step": 209300 + }, + { + "epoch": 8.67, + "grad_norm": 1.4921875, + "learning_rate": 0.0004039157486591347, + "loss": 0.1917, + "step": 209310 + }, + { + "epoch": 8.67, + "grad_norm": 0.91015625, + "learning_rate": 0.0004039072023970443, + "loss": 0.2118, + "step": 209320 + }, + { + "epoch": 8.67, + "grad_norm": 0.0, + "learning_rate": 0.00040389865584531716, + "loss": 0.1796, + "step": 209330 + }, + { + "epoch": 8.67, + "grad_norm": 0.8046875, + "learning_rate": 0.0004038901090039693, + "loss": 0.2138, + "step": 209340 + }, + { + "epoch": 8.67, + "grad_norm": 0.58984375, + "learning_rate": 0.00040388156187301684, + "loss": 0.1951, + "step": 209350 + }, + { + "epoch": 8.67, + "grad_norm": 0.55078125, + "learning_rate": 0.000403873014452476, + "loss": 0.1856, + "step": 209360 + }, + { + "epoch": 8.67, + "grad_norm": 0.80859375, + "learning_rate": 0.0004038644667423628, + "loss": 0.1707, + "step": 209370 + }, + { + "epoch": 8.67, + "grad_norm": 0.7734375, + "learning_rate": 0.00040385591874269314, + "loss": 0.2015, + "step": 209380 + }, + { + "epoch": 8.67, + "grad_norm": 0.5078125, + "learning_rate": 0.00040384737045348336, + "loss": 0.1607, + "step": 209390 + }, + { + "epoch": 8.67, + "grad_norm": 0.91015625, + "learning_rate": 0.0004038388218747494, + "loss": 0.2007, + "step": 209400 + }, + { + "epoch": 8.67, + "grad_norm": 2.171875, + "learning_rate": 0.0004038302730065074, + "loss": 0.1896, + "step": 209410 + }, + { + "epoch": 8.67, + "grad_norm": 0.46875, + "learning_rate": 0.00040382172384877346, + "loss": 0.1603, + "step": 209420 + }, + { + "epoch": 8.67, + "grad_norm": 0.8828125, + "learning_rate": 0.0004038131744015636, + "loss": 0.2058, + "step": 209430 + }, + { + "epoch": 8.67, + "grad_norm": 0.6640625, + "learning_rate": 0.000403804624664894, + "loss": 0.2113, + "step": 209440 + }, + { + "epoch": 8.68, + "grad_norm": 0.44921875, + "learning_rate": 0.0004037960746387807, + "loss": 0.2043, + "step": 209450 + }, + { + "epoch": 8.68, + "grad_norm": 0.515625, + "learning_rate": 0.0004037875243232398, + "loss": 0.2517, + "step": 209460 + }, + { + "epoch": 8.68, + "grad_norm": 1.265625, + "learning_rate": 0.00040377897371828735, + "loss": 0.2055, + "step": 209470 + }, + { + "epoch": 8.68, + "grad_norm": 0.1484375, + "learning_rate": 0.0004037704228239395, + "loss": 0.2231, + "step": 209480 + }, + { + "epoch": 8.68, + "grad_norm": 0.7578125, + "learning_rate": 0.0004037618716402123, + "loss": 0.2211, + "step": 209490 + }, + { + "epoch": 8.68, + "grad_norm": 0.376953125, + "learning_rate": 0.00040375332016712195, + "loss": 0.2092, + "step": 209500 + }, + { + "epoch": 8.68, + "grad_norm": 1.6953125, + "learning_rate": 0.0004037447684046843, + "loss": 0.2345, + "step": 209510 + }, + { + "epoch": 8.68, + "grad_norm": 0.8515625, + "learning_rate": 0.00040373621635291575, + "loss": 0.2353, + "step": 209520 + }, + { + "epoch": 8.68, + "grad_norm": 0.2119140625, + "learning_rate": 0.00040372766401183216, + "loss": 0.2202, + "step": 209530 + }, + { + "epoch": 8.68, + "grad_norm": 1.5625, + "learning_rate": 0.0004037191113814498, + "loss": 0.1882, + "step": 209540 + }, + { + "epoch": 8.68, + "grad_norm": 0.185546875, + "learning_rate": 0.00040371055846178457, + "loss": 0.2161, + "step": 209550 + }, + { + "epoch": 8.68, + "grad_norm": 0.98046875, + "learning_rate": 0.0004037020052528527, + "loss": 0.2133, + "step": 209560 + }, + { + "epoch": 8.68, + "grad_norm": 0.458984375, + "learning_rate": 0.00040369345175467024, + "loss": 0.2153, + "step": 209570 + }, + { + "epoch": 8.68, + "grad_norm": 0.796875, + "learning_rate": 0.0004036848979672533, + "loss": 0.2158, + "step": 209580 + }, + { + "epoch": 8.68, + "grad_norm": 0.64453125, + "learning_rate": 0.000403676343890618, + "loss": 0.1865, + "step": 209590 + }, + { + "epoch": 8.68, + "grad_norm": 1.0625, + "learning_rate": 0.00040366778952478043, + "loss": 0.197, + "step": 209600 + }, + { + "epoch": 8.68, + "grad_norm": 1.3203125, + "learning_rate": 0.0004036592348697566, + "loss": 0.1992, + "step": 209610 + }, + { + "epoch": 8.68, + "grad_norm": 0.50390625, + "learning_rate": 0.0004036506799255628, + "loss": 0.2089, + "step": 209620 + }, + { + "epoch": 8.68, + "grad_norm": 1.515625, + "learning_rate": 0.00040364212469221495, + "loss": 0.1769, + "step": 209630 + }, + { + "epoch": 8.68, + "grad_norm": 1.046875, + "learning_rate": 0.0004036335691697292, + "loss": 0.221, + "step": 209640 + }, + { + "epoch": 8.68, + "grad_norm": 0.76171875, + "learning_rate": 0.00040362501335812175, + "loss": 0.148, + "step": 209650 + }, + { + "epoch": 8.68, + "grad_norm": 0.60546875, + "learning_rate": 0.00040361645725740846, + "loss": 0.1874, + "step": 209660 + }, + { + "epoch": 8.68, + "grad_norm": 0.84765625, + "learning_rate": 0.00040360790086760567, + "loss": 0.2297, + "step": 209670 + }, + { + "epoch": 8.68, + "grad_norm": 0.462890625, + "learning_rate": 0.0004035993441887295, + "loss": 0.2456, + "step": 209680 + }, + { + "epoch": 8.69, + "grad_norm": 0.53125, + "learning_rate": 0.00040359078722079577, + "loss": 0.2283, + "step": 209690 + }, + { + "epoch": 8.69, + "grad_norm": 0.298828125, + "learning_rate": 0.00040358222996382087, + "loss": 0.2173, + "step": 209700 + }, + { + "epoch": 8.69, + "grad_norm": 0.703125, + "learning_rate": 0.00040357367241782076, + "loss": 0.1941, + "step": 209710 + }, + { + "epoch": 8.69, + "grad_norm": 1.0234375, + "learning_rate": 0.00040356511458281165, + "loss": 0.248, + "step": 209720 + }, + { + "epoch": 8.69, + "grad_norm": 0.482421875, + "learning_rate": 0.0004035565564588095, + "loss": 0.1952, + "step": 209730 + }, + { + "epoch": 8.69, + "grad_norm": 0.79296875, + "learning_rate": 0.0004035479980458305, + "loss": 0.2213, + "step": 209740 + }, + { + "epoch": 8.69, + "grad_norm": 0.478515625, + "learning_rate": 0.0004035394393438908, + "loss": 0.1901, + "step": 209750 + }, + { + "epoch": 8.69, + "grad_norm": 0.287109375, + "learning_rate": 0.00040353088035300646, + "loss": 0.1869, + "step": 209760 + }, + { + "epoch": 8.69, + "grad_norm": 0.5, + "learning_rate": 0.0004035223210731935, + "loss": 0.172, + "step": 209770 + }, + { + "epoch": 8.69, + "grad_norm": 1.0234375, + "learning_rate": 0.0004035137615044682, + "loss": 0.2691, + "step": 209780 + }, + { + "epoch": 8.69, + "grad_norm": 1.21875, + "learning_rate": 0.0004035052016468465, + "loss": 0.1305, + "step": 209790 + }, + { + "epoch": 8.69, + "grad_norm": 0.2890625, + "learning_rate": 0.0004034966415003446, + "loss": 0.173, + "step": 209800 + }, + { + "epoch": 8.69, + "grad_norm": 0.263671875, + "learning_rate": 0.0004034880810649787, + "loss": 0.1747, + "step": 209810 + }, + { + "epoch": 8.69, + "grad_norm": 0.5859375, + "learning_rate": 0.0004034795203407646, + "loss": 0.2245, + "step": 209820 + }, + { + "epoch": 8.69, + "grad_norm": 0.56640625, + "learning_rate": 0.00040347095932771884, + "loss": 0.2022, + "step": 209830 + }, + { + "epoch": 8.69, + "grad_norm": 0.91796875, + "learning_rate": 0.0004034623980258572, + "loss": 0.2388, + "step": 209840 + }, + { + "epoch": 8.69, + "grad_norm": 1.6328125, + "learning_rate": 0.0004034538364351959, + "loss": 0.166, + "step": 209850 + }, + { + "epoch": 8.69, + "grad_norm": 1.3984375, + "learning_rate": 0.00040344527455575107, + "loss": 0.1952, + "step": 209860 + }, + { + "epoch": 8.69, + "grad_norm": 0.44140625, + "learning_rate": 0.0004034367123875388, + "loss": 0.1689, + "step": 209870 + }, + { + "epoch": 8.69, + "grad_norm": 0.353515625, + "learning_rate": 0.00040342814993057517, + "loss": 0.1996, + "step": 209880 + }, + { + "epoch": 8.69, + "grad_norm": 0.75, + "learning_rate": 0.0004034195871848764, + "loss": 0.1344, + "step": 209890 + }, + { + "epoch": 8.69, + "grad_norm": 0.6640625, + "learning_rate": 0.0004034110241504585, + "loss": 0.2009, + "step": 209900 + }, + { + "epoch": 8.69, + "grad_norm": 0.44921875, + "learning_rate": 0.0004034024608273377, + "loss": 0.2226, + "step": 209910 + }, + { + "epoch": 8.69, + "grad_norm": 0.59375, + "learning_rate": 0.0004033938972155299, + "loss": 0.2084, + "step": 209920 + }, + { + "epoch": 8.7, + "grad_norm": 0.474609375, + "learning_rate": 0.00040338533331505137, + "loss": 0.2353, + "step": 209930 + }, + { + "epoch": 8.7, + "grad_norm": 0.8828125, + "learning_rate": 0.0004033767691259183, + "loss": 0.1947, + "step": 209940 + }, + { + "epoch": 8.7, + "grad_norm": 0.5859375, + "learning_rate": 0.0004033682046481466, + "loss": 0.2169, + "step": 209950 + }, + { + "epoch": 8.7, + "grad_norm": 0.328125, + "learning_rate": 0.0004033596398817526, + "loss": 0.1704, + "step": 209960 + }, + { + "epoch": 8.7, + "grad_norm": 0.54296875, + "learning_rate": 0.00040335107482675225, + "loss": 0.2082, + "step": 209970 + }, + { + "epoch": 8.7, + "grad_norm": 0.67578125, + "learning_rate": 0.00040334250948316177, + "loss": 0.2046, + "step": 209980 + }, + { + "epoch": 8.7, + "grad_norm": 0.81640625, + "learning_rate": 0.0004033339438509973, + "loss": 0.207, + "step": 209990 + }, + { + "epoch": 8.7, + "grad_norm": 1.3671875, + "learning_rate": 0.0004033253779302748, + "loss": 0.2241, + "step": 210000 + }, + { + "epoch": 8.7, + "grad_norm": 0.59375, + "learning_rate": 0.00040331681172101053, + "loss": 0.1825, + "step": 210010 + }, + { + "epoch": 8.7, + "grad_norm": 0.0, + "learning_rate": 0.00040330824522322064, + "loss": 0.2452, + "step": 210020 + }, + { + "epoch": 8.7, + "grad_norm": 0.26171875, + "learning_rate": 0.0004032996784369211, + "loss": 0.2041, + "step": 210030 + }, + { + "epoch": 8.7, + "grad_norm": 3.28125, + "learning_rate": 0.00040329111136212814, + "loss": 0.2061, + "step": 210040 + }, + { + "epoch": 8.7, + "grad_norm": 0.4765625, + "learning_rate": 0.00040328254399885793, + "loss": 0.2416, + "step": 210050 + }, + { + "epoch": 8.7, + "grad_norm": 0.76953125, + "learning_rate": 0.00040327397634712646, + "loss": 0.1605, + "step": 210060 + }, + { + "epoch": 8.7, + "grad_norm": 0.5390625, + "learning_rate": 0.0004032654084069499, + "loss": 0.2337, + "step": 210070 + }, + { + "epoch": 8.7, + "grad_norm": 0.275390625, + "learning_rate": 0.0004032568401783445, + "loss": 0.1339, + "step": 210080 + }, + { + "epoch": 8.7, + "grad_norm": 0.74609375, + "learning_rate": 0.0004032482716613262, + "loss": 0.2111, + "step": 210090 + }, + { + "epoch": 8.7, + "grad_norm": 0.458984375, + "learning_rate": 0.0004032397028559112, + "loss": 0.133, + "step": 210100 + }, + { + "epoch": 8.7, + "grad_norm": 2.453125, + "learning_rate": 0.00040323113376211563, + "loss": 0.2201, + "step": 210110 + }, + { + "epoch": 8.7, + "grad_norm": 0.8671875, + "learning_rate": 0.00040322256437995565, + "loss": 0.2167, + "step": 210120 + }, + { + "epoch": 8.7, + "grad_norm": 1.046875, + "learning_rate": 0.00040321399470944733, + "loss": 0.1806, + "step": 210130 + }, + { + "epoch": 8.7, + "grad_norm": 0.8046875, + "learning_rate": 0.0004032054247506067, + "loss": 0.2369, + "step": 210140 + }, + { + "epoch": 8.7, + "grad_norm": 0.478515625, + "learning_rate": 0.0004031968545034502, + "loss": 0.212, + "step": 210150 + }, + { + "epoch": 8.7, + "grad_norm": 0.62890625, + "learning_rate": 0.00040318828396799367, + "loss": 0.2035, + "step": 210160 + }, + { + "epoch": 8.71, + "grad_norm": 1.6953125, + "learning_rate": 0.0004031797131442533, + "loss": 0.2157, + "step": 210170 + }, + { + "epoch": 8.71, + "grad_norm": 0.7265625, + "learning_rate": 0.00040317114203224536, + "loss": 0.2208, + "step": 210180 + }, + { + "epoch": 8.71, + "grad_norm": 0.490234375, + "learning_rate": 0.00040316257063198577, + "loss": 0.2034, + "step": 210190 + }, + { + "epoch": 8.71, + "grad_norm": 0.1796875, + "learning_rate": 0.0004031539989434908, + "loss": 0.1864, + "step": 210200 + }, + { + "epoch": 8.71, + "grad_norm": 1.15625, + "learning_rate": 0.0004031454269667766, + "loss": 0.1824, + "step": 210210 + }, + { + "epoch": 8.71, + "grad_norm": 0.5625, + "learning_rate": 0.0004031368547018591, + "loss": 0.2283, + "step": 210220 + }, + { + "epoch": 8.71, + "grad_norm": 2.109375, + "learning_rate": 0.00040312828214875466, + "loss": 0.1725, + "step": 210230 + }, + { + "epoch": 8.71, + "grad_norm": 0.7890625, + "learning_rate": 0.0004031197093074793, + "loss": 0.1978, + "step": 210240 + }, + { + "epoch": 8.71, + "grad_norm": 0.9296875, + "learning_rate": 0.0004031111361780493, + "loss": 0.2282, + "step": 210250 + }, + { + "epoch": 8.71, + "grad_norm": 0.9375, + "learning_rate": 0.00040310256276048054, + "loss": 0.1724, + "step": 210260 + }, + { + "epoch": 8.71, + "grad_norm": 0.484375, + "learning_rate": 0.00040309398905478933, + "loss": 0.2832, + "step": 210270 + }, + { + "epoch": 8.71, + "grad_norm": 0.1904296875, + "learning_rate": 0.00040308541506099184, + "loss": 0.1896, + "step": 210280 + }, + { + "epoch": 8.71, + "grad_norm": 0.6796875, + "learning_rate": 0.000403076840779104, + "loss": 0.1759, + "step": 210290 + }, + { + "epoch": 8.71, + "grad_norm": 1.09375, + "learning_rate": 0.00040306826620914216, + "loss": 0.2034, + "step": 210300 + }, + { + "epoch": 8.71, + "grad_norm": 0.75, + "learning_rate": 0.00040305969135112234, + "loss": 0.1662, + "step": 210310 + }, + { + "epoch": 8.71, + "grad_norm": 1.5, + "learning_rate": 0.00040305111620506074, + "loss": 0.1812, + "step": 210320 + }, + { + "epoch": 8.71, + "grad_norm": 0.384765625, + "learning_rate": 0.00040304254077097346, + "loss": 0.2231, + "step": 210330 + }, + { + "epoch": 8.71, + "grad_norm": 0.478515625, + "learning_rate": 0.0004030339650488766, + "loss": 0.1769, + "step": 210340 + }, + { + "epoch": 8.71, + "grad_norm": 0.66796875, + "learning_rate": 0.00040302538903878637, + "loss": 0.1758, + "step": 210350 + }, + { + "epoch": 8.71, + "grad_norm": 2.3125, + "learning_rate": 0.00040301681274071893, + "loss": 0.1953, + "step": 210360 + }, + { + "epoch": 8.71, + "grad_norm": 1.1875, + "learning_rate": 0.0004030082361546903, + "loss": 0.2295, + "step": 210370 + }, + { + "epoch": 8.71, + "grad_norm": 0.51171875, + "learning_rate": 0.00040299965928071674, + "loss": 0.2036, + "step": 210380 + }, + { + "epoch": 8.71, + "grad_norm": 0.50390625, + "learning_rate": 0.00040299108211881436, + "loss": 0.2027, + "step": 210390 + }, + { + "epoch": 8.71, + "grad_norm": 0.6015625, + "learning_rate": 0.0004029825046689992, + "loss": 0.2077, + "step": 210400 + }, + { + "epoch": 8.72, + "grad_norm": 0.78125, + "learning_rate": 0.0004029739269312876, + "loss": 0.1889, + "step": 210410 + }, + { + "epoch": 8.72, + "grad_norm": 0.61328125, + "learning_rate": 0.0004029653489056956, + "loss": 0.195, + "step": 210420 + }, + { + "epoch": 8.72, + "grad_norm": 0.84765625, + "learning_rate": 0.00040295677059223916, + "loss": 0.206, + "step": 210430 + }, + { + "epoch": 8.72, + "grad_norm": 0.34765625, + "learning_rate": 0.00040294819199093474, + "loss": 0.2526, + "step": 210440 + }, + { + "epoch": 8.72, + "grad_norm": 1.3359375, + "learning_rate": 0.0004029396131017983, + "loss": 0.1999, + "step": 210450 + }, + { + "epoch": 8.72, + "grad_norm": 0.70703125, + "learning_rate": 0.00040293103392484604, + "loss": 0.25, + "step": 210460 + }, + { + "epoch": 8.72, + "grad_norm": 1.0, + "learning_rate": 0.00040292245446009407, + "loss": 0.1952, + "step": 210470 + }, + { + "epoch": 8.72, + "grad_norm": 0.52734375, + "learning_rate": 0.0004029138747075586, + "loss": 0.1804, + "step": 210480 + }, + { + "epoch": 8.72, + "grad_norm": 0.51171875, + "learning_rate": 0.00040290529466725566, + "loss": 0.1981, + "step": 210490 + }, + { + "epoch": 8.72, + "grad_norm": 0.359375, + "learning_rate": 0.0004028967143392015, + "loss": 0.21, + "step": 210500 + }, + { + "epoch": 8.72, + "grad_norm": 1.234375, + "learning_rate": 0.00040288813372341224, + "loss": 0.181, + "step": 210510 + }, + { + "epoch": 8.72, + "grad_norm": 0.490234375, + "learning_rate": 0.00040287955281990403, + "loss": 0.264, + "step": 210520 + }, + { + "epoch": 8.72, + "grad_norm": 0.5625, + "learning_rate": 0.00040287097162869297, + "loss": 0.1518, + "step": 210530 + }, + { + "epoch": 8.72, + "grad_norm": 0.435546875, + "learning_rate": 0.0004028623901497953, + "loss": 0.1595, + "step": 210540 + }, + { + "epoch": 8.72, + "grad_norm": 0.67578125, + "learning_rate": 0.0004028538083832271, + "loss": 0.2258, + "step": 210550 + }, + { + "epoch": 8.72, + "grad_norm": 0.84765625, + "learning_rate": 0.00040284522632900457, + "loss": 0.1826, + "step": 210560 + }, + { + "epoch": 8.72, + "grad_norm": 0.6953125, + "learning_rate": 0.0004028366439871438, + "loss": 0.212, + "step": 210570 + }, + { + "epoch": 8.72, + "grad_norm": 0.4140625, + "learning_rate": 0.0004028280613576609, + "loss": 0.1799, + "step": 210580 + }, + { + "epoch": 8.72, + "grad_norm": 0.6640625, + "learning_rate": 0.0004028194784405722, + "loss": 0.2029, + "step": 210590 + }, + { + "epoch": 8.72, + "grad_norm": 0.57421875, + "learning_rate": 0.00040281089523589364, + "loss": 0.2372, + "step": 210600 + }, + { + "epoch": 8.72, + "grad_norm": 0.5546875, + "learning_rate": 0.00040280231174364155, + "loss": 0.1617, + "step": 210610 + }, + { + "epoch": 8.72, + "grad_norm": 0.51953125, + "learning_rate": 0.0004027937279638321, + "loss": 0.2222, + "step": 210620 + }, + { + "epoch": 8.72, + "grad_norm": 0.4453125, + "learning_rate": 0.00040278514389648117, + "loss": 0.1991, + "step": 210630 + }, + { + "epoch": 8.72, + "grad_norm": 0.609375, + "learning_rate": 0.0004027765595416052, + "loss": 0.2064, + "step": 210640 + }, + { + "epoch": 8.73, + "grad_norm": 0.69140625, + "learning_rate": 0.0004027679748992202, + "loss": 0.2572, + "step": 210650 + }, + { + "epoch": 8.73, + "grad_norm": 0.458984375, + "learning_rate": 0.0004027593899693424, + "loss": 0.1636, + "step": 210660 + }, + { + "epoch": 8.73, + "grad_norm": 0.62109375, + "learning_rate": 0.000402750804751988, + "loss": 0.2107, + "step": 210670 + }, + { + "epoch": 8.73, + "grad_norm": 0.32421875, + "learning_rate": 0.000402742219247173, + "loss": 0.2028, + "step": 210680 + }, + { + "epoch": 8.73, + "grad_norm": 0.69140625, + "learning_rate": 0.00040273363345491364, + "loss": 0.2194, + "step": 210690 + }, + { + "epoch": 8.73, + "grad_norm": 0.54296875, + "learning_rate": 0.0004027250473752261, + "loss": 0.1513, + "step": 210700 + }, + { + "epoch": 8.73, + "grad_norm": 0.8671875, + "learning_rate": 0.00040271646100812643, + "loss": 0.2146, + "step": 210710 + }, + { + "epoch": 8.73, + "grad_norm": 0.6796875, + "learning_rate": 0.000402707874353631, + "loss": 0.1879, + "step": 210720 + }, + { + "epoch": 8.73, + "grad_norm": 0.76171875, + "learning_rate": 0.00040269928741175576, + "loss": 0.2045, + "step": 210730 + }, + { + "epoch": 8.73, + "grad_norm": 0.58203125, + "learning_rate": 0.00040269070018251693, + "loss": 0.1714, + "step": 210740 + }, + { + "epoch": 8.73, + "grad_norm": 0.35546875, + "learning_rate": 0.0004026821126659307, + "loss": 0.2292, + "step": 210750 + }, + { + "epoch": 8.73, + "grad_norm": 1.4765625, + "learning_rate": 0.00040267352486201327, + "loss": 0.2061, + "step": 210760 + }, + { + "epoch": 8.73, + "grad_norm": 1.078125, + "learning_rate": 0.00040266493677078074, + "loss": 0.2124, + "step": 210770 + }, + { + "epoch": 8.73, + "grad_norm": 0.48046875, + "learning_rate": 0.0004026563483922493, + "loss": 0.2201, + "step": 210780 + }, + { + "epoch": 8.73, + "grad_norm": 0.921875, + "learning_rate": 0.0004026477597264351, + "loss": 0.1647, + "step": 210790 + }, + { + "epoch": 8.73, + "grad_norm": 0.58203125, + "learning_rate": 0.0004026391707733542, + "loss": 0.1535, + "step": 210800 + }, + { + "epoch": 8.73, + "grad_norm": 0.419921875, + "learning_rate": 0.00040263058153302303, + "loss": 0.1723, + "step": 210810 + }, + { + "epoch": 8.73, + "grad_norm": 0.875, + "learning_rate": 0.00040262199200545747, + "loss": 0.1886, + "step": 210820 + }, + { + "epoch": 8.73, + "grad_norm": 0.76953125, + "learning_rate": 0.00040261340219067385, + "loss": 0.1375, + "step": 210830 + }, + { + "epoch": 8.73, + "grad_norm": 0.51171875, + "learning_rate": 0.00040260481208868827, + "loss": 0.1653, + "step": 210840 + }, + { + "epoch": 8.73, + "grad_norm": 1.0625, + "learning_rate": 0.0004025962216995169, + "loss": 0.2241, + "step": 210850 + }, + { + "epoch": 8.73, + "grad_norm": 1.90625, + "learning_rate": 0.0004025876310231759, + "loss": 0.1969, + "step": 210860 + }, + { + "epoch": 8.73, + "grad_norm": 0.87890625, + "learning_rate": 0.0004025790400596815, + "loss": 0.2038, + "step": 210870 + }, + { + "epoch": 8.73, + "grad_norm": 0.62890625, + "learning_rate": 0.0004025704488090498, + "loss": 0.1841, + "step": 210880 + }, + { + "epoch": 8.74, + "grad_norm": 0.55859375, + "learning_rate": 0.00040256185727129704, + "loss": 0.1932, + "step": 210890 + }, + { + "epoch": 8.74, + "grad_norm": 0.0022735595703125, + "learning_rate": 0.0004025532654464393, + "loss": 0.187, + "step": 210900 + }, + { + "epoch": 8.74, + "grad_norm": 0.80078125, + "learning_rate": 0.0004025446733344928, + "loss": 0.2036, + "step": 210910 + }, + { + "epoch": 8.74, + "grad_norm": 0.96875, + "learning_rate": 0.00040253608093547365, + "loss": 0.2391, + "step": 210920 + }, + { + "epoch": 8.74, + "grad_norm": 0.578125, + "learning_rate": 0.0004025274882493981, + "loss": 0.1989, + "step": 210930 + }, + { + "epoch": 8.74, + "grad_norm": 0.5703125, + "learning_rate": 0.0004025188952762824, + "loss": 0.1847, + "step": 210940 + }, + { + "epoch": 8.74, + "grad_norm": 1.375, + "learning_rate": 0.0004025103020161425, + "loss": 0.171, + "step": 210950 + }, + { + "epoch": 8.74, + "grad_norm": 0.8203125, + "learning_rate": 0.00040250170846899467, + "loss": 0.2167, + "step": 210960 + }, + { + "epoch": 8.74, + "grad_norm": 0.640625, + "learning_rate": 0.0004024931146348551, + "loss": 0.1827, + "step": 210970 + }, + { + "epoch": 8.74, + "grad_norm": 0.69921875, + "learning_rate": 0.00040248452051373996, + "loss": 0.2081, + "step": 210980 + }, + { + "epoch": 8.74, + "grad_norm": 0.53515625, + "learning_rate": 0.0004024759261056654, + "loss": 0.2187, + "step": 210990 + }, + { + "epoch": 8.74, + "grad_norm": 0.65234375, + "learning_rate": 0.00040246733141064767, + "loss": 0.186, + "step": 211000 + }, + { + "epoch": 8.74, + "grad_norm": 1.125, + "learning_rate": 0.0004024587364287029, + "loss": 0.2681, + "step": 211010 + }, + { + "epoch": 8.74, + "grad_norm": 1.078125, + "learning_rate": 0.0004024501411598472, + "loss": 0.1776, + "step": 211020 + }, + { + "epoch": 8.74, + "grad_norm": 0.80859375, + "learning_rate": 0.00040244154560409675, + "loss": 0.2003, + "step": 211030 + }, + { + "epoch": 8.74, + "grad_norm": 1.1640625, + "learning_rate": 0.00040243294976146786, + "loss": 0.2116, + "step": 211040 + }, + { + "epoch": 8.74, + "grad_norm": 1.2734375, + "learning_rate": 0.0004024243536319766, + "loss": 0.2403, + "step": 211050 + }, + { + "epoch": 8.74, + "grad_norm": 0.55078125, + "learning_rate": 0.0004024157572156392, + "loss": 0.1981, + "step": 211060 + }, + { + "epoch": 8.74, + "grad_norm": 0.703125, + "learning_rate": 0.0004024071605124717, + "loss": 0.1945, + "step": 211070 + }, + { + "epoch": 8.74, + "grad_norm": 0.462890625, + "learning_rate": 0.0004023985635224905, + "loss": 0.1785, + "step": 211080 + }, + { + "epoch": 8.74, + "grad_norm": 0.51171875, + "learning_rate": 0.00040238996624571155, + "loss": 0.1934, + "step": 211090 + }, + { + "epoch": 8.74, + "grad_norm": 0.390625, + "learning_rate": 0.00040238136868215126, + "loss": 0.1954, + "step": 211100 + }, + { + "epoch": 8.74, + "grad_norm": 1.9609375, + "learning_rate": 0.0004023727708318256, + "loss": 0.2305, + "step": 211110 + }, + { + "epoch": 8.74, + "grad_norm": 0.359375, + "learning_rate": 0.0004023641726947509, + "loss": 0.1837, + "step": 211120 + }, + { + "epoch": 8.74, + "grad_norm": 0.84765625, + "learning_rate": 0.00040235557427094316, + "loss": 0.165, + "step": 211130 + }, + { + "epoch": 8.75, + "grad_norm": 0.69140625, + "learning_rate": 0.00040234697556041884, + "loss": 0.1657, + "step": 211140 + }, + { + "epoch": 8.75, + "grad_norm": 0.88671875, + "learning_rate": 0.0004023383765631939, + "loss": 0.2071, + "step": 211150 + }, + { + "epoch": 8.75, + "grad_norm": 0.5390625, + "learning_rate": 0.00040232977727928455, + "loss": 0.2323, + "step": 211160 + }, + { + "epoch": 8.75, + "grad_norm": 1.2578125, + "learning_rate": 0.00040232117770870704, + "loss": 0.2182, + "step": 211170 + }, + { + "epoch": 8.75, + "grad_norm": 0.9375, + "learning_rate": 0.00040231257785147754, + "loss": 0.1528, + "step": 211180 + }, + { + "epoch": 8.75, + "grad_norm": 0.60546875, + "learning_rate": 0.0004023039777076122, + "loss": 0.1793, + "step": 211190 + }, + { + "epoch": 8.75, + "grad_norm": 0.63671875, + "learning_rate": 0.00040229537727712723, + "loss": 0.1994, + "step": 211200 + }, + { + "epoch": 8.75, + "grad_norm": 0.5625, + "learning_rate": 0.0004022867765600388, + "loss": 0.2012, + "step": 211210 + }, + { + "epoch": 8.75, + "grad_norm": 1.46875, + "learning_rate": 0.00040227817555636307, + "loss": 0.2073, + "step": 211220 + }, + { + "epoch": 8.75, + "grad_norm": 0.55078125, + "learning_rate": 0.00040226957426611634, + "loss": 0.197, + "step": 211230 + }, + { + "epoch": 8.75, + "grad_norm": 0.50390625, + "learning_rate": 0.0004022609726893146, + "loss": 0.1651, + "step": 211240 + }, + { + "epoch": 8.75, + "grad_norm": 1.640625, + "learning_rate": 0.0004022523708259742, + "loss": 0.2669, + "step": 211250 + }, + { + "epoch": 8.75, + "grad_norm": 0.41015625, + "learning_rate": 0.00040224376867611136, + "loss": 0.1884, + "step": 211260 + }, + { + "epoch": 8.75, + "grad_norm": 0.79296875, + "learning_rate": 0.0004022351662397421, + "loss": 0.1998, + "step": 211270 + }, + { + "epoch": 8.75, + "grad_norm": 0.1513671875, + "learning_rate": 0.00040222656351688273, + "loss": 0.1902, + "step": 211280 + }, + { + "epoch": 8.75, + "grad_norm": 0.90625, + "learning_rate": 0.0004022179605075494, + "loss": 0.1958, + "step": 211290 + }, + { + "epoch": 8.75, + "grad_norm": 0.56640625, + "learning_rate": 0.0004022093572117583, + "loss": 0.1798, + "step": 211300 + }, + { + "epoch": 8.75, + "grad_norm": 0.58203125, + "learning_rate": 0.0004022007536295256, + "loss": 0.1835, + "step": 211310 + }, + { + "epoch": 8.75, + "grad_norm": 1.0078125, + "learning_rate": 0.0004021921497608676, + "loss": 0.1846, + "step": 211320 + }, + { + "epoch": 8.75, + "grad_norm": 0.0, + "learning_rate": 0.00040218354560580036, + "loss": 0.1926, + "step": 211330 + }, + { + "epoch": 8.75, + "grad_norm": 0.90234375, + "learning_rate": 0.0004021749411643401, + "loss": 0.1393, + "step": 211340 + }, + { + "epoch": 8.75, + "grad_norm": 0.890625, + "learning_rate": 0.00040216633643650303, + "loss": 0.2574, + "step": 211350 + }, + { + "epoch": 8.75, + "grad_norm": 0.48046875, + "learning_rate": 0.00040215773142230536, + "loss": 0.157, + "step": 211360 + }, + { + "epoch": 8.75, + "grad_norm": 0.400390625, + "learning_rate": 0.0004021491261217633, + "loss": 0.1965, + "step": 211370 + }, + { + "epoch": 8.76, + "grad_norm": 0.91796875, + "learning_rate": 0.000402140520534893, + "loss": 0.1905, + "step": 211380 + }, + { + "epoch": 8.76, + "grad_norm": 0.6875, + "learning_rate": 0.0004021319146617107, + "loss": 0.1972, + "step": 211390 + }, + { + "epoch": 8.76, + "grad_norm": 0.5, + "learning_rate": 0.00040212330850223255, + "loss": 0.1649, + "step": 211400 + }, + { + "epoch": 8.76, + "grad_norm": 1.03125, + "learning_rate": 0.0004021147020564747, + "loss": 0.2033, + "step": 211410 + }, + { + "epoch": 8.76, + "grad_norm": 0.53515625, + "learning_rate": 0.0004021060953244534, + "loss": 0.16, + "step": 211420 + }, + { + "epoch": 8.76, + "grad_norm": 0.5859375, + "learning_rate": 0.00040209748830618495, + "loss": 0.215, + "step": 211430 + }, + { + "epoch": 8.76, + "grad_norm": 0.7734375, + "learning_rate": 0.0004020888810016854, + "loss": 0.189, + "step": 211440 + }, + { + "epoch": 8.76, + "grad_norm": 0.8359375, + "learning_rate": 0.00040208027341097103, + "loss": 0.2438, + "step": 211450 + }, + { + "epoch": 8.76, + "grad_norm": 0.8671875, + "learning_rate": 0.000402071665534058, + "loss": 0.1582, + "step": 211460 + }, + { + "epoch": 8.76, + "grad_norm": 0.46484375, + "learning_rate": 0.0004020630573709625, + "loss": 0.1923, + "step": 211470 + }, + { + "epoch": 8.76, + "grad_norm": 0.3828125, + "learning_rate": 0.00040205444892170075, + "loss": 0.1702, + "step": 211480 + }, + { + "epoch": 8.76, + "grad_norm": 0.419921875, + "learning_rate": 0.000402045840186289, + "loss": 0.221, + "step": 211490 + }, + { + "epoch": 8.76, + "grad_norm": 0.75390625, + "learning_rate": 0.00040203723116474327, + "loss": 0.1907, + "step": 211500 + }, + { + "epoch": 8.76, + "grad_norm": 0.765625, + "learning_rate": 0.00040202862185708, + "loss": 0.1956, + "step": 211510 + }, + { + "epoch": 8.76, + "grad_norm": 1.1875, + "learning_rate": 0.00040202001226331526, + "loss": 0.2808, + "step": 211520 + }, + { + "epoch": 8.76, + "grad_norm": 1.21875, + "learning_rate": 0.0004020114023834652, + "loss": 0.1901, + "step": 211530 + }, + { + "epoch": 8.76, + "grad_norm": 0.62109375, + "learning_rate": 0.0004020027922175462, + "loss": 0.2138, + "step": 211540 + }, + { + "epoch": 8.76, + "grad_norm": 0.3671875, + "learning_rate": 0.00040199418176557435, + "loss": 0.1438, + "step": 211550 + }, + { + "epoch": 8.76, + "grad_norm": 0.396484375, + "learning_rate": 0.00040198557102756583, + "loss": 0.1624, + "step": 211560 + }, + { + "epoch": 8.76, + "grad_norm": 0.671875, + "learning_rate": 0.0004019769600035369, + "loss": 0.2217, + "step": 211570 + }, + { + "epoch": 8.76, + "grad_norm": 1.2265625, + "learning_rate": 0.0004019683486935037, + "loss": 0.2052, + "step": 211580 + }, + { + "epoch": 8.76, + "grad_norm": 1.640625, + "learning_rate": 0.00040195973709748246, + "loss": 0.2314, + "step": 211590 + }, + { + "epoch": 8.76, + "grad_norm": 0.5, + "learning_rate": 0.0004019511252154895, + "loss": 0.2235, + "step": 211600 + }, + { + "epoch": 8.76, + "grad_norm": 0.82421875, + "learning_rate": 0.0004019425130475409, + "loss": 0.2038, + "step": 211610 + }, + { + "epoch": 8.77, + "grad_norm": 0.5546875, + "learning_rate": 0.00040193390059365286, + "loss": 0.233, + "step": 211620 + }, + { + "epoch": 8.77, + "grad_norm": 0.232421875, + "learning_rate": 0.00040192528785384165, + "loss": 0.1997, + "step": 211630 + }, + { + "epoch": 8.77, + "grad_norm": 1.09375, + "learning_rate": 0.00040191667482812344, + "loss": 0.2133, + "step": 211640 + }, + { + "epoch": 8.77, + "grad_norm": 0.6171875, + "learning_rate": 0.0004019080615165145, + "loss": 0.2001, + "step": 211650 + }, + { + "epoch": 8.77, + "grad_norm": 0.43359375, + "learning_rate": 0.00040189944791903085, + "loss": 0.1773, + "step": 211660 + }, + { + "epoch": 8.77, + "grad_norm": 0.2890625, + "learning_rate": 0.00040189083403568905, + "loss": 0.2404, + "step": 211670 + }, + { + "epoch": 8.77, + "grad_norm": 0.6640625, + "learning_rate": 0.000401882219866505, + "loss": 0.1993, + "step": 211680 + }, + { + "epoch": 8.77, + "grad_norm": 1.0859375, + "learning_rate": 0.000401873605411495, + "loss": 0.1964, + "step": 211690 + }, + { + "epoch": 8.77, + "grad_norm": 0.66015625, + "learning_rate": 0.00040186499067067525, + "loss": 0.2239, + "step": 211700 + }, + { + "epoch": 8.77, + "grad_norm": 0.3984375, + "learning_rate": 0.0004018563756440621, + "loss": 0.2136, + "step": 211710 + }, + { + "epoch": 8.77, + "grad_norm": 0.54296875, + "learning_rate": 0.0004018477603316716, + "loss": 0.2232, + "step": 211720 + }, + { + "epoch": 8.77, + "grad_norm": 0.77734375, + "learning_rate": 0.00040183914473351995, + "loss": 0.238, + "step": 211730 + }, + { + "epoch": 8.77, + "grad_norm": 0.5703125, + "learning_rate": 0.00040183052884962343, + "loss": 0.1559, + "step": 211740 + }, + { + "epoch": 8.77, + "grad_norm": 0.86328125, + "learning_rate": 0.0004018219126799983, + "loss": 0.2158, + "step": 211750 + }, + { + "epoch": 8.77, + "grad_norm": 0.77734375, + "learning_rate": 0.00040181329622466074, + "loss": 0.1768, + "step": 211760 + }, + { + "epoch": 8.77, + "grad_norm": 1.0390625, + "learning_rate": 0.00040180467948362693, + "loss": 0.1781, + "step": 211770 + }, + { + "epoch": 8.77, + "grad_norm": 0.7890625, + "learning_rate": 0.0004017960624569131, + "loss": 0.1689, + "step": 211780 + }, + { + "epoch": 8.77, + "grad_norm": 1.5625, + "learning_rate": 0.00040178744514453546, + "loss": 0.1753, + "step": 211790 + }, + { + "epoch": 8.77, + "grad_norm": 1.4453125, + "learning_rate": 0.0004017788275465103, + "loss": 0.167, + "step": 211800 + }, + { + "epoch": 8.77, + "grad_norm": 0.78125, + "learning_rate": 0.0004017702096628537, + "loss": 0.1713, + "step": 211810 + }, + { + "epoch": 8.77, + "grad_norm": 1.078125, + "learning_rate": 0.00040176159149358197, + "loss": 0.2272, + "step": 211820 + }, + { + "epoch": 8.77, + "grad_norm": 2.5, + "learning_rate": 0.0004017529730387114, + "loss": 0.2221, + "step": 211830 + }, + { + "epoch": 8.77, + "grad_norm": 0.66015625, + "learning_rate": 0.00040174435429825804, + "loss": 0.1565, + "step": 211840 + }, + { + "epoch": 8.77, + "grad_norm": 0.875, + "learning_rate": 0.0004017357352722382, + "loss": 0.1747, + "step": 211850 + }, + { + "epoch": 8.78, + "grad_norm": 0.45703125, + "learning_rate": 0.00040172711596066815, + "loss": 0.16, + "step": 211860 + }, + { + "epoch": 8.78, + "grad_norm": 1.390625, + "learning_rate": 0.00040171849636356393, + "loss": 0.1927, + "step": 211870 + }, + { + "epoch": 8.78, + "grad_norm": 0.30078125, + "learning_rate": 0.00040170987648094206, + "loss": 0.1493, + "step": 211880 + }, + { + "epoch": 8.78, + "grad_norm": 2.0625, + "learning_rate": 0.0004017012563128184, + "loss": 0.2403, + "step": 211890 + }, + { + "epoch": 8.78, + "grad_norm": 0.84375, + "learning_rate": 0.00040169263585920946, + "loss": 0.1232, + "step": 211900 + }, + { + "epoch": 8.78, + "grad_norm": 0.87890625, + "learning_rate": 0.0004016840151201313, + "loss": 0.215, + "step": 211910 + }, + { + "epoch": 8.78, + "grad_norm": 0.56640625, + "learning_rate": 0.0004016753940956003, + "loss": 0.1657, + "step": 211920 + }, + { + "epoch": 8.78, + "grad_norm": 0.8671875, + "learning_rate": 0.00040166677278563254, + "loss": 0.2171, + "step": 211930 + }, + { + "epoch": 8.78, + "grad_norm": 1.0, + "learning_rate": 0.0004016581511902443, + "loss": 0.2235, + "step": 211940 + }, + { + "epoch": 8.78, + "grad_norm": 1.8046875, + "learning_rate": 0.00040164952930945175, + "loss": 0.182, + "step": 211950 + }, + { + "epoch": 8.78, + "grad_norm": 1.0625, + "learning_rate": 0.0004016409071432712, + "loss": 0.1959, + "step": 211960 + }, + { + "epoch": 8.78, + "grad_norm": 0.27734375, + "learning_rate": 0.0004016322846917188, + "loss": 0.1923, + "step": 211970 + }, + { + "epoch": 8.78, + "grad_norm": 0.427734375, + "learning_rate": 0.0004016236619548109, + "loss": 0.2466, + "step": 211980 + }, + { + "epoch": 8.78, + "grad_norm": 0.443359375, + "learning_rate": 0.0004016150389325636, + "loss": 0.2209, + "step": 211990 + }, + { + "epoch": 8.78, + "grad_norm": 2.1875, + "learning_rate": 0.0004016064156249931, + "loss": 0.2528, + "step": 212000 + }, + { + "epoch": 8.78, + "grad_norm": 0.828125, + "learning_rate": 0.0004015977920321158, + "loss": 0.206, + "step": 212010 + }, + { + "epoch": 8.78, + "grad_norm": 0.5625, + "learning_rate": 0.00040158916815394774, + "loss": 0.2038, + "step": 212020 + }, + { + "epoch": 8.78, + "grad_norm": 0.51171875, + "learning_rate": 0.0004015805439905053, + "loss": 0.1909, + "step": 212030 + }, + { + "epoch": 8.78, + "grad_norm": 0.0, + "learning_rate": 0.00040157191954180466, + "loss": 0.1799, + "step": 212040 + }, + { + "epoch": 8.78, + "grad_norm": 0.99609375, + "learning_rate": 0.00040156329480786195, + "loss": 0.2309, + "step": 212050 + }, + { + "epoch": 8.78, + "grad_norm": 1.015625, + "learning_rate": 0.00040155466978869357, + "loss": 0.1731, + "step": 212060 + }, + { + "epoch": 8.78, + "grad_norm": 0.6328125, + "learning_rate": 0.00040154604448431567, + "loss": 0.1966, + "step": 212070 + }, + { + "epoch": 8.78, + "grad_norm": 0.67578125, + "learning_rate": 0.00040153741889474445, + "loss": 0.1726, + "step": 212080 + }, + { + "epoch": 8.78, + "grad_norm": 0.4140625, + "learning_rate": 0.00040152879301999614, + "loss": 0.1882, + "step": 212090 + }, + { + "epoch": 8.79, + "grad_norm": 1.7109375, + "learning_rate": 0.00040152016686008704, + "loss": 0.2054, + "step": 212100 + }, + { + "epoch": 8.79, + "grad_norm": 0.59375, + "learning_rate": 0.0004015115404150334, + "loss": 0.1888, + "step": 212110 + }, + { + "epoch": 8.79, + "grad_norm": 0.7421875, + "learning_rate": 0.00040150291368485134, + "loss": 0.2293, + "step": 212120 + }, + { + "epoch": 8.79, + "grad_norm": 1.28125, + "learning_rate": 0.00040149428666955714, + "loss": 0.2449, + "step": 212130 + }, + { + "epoch": 8.79, + "grad_norm": 0.9296875, + "learning_rate": 0.00040148565936916705, + "loss": 0.1876, + "step": 212140 + }, + { + "epoch": 8.79, + "grad_norm": 1.3671875, + "learning_rate": 0.00040147703178369733, + "loss": 0.2168, + "step": 212150 + }, + { + "epoch": 8.79, + "grad_norm": 0.78125, + "learning_rate": 0.0004014684039131642, + "loss": 0.1629, + "step": 212160 + }, + { + "epoch": 8.79, + "grad_norm": 0.86328125, + "learning_rate": 0.00040145977575758394, + "loss": 0.2101, + "step": 212170 + }, + { + "epoch": 8.79, + "grad_norm": 0.734375, + "learning_rate": 0.00040145114731697263, + "loss": 0.1726, + "step": 212180 + }, + { + "epoch": 8.79, + "grad_norm": 0.5, + "learning_rate": 0.0004014425185913467, + "loss": 0.1861, + "step": 212190 + }, + { + "epoch": 8.79, + "grad_norm": 0.46875, + "learning_rate": 0.0004014338895807223, + "loss": 0.2055, + "step": 212200 + }, + { + "epoch": 8.79, + "grad_norm": 0.515625, + "learning_rate": 0.00040142526028511563, + "loss": 0.2141, + "step": 212210 + }, + { + "epoch": 8.79, + "grad_norm": 1.1171875, + "learning_rate": 0.000401416630704543, + "loss": 0.1527, + "step": 212220 + }, + { + "epoch": 8.79, + "grad_norm": 0.51953125, + "learning_rate": 0.0004014080008390206, + "loss": 0.2209, + "step": 212230 + }, + { + "epoch": 8.79, + "grad_norm": 0.671875, + "learning_rate": 0.0004013993706885647, + "loss": 0.2059, + "step": 212240 + }, + { + "epoch": 8.79, + "grad_norm": 0.4296875, + "learning_rate": 0.00040139074025319154, + "loss": 0.2129, + "step": 212250 + }, + { + "epoch": 8.79, + "grad_norm": 0.6953125, + "learning_rate": 0.00040138210953291734, + "loss": 0.1804, + "step": 212260 + }, + { + "epoch": 8.79, + "grad_norm": 0.6015625, + "learning_rate": 0.0004013734785277584, + "loss": 0.2181, + "step": 212270 + }, + { + "epoch": 8.79, + "grad_norm": 0.82421875, + "learning_rate": 0.0004013648472377309, + "loss": 0.1574, + "step": 212280 + }, + { + "epoch": 8.79, + "grad_norm": 0.73828125, + "learning_rate": 0.0004013562156628511, + "loss": 0.1671, + "step": 212290 + }, + { + "epoch": 8.79, + "grad_norm": 0.58984375, + "learning_rate": 0.0004013475838031353, + "loss": 0.1991, + "step": 212300 + }, + { + "epoch": 8.79, + "grad_norm": 0.83984375, + "learning_rate": 0.0004013389516585996, + "loss": 0.2031, + "step": 212310 + }, + { + "epoch": 8.79, + "grad_norm": 0.875, + "learning_rate": 0.0004013303192292604, + "loss": 0.1813, + "step": 212320 + }, + { + "epoch": 8.79, + "grad_norm": 0.53515625, + "learning_rate": 0.00040132168651513386, + "loss": 0.223, + "step": 212330 + }, + { + "epoch": 8.8, + "grad_norm": 0.52734375, + "learning_rate": 0.0004013130535162362, + "loss": 0.2192, + "step": 212340 + }, + { + "epoch": 8.8, + "grad_norm": 0.76953125, + "learning_rate": 0.0004013044202325837, + "loss": 0.2197, + "step": 212350 + }, + { + "epoch": 8.8, + "grad_norm": 0.0, + "learning_rate": 0.00040129578666419274, + "loss": 0.208, + "step": 212360 + }, + { + "epoch": 8.8, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0004012871528110794, + "loss": 0.2312, + "step": 212370 + }, + { + "epoch": 8.8, + "grad_norm": 0.0, + "learning_rate": 0.00040127851867325997, + "loss": 0.2008, + "step": 212380 + }, + { + "epoch": 8.8, + "grad_norm": 1.53125, + "learning_rate": 0.00040126988425075074, + "loss": 0.1427, + "step": 212390 + }, + { + "epoch": 8.8, + "grad_norm": 0.6171875, + "learning_rate": 0.00040126124954356786, + "loss": 0.2439, + "step": 212400 + }, + { + "epoch": 8.8, + "grad_norm": 0.408203125, + "learning_rate": 0.0004012526145517277, + "loss": 0.1957, + "step": 212410 + }, + { + "epoch": 8.8, + "grad_norm": 0.54296875, + "learning_rate": 0.00040124397927524636, + "loss": 0.1737, + "step": 212420 + }, + { + "epoch": 8.8, + "grad_norm": 0.95703125, + "learning_rate": 0.0004012353437141402, + "loss": 0.1967, + "step": 212430 + }, + { + "epoch": 8.8, + "grad_norm": 0.28125, + "learning_rate": 0.0004012267078684255, + "loss": 0.1783, + "step": 212440 + }, + { + "epoch": 8.8, + "grad_norm": 0.384765625, + "learning_rate": 0.00040121807173811854, + "loss": 0.2233, + "step": 212450 + }, + { + "epoch": 8.8, + "grad_norm": 0.7890625, + "learning_rate": 0.0004012094353232354, + "loss": 0.1746, + "step": 212460 + }, + { + "epoch": 8.8, + "grad_norm": 0.59765625, + "learning_rate": 0.0004012007986237924, + "loss": 0.1921, + "step": 212470 + }, + { + "epoch": 8.8, + "grad_norm": 0.84375, + "learning_rate": 0.00040119216163980595, + "loss": 0.1749, + "step": 212480 + }, + { + "epoch": 8.8, + "grad_norm": 0.478515625, + "learning_rate": 0.00040118352437129215, + "loss": 0.2189, + "step": 212490 + }, + { + "epoch": 8.8, + "grad_norm": 1.5390625, + "learning_rate": 0.00040117488681826717, + "loss": 0.2163, + "step": 212500 + }, + { + "epoch": 8.8, + "grad_norm": 0.7421875, + "learning_rate": 0.0004011662489807474, + "loss": 0.2368, + "step": 212510 + }, + { + "epoch": 8.8, + "grad_norm": 0.427734375, + "learning_rate": 0.00040115761085874913, + "loss": 0.1998, + "step": 212520 + }, + { + "epoch": 8.8, + "grad_norm": 0.89453125, + "learning_rate": 0.0004011489724522886, + "loss": 0.2385, + "step": 212530 + }, + { + "epoch": 8.8, + "grad_norm": 0.79296875, + "learning_rate": 0.000401140333761382, + "loss": 0.1867, + "step": 212540 + }, + { + "epoch": 8.8, + "grad_norm": 0.89453125, + "learning_rate": 0.0004011316947860456, + "loss": 0.2222, + "step": 212550 + }, + { + "epoch": 8.8, + "grad_norm": 0.6875, + "learning_rate": 0.00040112305552629567, + "loss": 0.2239, + "step": 212560 + }, + { + "epoch": 8.8, + "grad_norm": 0.291015625, + "learning_rate": 0.0004011144159821484, + "loss": 0.1591, + "step": 212570 + }, + { + "epoch": 8.81, + "grad_norm": 0.6015625, + "learning_rate": 0.00040110577615362023, + "loss": 0.2189, + "step": 212580 + }, + { + "epoch": 8.81, + "grad_norm": 0.8125, + "learning_rate": 0.0004010971360407273, + "loss": 0.2343, + "step": 212590 + }, + { + "epoch": 8.81, + "grad_norm": 0.58984375, + "learning_rate": 0.0004010884956434858, + "loss": 0.1918, + "step": 212600 + }, + { + "epoch": 8.81, + "grad_norm": 1.1015625, + "learning_rate": 0.0004010798549619121, + "loss": 0.1948, + "step": 212610 + }, + { + "epoch": 8.81, + "grad_norm": 0.48828125, + "learning_rate": 0.0004010712139960224, + "loss": 0.1796, + "step": 212620 + }, + { + "epoch": 8.81, + "grad_norm": 0.5703125, + "learning_rate": 0.000401062572745833, + "loss": 0.1546, + "step": 212630 + }, + { + "epoch": 8.81, + "grad_norm": 1.0546875, + "learning_rate": 0.00040105393121136026, + "loss": 0.2275, + "step": 212640 + }, + { + "epoch": 8.81, + "grad_norm": 0.3984375, + "learning_rate": 0.0004010452893926202, + "loss": 0.1925, + "step": 212650 + }, + { + "epoch": 8.81, + "grad_norm": 1.0234375, + "learning_rate": 0.0004010366472896293, + "loss": 0.2385, + "step": 212660 + }, + { + "epoch": 8.81, + "grad_norm": 1.125, + "learning_rate": 0.0004010280049024037, + "loss": 0.2027, + "step": 212670 + }, + { + "epoch": 8.81, + "grad_norm": 0.70703125, + "learning_rate": 0.0004010193622309597, + "loss": 0.2307, + "step": 212680 + }, + { + "epoch": 8.81, + "grad_norm": 0.3359375, + "learning_rate": 0.00040101071927531356, + "loss": 0.1815, + "step": 212690 + }, + { + "epoch": 8.81, + "grad_norm": 0.53515625, + "learning_rate": 0.0004010020760354816, + "loss": 0.2188, + "step": 212700 + }, + { + "epoch": 8.81, + "grad_norm": 0.671875, + "learning_rate": 0.00040099343251148, + "loss": 0.1757, + "step": 212710 + }, + { + "epoch": 8.81, + "grad_norm": 0.84375, + "learning_rate": 0.0004009847887033251, + "loss": 0.2333, + "step": 212720 + }, + { + "epoch": 8.81, + "grad_norm": 0.53125, + "learning_rate": 0.00040097614461103305, + "loss": 0.2385, + "step": 212730 + }, + { + "epoch": 8.81, + "grad_norm": 0.365234375, + "learning_rate": 0.0004009675002346203, + "loss": 0.2246, + "step": 212740 + }, + { + "epoch": 8.81, + "grad_norm": 0.2060546875, + "learning_rate": 0.000400958855574103, + "loss": 0.1418, + "step": 212750 + }, + { + "epoch": 8.81, + "grad_norm": 0.87890625, + "learning_rate": 0.0004009502106294974, + "loss": 0.1784, + "step": 212760 + }, + { + "epoch": 8.81, + "grad_norm": 0.52734375, + "learning_rate": 0.00040094156540081984, + "loss": 0.2144, + "step": 212770 + }, + { + "epoch": 8.81, + "grad_norm": 0.62890625, + "learning_rate": 0.0004009329198880865, + "loss": 0.1713, + "step": 212780 + }, + { + "epoch": 8.81, + "grad_norm": 0.91796875, + "learning_rate": 0.00040092427409131376, + "loss": 0.2052, + "step": 212790 + }, + { + "epoch": 8.81, + "grad_norm": 0.75390625, + "learning_rate": 0.00040091562801051784, + "loss": 0.2322, + "step": 212800 + }, + { + "epoch": 8.81, + "grad_norm": 1.2265625, + "learning_rate": 0.00040090698164571494, + "loss": 0.2322, + "step": 212810 + }, + { + "epoch": 8.81, + "grad_norm": 0.7109375, + "learning_rate": 0.00040089833499692143, + "loss": 0.1815, + "step": 212820 + }, + { + "epoch": 8.82, + "grad_norm": 0.578125, + "learning_rate": 0.0004008896880641536, + "loss": 0.1712, + "step": 212830 + }, + { + "epoch": 8.82, + "grad_norm": 0.34765625, + "learning_rate": 0.0004008810408474276, + "loss": 0.1985, + "step": 212840 + }, + { + "epoch": 8.82, + "grad_norm": 1.3203125, + "learning_rate": 0.00040087239334675977, + "loss": 0.1849, + "step": 212850 + }, + { + "epoch": 8.82, + "grad_norm": 0.83984375, + "learning_rate": 0.0004008637455621664, + "loss": 0.1964, + "step": 212860 + }, + { + "epoch": 8.82, + "grad_norm": 0.7265625, + "learning_rate": 0.0004008550974936638, + "loss": 0.2078, + "step": 212870 + }, + { + "epoch": 8.82, + "grad_norm": 1.203125, + "learning_rate": 0.00040084644914126824, + "loss": 0.1705, + "step": 212880 + }, + { + "epoch": 8.82, + "grad_norm": 1.09375, + "learning_rate": 0.0004008378005049958, + "loss": 0.1327, + "step": 212890 + }, + { + "epoch": 8.82, + "grad_norm": 0.81640625, + "learning_rate": 0.000400829151584863, + "loss": 0.1954, + "step": 212900 + }, + { + "epoch": 8.82, + "grad_norm": 0.6640625, + "learning_rate": 0.000400820502380886, + "loss": 0.1519, + "step": 212910 + }, + { + "epoch": 8.82, + "grad_norm": 0.55859375, + "learning_rate": 0.00040081185289308113, + "loss": 0.1682, + "step": 212920 + }, + { + "epoch": 8.82, + "grad_norm": 1.078125, + "learning_rate": 0.00040080320312146467, + "loss": 0.249, + "step": 212930 + }, + { + "epoch": 8.82, + "grad_norm": 0.458984375, + "learning_rate": 0.0004007945530660528, + "loss": 0.2246, + "step": 212940 + }, + { + "epoch": 8.82, + "grad_norm": 0.73828125, + "learning_rate": 0.00040078590272686187, + "loss": 0.1983, + "step": 212950 + }, + { + "epoch": 8.82, + "grad_norm": 1.1484375, + "learning_rate": 0.0004007772521039082, + "loss": 0.1929, + "step": 212960 + }, + { + "epoch": 8.82, + "grad_norm": 0.91015625, + "learning_rate": 0.000400768601197208, + "loss": 0.1981, + "step": 212970 + }, + { + "epoch": 8.82, + "grad_norm": 0.84765625, + "learning_rate": 0.00040075995000677755, + "loss": 0.195, + "step": 212980 + }, + { + "epoch": 8.82, + "grad_norm": 1.40625, + "learning_rate": 0.00040075129853263316, + "loss": 0.1959, + "step": 212990 + }, + { + "epoch": 8.82, + "grad_norm": 0.455078125, + "learning_rate": 0.00040074264677479116, + "loss": 0.2611, + "step": 213000 + }, + { + "epoch": 8.82, + "grad_norm": 0.94921875, + "learning_rate": 0.0004007339947332678, + "loss": 0.2187, + "step": 213010 + }, + { + "epoch": 8.82, + "grad_norm": 0.373046875, + "learning_rate": 0.0004007253424080792, + "loss": 0.1869, + "step": 213020 + }, + { + "epoch": 8.82, + "grad_norm": 1.078125, + "learning_rate": 0.0004007166897992419, + "loss": 0.2124, + "step": 213030 + }, + { + "epoch": 8.82, + "grad_norm": 0.81640625, + "learning_rate": 0.000400708036906772, + "loss": 0.2194, + "step": 213040 + }, + { + "epoch": 8.82, + "grad_norm": 0.439453125, + "learning_rate": 0.00040069938373068586, + "loss": 0.1596, + "step": 213050 + }, + { + "epoch": 8.82, + "grad_norm": 0.76171875, + "learning_rate": 0.0004006907302709998, + "loss": 0.1921, + "step": 213060 + }, + { + "epoch": 8.83, + "grad_norm": 0.494140625, + "learning_rate": 0.00040068207652773003, + "loss": 0.1876, + "step": 213070 + }, + { + "epoch": 8.83, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004006734225008929, + "loss": 0.1282, + "step": 213080 + }, + { + "epoch": 8.83, + "grad_norm": 1.3046875, + "learning_rate": 0.00040066476819050455, + "loss": 0.2053, + "step": 213090 + }, + { + "epoch": 8.83, + "grad_norm": 1.1875, + "learning_rate": 0.0004006561135965815, + "loss": 0.1877, + "step": 213100 + }, + { + "epoch": 8.83, + "grad_norm": 0.462890625, + "learning_rate": 0.0004006474587191399, + "loss": 0.1905, + "step": 213110 + }, + { + "epoch": 8.83, + "grad_norm": 0.53515625, + "learning_rate": 0.00040063880355819593, + "loss": 0.1871, + "step": 213120 + }, + { + "epoch": 8.83, + "grad_norm": 2.46875, + "learning_rate": 0.00040063014811376606, + "loss": 0.2421, + "step": 213130 + }, + { + "epoch": 8.83, + "grad_norm": 0.9765625, + "learning_rate": 0.00040062149238586666, + "loss": 0.1955, + "step": 213140 + }, + { + "epoch": 8.83, + "grad_norm": 1.515625, + "learning_rate": 0.00040061283637451376, + "loss": 0.1866, + "step": 213150 + }, + { + "epoch": 8.83, + "grad_norm": 0.6484375, + "learning_rate": 0.0004006041800797237, + "loss": 0.2222, + "step": 213160 + }, + { + "epoch": 8.83, + "grad_norm": 0.66015625, + "learning_rate": 0.000400595523501513, + "loss": 0.1882, + "step": 213170 + }, + { + "epoch": 8.83, + "grad_norm": 0.44140625, + "learning_rate": 0.0004005868666398976, + "loss": 0.171, + "step": 213180 + }, + { + "epoch": 8.83, + "grad_norm": 1.4765625, + "learning_rate": 0.0004005782094948941, + "loss": 0.1829, + "step": 213190 + }, + { + "epoch": 8.83, + "grad_norm": 2.328125, + "learning_rate": 0.00040056955206651856, + "loss": 0.2254, + "step": 213200 + }, + { + "epoch": 8.83, + "grad_norm": 1.2578125, + "learning_rate": 0.0004005608943547875, + "loss": 0.1913, + "step": 213210 + }, + { + "epoch": 8.83, + "grad_norm": 1.390625, + "learning_rate": 0.0004005522363597171, + "loss": 0.2175, + "step": 213220 + }, + { + "epoch": 8.83, + "grad_norm": 0.197265625, + "learning_rate": 0.0004005435780813236, + "loss": 0.2105, + "step": 213230 + }, + { + "epoch": 8.83, + "grad_norm": 1.375, + "learning_rate": 0.0004005349195196234, + "loss": 0.2239, + "step": 213240 + }, + { + "epoch": 8.83, + "grad_norm": 1.3828125, + "learning_rate": 0.00040052626067463266, + "loss": 0.1879, + "step": 213250 + }, + { + "epoch": 8.83, + "grad_norm": 0.3984375, + "learning_rate": 0.00040051760154636776, + "loss": 0.1653, + "step": 213260 + }, + { + "epoch": 8.83, + "grad_norm": 0.41796875, + "learning_rate": 0.000400508942134845, + "loss": 0.1911, + "step": 213270 + }, + { + "epoch": 8.83, + "grad_norm": 0.578125, + "learning_rate": 0.0004005002824400807, + "loss": 0.1404, + "step": 213280 + }, + { + "epoch": 8.83, + "grad_norm": 0.9765625, + "learning_rate": 0.0004004916224620911, + "loss": 0.1624, + "step": 213290 + }, + { + "epoch": 8.83, + "grad_norm": 0.404296875, + "learning_rate": 0.00040048296220089255, + "loss": 0.1641, + "step": 213300 + }, + { + "epoch": 8.84, + "grad_norm": 1.1796875, + "learning_rate": 0.0004004743016565012, + "loss": 0.1886, + "step": 213310 + }, + { + "epoch": 8.84, + "grad_norm": 0.640625, + "learning_rate": 0.0004004656408289336, + "loss": 0.1798, + "step": 213320 + }, + { + "epoch": 8.84, + "grad_norm": 0.63671875, + "learning_rate": 0.0004004569797182058, + "loss": 0.1738, + "step": 213330 + }, + { + "epoch": 8.84, + "grad_norm": 0.62890625, + "learning_rate": 0.0004004483183243343, + "loss": 0.1715, + "step": 213340 + }, + { + "epoch": 8.84, + "grad_norm": 0.5625, + "learning_rate": 0.0004004396566473353, + "loss": 0.1794, + "step": 213350 + }, + { + "epoch": 8.84, + "grad_norm": 0.9140625, + "learning_rate": 0.0004004309946872251, + "loss": 0.1977, + "step": 213360 + }, + { + "epoch": 8.84, + "grad_norm": 0.7265625, + "learning_rate": 0.00040042233244402006, + "loss": 0.2054, + "step": 213370 + }, + { + "epoch": 8.84, + "grad_norm": 0.9375, + "learning_rate": 0.0004004136699177363, + "loss": 0.1521, + "step": 213380 + }, + { + "epoch": 8.84, + "grad_norm": 1.25, + "learning_rate": 0.0004004050071083903, + "loss": 0.1971, + "step": 213390 + }, + { + "epoch": 8.84, + "grad_norm": 0.91015625, + "learning_rate": 0.00040039634401599843, + "loss": 0.1399, + "step": 213400 + }, + { + "epoch": 8.84, + "grad_norm": 0.33203125, + "learning_rate": 0.00040038768064057676, + "loss": 0.2016, + "step": 213410 + }, + { + "epoch": 8.84, + "grad_norm": 1.0625, + "learning_rate": 0.00040037901698214184, + "loss": 0.2371, + "step": 213420 + }, + { + "epoch": 8.84, + "grad_norm": 0.5546875, + "learning_rate": 0.0004003703530407098, + "loss": 0.2059, + "step": 213430 + }, + { + "epoch": 8.84, + "grad_norm": 0.466796875, + "learning_rate": 0.0004003616888162969, + "loss": 0.2103, + "step": 213440 + }, + { + "epoch": 8.84, + "grad_norm": 0.5234375, + "learning_rate": 0.0004003530243089196, + "loss": 0.2134, + "step": 213450 + }, + { + "epoch": 8.84, + "grad_norm": 0.61328125, + "learning_rate": 0.0004003443595185942, + "loss": 0.2009, + "step": 213460 + }, + { + "epoch": 8.84, + "grad_norm": 1.6015625, + "learning_rate": 0.0004003356944453369, + "loss": 0.1633, + "step": 213470 + }, + { + "epoch": 8.84, + "grad_norm": 0.9921875, + "learning_rate": 0.0004003270290891641, + "loss": 0.1801, + "step": 213480 + }, + { + "epoch": 8.84, + "grad_norm": 1.5390625, + "learning_rate": 0.000400318363450092, + "loss": 0.1905, + "step": 213490 + }, + { + "epoch": 8.84, + "grad_norm": 0.283203125, + "learning_rate": 0.0004003096975281369, + "loss": 0.204, + "step": 213500 + }, + { + "epoch": 8.84, + "grad_norm": 0.93359375, + "learning_rate": 0.0004003010313233154, + "loss": 0.1746, + "step": 213510 + }, + { + "epoch": 8.84, + "grad_norm": 1.09375, + "learning_rate": 0.0004002923648356435, + "loss": 0.2062, + "step": 213520 + }, + { + "epoch": 8.84, + "grad_norm": 0.66015625, + "learning_rate": 0.0004002836980651375, + "loss": 0.1843, + "step": 213530 + }, + { + "epoch": 8.84, + "grad_norm": 0.3359375, + "learning_rate": 0.00040027503101181384, + "loss": 0.2149, + "step": 213540 + }, + { + "epoch": 8.85, + "grad_norm": 0.765625, + "learning_rate": 0.00040026636367568893, + "loss": 0.1752, + "step": 213550 + }, + { + "epoch": 8.85, + "grad_norm": 0.6875, + "learning_rate": 0.00040025769605677884, + "loss": 0.1923, + "step": 213560 + }, + { + "epoch": 8.85, + "grad_norm": 0.65234375, + "learning_rate": 0.0004002490281551, + "loss": 0.1815, + "step": 213570 + }, + { + "epoch": 8.85, + "grad_norm": 0.828125, + "learning_rate": 0.0004002403599706688, + "loss": 0.1952, + "step": 213580 + }, + { + "epoch": 8.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00040023169150350135, + "loss": 0.2016, + "step": 213590 + }, + { + "epoch": 8.85, + "grad_norm": 0.58203125, + "learning_rate": 0.0004002230227536141, + "loss": 0.1804, + "step": 213600 + }, + { + "epoch": 8.85, + "grad_norm": 0.63671875, + "learning_rate": 0.00040021435372102337, + "loss": 0.287, + "step": 213610 + }, + { + "epoch": 8.85, + "grad_norm": 1.8515625, + "learning_rate": 0.00040020568440574545, + "loss": 0.1914, + "step": 213620 + }, + { + "epoch": 8.85, + "grad_norm": 1.1015625, + "learning_rate": 0.0004001970148077967, + "loss": 0.1512, + "step": 213630 + }, + { + "epoch": 8.85, + "grad_norm": 0.95703125, + "learning_rate": 0.00040018834492719336, + "loss": 0.2406, + "step": 213640 + }, + { + "epoch": 8.85, + "grad_norm": 0.75390625, + "learning_rate": 0.00040017967476395165, + "loss": 0.2551, + "step": 213650 + }, + { + "epoch": 8.85, + "grad_norm": 0.0, + "learning_rate": 0.00040017100431808815, + "loss": 0.2352, + "step": 213660 + }, + { + "epoch": 8.85, + "grad_norm": 1.0625, + "learning_rate": 0.000400162333589619, + "loss": 0.2349, + "step": 213670 + }, + { + "epoch": 8.85, + "grad_norm": 0.400390625, + "learning_rate": 0.0004001536625785606, + "loss": 0.2182, + "step": 213680 + }, + { + "epoch": 8.85, + "grad_norm": 0.41796875, + "learning_rate": 0.00040014499128492915, + "loss": 0.2367, + "step": 213690 + }, + { + "epoch": 8.85, + "grad_norm": 0.98046875, + "learning_rate": 0.000400136319708741, + "loss": 0.2228, + "step": 213700 + }, + { + "epoch": 8.85, + "grad_norm": 0.7265625, + "learning_rate": 0.00040012764785001264, + "loss": 0.1621, + "step": 213710 + }, + { + "epoch": 8.85, + "grad_norm": 0.87890625, + "learning_rate": 0.00040011897570876024, + "loss": 0.142, + "step": 213720 + }, + { + "epoch": 8.85, + "grad_norm": 1.703125, + "learning_rate": 0.00040011030328500005, + "loss": 0.2122, + "step": 213730 + }, + { + "epoch": 8.85, + "grad_norm": 1.3046875, + "learning_rate": 0.00040010163057874854, + "loss": 0.1833, + "step": 213740 + }, + { + "epoch": 8.85, + "grad_norm": 0.36328125, + "learning_rate": 0.00040009295759002193, + "loss": 0.2142, + "step": 213750 + }, + { + "epoch": 8.85, + "grad_norm": 0.8984375, + "learning_rate": 0.0004000842843188366, + "loss": 0.1798, + "step": 213760 + }, + { + "epoch": 8.85, + "grad_norm": 0.5703125, + "learning_rate": 0.0004000756107652089, + "loss": 0.2445, + "step": 213770 + }, + { + "epoch": 8.85, + "grad_norm": 0.515625, + "learning_rate": 0.00040006693692915506, + "loss": 0.1944, + "step": 213780 + }, + { + "epoch": 8.86, + "grad_norm": 0.458984375, + "learning_rate": 0.00040005826281069146, + "loss": 0.2036, + "step": 213790 + }, + { + "epoch": 8.86, + "grad_norm": 0.50390625, + "learning_rate": 0.0004000495884098344, + "loss": 0.2163, + "step": 213800 + }, + { + "epoch": 8.86, + "grad_norm": 0.451171875, + "learning_rate": 0.00040004091372660024, + "loss": 0.1977, + "step": 213810 + }, + { + "epoch": 8.86, + "grad_norm": 0.224609375, + "learning_rate": 0.00040003223876100525, + "loss": 0.1966, + "step": 213820 + }, + { + "epoch": 8.86, + "grad_norm": 1.2890625, + "learning_rate": 0.00040002356351306586, + "loss": 0.1975, + "step": 213830 + }, + { + "epoch": 8.86, + "grad_norm": 0.5078125, + "learning_rate": 0.00040001488798279826, + "loss": 0.1506, + "step": 213840 + }, + { + "epoch": 8.86, + "grad_norm": 0.578125, + "learning_rate": 0.00040000621217021895, + "loss": 0.1434, + "step": 213850 + }, + { + "epoch": 8.86, + "grad_norm": 0.578125, + "learning_rate": 0.00039999753607534397, + "loss": 0.2242, + "step": 213860 + }, + { + "epoch": 8.86, + "grad_norm": 0.98828125, + "learning_rate": 0.0003999888596981899, + "loss": 0.2012, + "step": 213870 + }, + { + "epoch": 8.86, + "grad_norm": 0.74609375, + "learning_rate": 0.000399980183038773, + "loss": 0.2274, + "step": 213880 + }, + { + "epoch": 8.86, + "grad_norm": 0.734375, + "learning_rate": 0.00039997150609710964, + "loss": 0.2064, + "step": 213890 + }, + { + "epoch": 8.86, + "grad_norm": 0.55078125, + "learning_rate": 0.00039996282887321607, + "loss": 0.2055, + "step": 213900 + }, + { + "epoch": 8.86, + "grad_norm": 1.0859375, + "learning_rate": 0.0003999541513671087, + "loss": 0.1959, + "step": 213910 + }, + { + "epoch": 8.86, + "grad_norm": 1.40625, + "learning_rate": 0.00039994547357880374, + "loss": 0.1892, + "step": 213920 + }, + { + "epoch": 8.86, + "grad_norm": 0.294921875, + "learning_rate": 0.00039993679550831757, + "loss": 0.1702, + "step": 213930 + }, + { + "epoch": 8.86, + "grad_norm": 0.6328125, + "learning_rate": 0.0003999281171556666, + "loss": 0.223, + "step": 213940 + }, + { + "epoch": 8.86, + "grad_norm": 1.1796875, + "learning_rate": 0.000399919438520867, + "loss": 0.1916, + "step": 213950 + }, + { + "epoch": 8.86, + "grad_norm": 0.875, + "learning_rate": 0.0003999107596039353, + "loss": 0.2119, + "step": 213960 + }, + { + "epoch": 8.86, + "grad_norm": 0.7109375, + "learning_rate": 0.0003999020804048877, + "loss": 0.18, + "step": 213970 + }, + { + "epoch": 8.86, + "grad_norm": 0.20703125, + "learning_rate": 0.0003998934009237407, + "loss": 0.1412, + "step": 213980 + }, + { + "epoch": 8.86, + "grad_norm": 0.62109375, + "learning_rate": 0.00039988472116051036, + "loss": 0.2067, + "step": 213990 + }, + { + "epoch": 8.86, + "grad_norm": 0.62890625, + "learning_rate": 0.00039987604111521325, + "loss": 0.1918, + "step": 214000 + }, + { + "epoch": 8.86, + "grad_norm": 0.859375, + "learning_rate": 0.00039986736078786555, + "loss": 0.1979, + "step": 214010 + }, + { + "epoch": 8.86, + "grad_norm": 0.796875, + "learning_rate": 0.00039985868017848364, + "loss": 0.204, + "step": 214020 + }, + { + "epoch": 8.87, + "grad_norm": 0.6875, + "learning_rate": 0.000399849999287084, + "loss": 0.1893, + "step": 214030 + }, + { + "epoch": 8.87, + "grad_norm": 0.5859375, + "learning_rate": 0.00039984131811368276, + "loss": 0.1653, + "step": 214040 + }, + { + "epoch": 8.87, + "grad_norm": 0.419921875, + "learning_rate": 0.00039983263665829626, + "loss": 0.1882, + "step": 214050 + }, + { + "epoch": 8.87, + "grad_norm": 0.59765625, + "learning_rate": 0.00039982395492094104, + "loss": 0.1874, + "step": 214060 + }, + { + "epoch": 8.87, + "grad_norm": 1.03125, + "learning_rate": 0.00039981527290163335, + "loss": 0.2001, + "step": 214070 + }, + { + "epoch": 8.87, + "grad_norm": 0.67578125, + "learning_rate": 0.0003998065906003894, + "loss": 0.2251, + "step": 214080 + }, + { + "epoch": 8.87, + "grad_norm": 0.26953125, + "learning_rate": 0.0003997979080172256, + "loss": 0.1533, + "step": 214090 + }, + { + "epoch": 8.87, + "grad_norm": 0.55078125, + "learning_rate": 0.00039978922515215837, + "loss": 0.1866, + "step": 214100 + }, + { + "epoch": 8.87, + "grad_norm": 0.5234375, + "learning_rate": 0.000399780542005204, + "loss": 0.1982, + "step": 214110 + }, + { + "epoch": 8.87, + "grad_norm": 0.451171875, + "learning_rate": 0.00039977185857637877, + "loss": 0.225, + "step": 214120 + }, + { + "epoch": 8.87, + "grad_norm": 0.76953125, + "learning_rate": 0.00039976317486569915, + "loss": 0.1842, + "step": 214130 + }, + { + "epoch": 8.87, + "grad_norm": 0.302734375, + "learning_rate": 0.0003997544908731814, + "loss": 0.1725, + "step": 214140 + }, + { + "epoch": 8.87, + "grad_norm": 0.490234375, + "learning_rate": 0.0003997458065988419, + "loss": 0.2049, + "step": 214150 + }, + { + "epoch": 8.87, + "grad_norm": 0.375, + "learning_rate": 0.00039973712204269686, + "loss": 0.2324, + "step": 214160 + }, + { + "epoch": 8.87, + "grad_norm": 0.58984375, + "learning_rate": 0.0003997284372047627, + "loss": 0.2354, + "step": 214170 + }, + { + "epoch": 8.87, + "grad_norm": 0.703125, + "learning_rate": 0.00039971975208505595, + "loss": 0.2204, + "step": 214180 + }, + { + "epoch": 8.87, + "grad_norm": 0.388671875, + "learning_rate": 0.0003997110666835927, + "loss": 0.2166, + "step": 214190 + }, + { + "epoch": 8.87, + "grad_norm": 0.703125, + "learning_rate": 0.0003997023810003894, + "loss": 0.2085, + "step": 214200 + }, + { + "epoch": 8.87, + "grad_norm": 1.03125, + "learning_rate": 0.00039969369503546235, + "loss": 0.1838, + "step": 214210 + }, + { + "epoch": 8.87, + "grad_norm": 1.109375, + "learning_rate": 0.000399685008788828, + "loss": 0.1857, + "step": 214220 + }, + { + "epoch": 8.87, + "grad_norm": 0.35546875, + "learning_rate": 0.0003996763222605026, + "loss": 0.182, + "step": 214230 + }, + { + "epoch": 8.87, + "grad_norm": 1.4375, + "learning_rate": 0.00039966763545050253, + "loss": 0.1838, + "step": 214240 + }, + { + "epoch": 8.87, + "grad_norm": 1.0234375, + "learning_rate": 0.0003996589483588441, + "loss": 0.202, + "step": 214250 + }, + { + "epoch": 8.87, + "grad_norm": 1.6796875, + "learning_rate": 0.00039965026098554376, + "loss": 0.2168, + "step": 214260 + }, + { + "epoch": 8.88, + "grad_norm": 0.8046875, + "learning_rate": 0.00039964157333061774, + "loss": 0.2308, + "step": 214270 + }, + { + "epoch": 8.88, + "grad_norm": 0.8671875, + "learning_rate": 0.00039963288539408246, + "loss": 0.1758, + "step": 214280 + }, + { + "epoch": 8.88, + "grad_norm": 0.38671875, + "learning_rate": 0.00039962419717595423, + "loss": 0.2087, + "step": 214290 + }, + { + "epoch": 8.88, + "grad_norm": 1.0859375, + "learning_rate": 0.0003996155086762494, + "loss": 0.2147, + "step": 214300 + }, + { + "epoch": 8.88, + "grad_norm": 0.474609375, + "learning_rate": 0.00039960681989498437, + "loss": 0.2379, + "step": 214310 + }, + { + "epoch": 8.88, + "grad_norm": 1.015625, + "learning_rate": 0.0003995981308321754, + "loss": 0.1778, + "step": 214320 + }, + { + "epoch": 8.88, + "grad_norm": 0.275390625, + "learning_rate": 0.000399589441487839, + "loss": 0.1276, + "step": 214330 + }, + { + "epoch": 8.88, + "grad_norm": 1.28125, + "learning_rate": 0.0003995807518619914, + "loss": 0.1841, + "step": 214340 + }, + { + "epoch": 8.88, + "grad_norm": 0.9375, + "learning_rate": 0.00039957206195464893, + "loss": 0.1665, + "step": 214350 + }, + { + "epoch": 8.88, + "grad_norm": 1.109375, + "learning_rate": 0.000399563371765828, + "loss": 0.2347, + "step": 214360 + }, + { + "epoch": 8.88, + "grad_norm": 0.734375, + "learning_rate": 0.00039955468129554503, + "loss": 0.1934, + "step": 214370 + }, + { + "epoch": 8.88, + "grad_norm": 0.71875, + "learning_rate": 0.00039954599054381625, + "loss": 0.1886, + "step": 214380 + }, + { + "epoch": 8.88, + "grad_norm": 0.515625, + "learning_rate": 0.0003995372995106581, + "loss": 0.2024, + "step": 214390 + }, + { + "epoch": 8.88, + "grad_norm": 0.5546875, + "learning_rate": 0.0003995286081960868, + "loss": 0.2614, + "step": 214400 + }, + { + "epoch": 8.88, + "grad_norm": 0.71484375, + "learning_rate": 0.00039951991660011887, + "loss": 0.2114, + "step": 214410 + }, + { + "epoch": 8.88, + "grad_norm": 0.466796875, + "learning_rate": 0.0003995112247227706, + "loss": 0.2325, + "step": 214420 + }, + { + "epoch": 8.88, + "grad_norm": 0.88671875, + "learning_rate": 0.0003995025325640583, + "loss": 0.1538, + "step": 214430 + }, + { + "epoch": 8.88, + "grad_norm": 0.59765625, + "learning_rate": 0.0003994938401239985, + "loss": 0.1974, + "step": 214440 + }, + { + "epoch": 8.88, + "grad_norm": 0.75, + "learning_rate": 0.00039948514740260736, + "loss": 0.2037, + "step": 214450 + }, + { + "epoch": 8.88, + "grad_norm": 1.0, + "learning_rate": 0.00039947645439990134, + "loss": 0.2065, + "step": 214460 + }, + { + "epoch": 8.88, + "grad_norm": 0.212890625, + "learning_rate": 0.0003994677611158968, + "loss": 0.2019, + "step": 214470 + }, + { + "epoch": 8.88, + "grad_norm": 0.69140625, + "learning_rate": 0.00039945906755061003, + "loss": 0.2542, + "step": 214480 + }, + { + "epoch": 8.88, + "grad_norm": 0.98828125, + "learning_rate": 0.0003994503737040574, + "loss": 0.2058, + "step": 214490 + }, + { + "epoch": 8.88, + "grad_norm": 0.70703125, + "learning_rate": 0.00039944167957625535, + "loss": 0.1821, + "step": 214500 + }, + { + "epoch": 8.88, + "grad_norm": 0.482421875, + "learning_rate": 0.0003994329851672202, + "loss": 0.1441, + "step": 214510 + }, + { + "epoch": 8.89, + "grad_norm": 0.52734375, + "learning_rate": 0.00039942429047696833, + "loss": 0.2019, + "step": 214520 + }, + { + "epoch": 8.89, + "grad_norm": 0.42578125, + "learning_rate": 0.000399415595505516, + "loss": 0.2411, + "step": 214530 + }, + { + "epoch": 8.89, + "grad_norm": 0.5078125, + "learning_rate": 0.0003994069002528797, + "loss": 0.1996, + "step": 214540 + }, + { + "epoch": 8.89, + "grad_norm": 1.046875, + "learning_rate": 0.00039939820471907587, + "loss": 0.1588, + "step": 214550 + }, + { + "epoch": 8.89, + "grad_norm": 0.68359375, + "learning_rate": 0.0003993895089041206, + "loss": 0.2362, + "step": 214560 + }, + { + "epoch": 8.89, + "grad_norm": 0.88671875, + "learning_rate": 0.0003993808128080304, + "loss": 0.2125, + "step": 214570 + }, + { + "epoch": 8.89, + "grad_norm": 0.82421875, + "learning_rate": 0.00039937211643082174, + "loss": 0.1936, + "step": 214580 + }, + { + "epoch": 8.89, + "grad_norm": 1.9453125, + "learning_rate": 0.00039936341977251077, + "loss": 0.1768, + "step": 214590 + }, + { + "epoch": 8.89, + "grad_norm": 0.6640625, + "learning_rate": 0.0003993547228331141, + "loss": 0.1789, + "step": 214600 + }, + { + "epoch": 8.89, + "grad_norm": 1.1484375, + "learning_rate": 0.0003993460256126479, + "loss": 0.1864, + "step": 214610 + }, + { + "epoch": 8.89, + "grad_norm": 0.859375, + "learning_rate": 0.0003993373281111286, + "loss": 0.2098, + "step": 214620 + }, + { + "epoch": 8.89, + "grad_norm": 0.466796875, + "learning_rate": 0.00039932863032857266, + "loss": 0.2092, + "step": 214630 + }, + { + "epoch": 8.89, + "grad_norm": 0.89453125, + "learning_rate": 0.00039931993226499624, + "loss": 0.2378, + "step": 214640 + }, + { + "epoch": 8.89, + "grad_norm": 0.80859375, + "learning_rate": 0.00039931123392041593, + "loss": 0.2619, + "step": 214650 + }, + { + "epoch": 8.89, + "grad_norm": 1.0234375, + "learning_rate": 0.0003993025352948479, + "loss": 0.2255, + "step": 214660 + }, + { + "epoch": 8.89, + "grad_norm": 0.65625, + "learning_rate": 0.00039929383638830876, + "loss": 0.1743, + "step": 214670 + }, + { + "epoch": 8.89, + "grad_norm": 4.46875, + "learning_rate": 0.0003992851372008146, + "loss": 0.2145, + "step": 214680 + }, + { + "epoch": 8.89, + "grad_norm": 0.390625, + "learning_rate": 0.00039927643773238203, + "loss": 0.2004, + "step": 214690 + }, + { + "epoch": 8.89, + "grad_norm": 0.671875, + "learning_rate": 0.0003992677379830273, + "loss": 0.1683, + "step": 214700 + }, + { + "epoch": 8.89, + "grad_norm": 0.98828125, + "learning_rate": 0.00039925903795276686, + "loss": 0.2105, + "step": 214710 + }, + { + "epoch": 8.89, + "grad_norm": 1.4140625, + "learning_rate": 0.0003992503376416169, + "loss": 0.2267, + "step": 214720 + }, + { + "epoch": 8.89, + "grad_norm": 2.0625, + "learning_rate": 0.00039924163704959406, + "loss": 0.1944, + "step": 214730 + }, + { + "epoch": 8.89, + "grad_norm": 0.96875, + "learning_rate": 0.00039923293617671445, + "loss": 0.2321, + "step": 214740 + }, + { + "epoch": 8.89, + "grad_norm": 1.0234375, + "learning_rate": 0.00039922423502299466, + "loss": 0.2123, + "step": 214750 + }, + { + "epoch": 8.9, + "grad_norm": 0.640625, + "learning_rate": 0.0003992155335884509, + "loss": 0.2378, + "step": 214760 + }, + { + "epoch": 8.9, + "grad_norm": 0.77734375, + "learning_rate": 0.0003992068318730997, + "loss": 0.2302, + "step": 214770 + }, + { + "epoch": 8.9, + "grad_norm": 0.546875, + "learning_rate": 0.00039919812987695725, + "loss": 0.1537, + "step": 214780 + }, + { + "epoch": 8.9, + "grad_norm": 0.6640625, + "learning_rate": 0.00039918942760004016, + "loss": 0.2259, + "step": 214790 + }, + { + "epoch": 8.9, + "grad_norm": 0.498046875, + "learning_rate": 0.0003991807250423646, + "loss": 0.1654, + "step": 214800 + }, + { + "epoch": 8.9, + "grad_norm": 0.8515625, + "learning_rate": 0.00039917202220394706, + "loss": 0.1659, + "step": 214810 + }, + { + "epoch": 8.9, + "grad_norm": 0.51171875, + "learning_rate": 0.00039916331908480384, + "loss": 0.1732, + "step": 214820 + }, + { + "epoch": 8.9, + "grad_norm": 0.5703125, + "learning_rate": 0.00039915461568495135, + "loss": 0.1874, + "step": 214830 + }, + { + "epoch": 8.9, + "grad_norm": 0.30078125, + "learning_rate": 0.0003991459120044061, + "loss": 0.1829, + "step": 214840 + }, + { + "epoch": 8.9, + "grad_norm": 1.3125, + "learning_rate": 0.0003991372080431842, + "loss": 0.2451, + "step": 214850 + }, + { + "epoch": 8.9, + "grad_norm": 0.68359375, + "learning_rate": 0.0003991285038013023, + "loss": 0.2203, + "step": 214860 + }, + { + "epoch": 8.9, + "grad_norm": 0.8359375, + "learning_rate": 0.0003991197992787766, + "loss": 0.1954, + "step": 214870 + }, + { + "epoch": 8.9, + "grad_norm": 0.69140625, + "learning_rate": 0.00039911109447562357, + "loss": 0.1964, + "step": 214880 + }, + { + "epoch": 8.9, + "grad_norm": 0.5703125, + "learning_rate": 0.00039910238939185955, + "loss": 0.2176, + "step": 214890 + }, + { + "epoch": 8.9, + "grad_norm": 1.6171875, + "learning_rate": 0.0003990936840275009, + "loss": 0.1533, + "step": 214900 + }, + { + "epoch": 8.9, + "grad_norm": 1.0390625, + "learning_rate": 0.00039908497838256406, + "loss": 0.1785, + "step": 214910 + }, + { + "epoch": 8.9, + "grad_norm": 1.546875, + "learning_rate": 0.0003990762724570655, + "loss": 0.1893, + "step": 214920 + }, + { + "epoch": 8.9, + "grad_norm": 0.671875, + "learning_rate": 0.0003990675662510213, + "loss": 0.1772, + "step": 214930 + }, + { + "epoch": 8.9, + "grad_norm": 0.306640625, + "learning_rate": 0.0003990588597644481, + "loss": 0.199, + "step": 214940 + }, + { + "epoch": 8.9, + "grad_norm": 0.263671875, + "learning_rate": 0.0003990501529973623, + "loss": 0.2466, + "step": 214950 + }, + { + "epoch": 8.9, + "grad_norm": 0.5390625, + "learning_rate": 0.00039904144594978013, + "loss": 0.1754, + "step": 214960 + }, + { + "epoch": 8.9, + "grad_norm": 0.8125, + "learning_rate": 0.0003990327386217181, + "loss": 0.1551, + "step": 214970 + }, + { + "epoch": 8.9, + "grad_norm": 0.64453125, + "learning_rate": 0.00039902403101319255, + "loss": 0.1923, + "step": 214980 + }, + { + "epoch": 8.9, + "grad_norm": 1.515625, + "learning_rate": 0.00039901532312421983, + "loss": 0.1894, + "step": 214990 + }, + { + "epoch": 8.91, + "grad_norm": 0.70703125, + "learning_rate": 0.0003990066149548164, + "loss": 0.1868, + "step": 215000 + }, + { + "epoch": 8.91, + "grad_norm": 0.9921875, + "learning_rate": 0.00039899790650499856, + "loss": 0.1632, + "step": 215010 + }, + { + "epoch": 8.91, + "grad_norm": 1.4140625, + "learning_rate": 0.0003989891977747828, + "loss": 0.1839, + "step": 215020 + }, + { + "epoch": 8.91, + "grad_norm": 0.7265625, + "learning_rate": 0.0003989804887641855, + "loss": 0.198, + "step": 215030 + }, + { + "epoch": 8.91, + "grad_norm": 0.44140625, + "learning_rate": 0.0003989717794732229, + "loss": 0.1813, + "step": 215040 + }, + { + "epoch": 8.91, + "grad_norm": 0.78125, + "learning_rate": 0.0003989630699019116, + "loss": 0.2094, + "step": 215050 + }, + { + "epoch": 8.91, + "grad_norm": 0.96875, + "learning_rate": 0.00039895436005026784, + "loss": 0.2118, + "step": 215060 + }, + { + "epoch": 8.91, + "grad_norm": 0.6953125, + "learning_rate": 0.0003989456499183081, + "loss": 0.2204, + "step": 215070 + }, + { + "epoch": 8.91, + "grad_norm": 0.99609375, + "learning_rate": 0.0003989369395060487, + "loss": 0.1879, + "step": 215080 + }, + { + "epoch": 8.91, + "grad_norm": 0.5078125, + "learning_rate": 0.000398928228813506, + "loss": 0.226, + "step": 215090 + }, + { + "epoch": 8.91, + "grad_norm": 0.96484375, + "learning_rate": 0.0003989195178406965, + "loss": 0.2411, + "step": 215100 + }, + { + "epoch": 8.91, + "grad_norm": 0.5234375, + "learning_rate": 0.00039891080658763657, + "loss": 0.1885, + "step": 215110 + }, + { + "epoch": 8.91, + "grad_norm": 0.8984375, + "learning_rate": 0.0003989020950543426, + "loss": 0.1758, + "step": 215120 + }, + { + "epoch": 8.91, + "grad_norm": 0.58203125, + "learning_rate": 0.00039889338324083093, + "loss": 0.2078, + "step": 215130 + }, + { + "epoch": 8.91, + "grad_norm": 0.89453125, + "learning_rate": 0.00039888467114711804, + "loss": 0.1899, + "step": 215140 + }, + { + "epoch": 8.91, + "grad_norm": 0.85546875, + "learning_rate": 0.00039887595877322025, + "loss": 0.1715, + "step": 215150 + }, + { + "epoch": 8.91, + "grad_norm": 0.6640625, + "learning_rate": 0.00039886724611915393, + "loss": 0.1635, + "step": 215160 + }, + { + "epoch": 8.91, + "grad_norm": 0.28515625, + "learning_rate": 0.0003988585331849356, + "loss": 0.1866, + "step": 215170 + }, + { + "epoch": 8.91, + "grad_norm": 0.6875, + "learning_rate": 0.00039884981997058156, + "loss": 0.2322, + "step": 215180 + }, + { + "epoch": 8.91, + "grad_norm": 0.462890625, + "learning_rate": 0.00039884110647610824, + "loss": 0.1743, + "step": 215190 + }, + { + "epoch": 8.91, + "grad_norm": 0.8046875, + "learning_rate": 0.00039883239270153205, + "loss": 0.2268, + "step": 215200 + }, + { + "epoch": 8.91, + "grad_norm": 0.53515625, + "learning_rate": 0.0003988236786468693, + "loss": 0.2185, + "step": 215210 + }, + { + "epoch": 8.91, + "grad_norm": 0.498046875, + "learning_rate": 0.0003988149643121365, + "loss": 0.2384, + "step": 215220 + }, + { + "epoch": 8.91, + "grad_norm": 0.66015625, + "learning_rate": 0.00039880624969735, + "loss": 0.2069, + "step": 215230 + }, + { + "epoch": 8.92, + "grad_norm": 0.333984375, + "learning_rate": 0.0003987975348025262, + "loss": 0.229, + "step": 215240 + }, + { + "epoch": 8.92, + "grad_norm": 0.55859375, + "learning_rate": 0.00039878881962768155, + "loss": 0.1938, + "step": 215250 + }, + { + "epoch": 8.92, + "grad_norm": 0.5, + "learning_rate": 0.0003987801041728324, + "loss": 0.1911, + "step": 215260 + }, + { + "epoch": 8.92, + "grad_norm": 0.46484375, + "learning_rate": 0.0003987713884379951, + "loss": 0.1744, + "step": 215270 + }, + { + "epoch": 8.92, + "grad_norm": 1.0703125, + "learning_rate": 0.0003987626724231862, + "loss": 0.1997, + "step": 215280 + }, + { + "epoch": 8.92, + "grad_norm": 0.8515625, + "learning_rate": 0.00039875395612842205, + "loss": 0.1967, + "step": 215290 + }, + { + "epoch": 8.92, + "grad_norm": 1.1796875, + "learning_rate": 0.00039874523955371887, + "loss": 0.1693, + "step": 215300 + }, + { + "epoch": 8.92, + "grad_norm": 0.3203125, + "learning_rate": 0.00039873652269909333, + "loss": 0.1968, + "step": 215310 + }, + { + "epoch": 8.92, + "grad_norm": 0.396484375, + "learning_rate": 0.00039872780556456165, + "loss": 0.1232, + "step": 215320 + }, + { + "epoch": 8.92, + "grad_norm": 0.2490234375, + "learning_rate": 0.0003987190881501403, + "loss": 0.1911, + "step": 215330 + }, + { + "epoch": 8.92, + "grad_norm": 0.95703125, + "learning_rate": 0.00039871037045584567, + "loss": 0.1886, + "step": 215340 + }, + { + "epoch": 8.92, + "grad_norm": 0.34375, + "learning_rate": 0.0003987016524816943, + "loss": 0.1674, + "step": 215350 + }, + { + "epoch": 8.92, + "grad_norm": 0.6953125, + "learning_rate": 0.0003986929342277024, + "loss": 0.2256, + "step": 215360 + }, + { + "epoch": 8.92, + "grad_norm": 0.45703125, + "learning_rate": 0.0003986842156938865, + "loss": 0.1823, + "step": 215370 + }, + { + "epoch": 8.92, + "grad_norm": 0.296875, + "learning_rate": 0.0003986754968802628, + "loss": 0.2426, + "step": 215380 + }, + { + "epoch": 8.92, + "grad_norm": 1.3828125, + "learning_rate": 0.000398666777786848, + "loss": 0.1434, + "step": 215390 + }, + { + "epoch": 8.92, + "grad_norm": 0.83203125, + "learning_rate": 0.0003986580584136584, + "loss": 0.1983, + "step": 215400 + }, + { + "epoch": 8.92, + "grad_norm": 0.56640625, + "learning_rate": 0.00039864933876071034, + "loss": 0.1792, + "step": 215410 + }, + { + "epoch": 8.92, + "grad_norm": 0.494140625, + "learning_rate": 0.00039864061882802026, + "loss": 0.2179, + "step": 215420 + }, + { + "epoch": 8.92, + "grad_norm": 0.6875, + "learning_rate": 0.00039863189861560466, + "loss": 0.2445, + "step": 215430 + }, + { + "epoch": 8.92, + "grad_norm": 1.125, + "learning_rate": 0.0003986231781234798, + "loss": 0.2128, + "step": 215440 + }, + { + "epoch": 8.92, + "grad_norm": 0.51171875, + "learning_rate": 0.00039861445735166223, + "loss": 0.1991, + "step": 215450 + }, + { + "epoch": 8.92, + "grad_norm": 0.80859375, + "learning_rate": 0.00039860573630016825, + "loss": 0.1715, + "step": 215460 + }, + { + "epoch": 8.92, + "grad_norm": 0.61328125, + "learning_rate": 0.0003985970149690144, + "loss": 0.2066, + "step": 215470 + }, + { + "epoch": 8.93, + "grad_norm": 0.94921875, + "learning_rate": 0.00039858829335821696, + "loss": 0.2, + "step": 215480 + }, + { + "epoch": 8.93, + "grad_norm": 1.1796875, + "learning_rate": 0.0003985795714677924, + "loss": 0.1456, + "step": 215490 + }, + { + "epoch": 8.93, + "grad_norm": 0.75390625, + "learning_rate": 0.0003985708492977571, + "loss": 0.178, + "step": 215500 + }, + { + "epoch": 8.93, + "grad_norm": 0.59765625, + "learning_rate": 0.0003985621268481275, + "loss": 0.2709, + "step": 215510 + }, + { + "epoch": 8.93, + "grad_norm": 0.6796875, + "learning_rate": 0.00039855340411892014, + "loss": 0.2361, + "step": 215520 + }, + { + "epoch": 8.93, + "grad_norm": 1.0078125, + "learning_rate": 0.0003985446811101512, + "loss": 0.2305, + "step": 215530 + }, + { + "epoch": 8.93, + "grad_norm": 1.359375, + "learning_rate": 0.00039853595782183724, + "loss": 0.1907, + "step": 215540 + }, + { + "epoch": 8.93, + "grad_norm": 0.447265625, + "learning_rate": 0.00039852723425399476, + "loss": 0.1738, + "step": 215550 + }, + { + "epoch": 8.93, + "grad_norm": 0.515625, + "learning_rate": 0.0003985185104066399, + "loss": 0.1761, + "step": 215560 + }, + { + "epoch": 8.93, + "grad_norm": 0.6640625, + "learning_rate": 0.0003985097862797893, + "loss": 0.1916, + "step": 215570 + }, + { + "epoch": 8.93, + "grad_norm": 0.72265625, + "learning_rate": 0.00039850106187345937, + "loss": 0.2029, + "step": 215580 + }, + { + "epoch": 8.93, + "grad_norm": 0.63671875, + "learning_rate": 0.00039849233718766637, + "loss": 0.2046, + "step": 215590 + }, + { + "epoch": 8.93, + "grad_norm": 0.8984375, + "learning_rate": 0.00039848361222242693, + "loss": 0.1884, + "step": 215600 + }, + { + "epoch": 8.93, + "grad_norm": 1.3359375, + "learning_rate": 0.00039847488697775725, + "loss": 0.213, + "step": 215610 + }, + { + "epoch": 8.93, + "grad_norm": 0.82421875, + "learning_rate": 0.000398466161453674, + "loss": 0.2239, + "step": 215620 + }, + { + "epoch": 8.93, + "grad_norm": 0.5546875, + "learning_rate": 0.00039845743565019345, + "loss": 0.2039, + "step": 215630 + }, + { + "epoch": 8.93, + "grad_norm": 0.76171875, + "learning_rate": 0.0003984487095673319, + "loss": 0.1458, + "step": 215640 + }, + { + "epoch": 8.93, + "grad_norm": 0.29296875, + "learning_rate": 0.000398439983205106, + "loss": 0.136, + "step": 215650 + }, + { + "epoch": 8.93, + "grad_norm": 0.96484375, + "learning_rate": 0.0003984312565635321, + "loss": 0.1705, + "step": 215660 + }, + { + "epoch": 8.93, + "grad_norm": 1.046875, + "learning_rate": 0.0003984225296426266, + "loss": 0.2092, + "step": 215670 + }, + { + "epoch": 8.93, + "grad_norm": 1.28125, + "learning_rate": 0.00039841380244240595, + "loss": 0.1968, + "step": 215680 + }, + { + "epoch": 8.93, + "grad_norm": 0.55859375, + "learning_rate": 0.0003984050749628865, + "loss": 0.1877, + "step": 215690 + }, + { + "epoch": 8.93, + "grad_norm": 2.015625, + "learning_rate": 0.00039839634720408474, + "loss": 0.1858, + "step": 215700 + }, + { + "epoch": 8.93, + "grad_norm": 0.80078125, + "learning_rate": 0.00039838761916601706, + "loss": 0.1965, + "step": 215710 + }, + { + "epoch": 8.94, + "grad_norm": 1.5078125, + "learning_rate": 0.00039837889084869994, + "loss": 0.2094, + "step": 215720 + }, + { + "epoch": 8.94, + "grad_norm": 0.88671875, + "learning_rate": 0.0003983701622521498, + "loss": 0.1847, + "step": 215730 + }, + { + "epoch": 8.94, + "grad_norm": 0.4765625, + "learning_rate": 0.000398361433376383, + "loss": 0.1798, + "step": 215740 + }, + { + "epoch": 8.94, + "grad_norm": 0.95703125, + "learning_rate": 0.00039835270422141603, + "loss": 0.2314, + "step": 215750 + }, + { + "epoch": 8.94, + "grad_norm": 1.0078125, + "learning_rate": 0.00039834397478726523, + "loss": 0.1933, + "step": 215760 + }, + { + "epoch": 8.94, + "grad_norm": 1.0859375, + "learning_rate": 0.00039833524507394707, + "loss": 0.2377, + "step": 215770 + }, + { + "epoch": 8.94, + "grad_norm": 0.59375, + "learning_rate": 0.00039832651508147813, + "loss": 0.2392, + "step": 215780 + }, + { + "epoch": 8.94, + "grad_norm": 0.24609375, + "learning_rate": 0.00039831778480987456, + "loss": 0.1399, + "step": 215790 + }, + { + "epoch": 8.94, + "grad_norm": 0.5234375, + "learning_rate": 0.00039830905425915306, + "loss": 0.2317, + "step": 215800 + }, + { + "epoch": 8.94, + "grad_norm": 0.34375, + "learning_rate": 0.00039830032342932985, + "loss": 0.2155, + "step": 215810 + }, + { + "epoch": 8.94, + "grad_norm": 1.015625, + "learning_rate": 0.00039829159232042144, + "loss": 0.1984, + "step": 215820 + }, + { + "epoch": 8.94, + "grad_norm": 0.796875, + "learning_rate": 0.0003982828609324444, + "loss": 0.1829, + "step": 215830 + }, + { + "epoch": 8.94, + "grad_norm": 0.00390625, + "learning_rate": 0.0003982741292654149, + "loss": 0.1822, + "step": 215840 + }, + { + "epoch": 8.94, + "grad_norm": 0.87109375, + "learning_rate": 0.00039826539731934954, + "loss": 0.2003, + "step": 215850 + }, + { + "epoch": 8.94, + "grad_norm": 0.416015625, + "learning_rate": 0.00039825666509426465, + "loss": 0.1804, + "step": 215860 + }, + { + "epoch": 8.94, + "grad_norm": 0.0, + "learning_rate": 0.00039824793259017675, + "loss": 0.2059, + "step": 215870 + }, + { + "epoch": 8.94, + "grad_norm": 0.408203125, + "learning_rate": 0.00039823919980710233, + "loss": 0.1815, + "step": 215880 + }, + { + "epoch": 8.94, + "grad_norm": 0.58984375, + "learning_rate": 0.0003982304667450577, + "loss": 0.1604, + "step": 215890 + }, + { + "epoch": 8.94, + "grad_norm": 0.92578125, + "learning_rate": 0.0003982217334040593, + "loss": 0.1814, + "step": 215900 + }, + { + "epoch": 8.94, + "grad_norm": 0.66796875, + "learning_rate": 0.00039821299978412366, + "loss": 0.1741, + "step": 215910 + }, + { + "epoch": 8.94, + "grad_norm": 0.80078125, + "learning_rate": 0.00039820426588526713, + "loss": 0.1852, + "step": 215920 + }, + { + "epoch": 8.94, + "grad_norm": 0.64453125, + "learning_rate": 0.0003981955317075062, + "loss": 0.2165, + "step": 215930 + }, + { + "epoch": 8.94, + "grad_norm": 0.640625, + "learning_rate": 0.00039818679725085726, + "loss": 0.187, + "step": 215940 + }, + { + "epoch": 8.94, + "grad_norm": 0.69140625, + "learning_rate": 0.00039817806251533683, + "loss": 0.1869, + "step": 215950 + }, + { + "epoch": 8.95, + "grad_norm": 1.0078125, + "learning_rate": 0.0003981693275009612, + "loss": 0.2033, + "step": 215960 + }, + { + "epoch": 8.95, + "grad_norm": 0.8828125, + "learning_rate": 0.000398160592207747, + "loss": 0.1996, + "step": 215970 + }, + { + "epoch": 8.95, + "grad_norm": 1.1015625, + "learning_rate": 0.00039815185663571046, + "loss": 0.2083, + "step": 215980 + }, + { + "epoch": 8.95, + "grad_norm": 1.3125, + "learning_rate": 0.00039814312078486816, + "loss": 0.182, + "step": 215990 + }, + { + "epoch": 8.95, + "grad_norm": 0.875, + "learning_rate": 0.00039813438465523656, + "loss": 0.2179, + "step": 216000 + }, + { + "epoch": 8.95, + "grad_norm": 0.69921875, + "learning_rate": 0.00039812564824683196, + "loss": 0.2127, + "step": 216010 + }, + { + "epoch": 8.95, + "grad_norm": 1.0234375, + "learning_rate": 0.0003981169115596709, + "loss": 0.199, + "step": 216020 + }, + { + "epoch": 8.95, + "grad_norm": 0.0, + "learning_rate": 0.0003981081745937698, + "loss": 0.2209, + "step": 216030 + }, + { + "epoch": 8.95, + "grad_norm": 0.68359375, + "learning_rate": 0.0003980994373491452, + "loss": 0.2164, + "step": 216040 + }, + { + "epoch": 8.95, + "grad_norm": 1.2421875, + "learning_rate": 0.0003980906998258134, + "loss": 0.1792, + "step": 216050 + }, + { + "epoch": 8.95, + "grad_norm": 0.83203125, + "learning_rate": 0.00039808196202379087, + "loss": 0.179, + "step": 216060 + }, + { + "epoch": 8.95, + "grad_norm": 0.61328125, + "learning_rate": 0.0003980732239430941, + "loss": 0.2132, + "step": 216070 + }, + { + "epoch": 8.95, + "grad_norm": 0.82421875, + "learning_rate": 0.0003980644855837395, + "loss": 0.1748, + "step": 216080 + }, + { + "epoch": 8.95, + "grad_norm": 0.58203125, + "learning_rate": 0.00039805574694574356, + "loss": 0.1568, + "step": 216090 + }, + { + "epoch": 8.95, + "grad_norm": 1.453125, + "learning_rate": 0.0003980470080291226, + "loss": 0.2007, + "step": 216100 + }, + { + "epoch": 8.95, + "grad_norm": 1.09375, + "learning_rate": 0.00039803826883389327, + "loss": 0.1794, + "step": 216110 + }, + { + "epoch": 8.95, + "grad_norm": 0.57421875, + "learning_rate": 0.0003980295293600719, + "loss": 0.2081, + "step": 216120 + }, + { + "epoch": 8.95, + "grad_norm": 0.59375, + "learning_rate": 0.00039802078960767483, + "loss": 0.2017, + "step": 216130 + }, + { + "epoch": 8.95, + "grad_norm": 2.8125, + "learning_rate": 0.0003980120495767187, + "loss": 0.1944, + "step": 216140 + }, + { + "epoch": 8.95, + "grad_norm": 1.40625, + "learning_rate": 0.00039800330926721987, + "loss": 0.2325, + "step": 216150 + }, + { + "epoch": 8.95, + "grad_norm": 1.3515625, + "learning_rate": 0.0003979945686791948, + "loss": 0.1931, + "step": 216160 + }, + { + "epoch": 8.95, + "grad_norm": 1.0, + "learning_rate": 0.0003979858278126599, + "loss": 0.178, + "step": 216170 + }, + { + "epoch": 8.95, + "grad_norm": 4.625, + "learning_rate": 0.00039797708666763177, + "loss": 0.2305, + "step": 216180 + }, + { + "epoch": 8.95, + "grad_norm": 0.58984375, + "learning_rate": 0.0003979683452441266, + "loss": 0.1755, + "step": 216190 + }, + { + "epoch": 8.95, + "grad_norm": 0.75, + "learning_rate": 0.00039795960354216105, + "loss": 0.2235, + "step": 216200 + }, + { + "epoch": 8.96, + "grad_norm": 0.453125, + "learning_rate": 0.0003979508615617515, + "loss": 0.2072, + "step": 216210 + }, + { + "epoch": 8.96, + "grad_norm": 0.4375, + "learning_rate": 0.00039794211930291437, + "loss": 0.1633, + "step": 216220 + }, + { + "epoch": 8.96, + "grad_norm": 0.78515625, + "learning_rate": 0.0003979333767656662, + "loss": 0.2, + "step": 216230 + }, + { + "epoch": 8.96, + "grad_norm": 1.0, + "learning_rate": 0.0003979246339500233, + "loss": 0.2209, + "step": 216240 + }, + { + "epoch": 8.96, + "grad_norm": 0.75390625, + "learning_rate": 0.00039791589085600234, + "loss": 0.1986, + "step": 216250 + }, + { + "epoch": 8.96, + "grad_norm": 1.0546875, + "learning_rate": 0.0003979071474836196, + "loss": 0.1711, + "step": 216260 + }, + { + "epoch": 8.96, + "grad_norm": 0.53515625, + "learning_rate": 0.00039789840383289154, + "loss": 0.1973, + "step": 216270 + }, + { + "epoch": 8.96, + "grad_norm": 0.275390625, + "learning_rate": 0.00039788965990383477, + "loss": 0.1822, + "step": 216280 + }, + { + "epoch": 8.96, + "grad_norm": 0.47265625, + "learning_rate": 0.0003978809156964655, + "loss": 0.2343, + "step": 216290 + }, + { + "epoch": 8.96, + "grad_norm": 0.55859375, + "learning_rate": 0.0003978721712108003, + "loss": 0.1733, + "step": 216300 + }, + { + "epoch": 8.96, + "grad_norm": 0.578125, + "learning_rate": 0.00039786342644685573, + "loss": 0.193, + "step": 216310 + }, + { + "epoch": 8.96, + "grad_norm": 0.228515625, + "learning_rate": 0.0003978546814046481, + "loss": 0.1634, + "step": 216320 + }, + { + "epoch": 8.96, + "grad_norm": 0.5234375, + "learning_rate": 0.000397845936084194, + "loss": 0.1493, + "step": 216330 + }, + { + "epoch": 8.96, + "grad_norm": 0.8203125, + "learning_rate": 0.0003978371904855098, + "loss": 0.2233, + "step": 216340 + }, + { + "epoch": 8.96, + "grad_norm": 0.578125, + "learning_rate": 0.0003978284446086119, + "loss": 0.1826, + "step": 216350 + }, + { + "epoch": 8.96, + "grad_norm": 0.9453125, + "learning_rate": 0.0003978196984535169, + "loss": 0.236, + "step": 216360 + }, + { + "epoch": 8.96, + "grad_norm": 0.333984375, + "learning_rate": 0.00039781095202024114, + "loss": 0.1784, + "step": 216370 + }, + { + "epoch": 8.96, + "grad_norm": 0.9140625, + "learning_rate": 0.00039780220530880115, + "loss": 0.2051, + "step": 216380 + }, + { + "epoch": 8.96, + "grad_norm": 0.1982421875, + "learning_rate": 0.00039779345831921344, + "loss": 0.1807, + "step": 216390 + }, + { + "epoch": 8.96, + "grad_norm": 1.171875, + "learning_rate": 0.00039778471105149425, + "loss": 0.2058, + "step": 216400 + }, + { + "epoch": 8.96, + "grad_norm": 0.5625, + "learning_rate": 0.00039777596350566035, + "loss": 0.2397, + "step": 216410 + }, + { + "epoch": 8.96, + "grad_norm": 1.4609375, + "learning_rate": 0.000397767215681728, + "loss": 0.2173, + "step": 216420 + }, + { + "epoch": 8.96, + "grad_norm": 1.015625, + "learning_rate": 0.00039775846757971367, + "loss": 0.2335, + "step": 216430 + }, + { + "epoch": 8.96, + "grad_norm": 0.8125, + "learning_rate": 0.00039774971919963385, + "loss": 0.2769, + "step": 216440 + }, + { + "epoch": 8.97, + "grad_norm": 0.5078125, + "learning_rate": 0.000397740970541505, + "loss": 0.2105, + "step": 216450 + }, + { + "epoch": 8.97, + "grad_norm": 0.43359375, + "learning_rate": 0.0003977322216053437, + "loss": 0.2381, + "step": 216460 + }, + { + "epoch": 8.97, + "grad_norm": 0.83984375, + "learning_rate": 0.0003977234723911662, + "loss": 0.1657, + "step": 216470 + }, + { + "epoch": 8.97, + "grad_norm": 0.55859375, + "learning_rate": 0.0003977147228989891, + "loss": 0.2499, + "step": 216480 + }, + { + "epoch": 8.97, + "grad_norm": 0.64453125, + "learning_rate": 0.00039770597312882897, + "loss": 0.2015, + "step": 216490 + }, + { + "epoch": 8.97, + "grad_norm": 1.0625, + "learning_rate": 0.00039769722308070205, + "loss": 0.2275, + "step": 216500 + }, + { + "epoch": 8.97, + "grad_norm": 1.15625, + "learning_rate": 0.00039768847275462493, + "loss": 0.2201, + "step": 216510 + }, + { + "epoch": 8.97, + "grad_norm": 0.82421875, + "learning_rate": 0.000397679722150614, + "loss": 0.2532, + "step": 216520 + }, + { + "epoch": 8.97, + "grad_norm": 1.359375, + "learning_rate": 0.0003976709712686858, + "loss": 0.1683, + "step": 216530 + }, + { + "epoch": 8.97, + "grad_norm": 0.84375, + "learning_rate": 0.0003976622201088568, + "loss": 0.1966, + "step": 216540 + }, + { + "epoch": 8.97, + "grad_norm": 0.6484375, + "learning_rate": 0.0003976534686711435, + "loss": 0.1644, + "step": 216550 + }, + { + "epoch": 8.97, + "grad_norm": 1.0390625, + "learning_rate": 0.0003976447169555622, + "loss": 0.1994, + "step": 216560 + }, + { + "epoch": 8.97, + "grad_norm": 0.7265625, + "learning_rate": 0.00039763596496212954, + "loss": 0.191, + "step": 216570 + }, + { + "epoch": 8.97, + "grad_norm": 0.65625, + "learning_rate": 0.00039762721269086195, + "loss": 0.265, + "step": 216580 + }, + { + "epoch": 8.97, + "grad_norm": 0.671875, + "learning_rate": 0.0003976184601417759, + "loss": 0.1834, + "step": 216590 + }, + { + "epoch": 8.97, + "grad_norm": 0.80078125, + "learning_rate": 0.0003976097073148878, + "loss": 0.1756, + "step": 216600 + }, + { + "epoch": 8.97, + "grad_norm": 0.390625, + "learning_rate": 0.00039760095421021417, + "loss": 0.1915, + "step": 216610 + }, + { + "epoch": 8.97, + "grad_norm": 0.76171875, + "learning_rate": 0.0003975922008277715, + "loss": 0.188, + "step": 216620 + }, + { + "epoch": 8.97, + "grad_norm": 1.46875, + "learning_rate": 0.0003975834471675763, + "loss": 0.2033, + "step": 216630 + }, + { + "epoch": 8.97, + "grad_norm": 0.87890625, + "learning_rate": 0.00039757469322964495, + "loss": 0.2415, + "step": 216640 + }, + { + "epoch": 8.97, + "grad_norm": 0.83984375, + "learning_rate": 0.00039756593901399395, + "loss": 0.2084, + "step": 216650 + }, + { + "epoch": 8.97, + "grad_norm": 2.125, + "learning_rate": 0.0003975571845206398, + "loss": 0.1774, + "step": 216660 + }, + { + "epoch": 8.97, + "grad_norm": 0.58984375, + "learning_rate": 0.00039754842974959903, + "loss": 0.1767, + "step": 216670 + }, + { + "epoch": 8.97, + "grad_norm": 0.326171875, + "learning_rate": 0.00039753967470088796, + "loss": 0.201, + "step": 216680 + }, + { + "epoch": 8.98, + "grad_norm": 0.453125, + "learning_rate": 0.0003975309193745231, + "loss": 0.2375, + "step": 216690 + }, + { + "epoch": 8.98, + "grad_norm": 0.50390625, + "learning_rate": 0.00039752216377052116, + "loss": 0.2234, + "step": 216700 + }, + { + "epoch": 8.98, + "grad_norm": 0.88671875, + "learning_rate": 0.00039751340788889833, + "loss": 0.2154, + "step": 216710 + }, + { + "epoch": 8.98, + "grad_norm": 0.0, + "learning_rate": 0.00039750465172967123, + "loss": 0.1795, + "step": 216720 + }, + { + "epoch": 8.98, + "grad_norm": 0.98828125, + "learning_rate": 0.0003974958952928562, + "loss": 0.1937, + "step": 216730 + }, + { + "epoch": 8.98, + "grad_norm": 0.64453125, + "learning_rate": 0.00039748713857846996, + "loss": 0.2079, + "step": 216740 + }, + { + "epoch": 8.98, + "grad_norm": 0.80859375, + "learning_rate": 0.0003974783815865288, + "loss": 0.2026, + "step": 216750 + }, + { + "epoch": 8.98, + "grad_norm": 0.72265625, + "learning_rate": 0.00039746962431704924, + "loss": 0.2207, + "step": 216760 + }, + { + "epoch": 8.98, + "grad_norm": 0.984375, + "learning_rate": 0.0003974608667700478, + "loss": 0.2222, + "step": 216770 + }, + { + "epoch": 8.98, + "grad_norm": 0.6953125, + "learning_rate": 0.0003974521089455409, + "loss": 0.1833, + "step": 216780 + }, + { + "epoch": 8.98, + "grad_norm": 0.6328125, + "learning_rate": 0.00039744335084354506, + "loss": 0.1671, + "step": 216790 + }, + { + "epoch": 8.98, + "grad_norm": 0.8125, + "learning_rate": 0.00039743459246407677, + "loss": 0.2187, + "step": 216800 + }, + { + "epoch": 8.98, + "grad_norm": 0.64453125, + "learning_rate": 0.00039742583380715247, + "loss": 0.2206, + "step": 216810 + }, + { + "epoch": 8.98, + "grad_norm": 0.5703125, + "learning_rate": 0.0003974170748727887, + "loss": 0.2002, + "step": 216820 + }, + { + "epoch": 8.98, + "grad_norm": 0.361328125, + "learning_rate": 0.0003974083156610019, + "loss": 0.2351, + "step": 216830 + }, + { + "epoch": 8.98, + "grad_norm": 0.404296875, + "learning_rate": 0.0003973995561718087, + "loss": 0.1944, + "step": 216840 + }, + { + "epoch": 8.98, + "grad_norm": 1.1796875, + "learning_rate": 0.00039739079640522526, + "loss": 0.2113, + "step": 216850 + }, + { + "epoch": 8.98, + "grad_norm": 1.53125, + "learning_rate": 0.0003973820363612684, + "loss": 0.1946, + "step": 216860 + }, + { + "epoch": 8.98, + "grad_norm": 0.60546875, + "learning_rate": 0.0003973732760399543, + "loss": 0.181, + "step": 216870 + }, + { + "epoch": 8.98, + "grad_norm": 0.6953125, + "learning_rate": 0.0003973645154412997, + "loss": 0.1724, + "step": 216880 + }, + { + "epoch": 8.98, + "grad_norm": 0.890625, + "learning_rate": 0.00039735575456532104, + "loss": 0.2497, + "step": 216890 + }, + { + "epoch": 8.98, + "grad_norm": 0.98828125, + "learning_rate": 0.00039734699341203474, + "loss": 0.2345, + "step": 216900 + }, + { + "epoch": 8.98, + "grad_norm": 0.9609375, + "learning_rate": 0.0003973382319814572, + "loss": 0.1982, + "step": 216910 + }, + { + "epoch": 8.98, + "grad_norm": 0.65234375, + "learning_rate": 0.00039732947027360514, + "loss": 0.1529, + "step": 216920 + }, + { + "epoch": 8.99, + "grad_norm": 0.38671875, + "learning_rate": 0.00039732070828849486, + "loss": 0.2014, + "step": 216930 + }, + { + "epoch": 8.99, + "grad_norm": 1.2421875, + "learning_rate": 0.000397311946026143, + "loss": 0.2228, + "step": 216940 + }, + { + "epoch": 8.99, + "grad_norm": 0.76171875, + "learning_rate": 0.0003973031834865659, + "loss": 0.2154, + "step": 216950 + }, + { + "epoch": 8.99, + "grad_norm": 0.4765625, + "learning_rate": 0.0003972944206697802, + "loss": 0.2057, + "step": 216960 + }, + { + "epoch": 8.99, + "grad_norm": 0.6953125, + "learning_rate": 0.0003972856575758023, + "loss": 0.1933, + "step": 216970 + }, + { + "epoch": 8.99, + "grad_norm": 0.5703125, + "learning_rate": 0.00039727689420464854, + "loss": 0.2275, + "step": 216980 + }, + { + "epoch": 8.99, + "grad_norm": 0.89453125, + "learning_rate": 0.0003972681305563357, + "loss": 0.2, + "step": 216990 + }, + { + "epoch": 8.99, + "grad_norm": 0.310546875, + "learning_rate": 0.0003972593666308801, + "loss": 0.2272, + "step": 217000 + }, + { + "epoch": 8.99, + "grad_norm": 1.015625, + "learning_rate": 0.00039725060242829825, + "loss": 0.1769, + "step": 217010 + }, + { + "epoch": 8.99, + "grad_norm": 0.98046875, + "learning_rate": 0.00039724183794860677, + "loss": 0.2041, + "step": 217020 + }, + { + "epoch": 8.99, + "grad_norm": 0.53125, + "learning_rate": 0.00039723307319182194, + "loss": 0.2171, + "step": 217030 + }, + { + "epoch": 8.99, + "grad_norm": 0.625, + "learning_rate": 0.0003972243081579604, + "loss": 0.2154, + "step": 217040 + }, + { + "epoch": 8.99, + "grad_norm": 0.73828125, + "learning_rate": 0.00039721554284703867, + "loss": 0.2591, + "step": 217050 + }, + { + "epoch": 8.99, + "grad_norm": 0.94140625, + "learning_rate": 0.0003972067772590732, + "loss": 0.2102, + "step": 217060 + }, + { + "epoch": 8.99, + "grad_norm": 0.875, + "learning_rate": 0.00039719801139408037, + "loss": 0.2362, + "step": 217070 + }, + { + "epoch": 8.99, + "grad_norm": 2.359375, + "learning_rate": 0.0003971892452520768, + "loss": 0.2233, + "step": 217080 + }, + { + "epoch": 8.99, + "grad_norm": 0.79296875, + "learning_rate": 0.00039718047883307894, + "loss": 0.2195, + "step": 217090 + }, + { + "epoch": 8.99, + "grad_norm": 1.1171875, + "learning_rate": 0.0003971717121371034, + "loss": 0.213, + "step": 217100 + }, + { + "epoch": 8.99, + "grad_norm": 0.52734375, + "learning_rate": 0.0003971629451641665, + "loss": 0.1836, + "step": 217110 + }, + { + "epoch": 8.99, + "grad_norm": 0.921875, + "learning_rate": 0.0003971541779142849, + "loss": 0.1897, + "step": 217120 + }, + { + "epoch": 8.99, + "grad_norm": 0.85546875, + "learning_rate": 0.000397145410387475, + "loss": 0.2332, + "step": 217130 + }, + { + "epoch": 8.99, + "grad_norm": 0.41796875, + "learning_rate": 0.0003971366425837534, + "loss": 0.172, + "step": 217140 + }, + { + "epoch": 8.99, + "grad_norm": 0.2353515625, + "learning_rate": 0.00039712787450313646, + "loss": 0.1631, + "step": 217150 + }, + { + "epoch": 8.99, + "grad_norm": 0.8203125, + "learning_rate": 0.00039711910614564076, + "loss": 0.1616, + "step": 217160 + }, + { + "epoch": 9.0, + "grad_norm": 0.5234375, + "learning_rate": 0.0003971103375112828, + "loss": 0.1746, + "step": 217170 + }, + { + "epoch": 9.0, + "grad_norm": 0.85546875, + "learning_rate": 0.000397101568600079, + "loss": 0.2046, + "step": 217180 + }, + { + "epoch": 9.0, + "grad_norm": 0.89453125, + "learning_rate": 0.00039709279941204604, + "loss": 0.166, + "step": 217190 + }, + { + "epoch": 9.0, + "grad_norm": 0.9453125, + "learning_rate": 0.00039708402994720023, + "loss": 0.2039, + "step": 217200 + }, + { + "epoch": 9.0, + "grad_norm": 0.8515625, + "learning_rate": 0.00039707526020555815, + "loss": 0.2011, + "step": 217210 + }, + { + "epoch": 9.0, + "grad_norm": 0.46875, + "learning_rate": 0.0003970664901871364, + "loss": 0.19, + "step": 217220 + }, + { + "epoch": 9.0, + "grad_norm": 0.25, + "learning_rate": 0.00039705771989195137, + "loss": 0.1996, + "step": 217230 + }, + { + "epoch": 9.0, + "grad_norm": 1.640625, + "learning_rate": 0.00039704894932001956, + "loss": 0.1757, + "step": 217240 + }, + { + "epoch": 9.0, + "grad_norm": 1.984375, + "learning_rate": 0.0003970401784713575, + "loss": 0.2151, + "step": 217250 + }, + { + "epoch": 9.0, + "grad_norm": 0.80078125, + "learning_rate": 0.00039703140734598176, + "loss": 0.2195, + "step": 217260 + }, + { + "epoch": 9.0, + "grad_norm": 0.921875, + "learning_rate": 0.0003970226359439088, + "loss": 0.2235, + "step": 217270 + }, + { + "epoch": 9.0, + "grad_norm": 0.85546875, + "learning_rate": 0.00039701386426515504, + "loss": 0.179, + "step": 217280 + }, + { + "epoch": 9.0, + "grad_norm": 0.8515625, + "learning_rate": 0.00039700509230973703, + "loss": 0.2005, + "step": 217290 + }, + { + "epoch": 9.0, + "grad_norm": 0.4609375, + "learning_rate": 0.0003969963200776714, + "loss": 0.2044, + "step": 217300 + }, + { + "epoch": 9.0, + "grad_norm": 0.416015625, + "learning_rate": 0.0003969875475689746, + "loss": 0.1624, + "step": 217310 + }, + { + "epoch": 9.0, + "grad_norm": 0.7265625, + "learning_rate": 0.00039697877478366293, + "loss": 0.2243, + "step": 217320 + }, + { + "epoch": 9.0, + "grad_norm": 1.6796875, + "learning_rate": 0.0003969700017217533, + "loss": 0.2086, + "step": 217330 + }, + { + "epoch": 9.0, + "grad_norm": 0.26171875, + "learning_rate": 0.0003969612283832619, + "loss": 0.1875, + "step": 217340 + }, + { + "epoch": 9.0, + "grad_norm": 0.81640625, + "learning_rate": 0.0003969524547682053, + "loss": 0.2134, + "step": 217350 + }, + { + "epoch": 9.0, + "grad_norm": 0.0, + "learning_rate": 0.00039694368087660013, + "loss": 0.2171, + "step": 217360 + }, + { + "epoch": 9.0, + "grad_norm": 0.275390625, + "learning_rate": 0.00039693490670846275, + "loss": 0.1811, + "step": 217370 + }, + { + "epoch": 9.0, + "grad_norm": 1.1015625, + "learning_rate": 0.0003969261322638098, + "loss": 0.1959, + "step": 217380 + }, + { + "epoch": 9.0, + "grad_norm": 0.68359375, + "learning_rate": 0.00039691735754265775, + "loss": 0.1879, + "step": 217390 + }, + { + "epoch": 9.0, + "grad_norm": 0.765625, + "learning_rate": 0.000396908582545023, + "loss": 0.224, + "step": 217400 + }, + { + "epoch": 9.01, + "grad_norm": 0.60546875, + "learning_rate": 0.00039689980727092224, + "loss": 0.1905, + "step": 217410 + }, + { + "epoch": 9.01, + "grad_norm": 0.58984375, + "learning_rate": 0.0003968910317203719, + "loss": 0.1866, + "step": 217420 + }, + { + "epoch": 9.01, + "grad_norm": 0.7265625, + "learning_rate": 0.00039688225589338844, + "loss": 0.122, + "step": 217430 + }, + { + "epoch": 9.01, + "grad_norm": 1.1015625, + "learning_rate": 0.00039687347978998856, + "loss": 0.1927, + "step": 217440 + }, + { + "epoch": 9.01, + "grad_norm": 0.640625, + "learning_rate": 0.0003968647034101885, + "loss": 0.1948, + "step": 217450 + }, + { + "epoch": 9.01, + "grad_norm": 0.84765625, + "learning_rate": 0.0003968559267540051, + "loss": 0.2053, + "step": 217460 + }, + { + "epoch": 9.01, + "grad_norm": 0.7578125, + "learning_rate": 0.00039684714982145454, + "loss": 0.1809, + "step": 217470 + }, + { + "epoch": 9.01, + "grad_norm": 1.671875, + "learning_rate": 0.00039683837261255355, + "loss": 0.2216, + "step": 217480 + }, + { + "epoch": 9.01, + "grad_norm": 0.703125, + "learning_rate": 0.00039682959512731865, + "loss": 0.1631, + "step": 217490 + }, + { + "epoch": 9.01, + "grad_norm": 1.2890625, + "learning_rate": 0.00039682081736576626, + "loss": 0.2103, + "step": 217500 + }, + { + "epoch": 9.01, + "grad_norm": 1.015625, + "learning_rate": 0.00039681203932791296, + "loss": 0.1628, + "step": 217510 + }, + { + "epoch": 9.01, + "grad_norm": 0.89453125, + "learning_rate": 0.0003968032610137753, + "loss": 0.1731, + "step": 217520 + }, + { + "epoch": 9.01, + "grad_norm": 0.8203125, + "learning_rate": 0.0003967944824233697, + "loss": 0.2201, + "step": 217530 + }, + { + "epoch": 9.01, + "grad_norm": 0.62890625, + "learning_rate": 0.0003967857035567127, + "loss": 0.2197, + "step": 217540 + }, + { + "epoch": 9.01, + "grad_norm": 1.3046875, + "learning_rate": 0.000396776924413821, + "loss": 0.1495, + "step": 217550 + }, + { + "epoch": 9.01, + "grad_norm": 0.92578125, + "learning_rate": 0.0003967681449947109, + "loss": 0.2047, + "step": 217560 + }, + { + "epoch": 9.01, + "grad_norm": 0.69921875, + "learning_rate": 0.0003967593652993989, + "loss": 0.1862, + "step": 217570 + }, + { + "epoch": 9.01, + "grad_norm": 0.58984375, + "learning_rate": 0.0003967505853279018, + "loss": 0.2126, + "step": 217580 + }, + { + "epoch": 9.01, + "grad_norm": 0.474609375, + "learning_rate": 0.0003967418050802358, + "loss": 0.1385, + "step": 217590 + }, + { + "epoch": 9.01, + "grad_norm": 0.859375, + "learning_rate": 0.0003967330245564177, + "loss": 0.1739, + "step": 217600 + }, + { + "epoch": 9.01, + "grad_norm": 0.8359375, + "learning_rate": 0.0003967242437564638, + "loss": 0.2125, + "step": 217610 + }, + { + "epoch": 9.01, + "grad_norm": 0.859375, + "learning_rate": 0.0003967154626803907, + "loss": 0.2429, + "step": 217620 + }, + { + "epoch": 9.01, + "grad_norm": 2.515625, + "learning_rate": 0.000396706681328215, + "loss": 0.1837, + "step": 217630 + }, + { + "epoch": 9.01, + "grad_norm": 0.361328125, + "learning_rate": 0.00039669789969995316, + "loss": 0.1632, + "step": 217640 + }, + { + "epoch": 9.02, + "grad_norm": 0.4921875, + "learning_rate": 0.00039668911779562176, + "loss": 0.2188, + "step": 217650 + }, + { + "epoch": 9.02, + "grad_norm": 0.65625, + "learning_rate": 0.0003966803356152372, + "loss": 0.1722, + "step": 217660 + }, + { + "epoch": 9.02, + "grad_norm": 0.99609375, + "learning_rate": 0.00039667155315881614, + "loss": 0.1743, + "step": 217670 + }, + { + "epoch": 9.02, + "grad_norm": 0.94140625, + "learning_rate": 0.000396662770426375, + "loss": 0.2225, + "step": 217680 + }, + { + "epoch": 9.02, + "grad_norm": 0.6875, + "learning_rate": 0.0003966539874179304, + "loss": 0.1921, + "step": 217690 + }, + { + "epoch": 9.02, + "grad_norm": 1.7890625, + "learning_rate": 0.00039664520413349884, + "loss": 0.2052, + "step": 217700 + }, + { + "epoch": 9.02, + "grad_norm": 0.5625, + "learning_rate": 0.00039663642057309687, + "loss": 0.1512, + "step": 217710 + }, + { + "epoch": 9.02, + "grad_norm": 0.51953125, + "learning_rate": 0.00039662763673674095, + "loss": 0.1422, + "step": 217720 + }, + { + "epoch": 9.02, + "grad_norm": 0.76953125, + "learning_rate": 0.0003966188526244476, + "loss": 0.239, + "step": 217730 + }, + { + "epoch": 9.02, + "grad_norm": 0.99609375, + "learning_rate": 0.0003966100682362335, + "loss": 0.1567, + "step": 217740 + }, + { + "epoch": 9.02, + "grad_norm": 1.03125, + "learning_rate": 0.00039660128357211507, + "loss": 0.1641, + "step": 217750 + }, + { + "epoch": 9.02, + "grad_norm": 0.318359375, + "learning_rate": 0.0003965924986321088, + "loss": 0.2294, + "step": 217760 + }, + { + "epoch": 9.02, + "grad_norm": 0.6796875, + "learning_rate": 0.00039658371341623136, + "loss": 0.1812, + "step": 217770 + }, + { + "epoch": 9.02, + "grad_norm": 2.609375, + "learning_rate": 0.00039657492792449914, + "loss": 0.1691, + "step": 217780 + }, + { + "epoch": 9.02, + "grad_norm": 0.7265625, + "learning_rate": 0.0003965661421569287, + "loss": 0.168, + "step": 217790 + }, + { + "epoch": 9.02, + "grad_norm": 0.609375, + "learning_rate": 0.00039655735611353674, + "loss": 0.1698, + "step": 217800 + }, + { + "epoch": 9.02, + "grad_norm": 0.0, + "learning_rate": 0.0003965485697943395, + "loss": 0.231, + "step": 217810 + }, + { + "epoch": 9.02, + "grad_norm": 1.1640625, + "learning_rate": 0.0003965397831993538, + "loss": 0.1868, + "step": 217820 + }, + { + "epoch": 9.02, + "grad_norm": 0.76171875, + "learning_rate": 0.00039653099632859604, + "loss": 0.2219, + "step": 217830 + }, + { + "epoch": 9.02, + "grad_norm": 0.76953125, + "learning_rate": 0.00039652220918208264, + "loss": 0.1965, + "step": 217840 + }, + { + "epoch": 9.02, + "grad_norm": 2.71875, + "learning_rate": 0.00039651342175983043, + "loss": 0.2002, + "step": 217850 + }, + { + "epoch": 9.02, + "grad_norm": 1.0078125, + "learning_rate": 0.00039650463406185564, + "loss": 0.1705, + "step": 217860 + }, + { + "epoch": 9.02, + "grad_norm": 1.0234375, + "learning_rate": 0.000396495846088175, + "loss": 0.1491, + "step": 217870 + }, + { + "epoch": 9.02, + "grad_norm": 0.7734375, + "learning_rate": 0.00039648705783880514, + "loss": 0.219, + "step": 217880 + }, + { + "epoch": 9.02, + "grad_norm": 0.51953125, + "learning_rate": 0.00039647826931376223, + "loss": 0.1787, + "step": 217890 + }, + { + "epoch": 9.03, + "grad_norm": 0.7109375, + "learning_rate": 0.0003964694805130632, + "loss": 0.2108, + "step": 217900 + }, + { + "epoch": 9.03, + "grad_norm": 0.6171875, + "learning_rate": 0.00039646069143672435, + "loss": 0.1063, + "step": 217910 + }, + { + "epoch": 9.03, + "grad_norm": 0.59375, + "learning_rate": 0.00039645190208476233, + "loss": 0.1444, + "step": 217920 + }, + { + "epoch": 9.03, + "grad_norm": 0.31640625, + "learning_rate": 0.00039644311245719363, + "loss": 0.1659, + "step": 217930 + }, + { + "epoch": 9.03, + "grad_norm": 0.494140625, + "learning_rate": 0.0003964343225540348, + "loss": 0.1364, + "step": 217940 + }, + { + "epoch": 9.03, + "grad_norm": 0.376953125, + "learning_rate": 0.00039642553237530237, + "loss": 0.1973, + "step": 217950 + }, + { + "epoch": 9.03, + "grad_norm": 0.76171875, + "learning_rate": 0.000396416741921013, + "loss": 0.1978, + "step": 217960 + }, + { + "epoch": 9.03, + "grad_norm": 0.703125, + "learning_rate": 0.000396407951191183, + "loss": 0.1782, + "step": 217970 + }, + { + "epoch": 9.03, + "grad_norm": 0.63671875, + "learning_rate": 0.0003963991601858291, + "loss": 0.2286, + "step": 217980 + }, + { + "epoch": 9.03, + "grad_norm": 1.578125, + "learning_rate": 0.0003963903689049678, + "loss": 0.2382, + "step": 217990 + }, + { + "epoch": 9.03, + "grad_norm": 0.9921875, + "learning_rate": 0.00039638157734861565, + "loss": 0.1794, + "step": 218000 + }, + { + "epoch": 9.03, + "grad_norm": 0.81640625, + "learning_rate": 0.0003963727855167891, + "loss": 0.1642, + "step": 218010 + }, + { + "epoch": 9.03, + "grad_norm": 0.78125, + "learning_rate": 0.0003963639934095049, + "loss": 0.2376, + "step": 218020 + }, + { + "epoch": 9.03, + "grad_norm": 1.0546875, + "learning_rate": 0.00039635520102677935, + "loss": 0.2027, + "step": 218030 + }, + { + "epoch": 9.03, + "grad_norm": 0.49609375, + "learning_rate": 0.0003963464083686292, + "loss": 0.1511, + "step": 218040 + }, + { + "epoch": 9.03, + "grad_norm": 0.494140625, + "learning_rate": 0.0003963376154350709, + "loss": 0.1864, + "step": 218050 + }, + { + "epoch": 9.03, + "grad_norm": 0.953125, + "learning_rate": 0.000396328822226121, + "loss": 0.2096, + "step": 218060 + }, + { + "epoch": 9.03, + "grad_norm": 0.89453125, + "learning_rate": 0.0003963200287417961, + "loss": 0.1626, + "step": 218070 + }, + { + "epoch": 9.03, + "grad_norm": 2.34375, + "learning_rate": 0.0003963112349821126, + "loss": 0.2189, + "step": 218080 + }, + { + "epoch": 9.03, + "grad_norm": 0.470703125, + "learning_rate": 0.00039630244094708724, + "loss": 0.2036, + "step": 218090 + }, + { + "epoch": 9.03, + "grad_norm": 0.51171875, + "learning_rate": 0.0003962936466367364, + "loss": 0.1768, + "step": 218100 + }, + { + "epoch": 9.03, + "grad_norm": 0.765625, + "learning_rate": 0.0003962848520510768, + "loss": 0.1848, + "step": 218110 + }, + { + "epoch": 9.03, + "grad_norm": 0.51171875, + "learning_rate": 0.00039627605719012496, + "loss": 0.1621, + "step": 218120 + }, + { + "epoch": 9.03, + "grad_norm": 0.57421875, + "learning_rate": 0.0003962672620538972, + "loss": 0.2032, + "step": 218130 + }, + { + "epoch": 9.04, + "grad_norm": 0.45703125, + "learning_rate": 0.0003962584666424104, + "loss": 0.2105, + "step": 218140 + }, + { + "epoch": 9.04, + "grad_norm": 0.8203125, + "learning_rate": 0.00039624967095568093, + "loss": 0.2046, + "step": 218150 + }, + { + "epoch": 9.04, + "grad_norm": 0.8359375, + "learning_rate": 0.0003962408749937253, + "loss": 0.1917, + "step": 218160 + }, + { + "epoch": 9.04, + "grad_norm": 1.25, + "learning_rate": 0.0003962320787565602, + "loss": 0.2061, + "step": 218170 + }, + { + "epoch": 9.04, + "grad_norm": 0.8203125, + "learning_rate": 0.0003962232822442021, + "loss": 0.2448, + "step": 218180 + }, + { + "epoch": 9.04, + "grad_norm": 1.4453125, + "learning_rate": 0.0003962144854566676, + "loss": 0.1589, + "step": 218190 + }, + { + "epoch": 9.04, + "grad_norm": 1.5859375, + "learning_rate": 0.0003962056883939732, + "loss": 0.2046, + "step": 218200 + }, + { + "epoch": 9.04, + "grad_norm": 0.8671875, + "learning_rate": 0.0003961968910561354, + "loss": 0.2205, + "step": 218210 + }, + { + "epoch": 9.04, + "grad_norm": 0.82421875, + "learning_rate": 0.000396188093443171, + "loss": 0.2202, + "step": 218220 + }, + { + "epoch": 9.04, + "grad_norm": 0.27734375, + "learning_rate": 0.0003961792955550964, + "loss": 0.1781, + "step": 218230 + }, + { + "epoch": 9.04, + "grad_norm": 0.4140625, + "learning_rate": 0.000396170497391928, + "loss": 0.1746, + "step": 218240 + }, + { + "epoch": 9.04, + "grad_norm": 1.34375, + "learning_rate": 0.0003961616989536826, + "loss": 0.166, + "step": 218250 + }, + { + "epoch": 9.04, + "grad_norm": 0.5078125, + "learning_rate": 0.00039615290024037664, + "loss": 0.2289, + "step": 218260 + }, + { + "epoch": 9.04, + "grad_norm": 0.77734375, + "learning_rate": 0.0003961441012520267, + "loss": 0.188, + "step": 218270 + }, + { + "epoch": 9.04, + "grad_norm": 0.9375, + "learning_rate": 0.00039613530198864946, + "loss": 0.1932, + "step": 218280 + }, + { + "epoch": 9.04, + "grad_norm": 1.1484375, + "learning_rate": 0.00039612650245026115, + "loss": 0.1805, + "step": 218290 + }, + { + "epoch": 9.04, + "grad_norm": 0.765625, + "learning_rate": 0.0003961177026368787, + "loss": 0.1843, + "step": 218300 + }, + { + "epoch": 9.04, + "grad_norm": 0.49609375, + "learning_rate": 0.0003961089025485185, + "loss": 0.1956, + "step": 218310 + }, + { + "epoch": 9.04, + "grad_norm": 0.5, + "learning_rate": 0.0003961001021851971, + "loss": 0.1927, + "step": 218320 + }, + { + "epoch": 9.04, + "grad_norm": 0.54296875, + "learning_rate": 0.0003960913015469311, + "loss": 0.2218, + "step": 218330 + }, + { + "epoch": 9.04, + "grad_norm": 1.0703125, + "learning_rate": 0.00039608250063373696, + "loss": 0.1816, + "step": 218340 + }, + { + "epoch": 9.04, + "grad_norm": 0.34765625, + "learning_rate": 0.00039607369944563145, + "loss": 0.1897, + "step": 218350 + }, + { + "epoch": 9.04, + "grad_norm": 0.7265625, + "learning_rate": 0.00039606489798263097, + "loss": 0.2, + "step": 218360 + }, + { + "epoch": 9.04, + "grad_norm": 0.7421875, + "learning_rate": 0.000396056096244752, + "loss": 0.2316, + "step": 218370 + }, + { + "epoch": 9.05, + "grad_norm": 0.6796875, + "learning_rate": 0.00039604729423201143, + "loss": 0.2121, + "step": 218380 + }, + { + "epoch": 9.05, + "grad_norm": 0.65625, + "learning_rate": 0.00039603849194442555, + "loss": 0.1533, + "step": 218390 + }, + { + "epoch": 9.05, + "grad_norm": 1.53125, + "learning_rate": 0.000396029689382011, + "loss": 0.2159, + "step": 218400 + }, + { + "epoch": 9.05, + "grad_norm": 0.828125, + "learning_rate": 0.0003960208865447843, + "loss": 0.1948, + "step": 218410 + }, + { + "epoch": 9.05, + "grad_norm": 0.734375, + "learning_rate": 0.00039601208343276206, + "loss": 0.2178, + "step": 218420 + }, + { + "epoch": 9.05, + "grad_norm": 0.8671875, + "learning_rate": 0.00039600328004596095, + "loss": 0.194, + "step": 218430 + }, + { + "epoch": 9.05, + "grad_norm": 1.4609375, + "learning_rate": 0.0003959944763843973, + "loss": 0.2249, + "step": 218440 + }, + { + "epoch": 9.05, + "grad_norm": 0.98046875, + "learning_rate": 0.0003959856724480879, + "loss": 0.1635, + "step": 218450 + }, + { + "epoch": 9.05, + "grad_norm": 0.40625, + "learning_rate": 0.0003959768682370492, + "loss": 0.1817, + "step": 218460 + }, + { + "epoch": 9.05, + "grad_norm": 0.59765625, + "learning_rate": 0.00039596806375129777, + "loss": 0.2409, + "step": 218470 + }, + { + "epoch": 9.05, + "grad_norm": 1.2890625, + "learning_rate": 0.0003959592589908503, + "loss": 0.1799, + "step": 218480 + }, + { + "epoch": 9.05, + "grad_norm": 0.59765625, + "learning_rate": 0.00039595045395572317, + "loss": 0.2098, + "step": 218490 + }, + { + "epoch": 9.05, + "grad_norm": 0.61328125, + "learning_rate": 0.00039594164864593305, + "loss": 0.1954, + "step": 218500 + }, + { + "epoch": 9.05, + "grad_norm": 0.150390625, + "learning_rate": 0.0003959328430614966, + "loss": 0.168, + "step": 218510 + }, + { + "epoch": 9.05, + "grad_norm": 0.4375, + "learning_rate": 0.0003959240372024302, + "loss": 0.2196, + "step": 218520 + }, + { + "epoch": 9.05, + "grad_norm": 0.67578125, + "learning_rate": 0.00039591523106875055, + "loss": 0.2119, + "step": 218530 + }, + { + "epoch": 9.05, + "grad_norm": 0.75390625, + "learning_rate": 0.0003959064246604742, + "loss": 0.2237, + "step": 218540 + }, + { + "epoch": 9.05, + "grad_norm": 0.458984375, + "learning_rate": 0.0003958976179776177, + "loss": 0.193, + "step": 218550 + }, + { + "epoch": 9.05, + "grad_norm": 0.56640625, + "learning_rate": 0.00039588881102019767, + "loss": 0.2231, + "step": 218560 + }, + { + "epoch": 9.05, + "grad_norm": 0.9921875, + "learning_rate": 0.0003958800037882306, + "loss": 0.1917, + "step": 218570 + }, + { + "epoch": 9.05, + "grad_norm": 1.765625, + "learning_rate": 0.00039587119628173317, + "loss": 0.2063, + "step": 218580 + }, + { + "epoch": 9.05, + "grad_norm": 1.1171875, + "learning_rate": 0.00039586238850072196, + "loss": 0.1981, + "step": 218590 + }, + { + "epoch": 9.05, + "grad_norm": 0.44140625, + "learning_rate": 0.00039585358044521337, + "loss": 0.2499, + "step": 218600 + }, + { + "epoch": 9.05, + "grad_norm": 0.498046875, + "learning_rate": 0.00039584477211522415, + "loss": 0.1968, + "step": 218610 + }, + { + "epoch": 9.06, + "grad_norm": 0.61328125, + "learning_rate": 0.00039583596351077077, + "loss": 0.2045, + "step": 218620 + }, + { + "epoch": 9.06, + "grad_norm": 0.84765625, + "learning_rate": 0.00039582715463186993, + "loss": 0.2096, + "step": 218630 + }, + { + "epoch": 9.06, + "grad_norm": 1.1953125, + "learning_rate": 0.0003958183454785381, + "loss": 0.194, + "step": 218640 + }, + { + "epoch": 9.06, + "grad_norm": 0.4609375, + "learning_rate": 0.00039580953605079187, + "loss": 0.1467, + "step": 218650 + }, + { + "epoch": 9.06, + "grad_norm": 0.8828125, + "learning_rate": 0.0003958007263486478, + "loss": 0.1515, + "step": 218660 + }, + { + "epoch": 9.06, + "grad_norm": 0.89453125, + "learning_rate": 0.00039579191637212265, + "loss": 0.1594, + "step": 218670 + }, + { + "epoch": 9.06, + "grad_norm": 0.83203125, + "learning_rate": 0.0003957831061212327, + "loss": 0.1948, + "step": 218680 + }, + { + "epoch": 9.06, + "grad_norm": 1.0390625, + "learning_rate": 0.0003957742955959948, + "loss": 0.256, + "step": 218690 + }, + { + "epoch": 9.06, + "grad_norm": 0.96875, + "learning_rate": 0.0003957654847964254, + "loss": 0.2194, + "step": 218700 + }, + { + "epoch": 9.06, + "grad_norm": 0.625, + "learning_rate": 0.000395756673722541, + "loss": 0.2294, + "step": 218710 + }, + { + "epoch": 9.06, + "grad_norm": 0.439453125, + "learning_rate": 0.0003957478623743584, + "loss": 0.2117, + "step": 218720 + }, + { + "epoch": 9.06, + "grad_norm": 1.2421875, + "learning_rate": 0.000395739050751894, + "loss": 0.2125, + "step": 218730 + }, + { + "epoch": 9.06, + "grad_norm": 1.078125, + "learning_rate": 0.00039573023885516444, + "loss": 0.2012, + "step": 218740 + }, + { + "epoch": 9.06, + "grad_norm": 0.7109375, + "learning_rate": 0.00039572142668418633, + "loss": 0.1742, + "step": 218750 + }, + { + "epoch": 9.06, + "grad_norm": 0.4921875, + "learning_rate": 0.0003957126142389762, + "loss": 0.2241, + "step": 218760 + }, + { + "epoch": 9.06, + "grad_norm": 0.197265625, + "learning_rate": 0.0003957038015195508, + "loss": 0.2028, + "step": 218770 + }, + { + "epoch": 9.06, + "grad_norm": 1.5078125, + "learning_rate": 0.00039569498852592646, + "loss": 0.1959, + "step": 218780 + }, + { + "epoch": 9.06, + "grad_norm": 1.1640625, + "learning_rate": 0.00039568617525811983, + "loss": 0.1828, + "step": 218790 + }, + { + "epoch": 9.06, + "grad_norm": 1.0859375, + "learning_rate": 0.00039567736171614763, + "loss": 0.1942, + "step": 218800 + }, + { + "epoch": 9.06, + "grad_norm": 1.0234375, + "learning_rate": 0.00039566854790002635, + "loss": 0.2085, + "step": 218810 + }, + { + "epoch": 9.06, + "grad_norm": 0.71875, + "learning_rate": 0.0003956597338097726, + "loss": 0.1601, + "step": 218820 + }, + { + "epoch": 9.06, + "grad_norm": 0.85546875, + "learning_rate": 0.000395650919445403, + "loss": 0.1702, + "step": 218830 + }, + { + "epoch": 9.06, + "grad_norm": 1.4453125, + "learning_rate": 0.000395642104806934, + "loss": 0.1135, + "step": 218840 + }, + { + "epoch": 9.06, + "grad_norm": 0.6796875, + "learning_rate": 0.0003956332898943824, + "loss": 0.2046, + "step": 218850 + }, + { + "epoch": 9.07, + "grad_norm": 0.55859375, + "learning_rate": 0.00039562447470776465, + "loss": 0.1386, + "step": 218860 + }, + { + "epoch": 9.07, + "grad_norm": 0.87109375, + "learning_rate": 0.00039561565924709733, + "loss": 0.2027, + "step": 218870 + }, + { + "epoch": 9.07, + "grad_norm": 1.7734375, + "learning_rate": 0.0003956068435123971, + "loss": 0.151, + "step": 218880 + }, + { + "epoch": 9.07, + "grad_norm": 0.66796875, + "learning_rate": 0.0003955980275036804, + "loss": 0.1848, + "step": 218890 + }, + { + "epoch": 9.07, + "grad_norm": 0.62109375, + "learning_rate": 0.000395589211220964, + "loss": 0.2227, + "step": 218900 + }, + { + "epoch": 9.07, + "grad_norm": 0.765625, + "learning_rate": 0.0003955803946642645, + "loss": 0.2416, + "step": 218910 + }, + { + "epoch": 9.07, + "grad_norm": 1.5625, + "learning_rate": 0.00039557157783359835, + "loss": 0.1756, + "step": 218920 + }, + { + "epoch": 9.07, + "grad_norm": 0.55859375, + "learning_rate": 0.0003955627607289822, + "loss": 0.1873, + "step": 218930 + }, + { + "epoch": 9.07, + "grad_norm": 2.859375, + "learning_rate": 0.0003955539433504327, + "loss": 0.2405, + "step": 218940 + }, + { + "epoch": 9.07, + "grad_norm": 0.498046875, + "learning_rate": 0.00039554512569796643, + "loss": 0.2201, + "step": 218950 + }, + { + "epoch": 9.07, + "grad_norm": 0.5859375, + "learning_rate": 0.00039553630777159986, + "loss": 0.2164, + "step": 218960 + }, + { + "epoch": 9.07, + "grad_norm": 0.8046875, + "learning_rate": 0.0003955274895713497, + "loss": 0.2411, + "step": 218970 + }, + { + "epoch": 9.07, + "grad_norm": 1.6953125, + "learning_rate": 0.0003955186710972326, + "loss": 0.2464, + "step": 218980 + }, + { + "epoch": 9.07, + "grad_norm": 0.71875, + "learning_rate": 0.00039550985234926495, + "loss": 0.2154, + "step": 218990 + }, + { + "epoch": 9.07, + "grad_norm": 0.65234375, + "learning_rate": 0.00039550103332746354, + "loss": 0.1816, + "step": 219000 + }, + { + "epoch": 9.07, + "grad_norm": 0.703125, + "learning_rate": 0.00039549221403184493, + "loss": 0.1555, + "step": 219010 + }, + { + "epoch": 9.07, + "grad_norm": 0.50390625, + "learning_rate": 0.00039548339446242564, + "loss": 0.2191, + "step": 219020 + }, + { + "epoch": 9.07, + "grad_norm": 2.109375, + "learning_rate": 0.0003954745746192223, + "loss": 0.2228, + "step": 219030 + }, + { + "epoch": 9.07, + "grad_norm": 0.416015625, + "learning_rate": 0.0003954657545022516, + "loss": 0.2493, + "step": 219040 + }, + { + "epoch": 9.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00039545693411152996, + "loss": 0.2252, + "step": 219050 + }, + { + "epoch": 9.07, + "grad_norm": 0.640625, + "learning_rate": 0.0003954481134470741, + "loss": 0.1516, + "step": 219060 + }, + { + "epoch": 9.07, + "grad_norm": 1.1484375, + "learning_rate": 0.00039543929250890065, + "loss": 0.1934, + "step": 219070 + }, + { + "epoch": 9.07, + "grad_norm": 0.99609375, + "learning_rate": 0.0003954304712970261, + "loss": 0.1925, + "step": 219080 + }, + { + "epoch": 9.07, + "grad_norm": 0.94140625, + "learning_rate": 0.00039542164981146713, + "loss": 0.2058, + "step": 219090 + }, + { + "epoch": 9.08, + "grad_norm": 0.64453125, + "learning_rate": 0.0003954128280522403, + "loss": 0.1909, + "step": 219100 + }, + { + "epoch": 9.08, + "grad_norm": 1.6328125, + "learning_rate": 0.0003954040060193623, + "loss": 0.1937, + "step": 219110 + }, + { + "epoch": 9.08, + "grad_norm": 0.921875, + "learning_rate": 0.0003953951837128496, + "loss": 0.2166, + "step": 219120 + }, + { + "epoch": 9.08, + "grad_norm": 1.453125, + "learning_rate": 0.00039538636113271885, + "loss": 0.2365, + "step": 219130 + }, + { + "epoch": 9.08, + "grad_norm": 0.96484375, + "learning_rate": 0.0003953775382789867, + "loss": 0.1777, + "step": 219140 + }, + { + "epoch": 9.08, + "grad_norm": 0.64453125, + "learning_rate": 0.0003953687151516697, + "loss": 0.1974, + "step": 219150 + }, + { + "epoch": 9.08, + "grad_norm": 0.734375, + "learning_rate": 0.0003953598917507845, + "loss": 0.1854, + "step": 219160 + }, + { + "epoch": 9.08, + "grad_norm": 0.52734375, + "learning_rate": 0.0003953510680763477, + "loss": 0.2193, + "step": 219170 + }, + { + "epoch": 9.08, + "grad_norm": 0.298828125, + "learning_rate": 0.00039534224412837585, + "loss": 0.2032, + "step": 219180 + }, + { + "epoch": 9.08, + "grad_norm": 1.9453125, + "learning_rate": 0.0003953334199068857, + "loss": 0.1647, + "step": 219190 + }, + { + "epoch": 9.08, + "grad_norm": 1.375, + "learning_rate": 0.0003953245954118936, + "loss": 0.2278, + "step": 219200 + }, + { + "epoch": 9.08, + "grad_norm": 1.1015625, + "learning_rate": 0.0003953157706434163, + "loss": 0.2464, + "step": 219210 + }, + { + "epoch": 9.08, + "grad_norm": 0.271484375, + "learning_rate": 0.0003953069456014705, + "loss": 0.1655, + "step": 219220 + }, + { + "epoch": 9.08, + "grad_norm": 0.326171875, + "learning_rate": 0.0003952981202860727, + "loss": 0.1901, + "step": 219230 + }, + { + "epoch": 9.08, + "grad_norm": 0.53515625, + "learning_rate": 0.0003952892946972395, + "loss": 0.1781, + "step": 219240 + }, + { + "epoch": 9.08, + "grad_norm": 0.921875, + "learning_rate": 0.00039528046883498757, + "loss": 0.1568, + "step": 219250 + }, + { + "epoch": 9.08, + "grad_norm": 1.0546875, + "learning_rate": 0.00039527164269933347, + "loss": 0.1915, + "step": 219260 + }, + { + "epoch": 9.08, + "grad_norm": 1.5703125, + "learning_rate": 0.00039526281629029383, + "loss": 0.2119, + "step": 219270 + }, + { + "epoch": 9.08, + "grad_norm": 0.55859375, + "learning_rate": 0.00039525398960788525, + "loss": 0.1788, + "step": 219280 + }, + { + "epoch": 9.08, + "grad_norm": 0.72265625, + "learning_rate": 0.0003952451626521244, + "loss": 0.2788, + "step": 219290 + }, + { + "epoch": 9.08, + "grad_norm": 0.318359375, + "learning_rate": 0.0003952363354230277, + "loss": 0.1607, + "step": 219300 + }, + { + "epoch": 9.08, + "grad_norm": 1.4609375, + "learning_rate": 0.00039522750792061204, + "loss": 0.1784, + "step": 219310 + }, + { + "epoch": 9.08, + "grad_norm": 0.57421875, + "learning_rate": 0.0003952186801448939, + "loss": 0.2291, + "step": 219320 + }, + { + "epoch": 9.08, + "grad_norm": 0.84375, + "learning_rate": 0.00039520985209588977, + "loss": 0.1893, + "step": 219330 + }, + { + "epoch": 9.09, + "grad_norm": 0.1943359375, + "learning_rate": 0.00039520102377361646, + "loss": 0.1752, + "step": 219340 + }, + { + "epoch": 9.09, + "grad_norm": 1.0078125, + "learning_rate": 0.00039519219517809053, + "loss": 0.2352, + "step": 219350 + }, + { + "epoch": 9.09, + "grad_norm": 0.796875, + "learning_rate": 0.0003951833663093285, + "loss": 0.2246, + "step": 219360 + }, + { + "epoch": 9.09, + "grad_norm": 1.3359375, + "learning_rate": 0.0003951745371673471, + "loss": 0.2551, + "step": 219370 + }, + { + "epoch": 9.09, + "grad_norm": 0.640625, + "learning_rate": 0.00039516570775216287, + "loss": 0.1612, + "step": 219380 + }, + { + "epoch": 9.09, + "grad_norm": 0.84765625, + "learning_rate": 0.00039515687806379243, + "loss": 0.2214, + "step": 219390 + }, + { + "epoch": 9.09, + "grad_norm": 0.93359375, + "learning_rate": 0.0003951480481022525, + "loss": 0.1643, + "step": 219400 + }, + { + "epoch": 9.09, + "grad_norm": 0.5703125, + "learning_rate": 0.0003951392178675596, + "loss": 0.2185, + "step": 219410 + }, + { + "epoch": 9.09, + "grad_norm": 0.6875, + "learning_rate": 0.0003951303873597303, + "loss": 0.1387, + "step": 219420 + }, + { + "epoch": 9.09, + "grad_norm": 0.89453125, + "learning_rate": 0.00039512155657878134, + "loss": 0.1792, + "step": 219430 + }, + { + "epoch": 9.09, + "grad_norm": 0.419921875, + "learning_rate": 0.0003951127255247292, + "loss": 0.1814, + "step": 219440 + }, + { + "epoch": 9.09, + "grad_norm": 0.93359375, + "learning_rate": 0.00039510389419759065, + "loss": 0.1947, + "step": 219450 + }, + { + "epoch": 9.09, + "grad_norm": 0.625, + "learning_rate": 0.0003950950625973823, + "loss": 0.1952, + "step": 219460 + }, + { + "epoch": 9.09, + "grad_norm": 0.24609375, + "learning_rate": 0.0003950862307241206, + "loss": 0.1516, + "step": 219470 + }, + { + "epoch": 9.09, + "grad_norm": 0.796875, + "learning_rate": 0.0003950773985778223, + "loss": 0.229, + "step": 219480 + }, + { + "epoch": 9.09, + "grad_norm": 1.21875, + "learning_rate": 0.0003950685661585041, + "loss": 0.2064, + "step": 219490 + }, + { + "epoch": 9.09, + "grad_norm": 0.71484375, + "learning_rate": 0.0003950597334661824, + "loss": 0.187, + "step": 219500 + }, + { + "epoch": 9.09, + "grad_norm": 1.6484375, + "learning_rate": 0.000395050900500874, + "loss": 0.2147, + "step": 219510 + }, + { + "epoch": 9.09, + "grad_norm": 0.87890625, + "learning_rate": 0.0003950420672625955, + "loss": 0.2309, + "step": 219520 + }, + { + "epoch": 9.09, + "grad_norm": 0.59375, + "learning_rate": 0.00039503323375136346, + "loss": 0.1468, + "step": 219530 + }, + { + "epoch": 9.09, + "grad_norm": 0.97265625, + "learning_rate": 0.0003950243999671945, + "loss": 0.2437, + "step": 219540 + }, + { + "epoch": 9.09, + "grad_norm": 0.90625, + "learning_rate": 0.00039501556591010537, + "loss": 0.172, + "step": 219550 + }, + { + "epoch": 9.09, + "grad_norm": 0.87109375, + "learning_rate": 0.00039500673158011256, + "loss": 0.1765, + "step": 219560 + }, + { + "epoch": 9.09, + "grad_norm": 0.89453125, + "learning_rate": 0.00039499789697723263, + "loss": 0.1609, + "step": 219570 + }, + { + "epoch": 9.09, + "grad_norm": 0.91796875, + "learning_rate": 0.00039498906210148253, + "loss": 0.2222, + "step": 219580 + }, + { + "epoch": 9.1, + "grad_norm": 0.470703125, + "learning_rate": 0.00039498022695287854, + "loss": 0.1693, + "step": 219590 + }, + { + "epoch": 9.1, + "grad_norm": 1.3046875, + "learning_rate": 0.00039497139153143746, + "loss": 0.1448, + "step": 219600 + }, + { + "epoch": 9.1, + "grad_norm": 1.171875, + "learning_rate": 0.0003949625558371759, + "loss": 0.2161, + "step": 219610 + }, + { + "epoch": 9.1, + "grad_norm": 0.0, + "learning_rate": 0.0003949537198701104, + "loss": 0.1958, + "step": 219620 + }, + { + "epoch": 9.1, + "grad_norm": 0.96484375, + "learning_rate": 0.00039494488363025766, + "loss": 0.1417, + "step": 219630 + }, + { + "epoch": 9.1, + "grad_norm": 0.7109375, + "learning_rate": 0.0003949360471176343, + "loss": 0.2206, + "step": 219640 + }, + { + "epoch": 9.1, + "grad_norm": 0.85546875, + "learning_rate": 0.000394927210332257, + "loss": 0.1999, + "step": 219650 + }, + { + "epoch": 9.1, + "grad_norm": 0.859375, + "learning_rate": 0.0003949183732741423, + "loss": 0.1907, + "step": 219660 + }, + { + "epoch": 9.1, + "grad_norm": 0.470703125, + "learning_rate": 0.0003949095359433069, + "loss": 0.2141, + "step": 219670 + }, + { + "epoch": 9.1, + "grad_norm": 0.8828125, + "learning_rate": 0.00039490069833976736, + "loss": 0.2265, + "step": 219680 + }, + { + "epoch": 9.1, + "grad_norm": 0.39453125, + "learning_rate": 0.0003948918604635404, + "loss": 0.1605, + "step": 219690 + }, + { + "epoch": 9.1, + "grad_norm": 0.7109375, + "learning_rate": 0.00039488302231464255, + "loss": 0.1907, + "step": 219700 + }, + { + "epoch": 9.1, + "grad_norm": 1.203125, + "learning_rate": 0.0003948741838930906, + "loss": 0.2027, + "step": 219710 + }, + { + "epoch": 9.1, + "grad_norm": 0.65625, + "learning_rate": 0.000394865345198901, + "loss": 0.1733, + "step": 219720 + }, + { + "epoch": 9.1, + "grad_norm": 1.546875, + "learning_rate": 0.00039485650623209044, + "loss": 0.1982, + "step": 219730 + }, + { + "epoch": 9.1, + "grad_norm": 0.306640625, + "learning_rate": 0.00039484766699267565, + "loss": 0.2104, + "step": 219740 + }, + { + "epoch": 9.1, + "grad_norm": 0.9140625, + "learning_rate": 0.0003948388274806731, + "loss": 0.178, + "step": 219750 + }, + { + "epoch": 9.1, + "grad_norm": 0.52734375, + "learning_rate": 0.00039482998769609963, + "loss": 0.1803, + "step": 219760 + }, + { + "epoch": 9.1, + "grad_norm": 0.71875, + "learning_rate": 0.0003948211476389717, + "loss": 0.2019, + "step": 219770 + }, + { + "epoch": 9.1, + "grad_norm": 0.390625, + "learning_rate": 0.000394812307309306, + "loss": 0.1283, + "step": 219780 + }, + { + "epoch": 9.1, + "grad_norm": 0.330078125, + "learning_rate": 0.0003948034667071192, + "loss": 0.178, + "step": 219790 + }, + { + "epoch": 9.1, + "grad_norm": 0.515625, + "learning_rate": 0.000394794625832428, + "loss": 0.1888, + "step": 219800 + }, + { + "epoch": 9.1, + "grad_norm": 2.53125, + "learning_rate": 0.0003947857846852488, + "loss": 0.1864, + "step": 219810 + }, + { + "epoch": 9.1, + "grad_norm": 0.57421875, + "learning_rate": 0.00039477694326559845, + "loss": 0.187, + "step": 219820 + }, + { + "epoch": 9.11, + "grad_norm": 0.9609375, + "learning_rate": 0.0003947681015734935, + "loss": 0.2146, + "step": 219830 + }, + { + "epoch": 9.11, + "grad_norm": 0.84765625, + "learning_rate": 0.00039475925960895066, + "loss": 0.1969, + "step": 219840 + }, + { + "epoch": 9.11, + "grad_norm": 1.5234375, + "learning_rate": 0.0003947504173719865, + "loss": 0.1938, + "step": 219850 + }, + { + "epoch": 9.11, + "grad_norm": 0.224609375, + "learning_rate": 0.00039474157486261773, + "loss": 0.2191, + "step": 219860 + }, + { + "epoch": 9.11, + "grad_norm": 0.48046875, + "learning_rate": 0.0003947327320808609, + "loss": 0.1968, + "step": 219870 + }, + { + "epoch": 9.11, + "grad_norm": 0.94140625, + "learning_rate": 0.00039472388902673275, + "loss": 0.2425, + "step": 219880 + }, + { + "epoch": 9.11, + "grad_norm": 0.79296875, + "learning_rate": 0.00039471504570024975, + "loss": 0.1957, + "step": 219890 + }, + { + "epoch": 9.11, + "grad_norm": 0.255859375, + "learning_rate": 0.00039470620210142884, + "loss": 0.2008, + "step": 219900 + }, + { + "epoch": 9.11, + "grad_norm": 0.74609375, + "learning_rate": 0.0003946973582302863, + "loss": 0.173, + "step": 219910 + }, + { + "epoch": 9.11, + "grad_norm": 0.734375, + "learning_rate": 0.00039468851408683904, + "loss": 0.1803, + "step": 219920 + }, + { + "epoch": 9.11, + "grad_norm": 0.515625, + "learning_rate": 0.00039467966967110367, + "loss": 0.1602, + "step": 219930 + }, + { + "epoch": 9.11, + "grad_norm": 0.89453125, + "learning_rate": 0.0003946708249830967, + "loss": 0.2097, + "step": 219940 + }, + { + "epoch": 9.11, + "grad_norm": 1.2109375, + "learning_rate": 0.00039466198002283495, + "loss": 0.2217, + "step": 219950 + }, + { + "epoch": 9.11, + "grad_norm": 0.63671875, + "learning_rate": 0.0003946531347903349, + "loss": 0.1595, + "step": 219960 + }, + { + "epoch": 9.11, + "grad_norm": 0.8125, + "learning_rate": 0.0003946442892856133, + "loss": 0.2092, + "step": 219970 + }, + { + "epoch": 9.11, + "grad_norm": 0.8359375, + "learning_rate": 0.00039463544350868676, + "loss": 0.271, + "step": 219980 + }, + { + "epoch": 9.11, + "grad_norm": 0.64453125, + "learning_rate": 0.00039462659745957197, + "loss": 0.1924, + "step": 219990 + }, + { + "epoch": 9.11, + "grad_norm": 0.82421875, + "learning_rate": 0.00039461775113828546, + "loss": 0.2267, + "step": 220000 + }, + { + "epoch": 9.11, + "grad_norm": 0.263671875, + "learning_rate": 0.00039460890454484404, + "loss": 0.1908, + "step": 220010 + }, + { + "epoch": 9.11, + "grad_norm": 0.8515625, + "learning_rate": 0.00039460005767926425, + "loss": 0.17, + "step": 220020 + }, + { + "epoch": 9.11, + "grad_norm": 0.423828125, + "learning_rate": 0.00039459121054156277, + "loss": 0.1791, + "step": 220030 + }, + { + "epoch": 9.11, + "grad_norm": 1.7734375, + "learning_rate": 0.0003945823631317562, + "loss": 0.1991, + "step": 220040 + }, + { + "epoch": 9.11, + "grad_norm": 1.9453125, + "learning_rate": 0.0003945735154498613, + "loss": 0.2291, + "step": 220050 + }, + { + "epoch": 9.11, + "grad_norm": 0.63671875, + "learning_rate": 0.0003945646674958947, + "loss": 0.1642, + "step": 220060 + }, + { + "epoch": 9.12, + "grad_norm": 0.65234375, + "learning_rate": 0.00039455581926987296, + "loss": 0.2619, + "step": 220070 + }, + { + "epoch": 9.12, + "grad_norm": 0.515625, + "learning_rate": 0.0003945469707718127, + "loss": 0.1911, + "step": 220080 + }, + { + "epoch": 9.12, + "grad_norm": 0.39453125, + "learning_rate": 0.00039453812200173076, + "loss": 0.2224, + "step": 220090 + }, + { + "epoch": 9.12, + "grad_norm": 0.8984375, + "learning_rate": 0.00039452927295964357, + "loss": 0.1674, + "step": 220100 + }, + { + "epoch": 9.12, + "grad_norm": 0.78125, + "learning_rate": 0.00039452042364556805, + "loss": 0.1583, + "step": 220110 + }, + { + "epoch": 9.12, + "grad_norm": 0.81640625, + "learning_rate": 0.0003945115740595205, + "loss": 0.1977, + "step": 220120 + }, + { + "epoch": 9.12, + "grad_norm": 0.59375, + "learning_rate": 0.00039450272420151795, + "loss": 0.1669, + "step": 220130 + }, + { + "epoch": 9.12, + "grad_norm": 0.337890625, + "learning_rate": 0.00039449387407157683, + "loss": 0.1772, + "step": 220140 + }, + { + "epoch": 9.12, + "grad_norm": 0.427734375, + "learning_rate": 0.00039448502366971384, + "loss": 0.1801, + "step": 220150 + }, + { + "epoch": 9.12, + "grad_norm": 0.671875, + "learning_rate": 0.00039447617299594563, + "loss": 0.2199, + "step": 220160 + }, + { + "epoch": 9.12, + "grad_norm": 0.5703125, + "learning_rate": 0.0003944673220502888, + "loss": 0.184, + "step": 220170 + }, + { + "epoch": 9.12, + "grad_norm": 0.78515625, + "learning_rate": 0.0003944584708327602, + "loss": 0.197, + "step": 220180 + }, + { + "epoch": 9.12, + "grad_norm": 0.390625, + "learning_rate": 0.0003944496193433763, + "loss": 0.1505, + "step": 220190 + }, + { + "epoch": 9.12, + "grad_norm": 1.0625, + "learning_rate": 0.00039444076758215373, + "loss": 0.2051, + "step": 220200 + }, + { + "epoch": 9.12, + "grad_norm": 1.28125, + "learning_rate": 0.00039443191554910934, + "loss": 0.1615, + "step": 220210 + }, + { + "epoch": 9.12, + "grad_norm": 0.6328125, + "learning_rate": 0.0003944230632442597, + "loss": 0.1937, + "step": 220220 + }, + { + "epoch": 9.12, + "grad_norm": 0.86328125, + "learning_rate": 0.00039441421066762136, + "loss": 0.1592, + "step": 220230 + }, + { + "epoch": 9.12, + "grad_norm": 0.77734375, + "learning_rate": 0.0003944053578192111, + "loss": 0.1805, + "step": 220240 + }, + { + "epoch": 9.12, + "grad_norm": 0.62890625, + "learning_rate": 0.00039439650469904556, + "loss": 0.1589, + "step": 220250 + }, + { + "epoch": 9.12, + "grad_norm": 1.4921875, + "learning_rate": 0.00039438765130714136, + "loss": 0.1876, + "step": 220260 + }, + { + "epoch": 9.12, + "grad_norm": 0.9921875, + "learning_rate": 0.0003943787976435153, + "loss": 0.1779, + "step": 220270 + }, + { + "epoch": 9.12, + "grad_norm": 2.828125, + "learning_rate": 0.0003943699437081838, + "loss": 0.2587, + "step": 220280 + }, + { + "epoch": 9.12, + "grad_norm": 0.984375, + "learning_rate": 0.0003943610895011638, + "loss": 0.2382, + "step": 220290 + }, + { + "epoch": 9.12, + "grad_norm": 0.48046875, + "learning_rate": 0.0003943522350224717, + "loss": 0.2016, + "step": 220300 + }, + { + "epoch": 9.13, + "grad_norm": 0.61328125, + "learning_rate": 0.00039434338027212435, + "loss": 0.2391, + "step": 220310 + }, + { + "epoch": 9.13, + "grad_norm": 0.796875, + "learning_rate": 0.0003943345252501383, + "loss": 0.196, + "step": 220320 + }, + { + "epoch": 9.13, + "grad_norm": 0.84375, + "learning_rate": 0.0003943256699565303, + "loss": 0.1967, + "step": 220330 + }, + { + "epoch": 9.13, + "grad_norm": 0.421875, + "learning_rate": 0.00039431681439131694, + "loss": 0.2247, + "step": 220340 + }, + { + "epoch": 9.13, + "grad_norm": 0.86328125, + "learning_rate": 0.00039430795855451494, + "loss": 0.2008, + "step": 220350 + }, + { + "epoch": 9.13, + "grad_norm": 0.98046875, + "learning_rate": 0.00039429910244614085, + "loss": 0.2081, + "step": 220360 + }, + { + "epoch": 9.13, + "grad_norm": 0.359375, + "learning_rate": 0.00039429024606621156, + "loss": 0.164, + "step": 220370 + }, + { + "epoch": 9.13, + "grad_norm": 0.84765625, + "learning_rate": 0.0003942813894147436, + "loss": 0.2113, + "step": 220380 + }, + { + "epoch": 9.13, + "grad_norm": 0.625, + "learning_rate": 0.0003942725324917536, + "loss": 0.2358, + "step": 220390 + }, + { + "epoch": 9.13, + "grad_norm": 0.5078125, + "learning_rate": 0.00039426367529725833, + "loss": 0.201, + "step": 220400 + }, + { + "epoch": 9.13, + "grad_norm": 0.41015625, + "learning_rate": 0.00039425481783127435, + "loss": 0.2374, + "step": 220410 + }, + { + "epoch": 9.13, + "grad_norm": 1.9453125, + "learning_rate": 0.0003942459600938184, + "loss": 0.1853, + "step": 220420 + }, + { + "epoch": 9.13, + "grad_norm": 0.416015625, + "learning_rate": 0.0003942371020849072, + "loss": 0.2989, + "step": 220430 + }, + { + "epoch": 9.13, + "grad_norm": 1.0546875, + "learning_rate": 0.0003942282438045571, + "loss": 0.1879, + "step": 220440 + }, + { + "epoch": 9.13, + "grad_norm": 0.478515625, + "learning_rate": 0.00039421938525278525, + "loss": 0.2146, + "step": 220450 + }, + { + "epoch": 9.13, + "grad_norm": 0.640625, + "learning_rate": 0.000394210526429608, + "loss": 0.2066, + "step": 220460 + }, + { + "epoch": 9.13, + "grad_norm": 0.5078125, + "learning_rate": 0.0003942016673350422, + "loss": 0.2005, + "step": 220470 + }, + { + "epoch": 9.13, + "grad_norm": 0.99609375, + "learning_rate": 0.0003941928079691044, + "loss": 0.1379, + "step": 220480 + }, + { + "epoch": 9.13, + "grad_norm": 3.109375, + "learning_rate": 0.00039418394833181124, + "loss": 0.2101, + "step": 220490 + }, + { + "epoch": 9.13, + "grad_norm": 0.373046875, + "learning_rate": 0.00039417508842317956, + "loss": 0.2042, + "step": 220500 + }, + { + "epoch": 9.13, + "grad_norm": 1.296875, + "learning_rate": 0.00039416622824322585, + "loss": 0.2062, + "step": 220510 + }, + { + "epoch": 9.13, + "grad_norm": 0.6015625, + "learning_rate": 0.00039415736779196687, + "loss": 0.2621, + "step": 220520 + }, + { + "epoch": 9.13, + "grad_norm": 1.0625, + "learning_rate": 0.0003941485070694194, + "loss": 0.1887, + "step": 220530 + }, + { + "epoch": 9.13, + "grad_norm": 1.453125, + "learning_rate": 0.00039413964607559987, + "loss": 0.1658, + "step": 220540 + }, + { + "epoch": 9.14, + "grad_norm": 0.58984375, + "learning_rate": 0.0003941307848105252, + "loss": 0.1927, + "step": 220550 + }, + { + "epoch": 9.14, + "grad_norm": 0.78515625, + "learning_rate": 0.0003941219232742119, + "loss": 0.2321, + "step": 220560 + }, + { + "epoch": 9.14, + "grad_norm": 1.2109375, + "learning_rate": 0.0003941130614666767, + "loss": 0.1992, + "step": 220570 + }, + { + "epoch": 9.14, + "grad_norm": 0.73046875, + "learning_rate": 0.0003941041993879363, + "loss": 0.1898, + "step": 220580 + }, + { + "epoch": 9.14, + "grad_norm": 1.1796875, + "learning_rate": 0.0003940953370380073, + "loss": 0.2119, + "step": 220590 + }, + { + "epoch": 9.14, + "grad_norm": 1.1328125, + "learning_rate": 0.00039408647441690646, + "loss": 0.1968, + "step": 220600 + }, + { + "epoch": 9.14, + "grad_norm": 1.0703125, + "learning_rate": 0.00039407761152465047, + "loss": 0.1925, + "step": 220610 + }, + { + "epoch": 9.14, + "grad_norm": 1.6484375, + "learning_rate": 0.000394068748361256, + "loss": 0.2369, + "step": 220620 + }, + { + "epoch": 9.14, + "grad_norm": 0.9921875, + "learning_rate": 0.00039405988492673973, + "loss": 0.2713, + "step": 220630 + }, + { + "epoch": 9.14, + "grad_norm": 0.46875, + "learning_rate": 0.00039405102122111826, + "loss": 0.202, + "step": 220640 + }, + { + "epoch": 9.14, + "grad_norm": 0.271484375, + "learning_rate": 0.0003940421572444083, + "loss": 0.2049, + "step": 220650 + }, + { + "epoch": 9.14, + "grad_norm": 0.62109375, + "learning_rate": 0.0003940332929966266, + "loss": 0.2787, + "step": 220660 + }, + { + "epoch": 9.14, + "grad_norm": 0.26953125, + "learning_rate": 0.0003940244284777897, + "loss": 0.2151, + "step": 220670 + }, + { + "epoch": 9.14, + "grad_norm": 0.388671875, + "learning_rate": 0.0003940155636879145, + "loss": 0.228, + "step": 220680 + }, + { + "epoch": 9.14, + "grad_norm": 0.83984375, + "learning_rate": 0.0003940066986270175, + "loss": 0.1985, + "step": 220690 + }, + { + "epoch": 9.14, + "grad_norm": 0.59765625, + "learning_rate": 0.0003939978332951154, + "loss": 0.2101, + "step": 220700 + }, + { + "epoch": 9.14, + "grad_norm": 0.83203125, + "learning_rate": 0.000393988967692225, + "loss": 0.2187, + "step": 220710 + }, + { + "epoch": 9.14, + "grad_norm": 0.3203125, + "learning_rate": 0.00039398010181836287, + "loss": 0.195, + "step": 220720 + }, + { + "epoch": 9.14, + "grad_norm": 0.439453125, + "learning_rate": 0.00039397123567354574, + "loss": 0.1982, + "step": 220730 + }, + { + "epoch": 9.14, + "grad_norm": 0.64453125, + "learning_rate": 0.0003939623692577904, + "loss": 0.1669, + "step": 220740 + }, + { + "epoch": 9.14, + "grad_norm": 0.62109375, + "learning_rate": 0.0003939535025711133, + "loss": 0.192, + "step": 220750 + }, + { + "epoch": 9.14, + "grad_norm": 0.69921875, + "learning_rate": 0.00039394463561353134, + "loss": 0.2103, + "step": 220760 + }, + { + "epoch": 9.14, + "grad_norm": 1.234375, + "learning_rate": 0.0003939357683850611, + "loss": 0.2536, + "step": 220770 + }, + { + "epoch": 9.14, + "grad_norm": 0.447265625, + "learning_rate": 0.0003939269008857193, + "loss": 0.1849, + "step": 220780 + }, + { + "epoch": 9.15, + "grad_norm": 0.5703125, + "learning_rate": 0.0003939180331155225, + "loss": 0.1868, + "step": 220790 + }, + { + "epoch": 9.15, + "grad_norm": 1.3671875, + "learning_rate": 0.0003939091650744876, + "loss": 0.2205, + "step": 220800 + }, + { + "epoch": 9.15, + "grad_norm": 0.27734375, + "learning_rate": 0.00039390029676263116, + "loss": 0.2057, + "step": 220810 + }, + { + "epoch": 9.15, + "grad_norm": 0.6328125, + "learning_rate": 0.00039389142817996994, + "loss": 0.1731, + "step": 220820 + }, + { + "epoch": 9.15, + "grad_norm": 0.61328125, + "learning_rate": 0.0003938825593265205, + "loss": 0.168, + "step": 220830 + }, + { + "epoch": 9.15, + "grad_norm": 0.90234375, + "learning_rate": 0.00039387369020229976, + "loss": 0.2057, + "step": 220840 + }, + { + "epoch": 9.15, + "grad_norm": 0.5546875, + "learning_rate": 0.00039386482080732424, + "loss": 0.2347, + "step": 220850 + }, + { + "epoch": 9.15, + "grad_norm": 1.1875, + "learning_rate": 0.00039385595114161054, + "loss": 0.2033, + "step": 220860 + }, + { + "epoch": 9.15, + "grad_norm": 0.54296875, + "learning_rate": 0.00039384708120517557, + "loss": 0.2154, + "step": 220870 + }, + { + "epoch": 9.15, + "grad_norm": 1.2265625, + "learning_rate": 0.0003938382109980359, + "loss": 0.2194, + "step": 220880 + }, + { + "epoch": 9.15, + "grad_norm": 0.48828125, + "learning_rate": 0.0003938293405202082, + "loss": 0.2283, + "step": 220890 + }, + { + "epoch": 9.15, + "grad_norm": 0.71875, + "learning_rate": 0.0003938204697717094, + "loss": 0.2068, + "step": 220900 + }, + { + "epoch": 9.15, + "grad_norm": 0.294921875, + "learning_rate": 0.0003938115987525558, + "loss": 0.2066, + "step": 220910 + }, + { + "epoch": 9.15, + "grad_norm": 0.4140625, + "learning_rate": 0.00039380272746276446, + "loss": 0.1863, + "step": 220920 + }, + { + "epoch": 9.15, + "grad_norm": 1.5, + "learning_rate": 0.00039379385590235184, + "loss": 0.2226, + "step": 220930 + }, + { + "epoch": 9.15, + "grad_norm": 0.5546875, + "learning_rate": 0.00039378498407133467, + "loss": 0.1675, + "step": 220940 + }, + { + "epoch": 9.15, + "grad_norm": 1.390625, + "learning_rate": 0.0003937761119697298, + "loss": 0.2243, + "step": 220950 + }, + { + "epoch": 9.15, + "grad_norm": 0.60546875, + "learning_rate": 0.00039376723959755366, + "loss": 0.2195, + "step": 220960 + }, + { + "epoch": 9.15, + "grad_norm": 1.0234375, + "learning_rate": 0.00039375836695482326, + "loss": 0.2275, + "step": 220970 + }, + { + "epoch": 9.15, + "grad_norm": 0.67578125, + "learning_rate": 0.00039374949404155505, + "loss": 0.163, + "step": 220980 + }, + { + "epoch": 9.15, + "grad_norm": 0.703125, + "learning_rate": 0.00039374062085776587, + "loss": 0.2085, + "step": 220990 + }, + { + "epoch": 9.15, + "grad_norm": 0.400390625, + "learning_rate": 0.0003937317474034723, + "loss": 0.2102, + "step": 221000 + }, + { + "epoch": 9.15, + "grad_norm": 0.89453125, + "learning_rate": 0.0003937228736786911, + "loss": 0.1918, + "step": 221010 + }, + { + "epoch": 9.15, + "grad_norm": 0.74609375, + "learning_rate": 0.000393713999683439, + "loss": 0.2273, + "step": 221020 + }, + { + "epoch": 9.16, + "grad_norm": 0.494140625, + "learning_rate": 0.00039370512541773273, + "loss": 0.1604, + "step": 221030 + }, + { + "epoch": 9.16, + "grad_norm": 0.5625, + "learning_rate": 0.00039369625088158885, + "loss": 0.1671, + "step": 221040 + }, + { + "epoch": 9.16, + "grad_norm": 0.65625, + "learning_rate": 0.00039368737607502414, + "loss": 0.19, + "step": 221050 + }, + { + "epoch": 9.16, + "grad_norm": 0.515625, + "learning_rate": 0.00039367850099805533, + "loss": 0.1702, + "step": 221060 + }, + { + "epoch": 9.16, + "grad_norm": 0.69140625, + "learning_rate": 0.00039366962565069914, + "loss": 0.2346, + "step": 221070 + }, + { + "epoch": 9.16, + "grad_norm": 0.921875, + "learning_rate": 0.0003936607500329722, + "loss": 0.1645, + "step": 221080 + }, + { + "epoch": 9.16, + "grad_norm": 1.0234375, + "learning_rate": 0.00039365187414489125, + "loss": 0.1646, + "step": 221090 + }, + { + "epoch": 9.16, + "grad_norm": 0.5703125, + "learning_rate": 0.00039364299798647296, + "loss": 0.2209, + "step": 221100 + }, + { + "epoch": 9.16, + "grad_norm": 2.421875, + "learning_rate": 0.00039363412155773406, + "loss": 0.2072, + "step": 221110 + }, + { + "epoch": 9.16, + "grad_norm": 1.3671875, + "learning_rate": 0.0003936252448586912, + "loss": 0.1852, + "step": 221120 + }, + { + "epoch": 9.16, + "grad_norm": 0.53125, + "learning_rate": 0.0003936163678893613, + "loss": 0.2245, + "step": 221130 + }, + { + "epoch": 9.16, + "grad_norm": 0.74609375, + "learning_rate": 0.00039360749064976076, + "loss": 0.1596, + "step": 221140 + }, + { + "epoch": 9.16, + "grad_norm": 1.2734375, + "learning_rate": 0.0003935986131399065, + "loss": 0.2084, + "step": 221150 + }, + { + "epoch": 9.16, + "grad_norm": 1.2890625, + "learning_rate": 0.0003935897353598151, + "loss": 0.2261, + "step": 221160 + }, + { + "epoch": 9.16, + "grad_norm": 0.73046875, + "learning_rate": 0.00039358085730950337, + "loss": 0.2327, + "step": 221170 + }, + { + "epoch": 9.16, + "grad_norm": 0.63671875, + "learning_rate": 0.000393571978988988, + "loss": 0.187, + "step": 221180 + }, + { + "epoch": 9.16, + "grad_norm": 0.9453125, + "learning_rate": 0.00039356310039828565, + "loss": 0.1877, + "step": 221190 + }, + { + "epoch": 9.16, + "grad_norm": 0.62890625, + "learning_rate": 0.000393554221537413, + "loss": 0.1775, + "step": 221200 + }, + { + "epoch": 9.16, + "grad_norm": 1.203125, + "learning_rate": 0.00039354534240638685, + "loss": 0.216, + "step": 221210 + }, + { + "epoch": 9.16, + "grad_norm": 0.91015625, + "learning_rate": 0.00039353646300522384, + "loss": 0.2233, + "step": 221220 + }, + { + "epoch": 9.16, + "grad_norm": 1.6953125, + "learning_rate": 0.00039352758333394074, + "loss": 0.2206, + "step": 221230 + }, + { + "epoch": 9.16, + "grad_norm": 0.5390625, + "learning_rate": 0.00039351870339255425, + "loss": 0.2265, + "step": 221240 + }, + { + "epoch": 9.16, + "grad_norm": 1.2578125, + "learning_rate": 0.000393509823181081, + "loss": 0.2028, + "step": 221250 + }, + { + "epoch": 9.16, + "grad_norm": 0.38671875, + "learning_rate": 0.00039350094269953776, + "loss": 0.2749, + "step": 221260 + }, + { + "epoch": 9.16, + "grad_norm": 0.796875, + "learning_rate": 0.00039349206194794125, + "loss": 0.1958, + "step": 221270 + }, + { + "epoch": 9.17, + "grad_norm": 0.546875, + "learning_rate": 0.0003934831809263082, + "loss": 0.177, + "step": 221280 + }, + { + "epoch": 9.17, + "grad_norm": 0.375, + "learning_rate": 0.0003934742996346553, + "loss": 0.1901, + "step": 221290 + }, + { + "epoch": 9.17, + "grad_norm": 0.71875, + "learning_rate": 0.00039346541807299925, + "loss": 0.1814, + "step": 221300 + }, + { + "epoch": 9.17, + "grad_norm": 0.466796875, + "learning_rate": 0.0003934565362413568, + "loss": 0.195, + "step": 221310 + }, + { + "epoch": 9.17, + "grad_norm": 0.00160980224609375, + "learning_rate": 0.0003934476541397446, + "loss": 0.1871, + "step": 221320 + }, + { + "epoch": 9.17, + "grad_norm": 0.63671875, + "learning_rate": 0.00039343877176817944, + "loss": 0.1607, + "step": 221330 + }, + { + "epoch": 9.17, + "grad_norm": 1.78125, + "learning_rate": 0.000393429889126678, + "loss": 0.1534, + "step": 221340 + }, + { + "epoch": 9.17, + "grad_norm": 0.291015625, + "learning_rate": 0.0003934210062152569, + "loss": 0.1684, + "step": 221350 + }, + { + "epoch": 9.17, + "grad_norm": 0.431640625, + "learning_rate": 0.000393412123033933, + "loss": 0.1863, + "step": 221360 + }, + { + "epoch": 9.17, + "grad_norm": 0.921875, + "learning_rate": 0.0003934032395827231, + "loss": 0.2013, + "step": 221370 + }, + { + "epoch": 9.17, + "grad_norm": 0.7109375, + "learning_rate": 0.00039339435586164363, + "loss": 0.1743, + "step": 221380 + }, + { + "epoch": 9.17, + "grad_norm": 1.1953125, + "learning_rate": 0.0003933854718707116, + "loss": 0.1846, + "step": 221390 + }, + { + "epoch": 9.17, + "grad_norm": 1.234375, + "learning_rate": 0.00039337658760994353, + "loss": 0.2266, + "step": 221400 + }, + { + "epoch": 9.17, + "grad_norm": 1.6875, + "learning_rate": 0.0003933677030793562, + "loss": 0.2526, + "step": 221410 + }, + { + "epoch": 9.17, + "grad_norm": 1.203125, + "learning_rate": 0.00039335881827896635, + "loss": 0.2567, + "step": 221420 + }, + { + "epoch": 9.17, + "grad_norm": 0.88671875, + "learning_rate": 0.00039334993320879066, + "loss": 0.1871, + "step": 221430 + }, + { + "epoch": 9.17, + "grad_norm": 0.53125, + "learning_rate": 0.00039334104786884584, + "loss": 0.2027, + "step": 221440 + }, + { + "epoch": 9.17, + "grad_norm": 1.7109375, + "learning_rate": 0.00039333216225914875, + "loss": 0.1628, + "step": 221450 + }, + { + "epoch": 9.17, + "grad_norm": 2.15625, + "learning_rate": 0.00039332327637971594, + "loss": 0.2019, + "step": 221460 + }, + { + "epoch": 9.17, + "grad_norm": 0.67578125, + "learning_rate": 0.00039331439023056425, + "loss": 0.2255, + "step": 221470 + }, + { + "epoch": 9.17, + "grad_norm": 0.353515625, + "learning_rate": 0.0003933055038117103, + "loss": 0.1938, + "step": 221480 + }, + { + "epoch": 9.17, + "grad_norm": 0.98046875, + "learning_rate": 0.0003932966171231709, + "loss": 0.1835, + "step": 221490 + }, + { + "epoch": 9.17, + "grad_norm": 0.419921875, + "learning_rate": 0.00039328773016496276, + "loss": 0.1817, + "step": 221500 + }, + { + "epoch": 9.17, + "grad_norm": 0.447265625, + "learning_rate": 0.0003932788429371025, + "loss": 0.226, + "step": 221510 + }, + { + "epoch": 9.18, + "grad_norm": 0.0, + "learning_rate": 0.000393269955439607, + "loss": 0.1967, + "step": 221520 + }, + { + "epoch": 9.18, + "grad_norm": 0.8984375, + "learning_rate": 0.00039326106767249294, + "loss": 0.1987, + "step": 221530 + }, + { + "epoch": 9.18, + "grad_norm": 0.84375, + "learning_rate": 0.000393252179635777, + "loss": 0.2295, + "step": 221540 + }, + { + "epoch": 9.18, + "grad_norm": 0.7109375, + "learning_rate": 0.0003932432913294759, + "loss": 0.2266, + "step": 221550 + }, + { + "epoch": 9.18, + "grad_norm": 0.859375, + "learning_rate": 0.00039323440275360635, + "loss": 0.1898, + "step": 221560 + }, + { + "epoch": 9.18, + "grad_norm": 0.3359375, + "learning_rate": 0.00039322551390818524, + "loss": 0.1978, + "step": 221570 + }, + { + "epoch": 9.18, + "grad_norm": 1.4921875, + "learning_rate": 0.0003932166247932292, + "loss": 0.1754, + "step": 221580 + }, + { + "epoch": 9.18, + "grad_norm": 0.474609375, + "learning_rate": 0.00039320773540875474, + "loss": 0.183, + "step": 221590 + }, + { + "epoch": 9.18, + "grad_norm": 0.431640625, + "learning_rate": 0.0003931988457547789, + "loss": 0.2376, + "step": 221600 + }, + { + "epoch": 9.18, + "grad_norm": 0.4921875, + "learning_rate": 0.00039318995583131834, + "loss": 0.1481, + "step": 221610 + }, + { + "epoch": 9.18, + "grad_norm": 0.80078125, + "learning_rate": 0.00039318106563838973, + "loss": 0.2372, + "step": 221620 + }, + { + "epoch": 9.18, + "grad_norm": 0.474609375, + "learning_rate": 0.0003931721751760098, + "loss": 0.2051, + "step": 221630 + }, + { + "epoch": 9.18, + "grad_norm": 0.484375, + "learning_rate": 0.0003931632844441952, + "loss": 0.1452, + "step": 221640 + }, + { + "epoch": 9.18, + "grad_norm": 1.140625, + "learning_rate": 0.0003931543934429629, + "loss": 0.2429, + "step": 221650 + }, + { + "epoch": 9.18, + "grad_norm": 0.330078125, + "learning_rate": 0.00039314550217232945, + "loss": 0.2171, + "step": 221660 + }, + { + "epoch": 9.18, + "grad_norm": 0.66796875, + "learning_rate": 0.0003931366106323116, + "loss": 0.2295, + "step": 221670 + }, + { + "epoch": 9.18, + "grad_norm": 0.515625, + "learning_rate": 0.0003931277188229262, + "loss": 0.1372, + "step": 221680 + }, + { + "epoch": 9.18, + "grad_norm": 0.921875, + "learning_rate": 0.00039311882674418974, + "loss": 0.144, + "step": 221690 + }, + { + "epoch": 9.18, + "grad_norm": 0.75, + "learning_rate": 0.0003931099343961192, + "loss": 0.1784, + "step": 221700 + }, + { + "epoch": 9.18, + "grad_norm": 1.28125, + "learning_rate": 0.00039310104177873117, + "loss": 0.2008, + "step": 221710 + }, + { + "epoch": 9.18, + "grad_norm": 0.70703125, + "learning_rate": 0.00039309214889204246, + "loss": 0.1629, + "step": 221720 + }, + { + "epoch": 9.18, + "grad_norm": 0.416015625, + "learning_rate": 0.00039308325573606976, + "loss": 0.1518, + "step": 221730 + }, + { + "epoch": 9.18, + "grad_norm": 0.8046875, + "learning_rate": 0.0003930743623108299, + "loss": 0.1847, + "step": 221740 + }, + { + "epoch": 9.18, + "grad_norm": 1.09375, + "learning_rate": 0.0003930654686163395, + "loss": 0.1858, + "step": 221750 + }, + { + "epoch": 9.19, + "grad_norm": 0.515625, + "learning_rate": 0.0003930565746526153, + "loss": 0.1785, + "step": 221760 + }, + { + "epoch": 9.19, + "grad_norm": 0.58984375, + "learning_rate": 0.0003930476804196741, + "loss": 0.1761, + "step": 221770 + }, + { + "epoch": 9.19, + "grad_norm": 0.703125, + "learning_rate": 0.0003930387859175326, + "loss": 0.1763, + "step": 221780 + }, + { + "epoch": 9.19, + "grad_norm": 0.6328125, + "learning_rate": 0.0003930298911462076, + "loss": 0.1907, + "step": 221790 + }, + { + "epoch": 9.19, + "grad_norm": 0.73828125, + "learning_rate": 0.00039302099610571565, + "loss": 0.1863, + "step": 221800 + }, + { + "epoch": 9.19, + "grad_norm": 0.69921875, + "learning_rate": 0.00039301210079607373, + "loss": 0.1756, + "step": 221810 + }, + { + "epoch": 9.19, + "grad_norm": 1.046875, + "learning_rate": 0.0003930032052172985, + "loss": 0.143, + "step": 221820 + }, + { + "epoch": 9.19, + "grad_norm": 1.9140625, + "learning_rate": 0.0003929943093694067, + "loss": 0.1781, + "step": 221830 + }, + { + "epoch": 9.19, + "grad_norm": 1.65625, + "learning_rate": 0.000392985413252415, + "loss": 0.2149, + "step": 221840 + }, + { + "epoch": 9.19, + "grad_norm": 0.953125, + "learning_rate": 0.00039297651686634016, + "loss": 0.1844, + "step": 221850 + }, + { + "epoch": 9.19, + "grad_norm": 0.65234375, + "learning_rate": 0.000392967620211199, + "loss": 0.1767, + "step": 221860 + }, + { + "epoch": 9.19, + "grad_norm": 0.478515625, + "learning_rate": 0.00039295872328700825, + "loss": 0.21, + "step": 221870 + }, + { + "epoch": 9.19, + "grad_norm": 0.6875, + "learning_rate": 0.0003929498260937846, + "loss": 0.1628, + "step": 221880 + }, + { + "epoch": 9.19, + "grad_norm": 0.984375, + "learning_rate": 0.0003929409286315448, + "loss": 0.1878, + "step": 221890 + }, + { + "epoch": 9.19, + "grad_norm": 1.3359375, + "learning_rate": 0.0003929320309003056, + "loss": 0.2189, + "step": 221900 + }, + { + "epoch": 9.19, + "grad_norm": 0.61328125, + "learning_rate": 0.0003929231329000837, + "loss": 0.2647, + "step": 221910 + }, + { + "epoch": 9.19, + "grad_norm": 0.5625, + "learning_rate": 0.00039291423463089605, + "loss": 0.222, + "step": 221920 + }, + { + "epoch": 9.19, + "grad_norm": 0.41796875, + "learning_rate": 0.0003929053360927591, + "loss": 0.1982, + "step": 221930 + }, + { + "epoch": 9.19, + "grad_norm": 0.96484375, + "learning_rate": 0.00039289643728568986, + "loss": 0.1767, + "step": 221940 + }, + { + "epoch": 9.19, + "grad_norm": 0.2001953125, + "learning_rate": 0.0003928875382097049, + "loss": 0.1978, + "step": 221950 + }, + { + "epoch": 9.19, + "grad_norm": 0.5546875, + "learning_rate": 0.000392878638864821, + "loss": 0.1752, + "step": 221960 + }, + { + "epoch": 9.19, + "grad_norm": 0.48828125, + "learning_rate": 0.00039286973925105497, + "loss": 0.2015, + "step": 221970 + }, + { + "epoch": 9.19, + "grad_norm": 0.58203125, + "learning_rate": 0.0003928608393684235, + "loss": 0.1438, + "step": 221980 + }, + { + "epoch": 9.19, + "grad_norm": 0.76171875, + "learning_rate": 0.00039285193921694336, + "loss": 0.2215, + "step": 221990 + }, + { + "epoch": 9.2, + "grad_norm": 0.71875, + "learning_rate": 0.0003928430387966313, + "loss": 0.1644, + "step": 222000 + }, + { + "epoch": 9.2, + "grad_norm": 0.98828125, + "learning_rate": 0.000392834138107504, + "loss": 0.2382, + "step": 222010 + }, + { + "epoch": 9.2, + "grad_norm": 0.43359375, + "learning_rate": 0.0003928252371495784, + "loss": 0.157, + "step": 222020 + }, + { + "epoch": 9.2, + "grad_norm": 0.94921875, + "learning_rate": 0.00039281633592287105, + "loss": 0.1584, + "step": 222030 + }, + { + "epoch": 9.2, + "grad_norm": 0.93359375, + "learning_rate": 0.00039280743442739874, + "loss": 0.1606, + "step": 222040 + }, + { + "epoch": 9.2, + "grad_norm": 0.5390625, + "learning_rate": 0.00039279853266317835, + "loss": 0.2199, + "step": 222050 + }, + { + "epoch": 9.2, + "grad_norm": 0.47265625, + "learning_rate": 0.00039278963063022653, + "loss": 0.2082, + "step": 222060 + }, + { + "epoch": 9.2, + "grad_norm": 0.80859375, + "learning_rate": 0.00039278072832856, + "loss": 0.2089, + "step": 222070 + }, + { + "epoch": 9.2, + "grad_norm": 0.1943359375, + "learning_rate": 0.0003927718257581956, + "loss": 0.1826, + "step": 222080 + }, + { + "epoch": 9.2, + "grad_norm": 0.48828125, + "learning_rate": 0.00039276292291915007, + "loss": 0.1837, + "step": 222090 + }, + { + "epoch": 9.2, + "grad_norm": 1.1640625, + "learning_rate": 0.00039275401981144, + "loss": 0.2169, + "step": 222100 + }, + { + "epoch": 9.2, + "grad_norm": 2.3125, + "learning_rate": 0.0003927451164350824, + "loss": 0.207, + "step": 222110 + }, + { + "epoch": 9.2, + "grad_norm": 1.1640625, + "learning_rate": 0.00039273621279009384, + "loss": 0.2065, + "step": 222120 + }, + { + "epoch": 9.2, + "grad_norm": 0.78125, + "learning_rate": 0.0003927273088764912, + "loss": 0.1935, + "step": 222130 + }, + { + "epoch": 9.2, + "grad_norm": 0.6328125, + "learning_rate": 0.00039271840469429117, + "loss": 0.1534, + "step": 222140 + }, + { + "epoch": 9.2, + "grad_norm": 0.83984375, + "learning_rate": 0.0003927095002435105, + "loss": 0.2423, + "step": 222150 + }, + { + "epoch": 9.2, + "grad_norm": 0.6875, + "learning_rate": 0.000392700595524166, + "loss": 0.1853, + "step": 222160 + }, + { + "epoch": 9.2, + "grad_norm": 1.0546875, + "learning_rate": 0.0003926916905362743, + "loss": 0.1561, + "step": 222170 + }, + { + "epoch": 9.2, + "grad_norm": 1.5625, + "learning_rate": 0.00039268278527985237, + "loss": 0.1664, + "step": 222180 + }, + { + "epoch": 9.2, + "grad_norm": 0.6953125, + "learning_rate": 0.0003926738797549167, + "loss": 0.2005, + "step": 222190 + }, + { + "epoch": 9.2, + "grad_norm": 0.6484375, + "learning_rate": 0.00039266497396148425, + "loss": 0.1945, + "step": 222200 + }, + { + "epoch": 9.2, + "grad_norm": 1.8671875, + "learning_rate": 0.0003926560678995718, + "loss": 0.2052, + "step": 222210 + }, + { + "epoch": 9.2, + "grad_norm": 0.98046875, + "learning_rate": 0.000392647161569196, + "loss": 0.1727, + "step": 222220 + }, + { + "epoch": 9.2, + "grad_norm": 1.0078125, + "learning_rate": 0.0003926382549703736, + "loss": 0.1989, + "step": 222230 + }, + { + "epoch": 9.21, + "grad_norm": 0.84375, + "learning_rate": 0.0003926293481031214, + "loss": 0.1895, + "step": 222240 + }, + { + "epoch": 9.21, + "grad_norm": 0.89453125, + "learning_rate": 0.0003926204409674562, + "loss": 0.257, + "step": 222250 + }, + { + "epoch": 9.21, + "grad_norm": 1.28125, + "learning_rate": 0.0003926115335633948, + "loss": 0.2007, + "step": 222260 + }, + { + "epoch": 9.21, + "grad_norm": 0.91796875, + "learning_rate": 0.00039260262589095374, + "loss": 0.2331, + "step": 222270 + }, + { + "epoch": 9.21, + "grad_norm": 0.859375, + "learning_rate": 0.00039259371795015007, + "loss": 0.1844, + "step": 222280 + }, + { + "epoch": 9.21, + "grad_norm": 0.546875, + "learning_rate": 0.0003925848097410004, + "loss": 0.1764, + "step": 222290 + }, + { + "epoch": 9.21, + "grad_norm": 1.1640625, + "learning_rate": 0.0003925759012635215, + "loss": 0.1692, + "step": 222300 + }, + { + "epoch": 9.21, + "grad_norm": 0.625, + "learning_rate": 0.0003925669925177302, + "loss": 0.2219, + "step": 222310 + }, + { + "epoch": 9.21, + "grad_norm": 1.0078125, + "learning_rate": 0.00039255808350364306, + "loss": 0.2292, + "step": 222320 + }, + { + "epoch": 9.21, + "grad_norm": 0.86328125, + "learning_rate": 0.0003925491742212772, + "loss": 0.2088, + "step": 222330 + }, + { + "epoch": 9.21, + "grad_norm": 0.69140625, + "learning_rate": 0.0003925402646706491, + "loss": 0.2093, + "step": 222340 + }, + { + "epoch": 9.21, + "grad_norm": 0.64453125, + "learning_rate": 0.00039253135485177557, + "loss": 0.1804, + "step": 222350 + }, + { + "epoch": 9.21, + "grad_norm": 2.3125, + "learning_rate": 0.00039252244476467346, + "loss": 0.2171, + "step": 222360 + }, + { + "epoch": 9.21, + "grad_norm": 1.078125, + "learning_rate": 0.00039251353440935947, + "loss": 0.1972, + "step": 222370 + }, + { + "epoch": 9.21, + "grad_norm": 0.2177734375, + "learning_rate": 0.00039250462378585046, + "loss": 0.1602, + "step": 222380 + }, + { + "epoch": 9.21, + "grad_norm": 0.71875, + "learning_rate": 0.00039249571289416306, + "loss": 0.199, + "step": 222390 + }, + { + "epoch": 9.21, + "grad_norm": 0.83984375, + "learning_rate": 0.0003924868017343142, + "loss": 0.207, + "step": 222400 + }, + { + "epoch": 9.21, + "grad_norm": 1.15625, + "learning_rate": 0.00039247789030632055, + "loss": 0.2096, + "step": 222410 + }, + { + "epoch": 9.21, + "grad_norm": 0.91796875, + "learning_rate": 0.0003924689786101989, + "loss": 0.2077, + "step": 222420 + }, + { + "epoch": 9.21, + "grad_norm": 0.484375, + "learning_rate": 0.00039246006664596595, + "loss": 0.2214, + "step": 222430 + }, + { + "epoch": 9.21, + "grad_norm": 1.1171875, + "learning_rate": 0.00039245115441363855, + "loss": 0.2112, + "step": 222440 + }, + { + "epoch": 9.21, + "grad_norm": 0.78515625, + "learning_rate": 0.00039244224191323354, + "loss": 0.2598, + "step": 222450 + }, + { + "epoch": 9.21, + "grad_norm": 0.640625, + "learning_rate": 0.00039243332914476757, + "loss": 0.1598, + "step": 222460 + }, + { + "epoch": 9.21, + "grad_norm": 0.70703125, + "learning_rate": 0.0003924244161082574, + "loss": 0.2316, + "step": 222470 + }, + { + "epoch": 9.22, + "grad_norm": 0.21875, + "learning_rate": 0.00039241550280371995, + "loss": 0.1818, + "step": 222480 + }, + { + "epoch": 9.22, + "grad_norm": 0.71484375, + "learning_rate": 0.0003924065892311719, + "loss": 0.1912, + "step": 222490 + }, + { + "epoch": 9.22, + "grad_norm": 1.9453125, + "learning_rate": 0.00039239767539063, + "loss": 0.2131, + "step": 222500 + }, + { + "epoch": 9.22, + "grad_norm": 0.5078125, + "learning_rate": 0.000392388761282111, + "loss": 0.2011, + "step": 222510 + }, + { + "epoch": 9.22, + "grad_norm": 0.373046875, + "learning_rate": 0.00039237984690563177, + "loss": 0.2355, + "step": 222520 + }, + { + "epoch": 9.22, + "grad_norm": 0.33984375, + "learning_rate": 0.000392370932261209, + "loss": 0.1773, + "step": 222530 + }, + { + "epoch": 9.22, + "grad_norm": 0.51171875, + "learning_rate": 0.00039236201734885965, + "loss": 0.2235, + "step": 222540 + }, + { + "epoch": 9.22, + "grad_norm": 0.76171875, + "learning_rate": 0.0003923531021686002, + "loss": 0.2132, + "step": 222550 + }, + { + "epoch": 9.22, + "grad_norm": 0.2431640625, + "learning_rate": 0.0003923441867204477, + "loss": 0.1814, + "step": 222560 + }, + { + "epoch": 9.22, + "grad_norm": 0.8125, + "learning_rate": 0.0003923352710044187, + "loss": 0.2196, + "step": 222570 + }, + { + "epoch": 9.22, + "grad_norm": 0.74609375, + "learning_rate": 0.00039232635502053014, + "loss": 0.1988, + "step": 222580 + }, + { + "epoch": 9.22, + "grad_norm": 0.5703125, + "learning_rate": 0.0003923174387687988, + "loss": 0.1909, + "step": 222590 + }, + { + "epoch": 9.22, + "grad_norm": 0.25, + "learning_rate": 0.0003923085222492414, + "loss": 0.1545, + "step": 222600 + }, + { + "epoch": 9.22, + "grad_norm": 1.03125, + "learning_rate": 0.00039229960546187473, + "loss": 0.233, + "step": 222610 + }, + { + "epoch": 9.22, + "grad_norm": 1.859375, + "learning_rate": 0.0003922906884067155, + "loss": 0.2529, + "step": 222620 + }, + { + "epoch": 9.22, + "grad_norm": 2.03125, + "learning_rate": 0.0003922817710837806, + "loss": 0.2009, + "step": 222630 + }, + { + "epoch": 9.22, + "grad_norm": 0.6953125, + "learning_rate": 0.00039227285349308674, + "loss": 0.2036, + "step": 222640 + }, + { + "epoch": 9.22, + "grad_norm": 1.8984375, + "learning_rate": 0.0003922639356346508, + "loss": 0.176, + "step": 222650 + }, + { + "epoch": 9.22, + "grad_norm": 0.625, + "learning_rate": 0.00039225501750848944, + "loss": 0.1978, + "step": 222660 + }, + { + "epoch": 9.22, + "grad_norm": 0.86328125, + "learning_rate": 0.0003922460991146195, + "loss": 0.2034, + "step": 222670 + }, + { + "epoch": 9.22, + "grad_norm": 0.640625, + "learning_rate": 0.00039223718045305786, + "loss": 0.193, + "step": 222680 + }, + { + "epoch": 9.22, + "grad_norm": 0.2470703125, + "learning_rate": 0.0003922282615238211, + "loss": 0.1912, + "step": 222690 + }, + { + "epoch": 9.22, + "grad_norm": 0.58984375, + "learning_rate": 0.00039221934232692616, + "loss": 0.2066, + "step": 222700 + }, + { + "epoch": 9.22, + "grad_norm": 1.0859375, + "learning_rate": 0.00039221042286238973, + "loss": 0.1986, + "step": 222710 + }, + { + "epoch": 9.23, + "grad_norm": 1.09375, + "learning_rate": 0.0003922015031302286, + "loss": 0.2285, + "step": 222720 + }, + { + "epoch": 9.23, + "grad_norm": 0.59765625, + "learning_rate": 0.00039219258313045975, + "loss": 0.2019, + "step": 222730 + }, + { + "epoch": 9.23, + "grad_norm": 0.671875, + "learning_rate": 0.0003921836628630997, + "loss": 0.1658, + "step": 222740 + }, + { + "epoch": 9.23, + "grad_norm": 0.427734375, + "learning_rate": 0.0003921747423281654, + "loss": 0.1863, + "step": 222750 + }, + { + "epoch": 9.23, + "grad_norm": 2.75, + "learning_rate": 0.0003921658215256736, + "loss": 0.2091, + "step": 222760 + }, + { + "epoch": 9.23, + "grad_norm": 0.68359375, + "learning_rate": 0.000392156900455641, + "loss": 0.1908, + "step": 222770 + }, + { + "epoch": 9.23, + "grad_norm": 1.0390625, + "learning_rate": 0.00039214797911808454, + "loss": 0.2645, + "step": 222780 + }, + { + "epoch": 9.23, + "grad_norm": 1.3671875, + "learning_rate": 0.0003921390575130209, + "loss": 0.2033, + "step": 222790 + }, + { + "epoch": 9.23, + "grad_norm": 0.67578125, + "learning_rate": 0.0003921301356404669, + "loss": 0.2189, + "step": 222800 + }, + { + "epoch": 9.23, + "grad_norm": 1.34375, + "learning_rate": 0.0003921212135004394, + "loss": 0.2161, + "step": 222810 + }, + { + "epoch": 9.23, + "grad_norm": 0.59765625, + "learning_rate": 0.000392112291092955, + "loss": 0.2192, + "step": 222820 + }, + { + "epoch": 9.23, + "grad_norm": 0.53125, + "learning_rate": 0.0003921033684180307, + "loss": 0.2173, + "step": 222830 + }, + { + "epoch": 9.23, + "grad_norm": 0.76171875, + "learning_rate": 0.0003920944454756832, + "loss": 0.187, + "step": 222840 + }, + { + "epoch": 9.23, + "grad_norm": 0.4765625, + "learning_rate": 0.0003920855222659293, + "loss": 0.1969, + "step": 222850 + }, + { + "epoch": 9.23, + "grad_norm": 0.462890625, + "learning_rate": 0.00039207659878878574, + "loss": 0.1712, + "step": 222860 + }, + { + "epoch": 9.23, + "grad_norm": 0.150390625, + "learning_rate": 0.0003920676750442694, + "loss": 0.1932, + "step": 222870 + }, + { + "epoch": 9.23, + "grad_norm": 0.62890625, + "learning_rate": 0.00039205875103239706, + "loss": 0.1726, + "step": 222880 + }, + { + "epoch": 9.23, + "grad_norm": 1.2421875, + "learning_rate": 0.00039204982675318544, + "loss": 0.1537, + "step": 222890 + }, + { + "epoch": 9.23, + "grad_norm": 0.85546875, + "learning_rate": 0.0003920409022066515, + "loss": 0.1678, + "step": 222900 + }, + { + "epoch": 9.23, + "grad_norm": 0.62109375, + "learning_rate": 0.0003920319773928118, + "loss": 0.2206, + "step": 222910 + }, + { + "epoch": 9.23, + "grad_norm": 0.953125, + "learning_rate": 0.00039202305231168333, + "loss": 0.2109, + "step": 222920 + }, + { + "epoch": 9.23, + "grad_norm": 0.84375, + "learning_rate": 0.00039201412696328276, + "loss": 0.195, + "step": 222930 + }, + { + "epoch": 9.23, + "grad_norm": 0.828125, + "learning_rate": 0.00039200520134762695, + "loss": 0.1994, + "step": 222940 + }, + { + "epoch": 9.23, + "grad_norm": 0.4453125, + "learning_rate": 0.00039199627546473265, + "loss": 0.162, + "step": 222950 + }, + { + "epoch": 9.23, + "grad_norm": 0.6953125, + "learning_rate": 0.0003919873493146168, + "loss": 0.2095, + "step": 222960 + }, + { + "epoch": 9.24, + "grad_norm": 0.58984375, + "learning_rate": 0.00039197842289729603, + "loss": 0.1563, + "step": 222970 + }, + { + "epoch": 9.24, + "grad_norm": 0.765625, + "learning_rate": 0.00039196949621278716, + "loss": 0.1606, + "step": 222980 + }, + { + "epoch": 9.24, + "grad_norm": 0.392578125, + "learning_rate": 0.00039196056926110707, + "loss": 0.1915, + "step": 222990 + }, + { + "epoch": 9.24, + "grad_norm": 0.90625, + "learning_rate": 0.0003919516420422725, + "loss": 0.2602, + "step": 223000 + }, + { + "epoch": 9.24, + "grad_norm": 0.6640625, + "learning_rate": 0.0003919427145563003, + "loss": 0.1901, + "step": 223010 + }, + { + "epoch": 9.24, + "grad_norm": 0.671875, + "learning_rate": 0.00039193378680320724, + "loss": 0.1794, + "step": 223020 + }, + { + "epoch": 9.24, + "grad_norm": 1.1171875, + "learning_rate": 0.0003919248587830101, + "loss": 0.1812, + "step": 223030 + }, + { + "epoch": 9.24, + "grad_norm": 0.466796875, + "learning_rate": 0.0003919159304957257, + "loss": 0.2225, + "step": 223040 + }, + { + "epoch": 9.24, + "grad_norm": 0.84765625, + "learning_rate": 0.0003919070019413709, + "loss": 0.2206, + "step": 223050 + }, + { + "epoch": 9.24, + "grad_norm": 0.478515625, + "learning_rate": 0.0003918980731199624, + "loss": 0.2476, + "step": 223060 + }, + { + "epoch": 9.24, + "grad_norm": 0.87890625, + "learning_rate": 0.000391889144031517, + "loss": 0.1912, + "step": 223070 + }, + { + "epoch": 9.24, + "grad_norm": 0.5234375, + "learning_rate": 0.0003918802146760516, + "loss": 0.2306, + "step": 223080 + }, + { + "epoch": 9.24, + "grad_norm": 0.462890625, + "learning_rate": 0.00039187128505358297, + "loss": 0.1767, + "step": 223090 + }, + { + "epoch": 9.24, + "grad_norm": 0.90234375, + "learning_rate": 0.00039186235516412796, + "loss": 0.168, + "step": 223100 + }, + { + "epoch": 9.24, + "grad_norm": 0.41015625, + "learning_rate": 0.00039185342500770313, + "loss": 0.2126, + "step": 223110 + }, + { + "epoch": 9.24, + "grad_norm": 0.9296875, + "learning_rate": 0.0003918444945843257, + "loss": 0.212, + "step": 223120 + }, + { + "epoch": 9.24, + "grad_norm": 0.5, + "learning_rate": 0.0003918355638940121, + "loss": 0.1858, + "step": 223130 + }, + { + "epoch": 9.24, + "grad_norm": 0.490234375, + "learning_rate": 0.00039182663293677933, + "loss": 0.2019, + "step": 223140 + }, + { + "epoch": 9.24, + "grad_norm": 0.70703125, + "learning_rate": 0.0003918177017126442, + "loss": 0.2223, + "step": 223150 + }, + { + "epoch": 9.24, + "grad_norm": 0.40625, + "learning_rate": 0.00039180877022162335, + "loss": 0.2257, + "step": 223160 + }, + { + "epoch": 9.24, + "grad_norm": 1.203125, + "learning_rate": 0.00039179983846373385, + "loss": 0.2012, + "step": 223170 + }, + { + "epoch": 9.24, + "grad_norm": 0.390625, + "learning_rate": 0.00039179090643899227, + "loss": 0.1586, + "step": 223180 + }, + { + "epoch": 9.24, + "grad_norm": 1.1875, + "learning_rate": 0.0003917819741474156, + "loss": 0.2115, + "step": 223190 + }, + { + "epoch": 9.24, + "grad_norm": 1.703125, + "learning_rate": 0.0003917730415890205, + "loss": 0.2038, + "step": 223200 + }, + { + "epoch": 9.25, + "grad_norm": 1.9921875, + "learning_rate": 0.00039176410876382386, + "loss": 0.2331, + "step": 223210 + }, + { + "epoch": 9.25, + "grad_norm": 1.203125, + "learning_rate": 0.00039175517567184245, + "loss": 0.2069, + "step": 223220 + }, + { + "epoch": 9.25, + "grad_norm": 0.7109375, + "learning_rate": 0.00039174624231309315, + "loss": 0.2267, + "step": 223230 + }, + { + "epoch": 9.25, + "grad_norm": 1.84375, + "learning_rate": 0.00039173730868759265, + "loss": 0.2008, + "step": 223240 + }, + { + "epoch": 9.25, + "grad_norm": 0.2890625, + "learning_rate": 0.00039172837479535793, + "loss": 0.1923, + "step": 223250 + }, + { + "epoch": 9.25, + "grad_norm": 0.326171875, + "learning_rate": 0.00039171944063640573, + "loss": 0.1934, + "step": 223260 + }, + { + "epoch": 9.25, + "grad_norm": 0.357421875, + "learning_rate": 0.00039171050621075276, + "loss": 0.188, + "step": 223270 + }, + { + "epoch": 9.25, + "grad_norm": 0.30078125, + "learning_rate": 0.000391701571518416, + "loss": 0.2177, + "step": 223280 + }, + { + "epoch": 9.25, + "grad_norm": 0.66796875, + "learning_rate": 0.00039169263655941213, + "loss": 0.1848, + "step": 223290 + }, + { + "epoch": 9.25, + "grad_norm": 0.400390625, + "learning_rate": 0.00039168370133375803, + "loss": 0.2239, + "step": 223300 + }, + { + "epoch": 9.25, + "grad_norm": 0.404296875, + "learning_rate": 0.00039167476584147054, + "loss": 0.1857, + "step": 223310 + }, + { + "epoch": 9.25, + "grad_norm": 0.470703125, + "learning_rate": 0.00039166583008256635, + "loss": 0.1917, + "step": 223320 + }, + { + "epoch": 9.25, + "grad_norm": 1.359375, + "learning_rate": 0.0003916568940570624, + "loss": 0.1879, + "step": 223330 + }, + { + "epoch": 9.25, + "grad_norm": 0.8671875, + "learning_rate": 0.0003916479577649755, + "loss": 0.205, + "step": 223340 + }, + { + "epoch": 9.25, + "grad_norm": 0.7421875, + "learning_rate": 0.00039163902120632245, + "loss": 0.1742, + "step": 223350 + }, + { + "epoch": 9.25, + "grad_norm": 0.369140625, + "learning_rate": 0.0003916300843811201, + "loss": 0.176, + "step": 223360 + }, + { + "epoch": 9.25, + "grad_norm": 1.5234375, + "learning_rate": 0.00039162114728938515, + "loss": 0.2456, + "step": 223370 + }, + { + "epoch": 9.25, + "grad_norm": 0.72265625, + "learning_rate": 0.00039161220993113455, + "loss": 0.1875, + "step": 223380 + }, + { + "epoch": 9.25, + "grad_norm": 1.25, + "learning_rate": 0.00039160327230638504, + "loss": 0.2063, + "step": 223390 + }, + { + "epoch": 9.25, + "grad_norm": 0.4921875, + "learning_rate": 0.0003915943344151534, + "loss": 0.1938, + "step": 223400 + }, + { + "epoch": 9.25, + "grad_norm": 1.734375, + "learning_rate": 0.0003915853962574566, + "loss": 0.1915, + "step": 223410 + }, + { + "epoch": 9.25, + "grad_norm": 0.97265625, + "learning_rate": 0.0003915764578333113, + "loss": 0.2019, + "step": 223420 + }, + { + "epoch": 9.25, + "grad_norm": 1.5078125, + "learning_rate": 0.0003915675191427345, + "loss": 0.2466, + "step": 223430 + }, + { + "epoch": 9.25, + "grad_norm": 0.76953125, + "learning_rate": 0.00039155858018574284, + "loss": 0.2377, + "step": 223440 + }, + { + "epoch": 9.26, + "grad_norm": 0.9140625, + "learning_rate": 0.0003915496409623532, + "loss": 0.1667, + "step": 223450 + }, + { + "epoch": 9.26, + "grad_norm": 0.77734375, + "learning_rate": 0.0003915407014725825, + "loss": 0.1656, + "step": 223460 + }, + { + "epoch": 9.26, + "grad_norm": 1.7890625, + "learning_rate": 0.00039153176171644743, + "loss": 0.2329, + "step": 223470 + }, + { + "epoch": 9.26, + "grad_norm": 0.546875, + "learning_rate": 0.00039152282169396493, + "loss": 0.1846, + "step": 223480 + }, + { + "epoch": 9.26, + "grad_norm": 0.490234375, + "learning_rate": 0.00039151388140515163, + "loss": 0.1347, + "step": 223490 + }, + { + "epoch": 9.26, + "grad_norm": 0.88671875, + "learning_rate": 0.00039150494085002466, + "loss": 0.2263, + "step": 223500 + }, + { + "epoch": 9.26, + "grad_norm": 0.83984375, + "learning_rate": 0.0003914960000286005, + "loss": 0.1794, + "step": 223510 + }, + { + "epoch": 9.26, + "grad_norm": 1.1953125, + "learning_rate": 0.0003914870589408963, + "loss": 0.1948, + "step": 223520 + }, + { + "epoch": 9.26, + "grad_norm": 0.71875, + "learning_rate": 0.00039147811758692865, + "loss": 0.1887, + "step": 223530 + }, + { + "epoch": 9.26, + "grad_norm": 0.92578125, + "learning_rate": 0.0003914691759667145, + "loss": 0.1574, + "step": 223540 + }, + { + "epoch": 9.26, + "grad_norm": 0.6484375, + "learning_rate": 0.0003914602340802705, + "loss": 0.1454, + "step": 223550 + }, + { + "epoch": 9.26, + "grad_norm": 0.88671875, + "learning_rate": 0.0003914512919276138, + "loss": 0.2248, + "step": 223560 + }, + { + "epoch": 9.26, + "grad_norm": 0.69921875, + "learning_rate": 0.000391442349508761, + "loss": 0.231, + "step": 223570 + }, + { + "epoch": 9.26, + "grad_norm": 1.3515625, + "learning_rate": 0.00039143340682372895, + "loss": 0.2345, + "step": 223580 + }, + { + "epoch": 9.26, + "grad_norm": 1.125, + "learning_rate": 0.0003914244638725345, + "loss": 0.2353, + "step": 223590 + }, + { + "epoch": 9.26, + "grad_norm": 0.94921875, + "learning_rate": 0.00039141552065519446, + "loss": 0.1746, + "step": 223600 + }, + { + "epoch": 9.26, + "grad_norm": 0.62890625, + "learning_rate": 0.00039140657717172574, + "loss": 0.1676, + "step": 223610 + }, + { + "epoch": 9.26, + "grad_norm": 0.30078125, + "learning_rate": 0.00039139763342214506, + "loss": 0.2506, + "step": 223620 + }, + { + "epoch": 9.26, + "grad_norm": 1.03125, + "learning_rate": 0.0003913886894064693, + "loss": 0.2277, + "step": 223630 + }, + { + "epoch": 9.26, + "grad_norm": 1.2265625, + "learning_rate": 0.0003913797451247153, + "loss": 0.2299, + "step": 223640 + }, + { + "epoch": 9.26, + "grad_norm": 1.09375, + "learning_rate": 0.00039137080057689995, + "loss": 0.1811, + "step": 223650 + }, + { + "epoch": 9.26, + "grad_norm": 0.984375, + "learning_rate": 0.00039136185576304, + "loss": 0.1854, + "step": 223660 + }, + { + "epoch": 9.26, + "grad_norm": 0.53125, + "learning_rate": 0.00039135291068315224, + "loss": 0.2121, + "step": 223670 + }, + { + "epoch": 9.26, + "grad_norm": 0.470703125, + "learning_rate": 0.0003913439653372536, + "loss": 0.2107, + "step": 223680 + }, + { + "epoch": 9.27, + "grad_norm": 0.79296875, + "learning_rate": 0.0003913350197253609, + "loss": 0.2148, + "step": 223690 + }, + { + "epoch": 9.27, + "grad_norm": 0.8828125, + "learning_rate": 0.00039132607384749095, + "loss": 0.2162, + "step": 223700 + }, + { + "epoch": 9.27, + "grad_norm": 1.0625, + "learning_rate": 0.00039131712770366055, + "loss": 0.187, + "step": 223710 + }, + { + "epoch": 9.27, + "grad_norm": 0.376953125, + "learning_rate": 0.0003913081812938866, + "loss": 0.2615, + "step": 223720 + }, + { + "epoch": 9.27, + "grad_norm": 0.734375, + "learning_rate": 0.00039129923461818594, + "loss": 0.2217, + "step": 223730 + }, + { + "epoch": 9.27, + "grad_norm": 1.2421875, + "learning_rate": 0.00039129028767657535, + "loss": 0.1865, + "step": 223740 + }, + { + "epoch": 9.27, + "grad_norm": 0.68359375, + "learning_rate": 0.0003912813404690717, + "loss": 0.1875, + "step": 223750 + }, + { + "epoch": 9.27, + "grad_norm": 0.80859375, + "learning_rate": 0.00039127239299569183, + "loss": 0.2134, + "step": 223760 + }, + { + "epoch": 9.27, + "grad_norm": 0.59375, + "learning_rate": 0.00039126344525645257, + "loss": 0.2419, + "step": 223770 + }, + { + "epoch": 9.27, + "grad_norm": 0.98828125, + "learning_rate": 0.0003912544972513707, + "loss": 0.1875, + "step": 223780 + }, + { + "epoch": 9.27, + "grad_norm": 0.87109375, + "learning_rate": 0.0003912455489804633, + "loss": 0.1864, + "step": 223790 + }, + { + "epoch": 9.27, + "grad_norm": 0.6328125, + "learning_rate": 0.00039123660044374685, + "loss": 0.207, + "step": 223800 + }, + { + "epoch": 9.27, + "grad_norm": 0.84765625, + "learning_rate": 0.0003912276516412385, + "loss": 0.2041, + "step": 223810 + }, + { + "epoch": 9.27, + "grad_norm": 0.427734375, + "learning_rate": 0.0003912187025729548, + "loss": 0.1985, + "step": 223820 + }, + { + "epoch": 9.27, + "grad_norm": 0.44921875, + "learning_rate": 0.00039120975323891287, + "loss": 0.1902, + "step": 223830 + }, + { + "epoch": 9.27, + "grad_norm": 0.59765625, + "learning_rate": 0.0003912008036391294, + "loss": 0.1926, + "step": 223840 + }, + { + "epoch": 9.27, + "grad_norm": 1.4453125, + "learning_rate": 0.00039119185377362134, + "loss": 0.1625, + "step": 223850 + }, + { + "epoch": 9.27, + "grad_norm": 0.416015625, + "learning_rate": 0.0003911829036424054, + "loss": 0.2897, + "step": 223860 + }, + { + "epoch": 9.27, + "grad_norm": 1.1171875, + "learning_rate": 0.0003911739532454984, + "loss": 0.2252, + "step": 223870 + }, + { + "epoch": 9.27, + "grad_norm": 0.640625, + "learning_rate": 0.0003911650025829174, + "loss": 0.2093, + "step": 223880 + }, + { + "epoch": 9.27, + "grad_norm": 1.1484375, + "learning_rate": 0.00039115605165467905, + "loss": 0.2145, + "step": 223890 + }, + { + "epoch": 9.27, + "grad_norm": 0.6875, + "learning_rate": 0.00039114710046080027, + "loss": 0.2004, + "step": 223900 + }, + { + "epoch": 9.27, + "grad_norm": 0.9609375, + "learning_rate": 0.00039113814900129787, + "loss": 0.1687, + "step": 223910 + }, + { + "epoch": 9.27, + "grad_norm": 0.66015625, + "learning_rate": 0.0003911291972761887, + "loss": 0.2133, + "step": 223920 + }, + { + "epoch": 9.28, + "grad_norm": 1.03125, + "learning_rate": 0.00039112024528548963, + "loss": 0.2056, + "step": 223930 + }, + { + "epoch": 9.28, + "grad_norm": 0.703125, + "learning_rate": 0.00039111129302921757, + "loss": 0.2104, + "step": 223940 + }, + { + "epoch": 9.28, + "grad_norm": 0.31640625, + "learning_rate": 0.00039110234050738915, + "loss": 0.2161, + "step": 223950 + }, + { + "epoch": 9.28, + "grad_norm": 0.828125, + "learning_rate": 0.00039109338772002147, + "loss": 0.1869, + "step": 223960 + }, + { + "epoch": 9.28, + "grad_norm": 0.65625, + "learning_rate": 0.00039108443466713127, + "loss": 0.2229, + "step": 223970 + }, + { + "epoch": 9.28, + "grad_norm": 1.0390625, + "learning_rate": 0.0003910754813487354, + "loss": 0.1656, + "step": 223980 + }, + { + "epoch": 9.28, + "grad_norm": 1.1640625, + "learning_rate": 0.0003910665277648506, + "loss": 0.2582, + "step": 223990 + }, + { + "epoch": 9.28, + "grad_norm": 1.1953125, + "learning_rate": 0.00039105757391549395, + "loss": 0.2155, + "step": 224000 + }, + { + "epoch": 9.28, + "grad_norm": 1.90625, + "learning_rate": 0.0003910486198006822, + "loss": 0.187, + "step": 224010 + }, + { + "epoch": 9.28, + "grad_norm": 0.36328125, + "learning_rate": 0.0003910396654204321, + "loss": 0.1556, + "step": 224020 + }, + { + "epoch": 9.28, + "grad_norm": 0.5546875, + "learning_rate": 0.0003910307107747606, + "loss": 0.2209, + "step": 224030 + }, + { + "epoch": 9.28, + "grad_norm": 0.6640625, + "learning_rate": 0.00039102175586368455, + "loss": 0.1948, + "step": 224040 + }, + { + "epoch": 9.28, + "grad_norm": 0.90625, + "learning_rate": 0.0003910128006872208, + "loss": 0.1754, + "step": 224050 + }, + { + "epoch": 9.28, + "grad_norm": 0.326171875, + "learning_rate": 0.0003910038452453861, + "loss": 0.1707, + "step": 224060 + }, + { + "epoch": 9.28, + "grad_norm": 0.671875, + "learning_rate": 0.00039099488953819753, + "loss": 0.1685, + "step": 224070 + }, + { + "epoch": 9.28, + "grad_norm": 0.765625, + "learning_rate": 0.00039098593356567165, + "loss": 0.2439, + "step": 224080 + }, + { + "epoch": 9.28, + "grad_norm": 0.8359375, + "learning_rate": 0.00039097697732782557, + "loss": 0.1824, + "step": 224090 + }, + { + "epoch": 9.28, + "grad_norm": 0.2431640625, + "learning_rate": 0.00039096802082467597, + "loss": 0.1594, + "step": 224100 + }, + { + "epoch": 9.28, + "grad_norm": 0.94140625, + "learning_rate": 0.0003909590640562398, + "loss": 0.1934, + "step": 224110 + }, + { + "epoch": 9.28, + "grad_norm": 2.609375, + "learning_rate": 0.000390950107022534, + "loss": 0.1916, + "step": 224120 + }, + { + "epoch": 9.28, + "grad_norm": 4.125, + "learning_rate": 0.0003909411497235752, + "loss": 0.1764, + "step": 224130 + }, + { + "epoch": 9.28, + "grad_norm": 0.29296875, + "learning_rate": 0.0003909321921593804, + "loss": 0.186, + "step": 224140 + }, + { + "epoch": 9.28, + "grad_norm": 0.96875, + "learning_rate": 0.00039092323432996645, + "loss": 0.2185, + "step": 224150 + }, + { + "epoch": 9.28, + "grad_norm": 0.63671875, + "learning_rate": 0.0003909142762353502, + "loss": 0.2012, + "step": 224160 + }, + { + "epoch": 9.29, + "grad_norm": 1.4765625, + "learning_rate": 0.0003909053178755485, + "loss": 0.206, + "step": 224170 + }, + { + "epoch": 9.29, + "grad_norm": 0.796875, + "learning_rate": 0.0003908963592505782, + "loss": 0.2066, + "step": 224180 + }, + { + "epoch": 9.29, + "grad_norm": 0.76171875, + "learning_rate": 0.00039088740036045613, + "loss": 0.2182, + "step": 224190 + }, + { + "epoch": 9.29, + "grad_norm": 1.0703125, + "learning_rate": 0.0003908784412051992, + "loss": 0.2273, + "step": 224200 + }, + { + "epoch": 9.29, + "grad_norm": 1.1796875, + "learning_rate": 0.0003908694817848243, + "loss": 0.1751, + "step": 224210 + }, + { + "epoch": 9.29, + "grad_norm": 0.5703125, + "learning_rate": 0.0003908605220993483, + "loss": 0.2026, + "step": 224220 + }, + { + "epoch": 9.29, + "grad_norm": 0.255859375, + "learning_rate": 0.0003908515621487879, + "loss": 0.1637, + "step": 224230 + }, + { + "epoch": 9.29, + "grad_norm": 0.361328125, + "learning_rate": 0.0003908426019331601, + "loss": 0.2334, + "step": 224240 + }, + { + "epoch": 9.29, + "grad_norm": 4.875, + "learning_rate": 0.0003908336414524817, + "loss": 0.1902, + "step": 224250 + }, + { + "epoch": 9.29, + "grad_norm": 1.2890625, + "learning_rate": 0.00039082468070676966, + "loss": 0.1708, + "step": 224260 + }, + { + "epoch": 9.29, + "grad_norm": 0.5703125, + "learning_rate": 0.0003908157196960408, + "loss": 0.1813, + "step": 224270 + }, + { + "epoch": 9.29, + "grad_norm": 1.421875, + "learning_rate": 0.0003908067584203119, + "loss": 0.2174, + "step": 224280 + }, + { + "epoch": 9.29, + "grad_norm": 0.41796875, + "learning_rate": 0.00039079779687959987, + "loss": 0.1668, + "step": 224290 + }, + { + "epoch": 9.29, + "grad_norm": 0.60546875, + "learning_rate": 0.0003907888350739216, + "loss": 0.1557, + "step": 224300 + }, + { + "epoch": 9.29, + "grad_norm": 0.328125, + "learning_rate": 0.000390779873003294, + "loss": 0.1791, + "step": 224310 + }, + { + "epoch": 9.29, + "grad_norm": 0.66015625, + "learning_rate": 0.00039077091066773385, + "loss": 0.2661, + "step": 224320 + }, + { + "epoch": 9.29, + "grad_norm": 1.09375, + "learning_rate": 0.00039076194806725805, + "loss": 0.2188, + "step": 224330 + }, + { + "epoch": 9.29, + "grad_norm": 1.5234375, + "learning_rate": 0.0003907529852018834, + "loss": 0.1941, + "step": 224340 + }, + { + "epoch": 9.29, + "grad_norm": 0.390625, + "learning_rate": 0.00039074402207162697, + "loss": 0.2133, + "step": 224350 + }, + { + "epoch": 9.29, + "grad_norm": 0.59765625, + "learning_rate": 0.0003907350586765054, + "loss": 0.2066, + "step": 224360 + }, + { + "epoch": 9.29, + "grad_norm": 0.49609375, + "learning_rate": 0.00039072609501653564, + "loss": 0.2219, + "step": 224370 + }, + { + "epoch": 9.29, + "grad_norm": 0.49609375, + "learning_rate": 0.00039071713109173464, + "loss": 0.2021, + "step": 224380 + }, + { + "epoch": 9.29, + "grad_norm": 0.3203125, + "learning_rate": 0.00039070816690211905, + "loss": 0.1733, + "step": 224390 + }, + { + "epoch": 9.29, + "grad_norm": 1.65625, + "learning_rate": 0.00039069920244770605, + "loss": 0.182, + "step": 224400 + }, + { + "epoch": 9.3, + "grad_norm": 0.8203125, + "learning_rate": 0.00039069023772851224, + "loss": 0.1938, + "step": 224410 + }, + { + "epoch": 9.3, + "grad_norm": 0.96875, + "learning_rate": 0.0003906812727445547, + "loss": 0.1838, + "step": 224420 + }, + { + "epoch": 9.3, + "grad_norm": 0.69921875, + "learning_rate": 0.0003906723074958501, + "loss": 0.1686, + "step": 224430 + }, + { + "epoch": 9.3, + "grad_norm": 0.8671875, + "learning_rate": 0.00039066334198241545, + "loss": 0.1703, + "step": 224440 + }, + { + "epoch": 9.3, + "grad_norm": 0.9375, + "learning_rate": 0.00039065437620426754, + "loss": 0.2063, + "step": 224450 + }, + { + "epoch": 9.3, + "grad_norm": 0.73046875, + "learning_rate": 0.0003906454101614233, + "loss": 0.1933, + "step": 224460 + }, + { + "epoch": 9.3, + "grad_norm": 0.494140625, + "learning_rate": 0.0003906364438538996, + "loss": 0.1958, + "step": 224470 + }, + { + "epoch": 9.3, + "grad_norm": 0.83203125, + "learning_rate": 0.00039062747728171334, + "loss": 0.2464, + "step": 224480 + }, + { + "epoch": 9.3, + "grad_norm": 0.4921875, + "learning_rate": 0.00039061851044488126, + "loss": 0.1621, + "step": 224490 + }, + { + "epoch": 9.3, + "grad_norm": 0.66796875, + "learning_rate": 0.0003906095433434204, + "loss": 0.1459, + "step": 224500 + }, + { + "epoch": 9.3, + "grad_norm": 0.396484375, + "learning_rate": 0.00039060057597734753, + "loss": 0.224, + "step": 224510 + }, + { + "epoch": 9.3, + "grad_norm": 2.34375, + "learning_rate": 0.00039059160834667954, + "loss": 0.1972, + "step": 224520 + }, + { + "epoch": 9.3, + "grad_norm": 0.59765625, + "learning_rate": 0.0003905826404514334, + "loss": 0.1603, + "step": 224530 + }, + { + "epoch": 9.3, + "grad_norm": 0.91015625, + "learning_rate": 0.00039057367229162587, + "loss": 0.1689, + "step": 224540 + }, + { + "epoch": 9.3, + "grad_norm": 0.73828125, + "learning_rate": 0.00039056470386727386, + "loss": 0.2039, + "step": 224550 + }, + { + "epoch": 9.3, + "grad_norm": 0.546875, + "learning_rate": 0.00039055573517839424, + "loss": 0.2113, + "step": 224560 + }, + { + "epoch": 9.3, + "grad_norm": 0.65234375, + "learning_rate": 0.00039054676622500393, + "loss": 0.1938, + "step": 224570 + }, + { + "epoch": 9.3, + "grad_norm": 0.275390625, + "learning_rate": 0.0003905377970071198, + "loss": 0.2183, + "step": 224580 + }, + { + "epoch": 9.3, + "grad_norm": 0.51171875, + "learning_rate": 0.0003905288275247587, + "loss": 0.2262, + "step": 224590 + }, + { + "epoch": 9.3, + "grad_norm": 0.88671875, + "learning_rate": 0.0003905198577779375, + "loss": 0.208, + "step": 224600 + }, + { + "epoch": 9.3, + "grad_norm": 2.40625, + "learning_rate": 0.0003905108877666732, + "loss": 0.1795, + "step": 224610 + }, + { + "epoch": 9.3, + "grad_norm": 1.21875, + "learning_rate": 0.0003905019174909825, + "loss": 0.2248, + "step": 224620 + }, + { + "epoch": 9.3, + "grad_norm": 1.4375, + "learning_rate": 0.00039049294695088233, + "loss": 0.1721, + "step": 224630 + }, + { + "epoch": 9.3, + "grad_norm": 1.390625, + "learning_rate": 0.00039048397614638964, + "loss": 0.2417, + "step": 224640 + }, + { + "epoch": 9.3, + "grad_norm": 0.3515625, + "learning_rate": 0.0003904750050775213, + "loss": 0.2373, + "step": 224650 + }, + { + "epoch": 9.31, + "grad_norm": 1.40625, + "learning_rate": 0.0003904660337442941, + "loss": 0.2156, + "step": 224660 + }, + { + "epoch": 9.31, + "grad_norm": 0.76171875, + "learning_rate": 0.0003904570621467251, + "loss": 0.1865, + "step": 224670 + }, + { + "epoch": 9.31, + "grad_norm": 1.1328125, + "learning_rate": 0.00039044809028483104, + "loss": 0.27, + "step": 224680 + }, + { + "epoch": 9.31, + "grad_norm": 0.703125, + "learning_rate": 0.0003904391181586288, + "loss": 0.1901, + "step": 224690 + }, + { + "epoch": 9.31, + "grad_norm": 0.306640625, + "learning_rate": 0.0003904301457681354, + "loss": 0.1623, + "step": 224700 + }, + { + "epoch": 9.31, + "grad_norm": 0.8125, + "learning_rate": 0.0003904211731133675, + "loss": 0.1985, + "step": 224710 + }, + { + "epoch": 9.31, + "grad_norm": 1.1796875, + "learning_rate": 0.0003904122001943422, + "loss": 0.2042, + "step": 224720 + }, + { + "epoch": 9.31, + "grad_norm": 0.734375, + "learning_rate": 0.0003904032270110762, + "loss": 0.1953, + "step": 224730 + }, + { + "epoch": 9.31, + "grad_norm": 0.326171875, + "learning_rate": 0.00039039425356358665, + "loss": 0.2077, + "step": 224740 + }, + { + "epoch": 9.31, + "grad_norm": 0.75390625, + "learning_rate": 0.00039038527985189023, + "loss": 0.1702, + "step": 224750 + }, + { + "epoch": 9.31, + "grad_norm": 0.984375, + "learning_rate": 0.00039037630587600384, + "loss": 0.1839, + "step": 224760 + }, + { + "epoch": 9.31, + "grad_norm": 0.251953125, + "learning_rate": 0.0003903673316359444, + "loss": 0.1756, + "step": 224770 + }, + { + "epoch": 9.31, + "grad_norm": 0.53125, + "learning_rate": 0.0003903583571317288, + "loss": 0.2135, + "step": 224780 + }, + { + "epoch": 9.31, + "grad_norm": 0.3515625, + "learning_rate": 0.00039034938236337394, + "loss": 0.19, + "step": 224790 + }, + { + "epoch": 9.31, + "grad_norm": 0.25390625, + "learning_rate": 0.00039034040733089664, + "loss": 0.195, + "step": 224800 + }, + { + "epoch": 9.31, + "grad_norm": 0.875, + "learning_rate": 0.00039033143203431387, + "loss": 0.189, + "step": 224810 + }, + { + "epoch": 9.31, + "grad_norm": 1.2265625, + "learning_rate": 0.00039032245647364254, + "loss": 0.1403, + "step": 224820 + }, + { + "epoch": 9.31, + "grad_norm": 0.6015625, + "learning_rate": 0.0003903134806488995, + "loss": 0.2144, + "step": 224830 + }, + { + "epoch": 9.31, + "grad_norm": 0.55859375, + "learning_rate": 0.00039030450456010166, + "loss": 0.1487, + "step": 224840 + }, + { + "epoch": 9.31, + "grad_norm": 0.490234375, + "learning_rate": 0.0003902955282072659, + "loss": 0.1581, + "step": 224850 + }, + { + "epoch": 9.31, + "grad_norm": 1.765625, + "learning_rate": 0.00039028655159040903, + "loss": 0.2377, + "step": 224860 + }, + { + "epoch": 9.31, + "grad_norm": 0.9765625, + "learning_rate": 0.00039027757470954804, + "loss": 0.1969, + "step": 224870 + }, + { + "epoch": 9.31, + "grad_norm": 0.90625, + "learning_rate": 0.00039026859756469986, + "loss": 0.2511, + "step": 224880 + }, + { + "epoch": 9.31, + "grad_norm": 1.3203125, + "learning_rate": 0.00039025962015588124, + "loss": 0.1947, + "step": 224890 + }, + { + "epoch": 9.32, + "grad_norm": 0.439453125, + "learning_rate": 0.0003902506424831093, + "loss": 0.22, + "step": 224900 + }, + { + "epoch": 9.32, + "grad_norm": 0.65234375, + "learning_rate": 0.00039024166454640065, + "loss": 0.1632, + "step": 224910 + }, + { + "epoch": 9.32, + "grad_norm": 0.6953125, + "learning_rate": 0.0003902326863457724, + "loss": 0.2155, + "step": 224920 + }, + { + "epoch": 9.32, + "grad_norm": 0.83984375, + "learning_rate": 0.0003902237078812414, + "loss": 0.1635, + "step": 224930 + }, + { + "epoch": 9.32, + "grad_norm": 0.734375, + "learning_rate": 0.0003902147291528244, + "loss": 0.1488, + "step": 224940 + }, + { + "epoch": 9.32, + "grad_norm": 1.1015625, + "learning_rate": 0.0003902057501605386, + "loss": 0.1565, + "step": 224950 + }, + { + "epoch": 9.32, + "grad_norm": 0.455078125, + "learning_rate": 0.00039019677090440063, + "loss": 0.2006, + "step": 224960 + }, + { + "epoch": 9.32, + "grad_norm": 1.3828125, + "learning_rate": 0.00039018779138442746, + "loss": 0.1804, + "step": 224970 + }, + { + "epoch": 9.32, + "grad_norm": 1.1015625, + "learning_rate": 0.00039017881160063607, + "loss": 0.1855, + "step": 224980 + }, + { + "epoch": 9.32, + "grad_norm": 0.7421875, + "learning_rate": 0.0003901698315530432, + "loss": 0.2293, + "step": 224990 + }, + { + "epoch": 9.32, + "grad_norm": 0.8515625, + "learning_rate": 0.0003901608512416659, + "loss": 0.2103, + "step": 225000 + }, + { + "epoch": 9.32, + "grad_norm": 2.78125, + "learning_rate": 0.000390151870666521, + "loss": 0.1997, + "step": 225010 + }, + { + "epoch": 9.32, + "grad_norm": 0.7109375, + "learning_rate": 0.0003901428898276255, + "loss": 0.2208, + "step": 225020 + }, + { + "epoch": 9.32, + "grad_norm": 0.5703125, + "learning_rate": 0.0003901339087249961, + "loss": 0.2679, + "step": 225030 + }, + { + "epoch": 9.32, + "grad_norm": 0.439453125, + "learning_rate": 0.0003901249273586498, + "loss": 0.2072, + "step": 225040 + }, + { + "epoch": 9.32, + "grad_norm": 0.62109375, + "learning_rate": 0.0003901159457286036, + "loss": 0.198, + "step": 225050 + }, + { + "epoch": 9.32, + "grad_norm": 0.8671875, + "learning_rate": 0.0003901069638348743, + "loss": 0.2006, + "step": 225060 + }, + { + "epoch": 9.32, + "grad_norm": 1.484375, + "learning_rate": 0.0003900979816774788, + "loss": 0.2369, + "step": 225070 + }, + { + "epoch": 9.32, + "grad_norm": 0.890625, + "learning_rate": 0.000390088999256434, + "loss": 0.1732, + "step": 225080 + }, + { + "epoch": 9.32, + "grad_norm": 0.6015625, + "learning_rate": 0.00039008001657175694, + "loss": 0.1974, + "step": 225090 + }, + { + "epoch": 9.32, + "grad_norm": 1.0390625, + "learning_rate": 0.0003900710336234644, + "loss": 0.2892, + "step": 225100 + }, + { + "epoch": 9.32, + "grad_norm": 0.494140625, + "learning_rate": 0.0003900620504115733, + "loss": 0.2081, + "step": 225110 + }, + { + "epoch": 9.32, + "grad_norm": 0.609375, + "learning_rate": 0.00039005306693610045, + "loss": 0.171, + "step": 225120 + }, + { + "epoch": 9.32, + "grad_norm": 0.55859375, + "learning_rate": 0.00039004408319706294, + "loss": 0.1565, + "step": 225130 + }, + { + "epoch": 9.33, + "grad_norm": 1.8984375, + "learning_rate": 0.00039003509919447756, + "loss": 0.195, + "step": 225140 + }, + { + "epoch": 9.33, + "grad_norm": 0.99609375, + "learning_rate": 0.0003900261149283612, + "loss": 0.2109, + "step": 225150 + }, + { + "epoch": 9.33, + "grad_norm": 0.99609375, + "learning_rate": 0.00039001713039873087, + "loss": 0.1889, + "step": 225160 + }, + { + "epoch": 9.33, + "grad_norm": 3.140625, + "learning_rate": 0.0003900081456056034, + "loss": 0.1951, + "step": 225170 + }, + { + "epoch": 9.33, + "grad_norm": 0.4453125, + "learning_rate": 0.00038999916054899573, + "loss": 0.1687, + "step": 225180 + }, + { + "epoch": 9.33, + "grad_norm": 1.015625, + "learning_rate": 0.0003899901752289248, + "loss": 0.2558, + "step": 225190 + }, + { + "epoch": 9.33, + "grad_norm": 0.62890625, + "learning_rate": 0.0003899811896454074, + "loss": 0.1866, + "step": 225200 + }, + { + "epoch": 9.33, + "grad_norm": 1.03125, + "learning_rate": 0.00038997220379846057, + "loss": 0.1953, + "step": 225210 + }, + { + "epoch": 9.33, + "grad_norm": 0.94140625, + "learning_rate": 0.00038996321768810117, + "loss": 0.1963, + "step": 225220 + }, + { + "epoch": 9.33, + "grad_norm": 0.66796875, + "learning_rate": 0.00038995423131434607, + "loss": 0.2623, + "step": 225230 + }, + { + "epoch": 9.33, + "grad_norm": 0.81640625, + "learning_rate": 0.0003899452446772122, + "loss": 0.2181, + "step": 225240 + }, + { + "epoch": 9.33, + "grad_norm": 0.5546875, + "learning_rate": 0.00038993625777671655, + "loss": 0.2039, + "step": 225250 + }, + { + "epoch": 9.33, + "grad_norm": 0.91796875, + "learning_rate": 0.0003899272706128759, + "loss": 0.2231, + "step": 225260 + }, + { + "epoch": 9.33, + "grad_norm": 0.80859375, + "learning_rate": 0.0003899182831857073, + "loss": 0.1782, + "step": 225270 + }, + { + "epoch": 9.33, + "grad_norm": 0.65234375, + "learning_rate": 0.0003899092954952276, + "loss": 0.2104, + "step": 225280 + }, + { + "epoch": 9.33, + "grad_norm": 0.6640625, + "learning_rate": 0.0003899003075414537, + "loss": 0.2347, + "step": 225290 + }, + { + "epoch": 9.33, + "grad_norm": 0.4140625, + "learning_rate": 0.0003898913193244026, + "loss": 0.1967, + "step": 225300 + }, + { + "epoch": 9.33, + "grad_norm": 1.6328125, + "learning_rate": 0.00038988233084409103, + "loss": 0.1807, + "step": 225310 + }, + { + "epoch": 9.33, + "grad_norm": 0.9140625, + "learning_rate": 0.0003898733421005361, + "loss": 0.1956, + "step": 225320 + }, + { + "epoch": 9.33, + "grad_norm": 0.515625, + "learning_rate": 0.0003898643530937546, + "loss": 0.1721, + "step": 225330 + }, + { + "epoch": 9.33, + "grad_norm": 0.59375, + "learning_rate": 0.00038985536382376353, + "loss": 0.1818, + "step": 225340 + }, + { + "epoch": 9.33, + "grad_norm": 0.466796875, + "learning_rate": 0.0003898463742905797, + "loss": 0.1854, + "step": 225350 + }, + { + "epoch": 9.33, + "grad_norm": 0.51171875, + "learning_rate": 0.0003898373844942202, + "loss": 0.1701, + "step": 225360 + }, + { + "epoch": 9.33, + "grad_norm": 0.77734375, + "learning_rate": 0.00038982839443470175, + "loss": 0.2601, + "step": 225370 + }, + { + "epoch": 9.34, + "grad_norm": 0.224609375, + "learning_rate": 0.0003898194041120414, + "loss": 0.193, + "step": 225380 + }, + { + "epoch": 9.34, + "grad_norm": 1.2578125, + "learning_rate": 0.000389810413526256, + "loss": 0.1623, + "step": 225390 + }, + { + "epoch": 9.34, + "grad_norm": 0.7265625, + "learning_rate": 0.0003898014226773625, + "loss": 0.2151, + "step": 225400 + }, + { + "epoch": 9.34, + "grad_norm": 0.0003509521484375, + "learning_rate": 0.0003897924315653779, + "loss": 0.1575, + "step": 225410 + }, + { + "epoch": 9.34, + "grad_norm": 0.76171875, + "learning_rate": 0.000389783440190319, + "loss": 0.2145, + "step": 225420 + }, + { + "epoch": 9.34, + "grad_norm": 2.328125, + "learning_rate": 0.00038977444855220277, + "loss": 0.2661, + "step": 225430 + }, + { + "epoch": 9.34, + "grad_norm": 0.84375, + "learning_rate": 0.00038976545665104606, + "loss": 0.1949, + "step": 225440 + }, + { + "epoch": 9.34, + "grad_norm": 0.96484375, + "learning_rate": 0.0003897564644868659, + "loss": 0.231, + "step": 225450 + }, + { + "epoch": 9.34, + "grad_norm": 0.7109375, + "learning_rate": 0.0003897474720596792, + "loss": 0.1136, + "step": 225460 + }, + { + "epoch": 9.34, + "grad_norm": 0.8125, + "learning_rate": 0.0003897384793695028, + "loss": 0.1798, + "step": 225470 + }, + { + "epoch": 9.34, + "grad_norm": 1.8046875, + "learning_rate": 0.00038972948641635366, + "loss": 0.1735, + "step": 225480 + }, + { + "epoch": 9.34, + "grad_norm": 0.65625, + "learning_rate": 0.00038972049320024874, + "loss": 0.1646, + "step": 225490 + }, + { + "epoch": 9.34, + "grad_norm": 1.171875, + "learning_rate": 0.0003897114997212049, + "loss": 0.1678, + "step": 225500 + }, + { + "epoch": 9.34, + "grad_norm": 2.84375, + "learning_rate": 0.00038970250597923916, + "loss": 0.2144, + "step": 225510 + }, + { + "epoch": 9.34, + "grad_norm": 0.95703125, + "learning_rate": 0.00038969351197436843, + "loss": 0.2365, + "step": 225520 + }, + { + "epoch": 9.34, + "grad_norm": 0.59765625, + "learning_rate": 0.0003896845177066095, + "loss": 0.2268, + "step": 225530 + }, + { + "epoch": 9.34, + "grad_norm": 0.447265625, + "learning_rate": 0.00038967552317597945, + "loss": 0.1809, + "step": 225540 + }, + { + "epoch": 9.34, + "grad_norm": 0.78515625, + "learning_rate": 0.0003896665283824951, + "loss": 0.1956, + "step": 225550 + }, + { + "epoch": 9.34, + "grad_norm": 0.431640625, + "learning_rate": 0.0003896575333261735, + "loss": 0.2119, + "step": 225560 + }, + { + "epoch": 9.34, + "grad_norm": 1.5234375, + "learning_rate": 0.00038964853800703143, + "loss": 0.2092, + "step": 225570 + }, + { + "epoch": 9.34, + "grad_norm": 0.384765625, + "learning_rate": 0.0003896395424250859, + "loss": 0.2087, + "step": 225580 + }, + { + "epoch": 9.34, + "grad_norm": 0.546875, + "learning_rate": 0.0003896305465803539, + "loss": 0.217, + "step": 225590 + }, + { + "epoch": 9.34, + "grad_norm": 1.0234375, + "learning_rate": 0.0003896215504728522, + "loss": 0.2158, + "step": 225600 + }, + { + "epoch": 9.34, + "grad_norm": 1.0078125, + "learning_rate": 0.0003896125541025979, + "loss": 0.2008, + "step": 225610 + }, + { + "epoch": 9.35, + "grad_norm": 1.0078125, + "learning_rate": 0.0003896035574696078, + "loss": 0.2243, + "step": 225620 + }, + { + "epoch": 9.35, + "grad_norm": 0.671875, + "learning_rate": 0.0003895945605738989, + "loss": 0.1865, + "step": 225630 + }, + { + "epoch": 9.35, + "grad_norm": 0.9453125, + "learning_rate": 0.0003895855634154881, + "loss": 0.236, + "step": 225640 + }, + { + "epoch": 9.35, + "grad_norm": 0.2041015625, + "learning_rate": 0.00038957656599439234, + "loss": 0.2186, + "step": 225650 + }, + { + "epoch": 9.35, + "grad_norm": 0.6171875, + "learning_rate": 0.00038956756831062854, + "loss": 0.1987, + "step": 225660 + }, + { + "epoch": 9.35, + "grad_norm": 0.77734375, + "learning_rate": 0.00038955857036421373, + "loss": 0.1818, + "step": 225670 + }, + { + "epoch": 9.35, + "grad_norm": 0.72265625, + "learning_rate": 0.0003895495721551647, + "loss": 0.2086, + "step": 225680 + }, + { + "epoch": 9.35, + "grad_norm": 3.140625, + "learning_rate": 0.0003895405736834985, + "loss": 0.1772, + "step": 225690 + }, + { + "epoch": 9.35, + "grad_norm": 0.5625, + "learning_rate": 0.00038953157494923187, + "loss": 0.1947, + "step": 225700 + }, + { + "epoch": 9.35, + "grad_norm": 0.51953125, + "learning_rate": 0.000389522575952382, + "loss": 0.1505, + "step": 225710 + }, + { + "epoch": 9.35, + "grad_norm": 1.28125, + "learning_rate": 0.0003895135766929657, + "loss": 0.1641, + "step": 225720 + }, + { + "epoch": 9.35, + "grad_norm": 0.439453125, + "learning_rate": 0.00038950457717099984, + "loss": 0.2155, + "step": 225730 + }, + { + "epoch": 9.35, + "grad_norm": 0.435546875, + "learning_rate": 0.0003894955773865015, + "loss": 0.1702, + "step": 225740 + }, + { + "epoch": 9.35, + "grad_norm": 0.8359375, + "learning_rate": 0.00038948657733948756, + "loss": 0.1713, + "step": 225750 + }, + { + "epoch": 9.35, + "grad_norm": 0.65625, + "learning_rate": 0.0003894775770299749, + "loss": 0.198, + "step": 225760 + }, + { + "epoch": 9.35, + "grad_norm": 0.345703125, + "learning_rate": 0.0003894685764579806, + "loss": 0.2046, + "step": 225770 + }, + { + "epoch": 9.35, + "grad_norm": 0.7734375, + "learning_rate": 0.00038945957562352144, + "loss": 0.2254, + "step": 225780 + }, + { + "epoch": 9.35, + "grad_norm": 0.6171875, + "learning_rate": 0.0003894505745266144, + "loss": 0.1755, + "step": 225790 + }, + { + "epoch": 9.35, + "grad_norm": 1.3984375, + "learning_rate": 0.0003894415731672764, + "loss": 0.2043, + "step": 225800 + }, + { + "epoch": 9.35, + "grad_norm": 0.859375, + "learning_rate": 0.0003894325715455245, + "loss": 0.1531, + "step": 225810 + }, + { + "epoch": 9.35, + "grad_norm": 0.55078125, + "learning_rate": 0.0003894235696613755, + "loss": 0.214, + "step": 225820 + }, + { + "epoch": 9.35, + "grad_norm": 0.7109375, + "learning_rate": 0.0003894145675148464, + "loss": 0.2111, + "step": 225830 + }, + { + "epoch": 9.35, + "grad_norm": 0.75, + "learning_rate": 0.00038940556510595417, + "loss": 0.223, + "step": 225840 + }, + { + "epoch": 9.35, + "grad_norm": 0.002685546875, + "learning_rate": 0.0003893965624347157, + "loss": 0.2108, + "step": 225850 + }, + { + "epoch": 9.36, + "grad_norm": 0.75, + "learning_rate": 0.0003893875595011479, + "loss": 0.2086, + "step": 225860 + }, + { + "epoch": 9.36, + "grad_norm": 1.5703125, + "learning_rate": 0.00038937855630526787, + "loss": 0.1828, + "step": 225870 + }, + { + "epoch": 9.36, + "grad_norm": 0.60546875, + "learning_rate": 0.0003893695528470924, + "loss": 0.2206, + "step": 225880 + }, + { + "epoch": 9.36, + "grad_norm": 0.53515625, + "learning_rate": 0.0003893605491266385, + "loss": 0.1744, + "step": 225890 + }, + { + "epoch": 9.36, + "grad_norm": 0.8125, + "learning_rate": 0.0003893515451439231, + "loss": 0.1391, + "step": 225900 + }, + { + "epoch": 9.36, + "grad_norm": 1.1640625, + "learning_rate": 0.00038934254089896303, + "loss": 0.2388, + "step": 225910 + }, + { + "epoch": 9.36, + "grad_norm": 1.140625, + "learning_rate": 0.0003893335363917755, + "loss": 0.1542, + "step": 225920 + }, + { + "epoch": 9.36, + "grad_norm": 0.97265625, + "learning_rate": 0.0003893245316223772, + "loss": 0.2219, + "step": 225930 + }, + { + "epoch": 9.36, + "grad_norm": 0.7265625, + "learning_rate": 0.00038931552659078524, + "loss": 0.1379, + "step": 225940 + }, + { + "epoch": 9.36, + "grad_norm": 1.0546875, + "learning_rate": 0.0003893065212970165, + "loss": 0.2353, + "step": 225950 + }, + { + "epoch": 9.36, + "grad_norm": 1.015625, + "learning_rate": 0.0003892975157410879, + "loss": 0.2064, + "step": 225960 + }, + { + "epoch": 9.36, + "grad_norm": 1.0625, + "learning_rate": 0.00038928850992301647, + "loss": 0.2116, + "step": 225970 + }, + { + "epoch": 9.36, + "grad_norm": 0.51953125, + "learning_rate": 0.00038927950384281905, + "loss": 0.1774, + "step": 225980 + }, + { + "epoch": 9.36, + "grad_norm": 1.03125, + "learning_rate": 0.00038927049750051257, + "loss": 0.2696, + "step": 225990 + }, + { + "epoch": 9.36, + "grad_norm": 0.6484375, + "learning_rate": 0.0003892614908961142, + "loss": 0.1936, + "step": 226000 + }, + { + "epoch": 9.36, + "grad_norm": 0.609375, + "learning_rate": 0.00038925248402964073, + "loss": 0.16, + "step": 226010 + }, + { + "epoch": 9.36, + "grad_norm": 3.125, + "learning_rate": 0.00038924347690110904, + "loss": 0.2275, + "step": 226020 + }, + { + "epoch": 9.36, + "grad_norm": 1.015625, + "learning_rate": 0.0003892344695105362, + "loss": 0.2518, + "step": 226030 + }, + { + "epoch": 9.36, + "grad_norm": 0.83984375, + "learning_rate": 0.00038922546185793914, + "loss": 0.1897, + "step": 226040 + }, + { + "epoch": 9.36, + "grad_norm": 1.0234375, + "learning_rate": 0.0003892164539433348, + "loss": 0.1706, + "step": 226050 + }, + { + "epoch": 9.36, + "grad_norm": 0.625, + "learning_rate": 0.0003892074457667401, + "loss": 0.1965, + "step": 226060 + }, + { + "epoch": 9.36, + "grad_norm": 0.68359375, + "learning_rate": 0.000389198437328172, + "loss": 0.2044, + "step": 226070 + }, + { + "epoch": 9.36, + "grad_norm": 0.0, + "learning_rate": 0.0003891894286276475, + "loss": 0.2179, + "step": 226080 + }, + { + "epoch": 9.36, + "grad_norm": 0.89453125, + "learning_rate": 0.00038918041966518356, + "loss": 0.2113, + "step": 226090 + }, + { + "epoch": 9.37, + "grad_norm": 2.109375, + "learning_rate": 0.0003891714104407971, + "loss": 0.2334, + "step": 226100 + }, + { + "epoch": 9.37, + "grad_norm": 0.59375, + "learning_rate": 0.0003891624009545051, + "loss": 0.1505, + "step": 226110 + }, + { + "epoch": 9.37, + "grad_norm": 1.453125, + "learning_rate": 0.0003891533912063243, + "loss": 0.2078, + "step": 226120 + }, + { + "epoch": 9.37, + "grad_norm": 0.76953125, + "learning_rate": 0.00038914438119627204, + "loss": 0.1985, + "step": 226130 + }, + { + "epoch": 9.37, + "grad_norm": 1.046875, + "learning_rate": 0.00038913537092436504, + "loss": 0.226, + "step": 226140 + }, + { + "epoch": 9.37, + "grad_norm": 0.5859375, + "learning_rate": 0.00038912636039062023, + "loss": 0.2344, + "step": 226150 + }, + { + "epoch": 9.37, + "grad_norm": 0.19140625, + "learning_rate": 0.00038911734959505475, + "loss": 0.1792, + "step": 226160 + }, + { + "epoch": 9.37, + "grad_norm": 0.498046875, + "learning_rate": 0.00038910833853768534, + "loss": 0.2183, + "step": 226170 + }, + { + "epoch": 9.37, + "grad_norm": 0.59375, + "learning_rate": 0.0003890993272185291, + "loss": 0.2111, + "step": 226180 + }, + { + "epoch": 9.37, + "grad_norm": 0.859375, + "learning_rate": 0.00038909031563760294, + "loss": 0.1864, + "step": 226190 + }, + { + "epoch": 9.37, + "grad_norm": 0.17578125, + "learning_rate": 0.0003890813037949238, + "loss": 0.2025, + "step": 226200 + }, + { + "epoch": 9.37, + "grad_norm": 0.61328125, + "learning_rate": 0.00038907229169050873, + "loss": 0.1673, + "step": 226210 + }, + { + "epoch": 9.37, + "grad_norm": 1.1953125, + "learning_rate": 0.0003890632793243746, + "loss": 0.1817, + "step": 226220 + }, + { + "epoch": 9.37, + "grad_norm": 0.78125, + "learning_rate": 0.00038905426669653834, + "loss": 0.1984, + "step": 226230 + }, + { + "epoch": 9.37, + "grad_norm": 0.90234375, + "learning_rate": 0.00038904525380701704, + "loss": 0.1767, + "step": 226240 + }, + { + "epoch": 9.37, + "grad_norm": 0.462890625, + "learning_rate": 0.00038903624065582756, + "loss": 0.1572, + "step": 226250 + }, + { + "epoch": 9.37, + "grad_norm": 2.34375, + "learning_rate": 0.00038902722724298687, + "loss": 0.183, + "step": 226260 + }, + { + "epoch": 9.37, + "grad_norm": 0.57421875, + "learning_rate": 0.000389018213568512, + "loss": 0.1599, + "step": 226270 + }, + { + "epoch": 9.37, + "grad_norm": 0.73046875, + "learning_rate": 0.0003890091996324198, + "loss": 0.248, + "step": 226280 + }, + { + "epoch": 9.37, + "grad_norm": 0.8515625, + "learning_rate": 0.0003890001854347274, + "loss": 0.1971, + "step": 226290 + }, + { + "epoch": 9.37, + "grad_norm": 0.87109375, + "learning_rate": 0.00038899117097545156, + "loss": 0.2016, + "step": 226300 + }, + { + "epoch": 9.37, + "grad_norm": 0.490234375, + "learning_rate": 0.0003889821562546093, + "loss": 0.2039, + "step": 226310 + }, + { + "epoch": 9.37, + "grad_norm": 0.451171875, + "learning_rate": 0.0003889731412722178, + "loss": 0.1701, + "step": 226320 + }, + { + "epoch": 9.37, + "grad_norm": 0.48828125, + "learning_rate": 0.00038896412602829376, + "loss": 0.2338, + "step": 226330 + }, + { + "epoch": 9.37, + "grad_norm": 0.28515625, + "learning_rate": 0.00038895511052285423, + "loss": 0.1798, + "step": 226340 + }, + { + "epoch": 9.38, + "grad_norm": 0.8125, + "learning_rate": 0.00038894609475591626, + "loss": 0.1868, + "step": 226350 + }, + { + "epoch": 9.38, + "grad_norm": 0.283203125, + "learning_rate": 0.00038893707872749664, + "loss": 0.1486, + "step": 226360 + }, + { + "epoch": 9.38, + "grad_norm": 0.734375, + "learning_rate": 0.00038892806243761245, + "loss": 0.2052, + "step": 226370 + }, + { + "epoch": 9.38, + "grad_norm": 0.2431640625, + "learning_rate": 0.00038891904588628066, + "loss": 0.2065, + "step": 226380 + }, + { + "epoch": 9.38, + "grad_norm": 0.9375, + "learning_rate": 0.00038891002907351835, + "loss": 0.2102, + "step": 226390 + }, + { + "epoch": 9.38, + "grad_norm": 0.8203125, + "learning_rate": 0.00038890101199934226, + "loss": 0.1964, + "step": 226400 + }, + { + "epoch": 9.38, + "grad_norm": 0.7421875, + "learning_rate": 0.00038889199466376947, + "loss": 0.1919, + "step": 226410 + }, + { + "epoch": 9.38, + "grad_norm": 0.89453125, + "learning_rate": 0.00038888297706681695, + "loss": 0.2425, + "step": 226420 + }, + { + "epoch": 9.38, + "grad_norm": 0.443359375, + "learning_rate": 0.0003888739592085017, + "loss": 0.1556, + "step": 226430 + }, + { + "epoch": 9.38, + "grad_norm": 0.625, + "learning_rate": 0.00038886494108884064, + "loss": 0.1656, + "step": 226440 + }, + { + "epoch": 9.38, + "grad_norm": 0.74609375, + "learning_rate": 0.00038885592270785073, + "loss": 0.191, + "step": 226450 + }, + { + "epoch": 9.38, + "grad_norm": 0.87109375, + "learning_rate": 0.00038884690406554897, + "loss": 0.2137, + "step": 226460 + }, + { + "epoch": 9.38, + "grad_norm": 0.703125, + "learning_rate": 0.00038883788516195237, + "loss": 0.2221, + "step": 226470 + }, + { + "epoch": 9.38, + "grad_norm": 0.734375, + "learning_rate": 0.00038882886599707784, + "loss": 0.21, + "step": 226480 + }, + { + "epoch": 9.38, + "grad_norm": 0.58984375, + "learning_rate": 0.0003888198465709424, + "loss": 0.1629, + "step": 226490 + }, + { + "epoch": 9.38, + "grad_norm": 0.53515625, + "learning_rate": 0.00038881082688356294, + "loss": 0.1927, + "step": 226500 + }, + { + "epoch": 9.38, + "grad_norm": 0.36328125, + "learning_rate": 0.00038880180693495655, + "loss": 0.1839, + "step": 226510 + }, + { + "epoch": 9.38, + "grad_norm": 0.69921875, + "learning_rate": 0.0003887927867251401, + "loss": 0.188, + "step": 226520 + }, + { + "epoch": 9.38, + "grad_norm": 0.34375, + "learning_rate": 0.0003887837662541307, + "loss": 0.1949, + "step": 226530 + }, + { + "epoch": 9.38, + "grad_norm": 1.4765625, + "learning_rate": 0.00038877474552194516, + "loss": 0.1962, + "step": 226540 + }, + { + "epoch": 9.38, + "grad_norm": 0.796875, + "learning_rate": 0.0003887657245286005, + "loss": 0.2363, + "step": 226550 + }, + { + "epoch": 9.38, + "grad_norm": 0.8359375, + "learning_rate": 0.00038875670327411383, + "loss": 0.2068, + "step": 226560 + }, + { + "epoch": 9.38, + "grad_norm": 0.296875, + "learning_rate": 0.00038874768175850197, + "loss": 0.1936, + "step": 226570 + }, + { + "epoch": 9.38, + "grad_norm": 0.984375, + "learning_rate": 0.00038873865998178197, + "loss": 0.2322, + "step": 226580 + }, + { + "epoch": 9.39, + "grad_norm": 0.5625, + "learning_rate": 0.00038872963794397075, + "loss": 0.2211, + "step": 226590 + }, + { + "epoch": 9.39, + "grad_norm": 0.51953125, + "learning_rate": 0.0003887206156450854, + "loss": 0.1943, + "step": 226600 + }, + { + "epoch": 9.39, + "grad_norm": 0.625, + "learning_rate": 0.00038871159308514287, + "loss": 0.2054, + "step": 226610 + }, + { + "epoch": 9.39, + "grad_norm": 0.5, + "learning_rate": 0.00038870257026416, + "loss": 0.1885, + "step": 226620 + }, + { + "epoch": 9.39, + "grad_norm": 0.47265625, + "learning_rate": 0.00038869354718215393, + "loss": 0.1612, + "step": 226630 + }, + { + "epoch": 9.39, + "grad_norm": 0.275390625, + "learning_rate": 0.00038868452383914153, + "loss": 0.1545, + "step": 226640 + }, + { + "epoch": 9.39, + "grad_norm": 1.2421875, + "learning_rate": 0.0003886755002351399, + "loss": 0.1877, + "step": 226650 + }, + { + "epoch": 9.39, + "grad_norm": 0.73046875, + "learning_rate": 0.00038866647637016594, + "loss": 0.1986, + "step": 226660 + }, + { + "epoch": 9.39, + "grad_norm": 0.83984375, + "learning_rate": 0.0003886574522442366, + "loss": 0.1684, + "step": 226670 + }, + { + "epoch": 9.39, + "grad_norm": 0.65625, + "learning_rate": 0.00038864842785736896, + "loss": 0.2069, + "step": 226680 + }, + { + "epoch": 9.39, + "grad_norm": 0.51953125, + "learning_rate": 0.0003886394032095799, + "loss": 0.194, + "step": 226690 + }, + { + "epoch": 9.39, + "grad_norm": 1.484375, + "learning_rate": 0.0003886303783008865, + "loss": 0.2446, + "step": 226700 + }, + { + "epoch": 9.39, + "grad_norm": 1.0859375, + "learning_rate": 0.0003886213531313057, + "loss": 0.2127, + "step": 226710 + }, + { + "epoch": 9.39, + "grad_norm": 0.69921875, + "learning_rate": 0.00038861232770085445, + "loss": 0.1511, + "step": 226720 + }, + { + "epoch": 9.39, + "grad_norm": 1.3828125, + "learning_rate": 0.0003886033020095498, + "loss": 0.2227, + "step": 226730 + }, + { + "epoch": 9.39, + "grad_norm": 0.55078125, + "learning_rate": 0.00038859427605740873, + "loss": 0.1896, + "step": 226740 + }, + { + "epoch": 9.39, + "grad_norm": 1.953125, + "learning_rate": 0.0003885852498444481, + "loss": 0.1769, + "step": 226750 + }, + { + "epoch": 9.39, + "grad_norm": 1.046875, + "learning_rate": 0.00038857622337068503, + "loss": 0.2306, + "step": 226760 + }, + { + "epoch": 9.39, + "grad_norm": 0.703125, + "learning_rate": 0.0003885671966361366, + "loss": 0.2103, + "step": 226770 + }, + { + "epoch": 9.39, + "grad_norm": 0.69921875, + "learning_rate": 0.00038855816964081946, + "loss": 0.1946, + "step": 226780 + }, + { + "epoch": 9.39, + "grad_norm": 0.365234375, + "learning_rate": 0.00038854914238475093, + "loss": 0.1787, + "step": 226790 + }, + { + "epoch": 9.39, + "grad_norm": 1.3828125, + "learning_rate": 0.0003885401148679479, + "loss": 0.1967, + "step": 226800 + }, + { + "epoch": 9.39, + "grad_norm": 0.3984375, + "learning_rate": 0.0003885310870904273, + "loss": 0.1778, + "step": 226810 + }, + { + "epoch": 9.39, + "grad_norm": 0.2421875, + "learning_rate": 0.0003885220590522062, + "loss": 0.1857, + "step": 226820 + }, + { + "epoch": 9.4, + "grad_norm": 1.7734375, + "learning_rate": 0.00038851303075330143, + "loss": 0.1656, + "step": 226830 + }, + { + "epoch": 9.4, + "grad_norm": 0.73046875, + "learning_rate": 0.0003885040021937302, + "loss": 0.2039, + "step": 226840 + }, + { + "epoch": 9.4, + "grad_norm": 0.703125, + "learning_rate": 0.00038849497337350937, + "loss": 0.1933, + "step": 226850 + }, + { + "epoch": 9.4, + "grad_norm": 0.98828125, + "learning_rate": 0.00038848594429265583, + "loss": 0.2153, + "step": 226860 + }, + { + "epoch": 9.4, + "grad_norm": 0.875, + "learning_rate": 0.0003884769149511869, + "loss": 0.1788, + "step": 226870 + }, + { + "epoch": 9.4, + "grad_norm": 1.234375, + "learning_rate": 0.00038846788534911925, + "loss": 0.2031, + "step": 226880 + }, + { + "epoch": 9.4, + "grad_norm": 2.078125, + "learning_rate": 0.00038845885548646997, + "loss": 0.2462, + "step": 226890 + }, + { + "epoch": 9.4, + "grad_norm": 0.51171875, + "learning_rate": 0.0003884498253632561, + "loss": 0.2016, + "step": 226900 + }, + { + "epoch": 9.4, + "grad_norm": 1.0, + "learning_rate": 0.0003884407949794947, + "loss": 0.2192, + "step": 226910 + }, + { + "epoch": 9.4, + "grad_norm": 0.4609375, + "learning_rate": 0.00038843176433520256, + "loss": 0.1832, + "step": 226920 + }, + { + "epoch": 9.4, + "grad_norm": 0.96484375, + "learning_rate": 0.0003884227334303969, + "loss": 0.1812, + "step": 226930 + }, + { + "epoch": 9.4, + "grad_norm": 0.90625, + "learning_rate": 0.00038841370226509454, + "loss": 0.2474, + "step": 226940 + }, + { + "epoch": 9.4, + "grad_norm": 1.1015625, + "learning_rate": 0.0003884046708393125, + "loss": 0.2508, + "step": 226950 + }, + { + "epoch": 9.4, + "grad_norm": 0.5859375, + "learning_rate": 0.00038839563915306786, + "loss": 0.2159, + "step": 226960 + }, + { + "epoch": 9.4, + "grad_norm": 1.3125, + "learning_rate": 0.0003883866072063776, + "loss": 0.2183, + "step": 226970 + }, + { + "epoch": 9.4, + "grad_norm": 0.458984375, + "learning_rate": 0.00038837757499925863, + "loss": 0.2301, + "step": 226980 + }, + { + "epoch": 9.4, + "grad_norm": 1.4296875, + "learning_rate": 0.00038836854253172803, + "loss": 0.168, + "step": 226990 + }, + { + "epoch": 9.4, + "grad_norm": 1.0234375, + "learning_rate": 0.0003883595098038028, + "loss": 0.1978, + "step": 227000 + }, + { + "epoch": 9.4, + "grad_norm": 0.83984375, + "learning_rate": 0.0003883504768154999, + "loss": 0.1423, + "step": 227010 + }, + { + "epoch": 9.4, + "grad_norm": 1.4609375, + "learning_rate": 0.0003883414435668363, + "loss": 0.2507, + "step": 227020 + }, + { + "epoch": 9.4, + "grad_norm": 0.7109375, + "learning_rate": 0.00038833241005782913, + "loss": 0.1554, + "step": 227030 + }, + { + "epoch": 9.4, + "grad_norm": 0.53515625, + "learning_rate": 0.00038832337628849525, + "loss": 0.2473, + "step": 227040 + }, + { + "epoch": 9.4, + "grad_norm": 1.1328125, + "learning_rate": 0.00038831434225885175, + "loss": 0.2, + "step": 227050 + }, + { + "epoch": 9.4, + "grad_norm": 0.34765625, + "learning_rate": 0.00038830530796891554, + "loss": 0.1924, + "step": 227060 + }, + { + "epoch": 9.41, + "grad_norm": 1.09375, + "learning_rate": 0.00038829627341870364, + "loss": 0.17, + "step": 227070 + }, + { + "epoch": 9.41, + "grad_norm": 1.484375, + "learning_rate": 0.00038828723860823323, + "loss": 0.1792, + "step": 227080 + }, + { + "epoch": 9.41, + "grad_norm": 1.765625, + "learning_rate": 0.0003882782035375211, + "loss": 0.1917, + "step": 227090 + }, + { + "epoch": 9.41, + "grad_norm": 0.56640625, + "learning_rate": 0.0003882691682065843, + "loss": 0.2051, + "step": 227100 + }, + { + "epoch": 9.41, + "grad_norm": 1.109375, + "learning_rate": 0.00038826013261543987, + "loss": 0.1741, + "step": 227110 + }, + { + "epoch": 9.41, + "grad_norm": 0.45703125, + "learning_rate": 0.00038825109676410475, + "loss": 0.1457, + "step": 227120 + }, + { + "epoch": 9.41, + "grad_norm": 0.6875, + "learning_rate": 0.0003882420606525961, + "loss": 0.1971, + "step": 227130 + }, + { + "epoch": 9.41, + "grad_norm": 1.3515625, + "learning_rate": 0.0003882330242809307, + "loss": 0.1673, + "step": 227140 + }, + { + "epoch": 9.41, + "grad_norm": 2.46875, + "learning_rate": 0.0003882239876491258, + "loss": 0.1951, + "step": 227150 + }, + { + "epoch": 9.41, + "grad_norm": 0.7109375, + "learning_rate": 0.0003882149507571982, + "loss": 0.1934, + "step": 227160 + }, + { + "epoch": 9.41, + "grad_norm": 0.62109375, + "learning_rate": 0.000388205913605165, + "loss": 0.1427, + "step": 227170 + }, + { + "epoch": 9.41, + "grad_norm": 1.109375, + "learning_rate": 0.00038819687619304324, + "loss": 0.2112, + "step": 227180 + }, + { + "epoch": 9.41, + "grad_norm": 0.87109375, + "learning_rate": 0.0003881878385208499, + "loss": 0.141, + "step": 227190 + }, + { + "epoch": 9.41, + "grad_norm": 1.5, + "learning_rate": 0.0003881788005886019, + "loss": 0.1606, + "step": 227200 + }, + { + "epoch": 9.41, + "grad_norm": 0.1142578125, + "learning_rate": 0.00038816976239631633, + "loss": 0.1687, + "step": 227210 + }, + { + "epoch": 9.41, + "grad_norm": 1.5703125, + "learning_rate": 0.0003881607239440101, + "loss": 0.2085, + "step": 227220 + }, + { + "epoch": 9.41, + "grad_norm": 1.5625, + "learning_rate": 0.00038815168523170053, + "loss": 0.2029, + "step": 227230 + }, + { + "epoch": 9.41, + "grad_norm": 2.46875, + "learning_rate": 0.00038814264625940424, + "loss": 0.1941, + "step": 227240 + }, + { + "epoch": 9.41, + "grad_norm": 0.66015625, + "learning_rate": 0.00038813360702713844, + "loss": 0.1687, + "step": 227250 + }, + { + "epoch": 9.41, + "grad_norm": 0.3125, + "learning_rate": 0.0003881245675349202, + "loss": 0.1931, + "step": 227260 + }, + { + "epoch": 9.41, + "grad_norm": 0.58984375, + "learning_rate": 0.00038811552778276635, + "loss": 0.2105, + "step": 227270 + }, + { + "epoch": 9.41, + "grad_norm": 0.58203125, + "learning_rate": 0.000388106487770694, + "loss": 0.1661, + "step": 227280 + }, + { + "epoch": 9.41, + "grad_norm": 0.91796875, + "learning_rate": 0.0003880974474987201, + "loss": 0.1996, + "step": 227290 + }, + { + "epoch": 9.41, + "grad_norm": 1.015625, + "learning_rate": 0.00038808840696686177, + "loss": 0.2437, + "step": 227300 + }, + { + "epoch": 9.42, + "grad_norm": 0.9921875, + "learning_rate": 0.000388079366175136, + "loss": 0.152, + "step": 227310 + }, + { + "epoch": 9.42, + "grad_norm": 0.76953125, + "learning_rate": 0.00038807032512355976, + "loss": 0.1419, + "step": 227320 + }, + { + "epoch": 9.42, + "grad_norm": 0.7734375, + "learning_rate": 0.0003880612838121501, + "loss": 0.1468, + "step": 227330 + }, + { + "epoch": 9.42, + "grad_norm": 0.478515625, + "learning_rate": 0.00038805224224092394, + "loss": 0.1735, + "step": 227340 + }, + { + "epoch": 9.42, + "grad_norm": 1.0, + "learning_rate": 0.0003880432004098984, + "loss": 0.1632, + "step": 227350 + }, + { + "epoch": 9.42, + "grad_norm": 0.416015625, + "learning_rate": 0.0003880341583190905, + "loss": 0.2055, + "step": 227360 + }, + { + "epoch": 9.42, + "grad_norm": 0.44140625, + "learning_rate": 0.0003880251159685172, + "loss": 0.1744, + "step": 227370 + }, + { + "epoch": 9.42, + "grad_norm": 0.73046875, + "learning_rate": 0.00038801607335819553, + "loss": 0.2041, + "step": 227380 + }, + { + "epoch": 9.42, + "grad_norm": 0.76953125, + "learning_rate": 0.0003880070304881425, + "loss": 0.2447, + "step": 227390 + }, + { + "epoch": 9.42, + "grad_norm": 0.267578125, + "learning_rate": 0.00038799798735837524, + "loss": 0.1875, + "step": 227400 + }, + { + "epoch": 9.42, + "grad_norm": 1.328125, + "learning_rate": 0.0003879889439689105, + "loss": 0.1989, + "step": 227410 + }, + { + "epoch": 9.42, + "grad_norm": 0.7890625, + "learning_rate": 0.0003879799003197656, + "loss": 0.1907, + "step": 227420 + }, + { + "epoch": 9.42, + "grad_norm": 0.470703125, + "learning_rate": 0.00038797085641095736, + "loss": 0.1927, + "step": 227430 + }, + { + "epoch": 9.42, + "grad_norm": 0.60546875, + "learning_rate": 0.00038796181224250295, + "loss": 0.2074, + "step": 227440 + }, + { + "epoch": 9.42, + "grad_norm": 1.0390625, + "learning_rate": 0.00038795276781441925, + "loss": 0.1753, + "step": 227450 + }, + { + "epoch": 9.42, + "grad_norm": 0.2734375, + "learning_rate": 0.00038794372312672335, + "loss": 0.2136, + "step": 227460 + }, + { + "epoch": 9.42, + "grad_norm": 0.86328125, + "learning_rate": 0.0003879346781794322, + "loss": 0.1928, + "step": 227470 + }, + { + "epoch": 9.42, + "grad_norm": 0.38671875, + "learning_rate": 0.00038792563297256294, + "loss": 0.2412, + "step": 227480 + }, + { + "epoch": 9.42, + "grad_norm": 0.298828125, + "learning_rate": 0.0003879165875061326, + "loss": 0.1819, + "step": 227490 + }, + { + "epoch": 9.42, + "grad_norm": 0.609375, + "learning_rate": 0.00038790754178015795, + "loss": 0.2186, + "step": 227500 + }, + { + "epoch": 9.42, + "grad_norm": 1.015625, + "learning_rate": 0.0003878984957946563, + "loss": 0.192, + "step": 227510 + }, + { + "epoch": 9.42, + "grad_norm": 0.9453125, + "learning_rate": 0.0003878894495496447, + "loss": 0.1668, + "step": 227520 + }, + { + "epoch": 9.42, + "grad_norm": 0.75, + "learning_rate": 0.00038788040304513986, + "loss": 0.221, + "step": 227530 + }, + { + "epoch": 9.42, + "grad_norm": 0.515625, + "learning_rate": 0.00038787135628115906, + "loss": 0.2187, + "step": 227540 + }, + { + "epoch": 9.43, + "grad_norm": 0.98828125, + "learning_rate": 0.00038786230925771925, + "loss": 0.2289, + "step": 227550 + }, + { + "epoch": 9.43, + "grad_norm": 0.185546875, + "learning_rate": 0.0003878532619748375, + "loss": 0.1641, + "step": 227560 + }, + { + "epoch": 9.43, + "grad_norm": 0.6640625, + "learning_rate": 0.00038784421443253074, + "loss": 0.156, + "step": 227570 + }, + { + "epoch": 9.43, + "grad_norm": 0.251953125, + "learning_rate": 0.0003878351666308161, + "loss": 0.1778, + "step": 227580 + }, + { + "epoch": 9.43, + "grad_norm": 1.1484375, + "learning_rate": 0.0003878261185697105, + "loss": 0.2694, + "step": 227590 + }, + { + "epoch": 9.43, + "grad_norm": 0.62890625, + "learning_rate": 0.00038781707024923106, + "loss": 0.179, + "step": 227600 + }, + { + "epoch": 9.43, + "grad_norm": 0.447265625, + "learning_rate": 0.00038780802166939486, + "loss": 0.1882, + "step": 227610 + }, + { + "epoch": 9.43, + "grad_norm": 1.25, + "learning_rate": 0.0003877989728302187, + "loss": 0.1349, + "step": 227620 + }, + { + "epoch": 9.43, + "grad_norm": 1.3671875, + "learning_rate": 0.00038778992373171984, + "loss": 0.232, + "step": 227630 + }, + { + "epoch": 9.43, + "grad_norm": 0.72265625, + "learning_rate": 0.00038778087437391524, + "loss": 0.1799, + "step": 227640 + }, + { + "epoch": 9.43, + "grad_norm": 1.0859375, + "learning_rate": 0.00038777182475682194, + "loss": 0.2207, + "step": 227650 + }, + { + "epoch": 9.43, + "grad_norm": 0.64453125, + "learning_rate": 0.0003877627748804569, + "loss": 0.1549, + "step": 227660 + }, + { + "epoch": 9.43, + "grad_norm": 0.859375, + "learning_rate": 0.00038775372474483713, + "loss": 0.1892, + "step": 227670 + }, + { + "epoch": 9.43, + "grad_norm": 1.421875, + "learning_rate": 0.00038774467434997975, + "loss": 0.2164, + "step": 227680 + }, + { + "epoch": 9.43, + "grad_norm": 0.138671875, + "learning_rate": 0.0003877356236959019, + "loss": 0.2192, + "step": 227690 + }, + { + "epoch": 9.43, + "grad_norm": 0.228515625, + "learning_rate": 0.0003877265727826203, + "loss": 0.1876, + "step": 227700 + }, + { + "epoch": 9.43, + "grad_norm": 0.361328125, + "learning_rate": 0.0003877175216101523, + "loss": 0.2066, + "step": 227710 + }, + { + "epoch": 9.43, + "grad_norm": 0.8671875, + "learning_rate": 0.0003877084701785147, + "loss": 0.2386, + "step": 227720 + }, + { + "epoch": 9.43, + "grad_norm": 1.0, + "learning_rate": 0.00038769941848772465, + "loss": 0.1794, + "step": 227730 + }, + { + "epoch": 9.43, + "grad_norm": 0.79296875, + "learning_rate": 0.0003876903665377992, + "loss": 0.2025, + "step": 227740 + }, + { + "epoch": 9.43, + "grad_norm": 0.921875, + "learning_rate": 0.0003876813143287553, + "loss": 0.1683, + "step": 227750 + }, + { + "epoch": 9.43, + "grad_norm": 0.640625, + "learning_rate": 0.0003876722618606101, + "loss": 0.2347, + "step": 227760 + }, + { + "epoch": 9.43, + "grad_norm": 0.98828125, + "learning_rate": 0.0003876632091333805, + "loss": 0.201, + "step": 227770 + }, + { + "epoch": 9.43, + "grad_norm": 1.5234375, + "learning_rate": 0.0003876541561470837, + "loss": 0.2103, + "step": 227780 + }, + { + "epoch": 9.44, + "grad_norm": 0.5625, + "learning_rate": 0.00038764510290173655, + "loss": 0.206, + "step": 227790 + }, + { + "epoch": 9.44, + "grad_norm": 0.328125, + "learning_rate": 0.0003876360493973562, + "loss": 0.1823, + "step": 227800 + }, + { + "epoch": 9.44, + "grad_norm": 0.88671875, + "learning_rate": 0.0003876269956339597, + "loss": 0.2011, + "step": 227810 + }, + { + "epoch": 9.44, + "grad_norm": 0.55859375, + "learning_rate": 0.00038761794161156404, + "loss": 0.1789, + "step": 227820 + }, + { + "epoch": 9.44, + "grad_norm": 1.734375, + "learning_rate": 0.0003876088873301863, + "loss": 0.1782, + "step": 227830 + }, + { + "epoch": 9.44, + "grad_norm": 0.87109375, + "learning_rate": 0.00038759983278984346, + "loss": 0.1644, + "step": 227840 + }, + { + "epoch": 9.44, + "grad_norm": 0.82421875, + "learning_rate": 0.0003875907779905526, + "loss": 0.1963, + "step": 227850 + }, + { + "epoch": 9.44, + "grad_norm": 1.21875, + "learning_rate": 0.0003875817229323307, + "loss": 0.2065, + "step": 227860 + }, + { + "epoch": 9.44, + "grad_norm": 0.75390625, + "learning_rate": 0.000387572667615195, + "loss": 0.2441, + "step": 227870 + }, + { + "epoch": 9.44, + "grad_norm": 1.3359375, + "learning_rate": 0.0003875636120391623, + "loss": 0.2195, + "step": 227880 + }, + { + "epoch": 9.44, + "grad_norm": 0.97265625, + "learning_rate": 0.00038755455620424973, + "loss": 0.2273, + "step": 227890 + }, + { + "epoch": 9.44, + "grad_norm": 0.890625, + "learning_rate": 0.00038754550011047434, + "loss": 0.2074, + "step": 227900 + }, + { + "epoch": 9.44, + "grad_norm": 0.5390625, + "learning_rate": 0.00038753644375785323, + "loss": 0.2161, + "step": 227910 + }, + { + "epoch": 9.44, + "grad_norm": 0.73828125, + "learning_rate": 0.00038752738714640335, + "loss": 0.1888, + "step": 227920 + }, + { + "epoch": 9.44, + "grad_norm": 0.60546875, + "learning_rate": 0.00038751833027614183, + "loss": 0.1554, + "step": 227930 + }, + { + "epoch": 9.44, + "grad_norm": 0.5859375, + "learning_rate": 0.00038750927314708554, + "loss": 0.2593, + "step": 227940 + }, + { + "epoch": 9.44, + "grad_norm": 0.5703125, + "learning_rate": 0.00038750021575925176, + "loss": 0.1856, + "step": 227950 + }, + { + "epoch": 9.44, + "grad_norm": 0.94921875, + "learning_rate": 0.00038749115811265736, + "loss": 0.1919, + "step": 227960 + }, + { + "epoch": 9.44, + "grad_norm": 1.2421875, + "learning_rate": 0.00038748210020731947, + "loss": 0.2282, + "step": 227970 + }, + { + "epoch": 9.44, + "grad_norm": 0.6015625, + "learning_rate": 0.00038747304204325505, + "loss": 0.1802, + "step": 227980 + }, + { + "epoch": 9.44, + "grad_norm": 0.67578125, + "learning_rate": 0.0003874639836204813, + "loss": 0.1856, + "step": 227990 + }, + { + "epoch": 9.44, + "grad_norm": 0.5390625, + "learning_rate": 0.0003874549249390152, + "loss": 0.2033, + "step": 228000 + }, + { + "epoch": 9.44, + "grad_norm": 0.4296875, + "learning_rate": 0.00038744586599887373, + "loss": 0.1806, + "step": 228010 + }, + { + "epoch": 9.44, + "grad_norm": 0.150390625, + "learning_rate": 0.00038743680680007397, + "loss": 0.2589, + "step": 228020 + }, + { + "epoch": 9.44, + "grad_norm": 0.95703125, + "learning_rate": 0.00038742774734263296, + "loss": 0.2048, + "step": 228030 + }, + { + "epoch": 9.45, + "grad_norm": 0.515625, + "learning_rate": 0.0003874186876265678, + "loss": 0.17, + "step": 228040 + }, + { + "epoch": 9.45, + "grad_norm": 2.390625, + "learning_rate": 0.0003874096276518956, + "loss": 0.2099, + "step": 228050 + }, + { + "epoch": 9.45, + "grad_norm": 0.498046875, + "learning_rate": 0.00038740056741863317, + "loss": 0.2069, + "step": 228060 + }, + { + "epoch": 9.45, + "grad_norm": 0.828125, + "learning_rate": 0.00038739150692679774, + "loss": 0.189, + "step": 228070 + }, + { + "epoch": 9.45, + "grad_norm": 0.984375, + "learning_rate": 0.0003873824461764064, + "loss": 0.2274, + "step": 228080 + }, + { + "epoch": 9.45, + "grad_norm": 1.0234375, + "learning_rate": 0.0003873733851674761, + "loss": 0.2727, + "step": 228090 + }, + { + "epoch": 9.45, + "grad_norm": 0.34765625, + "learning_rate": 0.0003873643239000239, + "loss": 0.2132, + "step": 228100 + }, + { + "epoch": 9.45, + "grad_norm": 0.58203125, + "learning_rate": 0.0003873552623740669, + "loss": 0.195, + "step": 228110 + }, + { + "epoch": 9.45, + "grad_norm": 0.73828125, + "learning_rate": 0.0003873462005896221, + "loss": 0.226, + "step": 228120 + }, + { + "epoch": 9.45, + "grad_norm": 1.140625, + "learning_rate": 0.0003873371385467067, + "loss": 0.2214, + "step": 228130 + }, + { + "epoch": 9.45, + "grad_norm": 1.2109375, + "learning_rate": 0.00038732807624533745, + "loss": 0.2657, + "step": 228140 + }, + { + "epoch": 9.45, + "grad_norm": 0.3359375, + "learning_rate": 0.00038731901368553167, + "loss": 0.1868, + "step": 228150 + }, + { + "epoch": 9.45, + "grad_norm": 1.5859375, + "learning_rate": 0.00038730995086730635, + "loss": 0.2112, + "step": 228160 + }, + { + "epoch": 9.45, + "grad_norm": 0.4140625, + "learning_rate": 0.00038730088779067857, + "loss": 0.195, + "step": 228170 + }, + { + "epoch": 9.45, + "grad_norm": 0.478515625, + "learning_rate": 0.00038729182445566536, + "loss": 0.1859, + "step": 228180 + }, + { + "epoch": 9.45, + "grad_norm": 0.6953125, + "learning_rate": 0.00038728276086228363, + "loss": 0.1872, + "step": 228190 + }, + { + "epoch": 9.45, + "grad_norm": 0.333984375, + "learning_rate": 0.0003872736970105507, + "loss": 0.2295, + "step": 228200 + }, + { + "epoch": 9.45, + "grad_norm": 0.27734375, + "learning_rate": 0.0003872646329004834, + "loss": 0.1764, + "step": 228210 + }, + { + "epoch": 9.45, + "grad_norm": 2.40625, + "learning_rate": 0.0003872555685320989, + "loss": 0.1339, + "step": 228220 + }, + { + "epoch": 9.45, + "grad_norm": 0.8515625, + "learning_rate": 0.0003872465039054143, + "loss": 0.1892, + "step": 228230 + }, + { + "epoch": 9.45, + "grad_norm": 0.71875, + "learning_rate": 0.0003872374390204466, + "loss": 0.2146, + "step": 228240 + }, + { + "epoch": 9.45, + "grad_norm": 0.63671875, + "learning_rate": 0.0003872283738772128, + "loss": 0.1883, + "step": 228250 + }, + { + "epoch": 9.45, + "grad_norm": 0.578125, + "learning_rate": 0.00038721930847573005, + "loss": 0.175, + "step": 228260 + }, + { + "epoch": 9.45, + "grad_norm": 1.125, + "learning_rate": 0.00038721024281601537, + "loss": 0.2107, + "step": 228270 + }, + { + "epoch": 9.46, + "grad_norm": 0.1640625, + "learning_rate": 0.00038720117689808586, + "loss": 0.1818, + "step": 228280 + }, + { + "epoch": 9.46, + "grad_norm": 0.82421875, + "learning_rate": 0.0003871921107219586, + "loss": 0.2258, + "step": 228290 + }, + { + "epoch": 9.46, + "grad_norm": 1.21875, + "learning_rate": 0.0003871830442876505, + "loss": 0.1866, + "step": 228300 + }, + { + "epoch": 9.46, + "grad_norm": 0.9140625, + "learning_rate": 0.00038717397759517873, + "loss": 0.2052, + "step": 228310 + }, + { + "epoch": 9.46, + "grad_norm": 0.65234375, + "learning_rate": 0.00038716491064456036, + "loss": 0.1975, + "step": 228320 + }, + { + "epoch": 9.46, + "grad_norm": 0.58203125, + "learning_rate": 0.0003871558434358125, + "loss": 0.2761, + "step": 228330 + }, + { + "epoch": 9.46, + "grad_norm": 0.65625, + "learning_rate": 0.0003871467759689521, + "loss": 0.1865, + "step": 228340 + }, + { + "epoch": 9.46, + "grad_norm": 0.287109375, + "learning_rate": 0.0003871377082439963, + "loss": 0.1993, + "step": 228350 + }, + { + "epoch": 9.46, + "grad_norm": 1.1875, + "learning_rate": 0.0003871286402609622, + "loss": 0.2035, + "step": 228360 + }, + { + "epoch": 9.46, + "grad_norm": 0.59765625, + "learning_rate": 0.0003871195720198667, + "loss": 0.1987, + "step": 228370 + }, + { + "epoch": 9.46, + "grad_norm": 0.6796875, + "learning_rate": 0.000387110503520727, + "loss": 0.1956, + "step": 228380 + }, + { + "epoch": 9.46, + "grad_norm": 0.796875, + "learning_rate": 0.0003871014347635602, + "loss": 0.188, + "step": 228390 + }, + { + "epoch": 9.46, + "grad_norm": 0.0, + "learning_rate": 0.00038709236574838325, + "loss": 0.2152, + "step": 228400 + }, + { + "epoch": 9.46, + "grad_norm": 0.0001239776611328125, + "learning_rate": 0.00038708329647521323, + "loss": 0.2518, + "step": 228410 + }, + { + "epoch": 9.46, + "grad_norm": 0.52734375, + "learning_rate": 0.0003870742269440674, + "loss": 0.2043, + "step": 228420 + }, + { + "epoch": 9.46, + "grad_norm": 0.734375, + "learning_rate": 0.0003870651571549625, + "loss": 0.1936, + "step": 228430 + }, + { + "epoch": 9.46, + "grad_norm": 0.76953125, + "learning_rate": 0.0003870560871079159, + "loss": 0.141, + "step": 228440 + }, + { + "epoch": 9.46, + "grad_norm": 1.46875, + "learning_rate": 0.0003870470168029445, + "loss": 0.194, + "step": 228450 + }, + { + "epoch": 9.46, + "grad_norm": 0.6953125, + "learning_rate": 0.0003870379462400654, + "loss": 0.164, + "step": 228460 + }, + { + "epoch": 9.46, + "grad_norm": 0.427734375, + "learning_rate": 0.00038702887541929574, + "loss": 0.2097, + "step": 228470 + }, + { + "epoch": 9.46, + "grad_norm": 1.6953125, + "learning_rate": 0.0003870198043406524, + "loss": 0.1919, + "step": 228480 + }, + { + "epoch": 9.46, + "grad_norm": 0.83203125, + "learning_rate": 0.00038701073300415273, + "loss": 0.1683, + "step": 228490 + }, + { + "epoch": 9.46, + "grad_norm": 1.1328125, + "learning_rate": 0.0003870016614098136, + "loss": 0.203, + "step": 228500 + }, + { + "epoch": 9.46, + "grad_norm": 0.63671875, + "learning_rate": 0.00038699258955765217, + "loss": 0.1791, + "step": 228510 + }, + { + "epoch": 9.47, + "grad_norm": 1.609375, + "learning_rate": 0.0003869835174476854, + "loss": 0.1762, + "step": 228520 + }, + { + "epoch": 9.47, + "grad_norm": 0.51953125, + "learning_rate": 0.00038697444507993054, + "loss": 0.2249, + "step": 228530 + }, + { + "epoch": 9.47, + "grad_norm": 1.53125, + "learning_rate": 0.0003869653724544045, + "loss": 0.2072, + "step": 228540 + }, + { + "epoch": 9.47, + "grad_norm": 0.890625, + "learning_rate": 0.00038695629957112446, + "loss": 0.1807, + "step": 228550 + }, + { + "epoch": 9.47, + "grad_norm": 0.8515625, + "learning_rate": 0.0003869472264301074, + "loss": 0.2175, + "step": 228560 + }, + { + "epoch": 9.47, + "grad_norm": 1.0625, + "learning_rate": 0.00038693815303137054, + "loss": 0.2513, + "step": 228570 + }, + { + "epoch": 9.47, + "grad_norm": 1.203125, + "learning_rate": 0.00038692907937493083, + "loss": 0.1874, + "step": 228580 + }, + { + "epoch": 9.47, + "grad_norm": 0.80859375, + "learning_rate": 0.0003869200054608053, + "loss": 0.1702, + "step": 228590 + }, + { + "epoch": 9.47, + "grad_norm": 0.6796875, + "learning_rate": 0.0003869109312890112, + "loss": 0.1935, + "step": 228600 + }, + { + "epoch": 9.47, + "grad_norm": 1.03125, + "learning_rate": 0.0003869018568595655, + "loss": 0.2305, + "step": 228610 + }, + { + "epoch": 9.47, + "grad_norm": 0.8359375, + "learning_rate": 0.00038689278217248526, + "loss": 0.2166, + "step": 228620 + }, + { + "epoch": 9.47, + "grad_norm": 1.5078125, + "learning_rate": 0.00038688370722778754, + "loss": 0.1775, + "step": 228630 + }, + { + "epoch": 9.47, + "grad_norm": 0.98828125, + "learning_rate": 0.00038687463202548956, + "loss": 0.185, + "step": 228640 + }, + { + "epoch": 9.47, + "grad_norm": 1.6171875, + "learning_rate": 0.0003868655565656083, + "loss": 0.24, + "step": 228650 + }, + { + "epoch": 9.47, + "grad_norm": 0.53125, + "learning_rate": 0.0003868564808481607, + "loss": 0.2083, + "step": 228660 + }, + { + "epoch": 9.47, + "grad_norm": 1.0234375, + "learning_rate": 0.0003868474048731642, + "loss": 0.1659, + "step": 228670 + }, + { + "epoch": 9.47, + "grad_norm": 2.109375, + "learning_rate": 0.0003868383286406355, + "loss": 0.2401, + "step": 228680 + }, + { + "epoch": 9.47, + "grad_norm": 1.515625, + "learning_rate": 0.0003868292521505919, + "loss": 0.1739, + "step": 228690 + }, + { + "epoch": 9.47, + "grad_norm": 1.296875, + "learning_rate": 0.00038682017540305044, + "loss": 0.2259, + "step": 228700 + }, + { + "epoch": 9.47, + "grad_norm": 1.2578125, + "learning_rate": 0.00038681109839802815, + "loss": 0.1857, + "step": 228710 + }, + { + "epoch": 9.47, + "grad_norm": 0.78515625, + "learning_rate": 0.00038680202113554216, + "loss": 0.1892, + "step": 228720 + }, + { + "epoch": 9.47, + "grad_norm": 0.0004520416259765625, + "learning_rate": 0.0003867929436156096, + "loss": 0.1957, + "step": 228730 + }, + { + "epoch": 9.47, + "grad_norm": 0.5859375, + "learning_rate": 0.0003867838658382473, + "loss": 0.1829, + "step": 228740 + }, + { + "epoch": 9.47, + "grad_norm": 0.87109375, + "learning_rate": 0.00038677478780347273, + "loss": 0.2009, + "step": 228750 + }, + { + "epoch": 9.48, + "grad_norm": 1.3984375, + "learning_rate": 0.00038676570951130275, + "loss": 0.2012, + "step": 228760 + }, + { + "epoch": 9.48, + "grad_norm": 0.87109375, + "learning_rate": 0.0003867566309617544, + "loss": 0.2436, + "step": 228770 + }, + { + "epoch": 9.48, + "grad_norm": 0.87890625, + "learning_rate": 0.0003867475521548449, + "loss": 0.2384, + "step": 228780 + }, + { + "epoch": 9.48, + "grad_norm": 0.81640625, + "learning_rate": 0.0003867384730905913, + "loss": 0.2243, + "step": 228790 + }, + { + "epoch": 9.48, + "grad_norm": 0.96484375, + "learning_rate": 0.00038672939376901055, + "loss": 0.2352, + "step": 228800 + }, + { + "epoch": 9.48, + "grad_norm": 0.66015625, + "learning_rate": 0.00038672031419011995, + "loss": 0.1955, + "step": 228810 + }, + { + "epoch": 9.48, + "grad_norm": 0.87109375, + "learning_rate": 0.0003867112343539364, + "loss": 0.1654, + "step": 228820 + }, + { + "epoch": 9.48, + "grad_norm": 0.58984375, + "learning_rate": 0.0003867021542604771, + "loss": 0.1806, + "step": 228830 + }, + { + "epoch": 9.48, + "grad_norm": 0.69921875, + "learning_rate": 0.00038669307390975914, + "loss": 0.213, + "step": 228840 + }, + { + "epoch": 9.48, + "grad_norm": 1.0625, + "learning_rate": 0.0003866839933017996, + "loss": 0.2134, + "step": 228850 + }, + { + "epoch": 9.48, + "grad_norm": 1.5234375, + "learning_rate": 0.00038667491243661546, + "loss": 0.1961, + "step": 228860 + }, + { + "epoch": 9.48, + "grad_norm": 0.77734375, + "learning_rate": 0.00038666583131422396, + "loss": 0.1508, + "step": 228870 + }, + { + "epoch": 9.48, + "grad_norm": 0.2021484375, + "learning_rate": 0.00038665674993464214, + "loss": 0.1998, + "step": 228880 + }, + { + "epoch": 9.48, + "grad_norm": 0.67578125, + "learning_rate": 0.00038664766829788703, + "loss": 0.151, + "step": 228890 + }, + { + "epoch": 9.48, + "grad_norm": 0.478515625, + "learning_rate": 0.00038663858640397575, + "loss": 0.1838, + "step": 228900 + }, + { + "epoch": 9.48, + "grad_norm": 0.275390625, + "learning_rate": 0.0003866295042529254, + "loss": 0.1823, + "step": 228910 + }, + { + "epoch": 9.48, + "grad_norm": 0.75390625, + "learning_rate": 0.0003866204218447531, + "loss": 0.2016, + "step": 228920 + }, + { + "epoch": 9.48, + "grad_norm": 0.61328125, + "learning_rate": 0.000386611339179476, + "loss": 0.1928, + "step": 228930 + }, + { + "epoch": 9.48, + "grad_norm": 0.90625, + "learning_rate": 0.00038660225625711096, + "loss": 0.2004, + "step": 228940 + }, + { + "epoch": 9.48, + "grad_norm": 0.875, + "learning_rate": 0.0003865931730776753, + "loss": 0.2266, + "step": 228950 + }, + { + "epoch": 9.48, + "grad_norm": 0.5390625, + "learning_rate": 0.00038658408964118606, + "loss": 0.2089, + "step": 228960 + }, + { + "epoch": 9.48, + "grad_norm": 0.59375, + "learning_rate": 0.0003865750059476604, + "loss": 0.1852, + "step": 228970 + }, + { + "epoch": 9.48, + "grad_norm": 1.2421875, + "learning_rate": 0.00038656592199711517, + "loss": 0.1642, + "step": 228980 + }, + { + "epoch": 9.48, + "grad_norm": 0.70703125, + "learning_rate": 0.0003865568377895676, + "loss": 0.2234, + "step": 228990 + }, + { + "epoch": 9.49, + "grad_norm": 0.353515625, + "learning_rate": 0.00038654775332503495, + "loss": 0.1695, + "step": 229000 + }, + { + "epoch": 9.49, + "grad_norm": 1.5234375, + "learning_rate": 0.0003865386686035341, + "loss": 0.1996, + "step": 229010 + }, + { + "epoch": 9.49, + "grad_norm": 0.353515625, + "learning_rate": 0.0003865295836250823, + "loss": 0.2024, + "step": 229020 + }, + { + "epoch": 9.49, + "grad_norm": 0.65625, + "learning_rate": 0.00038652049838969643, + "loss": 0.1748, + "step": 229030 + }, + { + "epoch": 9.49, + "grad_norm": 0.73828125, + "learning_rate": 0.00038651141289739377, + "loss": 0.1883, + "step": 229040 + }, + { + "epoch": 9.49, + "grad_norm": 0.421875, + "learning_rate": 0.00038650232714819145, + "loss": 0.1878, + "step": 229050 + }, + { + "epoch": 9.49, + "grad_norm": 0.75390625, + "learning_rate": 0.00038649324114210635, + "loss": 0.1876, + "step": 229060 + }, + { + "epoch": 9.49, + "grad_norm": 0.84765625, + "learning_rate": 0.00038648415487915587, + "loss": 0.1486, + "step": 229070 + }, + { + "epoch": 9.49, + "grad_norm": 0.515625, + "learning_rate": 0.00038647506835935686, + "loss": 0.1997, + "step": 229080 + }, + { + "epoch": 9.49, + "grad_norm": 1.6328125, + "learning_rate": 0.0003864659815827265, + "loss": 0.1986, + "step": 229090 + }, + { + "epoch": 9.49, + "grad_norm": 0.330078125, + "learning_rate": 0.00038645689454928195, + "loss": 0.1996, + "step": 229100 + }, + { + "epoch": 9.49, + "grad_norm": 0.8984375, + "learning_rate": 0.0003864478072590402, + "loss": 0.1844, + "step": 229110 + }, + { + "epoch": 9.49, + "grad_norm": 0.8203125, + "learning_rate": 0.0003864387197120185, + "loss": 0.2151, + "step": 229120 + }, + { + "epoch": 9.49, + "grad_norm": 0.455078125, + "learning_rate": 0.00038642963190823384, + "loss": 0.2091, + "step": 229130 + }, + { + "epoch": 9.49, + "grad_norm": 0.73046875, + "learning_rate": 0.00038642054384770326, + "loss": 0.2139, + "step": 229140 + }, + { + "epoch": 9.49, + "grad_norm": 0.31640625, + "learning_rate": 0.000386411455530444, + "loss": 0.1634, + "step": 229150 + }, + { + "epoch": 9.49, + "grad_norm": 0.90625, + "learning_rate": 0.0003864023669564731, + "loss": 0.1399, + "step": 229160 + }, + { + "epoch": 9.49, + "grad_norm": 1.1875, + "learning_rate": 0.0003863932781258077, + "loss": 0.2166, + "step": 229170 + }, + { + "epoch": 9.49, + "grad_norm": 0.5625, + "learning_rate": 0.00038638418903846493, + "loss": 0.2383, + "step": 229180 + }, + { + "epoch": 9.49, + "grad_norm": 0.5390625, + "learning_rate": 0.0003863750996944617, + "loss": 0.2042, + "step": 229190 + }, + { + "epoch": 9.49, + "grad_norm": 0.796875, + "learning_rate": 0.00038636601009381545, + "loss": 0.2226, + "step": 229200 + }, + { + "epoch": 9.49, + "grad_norm": 1.3125, + "learning_rate": 0.000386356920236543, + "loss": 0.2207, + "step": 229210 + }, + { + "epoch": 9.49, + "grad_norm": 1.2890625, + "learning_rate": 0.0003863478301226615, + "loss": 0.1788, + "step": 229220 + }, + { + "epoch": 9.49, + "grad_norm": 1.109375, + "learning_rate": 0.0003863387397521881, + "loss": 0.1763, + "step": 229230 + }, + { + "epoch": 9.5, + "grad_norm": 1.3828125, + "learning_rate": 0.00038632964912514, + "loss": 0.1616, + "step": 229240 + }, + { + "epoch": 9.5, + "grad_norm": 0.93359375, + "learning_rate": 0.0003863205582415342, + "loss": 0.2237, + "step": 229250 + }, + { + "epoch": 9.5, + "grad_norm": 1.2734375, + "learning_rate": 0.00038631146710138786, + "loss": 0.2318, + "step": 229260 + }, + { + "epoch": 9.5, + "grad_norm": 0.59375, + "learning_rate": 0.0003863023757047179, + "loss": 0.1787, + "step": 229270 + }, + { + "epoch": 9.5, + "grad_norm": 0.328125, + "learning_rate": 0.0003862932840515418, + "loss": 0.1736, + "step": 229280 + }, + { + "epoch": 9.5, + "grad_norm": 0.55859375, + "learning_rate": 0.00038628419214187633, + "loss": 0.2375, + "step": 229290 + }, + { + "epoch": 9.5, + "grad_norm": 0.82421875, + "learning_rate": 0.0003862750999757388, + "loss": 0.2115, + "step": 229300 + }, + { + "epoch": 9.5, + "grad_norm": 0.94921875, + "learning_rate": 0.0003862660075531462, + "loss": 0.1865, + "step": 229310 + }, + { + "epoch": 9.5, + "grad_norm": 0.55859375, + "learning_rate": 0.00038625691487411567, + "loss": 0.2054, + "step": 229320 + }, + { + "epoch": 9.5, + "grad_norm": 0.33984375, + "learning_rate": 0.00038624782193866436, + "loss": 0.2058, + "step": 229330 + }, + { + "epoch": 9.5, + "grad_norm": 0.049560546875, + "learning_rate": 0.00038623872874680944, + "loss": 0.1687, + "step": 229340 + }, + { + "epoch": 9.5, + "grad_norm": 1.3046875, + "learning_rate": 0.0003862296352985678, + "loss": 0.2167, + "step": 229350 + }, + { + "epoch": 9.5, + "grad_norm": 0.7890625, + "learning_rate": 0.00038622054159395686, + "loss": 0.2375, + "step": 229360 + }, + { + "epoch": 9.5, + "grad_norm": 0.427734375, + "learning_rate": 0.0003862114476329934, + "loss": 0.1791, + "step": 229370 + }, + { + "epoch": 9.5, + "grad_norm": 0.314453125, + "learning_rate": 0.00038620235341569486, + "loss": 0.1758, + "step": 229380 + }, + { + "epoch": 9.5, + "grad_norm": 0.5234375, + "learning_rate": 0.00038619325894207813, + "loss": 0.2087, + "step": 229390 + }, + { + "epoch": 9.5, + "grad_norm": 0.73828125, + "learning_rate": 0.0003861841642121604, + "loss": 0.1947, + "step": 229400 + }, + { + "epoch": 9.5, + "grad_norm": 0.91796875, + "learning_rate": 0.0003861750692259588, + "loss": 0.2129, + "step": 229410 + }, + { + "epoch": 9.5, + "grad_norm": 0.671875, + "learning_rate": 0.0003861659739834903, + "loss": 0.2184, + "step": 229420 + }, + { + "epoch": 9.5, + "grad_norm": 0.0, + "learning_rate": 0.0003861568784847723, + "loss": 0.1652, + "step": 229430 + }, + { + "epoch": 9.5, + "grad_norm": 0.5390625, + "learning_rate": 0.0003861477827298217, + "loss": 0.2422, + "step": 229440 + }, + { + "epoch": 9.5, + "grad_norm": 0.46875, + "learning_rate": 0.0003861386867186557, + "loss": 0.1876, + "step": 229450 + }, + { + "epoch": 9.5, + "grad_norm": 1.4140625, + "learning_rate": 0.00038612959045129135, + "loss": 0.1308, + "step": 229460 + }, + { + "epoch": 9.5, + "grad_norm": 0.88671875, + "learning_rate": 0.00038612049392774587, + "loss": 0.2102, + "step": 229470 + }, + { + "epoch": 9.51, + "grad_norm": 1.140625, + "learning_rate": 0.0003861113971480362, + "loss": 0.1895, + "step": 229480 + }, + { + "epoch": 9.51, + "grad_norm": 1.609375, + "learning_rate": 0.0003861023001121797, + "loss": 0.1768, + "step": 229490 + }, + { + "epoch": 9.51, + "grad_norm": 2.140625, + "learning_rate": 0.0003860932028201933, + "loss": 0.1849, + "step": 229500 + }, + { + "epoch": 9.51, + "grad_norm": 1.625, + "learning_rate": 0.00038608410527209424, + "loss": 0.1902, + "step": 229510 + }, + { + "epoch": 9.51, + "grad_norm": 0.7734375, + "learning_rate": 0.0003860750074678996, + "loss": 0.1789, + "step": 229520 + }, + { + "epoch": 9.51, + "grad_norm": 1.5390625, + "learning_rate": 0.0003860659094076264, + "loss": 0.203, + "step": 229530 + }, + { + "epoch": 9.51, + "grad_norm": 1.7109375, + "learning_rate": 0.00038605681109129197, + "loss": 0.1848, + "step": 229540 + }, + { + "epoch": 9.51, + "grad_norm": 0.95703125, + "learning_rate": 0.0003860477125189133, + "loss": 0.1857, + "step": 229550 + }, + { + "epoch": 9.51, + "grad_norm": 0.68359375, + "learning_rate": 0.0003860386136905075, + "loss": 0.2611, + "step": 229560 + }, + { + "epoch": 9.51, + "grad_norm": 0.455078125, + "learning_rate": 0.0003860295146060917, + "loss": 0.2028, + "step": 229570 + }, + { + "epoch": 9.51, + "grad_norm": 0.3125, + "learning_rate": 0.000386020415265683, + "loss": 0.1917, + "step": 229580 + }, + { + "epoch": 9.51, + "grad_norm": 1.1171875, + "learning_rate": 0.0003860113156692986, + "loss": 0.1733, + "step": 229590 + }, + { + "epoch": 9.51, + "grad_norm": 0.478515625, + "learning_rate": 0.0003860022158169556, + "loss": 0.2049, + "step": 229600 + }, + { + "epoch": 9.51, + "grad_norm": 0.52734375, + "learning_rate": 0.0003859931157086711, + "loss": 0.1987, + "step": 229610 + }, + { + "epoch": 9.51, + "grad_norm": 1.296875, + "learning_rate": 0.0003859840153444623, + "loss": 0.2493, + "step": 229620 + }, + { + "epoch": 9.51, + "grad_norm": 0.734375, + "learning_rate": 0.0003859749147243462, + "loss": 0.1982, + "step": 229630 + }, + { + "epoch": 9.51, + "grad_norm": 0.8125, + "learning_rate": 0.00038596581384834006, + "loss": 0.146, + "step": 229640 + }, + { + "epoch": 9.51, + "grad_norm": 1.78125, + "learning_rate": 0.00038595671271646095, + "loss": 0.1824, + "step": 229650 + }, + { + "epoch": 9.51, + "grad_norm": 0.96484375, + "learning_rate": 0.0003859476113287259, + "loss": 0.2002, + "step": 229660 + }, + { + "epoch": 9.51, + "grad_norm": 0.6796875, + "learning_rate": 0.0003859385096851522, + "loss": 0.171, + "step": 229670 + }, + { + "epoch": 9.51, + "grad_norm": 1.484375, + "learning_rate": 0.0003859294077857569, + "loss": 0.2304, + "step": 229680 + }, + { + "epoch": 9.51, + "grad_norm": 1.09375, + "learning_rate": 0.00038592030563055715, + "loss": 0.1899, + "step": 229690 + }, + { + "epoch": 9.51, + "grad_norm": 0.96484375, + "learning_rate": 0.00038591120321957, + "loss": 0.1754, + "step": 229700 + }, + { + "epoch": 9.51, + "grad_norm": 0.61328125, + "learning_rate": 0.0003859021005528126, + "loss": 0.2158, + "step": 229710 + }, + { + "epoch": 9.51, + "grad_norm": 0.9140625, + "learning_rate": 0.0003858929976303023, + "loss": 0.2229, + "step": 229720 + }, + { + "epoch": 9.52, + "grad_norm": 0.79296875, + "learning_rate": 0.0003858838944520559, + "loss": 0.1981, + "step": 229730 + }, + { + "epoch": 9.52, + "grad_norm": 0.83984375, + "learning_rate": 0.0003858747910180907, + "loss": 0.1781, + "step": 229740 + }, + { + "epoch": 9.52, + "grad_norm": 0.578125, + "learning_rate": 0.0003858656873284239, + "loss": 0.1813, + "step": 229750 + }, + { + "epoch": 9.52, + "grad_norm": 0.37109375, + "learning_rate": 0.0003858565833830725, + "loss": 0.2015, + "step": 229760 + }, + { + "epoch": 9.52, + "grad_norm": 0.69921875, + "learning_rate": 0.00038584747918205366, + "loss": 0.1823, + "step": 229770 + }, + { + "epoch": 9.52, + "grad_norm": 0.58203125, + "learning_rate": 0.0003858383747253845, + "loss": 0.225, + "step": 229780 + }, + { + "epoch": 9.52, + "grad_norm": 0.69140625, + "learning_rate": 0.0003858292700130823, + "loss": 0.219, + "step": 229790 + }, + { + "epoch": 9.52, + "grad_norm": 0.64453125, + "learning_rate": 0.000385820165045164, + "loss": 0.2188, + "step": 229800 + }, + { + "epoch": 9.52, + "grad_norm": 0.72265625, + "learning_rate": 0.0003858110598216469, + "loss": 0.2271, + "step": 229810 + }, + { + "epoch": 9.52, + "grad_norm": 0.76953125, + "learning_rate": 0.0003858019543425479, + "loss": 0.2227, + "step": 229820 + }, + { + "epoch": 9.52, + "grad_norm": 0.74609375, + "learning_rate": 0.00038579284860788436, + "loss": 0.2061, + "step": 229830 + }, + { + "epoch": 9.52, + "grad_norm": 0.84765625, + "learning_rate": 0.00038578374261767336, + "loss": 0.1998, + "step": 229840 + }, + { + "epoch": 9.52, + "grad_norm": 0.83984375, + "learning_rate": 0.00038577463637193206, + "loss": 0.2068, + "step": 229850 + }, + { + "epoch": 9.52, + "grad_norm": 1.0546875, + "learning_rate": 0.00038576552987067754, + "loss": 0.223, + "step": 229860 + }, + { + "epoch": 9.52, + "grad_norm": 0.47265625, + "learning_rate": 0.0003857564231139269, + "loss": 0.2038, + "step": 229870 + }, + { + "epoch": 9.52, + "grad_norm": 2.546875, + "learning_rate": 0.00038574731610169737, + "loss": 0.1804, + "step": 229880 + }, + { + "epoch": 9.52, + "grad_norm": 2.765625, + "learning_rate": 0.0003857382088340061, + "loss": 0.2341, + "step": 229890 + }, + { + "epoch": 9.52, + "grad_norm": 0.466796875, + "learning_rate": 0.00038572910131087003, + "loss": 0.1576, + "step": 229900 + }, + { + "epoch": 9.52, + "grad_norm": 1.3359375, + "learning_rate": 0.0003857199935323066, + "loss": 0.2217, + "step": 229910 + }, + { + "epoch": 9.52, + "grad_norm": 0.76953125, + "learning_rate": 0.00038571088549833275, + "loss": 0.1965, + "step": 229920 + }, + { + "epoch": 9.52, + "grad_norm": 0.609375, + "learning_rate": 0.0003857017772089656, + "loss": 0.1875, + "step": 229930 + }, + { + "epoch": 9.52, + "grad_norm": 1.1171875, + "learning_rate": 0.0003856926686642225, + "loss": 0.1927, + "step": 229940 + }, + { + "epoch": 9.52, + "grad_norm": 0.83203125, + "learning_rate": 0.0003856835598641203, + "loss": 0.2284, + "step": 229950 + }, + { + "epoch": 9.52, + "grad_norm": 0.486328125, + "learning_rate": 0.0003856744508086764, + "loss": 0.1977, + "step": 229960 + }, + { + "epoch": 9.53, + "grad_norm": 1.109375, + "learning_rate": 0.0003856653414979078, + "loss": 0.1938, + "step": 229970 + }, + { + "epoch": 9.53, + "grad_norm": 0.73046875, + "learning_rate": 0.00038565623193183166, + "loss": 0.1807, + "step": 229980 + }, + { + "epoch": 9.53, + "grad_norm": 0.6015625, + "learning_rate": 0.00038564712211046516, + "loss": 0.1756, + "step": 229990 + }, + { + "epoch": 9.53, + "grad_norm": 0.419921875, + "learning_rate": 0.00038563801203382543, + "loss": 0.179, + "step": 230000 + }, + { + "epoch": 9.53, + "grad_norm": 0.74609375, + "learning_rate": 0.0003856289017019296, + "loss": 0.2734, + "step": 230010 + }, + { + "epoch": 9.53, + "grad_norm": 0.9765625, + "learning_rate": 0.00038561979111479484, + "loss": 0.2219, + "step": 230020 + }, + { + "epoch": 9.53, + "grad_norm": 0.76171875, + "learning_rate": 0.00038561068027243825, + "loss": 0.1959, + "step": 230030 + }, + { + "epoch": 9.53, + "grad_norm": 0.796875, + "learning_rate": 0.000385601569174877, + "loss": 0.1693, + "step": 230040 + }, + { + "epoch": 9.53, + "grad_norm": 0.458984375, + "learning_rate": 0.0003855924578221283, + "loss": 0.2272, + "step": 230050 + }, + { + "epoch": 9.53, + "grad_norm": 0.6015625, + "learning_rate": 0.00038558334621420915, + "loss": 0.2143, + "step": 230060 + }, + { + "epoch": 9.53, + "grad_norm": 0.73828125, + "learning_rate": 0.0003855742343511369, + "loss": 0.2055, + "step": 230070 + }, + { + "epoch": 9.53, + "grad_norm": 0.9296875, + "learning_rate": 0.00038556512223292846, + "loss": 0.1535, + "step": 230080 + }, + { + "epoch": 9.53, + "grad_norm": 0.5078125, + "learning_rate": 0.00038555600985960115, + "loss": 0.1338, + "step": 230090 + }, + { + "epoch": 9.53, + "grad_norm": 1.2421875, + "learning_rate": 0.0003855468972311721, + "loss": 0.2189, + "step": 230100 + }, + { + "epoch": 9.53, + "grad_norm": 1.2265625, + "learning_rate": 0.00038553778434765836, + "loss": 0.2116, + "step": 230110 + }, + { + "epoch": 9.53, + "grad_norm": 0.451171875, + "learning_rate": 0.0003855286712090772, + "loss": 0.2034, + "step": 230120 + }, + { + "epoch": 9.53, + "grad_norm": 1.2890625, + "learning_rate": 0.00038551955781544566, + "loss": 0.1731, + "step": 230130 + }, + { + "epoch": 9.53, + "grad_norm": 0.34375, + "learning_rate": 0.00038551044416678103, + "loss": 0.2012, + "step": 230140 + }, + { + "epoch": 9.53, + "grad_norm": 0.9921875, + "learning_rate": 0.0003855013302631003, + "loss": 0.2467, + "step": 230150 + }, + { + "epoch": 9.53, + "grad_norm": 0.59765625, + "learning_rate": 0.00038549221610442074, + "loss": 0.2314, + "step": 230160 + }, + { + "epoch": 9.53, + "grad_norm": 1.109375, + "learning_rate": 0.0003854831016907595, + "loss": 0.1895, + "step": 230170 + }, + { + "epoch": 9.53, + "grad_norm": 0.99609375, + "learning_rate": 0.0003854739870221336, + "loss": 0.164, + "step": 230180 + }, + { + "epoch": 9.53, + "grad_norm": 0.62109375, + "learning_rate": 0.0003854648720985603, + "loss": 0.2015, + "step": 230190 + }, + { + "epoch": 9.53, + "grad_norm": 0.86328125, + "learning_rate": 0.00038545575692005684, + "loss": 0.2157, + "step": 230200 + }, + { + "epoch": 9.54, + "grad_norm": 0.640625, + "learning_rate": 0.00038544664148664024, + "loss": 0.2421, + "step": 230210 + }, + { + "epoch": 9.54, + "grad_norm": 0.546875, + "learning_rate": 0.00038543752579832765, + "loss": 0.1767, + "step": 230220 + }, + { + "epoch": 9.54, + "grad_norm": 1.203125, + "learning_rate": 0.00038542840985513626, + "loss": 0.2461, + "step": 230230 + }, + { + "epoch": 9.54, + "grad_norm": 1.078125, + "learning_rate": 0.0003854192936570832, + "loss": 0.2194, + "step": 230240 + }, + { + "epoch": 9.54, + "grad_norm": 1.9765625, + "learning_rate": 0.0003854101772041857, + "loss": 0.2339, + "step": 230250 + }, + { + "epoch": 9.54, + "grad_norm": 0.609375, + "learning_rate": 0.0003854010604964608, + "loss": 0.1771, + "step": 230260 + }, + { + "epoch": 9.54, + "grad_norm": 0.76953125, + "learning_rate": 0.0003853919435339258, + "loss": 0.1818, + "step": 230270 + }, + { + "epoch": 9.54, + "grad_norm": 1.171875, + "learning_rate": 0.0003853828263165978, + "loss": 0.213, + "step": 230280 + }, + { + "epoch": 9.54, + "grad_norm": 1.3515625, + "learning_rate": 0.00038537370884449395, + "loss": 0.1935, + "step": 230290 + }, + { + "epoch": 9.54, + "grad_norm": 0.390625, + "learning_rate": 0.0003853645911176313, + "loss": 0.1679, + "step": 230300 + }, + { + "epoch": 9.54, + "grad_norm": 1.4375, + "learning_rate": 0.00038535547313602725, + "loss": 0.2386, + "step": 230310 + }, + { + "epoch": 9.54, + "grad_norm": 0.62890625, + "learning_rate": 0.0003853463548996987, + "loss": 0.2121, + "step": 230320 + }, + { + "epoch": 9.54, + "grad_norm": 0.0, + "learning_rate": 0.00038533723640866296, + "loss": 0.207, + "step": 230330 + }, + { + "epoch": 9.54, + "grad_norm": 0.73828125, + "learning_rate": 0.0003853281176629371, + "loss": 0.1795, + "step": 230340 + }, + { + "epoch": 9.54, + "grad_norm": 0.734375, + "learning_rate": 0.00038531899866253846, + "loss": 0.1706, + "step": 230350 + }, + { + "epoch": 9.54, + "grad_norm": 0.73828125, + "learning_rate": 0.00038530987940748403, + "loss": 0.2105, + "step": 230360 + }, + { + "epoch": 9.54, + "grad_norm": 0.6640625, + "learning_rate": 0.00038530075989779095, + "loss": 0.2331, + "step": 230370 + }, + { + "epoch": 9.54, + "grad_norm": 0.671875, + "learning_rate": 0.0003852916401334765, + "loss": 0.1974, + "step": 230380 + }, + { + "epoch": 9.54, + "grad_norm": 0.57421875, + "learning_rate": 0.0003852825201145578, + "loss": 0.1688, + "step": 230390 + }, + { + "epoch": 9.54, + "grad_norm": 1.1171875, + "learning_rate": 0.00038527339984105203, + "loss": 0.2076, + "step": 230400 + }, + { + "epoch": 9.54, + "grad_norm": 0.765625, + "learning_rate": 0.0003852642793129763, + "loss": 0.2168, + "step": 230410 + }, + { + "epoch": 9.54, + "grad_norm": 0.466796875, + "learning_rate": 0.00038525515853034774, + "loss": 0.167, + "step": 230420 + }, + { + "epoch": 9.54, + "grad_norm": 0.8984375, + "learning_rate": 0.00038524603749318364, + "loss": 0.1925, + "step": 230430 + }, + { + "epoch": 9.54, + "grad_norm": 1.0703125, + "learning_rate": 0.0003852369162015011, + "loss": 0.2153, + "step": 230440 + }, + { + "epoch": 9.55, + "grad_norm": 0.8828125, + "learning_rate": 0.00038522779465531733, + "loss": 0.2025, + "step": 230450 + }, + { + "epoch": 9.55, + "grad_norm": 0.65234375, + "learning_rate": 0.0003852186728546494, + "loss": 0.1833, + "step": 230460 + }, + { + "epoch": 9.55, + "grad_norm": 0.69921875, + "learning_rate": 0.0003852095507995146, + "loss": 0.1822, + "step": 230470 + }, + { + "epoch": 9.55, + "grad_norm": 1.2265625, + "learning_rate": 0.00038520042848993, + "loss": 0.1612, + "step": 230480 + }, + { + "epoch": 9.55, + "grad_norm": 1.0, + "learning_rate": 0.0003851913059259127, + "loss": 0.1606, + "step": 230490 + }, + { + "epoch": 9.55, + "grad_norm": 0.76953125, + "learning_rate": 0.00038518218310748005, + "loss": 0.175, + "step": 230500 + }, + { + "epoch": 9.55, + "grad_norm": 0.94921875, + "learning_rate": 0.0003851730600346491, + "loss": 0.1958, + "step": 230510 + }, + { + "epoch": 9.55, + "grad_norm": 0.765625, + "learning_rate": 0.0003851639367074371, + "loss": 0.1662, + "step": 230520 + }, + { + "epoch": 9.55, + "grad_norm": 0.46875, + "learning_rate": 0.0003851548131258611, + "loss": 0.2121, + "step": 230530 + }, + { + "epoch": 9.55, + "grad_norm": 0.52734375, + "learning_rate": 0.00038514568928993837, + "loss": 0.1599, + "step": 230540 + }, + { + "epoch": 9.55, + "grad_norm": 0.88671875, + "learning_rate": 0.000385136565199686, + "loss": 0.1992, + "step": 230550 + }, + { + "epoch": 9.55, + "grad_norm": 0.36328125, + "learning_rate": 0.0003851274408551213, + "loss": 0.1558, + "step": 230560 + }, + { + "epoch": 9.55, + "grad_norm": 1.0703125, + "learning_rate": 0.00038511831625626125, + "loss": 0.2101, + "step": 230570 + }, + { + "epoch": 9.55, + "grad_norm": 0.4921875, + "learning_rate": 0.0003851091914031232, + "loss": 0.1959, + "step": 230580 + }, + { + "epoch": 9.55, + "grad_norm": 0.90625, + "learning_rate": 0.0003851000662957242, + "loss": 0.192, + "step": 230590 + }, + { + "epoch": 9.55, + "grad_norm": 0.875, + "learning_rate": 0.0003850909409340815, + "loss": 0.1837, + "step": 230600 + }, + { + "epoch": 9.55, + "grad_norm": 0.98046875, + "learning_rate": 0.0003850818153182122, + "loss": 0.201, + "step": 230610 + }, + { + "epoch": 9.55, + "grad_norm": 1.0, + "learning_rate": 0.00038507268944813354, + "loss": 0.2334, + "step": 230620 + }, + { + "epoch": 9.55, + "grad_norm": 0.2734375, + "learning_rate": 0.0003850635633238626, + "loss": 0.182, + "step": 230630 + }, + { + "epoch": 9.55, + "grad_norm": 0.86328125, + "learning_rate": 0.00038505443694541666, + "loss": 0.2047, + "step": 230640 + }, + { + "epoch": 9.55, + "grad_norm": 0.474609375, + "learning_rate": 0.00038504531031281286, + "loss": 0.1334, + "step": 230650 + }, + { + "epoch": 9.55, + "grad_norm": 0.3828125, + "learning_rate": 0.0003850361834260684, + "loss": 0.1877, + "step": 230660 + }, + { + "epoch": 9.55, + "grad_norm": 0.77734375, + "learning_rate": 0.00038502705628520043, + "loss": 0.1693, + "step": 230670 + }, + { + "epoch": 9.55, + "grad_norm": 0.146484375, + "learning_rate": 0.000385017928890226, + "loss": 0.265, + "step": 230680 + }, + { + "epoch": 9.56, + "grad_norm": 0.578125, + "learning_rate": 0.00038500880124116255, + "loss": 0.2138, + "step": 230690 + }, + { + "epoch": 9.56, + "grad_norm": 0.734375, + "learning_rate": 0.0003849996733380271, + "loss": 0.1936, + "step": 230700 + }, + { + "epoch": 9.56, + "grad_norm": 1.1953125, + "learning_rate": 0.00038499054518083677, + "loss": 0.1747, + "step": 230710 + }, + { + "epoch": 9.56, + "grad_norm": 0.62109375, + "learning_rate": 0.00038498141676960885, + "loss": 0.1265, + "step": 230720 + }, + { + "epoch": 9.56, + "grad_norm": 0.5703125, + "learning_rate": 0.0003849722881043605, + "loss": 0.2221, + "step": 230730 + }, + { + "epoch": 9.56, + "grad_norm": 0.84765625, + "learning_rate": 0.00038496315918510883, + "loss": 0.1953, + "step": 230740 + }, + { + "epoch": 9.56, + "grad_norm": 1.2109375, + "learning_rate": 0.00038495403001187115, + "loss": 0.1748, + "step": 230750 + }, + { + "epoch": 9.56, + "grad_norm": 0.443359375, + "learning_rate": 0.00038494490058466437, + "loss": 0.1871, + "step": 230760 + }, + { + "epoch": 9.56, + "grad_norm": 1.8125, + "learning_rate": 0.00038493577090350606, + "loss": 0.1749, + "step": 230770 + }, + { + "epoch": 9.56, + "grad_norm": 0.63671875, + "learning_rate": 0.0003849266409684132, + "loss": 0.2039, + "step": 230780 + }, + { + "epoch": 9.56, + "grad_norm": 0.7421875, + "learning_rate": 0.00038491751077940285, + "loss": 0.21, + "step": 230790 + }, + { + "epoch": 9.56, + "grad_norm": 1.296875, + "learning_rate": 0.0003849083803364924, + "loss": 0.1546, + "step": 230800 + }, + { + "epoch": 9.56, + "grad_norm": 0.64453125, + "learning_rate": 0.0003848992496396989, + "loss": 0.2021, + "step": 230810 + }, + { + "epoch": 9.56, + "grad_norm": 0.67578125, + "learning_rate": 0.00038489011868903954, + "loss": 0.2215, + "step": 230820 + }, + { + "epoch": 9.56, + "grad_norm": 0.72265625, + "learning_rate": 0.0003848809874845317, + "loss": 0.2056, + "step": 230830 + }, + { + "epoch": 9.56, + "grad_norm": 1.1171875, + "learning_rate": 0.0003848718560261922, + "loss": 0.1971, + "step": 230840 + }, + { + "epoch": 9.56, + "grad_norm": 0.71484375, + "learning_rate": 0.00038486272431403857, + "loss": 0.2177, + "step": 230850 + }, + { + "epoch": 9.56, + "grad_norm": 0.181640625, + "learning_rate": 0.00038485359234808784, + "loss": 0.1946, + "step": 230860 + }, + { + "epoch": 9.56, + "grad_norm": 0.53515625, + "learning_rate": 0.0003848444601283572, + "loss": 0.1559, + "step": 230870 + }, + { + "epoch": 9.56, + "grad_norm": 2.203125, + "learning_rate": 0.0003848353276548639, + "loss": 0.21, + "step": 230880 + }, + { + "epoch": 9.56, + "grad_norm": 0.494140625, + "learning_rate": 0.000384826194927625, + "loss": 0.2138, + "step": 230890 + }, + { + "epoch": 9.56, + "grad_norm": 0.8203125, + "learning_rate": 0.0003848170619466578, + "loss": 0.2093, + "step": 230900 + }, + { + "epoch": 9.56, + "grad_norm": 0.64453125, + "learning_rate": 0.0003848079287119795, + "loss": 0.2064, + "step": 230910 + }, + { + "epoch": 9.56, + "grad_norm": 0.6875, + "learning_rate": 0.00038479879522360707, + "loss": 0.1572, + "step": 230920 + }, + { + "epoch": 9.57, + "grad_norm": 0.515625, + "learning_rate": 0.000384789661481558, + "loss": 0.1908, + "step": 230930 + }, + { + "epoch": 9.57, + "grad_norm": 0.65625, + "learning_rate": 0.0003847805274858493, + "loss": 0.1525, + "step": 230940 + }, + { + "epoch": 9.57, + "grad_norm": 0.296875, + "learning_rate": 0.00038477139323649817, + "loss": 0.1787, + "step": 230950 + }, + { + "epoch": 9.57, + "grad_norm": 0.81640625, + "learning_rate": 0.0003847622587335219, + "loss": 0.2233, + "step": 230960 + }, + { + "epoch": 9.57, + "grad_norm": 0.609375, + "learning_rate": 0.00038475312397693765, + "loss": 0.1814, + "step": 230970 + }, + { + "epoch": 9.57, + "grad_norm": 0.53125, + "learning_rate": 0.00038474398896676243, + "loss": 0.1876, + "step": 230980 + }, + { + "epoch": 9.57, + "grad_norm": 0.75390625, + "learning_rate": 0.0003847348537030137, + "loss": 0.1989, + "step": 230990 + }, + { + "epoch": 9.57, + "grad_norm": 0.55859375, + "learning_rate": 0.00038472571818570843, + "loss": 0.1939, + "step": 231000 + }, + { + "epoch": 9.57, + "grad_norm": 0.458984375, + "learning_rate": 0.00038471658241486396, + "loss": 0.1857, + "step": 231010 + }, + { + "epoch": 9.57, + "grad_norm": 0.98828125, + "learning_rate": 0.0003847074463904974, + "loss": 0.2112, + "step": 231020 + }, + { + "epoch": 9.57, + "grad_norm": 1.40625, + "learning_rate": 0.000384698310112626, + "loss": 0.1814, + "step": 231030 + }, + { + "epoch": 9.57, + "grad_norm": 0.80859375, + "learning_rate": 0.000384689173581267, + "loss": 0.2047, + "step": 231040 + }, + { + "epoch": 9.57, + "grad_norm": 1.515625, + "learning_rate": 0.00038468003679643735, + "loss": 0.2165, + "step": 231050 + }, + { + "epoch": 9.57, + "grad_norm": 0.6953125, + "learning_rate": 0.0003846708997581546, + "loss": 0.1771, + "step": 231060 + }, + { + "epoch": 9.57, + "grad_norm": 0.388671875, + "learning_rate": 0.0003846617624664357, + "loss": 0.185, + "step": 231070 + }, + { + "epoch": 9.57, + "grad_norm": 1.0078125, + "learning_rate": 0.0003846526249212978, + "loss": 0.1772, + "step": 231080 + }, + { + "epoch": 9.57, + "grad_norm": 1.53125, + "learning_rate": 0.0003846434871227583, + "loss": 0.1897, + "step": 231090 + }, + { + "epoch": 9.57, + "grad_norm": 1.03125, + "learning_rate": 0.00038463434907083427, + "loss": 0.1817, + "step": 231100 + }, + { + "epoch": 9.57, + "grad_norm": 2.09375, + "learning_rate": 0.0003846252107655429, + "loss": 0.1859, + "step": 231110 + }, + { + "epoch": 9.57, + "grad_norm": 0.67578125, + "learning_rate": 0.0003846160722069015, + "loss": 0.2259, + "step": 231120 + }, + { + "epoch": 9.57, + "grad_norm": 0.64453125, + "learning_rate": 0.0003846069333949271, + "loss": 0.1671, + "step": 231130 + }, + { + "epoch": 9.57, + "grad_norm": 0.59375, + "learning_rate": 0.00038459779432963704, + "loss": 0.1747, + "step": 231140 + }, + { + "epoch": 9.57, + "grad_norm": 0.97265625, + "learning_rate": 0.00038458865501104844, + "loss": 0.1911, + "step": 231150 + }, + { + "epoch": 9.57, + "grad_norm": 1.3125, + "learning_rate": 0.00038457951543917855, + "loss": 0.28, + "step": 231160 + }, + { + "epoch": 9.58, + "grad_norm": 0.3671875, + "learning_rate": 0.00038457037561404463, + "loss": 0.2188, + "step": 231170 + }, + { + "epoch": 9.58, + "grad_norm": 0.609375, + "learning_rate": 0.0003845612355356637, + "loss": 0.2012, + "step": 231180 + }, + { + "epoch": 9.58, + "grad_norm": 0.5703125, + "learning_rate": 0.000384552095204053, + "loss": 0.1771, + "step": 231190 + }, + { + "epoch": 9.58, + "grad_norm": 1.0, + "learning_rate": 0.0003845429546192299, + "loss": 0.2097, + "step": 231200 + }, + { + "epoch": 9.58, + "grad_norm": 0.52734375, + "learning_rate": 0.0003845338137812114, + "loss": 0.2336, + "step": 231210 + }, + { + "epoch": 9.58, + "grad_norm": 0.8359375, + "learning_rate": 0.0003845246726900148, + "loss": 0.1955, + "step": 231220 + }, + { + "epoch": 9.58, + "grad_norm": 0.86328125, + "learning_rate": 0.0003845155313456573, + "loss": 0.1764, + "step": 231230 + }, + { + "epoch": 9.58, + "grad_norm": 1.15625, + "learning_rate": 0.0003845063897481562, + "loss": 0.2359, + "step": 231240 + }, + { + "epoch": 9.58, + "grad_norm": 0.6875, + "learning_rate": 0.0003844972478975285, + "loss": 0.1806, + "step": 231250 + }, + { + "epoch": 9.58, + "grad_norm": 0.431640625, + "learning_rate": 0.0003844881057937916, + "loss": 0.1937, + "step": 231260 + }, + { + "epoch": 9.58, + "grad_norm": 1.1875, + "learning_rate": 0.0003844789634369624, + "loss": 0.178, + "step": 231270 + }, + { + "epoch": 9.58, + "grad_norm": 1.1484375, + "learning_rate": 0.0003844698208270585, + "loss": 0.1935, + "step": 231280 + }, + { + "epoch": 9.58, + "grad_norm": 0.59375, + "learning_rate": 0.0003844606779640969, + "loss": 0.2448, + "step": 231290 + }, + { + "epoch": 9.58, + "grad_norm": 0.423828125, + "learning_rate": 0.00038445153484809475, + "loss": 0.1857, + "step": 231300 + }, + { + "epoch": 9.58, + "grad_norm": 2.390625, + "learning_rate": 0.00038444239147906935, + "loss": 0.2093, + "step": 231310 + }, + { + "epoch": 9.58, + "grad_norm": 0.44140625, + "learning_rate": 0.00038443324785703795, + "loss": 0.2087, + "step": 231320 + }, + { + "epoch": 9.58, + "grad_norm": 0.859375, + "learning_rate": 0.0003844241039820177, + "loss": 0.1921, + "step": 231330 + }, + { + "epoch": 9.58, + "grad_norm": 0.6171875, + "learning_rate": 0.0003844149598540257, + "loss": 0.1788, + "step": 231340 + }, + { + "epoch": 9.58, + "grad_norm": 0.9140625, + "learning_rate": 0.0003844058154730793, + "loss": 0.2011, + "step": 231350 + }, + { + "epoch": 9.58, + "grad_norm": 1.515625, + "learning_rate": 0.00038439667083919573, + "loss": 0.1977, + "step": 231360 + }, + { + "epoch": 9.58, + "grad_norm": 1.1953125, + "learning_rate": 0.0003843875259523921, + "loss": 0.2361, + "step": 231370 + }, + { + "epoch": 9.58, + "grad_norm": 1.0078125, + "learning_rate": 0.00038437838081268564, + "loss": 0.2083, + "step": 231380 + }, + { + "epoch": 9.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00038436923542009354, + "loss": 0.1783, + "step": 231390 + }, + { + "epoch": 9.58, + "grad_norm": 1.0, + "learning_rate": 0.0003843600897746331, + "loss": 0.1704, + "step": 231400 + }, + { + "epoch": 9.58, + "grad_norm": 0.58984375, + "learning_rate": 0.00038435094387632155, + "loss": 0.2086, + "step": 231410 + }, + { + "epoch": 9.59, + "grad_norm": 0.349609375, + "learning_rate": 0.00038434179772517597, + "loss": 0.2004, + "step": 231420 + }, + { + "epoch": 9.59, + "grad_norm": 0.65625, + "learning_rate": 0.0003843326513212136, + "loss": 0.1845, + "step": 231430 + }, + { + "epoch": 9.59, + "grad_norm": 0.9453125, + "learning_rate": 0.0003843235046644517, + "loss": 0.2033, + "step": 231440 + }, + { + "epoch": 9.59, + "grad_norm": 1.9375, + "learning_rate": 0.0003843143577549075, + "loss": 0.2109, + "step": 231450 + }, + { + "epoch": 9.59, + "grad_norm": 4.5625, + "learning_rate": 0.00038430521059259814, + "loss": 0.1941, + "step": 231460 + }, + { + "epoch": 9.59, + "grad_norm": 0.31640625, + "learning_rate": 0.00038429606317754095, + "loss": 0.2432, + "step": 231470 + }, + { + "epoch": 9.59, + "grad_norm": 0.390625, + "learning_rate": 0.00038428691550975304, + "loss": 0.2009, + "step": 231480 + }, + { + "epoch": 9.59, + "grad_norm": 0.578125, + "learning_rate": 0.00038427776758925163, + "loss": 0.1896, + "step": 231490 + }, + { + "epoch": 9.59, + "grad_norm": 0.6796875, + "learning_rate": 0.00038426861941605393, + "loss": 0.2073, + "step": 231500 + }, + { + "epoch": 9.59, + "grad_norm": 1.078125, + "learning_rate": 0.0003842594709901773, + "loss": 0.1801, + "step": 231510 + }, + { + "epoch": 9.59, + "grad_norm": 0.7890625, + "learning_rate": 0.00038425032231163874, + "loss": 0.2021, + "step": 231520 + }, + { + "epoch": 9.59, + "grad_norm": 0.66796875, + "learning_rate": 0.0003842411733804556, + "loss": 0.2181, + "step": 231530 + }, + { + "epoch": 9.59, + "grad_norm": 1.625, + "learning_rate": 0.00038423202419664515, + "loss": 0.146, + "step": 231540 + }, + { + "epoch": 9.59, + "grad_norm": 0.5, + "learning_rate": 0.0003842228747602244, + "loss": 0.2266, + "step": 231550 + }, + { + "epoch": 9.59, + "grad_norm": 1.5390625, + "learning_rate": 0.00038421372507121076, + "loss": 0.2087, + "step": 231560 + }, + { + "epoch": 9.59, + "grad_norm": 0.2490234375, + "learning_rate": 0.00038420457512962136, + "loss": 0.1901, + "step": 231570 + }, + { + "epoch": 9.59, + "grad_norm": 0.68359375, + "learning_rate": 0.0003841954249354735, + "loss": 0.1591, + "step": 231580 + }, + { + "epoch": 9.59, + "grad_norm": 0.55078125, + "learning_rate": 0.0003841862744887843, + "loss": 0.1881, + "step": 231590 + }, + { + "epoch": 9.59, + "grad_norm": 0.81640625, + "learning_rate": 0.000384177123789571, + "loss": 0.1701, + "step": 231600 + }, + { + "epoch": 9.59, + "grad_norm": 1.015625, + "learning_rate": 0.0003841679728378509, + "loss": 0.1679, + "step": 231610 + }, + { + "epoch": 9.59, + "grad_norm": 0.86328125, + "learning_rate": 0.0003841588216336411, + "loss": 0.2264, + "step": 231620 + }, + { + "epoch": 9.59, + "grad_norm": 0.48828125, + "learning_rate": 0.0003841496701769589, + "loss": 0.2052, + "step": 231630 + }, + { + "epoch": 9.59, + "grad_norm": 0.30078125, + "learning_rate": 0.0003841405184678215, + "loss": 0.2094, + "step": 231640 + }, + { + "epoch": 9.59, + "grad_norm": 0.640625, + "learning_rate": 0.00038413136650624615, + "loss": 0.1904, + "step": 231650 + }, + { + "epoch": 9.6, + "grad_norm": 1.765625, + "learning_rate": 0.00038412221429225016, + "loss": 0.1429, + "step": 231660 + }, + { + "epoch": 9.6, + "grad_norm": 0.85546875, + "learning_rate": 0.0003841130618258505, + "loss": 0.1902, + "step": 231670 + }, + { + "epoch": 9.6, + "grad_norm": 0.70703125, + "learning_rate": 0.00038410390910706454, + "loss": 0.1959, + "step": 231680 + }, + { + "epoch": 9.6, + "grad_norm": 0.6953125, + "learning_rate": 0.00038409475613590955, + "loss": 0.1911, + "step": 231690 + }, + { + "epoch": 9.6, + "grad_norm": 0.306640625, + "learning_rate": 0.00038408560291240267, + "loss": 0.1708, + "step": 231700 + }, + { + "epoch": 9.6, + "grad_norm": 0.7734375, + "learning_rate": 0.0003840764494365613, + "loss": 0.1733, + "step": 231710 + }, + { + "epoch": 9.6, + "grad_norm": 1.734375, + "learning_rate": 0.00038406729570840235, + "loss": 0.1833, + "step": 231720 + }, + { + "epoch": 9.6, + "grad_norm": 1.0859375, + "learning_rate": 0.00038405814172794335, + "loss": 0.1762, + "step": 231730 + }, + { + "epoch": 9.6, + "grad_norm": 1.7109375, + "learning_rate": 0.0003840489874952014, + "loss": 0.1922, + "step": 231740 + }, + { + "epoch": 9.6, + "grad_norm": 0.74609375, + "learning_rate": 0.0003840398330101936, + "loss": 0.2138, + "step": 231750 + }, + { + "epoch": 9.6, + "grad_norm": 1.3125, + "learning_rate": 0.0003840306782729375, + "loss": 0.2356, + "step": 231760 + }, + { + "epoch": 9.6, + "grad_norm": 0.81640625, + "learning_rate": 0.00038402152328344995, + "loss": 0.2071, + "step": 231770 + }, + { + "epoch": 9.6, + "grad_norm": 0.453125, + "learning_rate": 0.0003840123680417485, + "loss": 0.1652, + "step": 231780 + }, + { + "epoch": 9.6, + "grad_norm": 0.98828125, + "learning_rate": 0.0003840032125478502, + "loss": 0.2006, + "step": 231790 + }, + { + "epoch": 9.6, + "grad_norm": 1.3359375, + "learning_rate": 0.00038399405680177235, + "loss": 0.2402, + "step": 231800 + }, + { + "epoch": 9.6, + "grad_norm": 0.6875, + "learning_rate": 0.0003839849008035321, + "loss": 0.2002, + "step": 231810 + }, + { + "epoch": 9.6, + "grad_norm": 0.703125, + "learning_rate": 0.0003839757445531469, + "loss": 0.1911, + "step": 231820 + }, + { + "epoch": 9.6, + "grad_norm": 1.1640625, + "learning_rate": 0.0003839665880506336, + "loss": 0.2321, + "step": 231830 + }, + { + "epoch": 9.6, + "grad_norm": 0.58984375, + "learning_rate": 0.00038395743129600974, + "loss": 0.1883, + "step": 231840 + }, + { + "epoch": 9.6, + "grad_norm": 0.78125, + "learning_rate": 0.00038394827428929245, + "loss": 0.2024, + "step": 231850 + }, + { + "epoch": 9.6, + "grad_norm": 0.74609375, + "learning_rate": 0.000383939117030499, + "loss": 0.191, + "step": 231860 + }, + { + "epoch": 9.6, + "grad_norm": 0.369140625, + "learning_rate": 0.0003839299595196466, + "loss": 0.175, + "step": 231870 + }, + { + "epoch": 9.6, + "grad_norm": 1.828125, + "learning_rate": 0.0003839208017567524, + "loss": 0.1898, + "step": 231880 + }, + { + "epoch": 9.6, + "grad_norm": 0.5234375, + "learning_rate": 0.0003839116437418338, + "loss": 0.1525, + "step": 231890 + }, + { + "epoch": 9.61, + "grad_norm": 0.87890625, + "learning_rate": 0.0003839024854749079, + "loss": 0.2206, + "step": 231900 + }, + { + "epoch": 9.61, + "grad_norm": 0.59765625, + "learning_rate": 0.000383893326955992, + "loss": 0.2124, + "step": 231910 + }, + { + "epoch": 9.61, + "grad_norm": 0.384765625, + "learning_rate": 0.0003838841681851033, + "loss": 0.2995, + "step": 231920 + }, + { + "epoch": 9.61, + "grad_norm": 0.7109375, + "learning_rate": 0.0003838750091622591, + "loss": 0.1994, + "step": 231930 + }, + { + "epoch": 9.61, + "grad_norm": 0.73828125, + "learning_rate": 0.00038386584988747653, + "loss": 0.2199, + "step": 231940 + }, + { + "epoch": 9.61, + "grad_norm": 1.1640625, + "learning_rate": 0.00038385669036077296, + "loss": 0.2126, + "step": 231950 + }, + { + "epoch": 9.61, + "grad_norm": 0.5078125, + "learning_rate": 0.0003838475305821655, + "loss": 0.2074, + "step": 231960 + }, + { + "epoch": 9.61, + "grad_norm": 0.37890625, + "learning_rate": 0.0003838383705516715, + "loss": 0.1626, + "step": 231970 + }, + { + "epoch": 9.61, + "grad_norm": 0.6796875, + "learning_rate": 0.00038382921026930807, + "loss": 0.209, + "step": 231980 + }, + { + "epoch": 9.61, + "grad_norm": 1.765625, + "learning_rate": 0.0003838200497350925, + "loss": 0.1757, + "step": 231990 + }, + { + "epoch": 9.61, + "grad_norm": 0.87890625, + "learning_rate": 0.00038381088894904214, + "loss": 0.2536, + "step": 232000 + }, + { + "epoch": 9.61, + "grad_norm": 0.46484375, + "learning_rate": 0.00038380172791117407, + "loss": 0.2073, + "step": 232010 + }, + { + "epoch": 9.61, + "grad_norm": 0.8828125, + "learning_rate": 0.00038379256662150556, + "loss": 0.1882, + "step": 232020 + }, + { + "epoch": 9.61, + "grad_norm": 1.328125, + "learning_rate": 0.000383783405080054, + "loss": 0.2167, + "step": 232030 + }, + { + "epoch": 9.61, + "grad_norm": 0.94921875, + "learning_rate": 0.0003837742432868364, + "loss": 0.227, + "step": 232040 + }, + { + "epoch": 9.61, + "grad_norm": 1.1328125, + "learning_rate": 0.0003837650812418702, + "loss": 0.2013, + "step": 232050 + }, + { + "epoch": 9.61, + "grad_norm": 0.9140625, + "learning_rate": 0.00038375591894517256, + "loss": 0.2336, + "step": 232060 + }, + { + "epoch": 9.61, + "grad_norm": 1.0703125, + "learning_rate": 0.00038374675639676073, + "loss": 0.1703, + "step": 232070 + }, + { + "epoch": 9.61, + "grad_norm": 0.51953125, + "learning_rate": 0.00038373759359665196, + "loss": 0.1626, + "step": 232080 + }, + { + "epoch": 9.61, + "grad_norm": 1.03125, + "learning_rate": 0.0003837284305448634, + "loss": 0.2498, + "step": 232090 + }, + { + "epoch": 9.61, + "grad_norm": 0.484375, + "learning_rate": 0.00038371926724141235, + "loss": 0.2005, + "step": 232100 + }, + { + "epoch": 9.61, + "grad_norm": 0.61328125, + "learning_rate": 0.00038371010368631626, + "loss": 0.1776, + "step": 232110 + }, + { + "epoch": 9.61, + "grad_norm": 0.4609375, + "learning_rate": 0.00038370093987959196, + "loss": 0.2119, + "step": 232120 + }, + { + "epoch": 9.61, + "grad_norm": 0.83203125, + "learning_rate": 0.00038369177582125714, + "loss": 0.1316, + "step": 232130 + }, + { + "epoch": 9.62, + "grad_norm": 0.921875, + "learning_rate": 0.00038368261151132876, + "loss": 0.1809, + "step": 232140 + }, + { + "epoch": 9.62, + "grad_norm": 0.2265625, + "learning_rate": 0.0003836734469498241, + "loss": 0.2517, + "step": 232150 + }, + { + "epoch": 9.62, + "grad_norm": 1.09375, + "learning_rate": 0.00038366428213676055, + "loss": 0.2113, + "step": 232160 + }, + { + "epoch": 9.62, + "grad_norm": 0.8515625, + "learning_rate": 0.0003836551170721552, + "loss": 0.2388, + "step": 232170 + }, + { + "epoch": 9.62, + "grad_norm": 0.97265625, + "learning_rate": 0.0003836459517560253, + "loss": 0.2174, + "step": 232180 + }, + { + "epoch": 9.62, + "grad_norm": 0.82421875, + "learning_rate": 0.00038363678618838823, + "loss": 0.1646, + "step": 232190 + }, + { + "epoch": 9.62, + "grad_norm": 0.6953125, + "learning_rate": 0.000383627620369261, + "loss": 0.1556, + "step": 232200 + }, + { + "epoch": 9.62, + "grad_norm": 0.84765625, + "learning_rate": 0.00038361845429866126, + "loss": 0.2072, + "step": 232210 + }, + { + "epoch": 9.62, + "grad_norm": 1.1171875, + "learning_rate": 0.00038360928797660587, + "loss": 0.1939, + "step": 232220 + }, + { + "epoch": 9.62, + "grad_norm": 0.314453125, + "learning_rate": 0.0003836001214031122, + "loss": 0.199, + "step": 232230 + }, + { + "epoch": 9.62, + "grad_norm": 0.546875, + "learning_rate": 0.00038359095457819763, + "loss": 0.2, + "step": 232240 + }, + { + "epoch": 9.62, + "grad_norm": 1.640625, + "learning_rate": 0.00038358178750187927, + "loss": 0.2014, + "step": 232250 + }, + { + "epoch": 9.62, + "grad_norm": 0.640625, + "learning_rate": 0.0003835726201741744, + "loss": 0.2337, + "step": 232260 + }, + { + "epoch": 9.62, + "grad_norm": 2.203125, + "learning_rate": 0.00038356345259510035, + "loss": 0.2035, + "step": 232270 + }, + { + "epoch": 9.62, + "grad_norm": 1.1953125, + "learning_rate": 0.00038355428476467417, + "loss": 0.231, + "step": 232280 + }, + { + "epoch": 9.62, + "grad_norm": 3.703125, + "learning_rate": 0.0003835451166829134, + "loss": 0.2111, + "step": 232290 + }, + { + "epoch": 9.62, + "grad_norm": 0.625, + "learning_rate": 0.0003835359483498351, + "loss": 0.191, + "step": 232300 + }, + { + "epoch": 9.62, + "grad_norm": 1.09375, + "learning_rate": 0.00038352677976545644, + "loss": 0.2657, + "step": 232310 + }, + { + "epoch": 9.62, + "grad_norm": 0.88671875, + "learning_rate": 0.000383517610929795, + "loss": 0.1504, + "step": 232320 + }, + { + "epoch": 9.62, + "grad_norm": 0.53125, + "learning_rate": 0.00038350844184286765, + "loss": 0.2151, + "step": 232330 + }, + { + "epoch": 9.62, + "grad_norm": 0.625, + "learning_rate": 0.0003834992725046919, + "loss": 0.222, + "step": 232340 + }, + { + "epoch": 9.62, + "grad_norm": 1.0703125, + "learning_rate": 0.00038349010291528497, + "loss": 0.2129, + "step": 232350 + }, + { + "epoch": 9.62, + "grad_norm": 1.65625, + "learning_rate": 0.00038348093307466404, + "loss": 0.2353, + "step": 232360 + }, + { + "epoch": 9.62, + "grad_norm": 0.53515625, + "learning_rate": 0.0003834717629828465, + "loss": 0.1883, + "step": 232370 + }, + { + "epoch": 9.63, + "grad_norm": 0.98046875, + "learning_rate": 0.0003834625926398494, + "loss": 0.1753, + "step": 232380 + }, + { + "epoch": 9.63, + "grad_norm": 0.427734375, + "learning_rate": 0.00038345342204569013, + "loss": 0.21, + "step": 232390 + }, + { + "epoch": 9.63, + "grad_norm": 0.61328125, + "learning_rate": 0.00038344425120038594, + "loss": 0.1753, + "step": 232400 + }, + { + "epoch": 9.63, + "grad_norm": 0.9921875, + "learning_rate": 0.00038343508010395413, + "loss": 0.1989, + "step": 232410 + }, + { + "epoch": 9.63, + "grad_norm": 0.2392578125, + "learning_rate": 0.0003834259087564119, + "loss": 0.1762, + "step": 232420 + }, + { + "epoch": 9.63, + "grad_norm": 1.09375, + "learning_rate": 0.0003834167371577765, + "loss": 0.1762, + "step": 232430 + }, + { + "epoch": 9.63, + "grad_norm": 0.79296875, + "learning_rate": 0.00038340756530806517, + "loss": 0.1819, + "step": 232440 + }, + { + "epoch": 9.63, + "grad_norm": 0.953125, + "learning_rate": 0.00038339839320729525, + "loss": 0.1764, + "step": 232450 + }, + { + "epoch": 9.63, + "grad_norm": 1.78125, + "learning_rate": 0.0003833892208554839, + "loss": 0.2085, + "step": 232460 + }, + { + "epoch": 9.63, + "grad_norm": 0.6640625, + "learning_rate": 0.0003833800482526485, + "loss": 0.1702, + "step": 232470 + }, + { + "epoch": 9.63, + "grad_norm": 0.671875, + "learning_rate": 0.00038337087539880635, + "loss": 0.1489, + "step": 232480 + }, + { + "epoch": 9.63, + "grad_norm": 0.6171875, + "learning_rate": 0.0003833617022939744, + "loss": 0.1977, + "step": 232490 + }, + { + "epoch": 9.63, + "grad_norm": 0.3046875, + "learning_rate": 0.0003833525289381703, + "loss": 0.1538, + "step": 232500 + }, + { + "epoch": 9.63, + "grad_norm": 0.56640625, + "learning_rate": 0.00038334335533141107, + "loss": 0.1835, + "step": 232510 + }, + { + "epoch": 9.63, + "grad_norm": 1.1953125, + "learning_rate": 0.000383334181473714, + "loss": 0.2651, + "step": 232520 + }, + { + "epoch": 9.63, + "grad_norm": 0.98046875, + "learning_rate": 0.0003833250073650965, + "loss": 0.1747, + "step": 232530 + }, + { + "epoch": 9.63, + "grad_norm": 1.078125, + "learning_rate": 0.0003833158330055757, + "loss": 0.1922, + "step": 232540 + }, + { + "epoch": 9.63, + "grad_norm": 0.7265625, + "learning_rate": 0.0003833066583951689, + "loss": 0.147, + "step": 232550 + }, + { + "epoch": 9.63, + "grad_norm": 0.65625, + "learning_rate": 0.00038329748353389337, + "loss": 0.1105, + "step": 232560 + }, + { + "epoch": 9.63, + "grad_norm": 0.734375, + "learning_rate": 0.00038328830842176634, + "loss": 0.2017, + "step": 232570 + }, + { + "epoch": 9.63, + "grad_norm": 1.9765625, + "learning_rate": 0.00038327913305880506, + "loss": 0.1717, + "step": 232580 + }, + { + "epoch": 9.63, + "grad_norm": 0.96484375, + "learning_rate": 0.00038326995744502695, + "loss": 0.158, + "step": 232590 + }, + { + "epoch": 9.63, + "grad_norm": 0.9140625, + "learning_rate": 0.0003832607815804491, + "loss": 0.1481, + "step": 232600 + }, + { + "epoch": 9.63, + "grad_norm": 1.015625, + "learning_rate": 0.00038325160546508885, + "loss": 0.2109, + "step": 232610 + }, + { + "epoch": 9.64, + "grad_norm": 0.9453125, + "learning_rate": 0.00038324242909896356, + "loss": 0.214, + "step": 232620 + }, + { + "epoch": 9.64, + "grad_norm": 0.88671875, + "learning_rate": 0.00038323325248209027, + "loss": 0.2136, + "step": 232630 + }, + { + "epoch": 9.64, + "grad_norm": 0.9296875, + "learning_rate": 0.0003832240756144865, + "loss": 0.2075, + "step": 232640 + }, + { + "epoch": 9.64, + "grad_norm": 0.89453125, + "learning_rate": 0.00038321489849616944, + "loss": 0.1877, + "step": 232650 + }, + { + "epoch": 9.64, + "grad_norm": 0.0, + "learning_rate": 0.00038320572112715616, + "loss": 0.2125, + "step": 232660 + }, + { + "epoch": 9.64, + "grad_norm": 0.6953125, + "learning_rate": 0.0003831965435074642, + "loss": 0.1587, + "step": 232670 + }, + { + "epoch": 9.64, + "grad_norm": 0.796875, + "learning_rate": 0.0003831873656371107, + "loss": 0.2101, + "step": 232680 + }, + { + "epoch": 9.64, + "grad_norm": 0.76953125, + "learning_rate": 0.000383178187516113, + "loss": 0.2032, + "step": 232690 + }, + { + "epoch": 9.64, + "grad_norm": 0.0, + "learning_rate": 0.0003831690091444884, + "loss": 0.2448, + "step": 232700 + }, + { + "epoch": 9.64, + "grad_norm": 0.94921875, + "learning_rate": 0.00038315983052225394, + "loss": 0.1763, + "step": 232710 + }, + { + "epoch": 9.64, + "grad_norm": 1.5390625, + "learning_rate": 0.0003831506516494272, + "loss": 0.2013, + "step": 232720 + }, + { + "epoch": 9.64, + "grad_norm": 0.44140625, + "learning_rate": 0.00038314147252602524, + "loss": 0.1921, + "step": 232730 + }, + { + "epoch": 9.64, + "grad_norm": 0.703125, + "learning_rate": 0.0003831322931520654, + "loss": 0.1474, + "step": 232740 + }, + { + "epoch": 9.64, + "grad_norm": 0.84375, + "learning_rate": 0.000383123113527565, + "loss": 0.1461, + "step": 232750 + }, + { + "epoch": 9.64, + "grad_norm": 0.5, + "learning_rate": 0.0003831139336525412, + "loss": 0.2373, + "step": 232760 + }, + { + "epoch": 9.64, + "grad_norm": 0.64453125, + "learning_rate": 0.00038310475352701146, + "loss": 0.2075, + "step": 232770 + }, + { + "epoch": 9.64, + "grad_norm": 1.6171875, + "learning_rate": 0.0003830955731509929, + "loss": 0.2033, + "step": 232780 + }, + { + "epoch": 9.64, + "grad_norm": 0.390625, + "learning_rate": 0.00038308639252450285, + "loss": 0.2021, + "step": 232790 + }, + { + "epoch": 9.64, + "grad_norm": 0.52734375, + "learning_rate": 0.0003830772116475586, + "loss": 0.2174, + "step": 232800 + }, + { + "epoch": 9.64, + "grad_norm": 1.8515625, + "learning_rate": 0.00038306803052017734, + "loss": 0.1998, + "step": 232810 + }, + { + "epoch": 9.64, + "grad_norm": 1.5078125, + "learning_rate": 0.00038305884914237645, + "loss": 0.2241, + "step": 232820 + }, + { + "epoch": 9.64, + "grad_norm": 0.69921875, + "learning_rate": 0.00038304966751417317, + "loss": 0.2071, + "step": 232830 + }, + { + "epoch": 9.64, + "grad_norm": 0.96875, + "learning_rate": 0.0003830404856355848, + "loss": 0.196, + "step": 232840 + }, + { + "epoch": 9.64, + "grad_norm": 1.234375, + "learning_rate": 0.0003830313035066286, + "loss": 0.2187, + "step": 232850 + }, + { + "epoch": 9.65, + "grad_norm": 0.890625, + "learning_rate": 0.00038302212112732183, + "loss": 0.2484, + "step": 232860 + }, + { + "epoch": 9.65, + "grad_norm": 1.2265625, + "learning_rate": 0.0003830129384976818, + "loss": 0.2283, + "step": 232870 + }, + { + "epoch": 9.65, + "grad_norm": 0.455078125, + "learning_rate": 0.0003830037556177258, + "loss": 0.1846, + "step": 232880 + }, + { + "epoch": 9.65, + "grad_norm": 1.0, + "learning_rate": 0.00038299457248747117, + "loss": 0.1775, + "step": 232890 + }, + { + "epoch": 9.65, + "grad_norm": 1.796875, + "learning_rate": 0.000382985389106935, + "loss": 0.2154, + "step": 232900 + }, + { + "epoch": 9.65, + "grad_norm": 1.1875, + "learning_rate": 0.0003829762054761347, + "loss": 0.2298, + "step": 232910 + }, + { + "epoch": 9.65, + "grad_norm": 0.345703125, + "learning_rate": 0.00038296702159508754, + "loss": 0.2038, + "step": 232920 + }, + { + "epoch": 9.65, + "grad_norm": 0.87109375, + "learning_rate": 0.00038295783746381093, + "loss": 0.195, + "step": 232930 + }, + { + "epoch": 9.65, + "grad_norm": 1.1171875, + "learning_rate": 0.0003829486530823219, + "loss": 0.1745, + "step": 232940 + }, + { + "epoch": 9.65, + "grad_norm": 0.4765625, + "learning_rate": 0.0003829394684506379, + "loss": 0.2093, + "step": 232950 + }, + { + "epoch": 9.65, + "grad_norm": 0.64453125, + "learning_rate": 0.0003829302835687761, + "loss": 0.2012, + "step": 232960 + }, + { + "epoch": 9.65, + "grad_norm": 0.263671875, + "learning_rate": 0.00038292109843675407, + "loss": 0.1734, + "step": 232970 + }, + { + "epoch": 9.65, + "grad_norm": 0.8203125, + "learning_rate": 0.00038291191305458874, + "loss": 0.1951, + "step": 232980 + }, + { + "epoch": 9.65, + "grad_norm": 0.7109375, + "learning_rate": 0.0003829027274222975, + "loss": 0.2065, + "step": 232990 + }, + { + "epoch": 9.65, + "grad_norm": 0.5859375, + "learning_rate": 0.00038289354153989776, + "loss": 0.2034, + "step": 233000 + }, + { + "epoch": 9.65, + "grad_norm": 0.59375, + "learning_rate": 0.0003828843554074067, + "loss": 0.226, + "step": 233010 + }, + { + "epoch": 9.65, + "grad_norm": 0.640625, + "learning_rate": 0.00038287516902484167, + "loss": 0.174, + "step": 233020 + }, + { + "epoch": 9.65, + "grad_norm": 0.46484375, + "learning_rate": 0.0003828659823922198, + "loss": 0.2004, + "step": 233030 + }, + { + "epoch": 9.65, + "grad_norm": 1.6640625, + "learning_rate": 0.0003828567955095586, + "loss": 0.2017, + "step": 233040 + }, + { + "epoch": 9.65, + "grad_norm": 0.4609375, + "learning_rate": 0.00038284760837687526, + "loss": 0.1815, + "step": 233050 + }, + { + "epoch": 9.65, + "grad_norm": 0.439453125, + "learning_rate": 0.0003828384209941871, + "loss": 0.1861, + "step": 233060 + }, + { + "epoch": 9.65, + "grad_norm": 0.77734375, + "learning_rate": 0.00038282923336151127, + "loss": 0.2248, + "step": 233070 + }, + { + "epoch": 9.65, + "grad_norm": 0.72265625, + "learning_rate": 0.00038282004547886516, + "loss": 0.2234, + "step": 233080 + }, + { + "epoch": 9.65, + "grad_norm": 0.78515625, + "learning_rate": 0.00038281085734626617, + "loss": 0.1547, + "step": 233090 + }, + { + "epoch": 9.65, + "grad_norm": 0.60546875, + "learning_rate": 0.00038280166896373147, + "loss": 0.2226, + "step": 233100 + }, + { + "epoch": 9.66, + "grad_norm": 0.8125, + "learning_rate": 0.00038279248033127835, + "loss": 0.1639, + "step": 233110 + }, + { + "epoch": 9.66, + "grad_norm": 0.87109375, + "learning_rate": 0.0003827832914489241, + "loss": 0.2218, + "step": 233120 + }, + { + "epoch": 9.66, + "grad_norm": 0.546875, + "learning_rate": 0.00038277410231668605, + "loss": 0.2431, + "step": 233130 + }, + { + "epoch": 9.66, + "grad_norm": 1.0859375, + "learning_rate": 0.0003827649129345815, + "loss": 0.2283, + "step": 233140 + }, + { + "epoch": 9.66, + "grad_norm": 0.65625, + "learning_rate": 0.00038275572330262773, + "loss": 0.242, + "step": 233150 + }, + { + "epoch": 9.66, + "grad_norm": 0.60546875, + "learning_rate": 0.000382746533420842, + "loss": 0.1891, + "step": 233160 + }, + { + "epoch": 9.66, + "grad_norm": 1.2578125, + "learning_rate": 0.0003827373432892416, + "loss": 0.1984, + "step": 233170 + }, + { + "epoch": 9.66, + "grad_norm": 2.265625, + "learning_rate": 0.0003827281529078439, + "loss": 0.1763, + "step": 233180 + }, + { + "epoch": 9.66, + "grad_norm": 0.625, + "learning_rate": 0.00038271896227666616, + "loss": 0.1735, + "step": 233190 + }, + { + "epoch": 9.66, + "grad_norm": 0.859375, + "learning_rate": 0.00038270977139572563, + "loss": 0.1785, + "step": 233200 + }, + { + "epoch": 9.66, + "grad_norm": 0.703125, + "learning_rate": 0.00038270058026503966, + "loss": 0.2119, + "step": 233210 + }, + { + "epoch": 9.66, + "grad_norm": 1.4921875, + "learning_rate": 0.0003826913888846255, + "loss": 0.211, + "step": 233220 + }, + { + "epoch": 9.66, + "grad_norm": 0.8125, + "learning_rate": 0.00038268219725450046, + "loss": 0.2123, + "step": 233230 + }, + { + "epoch": 9.66, + "grad_norm": 0.75, + "learning_rate": 0.0003826730053746819, + "loss": 0.2008, + "step": 233240 + }, + { + "epoch": 9.66, + "grad_norm": 0.53515625, + "learning_rate": 0.00038266381324518706, + "loss": 0.1892, + "step": 233250 + }, + { + "epoch": 9.66, + "grad_norm": 0.51953125, + "learning_rate": 0.0003826546208660333, + "loss": 0.2115, + "step": 233260 + }, + { + "epoch": 9.66, + "grad_norm": 0.61328125, + "learning_rate": 0.0003826454282372378, + "loss": 0.2281, + "step": 233270 + }, + { + "epoch": 9.66, + "grad_norm": 0.828125, + "learning_rate": 0.000382636235358818, + "loss": 0.2206, + "step": 233280 + }, + { + "epoch": 9.66, + "grad_norm": 0.7421875, + "learning_rate": 0.00038262704223079104, + "loss": 0.1976, + "step": 233290 + }, + { + "epoch": 9.66, + "grad_norm": 0.423828125, + "learning_rate": 0.0003826178488531743, + "loss": 0.1947, + "step": 233300 + }, + { + "epoch": 9.66, + "grad_norm": 0.7578125, + "learning_rate": 0.0003826086552259852, + "loss": 0.1918, + "step": 233310 + }, + { + "epoch": 9.66, + "grad_norm": 0.7578125, + "learning_rate": 0.00038259946134924083, + "loss": 0.1701, + "step": 233320 + }, + { + "epoch": 9.66, + "grad_norm": 1.1796875, + "learning_rate": 0.00038259026722295864, + "loss": 0.2241, + "step": 233330 + }, + { + "epoch": 9.66, + "grad_norm": 1.109375, + "learning_rate": 0.00038258107284715594, + "loss": 0.2055, + "step": 233340 + }, + { + "epoch": 9.67, + "grad_norm": 0.478515625, + "learning_rate": 0.00038257187822184985, + "loss": 0.1793, + "step": 233350 + }, + { + "epoch": 9.67, + "grad_norm": 0.416015625, + "learning_rate": 0.00038256268334705784, + "loss": 0.2169, + "step": 233360 + }, + { + "epoch": 9.67, + "grad_norm": 0.8203125, + "learning_rate": 0.00038255348822279725, + "loss": 0.1706, + "step": 233370 + }, + { + "epoch": 9.67, + "grad_norm": 1.2734375, + "learning_rate": 0.0003825442928490852, + "loss": 0.1631, + "step": 233380 + }, + { + "epoch": 9.67, + "grad_norm": 0.671875, + "learning_rate": 0.0003825350972259392, + "loss": 0.1679, + "step": 233390 + }, + { + "epoch": 9.67, + "grad_norm": 0.47265625, + "learning_rate": 0.0003825259013533764, + "loss": 0.2081, + "step": 233400 + }, + { + "epoch": 9.67, + "grad_norm": 1.0625, + "learning_rate": 0.00038251670523141424, + "loss": 0.2621, + "step": 233410 + }, + { + "epoch": 9.67, + "grad_norm": 0.69140625, + "learning_rate": 0.0003825075088600699, + "loss": 0.1838, + "step": 233420 + }, + { + "epoch": 9.67, + "grad_norm": 0.53515625, + "learning_rate": 0.0003824983122393607, + "loss": 0.1861, + "step": 233430 + }, + { + "epoch": 9.67, + "grad_norm": 0.8125, + "learning_rate": 0.000382489115369304, + "loss": 0.2177, + "step": 233440 + }, + { + "epoch": 9.67, + "grad_norm": 0.41796875, + "learning_rate": 0.0003824799182499171, + "loss": 0.2178, + "step": 233450 + }, + { + "epoch": 9.67, + "grad_norm": 0.443359375, + "learning_rate": 0.0003824707208812174, + "loss": 0.1604, + "step": 233460 + }, + { + "epoch": 9.67, + "grad_norm": 1.546875, + "learning_rate": 0.00038246152326322204, + "loss": 0.2458, + "step": 233470 + }, + { + "epoch": 9.67, + "grad_norm": 0.6640625, + "learning_rate": 0.00038245232539594834, + "loss": 0.2526, + "step": 233480 + }, + { + "epoch": 9.67, + "grad_norm": 0.7890625, + "learning_rate": 0.00038244312727941373, + "loss": 0.2225, + "step": 233490 + }, + { + "epoch": 9.67, + "grad_norm": 0.1953125, + "learning_rate": 0.00038243392891363547, + "loss": 0.1841, + "step": 233500 + }, + { + "epoch": 9.67, + "grad_norm": 0.72265625, + "learning_rate": 0.0003824247302986308, + "loss": 0.1961, + "step": 233510 + }, + { + "epoch": 9.67, + "grad_norm": 1.359375, + "learning_rate": 0.0003824155314344171, + "loss": 0.174, + "step": 233520 + }, + { + "epoch": 9.67, + "grad_norm": 0.4765625, + "learning_rate": 0.00038240633232101165, + "loss": 0.1124, + "step": 233530 + }, + { + "epoch": 9.67, + "grad_norm": 0.6796875, + "learning_rate": 0.00038239713295843184, + "loss": 0.1953, + "step": 233540 + }, + { + "epoch": 9.67, + "grad_norm": 0.94921875, + "learning_rate": 0.0003823879333466949, + "loss": 0.1719, + "step": 233550 + }, + { + "epoch": 9.67, + "grad_norm": 0.84765625, + "learning_rate": 0.0003823787334858182, + "loss": 0.1899, + "step": 233560 + }, + { + "epoch": 9.67, + "grad_norm": 0.546875, + "learning_rate": 0.0003823695333758189, + "loss": 0.1924, + "step": 233570 + }, + { + "epoch": 9.67, + "grad_norm": 0.416015625, + "learning_rate": 0.00038236033301671456, + "loss": 0.1975, + "step": 233580 + }, + { + "epoch": 9.68, + "grad_norm": 0.4765625, + "learning_rate": 0.00038235113240852226, + "loss": 0.1712, + "step": 233590 + }, + { + "epoch": 9.68, + "grad_norm": 1.0, + "learning_rate": 0.00038234193155125954, + "loss": 0.2369, + "step": 233600 + }, + { + "epoch": 9.68, + "grad_norm": 1.4296875, + "learning_rate": 0.00038233273044494356, + "loss": 0.2308, + "step": 233610 + }, + { + "epoch": 9.68, + "grad_norm": 0.265625, + "learning_rate": 0.0003823235290895917, + "loss": 0.2157, + "step": 233620 + }, + { + "epoch": 9.68, + "grad_norm": 0.6875, + "learning_rate": 0.00038231432748522115, + "loss": 0.1355, + "step": 233630 + }, + { + "epoch": 9.68, + "grad_norm": 1.359375, + "learning_rate": 0.00038230512563184937, + "loss": 0.1829, + "step": 233640 + }, + { + "epoch": 9.68, + "grad_norm": 0.73046875, + "learning_rate": 0.0003822959235294936, + "loss": 0.2015, + "step": 233650 + }, + { + "epoch": 9.68, + "grad_norm": 0.72265625, + "learning_rate": 0.0003822867211781713, + "loss": 0.1728, + "step": 233660 + }, + { + "epoch": 9.68, + "grad_norm": 1.0078125, + "learning_rate": 0.0003822775185778996, + "loss": 0.1922, + "step": 233670 + }, + { + "epoch": 9.68, + "grad_norm": 0.7109375, + "learning_rate": 0.0003822683157286959, + "loss": 0.1529, + "step": 233680 + }, + { + "epoch": 9.68, + "grad_norm": 0.63671875, + "learning_rate": 0.0003822591126305774, + "loss": 0.1895, + "step": 233690 + }, + { + "epoch": 9.68, + "grad_norm": 0.984375, + "learning_rate": 0.0003822499092835617, + "loss": 0.1733, + "step": 233700 + }, + { + "epoch": 9.68, + "grad_norm": 1.4140625, + "learning_rate": 0.00038224070568766596, + "loss": 0.1736, + "step": 233710 + }, + { + "epoch": 9.68, + "grad_norm": 0.953125, + "learning_rate": 0.00038223150184290737, + "loss": 0.2141, + "step": 233720 + }, + { + "epoch": 9.68, + "grad_norm": 0.74609375, + "learning_rate": 0.00038222229774930344, + "loss": 0.1567, + "step": 233730 + }, + { + "epoch": 9.68, + "grad_norm": 0.283203125, + "learning_rate": 0.0003822130934068715, + "loss": 0.2165, + "step": 233740 + }, + { + "epoch": 9.68, + "grad_norm": 1.2890625, + "learning_rate": 0.00038220388881562864, + "loss": 0.2182, + "step": 233750 + }, + { + "epoch": 9.68, + "grad_norm": 0.8203125, + "learning_rate": 0.00038219468397559245, + "loss": 0.2053, + "step": 233760 + }, + { + "epoch": 9.68, + "grad_norm": 0.703125, + "learning_rate": 0.00038218547888678, + "loss": 0.2134, + "step": 233770 + }, + { + "epoch": 9.68, + "grad_norm": 0.8515625, + "learning_rate": 0.0003821762735492089, + "loss": 0.2011, + "step": 233780 + }, + { + "epoch": 9.68, + "grad_norm": 1.4765625, + "learning_rate": 0.00038216706796289637, + "loss": 0.1913, + "step": 233790 + }, + { + "epoch": 9.68, + "grad_norm": 0.9375, + "learning_rate": 0.00038215786212785955, + "loss": 0.2462, + "step": 233800 + }, + { + "epoch": 9.68, + "grad_norm": 0.58203125, + "learning_rate": 0.0003821486560441159, + "loss": 0.1953, + "step": 233810 + }, + { + "epoch": 9.68, + "grad_norm": 1.03125, + "learning_rate": 0.0003821394497116829, + "loss": 0.211, + "step": 233820 + }, + { + "epoch": 9.69, + "grad_norm": 0.625, + "learning_rate": 0.0003821302431305775, + "loss": 0.1964, + "step": 233830 + }, + { + "epoch": 9.69, + "grad_norm": 0.9140625, + "learning_rate": 0.00038212103630081746, + "loss": 0.2074, + "step": 233840 + }, + { + "epoch": 9.69, + "grad_norm": 0.09716796875, + "learning_rate": 0.0003821118292224198, + "loss": 0.1659, + "step": 233850 + }, + { + "epoch": 9.69, + "grad_norm": 0.310546875, + "learning_rate": 0.000382102621895402, + "loss": 0.1766, + "step": 233860 + }, + { + "epoch": 9.69, + "grad_norm": 0.23828125, + "learning_rate": 0.0003820934143197813, + "loss": 0.1721, + "step": 233870 + }, + { + "epoch": 9.69, + "grad_norm": 0.478515625, + "learning_rate": 0.00038208420649557496, + "loss": 0.2388, + "step": 233880 + }, + { + "epoch": 9.69, + "grad_norm": 0.87109375, + "learning_rate": 0.0003820749984228006, + "loss": 0.1701, + "step": 233890 + }, + { + "epoch": 9.69, + "grad_norm": 0.5234375, + "learning_rate": 0.00038206579010147525, + "loss": 0.2055, + "step": 233900 + }, + { + "epoch": 9.69, + "grad_norm": 1.15625, + "learning_rate": 0.0003820565815316163, + "loss": 0.1939, + "step": 233910 + }, + { + "epoch": 9.69, + "grad_norm": 0.271484375, + "learning_rate": 0.00038204737271324123, + "loss": 0.2628, + "step": 233920 + }, + { + "epoch": 9.69, + "grad_norm": 0.578125, + "learning_rate": 0.00038203816364636725, + "loss": 0.2173, + "step": 233930 + }, + { + "epoch": 9.69, + "grad_norm": 0.01220703125, + "learning_rate": 0.0003820289543310116, + "loss": 0.2436, + "step": 233940 + }, + { + "epoch": 9.69, + "grad_norm": 1.0546875, + "learning_rate": 0.0003820197447671918, + "loss": 0.1562, + "step": 233950 + }, + { + "epoch": 9.69, + "grad_norm": 0.8046875, + "learning_rate": 0.00038201053495492504, + "loss": 0.1991, + "step": 233960 + }, + { + "epoch": 9.69, + "grad_norm": 0.484375, + "learning_rate": 0.0003820013248942288, + "loss": 0.1788, + "step": 233970 + }, + { + "epoch": 9.69, + "grad_norm": 0.48828125, + "learning_rate": 0.0003819921145851203, + "loss": 0.1496, + "step": 233980 + }, + { + "epoch": 9.69, + "grad_norm": 0.7890625, + "learning_rate": 0.0003819829040276168, + "loss": 0.1593, + "step": 233990 + }, + { + "epoch": 9.69, + "grad_norm": 1.2265625, + "learning_rate": 0.0003819736932217358, + "loss": 0.1646, + "step": 234000 + }, + { + "epoch": 9.69, + "grad_norm": 0.97265625, + "learning_rate": 0.00038196448216749456, + "loss": 0.1752, + "step": 234010 + }, + { + "epoch": 9.69, + "grad_norm": 0.94921875, + "learning_rate": 0.00038195527086491043, + "loss": 0.2437, + "step": 234020 + }, + { + "epoch": 9.69, + "grad_norm": 0.76171875, + "learning_rate": 0.00038194605931400074, + "loss": 0.1755, + "step": 234030 + }, + { + "epoch": 9.69, + "grad_norm": 0.76953125, + "learning_rate": 0.00038193684751478273, + "loss": 0.1719, + "step": 234040 + }, + { + "epoch": 9.69, + "grad_norm": 0.482421875, + "learning_rate": 0.0003819276354672739, + "loss": 0.2112, + "step": 234050 + }, + { + "epoch": 9.69, + "grad_norm": 0.9296875, + "learning_rate": 0.0003819184231714915, + "loss": 0.1951, + "step": 234060 + }, + { + "epoch": 9.7, + "grad_norm": 0.328125, + "learning_rate": 0.00038190921062745286, + "loss": 0.2609, + "step": 234070 + }, + { + "epoch": 9.7, + "grad_norm": 1.734375, + "learning_rate": 0.0003818999978351754, + "loss": 0.1917, + "step": 234080 + }, + { + "epoch": 9.7, + "grad_norm": 0.7109375, + "learning_rate": 0.0003818907847946762, + "loss": 0.2182, + "step": 234090 + }, + { + "epoch": 9.7, + "grad_norm": 1.328125, + "learning_rate": 0.0003818815715059729, + "loss": 0.1966, + "step": 234100 + }, + { + "epoch": 9.7, + "grad_norm": 1.1484375, + "learning_rate": 0.0003818723579690828, + "loss": 0.1565, + "step": 234110 + }, + { + "epoch": 9.7, + "grad_norm": 0.0, + "learning_rate": 0.00038186314418402304, + "loss": 0.2163, + "step": 234120 + }, + { + "epoch": 9.7, + "grad_norm": 0.734375, + "learning_rate": 0.00038185393015081117, + "loss": 0.1873, + "step": 234130 + }, + { + "epoch": 9.7, + "grad_norm": 0.6015625, + "learning_rate": 0.00038184471586946435, + "loss": 0.1369, + "step": 234140 + }, + { + "epoch": 9.7, + "grad_norm": 0.5859375, + "learning_rate": 0.00038183550134000013, + "loss": 0.1804, + "step": 234150 + }, + { + "epoch": 9.7, + "grad_norm": 0.9140625, + "learning_rate": 0.00038182628656243566, + "loss": 0.1728, + "step": 234160 + }, + { + "epoch": 9.7, + "grad_norm": 1.515625, + "learning_rate": 0.00038181707153678833, + "loss": 0.1832, + "step": 234170 + }, + { + "epoch": 9.7, + "grad_norm": 0.95703125, + "learning_rate": 0.0003818078562630756, + "loss": 0.1722, + "step": 234180 + }, + { + "epoch": 9.7, + "grad_norm": 0.71484375, + "learning_rate": 0.0003817986407413146, + "loss": 0.2181, + "step": 234190 + }, + { + "epoch": 9.7, + "grad_norm": 0.52734375, + "learning_rate": 0.0003817894249715228, + "loss": 0.1812, + "step": 234200 + }, + { + "epoch": 9.7, + "grad_norm": 0.6796875, + "learning_rate": 0.0003817802089537177, + "loss": 0.1943, + "step": 234210 + }, + { + "epoch": 9.7, + "grad_norm": 0.9609375, + "learning_rate": 0.0003817709926879163, + "loss": 0.1955, + "step": 234220 + }, + { + "epoch": 9.7, + "grad_norm": 0.255859375, + "learning_rate": 0.0003817617761741362, + "loss": 0.2231, + "step": 234230 + }, + { + "epoch": 9.7, + "grad_norm": 0.353515625, + "learning_rate": 0.0003817525594123947, + "loss": 0.196, + "step": 234240 + }, + { + "epoch": 9.7, + "grad_norm": 0.4609375, + "learning_rate": 0.00038174334240270903, + "loss": 0.1893, + "step": 234250 + }, + { + "epoch": 9.7, + "grad_norm": 0.66015625, + "learning_rate": 0.00038173412514509665, + "loss": 0.1922, + "step": 234260 + }, + { + "epoch": 9.7, + "grad_norm": 3.015625, + "learning_rate": 0.00038172490763957483, + "loss": 0.2179, + "step": 234270 + }, + { + "epoch": 9.7, + "grad_norm": 0.63671875, + "learning_rate": 0.00038171568988616103, + "loss": 0.2057, + "step": 234280 + }, + { + "epoch": 9.7, + "grad_norm": 0.66015625, + "learning_rate": 0.00038170647188487253, + "loss": 0.2127, + "step": 234290 + }, + { + "epoch": 9.7, + "grad_norm": 0.94140625, + "learning_rate": 0.0003816972536357266, + "loss": 0.2602, + "step": 234300 + }, + { + "epoch": 9.71, + "grad_norm": 2.125, + "learning_rate": 0.00038168803513874075, + "loss": 0.1862, + "step": 234310 + }, + { + "epoch": 9.71, + "grad_norm": 0.484375, + "learning_rate": 0.0003816788163939322, + "loss": 0.2242, + "step": 234320 + }, + { + "epoch": 9.71, + "grad_norm": 1.0703125, + "learning_rate": 0.0003816695974013183, + "loss": 0.2035, + "step": 234330 + }, + { + "epoch": 9.71, + "grad_norm": 0.86328125, + "learning_rate": 0.00038166037816091646, + "loss": 0.1822, + "step": 234340 + }, + { + "epoch": 9.71, + "grad_norm": 1.6953125, + "learning_rate": 0.000381651158672744, + "loss": 0.211, + "step": 234350 + }, + { + "epoch": 9.71, + "grad_norm": 0.408203125, + "learning_rate": 0.00038164193893681827, + "loss": 0.2136, + "step": 234360 + }, + { + "epoch": 9.71, + "grad_norm": 0.546875, + "learning_rate": 0.00038163271895315663, + "loss": 0.1994, + "step": 234370 + }, + { + "epoch": 9.71, + "grad_norm": 0.455078125, + "learning_rate": 0.0003816234987217765, + "loss": 0.2097, + "step": 234380 + }, + { + "epoch": 9.71, + "grad_norm": 0.515625, + "learning_rate": 0.00038161427824269506, + "loss": 0.1652, + "step": 234390 + }, + { + "epoch": 9.71, + "grad_norm": 0.4609375, + "learning_rate": 0.00038160505751592977, + "loss": 0.2158, + "step": 234400 + }, + { + "epoch": 9.71, + "grad_norm": 0.2734375, + "learning_rate": 0.0003815958365414981, + "loss": 0.2033, + "step": 234410 + }, + { + "epoch": 9.71, + "grad_norm": 0.95703125, + "learning_rate": 0.0003815866153194171, + "loss": 0.1849, + "step": 234420 + }, + { + "epoch": 9.71, + "grad_norm": 0.56640625, + "learning_rate": 0.0003815773938497044, + "loss": 0.2012, + "step": 234430 + }, + { + "epoch": 9.71, + "grad_norm": 0.71484375, + "learning_rate": 0.00038156817213237724, + "loss": 0.1775, + "step": 234440 + }, + { + "epoch": 9.71, + "grad_norm": 1.546875, + "learning_rate": 0.000381558950167453, + "loss": 0.204, + "step": 234450 + }, + { + "epoch": 9.71, + "grad_norm": 1.8125, + "learning_rate": 0.00038154972795494905, + "loss": 0.1895, + "step": 234460 + }, + { + "epoch": 9.71, + "grad_norm": 0.0283203125, + "learning_rate": 0.00038154050549488264, + "loss": 0.1525, + "step": 234470 + }, + { + "epoch": 9.71, + "grad_norm": 0.76171875, + "learning_rate": 0.00038153128278727125, + "loss": 0.1837, + "step": 234480 + }, + { + "epoch": 9.71, + "grad_norm": 1.015625, + "learning_rate": 0.00038152205983213217, + "loss": 0.171, + "step": 234490 + }, + { + "epoch": 9.71, + "grad_norm": 1.15625, + "learning_rate": 0.0003815128366294828, + "loss": 0.2157, + "step": 234500 + }, + { + "epoch": 9.71, + "grad_norm": 0.94140625, + "learning_rate": 0.00038150361317934047, + "loss": 0.2328, + "step": 234510 + }, + { + "epoch": 9.71, + "grad_norm": 1.125, + "learning_rate": 0.0003814943894817225, + "loss": 0.2552, + "step": 234520 + }, + { + "epoch": 9.71, + "grad_norm": 0.4921875, + "learning_rate": 0.0003814851655366464, + "loss": 0.2405, + "step": 234530 + }, + { + "epoch": 9.71, + "grad_norm": 0.69921875, + "learning_rate": 0.00038147594134412926, + "loss": 0.2146, + "step": 234540 + }, + { + "epoch": 9.72, + "grad_norm": 1.546875, + "learning_rate": 0.00038146671690418864, + "loss": 0.1704, + "step": 234550 + }, + { + "epoch": 9.72, + "grad_norm": 1.8515625, + "learning_rate": 0.0003814574922168419, + "loss": 0.1982, + "step": 234560 + }, + { + "epoch": 9.72, + "grad_norm": 0.8125, + "learning_rate": 0.00038144826728210633, + "loss": 0.1947, + "step": 234570 + }, + { + "epoch": 9.72, + "grad_norm": 1.125, + "learning_rate": 0.00038143904209999934, + "loss": 0.2532, + "step": 234580 + }, + { + "epoch": 9.72, + "grad_norm": 0.7265625, + "learning_rate": 0.00038142981667053835, + "loss": 0.2523, + "step": 234590 + }, + { + "epoch": 9.72, + "grad_norm": 0.8984375, + "learning_rate": 0.0003814205909937405, + "loss": 0.1713, + "step": 234600 + }, + { + "epoch": 9.72, + "grad_norm": 0.98046875, + "learning_rate": 0.0003814113650696233, + "loss": 0.1703, + "step": 234610 + }, + { + "epoch": 9.72, + "grad_norm": 0.640625, + "learning_rate": 0.00038140213889820417, + "loss": 0.1856, + "step": 234620 + }, + { + "epoch": 9.72, + "grad_norm": 0.40625, + "learning_rate": 0.00038139291247950036, + "loss": 0.2106, + "step": 234630 + }, + { + "epoch": 9.72, + "grad_norm": 0.66015625, + "learning_rate": 0.00038138368581352934, + "loss": 0.1808, + "step": 234640 + }, + { + "epoch": 9.72, + "grad_norm": 0.5546875, + "learning_rate": 0.00038137445890030835, + "loss": 0.2039, + "step": 234650 + }, + { + "epoch": 9.72, + "grad_norm": 0.7109375, + "learning_rate": 0.00038136523173985484, + "loss": 0.1486, + "step": 234660 + }, + { + "epoch": 9.72, + "grad_norm": 0.578125, + "learning_rate": 0.0003813560043321861, + "loss": 0.1933, + "step": 234670 + }, + { + "epoch": 9.72, + "grad_norm": 0.41796875, + "learning_rate": 0.0003813467766773196, + "loss": 0.1796, + "step": 234680 + }, + { + "epoch": 9.72, + "grad_norm": 0.875, + "learning_rate": 0.0003813375487752726, + "loss": 0.2328, + "step": 234690 + }, + { + "epoch": 9.72, + "grad_norm": 1.4453125, + "learning_rate": 0.0003813283206260626, + "loss": 0.1691, + "step": 234700 + }, + { + "epoch": 9.72, + "grad_norm": 0.578125, + "learning_rate": 0.00038131909222970686, + "loss": 0.1869, + "step": 234710 + }, + { + "epoch": 9.72, + "grad_norm": 0.640625, + "learning_rate": 0.00038130986358622264, + "loss": 0.2123, + "step": 234720 + }, + { + "epoch": 9.72, + "grad_norm": 0.32421875, + "learning_rate": 0.00038130063469562765, + "loss": 0.187, + "step": 234730 + }, + { + "epoch": 9.72, + "grad_norm": 2.3125, + "learning_rate": 0.00038129140555793887, + "loss": 0.1914, + "step": 234740 + }, + { + "epoch": 9.72, + "grad_norm": 0.69140625, + "learning_rate": 0.00038128217617317396, + "loss": 0.2049, + "step": 234750 + }, + { + "epoch": 9.72, + "grad_norm": 0.78515625, + "learning_rate": 0.00038127294654135007, + "loss": 0.2138, + "step": 234760 + }, + { + "epoch": 9.72, + "grad_norm": 1.109375, + "learning_rate": 0.00038126371666248474, + "loss": 0.2351, + "step": 234770 + }, + { + "epoch": 9.72, + "grad_norm": 0.3671875, + "learning_rate": 0.0003812544865365953, + "loss": 0.2181, + "step": 234780 + }, + { + "epoch": 9.72, + "grad_norm": 0.5078125, + "learning_rate": 0.000381245256163699, + "loss": 0.1775, + "step": 234790 + }, + { + "epoch": 9.73, + "grad_norm": 0.326171875, + "learning_rate": 0.0003812360255438133, + "loss": 0.2067, + "step": 234800 + }, + { + "epoch": 9.73, + "grad_norm": 0.484375, + "learning_rate": 0.0003812267946769557, + "loss": 0.2067, + "step": 234810 + }, + { + "epoch": 9.73, + "grad_norm": 1.2890625, + "learning_rate": 0.0003812175635631433, + "loss": 0.2212, + "step": 234820 + }, + { + "epoch": 9.73, + "grad_norm": 0.42578125, + "learning_rate": 0.0003812083322023937, + "loss": 0.1988, + "step": 234830 + }, + { + "epoch": 9.73, + "grad_norm": 0.004638671875, + "learning_rate": 0.00038119910059472415, + "loss": 0.139, + "step": 234840 + }, + { + "epoch": 9.73, + "grad_norm": 1.34375, + "learning_rate": 0.00038118986874015207, + "loss": 0.2136, + "step": 234850 + }, + { + "epoch": 9.73, + "grad_norm": 2.328125, + "learning_rate": 0.00038118063663869484, + "loss": 0.1823, + "step": 234860 + }, + { + "epoch": 9.73, + "grad_norm": 2.015625, + "learning_rate": 0.00038117140429036976, + "loss": 0.2296, + "step": 234870 + }, + { + "epoch": 9.73, + "grad_norm": 0.90234375, + "learning_rate": 0.00038116217169519435, + "loss": 0.1945, + "step": 234880 + }, + { + "epoch": 9.73, + "grad_norm": 0.69921875, + "learning_rate": 0.0003811529388531859, + "loss": 0.1959, + "step": 234890 + }, + { + "epoch": 9.73, + "grad_norm": 0.328125, + "learning_rate": 0.0003811437057643617, + "loss": 0.2214, + "step": 234900 + }, + { + "epoch": 9.73, + "grad_norm": 0.7421875, + "learning_rate": 0.0003811344724287393, + "loss": 0.1687, + "step": 234910 + }, + { + "epoch": 9.73, + "grad_norm": 0.337890625, + "learning_rate": 0.0003811252388463359, + "loss": 0.1858, + "step": 234920 + }, + { + "epoch": 9.73, + "grad_norm": 0.4921875, + "learning_rate": 0.000381116005017169, + "loss": 0.2075, + "step": 234930 + }, + { + "epoch": 9.73, + "grad_norm": 1.09375, + "learning_rate": 0.0003811067709412559, + "loss": 0.1537, + "step": 234940 + }, + { + "epoch": 9.73, + "grad_norm": 1.421875, + "learning_rate": 0.000381097536618614, + "loss": 0.2052, + "step": 234950 + }, + { + "epoch": 9.73, + "grad_norm": 1.3046875, + "learning_rate": 0.00038108830204926084, + "loss": 0.1733, + "step": 234960 + }, + { + "epoch": 9.73, + "grad_norm": 1.21875, + "learning_rate": 0.0003810790672332135, + "loss": 0.1645, + "step": 234970 + }, + { + "epoch": 9.73, + "grad_norm": 0.306640625, + "learning_rate": 0.0003810698321704895, + "loss": 0.1624, + "step": 234980 + }, + { + "epoch": 9.73, + "grad_norm": 1.3515625, + "learning_rate": 0.0003810605968611064, + "loss": 0.1658, + "step": 234990 + }, + { + "epoch": 9.73, + "grad_norm": 1.4140625, + "learning_rate": 0.0003810513613050812, + "loss": 0.1922, + "step": 235000 + }, + { + "epoch": 9.73, + "grad_norm": 1.21875, + "learning_rate": 0.0003810421255024317, + "loss": 0.2554, + "step": 235010 + }, + { + "epoch": 9.73, + "grad_norm": 0.91796875, + "learning_rate": 0.0003810328894531749, + "loss": 0.1863, + "step": 235020 + }, + { + "epoch": 9.73, + "grad_norm": 0.60546875, + "learning_rate": 0.0003810236531573284, + "loss": 0.2077, + "step": 235030 + }, + { + "epoch": 9.74, + "grad_norm": 0.8359375, + "learning_rate": 0.00038101441661490956, + "loss": 0.1563, + "step": 235040 + }, + { + "epoch": 9.74, + "grad_norm": 0.427734375, + "learning_rate": 0.00038100517982593573, + "loss": 0.1917, + "step": 235050 + }, + { + "epoch": 9.74, + "grad_norm": 0.609375, + "learning_rate": 0.00038099594279042426, + "loss": 0.1734, + "step": 235060 + }, + { + "epoch": 9.74, + "grad_norm": 0.78515625, + "learning_rate": 0.00038098670550839263, + "loss": 0.1583, + "step": 235070 + }, + { + "epoch": 9.74, + "grad_norm": 1.0078125, + "learning_rate": 0.00038097746797985816, + "loss": 0.1339, + "step": 235080 + }, + { + "epoch": 9.74, + "grad_norm": 0.458984375, + "learning_rate": 0.00038096823020483824, + "loss": 0.1929, + "step": 235090 + }, + { + "epoch": 9.74, + "grad_norm": 1.71875, + "learning_rate": 0.00038095899218335027, + "loss": 0.1886, + "step": 235100 + }, + { + "epoch": 9.74, + "grad_norm": 1.0859375, + "learning_rate": 0.00038094975391541155, + "loss": 0.1662, + "step": 235110 + }, + { + "epoch": 9.74, + "grad_norm": 0.421875, + "learning_rate": 0.0003809405154010396, + "loss": 0.209, + "step": 235120 + }, + { + "epoch": 9.74, + "grad_norm": 1.4765625, + "learning_rate": 0.0003809312766402516, + "loss": 0.2073, + "step": 235130 + }, + { + "epoch": 9.74, + "grad_norm": 0.98046875, + "learning_rate": 0.0003809220376330652, + "loss": 0.2279, + "step": 235140 + }, + { + "epoch": 9.74, + "grad_norm": 0.51171875, + "learning_rate": 0.00038091279837949773, + "loss": 0.1869, + "step": 235150 + }, + { + "epoch": 9.74, + "grad_norm": 0.63671875, + "learning_rate": 0.0003809035588795664, + "loss": 0.1525, + "step": 235160 + }, + { + "epoch": 9.74, + "grad_norm": 0.7421875, + "learning_rate": 0.00038089431913328875, + "loss": 0.195, + "step": 235170 + }, + { + "epoch": 9.74, + "grad_norm": 1.1953125, + "learning_rate": 0.00038088507914068215, + "loss": 0.1842, + "step": 235180 + }, + { + "epoch": 9.74, + "grad_norm": 1.359375, + "learning_rate": 0.00038087583890176384, + "loss": 0.2085, + "step": 235190 + }, + { + "epoch": 9.74, + "grad_norm": 1.2578125, + "learning_rate": 0.00038086659841655144, + "loss": 0.1958, + "step": 235200 + }, + { + "epoch": 9.74, + "grad_norm": 0.84765625, + "learning_rate": 0.00038085735768506225, + "loss": 0.2446, + "step": 235210 + }, + { + "epoch": 9.74, + "grad_norm": 1.4765625, + "learning_rate": 0.0003808481167073136, + "loss": 0.1618, + "step": 235220 + }, + { + "epoch": 9.74, + "grad_norm": 1.5625, + "learning_rate": 0.000380838875483323, + "loss": 0.2073, + "step": 235230 + }, + { + "epoch": 9.74, + "grad_norm": 0.91015625, + "learning_rate": 0.00038082963401310765, + "loss": 0.1988, + "step": 235240 + }, + { + "epoch": 9.74, + "grad_norm": 0.51953125, + "learning_rate": 0.00038082039229668513, + "loss": 0.1933, + "step": 235250 + }, + { + "epoch": 9.74, + "grad_norm": 0.625, + "learning_rate": 0.00038081115033407277, + "loss": 0.2081, + "step": 235260 + }, + { + "epoch": 9.74, + "grad_norm": 0.5703125, + "learning_rate": 0.00038080190812528786, + "loss": 0.1751, + "step": 235270 + }, + { + "epoch": 9.75, + "grad_norm": 1.7734375, + "learning_rate": 0.00038079266567034797, + "loss": 0.2231, + "step": 235280 + }, + { + "epoch": 9.75, + "grad_norm": 0.61328125, + "learning_rate": 0.0003807834229692704, + "loss": 0.2798, + "step": 235290 + }, + { + "epoch": 9.75, + "grad_norm": 0.78125, + "learning_rate": 0.0003807741800220725, + "loss": 0.1687, + "step": 235300 + }, + { + "epoch": 9.75, + "grad_norm": 0.66015625, + "learning_rate": 0.0003807649368287718, + "loss": 0.1541, + "step": 235310 + }, + { + "epoch": 9.75, + "grad_norm": 0.74609375, + "learning_rate": 0.00038075569338938555, + "loss": 0.2463, + "step": 235320 + }, + { + "epoch": 9.75, + "grad_norm": 0.58984375, + "learning_rate": 0.0003807464497039312, + "loss": 0.2401, + "step": 235330 + }, + { + "epoch": 9.75, + "grad_norm": 0.81640625, + "learning_rate": 0.0003807372057724262, + "loss": 0.2261, + "step": 235340 + }, + { + "epoch": 9.75, + "grad_norm": 0.66796875, + "learning_rate": 0.00038072796159488784, + "loss": 0.164, + "step": 235350 + }, + { + "epoch": 9.75, + "grad_norm": 0.52734375, + "learning_rate": 0.00038071871717133365, + "loss": 0.2237, + "step": 235360 + }, + { + "epoch": 9.75, + "grad_norm": 0.8671875, + "learning_rate": 0.0003807094725017809, + "loss": 0.2242, + "step": 235370 + }, + { + "epoch": 9.75, + "grad_norm": 0.8359375, + "learning_rate": 0.00038070022758624703, + "loss": 0.2706, + "step": 235380 + }, + { + "epoch": 9.75, + "grad_norm": 0.6328125, + "learning_rate": 0.0003806909824247494, + "loss": 0.1953, + "step": 235390 + }, + { + "epoch": 9.75, + "grad_norm": 0.390625, + "learning_rate": 0.0003806817370173056, + "loss": 0.2086, + "step": 235400 + }, + { + "epoch": 9.75, + "grad_norm": 0.83984375, + "learning_rate": 0.0003806724913639327, + "loss": 0.2122, + "step": 235410 + }, + { + "epoch": 9.75, + "grad_norm": 0.58203125, + "learning_rate": 0.00038066324546464844, + "loss": 0.2127, + "step": 235420 + }, + { + "epoch": 9.75, + "grad_norm": 1.3828125, + "learning_rate": 0.00038065399931946995, + "loss": 0.1937, + "step": 235430 + }, + { + "epoch": 9.75, + "grad_norm": 0.53125, + "learning_rate": 0.00038064475292841484, + "loss": 0.1922, + "step": 235440 + }, + { + "epoch": 9.75, + "grad_norm": 0.99609375, + "learning_rate": 0.00038063550629150036, + "loss": 0.2171, + "step": 235450 + }, + { + "epoch": 9.75, + "grad_norm": 1.046875, + "learning_rate": 0.0003806262594087439, + "loss": 0.2128, + "step": 235460 + }, + { + "epoch": 9.75, + "grad_norm": 0.58984375, + "learning_rate": 0.00038061701228016303, + "loss": 0.2353, + "step": 235470 + }, + { + "epoch": 9.75, + "grad_norm": 0.71875, + "learning_rate": 0.000380607764905775, + "loss": 0.1953, + "step": 235480 + }, + { + "epoch": 9.75, + "grad_norm": 0.8359375, + "learning_rate": 0.00038059851728559723, + "loss": 0.213, + "step": 235490 + }, + { + "epoch": 9.75, + "grad_norm": 0.53125, + "learning_rate": 0.0003805892694196472, + "loss": 0.1959, + "step": 235500 + }, + { + "epoch": 9.75, + "grad_norm": 0.83984375, + "learning_rate": 0.0003805800213079423, + "loss": 0.2464, + "step": 235510 + }, + { + "epoch": 9.76, + "grad_norm": 0.71875, + "learning_rate": 0.00038057077295049993, + "loss": 0.2156, + "step": 235520 + }, + { + "epoch": 9.76, + "grad_norm": 0.42578125, + "learning_rate": 0.0003805615243473374, + "loss": 0.1877, + "step": 235530 + }, + { + "epoch": 9.76, + "grad_norm": 0.71484375, + "learning_rate": 0.00038055227549847216, + "loss": 0.1904, + "step": 235540 + }, + { + "epoch": 9.76, + "grad_norm": 0.42578125, + "learning_rate": 0.0003805430264039217, + "loss": 0.1854, + "step": 235550 + }, + { + "epoch": 9.76, + "grad_norm": 0.80078125, + "learning_rate": 0.00038053377706370327, + "loss": 0.2214, + "step": 235560 + }, + { + "epoch": 9.76, + "grad_norm": 0.62890625, + "learning_rate": 0.0003805245274778344, + "loss": 0.2401, + "step": 235570 + }, + { + "epoch": 9.76, + "grad_norm": 0.9140625, + "learning_rate": 0.0003805152776463324, + "loss": 0.1757, + "step": 235580 + }, + { + "epoch": 9.76, + "grad_norm": 0.890625, + "learning_rate": 0.0003805060275692149, + "loss": 0.1828, + "step": 235590 + }, + { + "epoch": 9.76, + "grad_norm": 0.765625, + "learning_rate": 0.0003804967772464991, + "loss": 0.1686, + "step": 235600 + }, + { + "epoch": 9.76, + "grad_norm": 0.6796875, + "learning_rate": 0.0003804875266782024, + "loss": 0.1369, + "step": 235610 + }, + { + "epoch": 9.76, + "grad_norm": 1.1015625, + "learning_rate": 0.0003804782758643423, + "loss": 0.1863, + "step": 235620 + }, + { + "epoch": 9.76, + "grad_norm": 0.416015625, + "learning_rate": 0.0003804690248049362, + "loss": 0.1772, + "step": 235630 + }, + { + "epoch": 9.76, + "grad_norm": 0.4921875, + "learning_rate": 0.00038045977350000145, + "loss": 0.1658, + "step": 235640 + }, + { + "epoch": 9.76, + "grad_norm": 0.96875, + "learning_rate": 0.00038045052194955547, + "loss": 0.1998, + "step": 235650 + }, + { + "epoch": 9.76, + "grad_norm": 0.91796875, + "learning_rate": 0.00038044127015361576, + "loss": 0.2063, + "step": 235660 + }, + { + "epoch": 9.76, + "grad_norm": 1.203125, + "learning_rate": 0.0003804320181121996, + "loss": 0.1767, + "step": 235670 + }, + { + "epoch": 9.76, + "grad_norm": 1.421875, + "learning_rate": 0.00038042276582532454, + "loss": 0.1983, + "step": 235680 + }, + { + "epoch": 9.76, + "grad_norm": 0.68359375, + "learning_rate": 0.0003804135132930079, + "loss": 0.1991, + "step": 235690 + }, + { + "epoch": 9.76, + "grad_norm": 0.84375, + "learning_rate": 0.00038040426051526706, + "loss": 0.2206, + "step": 235700 + }, + { + "epoch": 9.76, + "grad_norm": 0.87109375, + "learning_rate": 0.0003803950074921195, + "loss": 0.1735, + "step": 235710 + }, + { + "epoch": 9.76, + "grad_norm": 0.2138671875, + "learning_rate": 0.0003803857542235826, + "loss": 0.187, + "step": 235720 + }, + { + "epoch": 9.76, + "grad_norm": 0.341796875, + "learning_rate": 0.0003803765007096739, + "loss": 0.1719, + "step": 235730 + }, + { + "epoch": 9.76, + "grad_norm": 0.5546875, + "learning_rate": 0.0003803672469504106, + "loss": 0.2127, + "step": 235740 + }, + { + "epoch": 9.76, + "grad_norm": 0.76953125, + "learning_rate": 0.0003803579929458102, + "loss": 0.1976, + "step": 235750 + }, + { + "epoch": 9.77, + "grad_norm": 0.373046875, + "learning_rate": 0.00038034873869589023, + "loss": 0.2109, + "step": 235760 + }, + { + "epoch": 9.77, + "grad_norm": 0.8046875, + "learning_rate": 0.000380339484200668, + "loss": 0.2141, + "step": 235770 + }, + { + "epoch": 9.77, + "grad_norm": 0.75390625, + "learning_rate": 0.00038033022946016085, + "loss": 0.1823, + "step": 235780 + }, + { + "epoch": 9.77, + "grad_norm": 0.73828125, + "learning_rate": 0.0003803209744743863, + "loss": 0.1889, + "step": 235790 + }, + { + "epoch": 9.77, + "grad_norm": 1.953125, + "learning_rate": 0.00038031171924336185, + "loss": 0.2668, + "step": 235800 + }, + { + "epoch": 9.77, + "grad_norm": 0.6796875, + "learning_rate": 0.00038030246376710475, + "loss": 0.1566, + "step": 235810 + }, + { + "epoch": 9.77, + "grad_norm": 0.90625, + "learning_rate": 0.00038029320804563245, + "loss": 0.2969, + "step": 235820 + }, + { + "epoch": 9.77, + "grad_norm": 1.3828125, + "learning_rate": 0.0003802839520789624, + "loss": 0.1608, + "step": 235830 + }, + { + "epoch": 9.77, + "grad_norm": 0.7734375, + "learning_rate": 0.0003802746958671121, + "loss": 0.2009, + "step": 235840 + }, + { + "epoch": 9.77, + "grad_norm": 1.328125, + "learning_rate": 0.0003802654394100988, + "loss": 0.1944, + "step": 235850 + }, + { + "epoch": 9.77, + "grad_norm": 1.6875, + "learning_rate": 0.0003802561827079401, + "loss": 0.1844, + "step": 235860 + }, + { + "epoch": 9.77, + "grad_norm": 0.765625, + "learning_rate": 0.0003802469257606533, + "loss": 0.1779, + "step": 235870 + }, + { + "epoch": 9.77, + "grad_norm": 1.796875, + "learning_rate": 0.00038023766856825584, + "loss": 0.2125, + "step": 235880 + }, + { + "epoch": 9.77, + "grad_norm": 1.7421875, + "learning_rate": 0.00038022841113076515, + "loss": 0.219, + "step": 235890 + }, + { + "epoch": 9.77, + "grad_norm": 0.74609375, + "learning_rate": 0.0003802191534481987, + "loss": 0.2397, + "step": 235900 + }, + { + "epoch": 9.77, + "grad_norm": 0.6328125, + "learning_rate": 0.00038020989552057377, + "loss": 0.2552, + "step": 235910 + }, + { + "epoch": 9.77, + "grad_norm": 0.470703125, + "learning_rate": 0.0003802006373479079, + "loss": 0.2063, + "step": 235920 + }, + { + "epoch": 9.77, + "grad_norm": 0.65625, + "learning_rate": 0.0003801913789302186, + "loss": 0.2219, + "step": 235930 + }, + { + "epoch": 9.77, + "grad_norm": 0.72265625, + "learning_rate": 0.00038018212026752306, + "loss": 0.1789, + "step": 235940 + }, + { + "epoch": 9.77, + "grad_norm": 0.77734375, + "learning_rate": 0.0003801728613598389, + "loss": 0.2193, + "step": 235950 + }, + { + "epoch": 9.77, + "grad_norm": 1.1015625, + "learning_rate": 0.00038016360220718336, + "loss": 0.1992, + "step": 235960 + }, + { + "epoch": 9.77, + "grad_norm": 0.59375, + "learning_rate": 0.0003801543428095742, + "loss": 0.2095, + "step": 235970 + }, + { + "epoch": 9.77, + "grad_norm": 0.515625, + "learning_rate": 0.00038014508316702847, + "loss": 0.1858, + "step": 235980 + }, + { + "epoch": 9.77, + "grad_norm": 0.96875, + "learning_rate": 0.0003801358232795637, + "loss": 0.1797, + "step": 235990 + }, + { + "epoch": 9.78, + "grad_norm": 0.60546875, + "learning_rate": 0.0003801265631471974, + "loss": 0.19, + "step": 236000 + }, + { + "epoch": 9.78, + "grad_norm": 1.0078125, + "learning_rate": 0.00038011730276994705, + "loss": 0.2076, + "step": 236010 + }, + { + "epoch": 9.78, + "grad_norm": 0.59765625, + "learning_rate": 0.0003801080421478299, + "loss": 0.2298, + "step": 236020 + }, + { + "epoch": 9.78, + "grad_norm": 0.59375, + "learning_rate": 0.0003800987812808635, + "loss": 0.1866, + "step": 236030 + }, + { + "epoch": 9.78, + "grad_norm": 0.5546875, + "learning_rate": 0.00038008952016906517, + "loss": 0.1815, + "step": 236040 + }, + { + "epoch": 9.78, + "grad_norm": 0.37109375, + "learning_rate": 0.00038008025881245257, + "loss": 0.1563, + "step": 236050 + }, + { + "epoch": 9.78, + "grad_norm": 0.7265625, + "learning_rate": 0.0003800709972110429, + "loss": 0.1717, + "step": 236060 + }, + { + "epoch": 9.78, + "grad_norm": 0.80859375, + "learning_rate": 0.00038006173536485354, + "loss": 0.1736, + "step": 236070 + }, + { + "epoch": 9.78, + "grad_norm": 0.69140625, + "learning_rate": 0.00038005247327390214, + "loss": 0.2169, + "step": 236080 + }, + { + "epoch": 9.78, + "grad_norm": 0.2470703125, + "learning_rate": 0.000380043210938206, + "loss": 0.1805, + "step": 236090 + }, + { + "epoch": 9.78, + "grad_norm": 0.97265625, + "learning_rate": 0.0003800339483577827, + "loss": 0.2317, + "step": 236100 + }, + { + "epoch": 9.78, + "grad_norm": 0.490234375, + "learning_rate": 0.0003800246855326494, + "loss": 0.1676, + "step": 236110 + }, + { + "epoch": 9.78, + "grad_norm": 1.8515625, + "learning_rate": 0.0003800154224628237, + "loss": 0.1685, + "step": 236120 + }, + { + "epoch": 9.78, + "grad_norm": 1.9140625, + "learning_rate": 0.0003800061591483231, + "loss": 0.215, + "step": 236130 + }, + { + "epoch": 9.78, + "grad_norm": 0.72265625, + "learning_rate": 0.00037999689558916496, + "loss": 0.1948, + "step": 236140 + }, + { + "epoch": 9.78, + "grad_norm": 0.259765625, + "learning_rate": 0.0003799876317853666, + "loss": 0.1768, + "step": 236150 + }, + { + "epoch": 9.78, + "grad_norm": 0.78515625, + "learning_rate": 0.0003799783677369456, + "loss": 0.1625, + "step": 236160 + }, + { + "epoch": 9.78, + "grad_norm": 0.37109375, + "learning_rate": 0.00037996910344391936, + "loss": 0.2015, + "step": 236170 + }, + { + "epoch": 9.78, + "grad_norm": 0.197265625, + "learning_rate": 0.00037995983890630523, + "loss": 0.1851, + "step": 236180 + }, + { + "epoch": 9.78, + "grad_norm": 1.1015625, + "learning_rate": 0.0003799505741241208, + "loss": 0.1921, + "step": 236190 + }, + { + "epoch": 9.78, + "grad_norm": 0.4765625, + "learning_rate": 0.00037994130909738344, + "loss": 0.1832, + "step": 236200 + }, + { + "epoch": 9.78, + "grad_norm": 0.76953125, + "learning_rate": 0.00037993204382611047, + "loss": 0.2732, + "step": 236210 + }, + { + "epoch": 9.78, + "grad_norm": 0.423828125, + "learning_rate": 0.0003799227783103195, + "loss": 0.1822, + "step": 236220 + }, + { + "epoch": 9.78, + "grad_norm": 1.703125, + "learning_rate": 0.0003799135125500279, + "loss": 0.1918, + "step": 236230 + }, + { + "epoch": 9.79, + "grad_norm": 1.265625, + "learning_rate": 0.0003799042465452531, + "loss": 0.1831, + "step": 236240 + }, + { + "epoch": 9.79, + "grad_norm": 1.3125, + "learning_rate": 0.00037989498029601253, + "loss": 0.2179, + "step": 236250 + }, + { + "epoch": 9.79, + "grad_norm": 1.375, + "learning_rate": 0.00037988571380232356, + "loss": 0.1839, + "step": 236260 + }, + { + "epoch": 9.79, + "grad_norm": 0.4140625, + "learning_rate": 0.00037987644706420377, + "loss": 0.1781, + "step": 236270 + }, + { + "epoch": 9.79, + "grad_norm": 0.63671875, + "learning_rate": 0.00037986718008167054, + "loss": 0.1886, + "step": 236280 + }, + { + "epoch": 9.79, + "grad_norm": 0.37890625, + "learning_rate": 0.0003798579128547412, + "loss": 0.2099, + "step": 236290 + }, + { + "epoch": 9.79, + "grad_norm": 0.64453125, + "learning_rate": 0.00037984864538343345, + "loss": 0.2119, + "step": 236300 + }, + { + "epoch": 9.79, + "grad_norm": 0.326171875, + "learning_rate": 0.0003798393776677645, + "loss": 0.1844, + "step": 236310 + }, + { + "epoch": 9.79, + "grad_norm": 1.1875, + "learning_rate": 0.00037983010970775184, + "loss": 0.1845, + "step": 236320 + }, + { + "epoch": 9.79, + "grad_norm": 0.9296875, + "learning_rate": 0.000379820841503413, + "loss": 0.2101, + "step": 236330 + }, + { + "epoch": 9.79, + "grad_norm": 0.41015625, + "learning_rate": 0.00037981157305476523, + "loss": 0.19, + "step": 236340 + }, + { + "epoch": 9.79, + "grad_norm": 1.2109375, + "learning_rate": 0.0003798023043618262, + "loss": 0.1282, + "step": 236350 + }, + { + "epoch": 9.79, + "grad_norm": 0.69921875, + "learning_rate": 0.00037979303542461324, + "loss": 0.1601, + "step": 236360 + }, + { + "epoch": 9.79, + "grad_norm": 0.77734375, + "learning_rate": 0.0003797837662431438, + "loss": 0.2049, + "step": 236370 + }, + { + "epoch": 9.79, + "grad_norm": 0.90625, + "learning_rate": 0.0003797744968174354, + "loss": 0.1692, + "step": 236380 + }, + { + "epoch": 9.79, + "grad_norm": 0.73828125, + "learning_rate": 0.0003797652271475053, + "loss": 0.1858, + "step": 236390 + }, + { + "epoch": 9.79, + "grad_norm": 1.2421875, + "learning_rate": 0.00037975595723337103, + "loss": 0.2294, + "step": 236400 + }, + { + "epoch": 9.79, + "grad_norm": 0.2431640625, + "learning_rate": 0.0003797466870750501, + "loss": 0.1693, + "step": 236410 + }, + { + "epoch": 9.79, + "grad_norm": 0.55859375, + "learning_rate": 0.0003797374166725599, + "loss": 0.2393, + "step": 236420 + }, + { + "epoch": 9.79, + "grad_norm": 0.76953125, + "learning_rate": 0.000379728146025918, + "loss": 0.1467, + "step": 236430 + }, + { + "epoch": 9.79, + "grad_norm": 1.171875, + "learning_rate": 0.0003797188751351416, + "loss": 0.2113, + "step": 236440 + }, + { + "epoch": 9.79, + "grad_norm": 0.87109375, + "learning_rate": 0.00037970960400024836, + "loss": 0.1923, + "step": 236450 + }, + { + "epoch": 9.79, + "grad_norm": 0.51171875, + "learning_rate": 0.0003797003326212557, + "loss": 0.1701, + "step": 236460 + }, + { + "epoch": 9.79, + "grad_norm": 0.95703125, + "learning_rate": 0.0003796910609981809, + "loss": 0.184, + "step": 236470 + }, + { + "epoch": 9.79, + "grad_norm": 0.88671875, + "learning_rate": 0.00037968178913104164, + "loss": 0.194, + "step": 236480 + }, + { + "epoch": 9.8, + "grad_norm": 0.421875, + "learning_rate": 0.0003796725170198552, + "loss": 0.14, + "step": 236490 + }, + { + "epoch": 9.8, + "grad_norm": 1.7265625, + "learning_rate": 0.0003796632446646391, + "loss": 0.1632, + "step": 236500 + }, + { + "epoch": 9.8, + "grad_norm": 1.15625, + "learning_rate": 0.00037965397206541073, + "loss": 0.2148, + "step": 236510 + }, + { + "epoch": 9.8, + "grad_norm": 0.984375, + "learning_rate": 0.0003796446992221877, + "loss": 0.1992, + "step": 236520 + }, + { + "epoch": 9.8, + "grad_norm": 0.265625, + "learning_rate": 0.00037963542613498723, + "loss": 0.2098, + "step": 236530 + }, + { + "epoch": 9.8, + "grad_norm": 0.3828125, + "learning_rate": 0.000379626152803827, + "loss": 0.1813, + "step": 236540 + }, + { + "epoch": 9.8, + "grad_norm": 0.84375, + "learning_rate": 0.00037961687922872416, + "loss": 0.2135, + "step": 236550 + }, + { + "epoch": 9.8, + "grad_norm": 0.51953125, + "learning_rate": 0.0003796076054096965, + "loss": 0.2166, + "step": 236560 + }, + { + "epoch": 9.8, + "grad_norm": 2.8125, + "learning_rate": 0.00037959833134676137, + "loss": 0.1948, + "step": 236570 + }, + { + "epoch": 9.8, + "grad_norm": 0.578125, + "learning_rate": 0.00037958905703993603, + "loss": 0.2168, + "step": 236580 + }, + { + "epoch": 9.8, + "grad_norm": 0.9296875, + "learning_rate": 0.0003795797824892382, + "loss": 0.1679, + "step": 236590 + }, + { + "epoch": 9.8, + "grad_norm": 0.47265625, + "learning_rate": 0.00037957050769468515, + "loss": 0.1907, + "step": 236600 + }, + { + "epoch": 9.8, + "grad_norm": 1.703125, + "learning_rate": 0.0003795612326562944, + "loss": 0.1406, + "step": 236610 + }, + { + "epoch": 9.8, + "grad_norm": 0.6015625, + "learning_rate": 0.00037955195737408344, + "loss": 0.1539, + "step": 236620 + }, + { + "epoch": 9.8, + "grad_norm": 0.625, + "learning_rate": 0.00037954268184806956, + "loss": 0.1915, + "step": 236630 + }, + { + "epoch": 9.8, + "grad_norm": 0.8203125, + "learning_rate": 0.0003795334060782705, + "loss": 0.1686, + "step": 236640 + }, + { + "epoch": 9.8, + "grad_norm": 0.55859375, + "learning_rate": 0.00037952413006470353, + "loss": 0.2012, + "step": 236650 + }, + { + "epoch": 9.8, + "grad_norm": 0.765625, + "learning_rate": 0.000379514853807386, + "loss": 0.1911, + "step": 236660 + }, + { + "epoch": 9.8, + "grad_norm": 0.78515625, + "learning_rate": 0.0003795055773063357, + "loss": 0.2298, + "step": 236670 + }, + { + "epoch": 9.8, + "grad_norm": 0.578125, + "learning_rate": 0.0003794963005615698, + "loss": 0.209, + "step": 236680 + }, + { + "epoch": 9.8, + "grad_norm": 0.333984375, + "learning_rate": 0.0003794870235731058, + "loss": 0.1464, + "step": 236690 + }, + { + "epoch": 9.8, + "grad_norm": 1.2109375, + "learning_rate": 0.0003794777463409612, + "loss": 0.2158, + "step": 236700 + }, + { + "epoch": 9.8, + "grad_norm": 0.83984375, + "learning_rate": 0.0003794684688651535, + "loss": 0.1985, + "step": 236710 + }, + { + "epoch": 9.8, + "grad_norm": 0.6171875, + "learning_rate": 0.00037945919114570014, + "loss": 0.2107, + "step": 236720 + }, + { + "epoch": 9.81, + "grad_norm": 1.0625, + "learning_rate": 0.00037944991318261857, + "loss": 0.209, + "step": 236730 + }, + { + "epoch": 9.81, + "grad_norm": 1.1796875, + "learning_rate": 0.00037944063497592615, + "loss": 0.1944, + "step": 236740 + }, + { + "epoch": 9.81, + "grad_norm": 0.97265625, + "learning_rate": 0.0003794313565256406, + "loss": 0.1596, + "step": 236750 + }, + { + "epoch": 9.81, + "grad_norm": 0.255859375, + "learning_rate": 0.0003794220778317791, + "loss": 0.1841, + "step": 236760 + }, + { + "epoch": 9.81, + "grad_norm": 0.6796875, + "learning_rate": 0.0003794127988943593, + "loss": 0.1909, + "step": 236770 + }, + { + "epoch": 9.81, + "grad_norm": 1.0078125, + "learning_rate": 0.00037940351971339846, + "loss": 0.1762, + "step": 236780 + }, + { + "epoch": 9.81, + "grad_norm": 1.015625, + "learning_rate": 0.0003793942402889142, + "loss": 0.1831, + "step": 236790 + }, + { + "epoch": 9.81, + "grad_norm": 0.28515625, + "learning_rate": 0.0003793849606209241, + "loss": 0.1508, + "step": 236800 + }, + { + "epoch": 9.81, + "grad_norm": 0.98046875, + "learning_rate": 0.0003793756807094454, + "loss": 0.1985, + "step": 236810 + }, + { + "epoch": 9.81, + "grad_norm": 0.3828125, + "learning_rate": 0.00037936640055449556, + "loss": 0.2009, + "step": 236820 + }, + { + "epoch": 9.81, + "grad_norm": 0.8828125, + "learning_rate": 0.0003793571201560922, + "loss": 0.2281, + "step": 236830 + }, + { + "epoch": 9.81, + "grad_norm": 0.6484375, + "learning_rate": 0.00037934783951425264, + "loss": 0.2049, + "step": 236840 + }, + { + "epoch": 9.81, + "grad_norm": 0.8828125, + "learning_rate": 0.00037933855862899447, + "loss": 0.235, + "step": 236850 + }, + { + "epoch": 9.81, + "grad_norm": 1.8125, + "learning_rate": 0.0003793292775003352, + "loss": 0.2091, + "step": 236860 + }, + { + "epoch": 9.81, + "grad_norm": 0.62109375, + "learning_rate": 0.00037931999612829197, + "loss": 0.1956, + "step": 236870 + }, + { + "epoch": 9.81, + "grad_norm": 0.71875, + "learning_rate": 0.00037931071451288267, + "loss": 0.2211, + "step": 236880 + }, + { + "epoch": 9.81, + "grad_norm": 1.1015625, + "learning_rate": 0.0003793014326541244, + "loss": 0.2141, + "step": 236890 + }, + { + "epoch": 9.81, + "grad_norm": 0.5078125, + "learning_rate": 0.0003792921505520349, + "loss": 0.1654, + "step": 236900 + }, + { + "epoch": 9.81, + "grad_norm": 1.03125, + "learning_rate": 0.0003792828682066316, + "loss": 0.2008, + "step": 236910 + }, + { + "epoch": 9.81, + "grad_norm": 0.423828125, + "learning_rate": 0.0003792735856179318, + "loss": 0.1891, + "step": 236920 + }, + { + "epoch": 9.81, + "grad_norm": 0.48828125, + "learning_rate": 0.00037926430278595314, + "loss": 0.1763, + "step": 236930 + }, + { + "epoch": 9.81, + "grad_norm": 0.8125, + "learning_rate": 0.000379255019710713, + "loss": 0.2354, + "step": 236940 + }, + { + "epoch": 9.81, + "grad_norm": 1.3203125, + "learning_rate": 0.00037924573639222883, + "loss": 0.1577, + "step": 236950 + }, + { + "epoch": 9.81, + "grad_norm": 0.65234375, + "learning_rate": 0.00037923645283051817, + "loss": 0.2565, + "step": 236960 + }, + { + "epoch": 9.82, + "grad_norm": 1.5546875, + "learning_rate": 0.0003792271690255984, + "loss": 0.212, + "step": 236970 + }, + { + "epoch": 9.82, + "grad_norm": 1.1015625, + "learning_rate": 0.0003792178849774871, + "loss": 0.1887, + "step": 236980 + }, + { + "epoch": 9.82, + "grad_norm": 0.921875, + "learning_rate": 0.00037920860068620175, + "loss": 0.189, + "step": 236990 + }, + { + "epoch": 9.82, + "grad_norm": 0.65234375, + "learning_rate": 0.0003791993161517597, + "loss": 0.1813, + "step": 237000 + }, + { + "epoch": 9.82, + "grad_norm": 1.40625, + "learning_rate": 0.0003791900313741785, + "loss": 0.1908, + "step": 237010 + }, + { + "epoch": 9.82, + "grad_norm": 0.578125, + "learning_rate": 0.00037918074635347567, + "loss": 0.1947, + "step": 237020 + }, + { + "epoch": 9.82, + "grad_norm": 1.3359375, + "learning_rate": 0.00037917146108966854, + "loss": 0.1901, + "step": 237030 + }, + { + "epoch": 9.82, + "grad_norm": 1.40625, + "learning_rate": 0.0003791621755827748, + "loss": 0.2305, + "step": 237040 + }, + { + "epoch": 9.82, + "grad_norm": 0.73046875, + "learning_rate": 0.0003791528898328116, + "loss": 0.1918, + "step": 237050 + }, + { + "epoch": 9.82, + "grad_norm": 1.4921875, + "learning_rate": 0.0003791436038397967, + "loss": 0.201, + "step": 237060 + }, + { + "epoch": 9.82, + "grad_norm": 2.265625, + "learning_rate": 0.00037913431760374755, + "loss": 0.2027, + "step": 237070 + }, + { + "epoch": 9.82, + "grad_norm": 0.400390625, + "learning_rate": 0.0003791250311246815, + "loss": 0.1624, + "step": 237080 + }, + { + "epoch": 9.82, + "grad_norm": 0.5625, + "learning_rate": 0.0003791157444026161, + "loss": 0.2039, + "step": 237090 + }, + { + "epoch": 9.82, + "grad_norm": 0.81640625, + "learning_rate": 0.00037910645743756877, + "loss": 0.1923, + "step": 237100 + }, + { + "epoch": 9.82, + "grad_norm": 0.443359375, + "learning_rate": 0.000379097170229557, + "loss": 0.1938, + "step": 237110 + }, + { + "epoch": 9.82, + "grad_norm": 0.44921875, + "learning_rate": 0.00037908788277859844, + "loss": 0.1665, + "step": 237120 + }, + { + "epoch": 9.82, + "grad_norm": 0.458984375, + "learning_rate": 0.00037907859508471033, + "loss": 0.1843, + "step": 237130 + }, + { + "epoch": 9.82, + "grad_norm": 1.1328125, + "learning_rate": 0.0003790693071479103, + "loss": 0.1606, + "step": 237140 + }, + { + "epoch": 9.82, + "grad_norm": 1.2265625, + "learning_rate": 0.00037906001896821574, + "loss": 0.2226, + "step": 237150 + }, + { + "epoch": 9.82, + "grad_norm": 0.55078125, + "learning_rate": 0.0003790507305456441, + "loss": 0.2147, + "step": 237160 + }, + { + "epoch": 9.82, + "grad_norm": 0.359375, + "learning_rate": 0.00037904144188021297, + "loss": 0.2116, + "step": 237170 + }, + { + "epoch": 9.82, + "grad_norm": 0.875, + "learning_rate": 0.0003790321529719398, + "loss": 0.2335, + "step": 237180 + }, + { + "epoch": 9.82, + "grad_norm": 0.6015625, + "learning_rate": 0.000379022863820842, + "loss": 0.2133, + "step": 237190 + }, + { + "epoch": 9.82, + "grad_norm": 1.53125, + "learning_rate": 0.00037901357442693715, + "loss": 0.1624, + "step": 237200 + }, + { + "epoch": 9.83, + "grad_norm": 1.0546875, + "learning_rate": 0.0003790042847902427, + "loss": 0.2098, + "step": 237210 + }, + { + "epoch": 9.83, + "grad_norm": 0.50390625, + "learning_rate": 0.00037899499491077606, + "loss": 0.2068, + "step": 237220 + }, + { + "epoch": 9.83, + "grad_norm": 0.5703125, + "learning_rate": 0.0003789857047885548, + "loss": 0.1766, + "step": 237230 + }, + { + "epoch": 9.83, + "grad_norm": 0.875, + "learning_rate": 0.0003789764144235964, + "loss": 0.1785, + "step": 237240 + }, + { + "epoch": 9.83, + "grad_norm": 1.1328125, + "learning_rate": 0.0003789671238159183, + "loss": 0.2479, + "step": 237250 + }, + { + "epoch": 9.83, + "grad_norm": 0.6875, + "learning_rate": 0.000378957832965538, + "loss": 0.2046, + "step": 237260 + }, + { + "epoch": 9.83, + "grad_norm": 0.66015625, + "learning_rate": 0.00037894854187247294, + "loss": 0.1974, + "step": 237270 + }, + { + "epoch": 9.83, + "grad_norm": 0.66015625, + "learning_rate": 0.0003789392505367407, + "loss": 0.202, + "step": 237280 + }, + { + "epoch": 9.83, + "grad_norm": 1.5234375, + "learning_rate": 0.0003789299589583587, + "loss": 0.214, + "step": 237290 + }, + { + "epoch": 9.83, + "grad_norm": 0.95703125, + "learning_rate": 0.00037892066713734443, + "loss": 0.2595, + "step": 237300 + }, + { + "epoch": 9.83, + "grad_norm": 1.34375, + "learning_rate": 0.0003789113750737154, + "loss": 0.1624, + "step": 237310 + }, + { + "epoch": 9.83, + "grad_norm": 0.486328125, + "learning_rate": 0.0003789020827674891, + "loss": 0.2039, + "step": 237320 + }, + { + "epoch": 9.83, + "grad_norm": 0.76171875, + "learning_rate": 0.000378892790218683, + "loss": 0.1886, + "step": 237330 + }, + { + "epoch": 9.83, + "grad_norm": 0.68359375, + "learning_rate": 0.00037888349742731456, + "loss": 0.1701, + "step": 237340 + }, + { + "epoch": 9.83, + "grad_norm": 0.76953125, + "learning_rate": 0.00037887420439340126, + "loss": 0.2258, + "step": 237350 + }, + { + "epoch": 9.83, + "grad_norm": 1.0625, + "learning_rate": 0.00037886491111696076, + "loss": 0.2082, + "step": 237360 + }, + { + "epoch": 9.83, + "grad_norm": 0.220703125, + "learning_rate": 0.00037885561759801027, + "loss": 0.1999, + "step": 237370 + }, + { + "epoch": 9.83, + "grad_norm": 0.6328125, + "learning_rate": 0.0003788463238365675, + "loss": 0.2159, + "step": 237380 + }, + { + "epoch": 9.83, + "grad_norm": 1.3046875, + "learning_rate": 0.0003788370298326498, + "loss": 0.1874, + "step": 237390 + }, + { + "epoch": 9.83, + "grad_norm": 0.5390625, + "learning_rate": 0.0003788277355862748, + "loss": 0.1941, + "step": 237400 + }, + { + "epoch": 9.83, + "grad_norm": 0.6953125, + "learning_rate": 0.0003788184410974599, + "loss": 0.1888, + "step": 237410 + }, + { + "epoch": 9.83, + "grad_norm": 0.46875, + "learning_rate": 0.00037880914636622255, + "loss": 0.1913, + "step": 237420 + }, + { + "epoch": 9.83, + "grad_norm": 0.625, + "learning_rate": 0.00037879985139258033, + "loss": 0.2132, + "step": 237430 + }, + { + "epoch": 9.83, + "grad_norm": 0.32421875, + "learning_rate": 0.00037879055617655067, + "loss": 0.2197, + "step": 237440 + }, + { + "epoch": 9.84, + "grad_norm": 1.078125, + "learning_rate": 0.00037878126071815113, + "loss": 0.1921, + "step": 237450 + }, + { + "epoch": 9.84, + "grad_norm": 1.0, + "learning_rate": 0.0003787719650173992, + "loss": 0.2246, + "step": 237460 + }, + { + "epoch": 9.84, + "grad_norm": 0.8125, + "learning_rate": 0.0003787626690743122, + "loss": 0.1844, + "step": 237470 + }, + { + "epoch": 9.84, + "grad_norm": 0.5, + "learning_rate": 0.00037875337288890786, + "loss": 0.1699, + "step": 237480 + }, + { + "epoch": 9.84, + "grad_norm": 1.75, + "learning_rate": 0.0003787440764612036, + "loss": 0.1813, + "step": 237490 + }, + { + "epoch": 9.84, + "grad_norm": 0.84375, + "learning_rate": 0.0003787347797912168, + "loss": 0.2265, + "step": 237500 + }, + { + "epoch": 9.84, + "grad_norm": 0.796875, + "learning_rate": 0.0003787254828789651, + "loss": 0.1987, + "step": 237510 + }, + { + "epoch": 9.84, + "grad_norm": 0.30859375, + "learning_rate": 0.000378716185724466, + "loss": 0.1486, + "step": 237520 + }, + { + "epoch": 9.84, + "grad_norm": 0.54296875, + "learning_rate": 0.0003787068883277369, + "loss": 0.2025, + "step": 237530 + }, + { + "epoch": 9.84, + "grad_norm": 0.44140625, + "learning_rate": 0.00037869759068879534, + "loss": 0.2307, + "step": 237540 + }, + { + "epoch": 9.84, + "grad_norm": 0.7265625, + "learning_rate": 0.0003786882928076588, + "loss": 0.1612, + "step": 237550 + }, + { + "epoch": 9.84, + "grad_norm": 1.78125, + "learning_rate": 0.00037867899468434473, + "loss": 0.2332, + "step": 237560 + }, + { + "epoch": 9.84, + "grad_norm": 1.46875, + "learning_rate": 0.0003786696963188708, + "loss": 0.2113, + "step": 237570 + }, + { + "epoch": 9.84, + "grad_norm": 0.72265625, + "learning_rate": 0.00037866039771125435, + "loss": 0.168, + "step": 237580 + }, + { + "epoch": 9.84, + "grad_norm": 0.66015625, + "learning_rate": 0.00037865109886151287, + "loss": 0.1862, + "step": 237590 + }, + { + "epoch": 9.84, + "grad_norm": 0.337890625, + "learning_rate": 0.00037864179976966394, + "loss": 0.1905, + "step": 237600 + }, + { + "epoch": 9.84, + "grad_norm": 0.609375, + "learning_rate": 0.0003786325004357251, + "loss": 0.1718, + "step": 237610 + }, + { + "epoch": 9.84, + "grad_norm": 1.125, + "learning_rate": 0.00037862320085971376, + "loss": 0.1556, + "step": 237620 + }, + { + "epoch": 9.84, + "grad_norm": 0.87109375, + "learning_rate": 0.0003786139010416474, + "loss": 0.2199, + "step": 237630 + }, + { + "epoch": 9.84, + "grad_norm": 0.640625, + "learning_rate": 0.0003786046009815436, + "loss": 0.2019, + "step": 237640 + }, + { + "epoch": 9.84, + "grad_norm": 0.60546875, + "learning_rate": 0.00037859530067941986, + "loss": 0.1816, + "step": 237650 + }, + { + "epoch": 9.84, + "grad_norm": 0.84765625, + "learning_rate": 0.0003785860001352936, + "loss": 0.1993, + "step": 237660 + }, + { + "epoch": 9.84, + "grad_norm": 0.87109375, + "learning_rate": 0.0003785766993491824, + "loss": 0.1485, + "step": 237670 + }, + { + "epoch": 9.84, + "grad_norm": 0.287109375, + "learning_rate": 0.00037856739832110365, + "loss": 0.2006, + "step": 237680 + }, + { + "epoch": 9.85, + "grad_norm": 0.609375, + "learning_rate": 0.00037855809705107505, + "loss": 0.2253, + "step": 237690 + }, + { + "epoch": 9.85, + "grad_norm": 0.625, + "learning_rate": 0.000378548795539114, + "loss": 0.1736, + "step": 237700 + }, + { + "epoch": 9.85, + "grad_norm": 0.89453125, + "learning_rate": 0.0003785394937852379, + "loss": 0.1897, + "step": 237710 + }, + { + "epoch": 9.85, + "grad_norm": 1.015625, + "learning_rate": 0.0003785301917894645, + "loss": 0.2169, + "step": 237720 + }, + { + "epoch": 9.85, + "grad_norm": 0.5546875, + "learning_rate": 0.000378520889551811, + "loss": 0.174, + "step": 237730 + }, + { + "epoch": 9.85, + "grad_norm": 0.4609375, + "learning_rate": 0.0003785115870722952, + "loss": 0.222, + "step": 237740 + }, + { + "epoch": 9.85, + "grad_norm": 0.9140625, + "learning_rate": 0.00037850228435093436, + "loss": 0.2014, + "step": 237750 + }, + { + "epoch": 9.85, + "grad_norm": 0.921875, + "learning_rate": 0.0003784929813877461, + "loss": 0.1877, + "step": 237760 + }, + { + "epoch": 9.85, + "grad_norm": 0.205078125, + "learning_rate": 0.000378483678182748, + "loss": 0.1612, + "step": 237770 + }, + { + "epoch": 9.85, + "grad_norm": 0.7109375, + "learning_rate": 0.0003784743747359575, + "loss": 0.1986, + "step": 237780 + }, + { + "epoch": 9.85, + "grad_norm": 1.015625, + "learning_rate": 0.000378465071047392, + "loss": 0.1928, + "step": 237790 + }, + { + "epoch": 9.85, + "grad_norm": 0.78125, + "learning_rate": 0.0003784557671170692, + "loss": 0.2133, + "step": 237800 + }, + { + "epoch": 9.85, + "grad_norm": 0.314453125, + "learning_rate": 0.0003784464629450065, + "loss": 0.2285, + "step": 237810 + }, + { + "epoch": 9.85, + "grad_norm": 0.64453125, + "learning_rate": 0.0003784371585312215, + "loss": 0.1615, + "step": 237820 + }, + { + "epoch": 9.85, + "grad_norm": 0.76953125, + "learning_rate": 0.00037842785387573154, + "loss": 0.1947, + "step": 237830 + }, + { + "epoch": 9.85, + "grad_norm": 0.66796875, + "learning_rate": 0.00037841854897855423, + "loss": 0.2315, + "step": 237840 + }, + { + "epoch": 9.85, + "grad_norm": 0.671875, + "learning_rate": 0.0003784092438397071, + "loss": 0.2001, + "step": 237850 + }, + { + "epoch": 9.85, + "grad_norm": 0.890625, + "learning_rate": 0.00037839993845920755, + "loss": 0.173, + "step": 237860 + }, + { + "epoch": 9.85, + "grad_norm": 0.625, + "learning_rate": 0.0003783906328370733, + "loss": 0.1762, + "step": 237870 + }, + { + "epoch": 9.85, + "grad_norm": 1.03125, + "learning_rate": 0.00037838132697332174, + "loss": 0.1892, + "step": 237880 + }, + { + "epoch": 9.85, + "grad_norm": 0.6953125, + "learning_rate": 0.0003783720208679703, + "loss": 0.2301, + "step": 237890 + }, + { + "epoch": 9.85, + "grad_norm": 0.8671875, + "learning_rate": 0.00037836271452103666, + "loss": 0.1704, + "step": 237900 + }, + { + "epoch": 9.85, + "grad_norm": 0.7890625, + "learning_rate": 0.00037835340793253823, + "loss": 0.2212, + "step": 237910 + }, + { + "epoch": 9.85, + "grad_norm": 0.515625, + "learning_rate": 0.00037834410110249256, + "loss": 0.1932, + "step": 237920 + }, + { + "epoch": 9.86, + "grad_norm": 0.361328125, + "learning_rate": 0.0003783347940309172, + "loss": 0.198, + "step": 237930 + }, + { + "epoch": 9.86, + "grad_norm": 1.0859375, + "learning_rate": 0.00037832548671782945, + "loss": 0.1836, + "step": 237940 + }, + { + "epoch": 9.86, + "grad_norm": 0.921875, + "learning_rate": 0.00037831617916324713, + "loss": 0.1883, + "step": 237950 + }, + { + "epoch": 9.86, + "grad_norm": 0.66015625, + "learning_rate": 0.0003783068713671876, + "loss": 0.1966, + "step": 237960 + }, + { + "epoch": 9.86, + "grad_norm": 1.109375, + "learning_rate": 0.0003782975633296684, + "loss": 0.1859, + "step": 237970 + }, + { + "epoch": 9.86, + "grad_norm": 0.44921875, + "learning_rate": 0.000378288255050707, + "loss": 0.2137, + "step": 237980 + }, + { + "epoch": 9.86, + "grad_norm": 0.6796875, + "learning_rate": 0.00037827894653032104, + "loss": 0.1912, + "step": 237990 + }, + { + "epoch": 9.86, + "grad_norm": 0.7109375, + "learning_rate": 0.00037826963776852786, + "loss": 0.204, + "step": 238000 + }, + { + "epoch": 9.86, + "grad_norm": 0.6484375, + "learning_rate": 0.0003782603287653451, + "loss": 0.2065, + "step": 238010 + }, + { + "epoch": 9.86, + "grad_norm": 0.859375, + "learning_rate": 0.0003782510195207902, + "loss": 0.2351, + "step": 238020 + }, + { + "epoch": 9.86, + "grad_norm": 0.84375, + "learning_rate": 0.00037824171003488085, + "loss": 0.1724, + "step": 238030 + }, + { + "epoch": 9.86, + "grad_norm": 0.48046875, + "learning_rate": 0.0003782324003076344, + "loss": 0.1828, + "step": 238040 + }, + { + "epoch": 9.86, + "grad_norm": 0.58984375, + "learning_rate": 0.0003782230903390684, + "loss": 0.1895, + "step": 238050 + }, + { + "epoch": 9.86, + "grad_norm": 0.57421875, + "learning_rate": 0.00037821378012920035, + "loss": 0.1635, + "step": 238060 + }, + { + "epoch": 9.86, + "grad_norm": 0.62890625, + "learning_rate": 0.0003782044696780479, + "loss": 0.1961, + "step": 238070 + }, + { + "epoch": 9.86, + "grad_norm": 1.234375, + "learning_rate": 0.00037819515898562843, + "loss": 0.216, + "step": 238080 + }, + { + "epoch": 9.86, + "grad_norm": 0.8125, + "learning_rate": 0.00037818584805195956, + "loss": 0.1786, + "step": 238090 + }, + { + "epoch": 9.86, + "grad_norm": 0.73828125, + "learning_rate": 0.00037817653687705874, + "loss": 0.2402, + "step": 238100 + }, + { + "epoch": 9.86, + "grad_norm": 1.3046875, + "learning_rate": 0.00037816722546094353, + "loss": 0.2195, + "step": 238110 + }, + { + "epoch": 9.86, + "grad_norm": 0.6796875, + "learning_rate": 0.0003781579138036314, + "loss": 0.2041, + "step": 238120 + }, + { + "epoch": 9.86, + "grad_norm": 0.484375, + "learning_rate": 0.00037814860190513996, + "loss": 0.2096, + "step": 238130 + }, + { + "epoch": 9.86, + "grad_norm": 1.265625, + "learning_rate": 0.0003781392897654867, + "loss": 0.1886, + "step": 238140 + }, + { + "epoch": 9.86, + "grad_norm": 0.609375, + "learning_rate": 0.000378129977384689, + "loss": 0.1894, + "step": 238150 + }, + { + "epoch": 9.86, + "grad_norm": 1.546875, + "learning_rate": 0.00037812066476276476, + "loss": 0.1789, + "step": 238160 + }, + { + "epoch": 9.86, + "grad_norm": 0.75, + "learning_rate": 0.00037811135189973113, + "loss": 0.2054, + "step": 238170 + }, + { + "epoch": 9.87, + "grad_norm": 0.55078125, + "learning_rate": 0.00037810203879560573, + "loss": 0.1626, + "step": 238180 + }, + { + "epoch": 9.87, + "grad_norm": 0.47265625, + "learning_rate": 0.0003780927254504062, + "loss": 0.2394, + "step": 238190 + }, + { + "epoch": 9.87, + "grad_norm": 0.671875, + "learning_rate": 0.00037808341186414997, + "loss": 0.1667, + "step": 238200 + }, + { + "epoch": 9.87, + "grad_norm": 1.3984375, + "learning_rate": 0.00037807409803685457, + "loss": 0.1679, + "step": 238210 + }, + { + "epoch": 9.87, + "grad_norm": 0.37109375, + "learning_rate": 0.0003780647839685376, + "loss": 0.1805, + "step": 238220 + }, + { + "epoch": 9.87, + "grad_norm": 0.462890625, + "learning_rate": 0.00037805546965921644, + "loss": 0.159, + "step": 238230 + }, + { + "epoch": 9.87, + "grad_norm": 0.81640625, + "learning_rate": 0.00037804615510890884, + "loss": 0.165, + "step": 238240 + }, + { + "epoch": 9.87, + "grad_norm": 0.7734375, + "learning_rate": 0.00037803684031763215, + "loss": 0.2406, + "step": 238250 + }, + { + "epoch": 9.87, + "grad_norm": 0.62890625, + "learning_rate": 0.00037802752528540384, + "loss": 0.1851, + "step": 238260 + }, + { + "epoch": 9.87, + "grad_norm": 0.55859375, + "learning_rate": 0.00037801821001224174, + "loss": 0.189, + "step": 238270 + }, + { + "epoch": 9.87, + "grad_norm": 1.046875, + "learning_rate": 0.00037800889449816314, + "loss": 0.2412, + "step": 238280 + }, + { + "epoch": 9.87, + "grad_norm": 0.515625, + "learning_rate": 0.00037799957874318555, + "loss": 0.2173, + "step": 238290 + }, + { + "epoch": 9.87, + "grad_norm": 0.5234375, + "learning_rate": 0.00037799026274732664, + "loss": 0.1635, + "step": 238300 + }, + { + "epoch": 9.87, + "grad_norm": 0.96484375, + "learning_rate": 0.00037798094651060376, + "loss": 0.1831, + "step": 238310 + }, + { + "epoch": 9.87, + "grad_norm": 0.490234375, + "learning_rate": 0.0003779716300330347, + "loss": 0.2022, + "step": 238320 + }, + { + "epoch": 9.87, + "grad_norm": 0.71875, + "learning_rate": 0.0003779623133146369, + "loss": 0.1913, + "step": 238330 + }, + { + "epoch": 9.87, + "grad_norm": 1.1484375, + "learning_rate": 0.00037795299635542766, + "loss": 0.1636, + "step": 238340 + }, + { + "epoch": 9.87, + "grad_norm": 1.84375, + "learning_rate": 0.0003779436791554248, + "loss": 0.1881, + "step": 238350 + }, + { + "epoch": 9.87, + "grad_norm": 0.875, + "learning_rate": 0.0003779343617146458, + "loss": 0.1704, + "step": 238360 + }, + { + "epoch": 9.87, + "grad_norm": 1.5, + "learning_rate": 0.00037792504403310804, + "loss": 0.2077, + "step": 238370 + }, + { + "epoch": 9.87, + "grad_norm": 3.234375, + "learning_rate": 0.0003779157261108293, + "loss": 0.1945, + "step": 238380 + }, + { + "epoch": 9.87, + "grad_norm": 0.421875, + "learning_rate": 0.00037790640794782684, + "loss": 0.2239, + "step": 238390 + }, + { + "epoch": 9.87, + "grad_norm": 0.45703125, + "learning_rate": 0.0003778970895441184, + "loss": 0.2154, + "step": 238400 + }, + { + "epoch": 9.87, + "grad_norm": 0.82421875, + "learning_rate": 0.0003778877708997214, + "loss": 0.2145, + "step": 238410 + }, + { + "epoch": 9.88, + "grad_norm": 0.48046875, + "learning_rate": 0.0003778784520146534, + "loss": 0.2227, + "step": 238420 + }, + { + "epoch": 9.88, + "grad_norm": 0.64453125, + "learning_rate": 0.0003778691328889321, + "loss": 0.2091, + "step": 238430 + }, + { + "epoch": 9.88, + "grad_norm": 0.4921875, + "learning_rate": 0.0003778598135225748, + "loss": 0.1951, + "step": 238440 + }, + { + "epoch": 9.88, + "grad_norm": 1.1015625, + "learning_rate": 0.00037785049391559917, + "loss": 0.2482, + "step": 238450 + }, + { + "epoch": 9.88, + "grad_norm": 0.91015625, + "learning_rate": 0.00037784117406802276, + "loss": 0.2702, + "step": 238460 + }, + { + "epoch": 9.88, + "grad_norm": 1.5078125, + "learning_rate": 0.000377831853979863, + "loss": 0.1622, + "step": 238470 + }, + { + "epoch": 9.88, + "grad_norm": 0.53125, + "learning_rate": 0.0003778225336511375, + "loss": 0.1776, + "step": 238480 + }, + { + "epoch": 9.88, + "grad_norm": 1.203125, + "learning_rate": 0.0003778132130818638, + "loss": 0.2047, + "step": 238490 + }, + { + "epoch": 9.88, + "grad_norm": 1.203125, + "learning_rate": 0.0003778038922720594, + "loss": 0.2295, + "step": 238500 + }, + { + "epoch": 9.88, + "grad_norm": 0.431640625, + "learning_rate": 0.000377794571221742, + "loss": 0.1926, + "step": 238510 + }, + { + "epoch": 9.88, + "grad_norm": 1.703125, + "learning_rate": 0.0003777852499309289, + "loss": 0.1893, + "step": 238520 + }, + { + "epoch": 9.88, + "grad_norm": 0.546875, + "learning_rate": 0.0003777759283996378, + "loss": 0.1509, + "step": 238530 + }, + { + "epoch": 9.88, + "grad_norm": 1.1796875, + "learning_rate": 0.00037776660662788614, + "loss": 0.1329, + "step": 238540 + }, + { + "epoch": 9.88, + "grad_norm": 1.65625, + "learning_rate": 0.0003777572846156916, + "loss": 0.2402, + "step": 238550 + }, + { + "epoch": 9.88, + "grad_norm": 0.26171875, + "learning_rate": 0.0003777479623630716, + "loss": 0.2006, + "step": 238560 + }, + { + "epoch": 9.88, + "grad_norm": 1.921875, + "learning_rate": 0.00037773863987004374, + "loss": 0.1809, + "step": 238570 + }, + { + "epoch": 9.88, + "grad_norm": 0.52734375, + "learning_rate": 0.0003777293171366256, + "loss": 0.1607, + "step": 238580 + }, + { + "epoch": 9.88, + "grad_norm": 0.298828125, + "learning_rate": 0.0003777199941628347, + "loss": 0.1993, + "step": 238590 + }, + { + "epoch": 9.88, + "grad_norm": 1.2265625, + "learning_rate": 0.0003777106709486885, + "loss": 0.1873, + "step": 238600 + }, + { + "epoch": 9.88, + "grad_norm": 1.1015625, + "learning_rate": 0.0003777013474942046, + "loss": 0.2038, + "step": 238610 + }, + { + "epoch": 9.88, + "grad_norm": 1.015625, + "learning_rate": 0.0003776920237994006, + "loss": 0.2228, + "step": 238620 + }, + { + "epoch": 9.88, + "grad_norm": 0.32421875, + "learning_rate": 0.0003776826998642939, + "loss": 0.2127, + "step": 238630 + }, + { + "epoch": 9.88, + "grad_norm": 1.40625, + "learning_rate": 0.0003776733756889023, + "loss": 0.1829, + "step": 238640 + }, + { + "epoch": 9.88, + "grad_norm": 1.25, + "learning_rate": 0.0003776640512732431, + "loss": 0.1677, + "step": 238650 + }, + { + "epoch": 9.89, + "grad_norm": 0.7109375, + "learning_rate": 0.00037765472661733396, + "loss": 0.2532, + "step": 238660 + }, + { + "epoch": 9.89, + "grad_norm": 0.59765625, + "learning_rate": 0.00037764540172119246, + "loss": 0.2037, + "step": 238670 + }, + { + "epoch": 9.89, + "grad_norm": 1.390625, + "learning_rate": 0.000377636076584836, + "loss": 0.2412, + "step": 238680 + }, + { + "epoch": 9.89, + "grad_norm": 0.65625, + "learning_rate": 0.00037762675120828226, + "loss": 0.2326, + "step": 238690 + }, + { + "epoch": 9.89, + "grad_norm": 0.63671875, + "learning_rate": 0.0003776174255915488, + "loss": 0.2335, + "step": 238700 + }, + { + "epoch": 9.89, + "grad_norm": 0.44921875, + "learning_rate": 0.000377608099734653, + "loss": 0.2307, + "step": 238710 + }, + { + "epoch": 9.89, + "grad_norm": 0.5390625, + "learning_rate": 0.0003775987736376127, + "loss": 0.2254, + "step": 238720 + }, + { + "epoch": 9.89, + "grad_norm": 0.2373046875, + "learning_rate": 0.0003775894473004453, + "loss": 0.1423, + "step": 238730 + }, + { + "epoch": 9.89, + "grad_norm": 1.4453125, + "learning_rate": 0.0003775801207231682, + "loss": 0.2219, + "step": 238740 + }, + { + "epoch": 9.89, + "grad_norm": 3.53125, + "learning_rate": 0.0003775707939057992, + "loss": 0.2243, + "step": 238750 + }, + { + "epoch": 9.89, + "grad_norm": 0.85546875, + "learning_rate": 0.0003775614668483557, + "loss": 0.1646, + "step": 238760 + }, + { + "epoch": 9.89, + "grad_norm": 0.6328125, + "learning_rate": 0.00037755213955085527, + "loss": 0.2153, + "step": 238770 + }, + { + "epoch": 9.89, + "grad_norm": 0.75, + "learning_rate": 0.00037754281201331555, + "loss": 0.1731, + "step": 238780 + }, + { + "epoch": 9.89, + "grad_norm": 1.015625, + "learning_rate": 0.00037753348423575397, + "loss": 0.1972, + "step": 238790 + }, + { + "epoch": 9.89, + "grad_norm": 0.482421875, + "learning_rate": 0.00037752415621818814, + "loss": 0.2361, + "step": 238800 + }, + { + "epoch": 9.89, + "grad_norm": 1.0703125, + "learning_rate": 0.0003775148279606356, + "loss": 0.2102, + "step": 238810 + }, + { + "epoch": 9.89, + "grad_norm": 0.99609375, + "learning_rate": 0.00037750549946311403, + "loss": 0.2754, + "step": 238820 + }, + { + "epoch": 9.89, + "grad_norm": 0.5703125, + "learning_rate": 0.0003774961707256408, + "loss": 0.1821, + "step": 238830 + }, + { + "epoch": 9.89, + "grad_norm": 0.60546875, + "learning_rate": 0.00037748684174823357, + "loss": 0.1923, + "step": 238840 + }, + { + "epoch": 9.89, + "grad_norm": 0.0, + "learning_rate": 0.0003774775125309099, + "loss": 0.1793, + "step": 238850 + }, + { + "epoch": 9.89, + "grad_norm": 0.359375, + "learning_rate": 0.00037746818307368723, + "loss": 0.2177, + "step": 238860 + }, + { + "epoch": 9.89, + "grad_norm": 0.8515625, + "learning_rate": 0.00037745885337658327, + "loss": 0.2176, + "step": 238870 + }, + { + "epoch": 9.89, + "grad_norm": 0.4453125, + "learning_rate": 0.0003774495234396155, + "loss": 0.1815, + "step": 238880 + }, + { + "epoch": 9.89, + "grad_norm": 1.3359375, + "learning_rate": 0.0003774401932628015, + "loss": 0.1553, + "step": 238890 + }, + { + "epoch": 9.9, + "grad_norm": 0.921875, + "learning_rate": 0.0003774308628461588, + "loss": 0.1965, + "step": 238900 + }, + { + "epoch": 9.9, + "grad_norm": 0.9765625, + "learning_rate": 0.000377421532189705, + "loss": 0.1937, + "step": 238910 + }, + { + "epoch": 9.9, + "grad_norm": 0.8125, + "learning_rate": 0.00037741220129345764, + "loss": 0.181, + "step": 238920 + }, + { + "epoch": 9.9, + "grad_norm": 1.1796875, + "learning_rate": 0.0003774028701574342, + "loss": 0.2291, + "step": 238930 + }, + { + "epoch": 9.9, + "grad_norm": 0.9609375, + "learning_rate": 0.0003773935387816524, + "loss": 0.1975, + "step": 238940 + }, + { + "epoch": 9.9, + "grad_norm": 0.72265625, + "learning_rate": 0.00037738420716612966, + "loss": 0.1626, + "step": 238950 + }, + { + "epoch": 9.9, + "grad_norm": 0.796875, + "learning_rate": 0.00037737487531088365, + "loss": 0.1766, + "step": 238960 + }, + { + "epoch": 9.9, + "grad_norm": 2.984375, + "learning_rate": 0.0003773655432159319, + "loss": 0.2622, + "step": 238970 + }, + { + "epoch": 9.9, + "grad_norm": 0.55859375, + "learning_rate": 0.0003773562108812919, + "loss": 0.1714, + "step": 238980 + }, + { + "epoch": 9.9, + "grad_norm": 0.390625, + "learning_rate": 0.0003773468783069812, + "loss": 0.2215, + "step": 238990 + }, + { + "epoch": 9.9, + "grad_norm": 0.5625, + "learning_rate": 0.00037733754549301756, + "loss": 0.2184, + "step": 239000 + }, + { + "epoch": 9.9, + "grad_norm": 0.40234375, + "learning_rate": 0.0003773282124394184, + "loss": 0.2195, + "step": 239010 + }, + { + "epoch": 9.9, + "grad_norm": 0.21484375, + "learning_rate": 0.0003773188791462012, + "loss": 0.1766, + "step": 239020 + }, + { + "epoch": 9.9, + "grad_norm": 0.3203125, + "learning_rate": 0.0003773095456133837, + "loss": 0.188, + "step": 239030 + }, + { + "epoch": 9.9, + "grad_norm": 0.65625, + "learning_rate": 0.0003773002118409833, + "loss": 0.2297, + "step": 239040 + }, + { + "epoch": 9.9, + "grad_norm": 0.75390625, + "learning_rate": 0.0003772908778290177, + "loss": 0.2317, + "step": 239050 + }, + { + "epoch": 9.9, + "grad_norm": 2.046875, + "learning_rate": 0.00037728154357750444, + "loss": 0.206, + "step": 239060 + }, + { + "epoch": 9.9, + "grad_norm": 0.52734375, + "learning_rate": 0.00037727220908646104, + "loss": 0.1932, + "step": 239070 + }, + { + "epoch": 9.9, + "grad_norm": 1.03125, + "learning_rate": 0.0003772628743559051, + "loss": 0.2466, + "step": 239080 + }, + { + "epoch": 9.9, + "grad_norm": 1.453125, + "learning_rate": 0.00037725353938585415, + "loss": 0.2237, + "step": 239090 + }, + { + "epoch": 9.9, + "grad_norm": 0.63671875, + "learning_rate": 0.00037724420417632574, + "loss": 0.196, + "step": 239100 + }, + { + "epoch": 9.9, + "grad_norm": 0.384765625, + "learning_rate": 0.00037723486872733757, + "loss": 0.1852, + "step": 239110 + }, + { + "epoch": 9.9, + "grad_norm": 0.65234375, + "learning_rate": 0.00037722553303890703, + "loss": 0.2056, + "step": 239120 + }, + { + "epoch": 9.9, + "grad_norm": 0.1337890625, + "learning_rate": 0.00037721619711105186, + "loss": 0.1834, + "step": 239130 + }, + { + "epoch": 9.91, + "grad_norm": 1.3671875, + "learning_rate": 0.0003772068609437895, + "loss": 0.2506, + "step": 239140 + }, + { + "epoch": 9.91, + "grad_norm": 0.80859375, + "learning_rate": 0.0003771975245371376, + "loss": 0.1975, + "step": 239150 + }, + { + "epoch": 9.91, + "grad_norm": 0.82421875, + "learning_rate": 0.0003771881878911136, + "loss": 0.1847, + "step": 239160 + }, + { + "epoch": 9.91, + "grad_norm": 0.7109375, + "learning_rate": 0.0003771788510057352, + "loss": 0.1986, + "step": 239170 + }, + { + "epoch": 9.91, + "grad_norm": 0.28515625, + "learning_rate": 0.00037716951388102005, + "loss": 0.2028, + "step": 239180 + }, + { + "epoch": 9.91, + "grad_norm": 0.66796875, + "learning_rate": 0.0003771601765169855, + "loss": 0.1663, + "step": 239190 + }, + { + "epoch": 9.91, + "grad_norm": 0.73046875, + "learning_rate": 0.00037715083891364925, + "loss": 0.1985, + "step": 239200 + }, + { + "epoch": 9.91, + "grad_norm": 0.458984375, + "learning_rate": 0.0003771415010710289, + "loss": 0.176, + "step": 239210 + }, + { + "epoch": 9.91, + "grad_norm": 0.5, + "learning_rate": 0.0003771321629891419, + "loss": 0.1446, + "step": 239220 + }, + { + "epoch": 9.91, + "grad_norm": 1.0859375, + "learning_rate": 0.00037712282466800595, + "loss": 0.1805, + "step": 239230 + }, + { + "epoch": 9.91, + "grad_norm": 1.0390625, + "learning_rate": 0.00037711348610763856, + "loss": 0.2583, + "step": 239240 + }, + { + "epoch": 9.91, + "grad_norm": 0.6796875, + "learning_rate": 0.00037710414730805726, + "loss": 0.181, + "step": 239250 + }, + { + "epoch": 9.91, + "grad_norm": 0.48828125, + "learning_rate": 0.00037709480826927976, + "loss": 0.2058, + "step": 239260 + }, + { + "epoch": 9.91, + "grad_norm": 1.3359375, + "learning_rate": 0.0003770854689913235, + "loss": 0.2035, + "step": 239270 + }, + { + "epoch": 9.91, + "grad_norm": 0.40234375, + "learning_rate": 0.0003770761294742061, + "loss": 0.2611, + "step": 239280 + }, + { + "epoch": 9.91, + "grad_norm": 0.60546875, + "learning_rate": 0.0003770667897179453, + "loss": 0.181, + "step": 239290 + }, + { + "epoch": 9.91, + "grad_norm": 0.90234375, + "learning_rate": 0.00037705744972255827, + "loss": 0.233, + "step": 239300 + }, + { + "epoch": 9.91, + "grad_norm": 0.41015625, + "learning_rate": 0.00037704810948806304, + "loss": 0.187, + "step": 239310 + }, + { + "epoch": 9.91, + "grad_norm": 1.15625, + "learning_rate": 0.000377038769014477, + "loss": 0.2111, + "step": 239320 + }, + { + "epoch": 9.91, + "grad_norm": 0.55859375, + "learning_rate": 0.00037702942830181754, + "loss": 0.1936, + "step": 239330 + }, + { + "epoch": 9.91, + "grad_norm": 0.68359375, + "learning_rate": 0.00037702008735010257, + "loss": 0.186, + "step": 239340 + }, + { + "epoch": 9.91, + "grad_norm": 1.2265625, + "learning_rate": 0.0003770107461593494, + "loss": 0.2188, + "step": 239350 + }, + { + "epoch": 9.91, + "grad_norm": 0.5625, + "learning_rate": 0.00037700140472957577, + "loss": 0.2268, + "step": 239360 + }, + { + "epoch": 9.91, + "grad_norm": 0.466796875, + "learning_rate": 0.0003769920630607993, + "loss": 0.1851, + "step": 239370 + }, + { + "epoch": 9.92, + "grad_norm": 0.498046875, + "learning_rate": 0.00037698272115303734, + "loss": 0.1782, + "step": 239380 + }, + { + "epoch": 9.92, + "grad_norm": 0.8515625, + "learning_rate": 0.0003769733790063077, + "loss": 0.2121, + "step": 239390 + }, + { + "epoch": 9.92, + "grad_norm": 0.65234375, + "learning_rate": 0.00037696403662062784, + "loss": 0.2397, + "step": 239400 + }, + { + "epoch": 9.92, + "grad_norm": 0.55859375, + "learning_rate": 0.0003769546939960153, + "loss": 0.2027, + "step": 239410 + }, + { + "epoch": 9.92, + "grad_norm": 0.95703125, + "learning_rate": 0.00037694535113248794, + "loss": 0.2357, + "step": 239420 + }, + { + "epoch": 9.92, + "grad_norm": 1.3125, + "learning_rate": 0.000376936008030063, + "loss": 0.211, + "step": 239430 + }, + { + "epoch": 9.92, + "grad_norm": 0.65625, + "learning_rate": 0.00037692666468875817, + "loss": 0.1971, + "step": 239440 + }, + { + "epoch": 9.92, + "grad_norm": 0.86328125, + "learning_rate": 0.00037691732110859116, + "loss": 0.195, + "step": 239450 + }, + { + "epoch": 9.92, + "grad_norm": 0.69921875, + "learning_rate": 0.0003769079772895794, + "loss": 0.1441, + "step": 239460 + }, + { + "epoch": 9.92, + "grad_norm": 1.0390625, + "learning_rate": 0.00037689863323174056, + "loss": 0.2012, + "step": 239470 + }, + { + "epoch": 9.92, + "grad_norm": 1.0546875, + "learning_rate": 0.0003768892889350921, + "loss": 0.1896, + "step": 239480 + }, + { + "epoch": 9.92, + "grad_norm": 0.69921875, + "learning_rate": 0.0003768799443996518, + "loss": 0.2132, + "step": 239490 + }, + { + "epoch": 9.92, + "grad_norm": 0.69140625, + "learning_rate": 0.0003768705996254371, + "loss": 0.195, + "step": 239500 + }, + { + "epoch": 9.92, + "grad_norm": 1.0859375, + "learning_rate": 0.0003768612546124657, + "loss": 0.2089, + "step": 239510 + }, + { + "epoch": 9.92, + "grad_norm": 0.8515625, + "learning_rate": 0.0003768519093607551, + "loss": 0.2208, + "step": 239520 + }, + { + "epoch": 9.92, + "grad_norm": 0.4609375, + "learning_rate": 0.0003768425638703229, + "loss": 0.2188, + "step": 239530 + }, + { + "epoch": 9.92, + "grad_norm": 0.9765625, + "learning_rate": 0.0003768332181411866, + "loss": 0.1935, + "step": 239540 + }, + { + "epoch": 9.92, + "grad_norm": 0.416015625, + "learning_rate": 0.000376823872173364, + "loss": 0.189, + "step": 239550 + }, + { + "epoch": 9.92, + "grad_norm": 0.69140625, + "learning_rate": 0.0003768145259668725, + "loss": 0.2148, + "step": 239560 + }, + { + "epoch": 9.92, + "grad_norm": 0.447265625, + "learning_rate": 0.00037680517952172975, + "loss": 0.2572, + "step": 239570 + }, + { + "epoch": 9.92, + "grad_norm": 0.5390625, + "learning_rate": 0.00037679583283795336, + "loss": 0.1712, + "step": 239580 + }, + { + "epoch": 9.92, + "grad_norm": 0.87890625, + "learning_rate": 0.000376786485915561, + "loss": 0.1643, + "step": 239590 + }, + { + "epoch": 9.92, + "grad_norm": 0.66796875, + "learning_rate": 0.00037677713875456997, + "loss": 0.2031, + "step": 239600 + }, + { + "epoch": 9.92, + "grad_norm": 0.66015625, + "learning_rate": 0.00037676779135499826, + "loss": 0.2206, + "step": 239610 + }, + { + "epoch": 9.93, + "grad_norm": 0.6875, + "learning_rate": 0.0003767584437168631, + "loss": 0.2052, + "step": 239620 + }, + { + "epoch": 9.93, + "grad_norm": 0.921875, + "learning_rate": 0.00037674909584018237, + "loss": 0.2221, + "step": 239630 + }, + { + "epoch": 9.93, + "grad_norm": 0.5234375, + "learning_rate": 0.0003767397477249734, + "loss": 0.2231, + "step": 239640 + }, + { + "epoch": 9.93, + "grad_norm": 1.0078125, + "learning_rate": 0.000376730399371254, + "loss": 0.2331, + "step": 239650 + }, + { + "epoch": 9.93, + "grad_norm": 0.7890625, + "learning_rate": 0.00037672105077904167, + "loss": 0.1697, + "step": 239660 + }, + { + "epoch": 9.93, + "grad_norm": 0.375, + "learning_rate": 0.000376711701948354, + "loss": 0.2041, + "step": 239670 + }, + { + "epoch": 9.93, + "grad_norm": 1.2421875, + "learning_rate": 0.0003767023528792085, + "loss": 0.229, + "step": 239680 + }, + { + "epoch": 9.93, + "grad_norm": 1.0703125, + "learning_rate": 0.000376693003571623, + "loss": 0.2324, + "step": 239690 + }, + { + "epoch": 9.93, + "grad_norm": 0.00732421875, + "learning_rate": 0.0003766836540256148, + "loss": 0.1834, + "step": 239700 + }, + { + "epoch": 9.93, + "grad_norm": 0.63671875, + "learning_rate": 0.00037667430424120175, + "loss": 0.1526, + "step": 239710 + }, + { + "epoch": 9.93, + "grad_norm": 0.74609375, + "learning_rate": 0.0003766649542184013, + "loss": 0.2097, + "step": 239720 + }, + { + "epoch": 9.93, + "grad_norm": 0.64453125, + "learning_rate": 0.00037665560395723106, + "loss": 0.1865, + "step": 239730 + }, + { + "epoch": 9.93, + "grad_norm": 0.515625, + "learning_rate": 0.00037664625345770877, + "loss": 0.1679, + "step": 239740 + }, + { + "epoch": 9.93, + "grad_norm": 0.55859375, + "learning_rate": 0.00037663690271985185, + "loss": 0.2098, + "step": 239750 + }, + { + "epoch": 9.93, + "grad_norm": 1.375, + "learning_rate": 0.0003766275517436779, + "loss": 0.2157, + "step": 239760 + }, + { + "epoch": 9.93, + "grad_norm": 1.7109375, + "learning_rate": 0.0003766182005292046, + "loss": 0.1698, + "step": 239770 + }, + { + "epoch": 9.93, + "grad_norm": 0.921875, + "learning_rate": 0.0003766088490764495, + "loss": 0.1971, + "step": 239780 + }, + { + "epoch": 9.93, + "grad_norm": 0.48828125, + "learning_rate": 0.00037659949738543024, + "loss": 0.1774, + "step": 239790 + }, + { + "epoch": 9.93, + "grad_norm": 0.78515625, + "learning_rate": 0.0003765901454561644, + "loss": 0.2173, + "step": 239800 + }, + { + "epoch": 9.93, + "grad_norm": 0.984375, + "learning_rate": 0.00037658079328866957, + "loss": 0.2503, + "step": 239810 + }, + { + "epoch": 9.93, + "grad_norm": 1.5234375, + "learning_rate": 0.0003765714408829634, + "loss": 0.2061, + "step": 239820 + }, + { + "epoch": 9.93, + "grad_norm": 0.3125, + "learning_rate": 0.0003765620882390634, + "loss": 0.2594, + "step": 239830 + }, + { + "epoch": 9.93, + "grad_norm": 0.6171875, + "learning_rate": 0.0003765527353569872, + "loss": 0.2161, + "step": 239840 + }, + { + "epoch": 9.93, + "grad_norm": 0.578125, + "learning_rate": 0.0003765433822367525, + "loss": 0.1613, + "step": 239850 + }, + { + "epoch": 9.93, + "grad_norm": 0.55859375, + "learning_rate": 0.0003765340288783767, + "loss": 0.196, + "step": 239860 + }, + { + "epoch": 9.94, + "grad_norm": 0.83984375, + "learning_rate": 0.0003765246752818776, + "loss": 0.2344, + "step": 239870 + }, + { + "epoch": 9.94, + "grad_norm": 1.1953125, + "learning_rate": 0.0003765153214472727, + "loss": 0.1909, + "step": 239880 + }, + { + "epoch": 9.94, + "grad_norm": 0.89453125, + "learning_rate": 0.00037650596737457965, + "loss": 0.2352, + "step": 239890 + }, + { + "epoch": 9.94, + "grad_norm": 1.015625, + "learning_rate": 0.00037649661306381606, + "loss": 0.1948, + "step": 239900 + }, + { + "epoch": 9.94, + "grad_norm": 0.76953125, + "learning_rate": 0.00037648725851499945, + "loss": 0.1841, + "step": 239910 + }, + { + "epoch": 9.94, + "grad_norm": 0.69140625, + "learning_rate": 0.0003764779037281475, + "loss": 0.2168, + "step": 239920 + }, + { + "epoch": 9.94, + "grad_norm": 2.125, + "learning_rate": 0.0003764685487032777, + "loss": 0.215, + "step": 239930 + }, + { + "epoch": 9.94, + "grad_norm": 2.890625, + "learning_rate": 0.0003764591934404079, + "loss": 0.2272, + "step": 239940 + }, + { + "epoch": 9.94, + "grad_norm": 0.345703125, + "learning_rate": 0.0003764498379395555, + "loss": 0.1679, + "step": 239950 + }, + { + "epoch": 9.94, + "grad_norm": 0.76171875, + "learning_rate": 0.00037644048220073813, + "loss": 0.1565, + "step": 239960 + }, + { + "epoch": 9.94, + "grad_norm": 0.5859375, + "learning_rate": 0.0003764311262239735, + "loss": 0.1476, + "step": 239970 + }, + { + "epoch": 9.94, + "grad_norm": 1.125, + "learning_rate": 0.0003764217700092791, + "loss": 0.1718, + "step": 239980 + }, + { + "epoch": 9.94, + "grad_norm": 0.8515625, + "learning_rate": 0.0003764124135566725, + "loss": 0.2323, + "step": 239990 + }, + { + "epoch": 9.94, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003764030568661715, + "loss": 0.2202, + "step": 240000 + }, + { + "epoch": 9.94, + "grad_norm": 0.478515625, + "learning_rate": 0.00037639369993779353, + "loss": 0.2207, + "step": 240010 + }, + { + "epoch": 9.94, + "grad_norm": 0.294921875, + "learning_rate": 0.0003763843427715563, + "loss": 0.1644, + "step": 240020 + }, + { + "epoch": 9.94, + "grad_norm": 1.5390625, + "learning_rate": 0.0003763749853674774, + "loss": 0.1996, + "step": 240030 + }, + { + "epoch": 9.94, + "grad_norm": 0.81640625, + "learning_rate": 0.00037636562772557435, + "loss": 0.1976, + "step": 240040 + }, + { + "epoch": 9.94, + "grad_norm": 0.55859375, + "learning_rate": 0.0003763562698458649, + "loss": 0.1931, + "step": 240050 + }, + { + "epoch": 9.94, + "grad_norm": 1.328125, + "learning_rate": 0.0003763469117283666, + "loss": 0.2063, + "step": 240060 + }, + { + "epoch": 9.94, + "grad_norm": 1.109375, + "learning_rate": 0.00037633755337309705, + "loss": 0.1829, + "step": 240070 + }, + { + "epoch": 9.94, + "grad_norm": 0.333984375, + "learning_rate": 0.00037632819478007384, + "loss": 0.2005, + "step": 240080 + }, + { + "epoch": 9.94, + "grad_norm": 0.7890625, + "learning_rate": 0.00037631883594931465, + "loss": 0.2077, + "step": 240090 + }, + { + "epoch": 9.94, + "grad_norm": 0.4140625, + "learning_rate": 0.00037630947688083703, + "loss": 0.2317, + "step": 240100 + }, + { + "epoch": 9.95, + "grad_norm": 0.5078125, + "learning_rate": 0.00037630011757465865, + "loss": 0.1864, + "step": 240110 + }, + { + "epoch": 9.95, + "grad_norm": 0.333984375, + "learning_rate": 0.00037629075803079703, + "loss": 0.1822, + "step": 240120 + }, + { + "epoch": 9.95, + "grad_norm": 0.0, + "learning_rate": 0.0003762813982492699, + "loss": 0.2039, + "step": 240130 + }, + { + "epoch": 9.95, + "grad_norm": 0.447265625, + "learning_rate": 0.0003762720382300947, + "loss": 0.2347, + "step": 240140 + }, + { + "epoch": 9.95, + "grad_norm": 0.66796875, + "learning_rate": 0.00037626267797328925, + "loss": 0.1452, + "step": 240150 + }, + { + "epoch": 9.95, + "grad_norm": 0.78515625, + "learning_rate": 0.00037625331747887116, + "loss": 0.1792, + "step": 240160 + }, + { + "epoch": 9.95, + "grad_norm": 0.6640625, + "learning_rate": 0.00037624395674685785, + "loss": 0.2051, + "step": 240170 + }, + { + "epoch": 9.95, + "grad_norm": 0.63671875, + "learning_rate": 0.000376234595777267, + "loss": 0.2141, + "step": 240180 + }, + { + "epoch": 9.95, + "grad_norm": 0.66015625, + "learning_rate": 0.00037622523457011637, + "loss": 0.2024, + "step": 240190 + }, + { + "epoch": 9.95, + "grad_norm": 0.66015625, + "learning_rate": 0.00037621587312542347, + "loss": 0.1749, + "step": 240200 + }, + { + "epoch": 9.95, + "grad_norm": 0.6171875, + "learning_rate": 0.00037620651144320584, + "loss": 0.1749, + "step": 240210 + }, + { + "epoch": 9.95, + "grad_norm": 1.1484375, + "learning_rate": 0.00037619714952348126, + "loss": 0.215, + "step": 240220 + }, + { + "epoch": 9.95, + "grad_norm": 1.7265625, + "learning_rate": 0.00037618778736626723, + "loss": 0.1977, + "step": 240230 + }, + { + "epoch": 9.95, + "grad_norm": 0.1806640625, + "learning_rate": 0.0003761784249715815, + "loss": 0.1742, + "step": 240240 + }, + { + "epoch": 9.95, + "grad_norm": 0.60546875, + "learning_rate": 0.00037616906233944153, + "loss": 0.1676, + "step": 240250 + }, + { + "epoch": 9.95, + "grad_norm": 0.55859375, + "learning_rate": 0.00037615969946986505, + "loss": 0.1501, + "step": 240260 + }, + { + "epoch": 9.95, + "grad_norm": 0.94140625, + "learning_rate": 0.00037615033636286953, + "loss": 0.2202, + "step": 240270 + }, + { + "epoch": 9.95, + "grad_norm": 0.146484375, + "learning_rate": 0.00037614097301847283, + "loss": 0.1994, + "step": 240280 + }, + { + "epoch": 9.95, + "grad_norm": 0.71484375, + "learning_rate": 0.00037613160943669244, + "loss": 0.1985, + "step": 240290 + }, + { + "epoch": 9.95, + "grad_norm": 0.396484375, + "learning_rate": 0.00037612224561754594, + "loss": 0.1736, + "step": 240300 + }, + { + "epoch": 9.95, + "grad_norm": 0.56640625, + "learning_rate": 0.00037611288156105094, + "loss": 0.1943, + "step": 240310 + }, + { + "epoch": 9.95, + "grad_norm": 0.65625, + "learning_rate": 0.0003761035172672252, + "loss": 0.1995, + "step": 240320 + }, + { + "epoch": 9.95, + "grad_norm": 0.9765625, + "learning_rate": 0.0003760941527360862, + "loss": 0.1618, + "step": 240330 + }, + { + "epoch": 9.95, + "grad_norm": 0.8671875, + "learning_rate": 0.0003760847879676517, + "loss": 0.2137, + "step": 240340 + }, + { + "epoch": 9.96, + "grad_norm": 0.8984375, + "learning_rate": 0.0003760754229619391, + "loss": 0.212, + "step": 240350 + }, + { + "epoch": 9.96, + "grad_norm": 0.83984375, + "learning_rate": 0.0003760660577189663, + "loss": 0.1845, + "step": 240360 + }, + { + "epoch": 9.96, + "grad_norm": 0.73828125, + "learning_rate": 0.00037605669223875074, + "loss": 0.2105, + "step": 240370 + }, + { + "epoch": 9.96, + "grad_norm": 0.58984375, + "learning_rate": 0.0003760473265213101, + "loss": 0.1757, + "step": 240380 + }, + { + "epoch": 9.96, + "grad_norm": 1.0390625, + "learning_rate": 0.000376037960566662, + "loss": 0.1769, + "step": 240390 + }, + { + "epoch": 9.96, + "grad_norm": 1.015625, + "learning_rate": 0.00037602859437482414, + "loss": 0.2149, + "step": 240400 + }, + { + "epoch": 9.96, + "grad_norm": 0.92578125, + "learning_rate": 0.000376019227945814, + "loss": 0.1809, + "step": 240410 + }, + { + "epoch": 9.96, + "grad_norm": 0.62890625, + "learning_rate": 0.00037600986127964934, + "loss": 0.1546, + "step": 240420 + }, + { + "epoch": 9.96, + "grad_norm": 0.66796875, + "learning_rate": 0.00037600049437634767, + "loss": 0.1845, + "step": 240430 + }, + { + "epoch": 9.96, + "grad_norm": 1.09375, + "learning_rate": 0.00037599112723592665, + "loss": 0.221, + "step": 240440 + }, + { + "epoch": 9.96, + "grad_norm": 0.40234375, + "learning_rate": 0.000375981759858404, + "loss": 0.2057, + "step": 240450 + }, + { + "epoch": 9.96, + "grad_norm": 0.93359375, + "learning_rate": 0.00037597239224379725, + "loss": 0.1705, + "step": 240460 + }, + { + "epoch": 9.96, + "grad_norm": 0.703125, + "learning_rate": 0.00037596302439212405, + "loss": 0.2079, + "step": 240470 + }, + { + "epoch": 9.96, + "grad_norm": 0.5234375, + "learning_rate": 0.0003759536563034021, + "loss": 0.1883, + "step": 240480 + }, + { + "epoch": 9.96, + "grad_norm": 0.40234375, + "learning_rate": 0.0003759442879776489, + "loss": 0.1987, + "step": 240490 + }, + { + "epoch": 9.96, + "grad_norm": 0.96875, + "learning_rate": 0.00037593491941488225, + "loss": 0.1781, + "step": 240500 + }, + { + "epoch": 9.96, + "grad_norm": 0.357421875, + "learning_rate": 0.00037592555061511957, + "loss": 0.1918, + "step": 240510 + }, + { + "epoch": 9.96, + "grad_norm": 0.61328125, + "learning_rate": 0.0003759161815783787, + "loss": 0.21, + "step": 240520 + }, + { + "epoch": 9.96, + "grad_norm": 0.63671875, + "learning_rate": 0.0003759068123046771, + "loss": 0.178, + "step": 240530 + }, + { + "epoch": 9.96, + "grad_norm": 1.046875, + "learning_rate": 0.0003758974427940325, + "loss": 0.2582, + "step": 240540 + }, + { + "epoch": 9.96, + "grad_norm": 0.94140625, + "learning_rate": 0.00037588807304646244, + "loss": 0.1565, + "step": 240550 + }, + { + "epoch": 9.96, + "grad_norm": 0.89453125, + "learning_rate": 0.00037587870306198467, + "loss": 0.1737, + "step": 240560 + }, + { + "epoch": 9.96, + "grad_norm": 0.72265625, + "learning_rate": 0.00037586933284061686, + "loss": 0.2066, + "step": 240570 + }, + { + "epoch": 9.96, + "grad_norm": 0.44921875, + "learning_rate": 0.0003758599623823765, + "loss": 0.1778, + "step": 240580 + }, + { + "epoch": 9.97, + "grad_norm": 0.59375, + "learning_rate": 0.0003758505916872813, + "loss": 0.1867, + "step": 240590 + }, + { + "epoch": 9.97, + "grad_norm": 1.1171875, + "learning_rate": 0.00037584122075534886, + "loss": 0.1987, + "step": 240600 + }, + { + "epoch": 9.97, + "grad_norm": 0.427734375, + "learning_rate": 0.0003758318495865968, + "loss": 0.1931, + "step": 240610 + }, + { + "epoch": 9.97, + "grad_norm": 0.365234375, + "learning_rate": 0.0003758224781810428, + "loss": 0.177, + "step": 240620 + }, + { + "epoch": 9.97, + "grad_norm": 1.75, + "learning_rate": 0.00037581310653870447, + "loss": 0.2217, + "step": 240630 + }, + { + "epoch": 9.97, + "grad_norm": 1.0, + "learning_rate": 0.0003758037346595995, + "loss": 0.1692, + "step": 240640 + }, + { + "epoch": 9.97, + "grad_norm": 0.6640625, + "learning_rate": 0.0003757943625437455, + "loss": 0.1904, + "step": 240650 + }, + { + "epoch": 9.97, + "grad_norm": 0.6875, + "learning_rate": 0.00037578499019116007, + "loss": 0.1785, + "step": 240660 + }, + { + "epoch": 9.97, + "grad_norm": 0.66015625, + "learning_rate": 0.00037577561760186083, + "loss": 0.2368, + "step": 240670 + }, + { + "epoch": 9.97, + "grad_norm": 0.52734375, + "learning_rate": 0.0003757662447758655, + "loss": 0.2024, + "step": 240680 + }, + { + "epoch": 9.97, + "grad_norm": 0.91796875, + "learning_rate": 0.0003757568717131917, + "loss": 0.1751, + "step": 240690 + }, + { + "epoch": 9.97, + "grad_norm": 0.95703125, + "learning_rate": 0.000375747498413857, + "loss": 0.1844, + "step": 240700 + }, + { + "epoch": 9.97, + "grad_norm": 0.65625, + "learning_rate": 0.00037573812487787917, + "loss": 0.1993, + "step": 240710 + }, + { + "epoch": 9.97, + "grad_norm": 0.8359375, + "learning_rate": 0.0003757287511052757, + "loss": 0.2338, + "step": 240720 + }, + { + "epoch": 9.97, + "grad_norm": 0.8671875, + "learning_rate": 0.0003757193770960643, + "loss": 0.2083, + "step": 240730 + }, + { + "epoch": 9.97, + "grad_norm": 0.333984375, + "learning_rate": 0.00037571000285026263, + "loss": 0.2007, + "step": 240740 + }, + { + "epoch": 9.97, + "grad_norm": 0.9609375, + "learning_rate": 0.00037570062836788835, + "loss": 0.1812, + "step": 240750 + }, + { + "epoch": 9.97, + "grad_norm": 0.29296875, + "learning_rate": 0.000375691253648959, + "loss": 0.2272, + "step": 240760 + }, + { + "epoch": 9.97, + "grad_norm": 0.61328125, + "learning_rate": 0.00037568187869349226, + "loss": 0.2029, + "step": 240770 + }, + { + "epoch": 9.97, + "grad_norm": 1.03125, + "learning_rate": 0.00037567250350150586, + "loss": 0.1515, + "step": 240780 + }, + { + "epoch": 9.97, + "grad_norm": 0.6953125, + "learning_rate": 0.0003756631280730173, + "loss": 0.1986, + "step": 240790 + }, + { + "epoch": 9.97, + "grad_norm": 0.5703125, + "learning_rate": 0.00037565375240804433, + "loss": 0.1783, + "step": 240800 + }, + { + "epoch": 9.97, + "grad_norm": 1.6796875, + "learning_rate": 0.00037564437650660466, + "loss": 0.166, + "step": 240810 + }, + { + "epoch": 9.97, + "grad_norm": 0.67578125, + "learning_rate": 0.00037563500036871567, + "loss": 0.1794, + "step": 240820 + }, + { + "epoch": 9.98, + "grad_norm": 0.70703125, + "learning_rate": 0.00037562562399439536, + "loss": 0.1965, + "step": 240830 + }, + { + "epoch": 9.98, + "grad_norm": 1.2421875, + "learning_rate": 0.00037561624738366114, + "loss": 0.1804, + "step": 240840 + }, + { + "epoch": 9.98, + "grad_norm": 0.431640625, + "learning_rate": 0.0003756068705365306, + "loss": 0.1896, + "step": 240850 + }, + { + "epoch": 9.98, + "grad_norm": 0.85546875, + "learning_rate": 0.00037559749345302164, + "loss": 0.191, + "step": 240860 + }, + { + "epoch": 9.98, + "grad_norm": 0.5703125, + "learning_rate": 0.0003755881161331517, + "loss": 0.1795, + "step": 240870 + }, + { + "epoch": 9.98, + "grad_norm": 0.5703125, + "learning_rate": 0.0003755787385769385, + "loss": 0.2225, + "step": 240880 + }, + { + "epoch": 9.98, + "grad_norm": 1.0390625, + "learning_rate": 0.0003755693607843996, + "loss": 0.2116, + "step": 240890 + }, + { + "epoch": 9.98, + "grad_norm": 0.55859375, + "learning_rate": 0.00037555998275555275, + "loss": 0.1931, + "step": 240900 + }, + { + "epoch": 9.98, + "grad_norm": 0.45703125, + "learning_rate": 0.00037555060449041567, + "loss": 0.2225, + "step": 240910 + }, + { + "epoch": 9.98, + "grad_norm": 0.5546875, + "learning_rate": 0.00037554122598900584, + "loss": 0.2185, + "step": 240920 + }, + { + "epoch": 9.98, + "grad_norm": 0.8828125, + "learning_rate": 0.00037553184725134094, + "loss": 0.1738, + "step": 240930 + }, + { + "epoch": 9.98, + "grad_norm": 1.234375, + "learning_rate": 0.00037552246827743873, + "loss": 0.1735, + "step": 240940 + }, + { + "epoch": 9.98, + "grad_norm": 0.38671875, + "learning_rate": 0.00037551308906731674, + "loss": 0.2615, + "step": 240950 + }, + { + "epoch": 9.98, + "grad_norm": 1.125, + "learning_rate": 0.0003755037096209927, + "loss": 0.2118, + "step": 240960 + }, + { + "epoch": 9.98, + "grad_norm": 0.65625, + "learning_rate": 0.00037549432993848434, + "loss": 0.1859, + "step": 240970 + }, + { + "epoch": 9.98, + "grad_norm": 1.6640625, + "learning_rate": 0.00037548495001980895, + "loss": 0.2083, + "step": 240980 + }, + { + "epoch": 9.98, + "grad_norm": 1.125, + "learning_rate": 0.00037547556986498463, + "loss": 0.2163, + "step": 240990 + }, + { + "epoch": 9.98, + "grad_norm": 2.140625, + "learning_rate": 0.0003754661894740288, + "loss": 0.1884, + "step": 241000 + }, + { + "epoch": 9.98, + "grad_norm": 1.0234375, + "learning_rate": 0.0003754568088469591, + "loss": 0.2071, + "step": 241010 + }, + { + "epoch": 9.98, + "grad_norm": 1.1875, + "learning_rate": 0.00037544742798379326, + "loss": 0.1536, + "step": 241020 + }, + { + "epoch": 9.98, + "grad_norm": 1.8359375, + "learning_rate": 0.00037543804688454894, + "loss": 0.1694, + "step": 241030 + }, + { + "epoch": 9.98, + "grad_norm": 1.328125, + "learning_rate": 0.0003754286655492437, + "loss": 0.1701, + "step": 241040 + }, + { + "epoch": 9.98, + "grad_norm": 0.64453125, + "learning_rate": 0.0003754192839778954, + "loss": 0.2163, + "step": 241050 + }, + { + "epoch": 9.98, + "grad_norm": 0.50390625, + "learning_rate": 0.00037540990217052134, + "loss": 0.1796, + "step": 241060 + }, + { + "epoch": 9.99, + "grad_norm": 1.171875, + "learning_rate": 0.00037540052012713956, + "loss": 0.198, + "step": 241070 + }, + { + "epoch": 9.99, + "grad_norm": 0.7734375, + "learning_rate": 0.0003753911378477675, + "loss": 0.2127, + "step": 241080 + }, + { + "epoch": 9.99, + "grad_norm": 0.546875, + "learning_rate": 0.00037538175533242274, + "loss": 0.1964, + "step": 241090 + }, + { + "epoch": 9.99, + "grad_norm": 0.5859375, + "learning_rate": 0.00037537237258112324, + "loss": 0.2588, + "step": 241100 + }, + { + "epoch": 9.99, + "grad_norm": 0.69921875, + "learning_rate": 0.0003753629895938864, + "loss": 0.2043, + "step": 241110 + }, + { + "epoch": 9.99, + "grad_norm": 0.431640625, + "learning_rate": 0.00037535360637072987, + "loss": 0.1972, + "step": 241120 + }, + { + "epoch": 9.99, + "grad_norm": 0.4921875, + "learning_rate": 0.0003753442229116715, + "loss": 0.2326, + "step": 241130 + }, + { + "epoch": 9.99, + "grad_norm": 1.5703125, + "learning_rate": 0.0003753348392167287, + "loss": 0.1884, + "step": 241140 + }, + { + "epoch": 9.99, + "grad_norm": 1.171875, + "learning_rate": 0.00037532545528591945, + "loss": 0.2789, + "step": 241150 + }, + { + "epoch": 9.99, + "grad_norm": 0.64453125, + "learning_rate": 0.00037531607111926114, + "loss": 0.1784, + "step": 241160 + }, + { + "epoch": 9.99, + "grad_norm": 0.5234375, + "learning_rate": 0.0003753066867167715, + "loss": 0.1502, + "step": 241170 + }, + { + "epoch": 9.99, + "grad_norm": 0.0, + "learning_rate": 0.0003752973020784682, + "loss": 0.2215, + "step": 241180 + }, + { + "epoch": 9.99, + "grad_norm": 0.40625, + "learning_rate": 0.00037528791720436896, + "loss": 0.2479, + "step": 241190 + }, + { + "epoch": 9.99, + "grad_norm": 1.03125, + "learning_rate": 0.00037527853209449135, + "loss": 0.1611, + "step": 241200 + }, + { + "epoch": 9.99, + "grad_norm": 0.90234375, + "learning_rate": 0.0003752691467488531, + "loss": 0.251, + "step": 241210 + }, + { + "epoch": 9.99, + "grad_norm": 1.8125, + "learning_rate": 0.00037525976116747174, + "loss": 0.2066, + "step": 241220 + }, + { + "epoch": 9.99, + "grad_norm": 0.62109375, + "learning_rate": 0.0003752503753503652, + "loss": 0.2145, + "step": 241230 + }, + { + "epoch": 9.99, + "grad_norm": 0.8203125, + "learning_rate": 0.0003752409892975509, + "loss": 0.1815, + "step": 241240 + }, + { + "epoch": 9.99, + "grad_norm": 0.423828125, + "learning_rate": 0.00037523160300904657, + "loss": 0.1361, + "step": 241250 + }, + { + "epoch": 9.99, + "grad_norm": 0.69921875, + "learning_rate": 0.00037522221648486994, + "loss": 0.1711, + "step": 241260 + }, + { + "epoch": 9.99, + "grad_norm": 0.63671875, + "learning_rate": 0.00037521282972503855, + "loss": 0.1749, + "step": 241270 + }, + { + "epoch": 9.99, + "grad_norm": 1.0, + "learning_rate": 0.0003752034427295702, + "loss": 0.2631, + "step": 241280 + }, + { + "epoch": 9.99, + "grad_norm": 1.0390625, + "learning_rate": 0.0003751940554984825, + "loss": 0.1567, + "step": 241290 + }, + { + "epoch": 9.99, + "grad_norm": 0.3671875, + "learning_rate": 0.00037518466803179295, + "loss": 0.1664, + "step": 241300 + }, + { + "epoch": 10.0, + "grad_norm": 0.60546875, + "learning_rate": 0.0003751752803295195, + "loss": 0.1916, + "step": 241310 + }, + { + "epoch": 10.0, + "grad_norm": 0.5625, + "learning_rate": 0.00037516589239167976, + "loss": 0.1971, + "step": 241320 + }, + { + "epoch": 10.0, + "grad_norm": 0.65234375, + "learning_rate": 0.00037515650421829117, + "loss": 0.2227, + "step": 241330 + }, + { + "epoch": 10.0, + "grad_norm": 0.0, + "learning_rate": 0.00037514711580937174, + "loss": 0.2301, + "step": 241340 + }, + { + "epoch": 10.0, + "grad_norm": 0.625, + "learning_rate": 0.00037513772716493883, + "loss": 0.1795, + "step": 241350 + }, + { + "epoch": 10.0, + "grad_norm": 0.443359375, + "learning_rate": 0.00037512833828501026, + "loss": 0.2143, + "step": 241360 + }, + { + "epoch": 10.0, + "grad_norm": 1.0546875, + "learning_rate": 0.00037511894916960367, + "loss": 0.2047, + "step": 241370 + }, + { + "epoch": 10.0, + "grad_norm": 0.578125, + "learning_rate": 0.0003751095598187366, + "loss": 0.1884, + "step": 241380 + }, + { + "epoch": 10.0, + "grad_norm": 0.53515625, + "learning_rate": 0.000375100170232427, + "loss": 0.2009, + "step": 241390 + }, + { + "epoch": 10.0, + "grad_norm": 0.56640625, + "learning_rate": 0.0003750907804106924, + "loss": 0.2216, + "step": 241400 + }, + { + "epoch": 10.0, + "grad_norm": 1.1171875, + "learning_rate": 0.00037508139035355036, + "loss": 0.2359, + "step": 241410 + }, + { + "epoch": 10.0, + "grad_norm": 0.474609375, + "learning_rate": 0.00037507200006101885, + "loss": 0.1836, + "step": 241420 + }, + { + "epoch": 10.0, + "grad_norm": 0.87890625, + "learning_rate": 0.0003750626095331151, + "loss": 0.201, + "step": 241430 + }, + { + "epoch": 10.0, + "grad_norm": 1.3671875, + "learning_rate": 0.00037505321876985713, + "loss": 0.2095, + "step": 241440 + }, + { + "epoch": 10.0, + "grad_norm": 0.5234375, + "learning_rate": 0.0003750438277712625, + "loss": 0.1954, + "step": 241450 + }, + { + "epoch": 10.0, + "grad_norm": 0.498046875, + "learning_rate": 0.0003750344365373489, + "loss": 0.2054, + "step": 241460 + }, + { + "epoch": 10.0, + "grad_norm": 0.94921875, + "learning_rate": 0.00037502504506813403, + "loss": 0.1914, + "step": 241470 + }, + { + "epoch": 10.0, + "grad_norm": 2.09375, + "learning_rate": 0.0003750156533636354, + "loss": 0.1443, + "step": 241480 + }, + { + "epoch": 10.0, + "grad_norm": 1.203125, + "learning_rate": 0.00037500626142387093, + "loss": 0.1577, + "step": 241490 + }, + { + "epoch": 10.0, + "grad_norm": 1.0625, + "learning_rate": 0.00037499686924885817, + "loss": 0.169, + "step": 241500 + }, + { + "epoch": 10.0, + "grad_norm": 0.279296875, + "learning_rate": 0.0003749874768386148, + "loss": 0.1251, + "step": 241510 + }, + { + "epoch": 10.0, + "grad_norm": 1.3203125, + "learning_rate": 0.00037497808419315845, + "loss": 0.2339, + "step": 241520 + }, + { + "epoch": 10.0, + "grad_norm": 0.765625, + "learning_rate": 0.00037496869131250685, + "loss": 0.2122, + "step": 241530 + }, + { + "epoch": 10.0, + "grad_norm": 0.9921875, + "learning_rate": 0.00037495929819667765, + "loss": 0.238, + "step": 241540 + }, + { + "epoch": 10.0, + "grad_norm": 1.0703125, + "learning_rate": 0.0003749499048456886, + "loss": 0.2373, + "step": 241550 + }, + { + "epoch": 10.01, + "grad_norm": 0.88671875, + "learning_rate": 0.00037494051125955724, + "loss": 0.2453, + "step": 241560 + }, + { + "epoch": 10.01, + "grad_norm": 0.92578125, + "learning_rate": 0.0003749311174383014, + "loss": 0.1923, + "step": 241570 + }, + { + "epoch": 10.01, + "grad_norm": 0.0, + "learning_rate": 0.0003749217233819386, + "loss": 0.1599, + "step": 241580 + }, + { + "epoch": 10.01, + "grad_norm": 0.66015625, + "learning_rate": 0.00037491232909048677, + "loss": 0.2493, + "step": 241590 + }, + { + "epoch": 10.01, + "grad_norm": 0.33984375, + "learning_rate": 0.0003749029345639633, + "loss": 0.244, + "step": 241600 + }, + { + "epoch": 10.01, + "grad_norm": 0.6875, + "learning_rate": 0.00037489353980238606, + "loss": 0.2056, + "step": 241610 + }, + { + "epoch": 10.01, + "grad_norm": 1.3984375, + "learning_rate": 0.0003748841448057726, + "loss": 0.192, + "step": 241620 + }, + { + "epoch": 10.01, + "grad_norm": 0.88671875, + "learning_rate": 0.00037487474957414074, + "loss": 0.1477, + "step": 241630 + }, + { + "epoch": 10.01, + "grad_norm": 0.734375, + "learning_rate": 0.00037486535410750807, + "loss": 0.2138, + "step": 241640 + }, + { + "epoch": 10.01, + "grad_norm": 0.78125, + "learning_rate": 0.0003748559584058922, + "loss": 0.1817, + "step": 241650 + }, + { + "epoch": 10.01, + "grad_norm": 0.78125, + "learning_rate": 0.000374846562469311, + "loss": 0.1616, + "step": 241660 + }, + { + "epoch": 10.01, + "grad_norm": 0.6796875, + "learning_rate": 0.000374837166297782, + "loss": 0.1699, + "step": 241670 + }, + { + "epoch": 10.01, + "grad_norm": 1.125, + "learning_rate": 0.00037482776989132294, + "loss": 0.2342, + "step": 241680 + }, + { + "epoch": 10.01, + "grad_norm": 1.453125, + "learning_rate": 0.00037481837324995147, + "loss": 0.1946, + "step": 241690 + }, + { + "epoch": 10.01, + "grad_norm": 1.0, + "learning_rate": 0.00037480897637368537, + "loss": 0.2049, + "step": 241700 + }, + { + "epoch": 10.01, + "grad_norm": 0.69921875, + "learning_rate": 0.00037479957926254227, + "loss": 0.1842, + "step": 241710 + }, + { + "epoch": 10.01, + "grad_norm": 0.427734375, + "learning_rate": 0.0003747901819165398, + "loss": 0.1917, + "step": 241720 + }, + { + "epoch": 10.01, + "grad_norm": 1.3203125, + "learning_rate": 0.00037478078433569563, + "loss": 0.1356, + "step": 241730 + }, + { + "epoch": 10.01, + "grad_norm": 1.1640625, + "learning_rate": 0.00037477138652002755, + "loss": 0.1691, + "step": 241740 + }, + { + "epoch": 10.01, + "grad_norm": 0.96875, + "learning_rate": 0.0003747619884695532, + "loss": 0.2296, + "step": 241750 + }, + { + "epoch": 10.01, + "grad_norm": 0.9609375, + "learning_rate": 0.0003747525901842903, + "loss": 0.1752, + "step": 241760 + }, + { + "epoch": 10.01, + "grad_norm": 2.765625, + "learning_rate": 0.00037474319166425643, + "loss": 0.2015, + "step": 241770 + }, + { + "epoch": 10.01, + "grad_norm": 1.0703125, + "learning_rate": 0.00037473379290946934, + "loss": 0.2167, + "step": 241780 + }, + { + "epoch": 10.01, + "grad_norm": 0.447265625, + "learning_rate": 0.00037472439391994674, + "loss": 0.1617, + "step": 241790 + }, + { + "epoch": 10.02, + "grad_norm": 1.1484375, + "learning_rate": 0.00037471499469570634, + "loss": 0.1734, + "step": 241800 + }, + { + "epoch": 10.02, + "grad_norm": 1.9609375, + "learning_rate": 0.0003747055952367657, + "loss": 0.1843, + "step": 241810 + }, + { + "epoch": 10.02, + "grad_norm": 0.69140625, + "learning_rate": 0.00037469619554314265, + "loss": 0.2108, + "step": 241820 + }, + { + "epoch": 10.02, + "grad_norm": 1.0546875, + "learning_rate": 0.00037468679561485487, + "loss": 0.2011, + "step": 241830 + }, + { + "epoch": 10.02, + "grad_norm": 0.5859375, + "learning_rate": 0.00037467739545192, + "loss": 0.1769, + "step": 241840 + }, + { + "epoch": 10.02, + "grad_norm": 0.91015625, + "learning_rate": 0.00037466799505435567, + "loss": 0.228, + "step": 241850 + }, + { + "epoch": 10.02, + "grad_norm": 1.078125, + "learning_rate": 0.00037465859442217965, + "loss": 0.2195, + "step": 241860 + }, + { + "epoch": 10.02, + "grad_norm": 1.5078125, + "learning_rate": 0.0003746491935554096, + "loss": 0.2212, + "step": 241870 + }, + { + "epoch": 10.02, + "grad_norm": 0.57421875, + "learning_rate": 0.0003746397924540633, + "loss": 0.1972, + "step": 241880 + }, + { + "epoch": 10.02, + "grad_norm": 0.69921875, + "learning_rate": 0.00037463039111815835, + "loss": 0.1939, + "step": 241890 + }, + { + "epoch": 10.02, + "grad_norm": 0.52734375, + "learning_rate": 0.0003746209895477124, + "loss": 0.1977, + "step": 241900 + }, + { + "epoch": 10.02, + "grad_norm": 0.318359375, + "learning_rate": 0.0003746115877427432, + "loss": 0.1954, + "step": 241910 + }, + { + "epoch": 10.02, + "grad_norm": 1.484375, + "learning_rate": 0.00037460218570326855, + "loss": 0.2015, + "step": 241920 + }, + { + "epoch": 10.02, + "grad_norm": 0.328125, + "learning_rate": 0.00037459278342930595, + "loss": 0.213, + "step": 241930 + }, + { + "epoch": 10.02, + "grad_norm": 0.76953125, + "learning_rate": 0.00037458338092087324, + "loss": 0.1818, + "step": 241940 + }, + { + "epoch": 10.02, + "grad_norm": 0.65625, + "learning_rate": 0.00037457397817798805, + "loss": 0.1395, + "step": 241950 + }, + { + "epoch": 10.02, + "grad_norm": 0.67578125, + "learning_rate": 0.0003745645752006681, + "loss": 0.1606, + "step": 241960 + }, + { + "epoch": 10.02, + "grad_norm": 0.76953125, + "learning_rate": 0.00037455517198893107, + "loss": 0.1995, + "step": 241970 + }, + { + "epoch": 10.02, + "grad_norm": 2.203125, + "learning_rate": 0.00037454576854279467, + "loss": 0.2114, + "step": 241980 + }, + { + "epoch": 10.02, + "grad_norm": 0.458984375, + "learning_rate": 0.0003745363648622765, + "loss": 0.1956, + "step": 241990 + }, + { + "epoch": 10.02, + "grad_norm": 0.64453125, + "learning_rate": 0.0003745269609473944, + "loss": 0.2362, + "step": 242000 + }, + { + "epoch": 10.02, + "grad_norm": 1.1328125, + "learning_rate": 0.000374517556798166, + "loss": 0.2286, + "step": 242010 + }, + { + "epoch": 10.02, + "grad_norm": 0.7890625, + "learning_rate": 0.000374508152414609, + "loss": 0.1789, + "step": 242020 + }, + { + "epoch": 10.02, + "grad_norm": 0.345703125, + "learning_rate": 0.0003744987477967411, + "loss": 0.1379, + "step": 242030 + }, + { + "epoch": 10.03, + "grad_norm": 0.765625, + "learning_rate": 0.00037448934294458005, + "loss": 0.2635, + "step": 242040 + }, + { + "epoch": 10.03, + "grad_norm": 1.7578125, + "learning_rate": 0.00037447993785814346, + "loss": 0.2089, + "step": 242050 + }, + { + "epoch": 10.03, + "grad_norm": 0.83984375, + "learning_rate": 0.00037447053253744905, + "loss": 0.1508, + "step": 242060 + }, + { + "epoch": 10.03, + "grad_norm": 0.37890625, + "learning_rate": 0.00037446112698251454, + "loss": 0.1935, + "step": 242070 + }, + { + "epoch": 10.03, + "grad_norm": 0.57421875, + "learning_rate": 0.0003744517211933576, + "loss": 0.2184, + "step": 242080 + }, + { + "epoch": 10.03, + "grad_norm": 0.703125, + "learning_rate": 0.0003744423151699961, + "loss": 0.1953, + "step": 242090 + }, + { + "epoch": 10.03, + "grad_norm": 0.330078125, + "learning_rate": 0.00037443290891244754, + "loss": 0.2006, + "step": 242100 + }, + { + "epoch": 10.03, + "grad_norm": 0.859375, + "learning_rate": 0.0003744235024207296, + "loss": 0.1966, + "step": 242110 + }, + { + "epoch": 10.03, + "grad_norm": 0.8671875, + "learning_rate": 0.0003744140956948602, + "loss": 0.188, + "step": 242120 + }, + { + "epoch": 10.03, + "grad_norm": 0.72265625, + "learning_rate": 0.00037440468873485675, + "loss": 0.1667, + "step": 242130 + }, + { + "epoch": 10.03, + "grad_norm": 1.0859375, + "learning_rate": 0.0003743952815407372, + "loss": 0.1769, + "step": 242140 + }, + { + "epoch": 10.03, + "grad_norm": 0.353515625, + "learning_rate": 0.0003743858741125191, + "loss": 0.1945, + "step": 242150 + }, + { + "epoch": 10.03, + "grad_norm": 1.2421875, + "learning_rate": 0.0003743764664502203, + "loss": 0.211, + "step": 242160 + }, + { + "epoch": 10.03, + "grad_norm": 0.9296875, + "learning_rate": 0.00037436705855385845, + "loss": 0.1702, + "step": 242170 + }, + { + "epoch": 10.03, + "grad_norm": 1.1171875, + "learning_rate": 0.0003743576504234512, + "loss": 0.1444, + "step": 242180 + }, + { + "epoch": 10.03, + "grad_norm": 0.52734375, + "learning_rate": 0.0003743482420590162, + "loss": 0.2049, + "step": 242190 + }, + { + "epoch": 10.03, + "grad_norm": 0.64453125, + "learning_rate": 0.0003743388334605713, + "loss": 0.1727, + "step": 242200 + }, + { + "epoch": 10.03, + "grad_norm": 0.70703125, + "learning_rate": 0.000374329424628134, + "loss": 0.1574, + "step": 242210 + }, + { + "epoch": 10.03, + "grad_norm": 0.5703125, + "learning_rate": 0.00037432001556172236, + "loss": 0.1837, + "step": 242220 + }, + { + "epoch": 10.03, + "grad_norm": 0.71484375, + "learning_rate": 0.00037431060626135377, + "loss": 0.201, + "step": 242230 + }, + { + "epoch": 10.03, + "grad_norm": 0.85546875, + "learning_rate": 0.0003743011967270461, + "loss": 0.2118, + "step": 242240 + }, + { + "epoch": 10.03, + "grad_norm": 0.5859375, + "learning_rate": 0.0003742917869588169, + "loss": 0.1874, + "step": 242250 + }, + { + "epoch": 10.03, + "grad_norm": 2.1875, + "learning_rate": 0.0003742823769566841, + "loss": 0.2207, + "step": 242260 + }, + { + "epoch": 10.03, + "grad_norm": 0.353515625, + "learning_rate": 0.00037427296672066525, + "loss": 0.1446, + "step": 242270 + }, + { + "epoch": 10.04, + "grad_norm": 0.671875, + "learning_rate": 0.00037426355625077814, + "loss": 0.252, + "step": 242280 + }, + { + "epoch": 10.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00037425414554704027, + "loss": 0.2123, + "step": 242290 + }, + { + "epoch": 10.04, + "grad_norm": 0.49609375, + "learning_rate": 0.0003742447346094697, + "loss": 0.1869, + "step": 242300 + }, + { + "epoch": 10.04, + "grad_norm": 0.431640625, + "learning_rate": 0.0003742353234380839, + "loss": 0.1956, + "step": 242310 + }, + { + "epoch": 10.04, + "grad_norm": 1.5703125, + "learning_rate": 0.0003742259120329006, + "loss": 0.176, + "step": 242320 + }, + { + "epoch": 10.04, + "grad_norm": 0.91796875, + "learning_rate": 0.00037421650039393757, + "loss": 0.1845, + "step": 242330 + }, + { + "epoch": 10.04, + "grad_norm": 0.5234375, + "learning_rate": 0.00037420708852121243, + "loss": 0.1354, + "step": 242340 + }, + { + "epoch": 10.04, + "grad_norm": 0.8203125, + "learning_rate": 0.00037419767641474306, + "loss": 0.1596, + "step": 242350 + }, + { + "epoch": 10.04, + "grad_norm": 1.1640625, + "learning_rate": 0.0003741882640745471, + "loss": 0.1914, + "step": 242360 + }, + { + "epoch": 10.04, + "grad_norm": 0.96484375, + "learning_rate": 0.0003741788515006421, + "loss": 0.212, + "step": 242370 + }, + { + "epoch": 10.04, + "grad_norm": 0.408203125, + "learning_rate": 0.000374169438693046, + "loss": 0.2254, + "step": 242380 + }, + { + "epoch": 10.04, + "grad_norm": 0.4140625, + "learning_rate": 0.00037416002565177645, + "loss": 0.2388, + "step": 242390 + }, + { + "epoch": 10.04, + "grad_norm": 0.77734375, + "learning_rate": 0.0003741506123768511, + "loss": 0.2435, + "step": 242400 + }, + { + "epoch": 10.04, + "grad_norm": 0.388671875, + "learning_rate": 0.00037414119886828776, + "loss": 0.2255, + "step": 242410 + }, + { + "epoch": 10.04, + "grad_norm": 2.609375, + "learning_rate": 0.000374131785126104, + "loss": 0.1963, + "step": 242420 + }, + { + "epoch": 10.04, + "grad_norm": 0.7734375, + "learning_rate": 0.0003741223711503177, + "loss": 0.2412, + "step": 242430 + }, + { + "epoch": 10.04, + "grad_norm": 0.0615234375, + "learning_rate": 0.00037411295694094647, + "loss": 0.208, + "step": 242440 + }, + { + "epoch": 10.04, + "grad_norm": 1.2890625, + "learning_rate": 0.00037410354249800803, + "loss": 0.1807, + "step": 242450 + }, + { + "epoch": 10.04, + "grad_norm": 1.375, + "learning_rate": 0.0003740941278215202, + "loss": 0.1718, + "step": 242460 + }, + { + "epoch": 10.04, + "grad_norm": 1.0078125, + "learning_rate": 0.0003740847129115005, + "loss": 0.1941, + "step": 242470 + }, + { + "epoch": 10.04, + "grad_norm": 0.478515625, + "learning_rate": 0.0003740752977679669, + "loss": 0.1822, + "step": 242480 + }, + { + "epoch": 10.04, + "grad_norm": 0.423828125, + "learning_rate": 0.000374065882390937, + "loss": 0.1834, + "step": 242490 + }, + { + "epoch": 10.04, + "grad_norm": 0.41015625, + "learning_rate": 0.0003740564667804284, + "loss": 0.2183, + "step": 242500 + }, + { + "epoch": 10.04, + "grad_norm": 0.62109375, + "learning_rate": 0.00037404705093645895, + "loss": 0.2186, + "step": 242510 + }, + { + "epoch": 10.05, + "grad_norm": 2.34375, + "learning_rate": 0.0003740376348590464, + "loss": 0.1742, + "step": 242520 + }, + { + "epoch": 10.05, + "grad_norm": 0.349609375, + "learning_rate": 0.00037402821854820833, + "loss": 0.2101, + "step": 242530 + }, + { + "epoch": 10.05, + "grad_norm": 0.5078125, + "learning_rate": 0.00037401880200396264, + "loss": 0.2409, + "step": 242540 + }, + { + "epoch": 10.05, + "grad_norm": 0.65625, + "learning_rate": 0.0003740093852263269, + "loss": 0.2113, + "step": 242550 + }, + { + "epoch": 10.05, + "grad_norm": 1.1640625, + "learning_rate": 0.0003739999682153189, + "loss": 0.1867, + "step": 242560 + }, + { + "epoch": 10.05, + "grad_norm": 1.5234375, + "learning_rate": 0.00037399055097095635, + "loss": 0.2145, + "step": 242570 + }, + { + "epoch": 10.05, + "grad_norm": 0.427734375, + "learning_rate": 0.000373981133493257, + "loss": 0.2002, + "step": 242580 + }, + { + "epoch": 10.05, + "grad_norm": 0.8828125, + "learning_rate": 0.00037397171578223856, + "loss": 0.1851, + "step": 242590 + }, + { + "epoch": 10.05, + "grad_norm": 0.81640625, + "learning_rate": 0.0003739622978379187, + "loss": 0.1783, + "step": 242600 + }, + { + "epoch": 10.05, + "grad_norm": 0.71875, + "learning_rate": 0.0003739528796603151, + "loss": 0.2017, + "step": 242610 + }, + { + "epoch": 10.05, + "grad_norm": 1.0078125, + "learning_rate": 0.00037394346124944566, + "loss": 0.1777, + "step": 242620 + }, + { + "epoch": 10.05, + "grad_norm": 1.21875, + "learning_rate": 0.00037393404260532804, + "loss": 0.2251, + "step": 242630 + }, + { + "epoch": 10.05, + "grad_norm": 0.7734375, + "learning_rate": 0.0003739246237279799, + "loss": 0.235, + "step": 242640 + }, + { + "epoch": 10.05, + "grad_norm": 0.65234375, + "learning_rate": 0.00037391520461741904, + "loss": 0.2453, + "step": 242650 + }, + { + "epoch": 10.05, + "grad_norm": 0.0, + "learning_rate": 0.00037390578527366305, + "loss": 0.1773, + "step": 242660 + }, + { + "epoch": 10.05, + "grad_norm": 0.640625, + "learning_rate": 0.0003738963656967298, + "loss": 0.201, + "step": 242670 + }, + { + "epoch": 10.05, + "grad_norm": 1.3125, + "learning_rate": 0.000373886945886637, + "loss": 0.2621, + "step": 242680 + }, + { + "epoch": 10.05, + "grad_norm": 0.55078125, + "learning_rate": 0.0003738775258434023, + "loss": 0.2294, + "step": 242690 + }, + { + "epoch": 10.05, + "grad_norm": 1.203125, + "learning_rate": 0.0003738681055670435, + "loss": 0.233, + "step": 242700 + }, + { + "epoch": 10.05, + "grad_norm": 0.412109375, + "learning_rate": 0.0003738586850575783, + "loss": 0.1573, + "step": 242710 + }, + { + "epoch": 10.05, + "grad_norm": 1.140625, + "learning_rate": 0.0003738492643150244, + "loss": 0.1847, + "step": 242720 + }, + { + "epoch": 10.05, + "grad_norm": 0.69140625, + "learning_rate": 0.00037383984333939967, + "loss": 0.2199, + "step": 242730 + }, + { + "epoch": 10.05, + "grad_norm": 0.78125, + "learning_rate": 0.0003738304221307216, + "loss": 0.1905, + "step": 242740 + }, + { + "epoch": 10.05, + "grad_norm": 0.8359375, + "learning_rate": 0.00037382100068900813, + "loss": 0.2098, + "step": 242750 + }, + { + "epoch": 10.06, + "grad_norm": 1.171875, + "learning_rate": 0.0003738115790142769, + "loss": 0.1748, + "step": 242760 + }, + { + "epoch": 10.06, + "grad_norm": 1.4921875, + "learning_rate": 0.00037380215710654556, + "loss": 0.235, + "step": 242770 + }, + { + "epoch": 10.06, + "grad_norm": 1.46875, + "learning_rate": 0.000373792734965832, + "loss": 0.1801, + "step": 242780 + }, + { + "epoch": 10.06, + "grad_norm": 0.91796875, + "learning_rate": 0.0003737833125921539, + "loss": 0.1864, + "step": 242790 + }, + { + "epoch": 10.06, + "grad_norm": 0.66015625, + "learning_rate": 0.0003737738899855289, + "loss": 0.2163, + "step": 242800 + }, + { + "epoch": 10.06, + "grad_norm": 0.57421875, + "learning_rate": 0.0003737644671459749, + "loss": 0.1867, + "step": 242810 + }, + { + "epoch": 10.06, + "grad_norm": 0.82421875, + "learning_rate": 0.00037375504407350947, + "loss": 0.2065, + "step": 242820 + }, + { + "epoch": 10.06, + "grad_norm": 0.34765625, + "learning_rate": 0.00037374562076815046, + "loss": 0.2124, + "step": 242830 + }, + { + "epoch": 10.06, + "grad_norm": 0.70703125, + "learning_rate": 0.00037373619722991545, + "loss": 0.2, + "step": 242840 + }, + { + "epoch": 10.06, + "grad_norm": 1.265625, + "learning_rate": 0.0003737267734588224, + "loss": 0.1561, + "step": 242850 + }, + { + "epoch": 10.06, + "grad_norm": 0.451171875, + "learning_rate": 0.0003737173494548889, + "loss": 0.2189, + "step": 242860 + }, + { + "epoch": 10.06, + "grad_norm": 1.03125, + "learning_rate": 0.00037370792521813266, + "loss": 0.199, + "step": 242870 + }, + { + "epoch": 10.06, + "grad_norm": 2.15625, + "learning_rate": 0.0003736985007485715, + "loss": 0.1804, + "step": 242880 + }, + { + "epoch": 10.06, + "grad_norm": 0.80078125, + "learning_rate": 0.0003736890760462231, + "loss": 0.2097, + "step": 242890 + }, + { + "epoch": 10.06, + "grad_norm": 0.7890625, + "learning_rate": 0.00037367965111110527, + "loss": 0.2181, + "step": 242900 + }, + { + "epoch": 10.06, + "grad_norm": 0.498046875, + "learning_rate": 0.00037367022594323563, + "loss": 0.1983, + "step": 242910 + }, + { + "epoch": 10.06, + "grad_norm": 0.94921875, + "learning_rate": 0.000373660800542632, + "loss": 0.2336, + "step": 242920 + }, + { + "epoch": 10.06, + "grad_norm": 1.2734375, + "learning_rate": 0.0003736513749093121, + "loss": 0.2276, + "step": 242930 + }, + { + "epoch": 10.06, + "grad_norm": 1.21875, + "learning_rate": 0.0003736419490432937, + "loss": 0.1833, + "step": 242940 + }, + { + "epoch": 10.06, + "grad_norm": 0.1279296875, + "learning_rate": 0.00037363252294459446, + "loss": 0.172, + "step": 242950 + }, + { + "epoch": 10.06, + "grad_norm": 0.369140625, + "learning_rate": 0.00037362309661323213, + "loss": 0.1734, + "step": 242960 + }, + { + "epoch": 10.06, + "grad_norm": 0.71875, + "learning_rate": 0.0003736136700492245, + "loss": 0.1693, + "step": 242970 + }, + { + "epoch": 10.06, + "grad_norm": 0.734375, + "learning_rate": 0.00037360424325258933, + "loss": 0.1917, + "step": 242980 + }, + { + "epoch": 10.06, + "grad_norm": 1.171875, + "learning_rate": 0.0003735948162233443, + "loss": 0.1905, + "step": 242990 + }, + { + "epoch": 10.07, + "grad_norm": 0.90234375, + "learning_rate": 0.0003735853889615072, + "loss": 0.2104, + "step": 243000 + }, + { + "epoch": 10.07, + "grad_norm": 1.1875, + "learning_rate": 0.00037357596146709573, + "loss": 0.1746, + "step": 243010 + }, + { + "epoch": 10.07, + "grad_norm": 0.83203125, + "learning_rate": 0.0003735665337401277, + "loss": 0.2062, + "step": 243020 + }, + { + "epoch": 10.07, + "grad_norm": 0.98828125, + "learning_rate": 0.00037355710578062076, + "loss": 0.2077, + "step": 243030 + }, + { + "epoch": 10.07, + "grad_norm": 0.032470703125, + "learning_rate": 0.00037354767758859266, + "loss": 0.1726, + "step": 243040 + }, + { + "epoch": 10.07, + "grad_norm": 1.2578125, + "learning_rate": 0.00037353824916406116, + "loss": 0.1739, + "step": 243050 + }, + { + "epoch": 10.07, + "grad_norm": 0.59765625, + "learning_rate": 0.000373528820507044, + "loss": 0.1311, + "step": 243060 + }, + { + "epoch": 10.07, + "grad_norm": 0.76171875, + "learning_rate": 0.00037351939161755896, + "loss": 0.2289, + "step": 243070 + }, + { + "epoch": 10.07, + "grad_norm": 0.21484375, + "learning_rate": 0.0003735099624956238, + "loss": 0.1728, + "step": 243080 + }, + { + "epoch": 10.07, + "grad_norm": 2.078125, + "learning_rate": 0.0003735005331412562, + "loss": 0.2262, + "step": 243090 + }, + { + "epoch": 10.07, + "grad_norm": 0.73046875, + "learning_rate": 0.00037349110355447396, + "loss": 0.1832, + "step": 243100 + }, + { + "epoch": 10.07, + "grad_norm": 1.7109375, + "learning_rate": 0.00037348167373529476, + "loss": 0.2352, + "step": 243110 + }, + { + "epoch": 10.07, + "grad_norm": 1.140625, + "learning_rate": 0.00037347224368373646, + "loss": 0.1949, + "step": 243120 + }, + { + "epoch": 10.07, + "grad_norm": 0.93359375, + "learning_rate": 0.00037346281339981665, + "loss": 0.1867, + "step": 243130 + }, + { + "epoch": 10.07, + "grad_norm": 0.82421875, + "learning_rate": 0.0003734533828835532, + "loss": 0.2202, + "step": 243140 + }, + { + "epoch": 10.07, + "grad_norm": 0.66015625, + "learning_rate": 0.00037344395213496386, + "loss": 0.1856, + "step": 243150 + }, + { + "epoch": 10.07, + "grad_norm": 0.5390625, + "learning_rate": 0.0003734345211540662, + "loss": 0.1844, + "step": 243160 + }, + { + "epoch": 10.07, + "grad_norm": 1.0859375, + "learning_rate": 0.0003734250899408782, + "loss": 0.211, + "step": 243170 + }, + { + "epoch": 10.07, + "grad_norm": 1.484375, + "learning_rate": 0.00037341565849541747, + "loss": 0.1921, + "step": 243180 + }, + { + "epoch": 10.07, + "grad_norm": 0.6015625, + "learning_rate": 0.0003734062268177019, + "loss": 0.2445, + "step": 243190 + }, + { + "epoch": 10.07, + "grad_norm": 0.6484375, + "learning_rate": 0.00037339679490774896, + "loss": 0.2028, + "step": 243200 + }, + { + "epoch": 10.07, + "grad_norm": 1.3125, + "learning_rate": 0.0003733873627655767, + "loss": 0.2173, + "step": 243210 + }, + { + "epoch": 10.07, + "grad_norm": 0.96484375, + "learning_rate": 0.0003733779303912027, + "loss": 0.1926, + "step": 243220 + }, + { + "epoch": 10.07, + "grad_norm": 0.84375, + "learning_rate": 0.0003733684977846448, + "loss": 0.1775, + "step": 243230 + }, + { + "epoch": 10.07, + "grad_norm": 1.8828125, + "learning_rate": 0.00037335906494592063, + "loss": 0.1934, + "step": 243240 + }, + { + "epoch": 10.08, + "grad_norm": 1.7265625, + "learning_rate": 0.0003733496318750481, + "loss": 0.1885, + "step": 243250 + }, + { + "epoch": 10.08, + "grad_norm": 0.62109375, + "learning_rate": 0.0003733401985720448, + "loss": 0.2287, + "step": 243260 + }, + { + "epoch": 10.08, + "grad_norm": 0.84765625, + "learning_rate": 0.0003733307650369287, + "loss": 0.1741, + "step": 243270 + }, + { + "epoch": 10.08, + "grad_norm": 1.34375, + "learning_rate": 0.00037332133126971736, + "loss": 0.1982, + "step": 243280 + }, + { + "epoch": 10.08, + "grad_norm": 0.2314453125, + "learning_rate": 0.00037331189727042856, + "loss": 0.2299, + "step": 243290 + }, + { + "epoch": 10.08, + "grad_norm": 0.82421875, + "learning_rate": 0.0003733024630390801, + "loss": 0.2026, + "step": 243300 + }, + { + "epoch": 10.08, + "grad_norm": 0.4921875, + "learning_rate": 0.0003732930285756897, + "loss": 0.208, + "step": 243310 + }, + { + "epoch": 10.08, + "grad_norm": 1.0078125, + "learning_rate": 0.0003732835938802752, + "loss": 0.2192, + "step": 243320 + }, + { + "epoch": 10.08, + "grad_norm": 1.0859375, + "learning_rate": 0.00037327415895285423, + "loss": 0.1932, + "step": 243330 + }, + { + "epoch": 10.08, + "grad_norm": 1.359375, + "learning_rate": 0.0003732647237934446, + "loss": 0.2156, + "step": 243340 + }, + { + "epoch": 10.08, + "grad_norm": 0.92578125, + "learning_rate": 0.00037325528840206413, + "loss": 0.2271, + "step": 243350 + }, + { + "epoch": 10.08, + "grad_norm": 0.78515625, + "learning_rate": 0.00037324585277873047, + "loss": 0.1785, + "step": 243360 + }, + { + "epoch": 10.08, + "grad_norm": 0.765625, + "learning_rate": 0.00037323641692346146, + "loss": 0.2093, + "step": 243370 + }, + { + "epoch": 10.08, + "grad_norm": 0.59765625, + "learning_rate": 0.00037322698083627477, + "loss": 0.1884, + "step": 243380 + }, + { + "epoch": 10.08, + "grad_norm": 0.5625, + "learning_rate": 0.0003732175445171882, + "loss": 0.2416, + "step": 243390 + }, + { + "epoch": 10.08, + "grad_norm": 0.53515625, + "learning_rate": 0.0003732081079662196, + "loss": 0.2188, + "step": 243400 + }, + { + "epoch": 10.08, + "grad_norm": 0.69140625, + "learning_rate": 0.0003731986711833866, + "loss": 0.2001, + "step": 243410 + }, + { + "epoch": 10.08, + "grad_norm": 1.5703125, + "learning_rate": 0.000373189234168707, + "loss": 0.2187, + "step": 243420 + }, + { + "epoch": 10.08, + "grad_norm": 0.62109375, + "learning_rate": 0.0003731797969221986, + "loss": 0.1361, + "step": 243430 + }, + { + "epoch": 10.08, + "grad_norm": 0.1943359375, + "learning_rate": 0.0003731703594438791, + "loss": 0.1832, + "step": 243440 + }, + { + "epoch": 10.08, + "grad_norm": 0.73828125, + "learning_rate": 0.0003731609217337663, + "loss": 0.1858, + "step": 243450 + }, + { + "epoch": 10.08, + "grad_norm": 0.65625, + "learning_rate": 0.0003731514837918779, + "loss": 0.2133, + "step": 243460 + }, + { + "epoch": 10.08, + "grad_norm": 1.1796875, + "learning_rate": 0.00037314204561823166, + "loss": 0.1708, + "step": 243470 + }, + { + "epoch": 10.08, + "grad_norm": 0.3984375, + "learning_rate": 0.0003731326072128455, + "loss": 0.2081, + "step": 243480 + }, + { + "epoch": 10.09, + "grad_norm": 0.80078125, + "learning_rate": 0.000373123168575737, + "loss": 0.1687, + "step": 243490 + }, + { + "epoch": 10.09, + "grad_norm": 0.8515625, + "learning_rate": 0.000373113729706924, + "loss": 0.1846, + "step": 243500 + }, + { + "epoch": 10.09, + "grad_norm": 1.5390625, + "learning_rate": 0.00037310429060642437, + "loss": 0.1903, + "step": 243510 + }, + { + "epoch": 10.09, + "grad_norm": 0.6015625, + "learning_rate": 0.0003730948512742556, + "loss": 0.2033, + "step": 243520 + }, + { + "epoch": 10.09, + "grad_norm": 0.216796875, + "learning_rate": 0.00037308541171043576, + "loss": 0.1839, + "step": 243530 + }, + { + "epoch": 10.09, + "grad_norm": 0.58984375, + "learning_rate": 0.0003730759719149823, + "loss": 0.2117, + "step": 243540 + }, + { + "epoch": 10.09, + "grad_norm": 0.6484375, + "learning_rate": 0.0003730665318879133, + "loss": 0.2196, + "step": 243550 + }, + { + "epoch": 10.09, + "grad_norm": 0.271484375, + "learning_rate": 0.00037305709162924633, + "loss": 0.1867, + "step": 243560 + }, + { + "epoch": 10.09, + "grad_norm": 0.78515625, + "learning_rate": 0.0003730476511389992, + "loss": 0.1478, + "step": 243570 + }, + { + "epoch": 10.09, + "grad_norm": 2.046875, + "learning_rate": 0.00037303821041718966, + "loss": 0.2453, + "step": 243580 + }, + { + "epoch": 10.09, + "grad_norm": 0.7109375, + "learning_rate": 0.0003730287694638355, + "loss": 0.1841, + "step": 243590 + }, + { + "epoch": 10.09, + "grad_norm": 0.55859375, + "learning_rate": 0.00037301932827895446, + "loss": 0.2046, + "step": 243600 + }, + { + "epoch": 10.09, + "grad_norm": 0.443359375, + "learning_rate": 0.0003730098868625644, + "loss": 0.2232, + "step": 243610 + }, + { + "epoch": 10.09, + "grad_norm": 0.90234375, + "learning_rate": 0.0003730004452146829, + "loss": 0.2097, + "step": 243620 + }, + { + "epoch": 10.09, + "grad_norm": 0.8984375, + "learning_rate": 0.0003729910033353279, + "loss": 0.2136, + "step": 243630 + }, + { + "epoch": 10.09, + "grad_norm": 0.64453125, + "learning_rate": 0.0003729815612245172, + "loss": 0.1237, + "step": 243640 + }, + { + "epoch": 10.09, + "grad_norm": 1.8046875, + "learning_rate": 0.00037297211888226835, + "loss": 0.1682, + "step": 243650 + }, + { + "epoch": 10.09, + "grad_norm": 1.125, + "learning_rate": 0.00037296267630859936, + "loss": 0.213, + "step": 243660 + }, + { + "epoch": 10.09, + "grad_norm": 0.546875, + "learning_rate": 0.0003729532335035279, + "loss": 0.1798, + "step": 243670 + }, + { + "epoch": 10.09, + "grad_norm": 1.1796875, + "learning_rate": 0.0003729437904670716, + "loss": 0.1902, + "step": 243680 + }, + { + "epoch": 10.09, + "grad_norm": 1.0390625, + "learning_rate": 0.00037293434719924847, + "loss": 0.2251, + "step": 243690 + }, + { + "epoch": 10.09, + "grad_norm": 0.73046875, + "learning_rate": 0.00037292490370007615, + "loss": 0.1822, + "step": 243700 + }, + { + "epoch": 10.09, + "grad_norm": 1.0703125, + "learning_rate": 0.00037291545996957244, + "loss": 0.1763, + "step": 243710 + }, + { + "epoch": 10.09, + "grad_norm": 0.671875, + "learning_rate": 0.00037290601600775507, + "loss": 0.2811, + "step": 243720 + }, + { + "epoch": 10.1, + "grad_norm": 0.8046875, + "learning_rate": 0.00037289657181464186, + "loss": 0.185, + "step": 243730 + }, + { + "epoch": 10.1, + "grad_norm": 0.7265625, + "learning_rate": 0.00037288712739025066, + "loss": 0.1745, + "step": 243740 + }, + { + "epoch": 10.1, + "grad_norm": 0.8125, + "learning_rate": 0.0003728776827345991, + "loss": 0.172, + "step": 243750 + }, + { + "epoch": 10.1, + "grad_norm": 0.69921875, + "learning_rate": 0.00037286823784770496, + "loss": 0.21, + "step": 243760 + }, + { + "epoch": 10.1, + "grad_norm": 0.69921875, + "learning_rate": 0.00037285879272958613, + "loss": 0.2112, + "step": 243770 + }, + { + "epoch": 10.1, + "grad_norm": 0.28515625, + "learning_rate": 0.0003728493473802603, + "loss": 0.1951, + "step": 243780 + }, + { + "epoch": 10.1, + "grad_norm": 0.859375, + "learning_rate": 0.00037283990179974526, + "loss": 0.2566, + "step": 243790 + }, + { + "epoch": 10.1, + "grad_norm": 0.6484375, + "learning_rate": 0.00037283045598805883, + "loss": 0.2086, + "step": 243800 + }, + { + "epoch": 10.1, + "grad_norm": 1.8359375, + "learning_rate": 0.0003728210099452186, + "loss": 0.1948, + "step": 243810 + }, + { + "epoch": 10.1, + "grad_norm": 0.703125, + "learning_rate": 0.0003728115636712427, + "loss": 0.1953, + "step": 243820 + }, + { + "epoch": 10.1, + "grad_norm": 0.546875, + "learning_rate": 0.0003728021171661485, + "loss": 0.2247, + "step": 243830 + }, + { + "epoch": 10.1, + "grad_norm": 0.35546875, + "learning_rate": 0.0003727926704299541, + "loss": 0.1881, + "step": 243840 + }, + { + "epoch": 10.1, + "grad_norm": 0.796875, + "learning_rate": 0.00037278322346267725, + "loss": 0.2006, + "step": 243850 + }, + { + "epoch": 10.1, + "grad_norm": 0.51171875, + "learning_rate": 0.00037277377626433543, + "loss": 0.1957, + "step": 243860 + }, + { + "epoch": 10.1, + "grad_norm": 1.203125, + "learning_rate": 0.0003727643288349467, + "loss": 0.1935, + "step": 243870 + }, + { + "epoch": 10.1, + "grad_norm": 0.66015625, + "learning_rate": 0.00037275488117452884, + "loss": 0.1394, + "step": 243880 + }, + { + "epoch": 10.1, + "grad_norm": 0.8984375, + "learning_rate": 0.0003727454332830994, + "loss": 0.1713, + "step": 243890 + }, + { + "epoch": 10.1, + "grad_norm": 0.84765625, + "learning_rate": 0.0003727359851606764, + "loss": 0.2333, + "step": 243900 + }, + { + "epoch": 10.1, + "grad_norm": 0.2265625, + "learning_rate": 0.00037272653680727754, + "loss": 0.1898, + "step": 243910 + }, + { + "epoch": 10.1, + "grad_norm": 0.40234375, + "learning_rate": 0.00037271708822292056, + "loss": 0.1922, + "step": 243920 + }, + { + "epoch": 10.1, + "grad_norm": 0.84375, + "learning_rate": 0.0003727076394076233, + "loss": 0.2076, + "step": 243930 + }, + { + "epoch": 10.1, + "grad_norm": 0.78515625, + "learning_rate": 0.00037269819036140354, + "loss": 0.1661, + "step": 243940 + }, + { + "epoch": 10.1, + "grad_norm": 0.75390625, + "learning_rate": 0.000372688741084279, + "loss": 0.2075, + "step": 243950 + }, + { + "epoch": 10.1, + "grad_norm": 1.1171875, + "learning_rate": 0.00037267929157626747, + "loss": 0.2055, + "step": 243960 + }, + { + "epoch": 10.11, + "grad_norm": 1.5078125, + "learning_rate": 0.0003726698418373867, + "loss": 0.1798, + "step": 243970 + }, + { + "epoch": 10.11, + "grad_norm": 0.5546875, + "learning_rate": 0.00037266039186765467, + "loss": 0.2351, + "step": 243980 + }, + { + "epoch": 10.11, + "grad_norm": 0.94921875, + "learning_rate": 0.00037265094166708896, + "loss": 0.2699, + "step": 243990 + }, + { + "epoch": 10.11, + "grad_norm": 0.82421875, + "learning_rate": 0.00037264149123570744, + "loss": 0.1925, + "step": 244000 + }, + { + "epoch": 10.11, + "grad_norm": 0.4609375, + "learning_rate": 0.0003726320405735279, + "loss": 0.2204, + "step": 244010 + }, + { + "epoch": 10.11, + "grad_norm": 0.79296875, + "learning_rate": 0.00037262258968056807, + "loss": 0.181, + "step": 244020 + }, + { + "epoch": 10.11, + "grad_norm": 1.4375, + "learning_rate": 0.00037261313855684573, + "loss": 0.1759, + "step": 244030 + }, + { + "epoch": 10.11, + "grad_norm": 0.69140625, + "learning_rate": 0.0003726036872023788, + "loss": 0.1306, + "step": 244040 + }, + { + "epoch": 10.11, + "grad_norm": 0.8515625, + "learning_rate": 0.00037259423561718485, + "loss": 0.1862, + "step": 244050 + }, + { + "epoch": 10.11, + "grad_norm": 0.58984375, + "learning_rate": 0.00037258478380128193, + "loss": 0.1985, + "step": 244060 + }, + { + "epoch": 10.11, + "grad_norm": 0.64453125, + "learning_rate": 0.0003725753317546876, + "loss": 0.2227, + "step": 244070 + }, + { + "epoch": 10.11, + "grad_norm": 1.046875, + "learning_rate": 0.0003725658794774197, + "loss": 0.1914, + "step": 244080 + }, + { + "epoch": 10.11, + "grad_norm": 0.70703125, + "learning_rate": 0.00037255642696949623, + "loss": 0.188, + "step": 244090 + }, + { + "epoch": 10.11, + "grad_norm": 0.73046875, + "learning_rate": 0.0003725469742309346, + "loss": 0.1948, + "step": 244100 + }, + { + "epoch": 10.11, + "grad_norm": 1.328125, + "learning_rate": 0.0003725375212617529, + "loss": 0.1699, + "step": 244110 + }, + { + "epoch": 10.11, + "grad_norm": 0.65234375, + "learning_rate": 0.0003725280680619688, + "loss": 0.1742, + "step": 244120 + }, + { + "epoch": 10.11, + "grad_norm": 0.5703125, + "learning_rate": 0.00037251861463159997, + "loss": 0.2232, + "step": 244130 + }, + { + "epoch": 10.11, + "grad_norm": 0.4453125, + "learning_rate": 0.00037250916097066457, + "loss": 0.2069, + "step": 244140 + }, + { + "epoch": 10.11, + "grad_norm": 0.8671875, + "learning_rate": 0.00037249970707918, + "loss": 0.183, + "step": 244150 + }, + { + "epoch": 10.11, + "grad_norm": 0.6015625, + "learning_rate": 0.0003724902529571643, + "loss": 0.2327, + "step": 244160 + }, + { + "epoch": 10.11, + "grad_norm": 0.4609375, + "learning_rate": 0.0003724807986046351, + "loss": 0.2006, + "step": 244170 + }, + { + "epoch": 10.11, + "grad_norm": 0.71484375, + "learning_rate": 0.00037247134402161033, + "loss": 0.1641, + "step": 244180 + }, + { + "epoch": 10.11, + "grad_norm": 0.322265625, + "learning_rate": 0.00037246188920810763, + "loss": 0.1949, + "step": 244190 + }, + { + "epoch": 10.11, + "grad_norm": 0.69140625, + "learning_rate": 0.00037245243416414496, + "loss": 0.157, + "step": 244200 + }, + { + "epoch": 10.12, + "grad_norm": 0.7890625, + "learning_rate": 0.00037244297888973995, + "loss": 0.1447, + "step": 244210 + }, + { + "epoch": 10.12, + "grad_norm": 1.8046875, + "learning_rate": 0.00037243352338491053, + "loss": 0.2341, + "step": 244220 + }, + { + "epoch": 10.12, + "grad_norm": 0.0, + "learning_rate": 0.00037242406764967443, + "loss": 0.2026, + "step": 244230 + }, + { + "epoch": 10.12, + "grad_norm": 1.2109375, + "learning_rate": 0.0003724146116840495, + "loss": 0.1934, + "step": 244240 + }, + { + "epoch": 10.12, + "grad_norm": 1.203125, + "learning_rate": 0.0003724051554880535, + "loss": 0.2378, + "step": 244250 + }, + { + "epoch": 10.12, + "grad_norm": 0.578125, + "learning_rate": 0.0003723956990617041, + "loss": 0.1864, + "step": 244260 + }, + { + "epoch": 10.12, + "grad_norm": 1.015625, + "learning_rate": 0.0003723862424050193, + "loss": 0.2022, + "step": 244270 + }, + { + "epoch": 10.12, + "grad_norm": 0.9921875, + "learning_rate": 0.0003723767855180168, + "loss": 0.2287, + "step": 244280 + }, + { + "epoch": 10.12, + "grad_norm": 0.5703125, + "learning_rate": 0.00037236732840071436, + "loss": 0.2011, + "step": 244290 + }, + { + "epoch": 10.12, + "grad_norm": 1.1484375, + "learning_rate": 0.0003723578710531299, + "loss": 0.1882, + "step": 244300 + }, + { + "epoch": 10.12, + "grad_norm": 0.578125, + "learning_rate": 0.0003723484134752811, + "loss": 0.164, + "step": 244310 + }, + { + "epoch": 10.12, + "grad_norm": 0.0, + "learning_rate": 0.0003723389556671858, + "loss": 0.1915, + "step": 244320 + }, + { + "epoch": 10.12, + "grad_norm": 1.46875, + "learning_rate": 0.0003723294976288618, + "loss": 0.1826, + "step": 244330 + }, + { + "epoch": 10.12, + "grad_norm": 1.375, + "learning_rate": 0.0003723200393603269, + "loss": 0.2159, + "step": 244340 + }, + { + "epoch": 10.12, + "grad_norm": 1.5703125, + "learning_rate": 0.0003723105808615989, + "loss": 0.2381, + "step": 244350 + }, + { + "epoch": 10.12, + "grad_norm": 0.67578125, + "learning_rate": 0.0003723011221326955, + "loss": 0.2136, + "step": 244360 + }, + { + "epoch": 10.12, + "grad_norm": 0.87890625, + "learning_rate": 0.00037229166317363473, + "loss": 0.1834, + "step": 244370 + }, + { + "epoch": 10.12, + "grad_norm": 0.52734375, + "learning_rate": 0.00037228220398443425, + "loss": 0.2125, + "step": 244380 + }, + { + "epoch": 10.12, + "grad_norm": 1.3671875, + "learning_rate": 0.0003722727445651118, + "loss": 0.2095, + "step": 244390 + }, + { + "epoch": 10.12, + "grad_norm": 0.90625, + "learning_rate": 0.00037226328491568524, + "loss": 0.2041, + "step": 244400 + }, + { + "epoch": 10.12, + "grad_norm": 1.25, + "learning_rate": 0.00037225382503617237, + "loss": 0.2076, + "step": 244410 + }, + { + "epoch": 10.12, + "grad_norm": 0.93359375, + "learning_rate": 0.0003722443649265911, + "loss": 0.219, + "step": 244420 + }, + { + "epoch": 10.12, + "grad_norm": 0.890625, + "learning_rate": 0.0003722349045869591, + "loss": 0.1837, + "step": 244430 + }, + { + "epoch": 10.12, + "grad_norm": 1.2734375, + "learning_rate": 0.00037222544401729413, + "loss": 0.2032, + "step": 244440 + }, + { + "epoch": 10.13, + "grad_norm": 0.51953125, + "learning_rate": 0.0003722159832176141, + "loss": 0.2178, + "step": 244450 + }, + { + "epoch": 10.13, + "grad_norm": 1.046875, + "learning_rate": 0.00037220652218793686, + "loss": 0.1916, + "step": 244460 + }, + { + "epoch": 10.13, + "grad_norm": 0.0, + "learning_rate": 0.0003721970609282801, + "loss": 0.2004, + "step": 244470 + }, + { + "epoch": 10.13, + "grad_norm": 0.59375, + "learning_rate": 0.0003721875994386617, + "loss": 0.2075, + "step": 244480 + }, + { + "epoch": 10.13, + "grad_norm": 0.306640625, + "learning_rate": 0.0003721781377190994, + "loss": 0.2092, + "step": 244490 + }, + { + "epoch": 10.13, + "grad_norm": 1.8046875, + "learning_rate": 0.000372168675769611, + "loss": 0.2375, + "step": 244500 + }, + { + "epoch": 10.13, + "grad_norm": 0.8046875, + "learning_rate": 0.0003721592135902144, + "loss": 0.1646, + "step": 244510 + }, + { + "epoch": 10.13, + "grad_norm": 0.65625, + "learning_rate": 0.00037214975118092736, + "loss": 0.2026, + "step": 244520 + }, + { + "epoch": 10.13, + "grad_norm": 1.4921875, + "learning_rate": 0.0003721402885417676, + "loss": 0.2235, + "step": 244530 + }, + { + "epoch": 10.13, + "grad_norm": 0.453125, + "learning_rate": 0.0003721308256727531, + "loss": 0.1951, + "step": 244540 + }, + { + "epoch": 10.13, + "grad_norm": 1.2578125, + "learning_rate": 0.0003721213625739016, + "loss": 0.2264, + "step": 244550 + }, + { + "epoch": 10.13, + "grad_norm": 0.50390625, + "learning_rate": 0.0003721118992452308, + "loss": 0.2038, + "step": 244560 + }, + { + "epoch": 10.13, + "grad_norm": 1.1171875, + "learning_rate": 0.0003721024356867586, + "loss": 0.1775, + "step": 244570 + }, + { + "epoch": 10.13, + "grad_norm": 0.90234375, + "learning_rate": 0.00037209297189850284, + "loss": 0.222, + "step": 244580 + }, + { + "epoch": 10.13, + "grad_norm": 0.9453125, + "learning_rate": 0.00037208350788048126, + "loss": 0.2118, + "step": 244590 + }, + { + "epoch": 10.13, + "grad_norm": 0.7734375, + "learning_rate": 0.00037207404363271177, + "loss": 0.1859, + "step": 244600 + }, + { + "epoch": 10.13, + "grad_norm": 0.6875, + "learning_rate": 0.000372064579155212, + "loss": 0.1791, + "step": 244610 + }, + { + "epoch": 10.13, + "grad_norm": 0.59375, + "learning_rate": 0.0003720551144479999, + "loss": 0.1721, + "step": 244620 + }, + { + "epoch": 10.13, + "grad_norm": 0.0, + "learning_rate": 0.00037204564951109334, + "loss": 0.1953, + "step": 244630 + }, + { + "epoch": 10.13, + "grad_norm": 1.2109375, + "learning_rate": 0.00037203618434451, + "loss": 0.1948, + "step": 244640 + }, + { + "epoch": 10.13, + "grad_norm": 0.31640625, + "learning_rate": 0.0003720267189482677, + "loss": 0.1805, + "step": 244650 + }, + { + "epoch": 10.13, + "grad_norm": 0.494140625, + "learning_rate": 0.0003720172533223843, + "loss": 0.2456, + "step": 244660 + }, + { + "epoch": 10.13, + "grad_norm": 0.9296875, + "learning_rate": 0.00037200778746687767, + "loss": 0.2253, + "step": 244670 + }, + { + "epoch": 10.13, + "grad_norm": 0.337890625, + "learning_rate": 0.00037199832138176556, + "loss": 0.2022, + "step": 244680 + }, + { + "epoch": 10.14, + "grad_norm": 0.58203125, + "learning_rate": 0.0003719888550670657, + "loss": 0.2344, + "step": 244690 + }, + { + "epoch": 10.14, + "grad_norm": 1.109375, + "learning_rate": 0.000371979388522796, + "loss": 0.1631, + "step": 244700 + }, + { + "epoch": 10.14, + "grad_norm": 0.75, + "learning_rate": 0.0003719699217489744, + "loss": 0.1941, + "step": 244710 + }, + { + "epoch": 10.14, + "grad_norm": 1.0078125, + "learning_rate": 0.0003719604547456184, + "loss": 0.2037, + "step": 244720 + }, + { + "epoch": 10.14, + "grad_norm": 0.90625, + "learning_rate": 0.00037195098751274607, + "loss": 0.2001, + "step": 244730 + }, + { + "epoch": 10.14, + "grad_norm": 1.3046875, + "learning_rate": 0.0003719415200503752, + "loss": 0.1473, + "step": 244740 + }, + { + "epoch": 10.14, + "grad_norm": 0.83203125, + "learning_rate": 0.0003719320523585235, + "loss": 0.1884, + "step": 244750 + }, + { + "epoch": 10.14, + "grad_norm": 3.109375, + "learning_rate": 0.0003719225844372088, + "loss": 0.2049, + "step": 244760 + }, + { + "epoch": 10.14, + "grad_norm": 1.171875, + "learning_rate": 0.00037191311628644903, + "loss": 0.1823, + "step": 244770 + }, + { + "epoch": 10.14, + "grad_norm": 0.59375, + "learning_rate": 0.000371903647906262, + "loss": 0.2655, + "step": 244780 + }, + { + "epoch": 10.14, + "grad_norm": 0.466796875, + "learning_rate": 0.00037189417929666535, + "loss": 0.1986, + "step": 244790 + }, + { + "epoch": 10.14, + "grad_norm": 0.62109375, + "learning_rate": 0.00037188471045767707, + "loss": 0.2228, + "step": 244800 + }, + { + "epoch": 10.14, + "grad_norm": 0.99609375, + "learning_rate": 0.0003718752413893149, + "loss": 0.2101, + "step": 244810 + }, + { + "epoch": 10.14, + "grad_norm": 0.78125, + "learning_rate": 0.00037186577209159674, + "loss": 0.2111, + "step": 244820 + }, + { + "epoch": 10.14, + "grad_norm": 0.404296875, + "learning_rate": 0.00037185630256454026, + "loss": 0.2146, + "step": 244830 + }, + { + "epoch": 10.14, + "grad_norm": 0.55859375, + "learning_rate": 0.0003718468328081635, + "loss": 0.1837, + "step": 244840 + }, + { + "epoch": 10.14, + "grad_norm": 1.15625, + "learning_rate": 0.0003718373628224841, + "loss": 0.197, + "step": 244850 + }, + { + "epoch": 10.14, + "grad_norm": 0.2294921875, + "learning_rate": 0.00037182789260751994, + "loss": 0.1678, + "step": 244860 + }, + { + "epoch": 10.14, + "grad_norm": 0.90234375, + "learning_rate": 0.00037181842216328886, + "loss": 0.2103, + "step": 244870 + }, + { + "epoch": 10.14, + "grad_norm": 1.0625, + "learning_rate": 0.00037180895148980866, + "loss": 0.1782, + "step": 244880 + }, + { + "epoch": 10.14, + "grad_norm": 1.328125, + "learning_rate": 0.0003717994805870971, + "loss": 0.2068, + "step": 244890 + }, + { + "epoch": 10.14, + "grad_norm": 0.515625, + "learning_rate": 0.00037179000945517215, + "loss": 0.1532, + "step": 244900 + }, + { + "epoch": 10.14, + "grad_norm": 1.0, + "learning_rate": 0.00037178053809405157, + "loss": 0.216, + "step": 244910 + }, + { + "epoch": 10.14, + "grad_norm": 1.140625, + "learning_rate": 0.00037177106650375315, + "loss": 0.2201, + "step": 244920 + }, + { + "epoch": 10.14, + "grad_norm": 0.5859375, + "learning_rate": 0.0003717615946842947, + "loss": 0.2025, + "step": 244930 + }, + { + "epoch": 10.15, + "grad_norm": 0.220703125, + "learning_rate": 0.00037175212263569407, + "loss": 0.1362, + "step": 244940 + }, + { + "epoch": 10.15, + "grad_norm": 0.91015625, + "learning_rate": 0.0003717426503579692, + "loss": 0.1551, + "step": 244950 + }, + { + "epoch": 10.15, + "grad_norm": 0.85546875, + "learning_rate": 0.0003717331778511377, + "loss": 0.2104, + "step": 244960 + }, + { + "epoch": 10.15, + "grad_norm": 0.9765625, + "learning_rate": 0.0003717237051152175, + "loss": 0.1718, + "step": 244970 + }, + { + "epoch": 10.15, + "grad_norm": 0.408203125, + "learning_rate": 0.00037171423215022644, + "loss": 0.2075, + "step": 244980 + }, + { + "epoch": 10.15, + "grad_norm": 1.4609375, + "learning_rate": 0.00037170475895618234, + "loss": 0.2076, + "step": 244990 + }, + { + "epoch": 10.15, + "grad_norm": 1.265625, + "learning_rate": 0.00037169528553310303, + "loss": 0.1686, + "step": 245000 + }, + { + "epoch": 10.15, + "grad_norm": 0.41796875, + "learning_rate": 0.0003716858118810064, + "loss": 0.1787, + "step": 245010 + }, + { + "epoch": 10.15, + "grad_norm": 0.78125, + "learning_rate": 0.00037167633799991016, + "loss": 0.1404, + "step": 245020 + }, + { + "epoch": 10.15, + "grad_norm": 0.4609375, + "learning_rate": 0.0003716668638898322, + "loss": 0.1952, + "step": 245030 + }, + { + "epoch": 10.15, + "grad_norm": 0.79296875, + "learning_rate": 0.0003716573895507903, + "loss": 0.2105, + "step": 245040 + }, + { + "epoch": 10.15, + "grad_norm": 1.0625, + "learning_rate": 0.00037164791498280243, + "loss": 0.2498, + "step": 245050 + }, + { + "epoch": 10.15, + "grad_norm": 0.65234375, + "learning_rate": 0.0003716384401858862, + "loss": 0.1393, + "step": 245060 + }, + { + "epoch": 10.15, + "grad_norm": 0.76171875, + "learning_rate": 0.0003716289651600596, + "loss": 0.2269, + "step": 245070 + }, + { + "epoch": 10.15, + "grad_norm": 0.86328125, + "learning_rate": 0.00037161948990534053, + "loss": 0.1972, + "step": 245080 + }, + { + "epoch": 10.15, + "grad_norm": 0.51171875, + "learning_rate": 0.0003716100144217466, + "loss": 0.2115, + "step": 245090 + }, + { + "epoch": 10.15, + "grad_norm": 0.44140625, + "learning_rate": 0.0003716005387092958, + "loss": 0.1806, + "step": 245100 + }, + { + "epoch": 10.15, + "grad_norm": 0.71875, + "learning_rate": 0.0003715910627680059, + "loss": 0.2039, + "step": 245110 + }, + { + "epoch": 10.15, + "grad_norm": 0.8671875, + "learning_rate": 0.0003715815865978947, + "loss": 0.1671, + "step": 245120 + }, + { + "epoch": 10.15, + "grad_norm": 0.53515625, + "learning_rate": 0.0003715721101989802, + "loss": 0.1543, + "step": 245130 + }, + { + "epoch": 10.15, + "grad_norm": 0.9375, + "learning_rate": 0.00037156263357128, + "loss": 0.1741, + "step": 245140 + }, + { + "epoch": 10.15, + "grad_norm": 0.703125, + "learning_rate": 0.0003715531567148121, + "loss": 0.2244, + "step": 245150 + }, + { + "epoch": 10.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00037154367962959436, + "loss": 0.1538, + "step": 245160 + }, + { + "epoch": 10.15, + "grad_norm": 1.59375, + "learning_rate": 0.0003715342023156444, + "loss": 0.1718, + "step": 245170 + }, + { + "epoch": 10.16, + "grad_norm": 1.7578125, + "learning_rate": 0.00037152472477298027, + "loss": 0.1795, + "step": 245180 + }, + { + "epoch": 10.16, + "grad_norm": 0.796875, + "learning_rate": 0.0003715152470016198, + "loss": 0.2027, + "step": 245190 + }, + { + "epoch": 10.16, + "grad_norm": 0.78125, + "learning_rate": 0.0003715057690015806, + "loss": 0.148, + "step": 245200 + }, + { + "epoch": 10.16, + "grad_norm": 1.328125, + "learning_rate": 0.00037149629077288084, + "loss": 0.1716, + "step": 245210 + }, + { + "epoch": 10.16, + "grad_norm": 0.94140625, + "learning_rate": 0.00037148681231553805, + "loss": 0.2361, + "step": 245220 + }, + { + "epoch": 10.16, + "grad_norm": 0.68359375, + "learning_rate": 0.00037147733362957025, + "loss": 0.163, + "step": 245230 + }, + { + "epoch": 10.16, + "grad_norm": 0.953125, + "learning_rate": 0.00037146785471499523, + "loss": 0.2439, + "step": 245240 + }, + { + "epoch": 10.16, + "grad_norm": 0.57421875, + "learning_rate": 0.00037145837557183076, + "loss": 0.1831, + "step": 245250 + }, + { + "epoch": 10.16, + "grad_norm": 1.4921875, + "learning_rate": 0.0003714488962000948, + "loss": 0.1958, + "step": 245260 + }, + { + "epoch": 10.16, + "grad_norm": 2.0625, + "learning_rate": 0.00037143941659980516, + "loss": 0.2079, + "step": 245270 + }, + { + "epoch": 10.16, + "grad_norm": 0.6328125, + "learning_rate": 0.0003714299367709796, + "loss": 0.2436, + "step": 245280 + }, + { + "epoch": 10.16, + "grad_norm": 0.341796875, + "learning_rate": 0.00037142045671363606, + "loss": 0.1434, + "step": 245290 + }, + { + "epoch": 10.16, + "grad_norm": 0.79296875, + "learning_rate": 0.0003714109764277923, + "loss": 0.2556, + "step": 245300 + }, + { + "epoch": 10.16, + "grad_norm": 1.109375, + "learning_rate": 0.00037140149591346615, + "loss": 0.2116, + "step": 245310 + }, + { + "epoch": 10.16, + "grad_norm": 0.5, + "learning_rate": 0.0003713920151706756, + "loss": 0.1817, + "step": 245320 + }, + { + "epoch": 10.16, + "grad_norm": 0.69140625, + "learning_rate": 0.00037138253419943823, + "loss": 0.1734, + "step": 245330 + }, + { + "epoch": 10.16, + "grad_norm": 0.91796875, + "learning_rate": 0.0003713730529997722, + "loss": 0.2233, + "step": 245340 + }, + { + "epoch": 10.16, + "grad_norm": 1.171875, + "learning_rate": 0.0003713635715716952, + "loss": 0.1821, + "step": 245350 + }, + { + "epoch": 10.16, + "grad_norm": 0.5390625, + "learning_rate": 0.0003713540899152249, + "loss": 0.21, + "step": 245360 + }, + { + "epoch": 10.16, + "grad_norm": 0.8046875, + "learning_rate": 0.0003713446080303794, + "loss": 0.1954, + "step": 245370 + }, + { + "epoch": 10.16, + "grad_norm": 0.6640625, + "learning_rate": 0.0003713351259171765, + "loss": 0.1903, + "step": 245380 + }, + { + "epoch": 10.16, + "grad_norm": 0.4296875, + "learning_rate": 0.0003713256435756339, + "loss": 0.2109, + "step": 245390 + }, + { + "epoch": 10.16, + "grad_norm": 0.828125, + "learning_rate": 0.00037131616100576965, + "loss": 0.2105, + "step": 245400 + }, + { + "epoch": 10.16, + "grad_norm": 0.59765625, + "learning_rate": 0.00037130667820760137, + "loss": 0.1785, + "step": 245410 + }, + { + "epoch": 10.17, + "grad_norm": 0.78125, + "learning_rate": 0.0003712971951811471, + "loss": 0.1986, + "step": 245420 + }, + { + "epoch": 10.17, + "grad_norm": 0.2021484375, + "learning_rate": 0.00037128771192642464, + "loss": 0.1899, + "step": 245430 + }, + { + "epoch": 10.17, + "grad_norm": 0.5625, + "learning_rate": 0.0003712782284434517, + "loss": 0.1563, + "step": 245440 + }, + { + "epoch": 10.17, + "grad_norm": 1.2890625, + "learning_rate": 0.0003712687447322463, + "loss": 0.1943, + "step": 245450 + }, + { + "epoch": 10.17, + "grad_norm": 0.734375, + "learning_rate": 0.0003712592607928262, + "loss": 0.1476, + "step": 245460 + }, + { + "epoch": 10.17, + "grad_norm": 0.65625, + "learning_rate": 0.0003712497766252093, + "loss": 0.2446, + "step": 245470 + }, + { + "epoch": 10.17, + "grad_norm": 0.93359375, + "learning_rate": 0.00037124029222941336, + "loss": 0.1524, + "step": 245480 + }, + { + "epoch": 10.17, + "grad_norm": 1.4765625, + "learning_rate": 0.00037123080760545626, + "loss": 0.1787, + "step": 245490 + }, + { + "epoch": 10.17, + "grad_norm": 1.140625, + "learning_rate": 0.00037122132275335597, + "loss": 0.1871, + "step": 245500 + }, + { + "epoch": 10.17, + "grad_norm": 0.5859375, + "learning_rate": 0.0003712118376731302, + "loss": 0.1932, + "step": 245510 + }, + { + "epoch": 10.17, + "grad_norm": 0.55859375, + "learning_rate": 0.0003712023523647968, + "loss": 0.1744, + "step": 245520 + }, + { + "epoch": 10.17, + "grad_norm": 0.8515625, + "learning_rate": 0.00037119286682837373, + "loss": 0.2444, + "step": 245530 + }, + { + "epoch": 10.17, + "grad_norm": 0.54296875, + "learning_rate": 0.00037118338106387867, + "loss": 0.2003, + "step": 245540 + }, + { + "epoch": 10.17, + "grad_norm": 0.8984375, + "learning_rate": 0.00037117389507132964, + "loss": 0.17, + "step": 245550 + }, + { + "epoch": 10.17, + "grad_norm": 0.55078125, + "learning_rate": 0.00037116440885074444, + "loss": 0.1812, + "step": 245560 + }, + { + "epoch": 10.17, + "grad_norm": 0.8046875, + "learning_rate": 0.00037115492240214084, + "loss": 0.2366, + "step": 245570 + }, + { + "epoch": 10.17, + "grad_norm": 0.69921875, + "learning_rate": 0.00037114543572553684, + "loss": 0.2137, + "step": 245580 + }, + { + "epoch": 10.17, + "grad_norm": 0.7890625, + "learning_rate": 0.0003711359488209501, + "loss": 0.2133, + "step": 245590 + }, + { + "epoch": 10.17, + "grad_norm": 0.400390625, + "learning_rate": 0.0003711264616883987, + "loss": 0.1526, + "step": 245600 + }, + { + "epoch": 10.17, + "grad_norm": 0.7421875, + "learning_rate": 0.0003711169743279004, + "loss": 0.1853, + "step": 245610 + }, + { + "epoch": 10.17, + "grad_norm": 0.96875, + "learning_rate": 0.00037110748673947297, + "loss": 0.1916, + "step": 245620 + }, + { + "epoch": 10.17, + "grad_norm": 0.53125, + "learning_rate": 0.00037109799892313434, + "loss": 0.2128, + "step": 245630 + }, + { + "epoch": 10.17, + "grad_norm": 0.66796875, + "learning_rate": 0.0003710885108789024, + "loss": 0.2365, + "step": 245640 + }, + { + "epoch": 10.17, + "grad_norm": 1.2421875, + "learning_rate": 0.0003710790226067948, + "loss": 0.1966, + "step": 245650 + }, + { + "epoch": 10.18, + "grad_norm": 0.50390625, + "learning_rate": 0.00037106953410682975, + "loss": 0.145, + "step": 245660 + }, + { + "epoch": 10.18, + "grad_norm": 0.984375, + "learning_rate": 0.00037106004537902474, + "loss": 0.1489, + "step": 245670 + }, + { + "epoch": 10.18, + "grad_norm": 0.490234375, + "learning_rate": 0.0003710505564233979, + "loss": 0.1642, + "step": 245680 + }, + { + "epoch": 10.18, + "grad_norm": 0.5859375, + "learning_rate": 0.000371041067239967, + "loss": 0.1574, + "step": 245690 + }, + { + "epoch": 10.18, + "grad_norm": 0.98828125, + "learning_rate": 0.0003710315778287499, + "loss": 0.1938, + "step": 245700 + }, + { + "epoch": 10.18, + "grad_norm": 1.0625, + "learning_rate": 0.00037102208818976434, + "loss": 0.187, + "step": 245710 + }, + { + "epoch": 10.18, + "grad_norm": 0.44140625, + "learning_rate": 0.0003710125983230284, + "loss": 0.2549, + "step": 245720 + }, + { + "epoch": 10.18, + "grad_norm": 1.0390625, + "learning_rate": 0.00037100310822855964, + "loss": 0.1753, + "step": 245730 + }, + { + "epoch": 10.18, + "grad_norm": 0.73046875, + "learning_rate": 0.00037099361790637626, + "loss": 0.2136, + "step": 245740 + }, + { + "epoch": 10.18, + "grad_norm": 0.412109375, + "learning_rate": 0.0003709841273564959, + "loss": 0.1999, + "step": 245750 + }, + { + "epoch": 10.18, + "grad_norm": 0.859375, + "learning_rate": 0.00037097463657893647, + "loss": 0.2353, + "step": 245760 + }, + { + "epoch": 10.18, + "grad_norm": 0.66796875, + "learning_rate": 0.00037096514557371585, + "loss": 0.1801, + "step": 245770 + }, + { + "epoch": 10.18, + "grad_norm": 0.57421875, + "learning_rate": 0.00037095565434085186, + "loss": 0.2023, + "step": 245780 + }, + { + "epoch": 10.18, + "grad_norm": 1.171875, + "learning_rate": 0.00037094616288036245, + "loss": 0.1737, + "step": 245790 + }, + { + "epoch": 10.18, + "grad_norm": 0.68359375, + "learning_rate": 0.0003709366711922654, + "loss": 0.2102, + "step": 245800 + }, + { + "epoch": 10.18, + "grad_norm": 0.9609375, + "learning_rate": 0.0003709271792765786, + "loss": 0.1902, + "step": 245810 + }, + { + "epoch": 10.18, + "grad_norm": 1.1640625, + "learning_rate": 0.00037091768713331993, + "loss": 0.228, + "step": 245820 + }, + { + "epoch": 10.18, + "grad_norm": 0.9296875, + "learning_rate": 0.00037090819476250715, + "loss": 0.22, + "step": 245830 + }, + { + "epoch": 10.18, + "grad_norm": 0.9921875, + "learning_rate": 0.00037089870216415824, + "loss": 0.2121, + "step": 245840 + }, + { + "epoch": 10.18, + "grad_norm": 0.65625, + "learning_rate": 0.00037088920933829107, + "loss": 0.2292, + "step": 245850 + }, + { + "epoch": 10.18, + "grad_norm": 0.388671875, + "learning_rate": 0.0003708797162849235, + "loss": 0.2076, + "step": 245860 + }, + { + "epoch": 10.18, + "grad_norm": 0.453125, + "learning_rate": 0.0003708702230040732, + "loss": 0.1783, + "step": 245870 + }, + { + "epoch": 10.18, + "grad_norm": 0.9765625, + "learning_rate": 0.00037086072949575827, + "loss": 0.1939, + "step": 245880 + }, + { + "epoch": 10.18, + "grad_norm": 0.546875, + "learning_rate": 0.00037085123575999654, + "loss": 0.1994, + "step": 245890 + }, + { + "epoch": 10.19, + "grad_norm": 1.28125, + "learning_rate": 0.0003708417417968058, + "loss": 0.2529, + "step": 245900 + }, + { + "epoch": 10.19, + "grad_norm": 0.671875, + "learning_rate": 0.0003708322476062039, + "loss": 0.1609, + "step": 245910 + }, + { + "epoch": 10.19, + "grad_norm": 0.890625, + "learning_rate": 0.00037082275318820883, + "loss": 0.2107, + "step": 245920 + }, + { + "epoch": 10.19, + "grad_norm": 0.2890625, + "learning_rate": 0.0003708132585428384, + "loss": 0.1894, + "step": 245930 + }, + { + "epoch": 10.19, + "grad_norm": 0.38671875, + "learning_rate": 0.0003708037636701105, + "loss": 0.2156, + "step": 245940 + }, + { + "epoch": 10.19, + "grad_norm": 0.609375, + "learning_rate": 0.00037079426857004286, + "loss": 0.2556, + "step": 245950 + }, + { + "epoch": 10.19, + "grad_norm": 0.78515625, + "learning_rate": 0.0003707847732426535, + "loss": 0.1965, + "step": 245960 + }, + { + "epoch": 10.19, + "grad_norm": 1.328125, + "learning_rate": 0.00037077527768796014, + "loss": 0.1922, + "step": 245970 + }, + { + "epoch": 10.19, + "grad_norm": 0.9140625, + "learning_rate": 0.00037076578190598094, + "loss": 0.1865, + "step": 245980 + }, + { + "epoch": 10.19, + "grad_norm": 0.890625, + "learning_rate": 0.00037075628589673345, + "loss": 0.2039, + "step": 245990 + }, + { + "epoch": 10.19, + "grad_norm": 0.6875, + "learning_rate": 0.00037074678966023567, + "loss": 0.1967, + "step": 246000 + }, + { + "epoch": 10.19, + "grad_norm": 1.359375, + "learning_rate": 0.0003707372931965055, + "loss": 0.2179, + "step": 246010 + }, + { + "epoch": 10.19, + "grad_norm": 0.314453125, + "learning_rate": 0.00037072779650556084, + "loss": 0.2196, + "step": 246020 + }, + { + "epoch": 10.19, + "grad_norm": 1.28125, + "learning_rate": 0.0003707182995874194, + "loss": 0.2267, + "step": 246030 + }, + { + "epoch": 10.19, + "grad_norm": 0.82421875, + "learning_rate": 0.00037070880244209923, + "loss": 0.2321, + "step": 246040 + }, + { + "epoch": 10.19, + "grad_norm": 0.83203125, + "learning_rate": 0.00037069930506961806, + "loss": 0.2124, + "step": 246050 + }, + { + "epoch": 10.19, + "grad_norm": 0.60546875, + "learning_rate": 0.0003706898074699939, + "loss": 0.1917, + "step": 246060 + }, + { + "epoch": 10.19, + "grad_norm": 0.9609375, + "learning_rate": 0.00037068030964324455, + "loss": 0.2136, + "step": 246070 + }, + { + "epoch": 10.19, + "grad_norm": 0.78515625, + "learning_rate": 0.00037067081158938784, + "loss": 0.2177, + "step": 246080 + }, + { + "epoch": 10.19, + "grad_norm": 0.74609375, + "learning_rate": 0.0003706613133084418, + "loss": 0.143, + "step": 246090 + }, + { + "epoch": 10.19, + "grad_norm": 0.259765625, + "learning_rate": 0.00037065181480042407, + "loss": 0.2121, + "step": 246100 + }, + { + "epoch": 10.19, + "grad_norm": 0.54296875, + "learning_rate": 0.0003706423160653528, + "loss": 0.1818, + "step": 246110 + }, + { + "epoch": 10.19, + "grad_norm": 0.85546875, + "learning_rate": 0.0003706328171032456, + "loss": 0.1879, + "step": 246120 + }, + { + "epoch": 10.19, + "grad_norm": 0.9453125, + "learning_rate": 0.0003706233179141205, + "loss": 0.1975, + "step": 246130 + }, + { + "epoch": 10.2, + "grad_norm": 0.51953125, + "learning_rate": 0.00037061381849799524, + "loss": 0.1925, + "step": 246140 + }, + { + "epoch": 10.2, + "grad_norm": 0.48828125, + "learning_rate": 0.000370604318854888, + "loss": 0.1722, + "step": 246150 + }, + { + "epoch": 10.2, + "grad_norm": 0.28515625, + "learning_rate": 0.0003705948189848163, + "loss": 0.2503, + "step": 246160 + }, + { + "epoch": 10.2, + "grad_norm": 0.82421875, + "learning_rate": 0.0003705853188877982, + "loss": 0.1919, + "step": 246170 + }, + { + "epoch": 10.2, + "grad_norm": 0.53125, + "learning_rate": 0.00037057581856385157, + "loss": 0.2407, + "step": 246180 + }, + { + "epoch": 10.2, + "grad_norm": 0.9375, + "learning_rate": 0.0003705663180129943, + "loss": 0.1741, + "step": 246190 + }, + { + "epoch": 10.2, + "grad_norm": 1.0703125, + "learning_rate": 0.00037055681723524423, + "loss": 0.2119, + "step": 246200 + }, + { + "epoch": 10.2, + "grad_norm": 0.9375, + "learning_rate": 0.0003705473162306192, + "loss": 0.1749, + "step": 246210 + }, + { + "epoch": 10.2, + "grad_norm": 0.890625, + "learning_rate": 0.0003705378149991372, + "loss": 0.2051, + "step": 246220 + }, + { + "epoch": 10.2, + "grad_norm": 0.9609375, + "learning_rate": 0.00037052831354081605, + "loss": 0.1706, + "step": 246230 + }, + { + "epoch": 10.2, + "grad_norm": 0.8828125, + "learning_rate": 0.0003705188118556736, + "loss": 0.2, + "step": 246240 + }, + { + "epoch": 10.2, + "grad_norm": 1.203125, + "learning_rate": 0.0003705093099437278, + "loss": 0.1813, + "step": 246250 + }, + { + "epoch": 10.2, + "grad_norm": 0.7265625, + "learning_rate": 0.0003704998078049964, + "loss": 0.2692, + "step": 246260 + }, + { + "epoch": 10.2, + "grad_norm": 0.671875, + "learning_rate": 0.0003704903054394975, + "loss": 0.1479, + "step": 246270 + }, + { + "epoch": 10.2, + "grad_norm": 0.4609375, + "learning_rate": 0.0003704808028472487, + "loss": 0.1973, + "step": 246280 + }, + { + "epoch": 10.2, + "grad_norm": 0.4765625, + "learning_rate": 0.00037047130002826816, + "loss": 0.179, + "step": 246290 + }, + { + "epoch": 10.2, + "grad_norm": 0.70703125, + "learning_rate": 0.0003704617969825736, + "loss": 0.1502, + "step": 246300 + }, + { + "epoch": 10.2, + "grad_norm": 0.91015625, + "learning_rate": 0.000370452293710183, + "loss": 0.2074, + "step": 246310 + }, + { + "epoch": 10.2, + "grad_norm": 1.125, + "learning_rate": 0.00037044279021111417, + "loss": 0.2083, + "step": 246320 + }, + { + "epoch": 10.2, + "grad_norm": 0.94140625, + "learning_rate": 0.00037043328648538493, + "loss": 0.212, + "step": 246330 + }, + { + "epoch": 10.2, + "grad_norm": 0.87109375, + "learning_rate": 0.00037042378253301336, + "loss": 0.2366, + "step": 246340 + }, + { + "epoch": 10.2, + "grad_norm": 0.6640625, + "learning_rate": 0.00037041427835401714, + "loss": 0.2515, + "step": 246350 + }, + { + "epoch": 10.2, + "grad_norm": 0.71875, + "learning_rate": 0.0003704047739484143, + "loss": 0.2477, + "step": 246360 + }, + { + "epoch": 10.2, + "grad_norm": 0.83203125, + "learning_rate": 0.0003703952693162227, + "loss": 0.1764, + "step": 246370 + }, + { + "epoch": 10.21, + "grad_norm": 0.4921875, + "learning_rate": 0.0003703857644574602, + "loss": 0.1817, + "step": 246380 + }, + { + "epoch": 10.21, + "grad_norm": 0.51171875, + "learning_rate": 0.0003703762593721447, + "loss": 0.1555, + "step": 246390 + }, + { + "epoch": 10.21, + "grad_norm": 0.455078125, + "learning_rate": 0.0003703667540602941, + "loss": 0.1946, + "step": 246400 + }, + { + "epoch": 10.21, + "grad_norm": 0.75390625, + "learning_rate": 0.00037035724852192616, + "loss": 0.1614, + "step": 246410 + }, + { + "epoch": 10.21, + "grad_norm": 0.6875, + "learning_rate": 0.00037034774275705897, + "loss": 0.2216, + "step": 246420 + }, + { + "epoch": 10.21, + "grad_norm": 1.140625, + "learning_rate": 0.00037033823676571023, + "loss": 0.2103, + "step": 246430 + }, + { + "epoch": 10.21, + "grad_norm": 0.796875, + "learning_rate": 0.00037032873054789806, + "loss": 0.164, + "step": 246440 + }, + { + "epoch": 10.21, + "grad_norm": 0.60546875, + "learning_rate": 0.0003703192241036401, + "loss": 0.2036, + "step": 246450 + }, + { + "epoch": 10.21, + "grad_norm": 1.15625, + "learning_rate": 0.0003703097174329544, + "loss": 0.215, + "step": 246460 + }, + { + "epoch": 10.21, + "grad_norm": 0.55078125, + "learning_rate": 0.0003703002105358588, + "loss": 0.1799, + "step": 246470 + }, + { + "epoch": 10.21, + "grad_norm": 0.4296875, + "learning_rate": 0.0003702907034123712, + "loss": 0.1771, + "step": 246480 + }, + { + "epoch": 10.21, + "grad_norm": 0.53125, + "learning_rate": 0.00037028119606250946, + "loss": 0.2024, + "step": 246490 + }, + { + "epoch": 10.21, + "grad_norm": 2.078125, + "learning_rate": 0.00037027168848629156, + "loss": 0.1778, + "step": 246500 + }, + { + "epoch": 10.21, + "grad_norm": 0.640625, + "learning_rate": 0.0003702621806837352, + "loss": 0.2037, + "step": 246510 + }, + { + "epoch": 10.21, + "grad_norm": 0.6015625, + "learning_rate": 0.0003702526726548586, + "loss": 0.208, + "step": 246520 + }, + { + "epoch": 10.21, + "grad_norm": 0.66796875, + "learning_rate": 0.00037024316439967933, + "loss": 0.2032, + "step": 246530 + }, + { + "epoch": 10.21, + "grad_norm": 0.76953125, + "learning_rate": 0.0003702336559182154, + "loss": 0.215, + "step": 246540 + }, + { + "epoch": 10.21, + "grad_norm": 0.7734375, + "learning_rate": 0.0003702241472104848, + "loss": 0.1742, + "step": 246550 + }, + { + "epoch": 10.21, + "grad_norm": 0.62890625, + "learning_rate": 0.0003702146382765052, + "loss": 0.1702, + "step": 246560 + }, + { + "epoch": 10.21, + "grad_norm": 0.609375, + "learning_rate": 0.0003702051291162948, + "loss": 0.2137, + "step": 246570 + }, + { + "epoch": 10.21, + "grad_norm": 1.7421875, + "learning_rate": 0.0003701956197298712, + "loss": 0.2218, + "step": 246580 + }, + { + "epoch": 10.21, + "grad_norm": 0.7734375, + "learning_rate": 0.00037018611011725244, + "loss": 0.241, + "step": 246590 + }, + { + "epoch": 10.21, + "grad_norm": 1.25, + "learning_rate": 0.00037017660027845645, + "loss": 0.2208, + "step": 246600 + }, + { + "epoch": 10.21, + "grad_norm": 0.57421875, + "learning_rate": 0.000370167090213501, + "loss": 0.1862, + "step": 246610 + }, + { + "epoch": 10.21, + "grad_norm": 0.69921875, + "learning_rate": 0.0003701575799224042, + "loss": 0.2271, + "step": 246620 + }, + { + "epoch": 10.22, + "grad_norm": 0.6796875, + "learning_rate": 0.0003701480694051837, + "loss": 0.1612, + "step": 246630 + }, + { + "epoch": 10.22, + "grad_norm": 0.578125, + "learning_rate": 0.0003701385586618575, + "loss": 0.1682, + "step": 246640 + }, + { + "epoch": 10.22, + "grad_norm": 1.265625, + "learning_rate": 0.0003701290476924436, + "loss": 0.1371, + "step": 246650 + }, + { + "epoch": 10.22, + "grad_norm": 0.96875, + "learning_rate": 0.00037011953649695973, + "loss": 0.1977, + "step": 246660 + }, + { + "epoch": 10.22, + "grad_norm": 0.6015625, + "learning_rate": 0.0003701100250754238, + "loss": 0.161, + "step": 246670 + }, + { + "epoch": 10.22, + "grad_norm": 0.90625, + "learning_rate": 0.00037010051342785394, + "loss": 0.2067, + "step": 246680 + }, + { + "epoch": 10.22, + "grad_norm": 0.5, + "learning_rate": 0.00037009100155426775, + "loss": 0.1843, + "step": 246690 + }, + { + "epoch": 10.22, + "grad_norm": 0.92578125, + "learning_rate": 0.00037008148945468335, + "loss": 0.1813, + "step": 246700 + }, + { + "epoch": 10.22, + "grad_norm": 0.9609375, + "learning_rate": 0.00037007197712911853, + "loss": 0.2168, + "step": 246710 + }, + { + "epoch": 10.22, + "grad_norm": 0.62890625, + "learning_rate": 0.00037006246457759113, + "loss": 0.1959, + "step": 246720 + }, + { + "epoch": 10.22, + "grad_norm": 1.046875, + "learning_rate": 0.00037005295180011924, + "loss": 0.2428, + "step": 246730 + }, + { + "epoch": 10.22, + "grad_norm": 2.796875, + "learning_rate": 0.00037004343879672066, + "loss": 0.1845, + "step": 246740 + }, + { + "epoch": 10.22, + "grad_norm": 1.1796875, + "learning_rate": 0.0003700339255674132, + "loss": 0.1942, + "step": 246750 + }, + { + "epoch": 10.22, + "grad_norm": 1.4609375, + "learning_rate": 0.00037002441211221495, + "loss": 0.2114, + "step": 246760 + }, + { + "epoch": 10.22, + "grad_norm": 0.73828125, + "learning_rate": 0.0003700148984311436, + "loss": 0.1551, + "step": 246770 + }, + { + "epoch": 10.22, + "grad_norm": 0.76953125, + "learning_rate": 0.00037000538452421734, + "loss": 0.2235, + "step": 246780 + }, + { + "epoch": 10.22, + "grad_norm": 0.462890625, + "learning_rate": 0.0003699958703914538, + "loss": 0.2489, + "step": 246790 + }, + { + "epoch": 10.22, + "grad_norm": 0.435546875, + "learning_rate": 0.000369986356032871, + "loss": 0.1689, + "step": 246800 + }, + { + "epoch": 10.22, + "grad_norm": 0.58203125, + "learning_rate": 0.0003699768414484869, + "loss": 0.2079, + "step": 246810 + }, + { + "epoch": 10.22, + "grad_norm": 0.439453125, + "learning_rate": 0.0003699673266383192, + "loss": 0.1898, + "step": 246820 + }, + { + "epoch": 10.22, + "grad_norm": 0.79296875, + "learning_rate": 0.00036995781160238607, + "loss": 0.2232, + "step": 246830 + }, + { + "epoch": 10.22, + "grad_norm": 0.67578125, + "learning_rate": 0.0003699482963407053, + "loss": 0.196, + "step": 246840 + }, + { + "epoch": 10.22, + "grad_norm": 1.1796875, + "learning_rate": 0.00036993878085329467, + "loss": 0.2191, + "step": 246850 + }, + { + "epoch": 10.22, + "grad_norm": 0.37109375, + "learning_rate": 0.0003699292651401723, + "loss": 0.1922, + "step": 246860 + }, + { + "epoch": 10.23, + "grad_norm": 0.859375, + "learning_rate": 0.00036991974920135603, + "loss": 0.194, + "step": 246870 + }, + { + "epoch": 10.23, + "grad_norm": 1.1171875, + "learning_rate": 0.00036991023303686366, + "loss": 0.2633, + "step": 246880 + }, + { + "epoch": 10.23, + "grad_norm": 1.203125, + "learning_rate": 0.0003699007166467132, + "loss": 0.2234, + "step": 246890 + }, + { + "epoch": 10.23, + "grad_norm": 1.03125, + "learning_rate": 0.00036989120003092256, + "loss": 0.1927, + "step": 246900 + }, + { + "epoch": 10.23, + "grad_norm": 0.83203125, + "learning_rate": 0.00036988168318950965, + "loss": 0.1988, + "step": 246910 + }, + { + "epoch": 10.23, + "grad_norm": 0.68359375, + "learning_rate": 0.0003698721661224924, + "loss": 0.2049, + "step": 246920 + }, + { + "epoch": 10.23, + "grad_norm": 0.9140625, + "learning_rate": 0.0003698626488298885, + "loss": 0.1787, + "step": 246930 + }, + { + "epoch": 10.23, + "grad_norm": 0.83984375, + "learning_rate": 0.0003698531313117162, + "loss": 0.2558, + "step": 246940 + }, + { + "epoch": 10.23, + "grad_norm": 1.5078125, + "learning_rate": 0.00036984361356799325, + "loss": 0.2137, + "step": 246950 + }, + { + "epoch": 10.23, + "grad_norm": 0.80859375, + "learning_rate": 0.0003698340955987374, + "loss": 0.1688, + "step": 246960 + }, + { + "epoch": 10.23, + "grad_norm": 0.453125, + "learning_rate": 0.0003698245774039669, + "loss": 0.1706, + "step": 246970 + }, + { + "epoch": 10.23, + "grad_norm": 0.66796875, + "learning_rate": 0.00036981505898369944, + "loss": 0.1862, + "step": 246980 + }, + { + "epoch": 10.23, + "grad_norm": 0.5546875, + "learning_rate": 0.00036980554033795294, + "loss": 0.1627, + "step": 246990 + }, + { + "epoch": 10.23, + "grad_norm": 1.8671875, + "learning_rate": 0.0003697960214667454, + "loss": 0.2067, + "step": 247000 + }, + { + "epoch": 10.23, + "grad_norm": 0.5546875, + "learning_rate": 0.00036978650237009457, + "loss": 0.2505, + "step": 247010 + }, + { + "epoch": 10.23, + "grad_norm": 0.5390625, + "learning_rate": 0.0003697769830480186, + "loss": 0.2251, + "step": 247020 + }, + { + "epoch": 10.23, + "grad_norm": 1.3984375, + "learning_rate": 0.00036976746350053533, + "loss": 0.2375, + "step": 247030 + }, + { + "epoch": 10.23, + "grad_norm": 0.375, + "learning_rate": 0.0003697579437276625, + "loss": 0.158, + "step": 247040 + }, + { + "epoch": 10.23, + "grad_norm": 1.1015625, + "learning_rate": 0.0003697484237294182, + "loss": 0.1932, + "step": 247050 + }, + { + "epoch": 10.23, + "grad_norm": 0.640625, + "learning_rate": 0.0003697389035058203, + "loss": 0.2289, + "step": 247060 + }, + { + "epoch": 10.23, + "grad_norm": 0.7890625, + "learning_rate": 0.0003697293830568866, + "loss": 0.2245, + "step": 247070 + }, + { + "epoch": 10.23, + "grad_norm": 0.921875, + "learning_rate": 0.0003697198623826353, + "loss": 0.1807, + "step": 247080 + }, + { + "epoch": 10.23, + "grad_norm": 0.8515625, + "learning_rate": 0.0003697103414830841, + "loss": 0.1941, + "step": 247090 + }, + { + "epoch": 10.23, + "grad_norm": 0.9375, + "learning_rate": 0.00036970082035825094, + "loss": 0.1792, + "step": 247100 + }, + { + "epoch": 10.24, + "grad_norm": 0.86328125, + "learning_rate": 0.0003696912990081538, + "loss": 0.1839, + "step": 247110 + }, + { + "epoch": 10.24, + "grad_norm": 0.56640625, + "learning_rate": 0.0003696817774328105, + "loss": 0.1878, + "step": 247120 + }, + { + "epoch": 10.24, + "grad_norm": 0.421875, + "learning_rate": 0.0003696722556322391, + "loss": 0.225, + "step": 247130 + }, + { + "epoch": 10.24, + "grad_norm": 1.8359375, + "learning_rate": 0.00036966273360645736, + "loss": 0.1941, + "step": 247140 + }, + { + "epoch": 10.24, + "grad_norm": 0.9921875, + "learning_rate": 0.0003696532113554833, + "loss": 0.1909, + "step": 247150 + }, + { + "epoch": 10.24, + "grad_norm": 1.7109375, + "learning_rate": 0.0003696436888793349, + "loss": 0.2537, + "step": 247160 + }, + { + "epoch": 10.24, + "grad_norm": 0.455078125, + "learning_rate": 0.0003696341661780298, + "loss": 0.1795, + "step": 247170 + }, + { + "epoch": 10.24, + "grad_norm": 0.1640625, + "learning_rate": 0.0003696246432515863, + "loss": 0.1613, + "step": 247180 + }, + { + "epoch": 10.24, + "grad_norm": 0.65234375, + "learning_rate": 0.00036961512010002205, + "loss": 0.1969, + "step": 247190 + }, + { + "epoch": 10.24, + "grad_norm": 0.4921875, + "learning_rate": 0.00036960559672335515, + "loss": 0.1916, + "step": 247200 + }, + { + "epoch": 10.24, + "grad_norm": 1.546875, + "learning_rate": 0.00036959607312160337, + "loss": 0.1973, + "step": 247210 + }, + { + "epoch": 10.24, + "grad_norm": 1.3046875, + "learning_rate": 0.00036958654929478477, + "loss": 0.2068, + "step": 247220 + }, + { + "epoch": 10.24, + "grad_norm": 1.03125, + "learning_rate": 0.0003695770252429171, + "loss": 0.2316, + "step": 247230 + }, + { + "epoch": 10.24, + "grad_norm": 1.0390625, + "learning_rate": 0.00036956750096601846, + "loss": 0.1856, + "step": 247240 + }, + { + "epoch": 10.24, + "grad_norm": 0.89453125, + "learning_rate": 0.0003695579764641065, + "loss": 0.1717, + "step": 247250 + }, + { + "epoch": 10.24, + "grad_norm": 1.3671875, + "learning_rate": 0.0003695484517371996, + "loss": 0.1918, + "step": 247260 + }, + { + "epoch": 10.24, + "grad_norm": 0.408203125, + "learning_rate": 0.00036953892678531525, + "loss": 0.2253, + "step": 247270 + }, + { + "epoch": 10.24, + "grad_norm": 1.828125, + "learning_rate": 0.0003695294016084716, + "loss": 0.1591, + "step": 247280 + }, + { + "epoch": 10.24, + "grad_norm": 0.6796875, + "learning_rate": 0.0003695198762066866, + "loss": 0.1991, + "step": 247290 + }, + { + "epoch": 10.24, + "grad_norm": 0.86328125, + "learning_rate": 0.000369510350579978, + "loss": 0.2124, + "step": 247300 + }, + { + "epoch": 10.24, + "grad_norm": 1.7734375, + "learning_rate": 0.00036950082472836387, + "loss": 0.183, + "step": 247310 + }, + { + "epoch": 10.24, + "grad_norm": 0.75390625, + "learning_rate": 0.0003694912986518621, + "loss": 0.1745, + "step": 247320 + }, + { + "epoch": 10.24, + "grad_norm": 1.0390625, + "learning_rate": 0.0003694817723504906, + "loss": 0.218, + "step": 247330 + }, + { + "epoch": 10.24, + "grad_norm": 0.55859375, + "learning_rate": 0.00036947224582426733, + "loss": 0.2383, + "step": 247340 + }, + { + "epoch": 10.25, + "grad_norm": 0.84375, + "learning_rate": 0.00036946271907321014, + "loss": 0.2175, + "step": 247350 + }, + { + "epoch": 10.25, + "grad_norm": 0.88671875, + "learning_rate": 0.000369453192097337, + "loss": 0.1991, + "step": 247360 + }, + { + "epoch": 10.25, + "grad_norm": 0.71484375, + "learning_rate": 0.00036944366489666594, + "loss": 0.1821, + "step": 247370 + }, + { + "epoch": 10.25, + "grad_norm": 1.484375, + "learning_rate": 0.0003694341374712147, + "loss": 0.1812, + "step": 247380 + }, + { + "epoch": 10.25, + "grad_norm": 1.1328125, + "learning_rate": 0.0003694246098210014, + "loss": 0.2035, + "step": 247390 + }, + { + "epoch": 10.25, + "grad_norm": 1.1953125, + "learning_rate": 0.00036941508194604386, + "loss": 0.1765, + "step": 247400 + }, + { + "epoch": 10.25, + "grad_norm": 0.98828125, + "learning_rate": 0.00036940555384636, + "loss": 0.1358, + "step": 247410 + }, + { + "epoch": 10.25, + "grad_norm": 0.5625, + "learning_rate": 0.0003693960255219678, + "loss": 0.144, + "step": 247420 + }, + { + "epoch": 10.25, + "grad_norm": 0.240234375, + "learning_rate": 0.0003693864969728852, + "loss": 0.1609, + "step": 247430 + }, + { + "epoch": 10.25, + "grad_norm": 1.59375, + "learning_rate": 0.00036937696819913006, + "loss": 0.1561, + "step": 247440 + }, + { + "epoch": 10.25, + "grad_norm": 1.8671875, + "learning_rate": 0.00036936743920072034, + "loss": 0.1629, + "step": 247450 + }, + { + "epoch": 10.25, + "grad_norm": 0.77734375, + "learning_rate": 0.0003693579099776741, + "loss": 0.1796, + "step": 247460 + }, + { + "epoch": 10.25, + "grad_norm": 0.92578125, + "learning_rate": 0.00036934838053000906, + "loss": 0.178, + "step": 247470 + }, + { + "epoch": 10.25, + "grad_norm": 0.921875, + "learning_rate": 0.0003693388508577433, + "loss": 0.1679, + "step": 247480 + }, + { + "epoch": 10.25, + "grad_norm": 0.4765625, + "learning_rate": 0.0003693293209608947, + "loss": 0.2435, + "step": 247490 + }, + { + "epoch": 10.25, + "grad_norm": 0.65625, + "learning_rate": 0.0003693197908394812, + "loss": 0.2102, + "step": 247500 + }, + { + "epoch": 10.25, + "grad_norm": 0.859375, + "learning_rate": 0.0003693102604935207, + "loss": 0.2293, + "step": 247510 + }, + { + "epoch": 10.25, + "grad_norm": 0.79296875, + "learning_rate": 0.0003693007299230313, + "loss": 0.2153, + "step": 247520 + }, + { + "epoch": 10.25, + "grad_norm": 0.451171875, + "learning_rate": 0.0003692911991280307, + "loss": 0.228, + "step": 247530 + }, + { + "epoch": 10.25, + "grad_norm": 0.376953125, + "learning_rate": 0.000369281668108537, + "loss": 0.2134, + "step": 247540 + }, + { + "epoch": 10.25, + "grad_norm": 0.8515625, + "learning_rate": 0.000369272136864568, + "loss": 0.1918, + "step": 247550 + }, + { + "epoch": 10.25, + "grad_norm": 0.8671875, + "learning_rate": 0.0003692626053961418, + "loss": 0.1636, + "step": 247560 + }, + { + "epoch": 10.25, + "grad_norm": 0.74609375, + "learning_rate": 0.0003692530737032762, + "loss": 0.2209, + "step": 247570 + }, + { + "epoch": 10.25, + "grad_norm": 0.72265625, + "learning_rate": 0.00036924354178598923, + "loss": 0.1964, + "step": 247580 + }, + { + "epoch": 10.26, + "grad_norm": 1.015625, + "learning_rate": 0.0003692340096442988, + "loss": 0.1628, + "step": 247590 + }, + { + "epoch": 10.26, + "grad_norm": 1.1953125, + "learning_rate": 0.0003692244772782228, + "loss": 0.1642, + "step": 247600 + }, + { + "epoch": 10.26, + "grad_norm": 0.91796875, + "learning_rate": 0.0003692149446877792, + "loss": 0.1775, + "step": 247610 + }, + { + "epoch": 10.26, + "grad_norm": 0.82421875, + "learning_rate": 0.000369205411872986, + "loss": 0.2114, + "step": 247620 + }, + { + "epoch": 10.26, + "grad_norm": 1.828125, + "learning_rate": 0.00036919587883386105, + "loss": 0.2277, + "step": 247630 + }, + { + "epoch": 10.26, + "grad_norm": 0.53125, + "learning_rate": 0.00036918634557042235, + "loss": 0.2188, + "step": 247640 + }, + { + "epoch": 10.26, + "grad_norm": 0.26171875, + "learning_rate": 0.00036917681208268786, + "loss": 0.1756, + "step": 247650 + }, + { + "epoch": 10.26, + "grad_norm": 0.953125, + "learning_rate": 0.00036916727837067536, + "loss": 0.2293, + "step": 247660 + }, + { + "epoch": 10.26, + "grad_norm": 0.2236328125, + "learning_rate": 0.000369157744434403, + "loss": 0.1791, + "step": 247670 + }, + { + "epoch": 10.26, + "grad_norm": 0.828125, + "learning_rate": 0.0003691482102738886, + "loss": 0.2224, + "step": 247680 + }, + { + "epoch": 10.26, + "grad_norm": 0.455078125, + "learning_rate": 0.0003691386758891502, + "loss": 0.1657, + "step": 247690 + }, + { + "epoch": 10.26, + "grad_norm": 0.4140625, + "learning_rate": 0.00036912914128020563, + "loss": 0.1383, + "step": 247700 + }, + { + "epoch": 10.26, + "grad_norm": 0.5234375, + "learning_rate": 0.0003691196064470729, + "loss": 0.2012, + "step": 247710 + }, + { + "epoch": 10.26, + "grad_norm": 0.703125, + "learning_rate": 0.0003691100713897699, + "loss": 0.2392, + "step": 247720 + }, + { + "epoch": 10.26, + "grad_norm": 0.35546875, + "learning_rate": 0.0003691005361083146, + "loss": 0.1463, + "step": 247730 + }, + { + "epoch": 10.26, + "grad_norm": 1.40625, + "learning_rate": 0.00036909100060272495, + "loss": 0.2289, + "step": 247740 + }, + { + "epoch": 10.26, + "grad_norm": 1.3125, + "learning_rate": 0.000369081464873019, + "loss": 0.1727, + "step": 247750 + }, + { + "epoch": 10.26, + "grad_norm": 0.43359375, + "learning_rate": 0.0003690719289192145, + "loss": 0.1989, + "step": 247760 + }, + { + "epoch": 10.26, + "grad_norm": 0.5859375, + "learning_rate": 0.00036906239274132945, + "loss": 0.1752, + "step": 247770 + }, + { + "epoch": 10.26, + "grad_norm": 0.59765625, + "learning_rate": 0.000369052856339382, + "loss": 0.188, + "step": 247780 + }, + { + "epoch": 10.26, + "grad_norm": 0.58984375, + "learning_rate": 0.00036904331971338977, + "loss": 0.1514, + "step": 247790 + }, + { + "epoch": 10.26, + "grad_norm": 0.7578125, + "learning_rate": 0.0003690337828633709, + "loss": 0.1878, + "step": 247800 + }, + { + "epoch": 10.26, + "grad_norm": 0.5859375, + "learning_rate": 0.0003690242457893433, + "loss": 0.1798, + "step": 247810 + }, + { + "epoch": 10.26, + "grad_norm": 1.1640625, + "learning_rate": 0.00036901470849132494, + "loss": 0.2481, + "step": 247820 + }, + { + "epoch": 10.27, + "grad_norm": 1.1328125, + "learning_rate": 0.0003690051709693338, + "loss": 0.1752, + "step": 247830 + }, + { + "epoch": 10.27, + "grad_norm": 0.416015625, + "learning_rate": 0.0003689956332233877, + "loss": 0.2196, + "step": 247840 + }, + { + "epoch": 10.27, + "grad_norm": 0.74609375, + "learning_rate": 0.00036898609525350476, + "loss": 0.1794, + "step": 247850 + }, + { + "epoch": 10.27, + "grad_norm": 0.6875, + "learning_rate": 0.0003689765570597028, + "loss": 0.1692, + "step": 247860 + }, + { + "epoch": 10.27, + "grad_norm": 0.9375, + "learning_rate": 0.0003689670186419997, + "loss": 0.1819, + "step": 247870 + }, + { + "epoch": 10.27, + "grad_norm": 0.50390625, + "learning_rate": 0.00036895748000041365, + "loss": 0.1897, + "step": 247880 + }, + { + "epoch": 10.27, + "grad_norm": 0.60546875, + "learning_rate": 0.00036894794113496245, + "loss": 0.2025, + "step": 247890 + }, + { + "epoch": 10.27, + "grad_norm": 0.453125, + "learning_rate": 0.00036893840204566404, + "loss": 0.1563, + "step": 247900 + }, + { + "epoch": 10.27, + "grad_norm": 1.8359375, + "learning_rate": 0.00036892886273253643, + "loss": 0.1906, + "step": 247910 + }, + { + "epoch": 10.27, + "grad_norm": 0.337890625, + "learning_rate": 0.0003689193231955975, + "loss": 0.161, + "step": 247920 + }, + { + "epoch": 10.27, + "grad_norm": 0.73828125, + "learning_rate": 0.00036890978343486527, + "loss": 0.2043, + "step": 247930 + }, + { + "epoch": 10.27, + "grad_norm": 0.921875, + "learning_rate": 0.0003689002434503577, + "loss": 0.1872, + "step": 247940 + }, + { + "epoch": 10.27, + "grad_norm": 0.08544921875, + "learning_rate": 0.0003688907032420926, + "loss": 0.1841, + "step": 247950 + }, + { + "epoch": 10.27, + "grad_norm": 0.6171875, + "learning_rate": 0.0003688811628100882, + "loss": 0.1868, + "step": 247960 + }, + { + "epoch": 10.27, + "grad_norm": 1.140625, + "learning_rate": 0.0003688716221543622, + "loss": 0.1666, + "step": 247970 + }, + { + "epoch": 10.27, + "grad_norm": 0.8203125, + "learning_rate": 0.0003688620812749326, + "loss": 0.2212, + "step": 247980 + }, + { + "epoch": 10.27, + "grad_norm": 1.3671875, + "learning_rate": 0.0003688525401718175, + "loss": 0.2063, + "step": 247990 + }, + { + "epoch": 10.27, + "grad_norm": 0.515625, + "learning_rate": 0.0003688429988450347, + "loss": 0.2287, + "step": 248000 + }, + { + "epoch": 10.27, + "grad_norm": 0.5859375, + "learning_rate": 0.0003688334572946022, + "loss": 0.1983, + "step": 248010 + }, + { + "epoch": 10.27, + "grad_norm": 1.125, + "learning_rate": 0.00036882391552053796, + "loss": 0.2276, + "step": 248020 + }, + { + "epoch": 10.27, + "grad_norm": 0.765625, + "learning_rate": 0.00036881437352285996, + "loss": 0.1607, + "step": 248030 + }, + { + "epoch": 10.27, + "grad_norm": 0.94140625, + "learning_rate": 0.00036880483130158614, + "loss": 0.188, + "step": 248040 + }, + { + "epoch": 10.27, + "grad_norm": 0.56640625, + "learning_rate": 0.0003687952888567345, + "loss": 0.2022, + "step": 248050 + }, + { + "epoch": 10.27, + "grad_norm": 0.57421875, + "learning_rate": 0.0003687857461883229, + "loss": 0.184, + "step": 248060 + }, + { + "epoch": 10.28, + "grad_norm": 1.1796875, + "learning_rate": 0.0003687762032963694, + "loss": 0.1993, + "step": 248070 + }, + { + "epoch": 10.28, + "grad_norm": 0.70703125, + "learning_rate": 0.00036876666018089186, + "loss": 0.1951, + "step": 248080 + }, + { + "epoch": 10.28, + "grad_norm": 1.2109375, + "learning_rate": 0.0003687571168419083, + "loss": 0.1657, + "step": 248090 + }, + { + "epoch": 10.28, + "grad_norm": 0.486328125, + "learning_rate": 0.00036874757327943667, + "loss": 0.2237, + "step": 248100 + }, + { + "epoch": 10.28, + "grad_norm": 0.2060546875, + "learning_rate": 0.00036873802949349496, + "loss": 0.2209, + "step": 248110 + }, + { + "epoch": 10.28, + "grad_norm": 0.7421875, + "learning_rate": 0.0003687284854841011, + "loss": 0.1777, + "step": 248120 + }, + { + "epoch": 10.28, + "grad_norm": 0.72265625, + "learning_rate": 0.000368718941251273, + "loss": 0.1774, + "step": 248130 + }, + { + "epoch": 10.28, + "grad_norm": 1.359375, + "learning_rate": 0.00036870939679502874, + "loss": 0.2247, + "step": 248140 + }, + { + "epoch": 10.28, + "grad_norm": 1.5625, + "learning_rate": 0.00036869985211538623, + "loss": 0.239, + "step": 248150 + }, + { + "epoch": 10.28, + "grad_norm": 0.54296875, + "learning_rate": 0.0003686903072123633, + "loss": 0.1679, + "step": 248160 + }, + { + "epoch": 10.28, + "grad_norm": 0.69921875, + "learning_rate": 0.0003686807620859781, + "loss": 0.1841, + "step": 248170 + }, + { + "epoch": 10.28, + "grad_norm": 1.265625, + "learning_rate": 0.0003686712167362485, + "loss": 0.1896, + "step": 248180 + }, + { + "epoch": 10.28, + "grad_norm": 0.81640625, + "learning_rate": 0.00036866167116319257, + "loss": 0.2016, + "step": 248190 + }, + { + "epoch": 10.28, + "grad_norm": 1.3125, + "learning_rate": 0.0003686521253668281, + "loss": 0.2053, + "step": 248200 + }, + { + "epoch": 10.28, + "grad_norm": 1.0234375, + "learning_rate": 0.0003686425793471732, + "loss": 0.2285, + "step": 248210 + }, + { + "epoch": 10.28, + "grad_norm": 1.328125, + "learning_rate": 0.00036863303310424574, + "loss": 0.2114, + "step": 248220 + }, + { + "epoch": 10.28, + "grad_norm": 0.7578125, + "learning_rate": 0.0003686234866380638, + "loss": 0.248, + "step": 248230 + }, + { + "epoch": 10.28, + "grad_norm": 1.0234375, + "learning_rate": 0.0003686139399486451, + "loss": 0.2082, + "step": 248240 + }, + { + "epoch": 10.28, + "grad_norm": 0.90625, + "learning_rate": 0.0003686043930360079, + "loss": 0.1964, + "step": 248250 + }, + { + "epoch": 10.28, + "grad_norm": 0.73046875, + "learning_rate": 0.00036859484590017, + "loss": 0.2168, + "step": 248260 + }, + { + "epoch": 10.28, + "grad_norm": 0.439453125, + "learning_rate": 0.0003685852985411494, + "loss": 0.1714, + "step": 248270 + }, + { + "epoch": 10.28, + "grad_norm": 0.458984375, + "learning_rate": 0.0003685757509589641, + "loss": 0.205, + "step": 248280 + }, + { + "epoch": 10.28, + "grad_norm": 0.7265625, + "learning_rate": 0.0003685662031536321, + "loss": 0.1558, + "step": 248290 + }, + { + "epoch": 10.28, + "grad_norm": 0.6796875, + "learning_rate": 0.0003685566551251712, + "loss": 0.1771, + "step": 248300 + }, + { + "epoch": 10.28, + "grad_norm": 1.0859375, + "learning_rate": 0.0003685471068735996, + "loss": 0.1881, + "step": 248310 + }, + { + "epoch": 10.29, + "grad_norm": 0.64453125, + "learning_rate": 0.000368537558398935, + "loss": 0.1597, + "step": 248320 + }, + { + "epoch": 10.29, + "grad_norm": 0.58203125, + "learning_rate": 0.0003685280097011956, + "loss": 0.1865, + "step": 248330 + }, + { + "epoch": 10.29, + "grad_norm": 0.609375, + "learning_rate": 0.0003685184607803993, + "loss": 0.2171, + "step": 248340 + }, + { + "epoch": 10.29, + "grad_norm": 0.921875, + "learning_rate": 0.00036850891163656396, + "loss": 0.1791, + "step": 248350 + }, + { + "epoch": 10.29, + "grad_norm": 0.91796875, + "learning_rate": 0.0003684993622697078, + "loss": 0.2284, + "step": 248360 + }, + { + "epoch": 10.29, + "grad_norm": 0.80078125, + "learning_rate": 0.00036848981267984855, + "loss": 0.158, + "step": 248370 + }, + { + "epoch": 10.29, + "grad_norm": 0.458984375, + "learning_rate": 0.0003684802628670043, + "loss": 0.1734, + "step": 248380 + }, + { + "epoch": 10.29, + "grad_norm": 1.203125, + "learning_rate": 0.0003684707128311929, + "loss": 0.1935, + "step": 248390 + }, + { + "epoch": 10.29, + "grad_norm": 0.87109375, + "learning_rate": 0.00036846116257243245, + "loss": 0.1689, + "step": 248400 + }, + { + "epoch": 10.29, + "grad_norm": 0.9921875, + "learning_rate": 0.00036845161209074094, + "loss": 0.2464, + "step": 248410 + }, + { + "epoch": 10.29, + "grad_norm": 0.67578125, + "learning_rate": 0.0003684420613861362, + "loss": 0.2352, + "step": 248420 + }, + { + "epoch": 10.29, + "grad_norm": 1.2421875, + "learning_rate": 0.00036843251045863643, + "loss": 0.1995, + "step": 248430 + }, + { + "epoch": 10.29, + "grad_norm": 1.1796875, + "learning_rate": 0.0003684229593082594, + "loss": 0.2198, + "step": 248440 + }, + { + "epoch": 10.29, + "grad_norm": 0.8203125, + "learning_rate": 0.0003684134079350231, + "loss": 0.2388, + "step": 248450 + }, + { + "epoch": 10.29, + "grad_norm": 0.56640625, + "learning_rate": 0.00036840385633894556, + "loss": 0.1708, + "step": 248460 + }, + { + "epoch": 10.29, + "grad_norm": 1.078125, + "learning_rate": 0.00036839430452004484, + "loss": 0.1994, + "step": 248470 + }, + { + "epoch": 10.29, + "grad_norm": 0.5546875, + "learning_rate": 0.0003683847524783387, + "loss": 0.2112, + "step": 248480 + }, + { + "epoch": 10.29, + "grad_norm": 1.3984375, + "learning_rate": 0.0003683752002138453, + "loss": 0.2041, + "step": 248490 + }, + { + "epoch": 10.29, + "grad_norm": 1.4765625, + "learning_rate": 0.00036836564772658255, + "loss": 0.2191, + "step": 248500 + }, + { + "epoch": 10.29, + "grad_norm": 0.98828125, + "learning_rate": 0.0003683560950165684, + "loss": 0.1656, + "step": 248510 + }, + { + "epoch": 10.29, + "grad_norm": 0.68359375, + "learning_rate": 0.000368346542083821, + "loss": 0.1916, + "step": 248520 + }, + { + "epoch": 10.29, + "grad_norm": 1.515625, + "learning_rate": 0.00036833698892835796, + "loss": 0.2124, + "step": 248530 + }, + { + "epoch": 10.29, + "grad_norm": 0.345703125, + "learning_rate": 0.0003683274355501977, + "loss": 0.2135, + "step": 248540 + }, + { + "epoch": 10.29, + "grad_norm": 1.0078125, + "learning_rate": 0.00036831788194935795, + "loss": 0.2288, + "step": 248550 + }, + { + "epoch": 10.3, + "grad_norm": 0.84375, + "learning_rate": 0.0003683083281258566, + "loss": 0.1735, + "step": 248560 + }, + { + "epoch": 10.3, + "grad_norm": 0.361328125, + "learning_rate": 0.00036829877407971187, + "loss": 0.2082, + "step": 248570 + }, + { + "epoch": 10.3, + "grad_norm": 0.322265625, + "learning_rate": 0.0003682892198109415, + "loss": 0.1912, + "step": 248580 + }, + { + "epoch": 10.3, + "grad_norm": 0.3984375, + "learning_rate": 0.0003682796653195637, + "loss": 0.185, + "step": 248590 + }, + { + "epoch": 10.3, + "grad_norm": 0.59375, + "learning_rate": 0.0003682701106055963, + "loss": 0.1993, + "step": 248600 + }, + { + "epoch": 10.3, + "grad_norm": 0.2265625, + "learning_rate": 0.00036826055566905734, + "loss": 0.1982, + "step": 248610 + }, + { + "epoch": 10.3, + "grad_norm": 1.1015625, + "learning_rate": 0.0003682510005099648, + "loss": 0.2099, + "step": 248620 + }, + { + "epoch": 10.3, + "grad_norm": 0.703125, + "learning_rate": 0.00036824144512833664, + "loss": 0.2232, + "step": 248630 + }, + { + "epoch": 10.3, + "grad_norm": 0.6171875, + "learning_rate": 0.00036823188952419086, + "loss": 0.1825, + "step": 248640 + }, + { + "epoch": 10.3, + "grad_norm": 0.76953125, + "learning_rate": 0.0003682223336975454, + "loss": 0.1872, + "step": 248650 + }, + { + "epoch": 10.3, + "grad_norm": 1.03125, + "learning_rate": 0.00036821277764841824, + "loss": 0.187, + "step": 248660 + }, + { + "epoch": 10.3, + "grad_norm": 1.0390625, + "learning_rate": 0.0003682032213768275, + "loss": 0.1241, + "step": 248670 + }, + { + "epoch": 10.3, + "grad_norm": 0.890625, + "learning_rate": 0.00036819366488279097, + "loss": 0.1667, + "step": 248680 + }, + { + "epoch": 10.3, + "grad_norm": 1.5078125, + "learning_rate": 0.0003681841081663267, + "loss": 0.1955, + "step": 248690 + }, + { + "epoch": 10.3, + "grad_norm": 0.5234375, + "learning_rate": 0.0003681745512274528, + "loss": 0.1837, + "step": 248700 + }, + { + "epoch": 10.3, + "grad_norm": 0.6953125, + "learning_rate": 0.00036816499406618715, + "loss": 0.2029, + "step": 248710 + }, + { + "epoch": 10.3, + "grad_norm": 0.609375, + "learning_rate": 0.0003681554366825477, + "loss": 0.2227, + "step": 248720 + }, + { + "epoch": 10.3, + "grad_norm": 0.81640625, + "learning_rate": 0.0003681458790765525, + "loss": 0.1854, + "step": 248730 + }, + { + "epoch": 10.3, + "grad_norm": 1.03125, + "learning_rate": 0.0003681363212482195, + "loss": 0.2135, + "step": 248740 + }, + { + "epoch": 10.3, + "grad_norm": 0.490234375, + "learning_rate": 0.0003681267631975667, + "loss": 0.1858, + "step": 248750 + }, + { + "epoch": 10.3, + "grad_norm": 1.5, + "learning_rate": 0.00036811720492461216, + "loss": 0.1469, + "step": 248760 + }, + { + "epoch": 10.3, + "grad_norm": 0.515625, + "learning_rate": 0.0003681076464293737, + "loss": 0.2245, + "step": 248770 + }, + { + "epoch": 10.3, + "grad_norm": 2.296875, + "learning_rate": 0.0003680980877118694, + "loss": 0.1574, + "step": 248780 + }, + { + "epoch": 10.3, + "grad_norm": 0.91015625, + "learning_rate": 0.00036808852877211727, + "loss": 0.2012, + "step": 248790 + }, + { + "epoch": 10.31, + "grad_norm": 1.2109375, + "learning_rate": 0.00036807896961013533, + "loss": 0.214, + "step": 248800 + }, + { + "epoch": 10.31, + "grad_norm": 0.400390625, + "learning_rate": 0.0003680694102259415, + "loss": 0.2352, + "step": 248810 + }, + { + "epoch": 10.31, + "grad_norm": 0.427734375, + "learning_rate": 0.00036805985061955375, + "loss": 0.2136, + "step": 248820 + }, + { + "epoch": 10.31, + "grad_norm": 1.0703125, + "learning_rate": 0.0003680502907909901, + "loss": 0.1872, + "step": 248830 + }, + { + "epoch": 10.31, + "grad_norm": 0.6640625, + "learning_rate": 0.0003680407307402686, + "loss": 0.199, + "step": 248840 + }, + { + "epoch": 10.31, + "grad_norm": 0.326171875, + "learning_rate": 0.00036803117046740717, + "loss": 0.2035, + "step": 248850 + }, + { + "epoch": 10.31, + "grad_norm": 1.2421875, + "learning_rate": 0.00036802160997242387, + "loss": 0.1553, + "step": 248860 + }, + { + "epoch": 10.31, + "grad_norm": 1.046875, + "learning_rate": 0.0003680120492553366, + "loss": 0.1654, + "step": 248870 + }, + { + "epoch": 10.31, + "grad_norm": 0.5078125, + "learning_rate": 0.0003680024883161634, + "loss": 0.2114, + "step": 248880 + }, + { + "epoch": 10.31, + "grad_norm": 1.1796875, + "learning_rate": 0.00036799292715492226, + "loss": 0.1871, + "step": 248890 + }, + { + "epoch": 10.31, + "grad_norm": 0.76171875, + "learning_rate": 0.0003679833657716312, + "loss": 0.2291, + "step": 248900 + }, + { + "epoch": 10.31, + "grad_norm": 0.92578125, + "learning_rate": 0.00036797380416630815, + "loss": 0.2138, + "step": 248910 + }, + { + "epoch": 10.31, + "grad_norm": 0.9140625, + "learning_rate": 0.0003679642423389712, + "loss": 0.21, + "step": 248920 + }, + { + "epoch": 10.31, + "grad_norm": 1.359375, + "learning_rate": 0.0003679546802896382, + "loss": 0.2076, + "step": 248930 + }, + { + "epoch": 10.31, + "grad_norm": 0.6640625, + "learning_rate": 0.0003679451180183273, + "loss": 0.1587, + "step": 248940 + }, + { + "epoch": 10.31, + "grad_norm": 0.640625, + "learning_rate": 0.00036793555552505636, + "loss": 0.2292, + "step": 248950 + }, + { + "epoch": 10.31, + "grad_norm": 0.490234375, + "learning_rate": 0.0003679259928098435, + "loss": 0.1525, + "step": 248960 + }, + { + "epoch": 10.31, + "grad_norm": 0.2197265625, + "learning_rate": 0.00036791642987270664, + "loss": 0.1938, + "step": 248970 + }, + { + "epoch": 10.31, + "grad_norm": 0.96875, + "learning_rate": 0.0003679068667136638, + "loss": 0.2097, + "step": 248980 + }, + { + "epoch": 10.31, + "grad_norm": 0.44921875, + "learning_rate": 0.00036789730333273296, + "loss": 0.1778, + "step": 248990 + }, + { + "epoch": 10.31, + "grad_norm": 0.56640625, + "learning_rate": 0.00036788773972993205, + "loss": 0.1939, + "step": 249000 + }, + { + "epoch": 10.31, + "grad_norm": 0.6484375, + "learning_rate": 0.0003678781759052792, + "loss": 0.2015, + "step": 249010 + }, + { + "epoch": 10.31, + "grad_norm": 1.234375, + "learning_rate": 0.00036786861185879245, + "loss": 0.2232, + "step": 249020 + }, + { + "epoch": 10.31, + "grad_norm": 0.6171875, + "learning_rate": 0.0003678590475904896, + "loss": 0.1747, + "step": 249030 + }, + { + "epoch": 10.32, + "grad_norm": 0.388671875, + "learning_rate": 0.0003678494831003888, + "loss": 0.2121, + "step": 249040 + }, + { + "epoch": 10.32, + "grad_norm": 0.62109375, + "learning_rate": 0.00036783991838850794, + "loss": 0.2241, + "step": 249050 + }, + { + "epoch": 10.32, + "grad_norm": 0.765625, + "learning_rate": 0.00036783035345486513, + "loss": 0.1981, + "step": 249060 + }, + { + "epoch": 10.32, + "grad_norm": 0.58984375, + "learning_rate": 0.0003678207882994783, + "loss": 0.1896, + "step": 249070 + }, + { + "epoch": 10.32, + "grad_norm": 0.53515625, + "learning_rate": 0.00036781122292236547, + "loss": 0.1859, + "step": 249080 + }, + { + "epoch": 10.32, + "grad_norm": 0.408203125, + "learning_rate": 0.0003678016573235446, + "loss": 0.1269, + "step": 249090 + }, + { + "epoch": 10.32, + "grad_norm": 0.6796875, + "learning_rate": 0.0003677920915030338, + "loss": 0.1862, + "step": 249100 + }, + { + "epoch": 10.32, + "grad_norm": 0.53125, + "learning_rate": 0.00036778252546085103, + "loss": 0.1925, + "step": 249110 + }, + { + "epoch": 10.32, + "grad_norm": 0.8515625, + "learning_rate": 0.0003677729591970141, + "loss": 0.1745, + "step": 249120 + }, + { + "epoch": 10.32, + "grad_norm": 0.77734375, + "learning_rate": 0.00036776339271154136, + "loss": 0.2001, + "step": 249130 + }, + { + "epoch": 10.32, + "grad_norm": 0.87890625, + "learning_rate": 0.00036775382600445056, + "loss": 0.2404, + "step": 249140 + }, + { + "epoch": 10.32, + "grad_norm": 0.796875, + "learning_rate": 0.0003677442590757598, + "loss": 0.1983, + "step": 249150 + }, + { + "epoch": 10.32, + "grad_norm": 0.921875, + "learning_rate": 0.00036773469192548705, + "loss": 0.2043, + "step": 249160 + }, + { + "epoch": 10.32, + "grad_norm": 0.98828125, + "learning_rate": 0.0003677251245536503, + "loss": 0.2202, + "step": 249170 + }, + { + "epoch": 10.32, + "grad_norm": 0.96875, + "learning_rate": 0.0003677155569602675, + "loss": 0.245, + "step": 249180 + }, + { + "epoch": 10.32, + "grad_norm": 0.953125, + "learning_rate": 0.0003677059891453569, + "loss": 0.1935, + "step": 249190 + }, + { + "epoch": 10.32, + "grad_norm": 0.341796875, + "learning_rate": 0.00036769642110893633, + "loss": 0.1882, + "step": 249200 + }, + { + "epoch": 10.32, + "grad_norm": 0.80859375, + "learning_rate": 0.0003676868528510237, + "loss": 0.2395, + "step": 249210 + }, + { + "epoch": 10.32, + "grad_norm": 0.625, + "learning_rate": 0.00036767728437163714, + "loss": 0.2376, + "step": 249220 + }, + { + "epoch": 10.32, + "grad_norm": 1.421875, + "learning_rate": 0.0003676677156707947, + "loss": 0.1856, + "step": 249230 + }, + { + "epoch": 10.32, + "grad_norm": 0.365234375, + "learning_rate": 0.00036765814674851426, + "loss": 0.193, + "step": 249240 + }, + { + "epoch": 10.32, + "grad_norm": 0.75, + "learning_rate": 0.0003676485776048139, + "loss": 0.1908, + "step": 249250 + }, + { + "epoch": 10.32, + "grad_norm": 1.6796875, + "learning_rate": 0.00036763900823971166, + "loss": 0.2458, + "step": 249260 + }, + { + "epoch": 10.32, + "grad_norm": 0.80859375, + "learning_rate": 0.0003676294386532255, + "loss": 0.1879, + "step": 249270 + }, + { + "epoch": 10.33, + "grad_norm": 0.484375, + "learning_rate": 0.0003676198688453735, + "loss": 0.1615, + "step": 249280 + }, + { + "epoch": 10.33, + "grad_norm": 0.53515625, + "learning_rate": 0.00036761029881617345, + "loss": 0.1682, + "step": 249290 + }, + { + "epoch": 10.33, + "grad_norm": 1.046875, + "learning_rate": 0.0003676007285656437, + "loss": 0.2393, + "step": 249300 + }, + { + "epoch": 10.33, + "grad_norm": 0.3671875, + "learning_rate": 0.00036759115809380195, + "loss": 0.2432, + "step": 249310 + }, + { + "epoch": 10.33, + "grad_norm": 0.68359375, + "learning_rate": 0.0003675815874006664, + "loss": 0.1758, + "step": 249320 + }, + { + "epoch": 10.33, + "grad_norm": 0.890625, + "learning_rate": 0.00036757201648625494, + "loss": 0.2022, + "step": 249330 + }, + { + "epoch": 10.33, + "grad_norm": 0.81640625, + "learning_rate": 0.0003675624453505857, + "loss": 0.2008, + "step": 249340 + }, + { + "epoch": 10.33, + "grad_norm": 0.72265625, + "learning_rate": 0.00036755287399367663, + "loss": 0.1828, + "step": 249350 + }, + { + "epoch": 10.33, + "grad_norm": 0.71484375, + "learning_rate": 0.00036754330241554575, + "loss": 0.1756, + "step": 249360 + }, + { + "epoch": 10.33, + "grad_norm": 0.62890625, + "learning_rate": 0.000367533730616211, + "loss": 0.2098, + "step": 249370 + }, + { + "epoch": 10.33, + "grad_norm": 0.423828125, + "learning_rate": 0.00036752415859569055, + "loss": 0.1975, + "step": 249380 + }, + { + "epoch": 10.33, + "grad_norm": 0.21484375, + "learning_rate": 0.00036751458635400224, + "loss": 0.2345, + "step": 249390 + }, + { + "epoch": 10.33, + "grad_norm": 0.44140625, + "learning_rate": 0.0003675050138911643, + "loss": 0.1686, + "step": 249400 + }, + { + "epoch": 10.33, + "grad_norm": 0.142578125, + "learning_rate": 0.0003674954412071945, + "loss": 0.2263, + "step": 249410 + }, + { + "epoch": 10.33, + "grad_norm": 0.58984375, + "learning_rate": 0.00036748586830211095, + "loss": 0.1554, + "step": 249420 + }, + { + "epoch": 10.33, + "grad_norm": 0.35546875, + "learning_rate": 0.00036747629517593176, + "loss": 0.1572, + "step": 249430 + }, + { + "epoch": 10.33, + "grad_norm": 0.59375, + "learning_rate": 0.0003674667218286748, + "loss": 0.2125, + "step": 249440 + }, + { + "epoch": 10.33, + "grad_norm": 2.875, + "learning_rate": 0.0003674571482603582, + "loss": 0.2072, + "step": 249450 + }, + { + "epoch": 10.33, + "grad_norm": 0.51953125, + "learning_rate": 0.000367447574471, + "loss": 0.2054, + "step": 249460 + }, + { + "epoch": 10.33, + "grad_norm": 0.9140625, + "learning_rate": 0.00036743800046061797, + "loss": 0.2417, + "step": 249470 + }, + { + "epoch": 10.33, + "grad_norm": 0.75, + "learning_rate": 0.00036742842622923045, + "loss": 0.1691, + "step": 249480 + }, + { + "epoch": 10.33, + "grad_norm": 0.251953125, + "learning_rate": 0.0003674188517768553, + "loss": 0.1625, + "step": 249490 + }, + { + "epoch": 10.33, + "grad_norm": 0.71484375, + "learning_rate": 0.00036740927710351046, + "loss": 0.2016, + "step": 249500 + }, + { + "epoch": 10.33, + "grad_norm": 0.65234375, + "learning_rate": 0.00036739970220921414, + "loss": 0.2248, + "step": 249510 + }, + { + "epoch": 10.34, + "grad_norm": 1.28125, + "learning_rate": 0.00036739012709398425, + "loss": 0.2107, + "step": 249520 + }, + { + "epoch": 10.34, + "grad_norm": 0.52734375, + "learning_rate": 0.00036738055175783874, + "loss": 0.1664, + "step": 249530 + }, + { + "epoch": 10.34, + "grad_norm": 0.447265625, + "learning_rate": 0.0003673709762007958, + "loss": 0.1794, + "step": 249540 + }, + { + "epoch": 10.34, + "grad_norm": 1.3046875, + "learning_rate": 0.00036736140042287327, + "loss": 0.1909, + "step": 249550 + }, + { + "epoch": 10.34, + "grad_norm": 0.58203125, + "learning_rate": 0.0003673518244240893, + "loss": 0.2228, + "step": 249560 + }, + { + "epoch": 10.34, + "grad_norm": 0.66796875, + "learning_rate": 0.00036734224820446186, + "loss": 0.1787, + "step": 249570 + }, + { + "epoch": 10.34, + "grad_norm": 1.84375, + "learning_rate": 0.00036733267176400895, + "loss": 0.1806, + "step": 249580 + }, + { + "epoch": 10.34, + "grad_norm": 0.80859375, + "learning_rate": 0.0003673230951027488, + "loss": 0.2032, + "step": 249590 + }, + { + "epoch": 10.34, + "grad_norm": 0.99609375, + "learning_rate": 0.000367313518220699, + "loss": 0.1696, + "step": 249600 + }, + { + "epoch": 10.34, + "grad_norm": 0.216796875, + "learning_rate": 0.000367303941117878, + "loss": 0.1888, + "step": 249610 + }, + { + "epoch": 10.34, + "grad_norm": 0.296875, + "learning_rate": 0.00036729436379430365, + "loss": 0.2017, + "step": 249620 + }, + { + "epoch": 10.34, + "grad_norm": 1.1640625, + "learning_rate": 0.0003672847862499938, + "loss": 0.1684, + "step": 249630 + }, + { + "epoch": 10.34, + "grad_norm": 0.89453125, + "learning_rate": 0.0003672752084849669, + "loss": 0.1898, + "step": 249640 + }, + { + "epoch": 10.34, + "grad_norm": 0.58203125, + "learning_rate": 0.0003672656304992406, + "loss": 0.1857, + "step": 249650 + }, + { + "epoch": 10.34, + "grad_norm": 0.42578125, + "learning_rate": 0.000367256052292833, + "loss": 0.1443, + "step": 249660 + }, + { + "epoch": 10.34, + "grad_norm": 1.0625, + "learning_rate": 0.0003672464738657622, + "loss": 0.2023, + "step": 249670 + }, + { + "epoch": 10.34, + "grad_norm": 0.63671875, + "learning_rate": 0.00036723689521804613, + "loss": 0.1998, + "step": 249680 + }, + { + "epoch": 10.34, + "grad_norm": 0.5625, + "learning_rate": 0.000367227316349703, + "loss": 0.1715, + "step": 249690 + }, + { + "epoch": 10.34, + "grad_norm": 0.53515625, + "learning_rate": 0.00036721773726075075, + "loss": 0.1932, + "step": 249700 + }, + { + "epoch": 10.34, + "grad_norm": 0.7109375, + "learning_rate": 0.00036720815795120723, + "loss": 0.1939, + "step": 249710 + }, + { + "epoch": 10.34, + "grad_norm": 0.47265625, + "learning_rate": 0.0003671985784210907, + "loss": 0.1999, + "step": 249720 + }, + { + "epoch": 10.34, + "grad_norm": 0.46484375, + "learning_rate": 0.0003671889986704191, + "loss": 0.1788, + "step": 249730 + }, + { + "epoch": 10.34, + "grad_norm": 0.7265625, + "learning_rate": 0.0003671794186992105, + "loss": 0.1664, + "step": 249740 + }, + { + "epoch": 10.34, + "grad_norm": 1.1796875, + "learning_rate": 0.0003671698385074829, + "loss": 0.2081, + "step": 249750 + }, + { + "epoch": 10.35, + "grad_norm": 0.890625, + "learning_rate": 0.00036716025809525415, + "loss": 0.1778, + "step": 249760 + }, + { + "epoch": 10.35, + "grad_norm": 1.3828125, + "learning_rate": 0.0003671506774625426, + "loss": 0.1596, + "step": 249770 + }, + { + "epoch": 10.35, + "grad_norm": 0.6328125, + "learning_rate": 0.0003671410966093661, + "loss": 0.1602, + "step": 249780 + }, + { + "epoch": 10.35, + "grad_norm": 1.0234375, + "learning_rate": 0.00036713151553574265, + "loss": 0.1689, + "step": 249790 + }, + { + "epoch": 10.35, + "grad_norm": 0.52734375, + "learning_rate": 0.0003671219342416904, + "loss": 0.2366, + "step": 249800 + }, + { + "epoch": 10.35, + "grad_norm": 1.5546875, + "learning_rate": 0.0003671123527272273, + "loss": 0.1971, + "step": 249810 + }, + { + "epoch": 10.35, + "grad_norm": 0.59765625, + "learning_rate": 0.00036710277099237136, + "loss": 0.2146, + "step": 249820 + }, + { + "epoch": 10.35, + "grad_norm": 0.498046875, + "learning_rate": 0.00036709318903714074, + "loss": 0.1188, + "step": 249830 + }, + { + "epoch": 10.35, + "grad_norm": 0.5, + "learning_rate": 0.0003670836068615533, + "loss": 0.1863, + "step": 249840 + }, + { + "epoch": 10.35, + "grad_norm": 0.7734375, + "learning_rate": 0.00036707402446562717, + "loss": 0.1731, + "step": 249850 + }, + { + "epoch": 10.35, + "grad_norm": 0.55078125, + "learning_rate": 0.0003670644418493804, + "loss": 0.1773, + "step": 249860 + }, + { + "epoch": 10.35, + "grad_norm": 1.0703125, + "learning_rate": 0.00036705485901283093, + "loss": 0.239, + "step": 249870 + }, + { + "epoch": 10.35, + "grad_norm": 0.91015625, + "learning_rate": 0.00036704527595599697, + "loss": 0.2279, + "step": 249880 + }, + { + "epoch": 10.35, + "grad_norm": 0.96875, + "learning_rate": 0.0003670356926788964, + "loss": 0.1539, + "step": 249890 + }, + { + "epoch": 10.35, + "grad_norm": 0.75390625, + "learning_rate": 0.0003670261091815472, + "loss": 0.1963, + "step": 249900 + }, + { + "epoch": 10.35, + "grad_norm": 0.328125, + "learning_rate": 0.0003670165254639676, + "loss": 0.16, + "step": 249910 + }, + { + "epoch": 10.35, + "grad_norm": 0.81640625, + "learning_rate": 0.0003670069415261754, + "loss": 0.1993, + "step": 249920 + }, + { + "epoch": 10.35, + "grad_norm": 1.1875, + "learning_rate": 0.00036699735736818895, + "loss": 0.1644, + "step": 249930 + }, + { + "epoch": 10.35, + "grad_norm": 0.8984375, + "learning_rate": 0.000366987772990026, + "loss": 0.2142, + "step": 249940 + }, + { + "epoch": 10.35, + "grad_norm": 0.734375, + "learning_rate": 0.0003669781883917047, + "loss": 0.2021, + "step": 249950 + }, + { + "epoch": 10.35, + "grad_norm": 0.80859375, + "learning_rate": 0.0003669686035732432, + "loss": 0.1736, + "step": 249960 + }, + { + "epoch": 10.35, + "grad_norm": 0.5234375, + "learning_rate": 0.0003669590185346593, + "loss": 0.2195, + "step": 249970 + }, + { + "epoch": 10.35, + "grad_norm": 0.51953125, + "learning_rate": 0.0003669494332759712, + "loss": 0.2418, + "step": 249980 + }, + { + "epoch": 10.35, + "grad_norm": 1.0, + "learning_rate": 0.0003669398477971969, + "loss": 0.1965, + "step": 249990 + }, + { + "epoch": 10.35, + "grad_norm": 0.95703125, + "learning_rate": 0.0003669302620983543, + "loss": 0.2012, + "step": 250000 + }, + { + "epoch": 10.36, + "grad_norm": 0.482421875, + "learning_rate": 0.0003669206761794618, + "loss": 0.2038, + "step": 250010 + }, + { + "epoch": 10.36, + "grad_norm": 0.51953125, + "learning_rate": 0.0003669110900405371, + "loss": 0.1826, + "step": 250020 + }, + { + "epoch": 10.36, + "grad_norm": 0.419921875, + "learning_rate": 0.0003669015036815983, + "loss": 0.2106, + "step": 250030 + }, + { + "epoch": 10.36, + "grad_norm": 0.435546875, + "learning_rate": 0.00036689191710266355, + "loss": 0.1985, + "step": 250040 + }, + { + "epoch": 10.36, + "grad_norm": 0.60546875, + "learning_rate": 0.00036688233030375084, + "loss": 0.2057, + "step": 250050 + }, + { + "epoch": 10.36, + "grad_norm": 0.58203125, + "learning_rate": 0.0003668727432848782, + "loss": 0.1583, + "step": 250060 + }, + { + "epoch": 10.36, + "grad_norm": 0.7265625, + "learning_rate": 0.00036686315604606375, + "loss": 0.1179, + "step": 250070 + }, + { + "epoch": 10.36, + "grad_norm": 0.796875, + "learning_rate": 0.0003668535685873253, + "loss": 0.213, + "step": 250080 + }, + { + "epoch": 10.36, + "grad_norm": 0.734375, + "learning_rate": 0.0003668439809086812, + "loss": 0.148, + "step": 250090 + }, + { + "epoch": 10.36, + "grad_norm": 0.58203125, + "learning_rate": 0.0003668343930101493, + "loss": 0.1706, + "step": 250100 + }, + { + "epoch": 10.36, + "grad_norm": 0.52734375, + "learning_rate": 0.0003668248048917476, + "loss": 0.2123, + "step": 250110 + }, + { + "epoch": 10.36, + "grad_norm": 0.85546875, + "learning_rate": 0.0003668152165534944, + "loss": 0.1988, + "step": 250120 + }, + { + "epoch": 10.36, + "grad_norm": 1.4140625, + "learning_rate": 0.0003668056279954075, + "loss": 0.2372, + "step": 250130 + }, + { + "epoch": 10.36, + "grad_norm": 0.77734375, + "learning_rate": 0.00036679603921750497, + "loss": 0.2039, + "step": 250140 + }, + { + "epoch": 10.36, + "grad_norm": 1.109375, + "learning_rate": 0.0003667864502198049, + "loss": 0.1986, + "step": 250150 + }, + { + "epoch": 10.36, + "grad_norm": 0.90625, + "learning_rate": 0.00036677686100232545, + "loss": 0.191, + "step": 250160 + }, + { + "epoch": 10.36, + "grad_norm": 0.56640625, + "learning_rate": 0.00036676727156508454, + "loss": 0.2661, + "step": 250170 + }, + { + "epoch": 10.36, + "grad_norm": 1.1171875, + "learning_rate": 0.0003667576819081002, + "loss": 0.1973, + "step": 250180 + }, + { + "epoch": 10.36, + "grad_norm": 1.046875, + "learning_rate": 0.0003667480920313905, + "loss": 0.2273, + "step": 250190 + }, + { + "epoch": 10.36, + "grad_norm": 0.376953125, + "learning_rate": 0.00036673850193497354, + "loss": 0.1507, + "step": 250200 + }, + { + "epoch": 10.36, + "grad_norm": 0.306640625, + "learning_rate": 0.00036672891161886723, + "loss": 0.1526, + "step": 250210 + }, + { + "epoch": 10.36, + "grad_norm": 0.6875, + "learning_rate": 0.0003667193210830898, + "loss": 0.1979, + "step": 250220 + }, + { + "epoch": 10.36, + "grad_norm": 1.1171875, + "learning_rate": 0.0003667097303276592, + "loss": 0.1657, + "step": 250230 + }, + { + "epoch": 10.36, + "grad_norm": 1.046875, + "learning_rate": 0.00036670013935259345, + "loss": 0.2044, + "step": 250240 + }, + { + "epoch": 10.37, + "grad_norm": 0.46875, + "learning_rate": 0.0003666905481579107, + "loss": 0.1864, + "step": 250250 + }, + { + "epoch": 10.37, + "grad_norm": 1.1328125, + "learning_rate": 0.0003666809567436289, + "loss": 0.217, + "step": 250260 + }, + { + "epoch": 10.37, + "grad_norm": 0.376953125, + "learning_rate": 0.00036667136510976615, + "loss": 0.191, + "step": 250270 + }, + { + "epoch": 10.37, + "grad_norm": 0.78515625, + "learning_rate": 0.00036666177325634053, + "loss": 0.2022, + "step": 250280 + }, + { + "epoch": 10.37, + "grad_norm": 0.84375, + "learning_rate": 0.00036665218118337, + "loss": 0.1953, + "step": 250290 + }, + { + "epoch": 10.37, + "grad_norm": 2.453125, + "learning_rate": 0.0003666425888908727, + "loss": 0.1947, + "step": 250300 + }, + { + "epoch": 10.37, + "grad_norm": 0.83203125, + "learning_rate": 0.00036663299637886655, + "loss": 0.2438, + "step": 250310 + }, + { + "epoch": 10.37, + "grad_norm": 0.60546875, + "learning_rate": 0.0003666234036473698, + "loss": 0.2689, + "step": 250320 + }, + { + "epoch": 10.37, + "grad_norm": 1.0625, + "learning_rate": 0.00036661381069640037, + "loss": 0.2285, + "step": 250330 + }, + { + "epoch": 10.37, + "grad_norm": 0.361328125, + "learning_rate": 0.00036660421752597626, + "loss": 0.2229, + "step": 250340 + }, + { + "epoch": 10.37, + "grad_norm": 1.3125, + "learning_rate": 0.0003665946241361157, + "loss": 0.2377, + "step": 250350 + }, + { + "epoch": 10.37, + "grad_norm": 0.578125, + "learning_rate": 0.00036658503052683665, + "loss": 0.2119, + "step": 250360 + }, + { + "epoch": 10.37, + "grad_norm": 0.86328125, + "learning_rate": 0.0003665754366981572, + "loss": 0.2042, + "step": 250370 + }, + { + "epoch": 10.37, + "grad_norm": 0.82421875, + "learning_rate": 0.0003665658426500953, + "loss": 0.2112, + "step": 250380 + }, + { + "epoch": 10.37, + "grad_norm": 0.73828125, + "learning_rate": 0.00036655624838266907, + "loss": 0.2072, + "step": 250390 + }, + { + "epoch": 10.37, + "grad_norm": 0.62109375, + "learning_rate": 0.0003665466538958965, + "loss": 0.142, + "step": 250400 + }, + { + "epoch": 10.37, + "grad_norm": 0.703125, + "learning_rate": 0.00036653705918979586, + "loss": 0.2455, + "step": 250410 + }, + { + "epoch": 10.37, + "grad_norm": 0.46484375, + "learning_rate": 0.000366527464264385, + "loss": 0.1949, + "step": 250420 + }, + { + "epoch": 10.37, + "grad_norm": 0.484375, + "learning_rate": 0.000366517869119682, + "loss": 0.199, + "step": 250430 + }, + { + "epoch": 10.37, + "grad_norm": 0.78515625, + "learning_rate": 0.00036650827375570494, + "loss": 0.1711, + "step": 250440 + }, + { + "epoch": 10.37, + "grad_norm": 0.91015625, + "learning_rate": 0.00036649867817247194, + "loss": 0.1869, + "step": 250450 + }, + { + "epoch": 10.37, + "grad_norm": 1.953125, + "learning_rate": 0.000366489082370001, + "loss": 0.1704, + "step": 250460 + }, + { + "epoch": 10.37, + "grad_norm": 1.203125, + "learning_rate": 0.00036647948634831016, + "loss": 0.1745, + "step": 250470 + }, + { + "epoch": 10.37, + "grad_norm": 1.203125, + "learning_rate": 0.00036646989010741753, + "loss": 0.1791, + "step": 250480 + }, + { + "epoch": 10.38, + "grad_norm": 0.8359375, + "learning_rate": 0.00036646029364734113, + "loss": 0.1985, + "step": 250490 + }, + { + "epoch": 10.38, + "grad_norm": 0.9453125, + "learning_rate": 0.0003664506969680991, + "loss": 0.2019, + "step": 250500 + }, + { + "epoch": 10.38, + "grad_norm": 1.0859375, + "learning_rate": 0.00036644110006970937, + "loss": 0.2019, + "step": 250510 + }, + { + "epoch": 10.38, + "grad_norm": 0.703125, + "learning_rate": 0.00036643150295219006, + "loss": 0.21, + "step": 250520 + }, + { + "epoch": 10.38, + "grad_norm": 1.1328125, + "learning_rate": 0.0003664219056155593, + "loss": 0.2057, + "step": 250530 + }, + { + "epoch": 10.38, + "grad_norm": 1.5390625, + "learning_rate": 0.00036641230805983506, + "loss": 0.2471, + "step": 250540 + }, + { + "epoch": 10.38, + "grad_norm": 1.1875, + "learning_rate": 0.00036640271028503537, + "loss": 0.1603, + "step": 250550 + }, + { + "epoch": 10.38, + "grad_norm": 1.859375, + "learning_rate": 0.0003663931122911784, + "loss": 0.2086, + "step": 250560 + }, + { + "epoch": 10.38, + "grad_norm": 0.72265625, + "learning_rate": 0.0003663835140782821, + "loss": 0.206, + "step": 250570 + }, + { + "epoch": 10.38, + "grad_norm": 1.6640625, + "learning_rate": 0.0003663739156463647, + "loss": 0.2105, + "step": 250580 + }, + { + "epoch": 10.38, + "grad_norm": 0.84375, + "learning_rate": 0.0003663643169954441, + "loss": 0.1208, + "step": 250590 + }, + { + "epoch": 10.38, + "grad_norm": 0.376953125, + "learning_rate": 0.0003663547181255384, + "loss": 0.1851, + "step": 250600 + }, + { + "epoch": 10.38, + "grad_norm": 0.95703125, + "learning_rate": 0.0003663451190366657, + "loss": 0.2153, + "step": 250610 + }, + { + "epoch": 10.38, + "grad_norm": 0.5625, + "learning_rate": 0.000366335519728844, + "loss": 0.1896, + "step": 250620 + }, + { + "epoch": 10.38, + "grad_norm": 1.2109375, + "learning_rate": 0.00036632592020209153, + "loss": 0.191, + "step": 250630 + }, + { + "epoch": 10.38, + "grad_norm": 0.51171875, + "learning_rate": 0.00036631632045642615, + "loss": 0.1944, + "step": 250640 + }, + { + "epoch": 10.38, + "grad_norm": 0.734375, + "learning_rate": 0.0003663067204918661, + "loss": 0.2348, + "step": 250650 + }, + { + "epoch": 10.38, + "grad_norm": 0.7265625, + "learning_rate": 0.0003662971203084293, + "loss": 0.1528, + "step": 250660 + }, + { + "epoch": 10.38, + "grad_norm": 1.5234375, + "learning_rate": 0.0003662875199061339, + "loss": 0.1882, + "step": 250670 + }, + { + "epoch": 10.38, + "grad_norm": 0.41796875, + "learning_rate": 0.00036627791928499793, + "loss": 0.188, + "step": 250680 + }, + { + "epoch": 10.38, + "grad_norm": 0.78515625, + "learning_rate": 0.00036626831844503943, + "loss": 0.1854, + "step": 250690 + }, + { + "epoch": 10.38, + "grad_norm": 0.3515625, + "learning_rate": 0.0003662587173862766, + "loss": 0.2261, + "step": 250700 + }, + { + "epoch": 10.38, + "grad_norm": 0.275390625, + "learning_rate": 0.00036624911610872737, + "loss": 0.1652, + "step": 250710 + }, + { + "epoch": 10.38, + "grad_norm": 3.453125, + "learning_rate": 0.0003662395146124099, + "loss": 0.2694, + "step": 250720 + }, + { + "epoch": 10.39, + "grad_norm": 0.62109375, + "learning_rate": 0.0003662299128973421, + "loss": 0.1484, + "step": 250730 + }, + { + "epoch": 10.39, + "grad_norm": 0.59375, + "learning_rate": 0.0003662203109635423, + "loss": 0.2195, + "step": 250740 + }, + { + "epoch": 10.39, + "grad_norm": 0.69921875, + "learning_rate": 0.0003662107088110284, + "loss": 0.2088, + "step": 250750 + }, + { + "epoch": 10.39, + "grad_norm": 1.015625, + "learning_rate": 0.0003662011064398184, + "loss": 0.1431, + "step": 250760 + }, + { + "epoch": 10.39, + "grad_norm": 0.6640625, + "learning_rate": 0.0003661915038499305, + "loss": 0.2218, + "step": 250770 + }, + { + "epoch": 10.39, + "grad_norm": 0.380859375, + "learning_rate": 0.00036618190104138276, + "loss": 0.1489, + "step": 250780 + }, + { + "epoch": 10.39, + "grad_norm": 0.375, + "learning_rate": 0.0003661722980141933, + "loss": 0.1938, + "step": 250790 + }, + { + "epoch": 10.39, + "grad_norm": 1.6796875, + "learning_rate": 0.00036616269476838004, + "loss": 0.1494, + "step": 250800 + }, + { + "epoch": 10.39, + "grad_norm": 1.4765625, + "learning_rate": 0.0003661530913039611, + "loss": 0.1965, + "step": 250810 + }, + { + "epoch": 10.39, + "grad_norm": 2.515625, + "learning_rate": 0.00036614348762095466, + "loss": 0.2215, + "step": 250820 + }, + { + "epoch": 10.39, + "grad_norm": 1.25, + "learning_rate": 0.0003661338837193787, + "loss": 0.1179, + "step": 250830 + }, + { + "epoch": 10.39, + "grad_norm": 0.0, + "learning_rate": 0.00036612427959925126, + "loss": 0.1833, + "step": 250840 + }, + { + "epoch": 10.39, + "grad_norm": 0.81640625, + "learning_rate": 0.0003661146752605905, + "loss": 0.2224, + "step": 250850 + }, + { + "epoch": 10.39, + "grad_norm": 0.5625, + "learning_rate": 0.00036610507070341446, + "loss": 0.1669, + "step": 250860 + }, + { + "epoch": 10.39, + "grad_norm": 0.5390625, + "learning_rate": 0.00036609546592774125, + "loss": 0.2053, + "step": 250870 + }, + { + "epoch": 10.39, + "grad_norm": 0.490234375, + "learning_rate": 0.00036608586093358884, + "loss": 0.2221, + "step": 250880 + }, + { + "epoch": 10.39, + "grad_norm": 0.15625, + "learning_rate": 0.0003660762557209755, + "loss": 0.1431, + "step": 250890 + }, + { + "epoch": 10.39, + "grad_norm": 1.9765625, + "learning_rate": 0.0003660666502899191, + "loss": 0.1708, + "step": 250900 + }, + { + "epoch": 10.39, + "grad_norm": 2.46875, + "learning_rate": 0.0003660570446404377, + "loss": 0.2015, + "step": 250910 + }, + { + "epoch": 10.39, + "grad_norm": 1.2734375, + "learning_rate": 0.00036604743877254964, + "loss": 0.199, + "step": 250920 + }, + { + "epoch": 10.39, + "grad_norm": 1.296875, + "learning_rate": 0.0003660378326862728, + "loss": 0.2197, + "step": 250930 + }, + { + "epoch": 10.39, + "grad_norm": 2.0, + "learning_rate": 0.0003660282263816252, + "loss": 0.191, + "step": 250940 + }, + { + "epoch": 10.39, + "grad_norm": 0.55078125, + "learning_rate": 0.0003660186198586251, + "loss": 0.1827, + "step": 250950 + }, + { + "epoch": 10.39, + "grad_norm": 0.515625, + "learning_rate": 0.00036600901311729044, + "loss": 0.1751, + "step": 250960 + }, + { + "epoch": 10.4, + "grad_norm": 0.75390625, + "learning_rate": 0.0003659994061576393, + "loss": 0.253, + "step": 250970 + }, + { + "epoch": 10.4, + "grad_norm": 0.84765625, + "learning_rate": 0.0003659897989796899, + "loss": 0.2317, + "step": 250980 + }, + { + "epoch": 10.4, + "grad_norm": 0.68359375, + "learning_rate": 0.0003659801915834602, + "loss": 0.194, + "step": 250990 + }, + { + "epoch": 10.4, + "grad_norm": 0.70703125, + "learning_rate": 0.0003659705839689683, + "loss": 0.1828, + "step": 251000 + }, + { + "epoch": 10.4, + "grad_norm": 0.7734375, + "learning_rate": 0.00036596097613623226, + "loss": 0.16, + "step": 251010 + }, + { + "epoch": 10.4, + "grad_norm": 0.96875, + "learning_rate": 0.0003659513680852702, + "loss": 0.1975, + "step": 251020 + }, + { + "epoch": 10.4, + "grad_norm": 0.75, + "learning_rate": 0.0003659417598161002, + "loss": 0.2029, + "step": 251030 + }, + { + "epoch": 10.4, + "grad_norm": 1.25, + "learning_rate": 0.0003659321513287403, + "loss": 0.1425, + "step": 251040 + }, + { + "epoch": 10.4, + "grad_norm": 0.33984375, + "learning_rate": 0.0003659225426232086, + "loss": 0.187, + "step": 251050 + }, + { + "epoch": 10.4, + "grad_norm": 0.353515625, + "learning_rate": 0.00036591293369952327, + "loss": 0.2291, + "step": 251060 + }, + { + "epoch": 10.4, + "grad_norm": 1.0078125, + "learning_rate": 0.0003659033245577022, + "loss": 0.1375, + "step": 251070 + }, + { + "epoch": 10.4, + "grad_norm": 0.78515625, + "learning_rate": 0.0003658937151977637, + "loss": 0.1502, + "step": 251080 + }, + { + "epoch": 10.4, + "grad_norm": 0.67578125, + "learning_rate": 0.00036588410561972566, + "loss": 0.1486, + "step": 251090 + }, + { + "epoch": 10.4, + "grad_norm": 1.421875, + "learning_rate": 0.0003658744958236063, + "loss": 0.2039, + "step": 251100 + }, + { + "epoch": 10.4, + "grad_norm": 1.3203125, + "learning_rate": 0.0003658648858094237, + "loss": 0.1694, + "step": 251110 + }, + { + "epoch": 10.4, + "grad_norm": 0.38671875, + "learning_rate": 0.0003658552755771957, + "loss": 0.1933, + "step": 251120 + }, + { + "epoch": 10.4, + "grad_norm": 0.83203125, + "learning_rate": 0.0003658456651269408, + "loss": 0.2828, + "step": 251130 + }, + { + "epoch": 10.4, + "grad_norm": 1.4375, + "learning_rate": 0.00036583605445867685, + "loss": 0.2034, + "step": 251140 + }, + { + "epoch": 10.4, + "grad_norm": 1.1171875, + "learning_rate": 0.00036582644357242177, + "loss": 0.2193, + "step": 251150 + }, + { + "epoch": 10.4, + "grad_norm": 0.349609375, + "learning_rate": 0.000365816832468194, + "loss": 0.1368, + "step": 251160 + }, + { + "epoch": 10.4, + "grad_norm": 0.4765625, + "learning_rate": 0.0003658072211460114, + "loss": 0.2013, + "step": 251170 + }, + { + "epoch": 10.4, + "grad_norm": 0.2236328125, + "learning_rate": 0.00036579760960589214, + "loss": 0.2164, + "step": 251180 + }, + { + "epoch": 10.4, + "grad_norm": 0.6171875, + "learning_rate": 0.0003657879978478543, + "loss": 0.2202, + "step": 251190 + }, + { + "epoch": 10.4, + "grad_norm": 1.1015625, + "learning_rate": 0.0003657783858719159, + "loss": 0.1775, + "step": 251200 + }, + { + "epoch": 10.41, + "grad_norm": 0.8515625, + "learning_rate": 0.0003657687736780951, + "loss": 0.1941, + "step": 251210 + }, + { + "epoch": 10.41, + "grad_norm": 0.734375, + "learning_rate": 0.00036575916126641, + "loss": 0.2041, + "step": 251220 + }, + { + "epoch": 10.41, + "grad_norm": 1.0234375, + "learning_rate": 0.0003657495486368786, + "loss": 0.2287, + "step": 251230 + }, + { + "epoch": 10.41, + "grad_norm": 0.60546875, + "learning_rate": 0.0003657399357895191, + "loss": 0.2088, + "step": 251240 + }, + { + "epoch": 10.41, + "grad_norm": 0.76171875, + "learning_rate": 0.00036573032272434945, + "loss": 0.1492, + "step": 251250 + }, + { + "epoch": 10.41, + "grad_norm": 1.0078125, + "learning_rate": 0.0003657207094413879, + "loss": 0.1832, + "step": 251260 + }, + { + "epoch": 10.41, + "grad_norm": 0.87109375, + "learning_rate": 0.00036571109594065257, + "loss": 0.1795, + "step": 251270 + }, + { + "epoch": 10.41, + "grad_norm": 0.546875, + "learning_rate": 0.00036570148222216125, + "loss": 0.191, + "step": 251280 + }, + { + "epoch": 10.41, + "grad_norm": 0.68359375, + "learning_rate": 0.00036569186828593243, + "loss": 0.1982, + "step": 251290 + }, + { + "epoch": 10.41, + "grad_norm": 0.92578125, + "learning_rate": 0.0003656822541319839, + "loss": 0.197, + "step": 251300 + }, + { + "epoch": 10.41, + "grad_norm": 1.8203125, + "learning_rate": 0.00036567263976033384, + "loss": 0.2386, + "step": 251310 + }, + { + "epoch": 10.41, + "grad_norm": 0.9375, + "learning_rate": 0.0003656630251710005, + "loss": 0.2239, + "step": 251320 + }, + { + "epoch": 10.41, + "grad_norm": 0.60546875, + "learning_rate": 0.0003656534103640017, + "loss": 0.2058, + "step": 251330 + }, + { + "epoch": 10.41, + "grad_norm": 0.82421875, + "learning_rate": 0.00036564379533935565, + "loss": 0.1833, + "step": 251340 + }, + { + "epoch": 10.41, + "grad_norm": 0.5625, + "learning_rate": 0.00036563418009708063, + "loss": 0.2033, + "step": 251350 + }, + { + "epoch": 10.41, + "grad_norm": 0.58984375, + "learning_rate": 0.0003656245646371944, + "loss": 0.2243, + "step": 251360 + }, + { + "epoch": 10.41, + "grad_norm": 1.28125, + "learning_rate": 0.00036561494895971533, + "loss": 0.1863, + "step": 251370 + }, + { + "epoch": 10.41, + "grad_norm": 0.81640625, + "learning_rate": 0.0003656053330646614, + "loss": 0.1604, + "step": 251380 + }, + { + "epoch": 10.41, + "grad_norm": 1.5078125, + "learning_rate": 0.00036559571695205065, + "loss": 0.1709, + "step": 251390 + }, + { + "epoch": 10.41, + "grad_norm": 0.85546875, + "learning_rate": 0.00036558610062190127, + "loss": 0.2105, + "step": 251400 + }, + { + "epoch": 10.41, + "grad_norm": 1.296875, + "learning_rate": 0.0003655764840742313, + "loss": 0.2034, + "step": 251410 + }, + { + "epoch": 10.41, + "grad_norm": 0.8671875, + "learning_rate": 0.000365566867309059, + "loss": 0.1854, + "step": 251420 + }, + { + "epoch": 10.41, + "grad_norm": 1.3046875, + "learning_rate": 0.00036555725032640226, + "loss": 0.1952, + "step": 251430 + }, + { + "epoch": 10.41, + "grad_norm": 0.60546875, + "learning_rate": 0.00036554763312627913, + "loss": 0.2021, + "step": 251440 + }, + { + "epoch": 10.42, + "grad_norm": 0.59765625, + "learning_rate": 0.00036553801570870804, + "loss": 0.179, + "step": 251450 + }, + { + "epoch": 10.42, + "grad_norm": 0.5859375, + "learning_rate": 0.00036552839807370676, + "loss": 0.2208, + "step": 251460 + }, + { + "epoch": 10.42, + "grad_norm": 0.96484375, + "learning_rate": 0.00036551878022129356, + "loss": 0.1807, + "step": 251470 + }, + { + "epoch": 10.42, + "grad_norm": 1.3984375, + "learning_rate": 0.00036550916215148644, + "loss": 0.1876, + "step": 251480 + }, + { + "epoch": 10.42, + "grad_norm": 1.1640625, + "learning_rate": 0.0003654995438643036, + "loss": 0.1709, + "step": 251490 + }, + { + "epoch": 10.42, + "grad_norm": 0.921875, + "learning_rate": 0.00036548992535976303, + "loss": 0.2167, + "step": 251500 + }, + { + "epoch": 10.42, + "grad_norm": 1.203125, + "learning_rate": 0.000365480306637883, + "loss": 0.205, + "step": 251510 + }, + { + "epoch": 10.42, + "grad_norm": 1.03125, + "learning_rate": 0.0003654706876986814, + "loss": 0.1951, + "step": 251520 + }, + { + "epoch": 10.42, + "grad_norm": 0.67578125, + "learning_rate": 0.00036546106854217647, + "loss": 0.1804, + "step": 251530 + }, + { + "epoch": 10.42, + "grad_norm": 1.890625, + "learning_rate": 0.0003654514491683863, + "loss": 0.2156, + "step": 251540 + }, + { + "epoch": 10.42, + "grad_norm": 0.6171875, + "learning_rate": 0.0003654418295773289, + "loss": 0.2361, + "step": 251550 + }, + { + "epoch": 10.42, + "grad_norm": 0.5625, + "learning_rate": 0.0003654322097690226, + "loss": 0.2104, + "step": 251560 + }, + { + "epoch": 10.42, + "grad_norm": 0.59765625, + "learning_rate": 0.0003654225897434852, + "loss": 0.2008, + "step": 251570 + }, + { + "epoch": 10.42, + "grad_norm": 0.71875, + "learning_rate": 0.000365412969500735, + "loss": 0.218, + "step": 251580 + }, + { + "epoch": 10.42, + "grad_norm": 0.29296875, + "learning_rate": 0.00036540334904079007, + "loss": 0.215, + "step": 251590 + }, + { + "epoch": 10.42, + "grad_norm": 0.37109375, + "learning_rate": 0.0003653937283636684, + "loss": 0.2549, + "step": 251600 + }, + { + "epoch": 10.42, + "grad_norm": 0.890625, + "learning_rate": 0.0003653841074693883, + "loss": 0.189, + "step": 251610 + }, + { + "epoch": 10.42, + "grad_norm": 0.73828125, + "learning_rate": 0.00036537448635796775, + "loss": 0.1813, + "step": 251620 + }, + { + "epoch": 10.42, + "grad_norm": 0.41015625, + "learning_rate": 0.00036536486502942485, + "loss": 0.1729, + "step": 251630 + }, + { + "epoch": 10.42, + "grad_norm": 0.75, + "learning_rate": 0.0003653552434837778, + "loss": 0.1767, + "step": 251640 + }, + { + "epoch": 10.42, + "grad_norm": 0.58203125, + "learning_rate": 0.0003653456217210446, + "loss": 0.2394, + "step": 251650 + }, + { + "epoch": 10.42, + "grad_norm": 0.19921875, + "learning_rate": 0.0003653359997412433, + "loss": 0.139, + "step": 251660 + }, + { + "epoch": 10.42, + "grad_norm": 0.0, + "learning_rate": 0.00036532637754439225, + "loss": 0.1845, + "step": 251670 + }, + { + "epoch": 10.42, + "grad_norm": 0.6640625, + "learning_rate": 0.0003653167551305093, + "loss": 0.2111, + "step": 251680 + }, + { + "epoch": 10.42, + "grad_norm": 1.015625, + "learning_rate": 0.0003653071324996128, + "loss": 0.2199, + "step": 251690 + }, + { + "epoch": 10.43, + "grad_norm": 0.671875, + "learning_rate": 0.0003652975096517206, + "loss": 0.1665, + "step": 251700 + }, + { + "epoch": 10.43, + "grad_norm": 1.015625, + "learning_rate": 0.000365287886586851, + "loss": 0.199, + "step": 251710 + }, + { + "epoch": 10.43, + "grad_norm": 0.75390625, + "learning_rate": 0.00036527826330502207, + "loss": 0.2334, + "step": 251720 + }, + { + "epoch": 10.43, + "grad_norm": 0.5703125, + "learning_rate": 0.0003652686398062518, + "loss": 0.1754, + "step": 251730 + }, + { + "epoch": 10.43, + "grad_norm": 0.64453125, + "learning_rate": 0.00036525901609055846, + "loss": 0.225, + "step": 251740 + }, + { + "epoch": 10.43, + "grad_norm": 0.953125, + "learning_rate": 0.00036524939215796013, + "loss": 0.1997, + "step": 251750 + }, + { + "epoch": 10.43, + "grad_norm": 3.0, + "learning_rate": 0.0003652397680084748, + "loss": 0.2244, + "step": 251760 + }, + { + "epoch": 10.43, + "grad_norm": 0.86328125, + "learning_rate": 0.0003652301436421208, + "loss": 0.2283, + "step": 251770 + }, + { + "epoch": 10.43, + "grad_norm": 1.4296875, + "learning_rate": 0.000365220519058916, + "loss": 0.1747, + "step": 251780 + }, + { + "epoch": 10.43, + "grad_norm": 2.21875, + "learning_rate": 0.0003652108942588787, + "loss": 0.2236, + "step": 251790 + }, + { + "epoch": 10.43, + "grad_norm": 0.58203125, + "learning_rate": 0.0003652012692420269, + "loss": 0.1365, + "step": 251800 + }, + { + "epoch": 10.43, + "grad_norm": 1.078125, + "learning_rate": 0.0003651916440083788, + "loss": 0.1819, + "step": 251810 + }, + { + "epoch": 10.43, + "grad_norm": 2.546875, + "learning_rate": 0.0003651820185579524, + "loss": 0.2191, + "step": 251820 + }, + { + "epoch": 10.43, + "grad_norm": 0.8125, + "learning_rate": 0.00036517239289076585, + "loss": 0.2075, + "step": 251830 + }, + { + "epoch": 10.43, + "grad_norm": 0.60546875, + "learning_rate": 0.00036516276700683737, + "loss": 0.2445, + "step": 251840 + }, + { + "epoch": 10.43, + "grad_norm": 0.94140625, + "learning_rate": 0.000365153140906185, + "loss": 0.2043, + "step": 251850 + }, + { + "epoch": 10.43, + "grad_norm": 0.8984375, + "learning_rate": 0.00036514351458882686, + "loss": 0.2391, + "step": 251860 + }, + { + "epoch": 10.43, + "grad_norm": 0.486328125, + "learning_rate": 0.000365133888054781, + "loss": 0.1951, + "step": 251870 + }, + { + "epoch": 10.43, + "grad_norm": 0.64453125, + "learning_rate": 0.0003651242613040656, + "loss": 0.1949, + "step": 251880 + }, + { + "epoch": 10.43, + "grad_norm": 0.74609375, + "learning_rate": 0.0003651146343366988, + "loss": 0.2097, + "step": 251890 + }, + { + "epoch": 10.43, + "grad_norm": 1.171875, + "learning_rate": 0.00036510500715269874, + "loss": 0.1774, + "step": 251900 + }, + { + "epoch": 10.43, + "grad_norm": 0.71875, + "learning_rate": 0.0003650953797520834, + "loss": 0.1751, + "step": 251910 + }, + { + "epoch": 10.43, + "grad_norm": 0.69140625, + "learning_rate": 0.000365085752134871, + "loss": 0.182, + "step": 251920 + }, + { + "epoch": 10.43, + "grad_norm": 1.3828125, + "learning_rate": 0.0003650761243010797, + "loss": 0.1927, + "step": 251930 + }, + { + "epoch": 10.44, + "grad_norm": 0.8359375, + "learning_rate": 0.0003650664962507275, + "loss": 0.1981, + "step": 251940 + }, + { + "epoch": 10.44, + "grad_norm": 0.62109375, + "learning_rate": 0.00036505686798383265, + "loss": 0.1909, + "step": 251950 + }, + { + "epoch": 10.44, + "grad_norm": 1.3828125, + "learning_rate": 0.0003650472395004131, + "loss": 0.2098, + "step": 251960 + }, + { + "epoch": 10.44, + "grad_norm": 1.0078125, + "learning_rate": 0.0003650376108004872, + "loss": 0.2173, + "step": 251970 + }, + { + "epoch": 10.44, + "grad_norm": 0.828125, + "learning_rate": 0.00036502798188407283, + "loss": 0.194, + "step": 251980 + }, + { + "epoch": 10.44, + "grad_norm": 0.9609375, + "learning_rate": 0.0003650183527511882, + "loss": 0.2559, + "step": 251990 + }, + { + "epoch": 10.44, + "grad_norm": 0.0, + "learning_rate": 0.00036500872340185155, + "loss": 0.1704, + "step": 252000 + }, + { + "epoch": 10.44, + "grad_norm": 1.75, + "learning_rate": 0.0003649990938360808, + "loss": 0.2076, + "step": 252010 + }, + { + "epoch": 10.44, + "grad_norm": 0.94140625, + "learning_rate": 0.00036498946405389435, + "loss": 0.2251, + "step": 252020 + }, + { + "epoch": 10.44, + "grad_norm": 0.4765625, + "learning_rate": 0.00036497983405531, + "loss": 0.2249, + "step": 252030 + }, + { + "epoch": 10.44, + "grad_norm": 1.921875, + "learning_rate": 0.000364970203840346, + "loss": 0.1707, + "step": 252040 + }, + { + "epoch": 10.44, + "grad_norm": 0.46875, + "learning_rate": 0.00036496057340902057, + "loss": 0.1813, + "step": 252050 + }, + { + "epoch": 10.44, + "grad_norm": 0.4140625, + "learning_rate": 0.0003649509427613517, + "loss": 0.1804, + "step": 252060 + }, + { + "epoch": 10.44, + "grad_norm": 0.76171875, + "learning_rate": 0.0003649413118973576, + "loss": 0.1678, + "step": 252070 + }, + { + "epoch": 10.44, + "grad_norm": 0.578125, + "learning_rate": 0.00036493168081705636, + "loss": 0.1918, + "step": 252080 + }, + { + "epoch": 10.44, + "grad_norm": 0.353515625, + "learning_rate": 0.00036492204952046607, + "loss": 0.2453, + "step": 252090 + }, + { + "epoch": 10.44, + "grad_norm": 0.68359375, + "learning_rate": 0.000364912418007605, + "loss": 0.2289, + "step": 252100 + }, + { + "epoch": 10.44, + "grad_norm": 0.65234375, + "learning_rate": 0.00036490278627849106, + "loss": 0.1985, + "step": 252110 + }, + { + "epoch": 10.44, + "grad_norm": 1.578125, + "learning_rate": 0.00036489315433314254, + "loss": 0.2075, + "step": 252120 + }, + { + "epoch": 10.44, + "grad_norm": 1.5703125, + "learning_rate": 0.0003648835221715775, + "loss": 0.1693, + "step": 252130 + }, + { + "epoch": 10.44, + "grad_norm": 1.7734375, + "learning_rate": 0.0003648738897938141, + "loss": 0.2553, + "step": 252140 + }, + { + "epoch": 10.44, + "grad_norm": 0.64453125, + "learning_rate": 0.00036486425719987035, + "loss": 0.1956, + "step": 252150 + }, + { + "epoch": 10.44, + "grad_norm": 1.2421875, + "learning_rate": 0.00036485462438976457, + "loss": 0.2814, + "step": 252160 + }, + { + "epoch": 10.44, + "grad_norm": 1.0625, + "learning_rate": 0.00036484499136351474, + "loss": 0.1532, + "step": 252170 + }, + { + "epoch": 10.45, + "grad_norm": 0.56640625, + "learning_rate": 0.0003648353581211391, + "loss": 0.2507, + "step": 252180 + }, + { + "epoch": 10.45, + "grad_norm": 1.3359375, + "learning_rate": 0.0003648257246626556, + "loss": 0.2205, + "step": 252190 + }, + { + "epoch": 10.45, + "grad_norm": 0.61328125, + "learning_rate": 0.0003648160909880826, + "loss": 0.1852, + "step": 252200 + }, + { + "epoch": 10.45, + "grad_norm": 0.7890625, + "learning_rate": 0.0003648064570974381, + "loss": 0.1953, + "step": 252210 + }, + { + "epoch": 10.45, + "grad_norm": 0.3359375, + "learning_rate": 0.0003647968229907402, + "loss": 0.2004, + "step": 252220 + }, + { + "epoch": 10.45, + "grad_norm": 0.7578125, + "learning_rate": 0.0003647871886680071, + "loss": 0.1809, + "step": 252230 + }, + { + "epoch": 10.45, + "grad_norm": 0.7421875, + "learning_rate": 0.0003647775541292569, + "loss": 0.1938, + "step": 252240 + }, + { + "epoch": 10.45, + "grad_norm": 0.4140625, + "learning_rate": 0.00036476791937450773, + "loss": 0.1692, + "step": 252250 + }, + { + "epoch": 10.45, + "grad_norm": 1.390625, + "learning_rate": 0.0003647582844037778, + "loss": 0.1781, + "step": 252260 + }, + { + "epoch": 10.45, + "grad_norm": 0.392578125, + "learning_rate": 0.00036474864921708506, + "loss": 0.2244, + "step": 252270 + }, + { + "epoch": 10.45, + "grad_norm": 0.57421875, + "learning_rate": 0.00036473901381444784, + "loss": 0.2067, + "step": 252280 + }, + { + "epoch": 10.45, + "grad_norm": 0.5546875, + "learning_rate": 0.00036472937819588417, + "loss": 0.2221, + "step": 252290 + }, + { + "epoch": 10.45, + "grad_norm": 1.1875, + "learning_rate": 0.00036471974236141215, + "loss": 0.1781, + "step": 252300 + }, + { + "epoch": 10.45, + "grad_norm": 0.609375, + "learning_rate": 0.00036471010631105, + "loss": 0.1765, + "step": 252310 + }, + { + "epoch": 10.45, + "grad_norm": 1.5078125, + "learning_rate": 0.0003647004700448158, + "loss": 0.2304, + "step": 252320 + }, + { + "epoch": 10.45, + "grad_norm": 0.7890625, + "learning_rate": 0.0003646908335627277, + "loss": 0.1893, + "step": 252330 + }, + { + "epoch": 10.45, + "grad_norm": 0.7421875, + "learning_rate": 0.0003646811968648039, + "loss": 0.1863, + "step": 252340 + }, + { + "epoch": 10.45, + "grad_norm": 0.50390625, + "learning_rate": 0.0003646715599510624, + "loss": 0.2033, + "step": 252350 + }, + { + "epoch": 10.45, + "grad_norm": 0.6015625, + "learning_rate": 0.00036466192282152147, + "loss": 0.1803, + "step": 252360 + }, + { + "epoch": 10.45, + "grad_norm": 1.1328125, + "learning_rate": 0.00036465228547619914, + "loss": 0.222, + "step": 252370 + }, + { + "epoch": 10.45, + "grad_norm": 0.66796875, + "learning_rate": 0.0003646426479151136, + "loss": 0.189, + "step": 252380 + }, + { + "epoch": 10.45, + "grad_norm": 0.85546875, + "learning_rate": 0.00036463301013828296, + "loss": 0.2191, + "step": 252390 + }, + { + "epoch": 10.45, + "grad_norm": 0.7109375, + "learning_rate": 0.0003646233721457254, + "loss": 0.1991, + "step": 252400 + }, + { + "epoch": 10.45, + "grad_norm": 0.8671875, + "learning_rate": 0.000364613733937459, + "loss": 0.1665, + "step": 252410 + }, + { + "epoch": 10.46, + "grad_norm": 0.66796875, + "learning_rate": 0.000364604095513502, + "loss": 0.2012, + "step": 252420 + }, + { + "epoch": 10.46, + "grad_norm": 0.83984375, + "learning_rate": 0.0003645944568738724, + "loss": 0.1808, + "step": 252430 + }, + { + "epoch": 10.46, + "grad_norm": 0.63671875, + "learning_rate": 0.00036458481801858846, + "loss": 0.2274, + "step": 252440 + }, + { + "epoch": 10.46, + "grad_norm": 0.546875, + "learning_rate": 0.00036457517894766816, + "loss": 0.1896, + "step": 252450 + }, + { + "epoch": 10.46, + "grad_norm": 0.40234375, + "learning_rate": 0.0003645655396611298, + "loss": 0.2186, + "step": 252460 + }, + { + "epoch": 10.46, + "grad_norm": 0.94140625, + "learning_rate": 0.00036455590015899153, + "loss": 0.2247, + "step": 252470 + }, + { + "epoch": 10.46, + "grad_norm": 0.87890625, + "learning_rate": 0.00036454626044127135, + "loss": 0.2088, + "step": 252480 + }, + { + "epoch": 10.46, + "grad_norm": 0.66015625, + "learning_rate": 0.00036453662050798754, + "loss": 0.2064, + "step": 252490 + }, + { + "epoch": 10.46, + "grad_norm": 0.37890625, + "learning_rate": 0.0003645269803591582, + "loss": 0.2244, + "step": 252500 + }, + { + "epoch": 10.46, + "grad_norm": 1.0234375, + "learning_rate": 0.0003645173399948013, + "loss": 0.1879, + "step": 252510 + }, + { + "epoch": 10.46, + "grad_norm": 0.8984375, + "learning_rate": 0.0003645076994149353, + "loss": 0.1914, + "step": 252520 + }, + { + "epoch": 10.46, + "grad_norm": 0.671875, + "learning_rate": 0.0003644980586195781, + "loss": 0.2014, + "step": 252530 + }, + { + "epoch": 10.46, + "grad_norm": 1.8671875, + "learning_rate": 0.0003644884176087479, + "loss": 0.1746, + "step": 252540 + }, + { + "epoch": 10.46, + "grad_norm": 0.396484375, + "learning_rate": 0.000364478776382463, + "loss": 0.2474, + "step": 252550 + }, + { + "epoch": 10.46, + "grad_norm": 0.7109375, + "learning_rate": 0.00036446913494074117, + "loss": 0.1915, + "step": 252560 + }, + { + "epoch": 10.46, + "grad_norm": 0.53515625, + "learning_rate": 0.00036445949328360095, + "loss": 0.2217, + "step": 252570 + }, + { + "epoch": 10.46, + "grad_norm": 0.2412109375, + "learning_rate": 0.00036444985141106034, + "loss": 0.1765, + "step": 252580 + }, + { + "epoch": 10.46, + "grad_norm": 0.53515625, + "learning_rate": 0.0003644402093231374, + "loss": 0.1833, + "step": 252590 + }, + { + "epoch": 10.46, + "grad_norm": 0.3125, + "learning_rate": 0.00036443056701985035, + "loss": 0.193, + "step": 252600 + }, + { + "epoch": 10.46, + "grad_norm": 0.6875, + "learning_rate": 0.00036442092450121735, + "loss": 0.1668, + "step": 252610 + }, + { + "epoch": 10.46, + "grad_norm": 0.703125, + "learning_rate": 0.0003644112817672565, + "loss": 0.198, + "step": 252620 + }, + { + "epoch": 10.46, + "grad_norm": 0.62109375, + "learning_rate": 0.000364401638817986, + "loss": 0.2103, + "step": 252630 + }, + { + "epoch": 10.46, + "grad_norm": 2.71875, + "learning_rate": 0.00036439199565342395, + "loss": 0.1979, + "step": 252640 + }, + { + "epoch": 10.46, + "grad_norm": 0.9296875, + "learning_rate": 0.0003643823522735885, + "loss": 0.2515, + "step": 252650 + }, + { + "epoch": 10.47, + "grad_norm": 0.78125, + "learning_rate": 0.0003643727086784979, + "loss": 0.1711, + "step": 252660 + }, + { + "epoch": 10.47, + "grad_norm": 0.87109375, + "learning_rate": 0.0003643630648681701, + "loss": 0.2317, + "step": 252670 + }, + { + "epoch": 10.47, + "grad_norm": 1.6015625, + "learning_rate": 0.0003643534208426235, + "loss": 0.1947, + "step": 252680 + }, + { + "epoch": 10.47, + "grad_norm": 0.56640625, + "learning_rate": 0.000364343776601876, + "loss": 0.1823, + "step": 252690 + }, + { + "epoch": 10.47, + "grad_norm": 0.72265625, + "learning_rate": 0.00036433413214594584, + "loss": 0.1988, + "step": 252700 + }, + { + "epoch": 10.47, + "grad_norm": 0.416015625, + "learning_rate": 0.0003643244874748513, + "loss": 0.1394, + "step": 252710 + }, + { + "epoch": 10.47, + "grad_norm": 0.51953125, + "learning_rate": 0.00036431484258861034, + "loss": 0.1583, + "step": 252720 + }, + { + "epoch": 10.47, + "grad_norm": 0.388671875, + "learning_rate": 0.00036430519748724123, + "loss": 0.2223, + "step": 252730 + }, + { + "epoch": 10.47, + "grad_norm": 1.7734375, + "learning_rate": 0.0003642955521707621, + "loss": 0.2206, + "step": 252740 + }, + { + "epoch": 10.47, + "grad_norm": 1.1015625, + "learning_rate": 0.00036428590663919097, + "loss": 0.1767, + "step": 252750 + }, + { + "epoch": 10.47, + "grad_norm": 1.40625, + "learning_rate": 0.00036427626089254625, + "loss": 0.2165, + "step": 252760 + }, + { + "epoch": 10.47, + "grad_norm": 0.50390625, + "learning_rate": 0.00036426661493084585, + "loss": 0.1831, + "step": 252770 + }, + { + "epoch": 10.47, + "grad_norm": 0.427734375, + "learning_rate": 0.00036425696875410805, + "loss": 0.1759, + "step": 252780 + }, + { + "epoch": 10.47, + "grad_norm": 1.15625, + "learning_rate": 0.000364247322362351, + "loss": 0.1555, + "step": 252790 + }, + { + "epoch": 10.47, + "grad_norm": 0.6953125, + "learning_rate": 0.00036423767575559276, + "loss": 0.1848, + "step": 252800 + }, + { + "epoch": 10.47, + "grad_norm": 0.46484375, + "learning_rate": 0.00036422802893385165, + "loss": 0.1539, + "step": 252810 + }, + { + "epoch": 10.47, + "grad_norm": 0.640625, + "learning_rate": 0.00036421838189714566, + "loss": 0.1577, + "step": 252820 + }, + { + "epoch": 10.47, + "grad_norm": 0.302734375, + "learning_rate": 0.00036420873464549296, + "loss": 0.2089, + "step": 252830 + }, + { + "epoch": 10.47, + "grad_norm": 1.34375, + "learning_rate": 0.0003641990871789119, + "loss": 0.1965, + "step": 252840 + }, + { + "epoch": 10.47, + "grad_norm": 0.80859375, + "learning_rate": 0.00036418943949742034, + "loss": 0.1937, + "step": 252850 + }, + { + "epoch": 10.47, + "grad_norm": 0.439453125, + "learning_rate": 0.00036417979160103663, + "loss": 0.1778, + "step": 252860 + }, + { + "epoch": 10.47, + "grad_norm": 1.1171875, + "learning_rate": 0.00036417014348977896, + "loss": 0.1986, + "step": 252870 + }, + { + "epoch": 10.47, + "grad_norm": 0.765625, + "learning_rate": 0.00036416049516366526, + "loss": 0.2413, + "step": 252880 + }, + { + "epoch": 10.47, + "grad_norm": 0.99609375, + "learning_rate": 0.00036415084662271403, + "loss": 0.1874, + "step": 252890 + }, + { + "epoch": 10.48, + "grad_norm": 0.53515625, + "learning_rate": 0.00036414119786694316, + "loss": 0.2144, + "step": 252900 + }, + { + "epoch": 10.48, + "grad_norm": 1.046875, + "learning_rate": 0.0003641315488963708, + "loss": 0.1881, + "step": 252910 + }, + { + "epoch": 10.48, + "grad_norm": 0.91015625, + "learning_rate": 0.0003641218997110153, + "loss": 0.2054, + "step": 252920 + }, + { + "epoch": 10.48, + "grad_norm": 0.6796875, + "learning_rate": 0.0003641122503108946, + "loss": 0.24, + "step": 252930 + }, + { + "epoch": 10.48, + "grad_norm": 0.94140625, + "learning_rate": 0.00036410260069602704, + "loss": 0.1792, + "step": 252940 + }, + { + "epoch": 10.48, + "grad_norm": 2.234375, + "learning_rate": 0.0003640929508664307, + "loss": 0.2583, + "step": 252950 + }, + { + "epoch": 10.48, + "grad_norm": 2.28125, + "learning_rate": 0.0003640833008221237, + "loss": 0.1762, + "step": 252960 + }, + { + "epoch": 10.48, + "grad_norm": 1.1796875, + "learning_rate": 0.0003640736505631243, + "loss": 0.1503, + "step": 252970 + }, + { + "epoch": 10.48, + "grad_norm": 1.0, + "learning_rate": 0.0003640640000894506, + "loss": 0.2368, + "step": 252980 + }, + { + "epoch": 10.48, + "grad_norm": 0.6875, + "learning_rate": 0.00036405434940112077, + "loss": 0.1859, + "step": 252990 + }, + { + "epoch": 10.48, + "grad_norm": 1.0859375, + "learning_rate": 0.000364044698498153, + "loss": 0.251, + "step": 253000 + }, + { + "epoch": 10.48, + "grad_norm": 0.91015625, + "learning_rate": 0.00036403504738056537, + "loss": 0.2043, + "step": 253010 + }, + { + "epoch": 10.48, + "grad_norm": 0.64453125, + "learning_rate": 0.0003640253960483761, + "loss": 0.2224, + "step": 253020 + }, + { + "epoch": 10.48, + "grad_norm": 0.546875, + "learning_rate": 0.00036401574450160336, + "loss": 0.1891, + "step": 253030 + }, + { + "epoch": 10.48, + "grad_norm": 0.8125, + "learning_rate": 0.0003640060927402652, + "loss": 0.2247, + "step": 253040 + }, + { + "epoch": 10.48, + "grad_norm": 1.171875, + "learning_rate": 0.0003639964407643801, + "loss": 0.1748, + "step": 253050 + }, + { + "epoch": 10.48, + "grad_norm": 0.73828125, + "learning_rate": 0.00036398678857396585, + "loss": 0.2256, + "step": 253060 + }, + { + "epoch": 10.48, + "grad_norm": 0.82421875, + "learning_rate": 0.0003639771361690408, + "loss": 0.2196, + "step": 253070 + }, + { + "epoch": 10.48, + "grad_norm": 0.66015625, + "learning_rate": 0.0003639674835496232, + "loss": 0.1815, + "step": 253080 + }, + { + "epoch": 10.48, + "grad_norm": 1.4375, + "learning_rate": 0.00036395783071573094, + "loss": 0.2118, + "step": 253090 + }, + { + "epoch": 10.48, + "grad_norm": 0.5625, + "learning_rate": 0.00036394817766738244, + "loss": 0.1363, + "step": 253100 + }, + { + "epoch": 10.48, + "grad_norm": 1.0078125, + "learning_rate": 0.00036393852440459577, + "loss": 0.175, + "step": 253110 + }, + { + "epoch": 10.48, + "grad_norm": 2.046875, + "learning_rate": 0.00036392887092738903, + "loss": 0.1944, + "step": 253120 + }, + { + "epoch": 10.48, + "grad_norm": 0.828125, + "learning_rate": 0.00036391921723578056, + "loss": 0.1714, + "step": 253130 + }, + { + "epoch": 10.49, + "grad_norm": 1.09375, + "learning_rate": 0.00036390956332978835, + "loss": 0.1882, + "step": 253140 + }, + { + "epoch": 10.49, + "grad_norm": 1.3359375, + "learning_rate": 0.00036389990920943066, + "loss": 0.1898, + "step": 253150 + }, + { + "epoch": 10.49, + "grad_norm": 0.7734375, + "learning_rate": 0.0003638902548747257, + "loss": 0.2211, + "step": 253160 + }, + { + "epoch": 10.49, + "grad_norm": 0.703125, + "learning_rate": 0.0003638806003256915, + "loss": 0.2279, + "step": 253170 + }, + { + "epoch": 10.49, + "grad_norm": 0.6484375, + "learning_rate": 0.00036387094556234633, + "loss": 0.185, + "step": 253180 + }, + { + "epoch": 10.49, + "grad_norm": 0.45703125, + "learning_rate": 0.0003638612905847084, + "loss": 0.2044, + "step": 253190 + }, + { + "epoch": 10.49, + "grad_norm": 0.94921875, + "learning_rate": 0.0003638516353927958, + "loss": 0.2263, + "step": 253200 + }, + { + "epoch": 10.49, + "grad_norm": 0.54296875, + "learning_rate": 0.0003638419799866267, + "loss": 0.2096, + "step": 253210 + }, + { + "epoch": 10.49, + "grad_norm": 0.66796875, + "learning_rate": 0.00036383232436621924, + "loss": 0.2143, + "step": 253220 + }, + { + "epoch": 10.49, + "grad_norm": 0.27734375, + "learning_rate": 0.0003638226685315916, + "loss": 0.1672, + "step": 253230 + }, + { + "epoch": 10.49, + "grad_norm": 0.55078125, + "learning_rate": 0.0003638130124827621, + "loss": 0.201, + "step": 253240 + }, + { + "epoch": 10.49, + "grad_norm": 0.84765625, + "learning_rate": 0.0003638033562197488, + "loss": 0.192, + "step": 253250 + }, + { + "epoch": 10.49, + "grad_norm": 0.7421875, + "learning_rate": 0.00036379369974256983, + "loss": 0.154, + "step": 253260 + }, + { + "epoch": 10.49, + "grad_norm": 0.458984375, + "learning_rate": 0.00036378404305124337, + "loss": 0.2134, + "step": 253270 + }, + { + "epoch": 10.49, + "grad_norm": 0.7265625, + "learning_rate": 0.0003637743861457877, + "loss": 0.2011, + "step": 253280 + }, + { + "epoch": 10.49, + "grad_norm": 1.3984375, + "learning_rate": 0.0003637647290262209, + "loss": 0.2311, + "step": 253290 + }, + { + "epoch": 10.49, + "grad_norm": 0.40234375, + "learning_rate": 0.0003637550716925611, + "loss": 0.2124, + "step": 253300 + }, + { + "epoch": 10.49, + "grad_norm": 0.9375, + "learning_rate": 0.00036374541414482657, + "loss": 0.2109, + "step": 253310 + }, + { + "epoch": 10.49, + "grad_norm": 0.6484375, + "learning_rate": 0.0003637357563830356, + "loss": 0.2374, + "step": 253320 + }, + { + "epoch": 10.49, + "grad_norm": 0.8828125, + "learning_rate": 0.00036372609840720603, + "loss": 0.2096, + "step": 253330 + }, + { + "epoch": 10.49, + "grad_norm": 0.95703125, + "learning_rate": 0.00036371644021735626, + "loss": 0.2468, + "step": 253340 + }, + { + "epoch": 10.49, + "grad_norm": 0.458984375, + "learning_rate": 0.0003637067818135045, + "loss": 0.1351, + "step": 253350 + }, + { + "epoch": 10.49, + "grad_norm": 0.498046875, + "learning_rate": 0.0003636971231956688, + "loss": 0.2081, + "step": 253360 + }, + { + "epoch": 10.49, + "grad_norm": 0.54296875, + "learning_rate": 0.00036368746436386747, + "loss": 0.1717, + "step": 253370 + }, + { + "epoch": 10.49, + "grad_norm": 1.0390625, + "learning_rate": 0.00036367780531811853, + "loss": 0.2242, + "step": 253380 + }, + { + "epoch": 10.5, + "grad_norm": 1.2578125, + "learning_rate": 0.00036366814605844023, + "loss": 0.1726, + "step": 253390 + }, + { + "epoch": 10.5, + "grad_norm": 0.26171875, + "learning_rate": 0.0003636584865848508, + "loss": 0.1554, + "step": 253400 + }, + { + "epoch": 10.5, + "grad_norm": 0.59765625, + "learning_rate": 0.00036364882689736834, + "loss": 0.251, + "step": 253410 + }, + { + "epoch": 10.5, + "grad_norm": 1.2109375, + "learning_rate": 0.00036363916699601106, + "loss": 0.1713, + "step": 253420 + }, + { + "epoch": 10.5, + "grad_norm": 0.625, + "learning_rate": 0.0003636295068807972, + "loss": 0.1772, + "step": 253430 + }, + { + "epoch": 10.5, + "grad_norm": 1.15625, + "learning_rate": 0.00036361984655174475, + "loss": 0.2056, + "step": 253440 + }, + { + "epoch": 10.5, + "grad_norm": 1.0546875, + "learning_rate": 0.0003636101860088722, + "loss": 0.1716, + "step": 253450 + }, + { + "epoch": 10.5, + "grad_norm": 0.73828125, + "learning_rate": 0.00036360052525219737, + "loss": 0.2075, + "step": 253460 + }, + { + "epoch": 10.5, + "grad_norm": 0.5859375, + "learning_rate": 0.0003635908642817387, + "loss": 0.2088, + "step": 253470 + }, + { + "epoch": 10.5, + "grad_norm": 0.734375, + "learning_rate": 0.0003635812030975143, + "loss": 0.1617, + "step": 253480 + }, + { + "epoch": 10.5, + "grad_norm": 0.349609375, + "learning_rate": 0.00036357154169954235, + "loss": 0.1114, + "step": 253490 + }, + { + "epoch": 10.5, + "grad_norm": 0.33984375, + "learning_rate": 0.00036356188008784097, + "loss": 0.1954, + "step": 253500 + }, + { + "epoch": 10.5, + "grad_norm": 0.68359375, + "learning_rate": 0.0003635522182624285, + "loss": 0.2369, + "step": 253510 + }, + { + "epoch": 10.5, + "grad_norm": 1.359375, + "learning_rate": 0.0003635425562233229, + "loss": 0.1763, + "step": 253520 + }, + { + "epoch": 10.5, + "grad_norm": 0.54296875, + "learning_rate": 0.00036353289397054246, + "loss": 0.2229, + "step": 253530 + }, + { + "epoch": 10.5, + "grad_norm": 1.0546875, + "learning_rate": 0.00036352323150410547, + "loss": 0.2139, + "step": 253540 + }, + { + "epoch": 10.5, + "grad_norm": 1.0546875, + "learning_rate": 0.00036351356882402997, + "loss": 0.1832, + "step": 253550 + }, + { + "epoch": 10.5, + "grad_norm": 1.59375, + "learning_rate": 0.0003635039059303342, + "loss": 0.1716, + "step": 253560 + }, + { + "epoch": 10.5, + "grad_norm": 0.79296875, + "learning_rate": 0.0003634942428230364, + "loss": 0.2184, + "step": 253570 + }, + { + "epoch": 10.5, + "grad_norm": 0.263671875, + "learning_rate": 0.0003634845795021546, + "loss": 0.2056, + "step": 253580 + }, + { + "epoch": 10.5, + "grad_norm": 1.203125, + "learning_rate": 0.0003634749159677071, + "loss": 0.1802, + "step": 253590 + }, + { + "epoch": 10.5, + "grad_norm": 0.87109375, + "learning_rate": 0.00036346525221971207, + "loss": 0.2728, + "step": 253600 + }, + { + "epoch": 10.5, + "grad_norm": 0.048583984375, + "learning_rate": 0.00036345558825818765, + "loss": 0.2032, + "step": 253610 + }, + { + "epoch": 10.5, + "grad_norm": 0.9921875, + "learning_rate": 0.0003634459240831521, + "loss": 0.2065, + "step": 253620 + }, + { + "epoch": 10.51, + "grad_norm": 0.63671875, + "learning_rate": 0.0003634362596946236, + "loss": 0.1703, + "step": 253630 + }, + { + "epoch": 10.51, + "grad_norm": 0.62890625, + "learning_rate": 0.0003634265950926203, + "loss": 0.2401, + "step": 253640 + }, + { + "epoch": 10.51, + "grad_norm": 1.0625, + "learning_rate": 0.0003634169302771604, + "loss": 0.2, + "step": 253650 + }, + { + "epoch": 10.51, + "grad_norm": 0.9453125, + "learning_rate": 0.00036340726524826205, + "loss": 0.1726, + "step": 253660 + }, + { + "epoch": 10.51, + "grad_norm": 1.1953125, + "learning_rate": 0.0003633976000059435, + "loss": 0.2003, + "step": 253670 + }, + { + "epoch": 10.51, + "grad_norm": 0.68359375, + "learning_rate": 0.00036338793455022286, + "loss": 0.1995, + "step": 253680 + }, + { + "epoch": 10.51, + "grad_norm": 0.58203125, + "learning_rate": 0.0003633782688811184, + "loss": 0.2135, + "step": 253690 + }, + { + "epoch": 10.51, + "grad_norm": 1.5, + "learning_rate": 0.00036336860299864834, + "loss": 0.1869, + "step": 253700 + }, + { + "epoch": 10.51, + "grad_norm": 1.0, + "learning_rate": 0.00036335893690283075, + "loss": 0.2285, + "step": 253710 + }, + { + "epoch": 10.51, + "grad_norm": 0.80859375, + "learning_rate": 0.0003633492705936839, + "loss": 0.2136, + "step": 253720 + }, + { + "epoch": 10.51, + "grad_norm": 1.203125, + "learning_rate": 0.000363339604071226, + "loss": 0.1653, + "step": 253730 + }, + { + "epoch": 10.51, + "grad_norm": 0.5390625, + "learning_rate": 0.0003633299373354751, + "loss": 0.1603, + "step": 253740 + }, + { + "epoch": 10.51, + "grad_norm": 0.671875, + "learning_rate": 0.0003633202703864496, + "loss": 0.2071, + "step": 253750 + }, + { + "epoch": 10.51, + "grad_norm": 1.2734375, + "learning_rate": 0.00036331060322416755, + "loss": 0.2274, + "step": 253760 + }, + { + "epoch": 10.51, + "grad_norm": 1.0234375, + "learning_rate": 0.00036330093584864716, + "loss": 0.199, + "step": 253770 + }, + { + "epoch": 10.51, + "grad_norm": 0.875, + "learning_rate": 0.0003632912682599067, + "loss": 0.1943, + "step": 253780 + }, + { + "epoch": 10.51, + "grad_norm": 0.59375, + "learning_rate": 0.00036328160045796424, + "loss": 0.1637, + "step": 253790 + }, + { + "epoch": 10.51, + "grad_norm": 0.330078125, + "learning_rate": 0.0003632719324428381, + "loss": 0.1514, + "step": 253800 + }, + { + "epoch": 10.51, + "grad_norm": 0.625, + "learning_rate": 0.00036326226421454645, + "loss": 0.1695, + "step": 253810 + }, + { + "epoch": 10.51, + "grad_norm": 0.84375, + "learning_rate": 0.00036325259577310733, + "loss": 0.1741, + "step": 253820 + }, + { + "epoch": 10.51, + "grad_norm": 0.7421875, + "learning_rate": 0.0003632429271185391, + "loss": 0.2139, + "step": 253830 + }, + { + "epoch": 10.51, + "grad_norm": 0.88671875, + "learning_rate": 0.00036323325825085996, + "loss": 0.2089, + "step": 253840 + }, + { + "epoch": 10.51, + "grad_norm": 0.59375, + "learning_rate": 0.000363223589170088, + "loss": 0.2147, + "step": 253850 + }, + { + "epoch": 10.51, + "grad_norm": 0.482421875, + "learning_rate": 0.0003632139198762415, + "loss": 0.1915, + "step": 253860 + }, + { + "epoch": 10.52, + "grad_norm": 0.87109375, + "learning_rate": 0.00036320425036933865, + "loss": 0.2103, + "step": 253870 + }, + { + "epoch": 10.52, + "grad_norm": 0.7890625, + "learning_rate": 0.0003631945806493976, + "loss": 0.2553, + "step": 253880 + }, + { + "epoch": 10.52, + "grad_norm": 1.7734375, + "learning_rate": 0.00036318491071643663, + "loss": 0.2317, + "step": 253890 + }, + { + "epoch": 10.52, + "grad_norm": 0.5703125, + "learning_rate": 0.00036317524057047373, + "loss": 0.1757, + "step": 253900 + }, + { + "epoch": 10.52, + "grad_norm": 0.7734375, + "learning_rate": 0.0003631655702115274, + "loss": 0.2338, + "step": 253910 + }, + { + "epoch": 10.52, + "grad_norm": 0.5234375, + "learning_rate": 0.0003631558996396156, + "loss": 0.2177, + "step": 253920 + }, + { + "epoch": 10.52, + "grad_norm": 0.7109375, + "learning_rate": 0.0003631462288547566, + "loss": 0.1702, + "step": 253930 + }, + { + "epoch": 10.52, + "grad_norm": 0.392578125, + "learning_rate": 0.00036313655785696875, + "loss": 0.1676, + "step": 253940 + }, + { + "epoch": 10.52, + "grad_norm": 0.84375, + "learning_rate": 0.00036312688664627, + "loss": 0.2005, + "step": 253950 + }, + { + "epoch": 10.52, + "grad_norm": 0.96875, + "learning_rate": 0.0003631172152226787, + "loss": 0.2471, + "step": 253960 + }, + { + "epoch": 10.52, + "grad_norm": 0.265625, + "learning_rate": 0.0003631075435862131, + "loss": 0.179, + "step": 253970 + }, + { + "epoch": 10.52, + "grad_norm": 2.046875, + "learning_rate": 0.0003630978717368911, + "loss": 0.2291, + "step": 253980 + }, + { + "epoch": 10.52, + "grad_norm": 1.84375, + "learning_rate": 0.0003630881996747313, + "loss": 0.2747, + "step": 253990 + }, + { + "epoch": 10.52, + "grad_norm": 0.77734375, + "learning_rate": 0.00036307852739975166, + "loss": 0.228, + "step": 254000 + }, + { + "epoch": 10.52, + "grad_norm": 0.55078125, + "learning_rate": 0.00036306885491197045, + "loss": 0.2039, + "step": 254010 + }, + { + "epoch": 10.52, + "grad_norm": 0.94921875, + "learning_rate": 0.0003630591822114059, + "loss": 0.1634, + "step": 254020 + }, + { + "epoch": 10.52, + "grad_norm": 1.265625, + "learning_rate": 0.00036304950929807604, + "loss": 0.2246, + "step": 254030 + }, + { + "epoch": 10.52, + "grad_norm": 0.828125, + "learning_rate": 0.00036303983617199943, + "loss": 0.1494, + "step": 254040 + }, + { + "epoch": 10.52, + "grad_norm": 0.55859375, + "learning_rate": 0.00036303016283319395, + "loss": 0.244, + "step": 254050 + }, + { + "epoch": 10.52, + "grad_norm": 1.0859375, + "learning_rate": 0.00036302048928167787, + "loss": 0.1817, + "step": 254060 + }, + { + "epoch": 10.52, + "grad_norm": 0.55078125, + "learning_rate": 0.0003630108155174695, + "loss": 0.2296, + "step": 254070 + }, + { + "epoch": 10.52, + "grad_norm": 0.44921875, + "learning_rate": 0.00036300114154058687, + "loss": 0.1925, + "step": 254080 + }, + { + "epoch": 10.52, + "grad_norm": 0.57421875, + "learning_rate": 0.0003629914673510484, + "loss": 0.2307, + "step": 254090 + }, + { + "epoch": 10.52, + "grad_norm": 0.8359375, + "learning_rate": 0.00036298179294887216, + "loss": 0.1812, + "step": 254100 + }, + { + "epoch": 10.53, + "grad_norm": 0.84765625, + "learning_rate": 0.0003629721183340764, + "loss": 0.2328, + "step": 254110 + }, + { + "epoch": 10.53, + "grad_norm": 1.2890625, + "learning_rate": 0.0003629624435066793, + "loss": 0.1998, + "step": 254120 + }, + { + "epoch": 10.53, + "grad_norm": 0.64453125, + "learning_rate": 0.00036295276846669907, + "loss": 0.2127, + "step": 254130 + }, + { + "epoch": 10.53, + "grad_norm": 1.3984375, + "learning_rate": 0.0003629430932141539, + "loss": 0.2283, + "step": 254140 + }, + { + "epoch": 10.53, + "grad_norm": 1.265625, + "learning_rate": 0.00036293341774906214, + "loss": 0.2075, + "step": 254150 + }, + { + "epoch": 10.53, + "grad_norm": 0.69140625, + "learning_rate": 0.0003629237420714418, + "loss": 0.1953, + "step": 254160 + }, + { + "epoch": 10.53, + "grad_norm": 0.94140625, + "learning_rate": 0.0003629140661813112, + "loss": 0.2093, + "step": 254170 + }, + { + "epoch": 10.53, + "grad_norm": 0.984375, + "learning_rate": 0.0003629043900786885, + "loss": 0.1911, + "step": 254180 + }, + { + "epoch": 10.53, + "grad_norm": 1.3125, + "learning_rate": 0.00036289471376359185, + "loss": 0.1825, + "step": 254190 + }, + { + "epoch": 10.53, + "grad_norm": 0.234375, + "learning_rate": 0.00036288503723603974, + "loss": 0.1598, + "step": 254200 + }, + { + "epoch": 10.53, + "grad_norm": 0.005096435546875, + "learning_rate": 0.00036287536049605, + "loss": 0.197, + "step": 254210 + }, + { + "epoch": 10.53, + "grad_norm": 0.341796875, + "learning_rate": 0.0003628656835436411, + "loss": 0.1785, + "step": 254220 + }, + { + "epoch": 10.53, + "grad_norm": 1.4140625, + "learning_rate": 0.00036285600637883116, + "loss": 0.1492, + "step": 254230 + }, + { + "epoch": 10.53, + "grad_norm": 0.9453125, + "learning_rate": 0.0003628463290016384, + "loss": 0.1793, + "step": 254240 + }, + { + "epoch": 10.53, + "grad_norm": 0.8828125, + "learning_rate": 0.000362836651412081, + "loss": 0.2202, + "step": 254250 + }, + { + "epoch": 10.53, + "grad_norm": 1.1015625, + "learning_rate": 0.0003628269736101773, + "loss": 0.2238, + "step": 254260 + }, + { + "epoch": 10.53, + "grad_norm": 0.5625, + "learning_rate": 0.00036281729559594527, + "loss": 0.1975, + "step": 254270 + }, + { + "epoch": 10.53, + "grad_norm": 0.98828125, + "learning_rate": 0.0003628076173694034, + "loss": 0.2227, + "step": 254280 + }, + { + "epoch": 10.53, + "grad_norm": 0.46484375, + "learning_rate": 0.0003627979389305698, + "loss": 0.1791, + "step": 254290 + }, + { + "epoch": 10.53, + "grad_norm": 0.5078125, + "learning_rate": 0.00036278826027946254, + "loss": 0.2177, + "step": 254300 + }, + { + "epoch": 10.53, + "grad_norm": 0.2890625, + "learning_rate": 0.0003627785814161001, + "loss": 0.1803, + "step": 254310 + }, + { + "epoch": 10.53, + "grad_norm": 1.2265625, + "learning_rate": 0.0003627689023405004, + "loss": 0.2038, + "step": 254320 + }, + { + "epoch": 10.53, + "grad_norm": 0.6328125, + "learning_rate": 0.0003627592230526819, + "loss": 0.1861, + "step": 254330 + }, + { + "epoch": 10.53, + "grad_norm": 1.234375, + "learning_rate": 0.0003627495435526627, + "loss": 0.1616, + "step": 254340 + }, + { + "epoch": 10.54, + "grad_norm": 1.2734375, + "learning_rate": 0.00036273986384046097, + "loss": 0.176, + "step": 254350 + }, + { + "epoch": 10.54, + "grad_norm": 1.21875, + "learning_rate": 0.0003627301839160951, + "loss": 0.1972, + "step": 254360 + }, + { + "epoch": 10.54, + "grad_norm": 1.1796875, + "learning_rate": 0.0003627205037795831, + "loss": 0.2334, + "step": 254370 + }, + { + "epoch": 10.54, + "grad_norm": 0.8125, + "learning_rate": 0.0003627108234309433, + "loss": 0.1704, + "step": 254380 + }, + { + "epoch": 10.54, + "grad_norm": 0.5625, + "learning_rate": 0.00036270114287019397, + "loss": 0.2221, + "step": 254390 + }, + { + "epoch": 10.54, + "grad_norm": 0.8203125, + "learning_rate": 0.0003626914620973532, + "loss": 0.1703, + "step": 254400 + }, + { + "epoch": 10.54, + "grad_norm": 0.5859375, + "learning_rate": 0.0003626817811124393, + "loss": 0.2079, + "step": 254410 + }, + { + "epoch": 10.54, + "grad_norm": 1.375, + "learning_rate": 0.00036267209991547046, + "loss": 0.2152, + "step": 254420 + }, + { + "epoch": 10.54, + "grad_norm": 0.41015625, + "learning_rate": 0.0003626624185064648, + "loss": 0.195, + "step": 254430 + }, + { + "epoch": 10.54, + "grad_norm": 0.6015625, + "learning_rate": 0.0003626527368854408, + "loss": 0.1829, + "step": 254440 + }, + { + "epoch": 10.54, + "grad_norm": 0.49609375, + "learning_rate": 0.0003626430550524164, + "loss": 0.2055, + "step": 254450 + }, + { + "epoch": 10.54, + "grad_norm": 1.5625, + "learning_rate": 0.00036263337300740996, + "loss": 0.2303, + "step": 254460 + }, + { + "epoch": 10.54, + "grad_norm": 0.55078125, + "learning_rate": 0.00036262369075043966, + "loss": 0.2107, + "step": 254470 + }, + { + "epoch": 10.54, + "grad_norm": 0.60546875, + "learning_rate": 0.0003626140082815238, + "loss": 0.2166, + "step": 254480 + }, + { + "epoch": 10.54, + "grad_norm": 1.0078125, + "learning_rate": 0.00036260432560068046, + "loss": 0.1835, + "step": 254490 + }, + { + "epoch": 10.54, + "grad_norm": 1.2421875, + "learning_rate": 0.000362594642707928, + "loss": 0.1951, + "step": 254500 + }, + { + "epoch": 10.54, + "grad_norm": 1.1484375, + "learning_rate": 0.0003625849596032845, + "loss": 0.2369, + "step": 254510 + }, + { + "epoch": 10.54, + "grad_norm": 1.0390625, + "learning_rate": 0.00036257527628676833, + "loss": 0.2233, + "step": 254520 + }, + { + "epoch": 10.54, + "grad_norm": 0.796875, + "learning_rate": 0.0003625655927583976, + "loss": 0.2457, + "step": 254530 + }, + { + "epoch": 10.54, + "grad_norm": 0.31640625, + "learning_rate": 0.0003625559090181906, + "loss": 0.1839, + "step": 254540 + }, + { + "epoch": 10.54, + "grad_norm": 0.85546875, + "learning_rate": 0.0003625462250661656, + "loss": 0.2156, + "step": 254550 + }, + { + "epoch": 10.54, + "grad_norm": 0.91015625, + "learning_rate": 0.0003625365409023407, + "loss": 0.1892, + "step": 254560 + }, + { + "epoch": 10.54, + "grad_norm": 0.91015625, + "learning_rate": 0.0003625268565267342, + "loss": 0.148, + "step": 254570 + }, + { + "epoch": 10.54, + "grad_norm": 0.7265625, + "learning_rate": 0.0003625171719393643, + "loss": 0.2094, + "step": 254580 + }, + { + "epoch": 10.55, + "grad_norm": 0.66796875, + "learning_rate": 0.00036250748714024923, + "loss": 0.2646, + "step": 254590 + }, + { + "epoch": 10.55, + "grad_norm": 0.4765625, + "learning_rate": 0.00036249780212940726, + "loss": 0.1739, + "step": 254600 + }, + { + "epoch": 10.55, + "grad_norm": 1.0390625, + "learning_rate": 0.00036248811690685654, + "loss": 0.1644, + "step": 254610 + }, + { + "epoch": 10.55, + "grad_norm": 0.8984375, + "learning_rate": 0.0003624784314726153, + "loss": 0.2063, + "step": 254620 + }, + { + "epoch": 10.55, + "grad_norm": 0.9765625, + "learning_rate": 0.00036246874582670184, + "loss": 0.1533, + "step": 254630 + }, + { + "epoch": 10.55, + "grad_norm": 0.515625, + "learning_rate": 0.0003624590599691343, + "loss": 0.2499, + "step": 254640 + }, + { + "epoch": 10.55, + "grad_norm": 1.859375, + "learning_rate": 0.00036244937389993097, + "loss": 0.1985, + "step": 254650 + }, + { + "epoch": 10.55, + "grad_norm": 0.435546875, + "learning_rate": 0.0003624396876191101, + "loss": 0.197, + "step": 254660 + }, + { + "epoch": 10.55, + "grad_norm": 1.5390625, + "learning_rate": 0.00036243000112668984, + "loss": 0.2383, + "step": 254670 + }, + { + "epoch": 10.55, + "grad_norm": 0.484375, + "learning_rate": 0.00036242031442268853, + "loss": 0.2136, + "step": 254680 + }, + { + "epoch": 10.55, + "grad_norm": 0.4375, + "learning_rate": 0.00036241062750712425, + "loss": 0.2076, + "step": 254690 + }, + { + "epoch": 10.55, + "grad_norm": 1.3671875, + "learning_rate": 0.0003624009403800153, + "loss": 0.2293, + "step": 254700 + }, + { + "epoch": 10.55, + "grad_norm": 0.8359375, + "learning_rate": 0.00036239125304137994, + "loss": 0.2051, + "step": 254710 + }, + { + "epoch": 10.55, + "grad_norm": 1.15625, + "learning_rate": 0.00036238156549123636, + "loss": 0.1351, + "step": 254720 + }, + { + "epoch": 10.55, + "grad_norm": 1.6875, + "learning_rate": 0.00036237187772960287, + "loss": 0.184, + "step": 254730 + }, + { + "epoch": 10.55, + "grad_norm": 0.322265625, + "learning_rate": 0.0003623621897564976, + "loss": 0.155, + "step": 254740 + }, + { + "epoch": 10.55, + "grad_norm": 0.6640625, + "learning_rate": 0.00036235250157193885, + "loss": 0.2066, + "step": 254750 + }, + { + "epoch": 10.55, + "grad_norm": 1.078125, + "learning_rate": 0.00036234281317594486, + "loss": 0.1587, + "step": 254760 + }, + { + "epoch": 10.55, + "grad_norm": 0.80078125, + "learning_rate": 0.0003623331245685337, + "loss": 0.1936, + "step": 254770 + }, + { + "epoch": 10.55, + "grad_norm": 0.392578125, + "learning_rate": 0.0003623234357497238, + "loss": 0.1659, + "step": 254780 + }, + { + "epoch": 10.55, + "grad_norm": 0.46875, + "learning_rate": 0.0003623137467195334, + "loss": 0.2138, + "step": 254790 + }, + { + "epoch": 10.55, + "grad_norm": 1.3984375, + "learning_rate": 0.0003623040574779806, + "loss": 0.1965, + "step": 254800 + }, + { + "epoch": 10.55, + "grad_norm": 0.46484375, + "learning_rate": 0.00036229436802508365, + "loss": 0.2107, + "step": 254810 + }, + { + "epoch": 10.55, + "grad_norm": 0.83984375, + "learning_rate": 0.00036228467836086085, + "loss": 0.2529, + "step": 254820 + }, + { + "epoch": 10.56, + "grad_norm": 0.423828125, + "learning_rate": 0.0003622749884853304, + "loss": 0.1867, + "step": 254830 + }, + { + "epoch": 10.56, + "grad_norm": 0.83984375, + "learning_rate": 0.0003622652983985106, + "loss": 0.195, + "step": 254840 + }, + { + "epoch": 10.56, + "grad_norm": 2.15625, + "learning_rate": 0.0003622556081004196, + "loss": 0.2268, + "step": 254850 + }, + { + "epoch": 10.56, + "grad_norm": 0.55859375, + "learning_rate": 0.0003622459175910757, + "loss": 0.2474, + "step": 254860 + }, + { + "epoch": 10.56, + "grad_norm": 0.875, + "learning_rate": 0.000362236226870497, + "loss": 0.266, + "step": 254870 + }, + { + "epoch": 10.56, + "grad_norm": 0.828125, + "learning_rate": 0.00036222653593870194, + "loss": 0.2332, + "step": 254880 + }, + { + "epoch": 10.56, + "grad_norm": 1.265625, + "learning_rate": 0.0003622168447957086, + "loss": 0.1629, + "step": 254890 + }, + { + "epoch": 10.56, + "grad_norm": 1.953125, + "learning_rate": 0.0003622071534415353, + "loss": 0.212, + "step": 254900 + }, + { + "epoch": 10.56, + "grad_norm": 1.0859375, + "learning_rate": 0.0003621974618762003, + "loss": 0.2378, + "step": 254910 + }, + { + "epoch": 10.56, + "grad_norm": 0.53515625, + "learning_rate": 0.0003621877700997217, + "loss": 0.2183, + "step": 254920 + }, + { + "epoch": 10.56, + "grad_norm": 1.0078125, + "learning_rate": 0.00036217807811211796, + "loss": 0.1818, + "step": 254930 + }, + { + "epoch": 10.56, + "grad_norm": 0.40234375, + "learning_rate": 0.0003621683859134071, + "loss": 0.2418, + "step": 254940 + }, + { + "epoch": 10.56, + "grad_norm": 0.482421875, + "learning_rate": 0.0003621586935036074, + "loss": 0.1966, + "step": 254950 + }, + { + "epoch": 10.56, + "grad_norm": 0.5390625, + "learning_rate": 0.0003621490008827372, + "loss": 0.2006, + "step": 254960 + }, + { + "epoch": 10.56, + "grad_norm": 1.125, + "learning_rate": 0.0003621393080508148, + "loss": 0.1878, + "step": 254970 + }, + { + "epoch": 10.56, + "grad_norm": 0.6953125, + "learning_rate": 0.0003621296150078582, + "loss": 0.2085, + "step": 254980 + }, + { + "epoch": 10.56, + "grad_norm": 1.0625, + "learning_rate": 0.00036211992175388583, + "loss": 0.2028, + "step": 254990 + }, + { + "epoch": 10.56, + "grad_norm": 0.9140625, + "learning_rate": 0.00036211022828891583, + "loss": 0.2627, + "step": 255000 + }, + { + "epoch": 10.56, + "grad_norm": 1.640625, + "learning_rate": 0.00036210053461296655, + "loss": 0.2156, + "step": 255010 + }, + { + "epoch": 10.56, + "grad_norm": 0.7890625, + "learning_rate": 0.0003620908407260561, + "loss": 0.2632, + "step": 255020 + }, + { + "epoch": 10.56, + "grad_norm": 0.58203125, + "learning_rate": 0.00036208114662820284, + "loss": 0.1798, + "step": 255030 + }, + { + "epoch": 10.56, + "grad_norm": 0.90234375, + "learning_rate": 0.00036207145231942497, + "loss": 0.193, + "step": 255040 + }, + { + "epoch": 10.56, + "grad_norm": 1.203125, + "learning_rate": 0.0003620617577997407, + "loss": 0.2453, + "step": 255050 + }, + { + "epoch": 10.56, + "grad_norm": 0.609375, + "learning_rate": 0.0003620520630691684, + "loss": 0.2073, + "step": 255060 + }, + { + "epoch": 10.56, + "grad_norm": 0.0, + "learning_rate": 0.0003620423681277261, + "loss": 0.2079, + "step": 255070 + }, + { + "epoch": 10.57, + "grad_norm": 0.640625, + "learning_rate": 0.0003620326729754322, + "loss": 0.2213, + "step": 255080 + }, + { + "epoch": 10.57, + "grad_norm": 1.265625, + "learning_rate": 0.0003620229776123049, + "loss": 0.1486, + "step": 255090 + }, + { + "epoch": 10.57, + "grad_norm": 0.74609375, + "learning_rate": 0.0003620132820383625, + "loss": 0.2057, + "step": 255100 + }, + { + "epoch": 10.57, + "grad_norm": 0.16796875, + "learning_rate": 0.00036200358625362313, + "loss": 0.2258, + "step": 255110 + }, + { + "epoch": 10.57, + "grad_norm": 0.5546875, + "learning_rate": 0.0003619938902581051, + "loss": 0.2007, + "step": 255120 + }, + { + "epoch": 10.57, + "grad_norm": 0.6640625, + "learning_rate": 0.00036198419405182667, + "loss": 0.1898, + "step": 255130 + }, + { + "epoch": 10.57, + "grad_norm": 0.267578125, + "learning_rate": 0.0003619744976348062, + "loss": 0.2294, + "step": 255140 + }, + { + "epoch": 10.57, + "grad_norm": 2.53125, + "learning_rate": 0.0003619648010070616, + "loss": 0.2244, + "step": 255150 + }, + { + "epoch": 10.57, + "grad_norm": 0.640625, + "learning_rate": 0.0003619551041686116, + "loss": 0.1465, + "step": 255160 + }, + { + "epoch": 10.57, + "grad_norm": 0.96484375, + "learning_rate": 0.000361945407119474, + "loss": 0.187, + "step": 255170 + }, + { + "epoch": 10.57, + "grad_norm": 0.703125, + "learning_rate": 0.0003619357098596673, + "loss": 0.19, + "step": 255180 + }, + { + "epoch": 10.57, + "grad_norm": 1.109375, + "learning_rate": 0.00036192601238920966, + "loss": 0.219, + "step": 255190 + }, + { + "epoch": 10.57, + "grad_norm": 0.68359375, + "learning_rate": 0.0003619163147081193, + "loss": 0.2334, + "step": 255200 + }, + { + "epoch": 10.57, + "grad_norm": 1.015625, + "learning_rate": 0.00036190661681641457, + "loss": 0.1918, + "step": 255210 + }, + { + "epoch": 10.57, + "grad_norm": 1.28125, + "learning_rate": 0.0003618969187141137, + "loss": 0.2015, + "step": 255220 + }, + { + "epoch": 10.57, + "grad_norm": 0.64453125, + "learning_rate": 0.0003618872204012349, + "loss": 0.216, + "step": 255230 + }, + { + "epoch": 10.57, + "grad_norm": 0.408203125, + "learning_rate": 0.0003618775218777964, + "loss": 0.2183, + "step": 255240 + }, + { + "epoch": 10.57, + "grad_norm": 2.0, + "learning_rate": 0.00036186782314381655, + "loss": 0.1649, + "step": 255250 + }, + { + "epoch": 10.57, + "grad_norm": 0.38671875, + "learning_rate": 0.00036185812419931344, + "loss": 0.1983, + "step": 255260 + }, + { + "epoch": 10.57, + "grad_norm": 0.69921875, + "learning_rate": 0.0003618484250443055, + "loss": 0.2145, + "step": 255270 + }, + { + "epoch": 10.57, + "grad_norm": 0.58984375, + "learning_rate": 0.0003618387256788109, + "loss": 0.2534, + "step": 255280 + }, + { + "epoch": 10.57, + "grad_norm": 0.60546875, + "learning_rate": 0.00036182902610284784, + "loss": 0.2738, + "step": 255290 + }, + { + "epoch": 10.57, + "grad_norm": 0.99609375, + "learning_rate": 0.00036181932631643463, + "loss": 0.2097, + "step": 255300 + }, + { + "epoch": 10.57, + "grad_norm": 0.98046875, + "learning_rate": 0.00036180962631958955, + "loss": 0.1905, + "step": 255310 + }, + { + "epoch": 10.58, + "grad_norm": 0.796875, + "learning_rate": 0.00036179992611233086, + "loss": 0.1699, + "step": 255320 + }, + { + "epoch": 10.58, + "grad_norm": 1.09375, + "learning_rate": 0.0003617902256946768, + "loss": 0.1837, + "step": 255330 + }, + { + "epoch": 10.58, + "grad_norm": 0.62890625, + "learning_rate": 0.0003617805250666455, + "loss": 0.179, + "step": 255340 + }, + { + "epoch": 10.58, + "grad_norm": 0.9609375, + "learning_rate": 0.00036177082422825544, + "loss": 0.167, + "step": 255350 + }, + { + "epoch": 10.58, + "grad_norm": 0.99609375, + "learning_rate": 0.0003617611231795247, + "loss": 0.2226, + "step": 255360 + }, + { + "epoch": 10.58, + "grad_norm": 0.54296875, + "learning_rate": 0.0003617514219204716, + "loss": 0.2122, + "step": 255370 + }, + { + "epoch": 10.58, + "grad_norm": 0.859375, + "learning_rate": 0.0003617417204511144, + "loss": 0.1781, + "step": 255380 + }, + { + "epoch": 10.58, + "grad_norm": 1.03125, + "learning_rate": 0.00036173201877147133, + "loss": 0.1748, + "step": 255390 + }, + { + "epoch": 10.58, + "grad_norm": 0.78515625, + "learning_rate": 0.00036172231688156074, + "loss": 0.1856, + "step": 255400 + }, + { + "epoch": 10.58, + "grad_norm": 0.6953125, + "learning_rate": 0.00036171261478140074, + "loss": 0.1733, + "step": 255410 + }, + { + "epoch": 10.58, + "grad_norm": 1.734375, + "learning_rate": 0.0003617029124710096, + "loss": 0.1719, + "step": 255420 + }, + { + "epoch": 10.58, + "grad_norm": 0.828125, + "learning_rate": 0.00036169320995040576, + "loss": 0.2087, + "step": 255430 + }, + { + "epoch": 10.58, + "grad_norm": 1.1015625, + "learning_rate": 0.0003616835072196073, + "loss": 0.2289, + "step": 255440 + }, + { + "epoch": 10.58, + "grad_norm": 0.412109375, + "learning_rate": 0.00036167380427863254, + "loss": 0.1814, + "step": 255450 + }, + { + "epoch": 10.58, + "grad_norm": 0.46484375, + "learning_rate": 0.0003616641011274998, + "loss": 0.2035, + "step": 255460 + }, + { + "epoch": 10.58, + "grad_norm": 0.78515625, + "learning_rate": 0.0003616543977662272, + "loss": 0.208, + "step": 255470 + }, + { + "epoch": 10.58, + "grad_norm": 0.859375, + "learning_rate": 0.0003616446941948331, + "loss": 0.203, + "step": 255480 + }, + { + "epoch": 10.58, + "grad_norm": 0.73046875, + "learning_rate": 0.00036163499041333587, + "loss": 0.191, + "step": 255490 + }, + { + "epoch": 10.58, + "grad_norm": 0.81640625, + "learning_rate": 0.00036162528642175347, + "loss": 0.2417, + "step": 255500 + }, + { + "epoch": 10.58, + "grad_norm": 0.6484375, + "learning_rate": 0.00036161558222010446, + "loss": 0.1769, + "step": 255510 + }, + { + "epoch": 10.58, + "grad_norm": 1.3671875, + "learning_rate": 0.00036160587780840694, + "loss": 0.2063, + "step": 255520 + }, + { + "epoch": 10.58, + "grad_norm": 0.7109375, + "learning_rate": 0.00036159617318667913, + "loss": 0.1813, + "step": 255530 + }, + { + "epoch": 10.58, + "grad_norm": 0.76171875, + "learning_rate": 0.00036158646835493947, + "loss": 0.2269, + "step": 255540 + }, + { + "epoch": 10.58, + "grad_norm": 0.984375, + "learning_rate": 0.00036157676331320605, + "loss": 0.159, + "step": 255550 + }, + { + "epoch": 10.59, + "grad_norm": 1.171875, + "learning_rate": 0.00036156705806149726, + "loss": 0.1533, + "step": 255560 + }, + { + "epoch": 10.59, + "grad_norm": 0.7890625, + "learning_rate": 0.00036155735259983137, + "loss": 0.2372, + "step": 255570 + }, + { + "epoch": 10.59, + "grad_norm": 0.6796875, + "learning_rate": 0.00036154764692822647, + "loss": 0.2325, + "step": 255580 + }, + { + "epoch": 10.59, + "grad_norm": 0.95703125, + "learning_rate": 0.000361537941046701, + "loss": 0.1968, + "step": 255590 + }, + { + "epoch": 10.59, + "grad_norm": 0.455078125, + "learning_rate": 0.0003615282349552732, + "loss": 0.1972, + "step": 255600 + }, + { + "epoch": 10.59, + "grad_norm": 0.474609375, + "learning_rate": 0.00036151852865396123, + "loss": 0.1777, + "step": 255610 + }, + { + "epoch": 10.59, + "grad_norm": 0.98828125, + "learning_rate": 0.0003615088221427835, + "loss": 0.2106, + "step": 255620 + }, + { + "epoch": 10.59, + "grad_norm": 0.9765625, + "learning_rate": 0.0003614991154217582, + "loss": 0.1872, + "step": 255630 + }, + { + "epoch": 10.59, + "grad_norm": 0.3515625, + "learning_rate": 0.00036148940849090364, + "loss": 0.1671, + "step": 255640 + }, + { + "epoch": 10.59, + "grad_norm": 0.263671875, + "learning_rate": 0.000361479701350238, + "loss": 0.1879, + "step": 255650 + }, + { + "epoch": 10.59, + "grad_norm": 0.46484375, + "learning_rate": 0.0003614699939997795, + "loss": 0.1451, + "step": 255660 + }, + { + "epoch": 10.59, + "grad_norm": 0.95703125, + "learning_rate": 0.0003614602864395467, + "loss": 0.1979, + "step": 255670 + }, + { + "epoch": 10.59, + "grad_norm": 0.6875, + "learning_rate": 0.00036145057866955756, + "loss": 0.2071, + "step": 255680 + }, + { + "epoch": 10.59, + "grad_norm": 0.46484375, + "learning_rate": 0.0003614408706898305, + "loss": 0.21, + "step": 255690 + }, + { + "epoch": 10.59, + "grad_norm": 0.51953125, + "learning_rate": 0.00036143116250038376, + "loss": 0.1951, + "step": 255700 + }, + { + "epoch": 10.59, + "grad_norm": 1.21875, + "learning_rate": 0.00036142145410123557, + "loss": 0.1842, + "step": 255710 + }, + { + "epoch": 10.59, + "grad_norm": 1.2734375, + "learning_rate": 0.0003614117454924043, + "loss": 0.1906, + "step": 255720 + }, + { + "epoch": 10.59, + "grad_norm": 0.5, + "learning_rate": 0.00036140203667390814, + "loss": 0.2, + "step": 255730 + }, + { + "epoch": 10.59, + "grad_norm": 0.75390625, + "learning_rate": 0.0003613923276457654, + "loss": 0.1651, + "step": 255740 + }, + { + "epoch": 10.59, + "grad_norm": 0.578125, + "learning_rate": 0.00036138261840799424, + "loss": 0.2222, + "step": 255750 + }, + { + "epoch": 10.59, + "grad_norm": 0.78125, + "learning_rate": 0.0003613729089606131, + "loss": 0.185, + "step": 255760 + }, + { + "epoch": 10.59, + "grad_norm": 0.69921875, + "learning_rate": 0.00036136319930364007, + "loss": 0.19, + "step": 255770 + }, + { + "epoch": 10.59, + "grad_norm": 0.859375, + "learning_rate": 0.0003613534894370937, + "loss": 0.1642, + "step": 255780 + }, + { + "epoch": 10.59, + "grad_norm": 0.498046875, + "learning_rate": 0.0003613437793609919, + "loss": 0.186, + "step": 255790 + }, + { + "epoch": 10.6, + "grad_norm": 0.5078125, + "learning_rate": 0.0003613340690753533, + "loss": 0.1617, + "step": 255800 + }, + { + "epoch": 10.6, + "grad_norm": 1.1484375, + "learning_rate": 0.00036132435858019586, + "loss": 0.237, + "step": 255810 + }, + { + "epoch": 10.6, + "grad_norm": 0.0, + "learning_rate": 0.00036131464787553805, + "loss": 0.2003, + "step": 255820 + }, + { + "epoch": 10.6, + "grad_norm": 0.8359375, + "learning_rate": 0.0003613049369613982, + "loss": 0.2235, + "step": 255830 + }, + { + "epoch": 10.6, + "grad_norm": 0.208984375, + "learning_rate": 0.0003612952258377943, + "loss": 0.1632, + "step": 255840 + }, + { + "epoch": 10.6, + "grad_norm": 0.64453125, + "learning_rate": 0.0003612855145047449, + "loss": 0.2032, + "step": 255850 + }, + { + "epoch": 10.6, + "grad_norm": 1.234375, + "learning_rate": 0.0003612758029622682, + "loss": 0.1426, + "step": 255860 + }, + { + "epoch": 10.6, + "grad_norm": 1.9765625, + "learning_rate": 0.00036126609121038235, + "loss": 0.2288, + "step": 255870 + }, + { + "epoch": 10.6, + "grad_norm": 0.953125, + "learning_rate": 0.0003612563792491059, + "loss": 0.2031, + "step": 255880 + }, + { + "epoch": 10.6, + "grad_norm": 1.15625, + "learning_rate": 0.0003612466670784568, + "loss": 0.1685, + "step": 255890 + }, + { + "epoch": 10.6, + "grad_norm": 0.296875, + "learning_rate": 0.0003612369546984535, + "loss": 0.1933, + "step": 255900 + }, + { + "epoch": 10.6, + "grad_norm": 0.578125, + "learning_rate": 0.0003612272421091144, + "loss": 0.1821, + "step": 255910 + }, + { + "epoch": 10.6, + "grad_norm": 0.5390625, + "learning_rate": 0.00036121752931045755, + "loss": 0.2075, + "step": 255920 + }, + { + "epoch": 10.6, + "grad_norm": 0.345703125, + "learning_rate": 0.00036120781630250134, + "loss": 0.1998, + "step": 255930 + }, + { + "epoch": 10.6, + "grad_norm": 1.0625, + "learning_rate": 0.00036119810308526403, + "loss": 0.2092, + "step": 255940 + }, + { + "epoch": 10.6, + "grad_norm": 1.171875, + "learning_rate": 0.0003611883896587638, + "loss": 0.2125, + "step": 255950 + }, + { + "epoch": 10.6, + "grad_norm": 0.388671875, + "learning_rate": 0.0003611786760230191, + "loss": 0.1897, + "step": 255960 + }, + { + "epoch": 10.6, + "grad_norm": 0.68359375, + "learning_rate": 0.00036116896217804816, + "loss": 0.1919, + "step": 255970 + }, + { + "epoch": 10.6, + "grad_norm": 0.7890625, + "learning_rate": 0.0003611592481238692, + "loss": 0.1905, + "step": 255980 + }, + { + "epoch": 10.6, + "grad_norm": 0.515625, + "learning_rate": 0.00036114953386050063, + "loss": 0.1932, + "step": 255990 + }, + { + "epoch": 10.6, + "grad_norm": 0.251953125, + "learning_rate": 0.0003611398193879606, + "loss": 0.1795, + "step": 256000 + }, + { + "epoch": 10.6, + "grad_norm": 1.984375, + "learning_rate": 0.0003611301047062674, + "loss": 0.2126, + "step": 256010 + }, + { + "epoch": 10.6, + "grad_norm": 0.6484375, + "learning_rate": 0.0003611203898154393, + "loss": 0.1872, + "step": 256020 + }, + { + "epoch": 10.6, + "grad_norm": 1.046875, + "learning_rate": 0.0003611106747154947, + "loss": 0.1884, + "step": 256030 + }, + { + "epoch": 10.61, + "grad_norm": 0.79296875, + "learning_rate": 0.00036110095940645185, + "loss": 0.2322, + "step": 256040 + }, + { + "epoch": 10.61, + "grad_norm": 0.90234375, + "learning_rate": 0.0003610912438883289, + "loss": 0.2154, + "step": 256050 + }, + { + "epoch": 10.61, + "grad_norm": 0.734375, + "learning_rate": 0.00036108152816114424, + "loss": 0.1812, + "step": 256060 + }, + { + "epoch": 10.61, + "grad_norm": 0.2470703125, + "learning_rate": 0.0003610718122249162, + "loss": 0.1944, + "step": 256070 + }, + { + "epoch": 10.61, + "grad_norm": 0.6640625, + "learning_rate": 0.00036106209607966294, + "loss": 0.167, + "step": 256080 + }, + { + "epoch": 10.61, + "grad_norm": 2.046875, + "learning_rate": 0.0003610523797254028, + "loss": 0.2467, + "step": 256090 + }, + { + "epoch": 10.61, + "grad_norm": 0.56640625, + "learning_rate": 0.0003610426631621541, + "loss": 0.2105, + "step": 256100 + }, + { + "epoch": 10.61, + "grad_norm": 0.546875, + "learning_rate": 0.0003610329463899351, + "loss": 0.1872, + "step": 256110 + }, + { + "epoch": 10.61, + "grad_norm": 0.56640625, + "learning_rate": 0.0003610232294087641, + "loss": 0.1669, + "step": 256120 + }, + { + "epoch": 10.61, + "grad_norm": 0.84375, + "learning_rate": 0.00036101351221865934, + "loss": 0.2002, + "step": 256130 + }, + { + "epoch": 10.61, + "grad_norm": 1.390625, + "learning_rate": 0.0003610037948196392, + "loss": 0.1607, + "step": 256140 + }, + { + "epoch": 10.61, + "grad_norm": 0.56640625, + "learning_rate": 0.00036099407721172183, + "loss": 0.1941, + "step": 256150 + }, + { + "epoch": 10.61, + "grad_norm": 1.28125, + "learning_rate": 0.00036098435939492567, + "loss": 0.1711, + "step": 256160 + }, + { + "epoch": 10.61, + "grad_norm": 1.359375, + "learning_rate": 0.00036097464136926886, + "loss": 0.1562, + "step": 256170 + }, + { + "epoch": 10.61, + "grad_norm": 0.71875, + "learning_rate": 0.0003609649231347698, + "loss": 0.2329, + "step": 256180 + }, + { + "epoch": 10.61, + "grad_norm": 0.396484375, + "learning_rate": 0.0003609552046914467, + "loss": 0.1664, + "step": 256190 + }, + { + "epoch": 10.61, + "grad_norm": 1.2265625, + "learning_rate": 0.00036094548603931795, + "loss": 0.206, + "step": 256200 + }, + { + "epoch": 10.61, + "grad_norm": 0.1904296875, + "learning_rate": 0.0003609357671784017, + "loss": 0.2011, + "step": 256210 + }, + { + "epoch": 10.61, + "grad_norm": 0.55078125, + "learning_rate": 0.00036092604810871636, + "loss": 0.2033, + "step": 256220 + }, + { + "epoch": 10.61, + "grad_norm": 0.82421875, + "learning_rate": 0.00036091632883028014, + "loss": 0.1861, + "step": 256230 + }, + { + "epoch": 10.61, + "grad_norm": 0.515625, + "learning_rate": 0.0003609066093431115, + "loss": 0.192, + "step": 256240 + }, + { + "epoch": 10.61, + "grad_norm": 0.36328125, + "learning_rate": 0.0003608968896472284, + "loss": 0.1644, + "step": 256250 + }, + { + "epoch": 10.61, + "grad_norm": 0.6171875, + "learning_rate": 0.00036088716974264946, + "loss": 0.2378, + "step": 256260 + }, + { + "epoch": 10.61, + "grad_norm": 0.84765625, + "learning_rate": 0.0003608774496293928, + "loss": 0.1971, + "step": 256270 + }, + { + "epoch": 10.62, + "grad_norm": 0.640625, + "learning_rate": 0.0003608677293074768, + "loss": 0.1558, + "step": 256280 + }, + { + "epoch": 10.62, + "grad_norm": 0.376953125, + "learning_rate": 0.00036085800877691965, + "loss": 0.1629, + "step": 256290 + }, + { + "epoch": 10.62, + "grad_norm": 0.2451171875, + "learning_rate": 0.00036084828803773975, + "loss": 0.198, + "step": 256300 + }, + { + "epoch": 10.62, + "grad_norm": 0.953125, + "learning_rate": 0.0003608385670899552, + "loss": 0.2475, + "step": 256310 + }, + { + "epoch": 10.62, + "grad_norm": 0.90234375, + "learning_rate": 0.0003608288459335846, + "loss": 0.1786, + "step": 256320 + }, + { + "epoch": 10.62, + "grad_norm": 2.453125, + "learning_rate": 0.000360819124568646, + "loss": 0.2066, + "step": 256330 + }, + { + "epoch": 10.62, + "grad_norm": 1.234375, + "learning_rate": 0.0003608094029951578, + "loss": 0.1846, + "step": 256340 + }, + { + "epoch": 10.62, + "grad_norm": 0.498046875, + "learning_rate": 0.0003607996812131383, + "loss": 0.1709, + "step": 256350 + }, + { + "epoch": 10.62, + "grad_norm": 0.57421875, + "learning_rate": 0.0003607899592226058, + "loss": 0.1953, + "step": 256360 + }, + { + "epoch": 10.62, + "grad_norm": 1.4296875, + "learning_rate": 0.00036078023702357844, + "loss": 0.2016, + "step": 256370 + }, + { + "epoch": 10.62, + "grad_norm": 1.1796875, + "learning_rate": 0.0003607705146160747, + "loss": 0.1874, + "step": 256380 + }, + { + "epoch": 10.62, + "grad_norm": 1.0, + "learning_rate": 0.00036076079200011275, + "loss": 0.2448, + "step": 256390 + }, + { + "epoch": 10.62, + "grad_norm": 0.59765625, + "learning_rate": 0.00036075106917571103, + "loss": 0.205, + "step": 256400 + }, + { + "epoch": 10.62, + "grad_norm": 0.80859375, + "learning_rate": 0.00036074134614288777, + "loss": 0.2475, + "step": 256410 + }, + { + "epoch": 10.62, + "grad_norm": 0.74609375, + "learning_rate": 0.00036073162290166117, + "loss": 0.2013, + "step": 256420 + }, + { + "epoch": 10.62, + "grad_norm": 1.53125, + "learning_rate": 0.00036072189945204967, + "loss": 0.2112, + "step": 256430 + }, + { + "epoch": 10.62, + "grad_norm": 0.494140625, + "learning_rate": 0.00036071217579407153, + "loss": 0.212, + "step": 256440 + }, + { + "epoch": 10.62, + "grad_norm": 0.70703125, + "learning_rate": 0.000360702451927745, + "loss": 0.153, + "step": 256450 + }, + { + "epoch": 10.62, + "grad_norm": 0.318359375, + "learning_rate": 0.0003606927278530884, + "loss": 0.1814, + "step": 256460 + }, + { + "epoch": 10.62, + "grad_norm": 0.86328125, + "learning_rate": 0.00036068300357012005, + "loss": 0.2063, + "step": 256470 + }, + { + "epoch": 10.62, + "grad_norm": 0.85546875, + "learning_rate": 0.00036067327907885827, + "loss": 0.2167, + "step": 256480 + }, + { + "epoch": 10.62, + "grad_norm": 0.62109375, + "learning_rate": 0.0003606635543793213, + "loss": 0.1808, + "step": 256490 + }, + { + "epoch": 10.62, + "grad_norm": 1.359375, + "learning_rate": 0.0003606538294715275, + "loss": 0.1988, + "step": 256500 + }, + { + "epoch": 10.62, + "grad_norm": 1.2890625, + "learning_rate": 0.0003606441043554951, + "loss": 0.2108, + "step": 256510 + }, + { + "epoch": 10.63, + "grad_norm": 0.5078125, + "learning_rate": 0.00036063437903124246, + "loss": 0.164, + "step": 256520 + }, + { + "epoch": 10.63, + "grad_norm": 0.4609375, + "learning_rate": 0.00036062465349878794, + "loss": 0.1977, + "step": 256530 + }, + { + "epoch": 10.63, + "grad_norm": 0.55859375, + "learning_rate": 0.00036061492775814964, + "loss": 0.2085, + "step": 256540 + }, + { + "epoch": 10.63, + "grad_norm": 0.330078125, + "learning_rate": 0.0003606052018093461, + "loss": 0.2228, + "step": 256550 + }, + { + "epoch": 10.63, + "grad_norm": 0.443359375, + "learning_rate": 0.0003605954756523954, + "loss": 0.2169, + "step": 256560 + }, + { + "epoch": 10.63, + "grad_norm": 0.62890625, + "learning_rate": 0.000360585749287316, + "loss": 0.222, + "step": 256570 + }, + { + "epoch": 10.63, + "grad_norm": 0.91796875, + "learning_rate": 0.00036057602271412625, + "loss": 0.214, + "step": 256580 + }, + { + "epoch": 10.63, + "grad_norm": 0.412109375, + "learning_rate": 0.0003605662959328443, + "loss": 0.1943, + "step": 256590 + }, + { + "epoch": 10.63, + "grad_norm": 0.61328125, + "learning_rate": 0.00036055656894348856, + "loss": 0.203, + "step": 256600 + }, + { + "epoch": 10.63, + "grad_norm": 0.89453125, + "learning_rate": 0.0003605468417460772, + "loss": 0.1669, + "step": 256610 + }, + { + "epoch": 10.63, + "grad_norm": 0.4296875, + "learning_rate": 0.00036053711434062873, + "loss": 0.1937, + "step": 256620 + }, + { + "epoch": 10.63, + "grad_norm": 0.9296875, + "learning_rate": 0.0003605273867271613, + "loss": 0.1926, + "step": 256630 + }, + { + "epoch": 10.63, + "grad_norm": 0.62109375, + "learning_rate": 0.0003605176589056932, + "loss": 0.2521, + "step": 256640 + }, + { + "epoch": 10.63, + "grad_norm": 1.1875, + "learning_rate": 0.0003605079308762429, + "loss": 0.2485, + "step": 256650 + }, + { + "epoch": 10.63, + "grad_norm": 0.91796875, + "learning_rate": 0.00036049820263882857, + "loss": 0.2009, + "step": 256660 + }, + { + "epoch": 10.63, + "grad_norm": 1.046875, + "learning_rate": 0.00036048847419346857, + "loss": 0.1861, + "step": 256670 + }, + { + "epoch": 10.63, + "grad_norm": 1.015625, + "learning_rate": 0.0003604787455401812, + "loss": 0.2411, + "step": 256680 + }, + { + "epoch": 10.63, + "grad_norm": 0.404296875, + "learning_rate": 0.0003604690166789847, + "loss": 0.1788, + "step": 256690 + }, + { + "epoch": 10.63, + "grad_norm": 0.55859375, + "learning_rate": 0.0003604592876098975, + "loss": 0.1882, + "step": 256700 + }, + { + "epoch": 10.63, + "grad_norm": 0.91796875, + "learning_rate": 0.00036044955833293787, + "loss": 0.1647, + "step": 256710 + }, + { + "epoch": 10.63, + "grad_norm": 0.408203125, + "learning_rate": 0.00036043982884812403, + "loss": 0.1545, + "step": 256720 + }, + { + "epoch": 10.63, + "grad_norm": 0.828125, + "learning_rate": 0.0003604300991554744, + "loss": 0.18, + "step": 256730 + }, + { + "epoch": 10.63, + "grad_norm": 0.306640625, + "learning_rate": 0.0003604203692550073, + "loss": 0.1997, + "step": 256740 + }, + { + "epoch": 10.63, + "grad_norm": 1.28125, + "learning_rate": 0.0003604106391467409, + "loss": 0.2091, + "step": 256750 + }, + { + "epoch": 10.63, + "grad_norm": 1.6171875, + "learning_rate": 0.0003604009088306936, + "loss": 0.1986, + "step": 256760 + }, + { + "epoch": 10.64, + "grad_norm": 0.439453125, + "learning_rate": 0.0003603911783068838, + "loss": 0.2011, + "step": 256770 + }, + { + "epoch": 10.64, + "grad_norm": 0.8125, + "learning_rate": 0.00036038144757532956, + "loss": 0.1704, + "step": 256780 + }, + { + "epoch": 10.64, + "grad_norm": 1.203125, + "learning_rate": 0.0003603717166360495, + "loss": 0.2412, + "step": 256790 + }, + { + "epoch": 10.64, + "grad_norm": 0.53125, + "learning_rate": 0.00036036198548906174, + "loss": 0.181, + "step": 256800 + }, + { + "epoch": 10.64, + "grad_norm": 0.5703125, + "learning_rate": 0.0003603522541343847, + "loss": 0.1836, + "step": 256810 + }, + { + "epoch": 10.64, + "grad_norm": 0.287109375, + "learning_rate": 0.0003603425225720366, + "loss": 0.2224, + "step": 256820 + }, + { + "epoch": 10.64, + "grad_norm": 0.61328125, + "learning_rate": 0.0003603327908020357, + "loss": 0.1781, + "step": 256830 + }, + { + "epoch": 10.64, + "grad_norm": 0.5859375, + "learning_rate": 0.0003603230588244005, + "loss": 0.1821, + "step": 256840 + }, + { + "epoch": 10.64, + "grad_norm": 0.8203125, + "learning_rate": 0.00036031332663914923, + "loss": 0.2056, + "step": 256850 + }, + { + "epoch": 10.64, + "grad_norm": 0.9609375, + "learning_rate": 0.0003603035942463001, + "loss": 0.1988, + "step": 256860 + }, + { + "epoch": 10.64, + "grad_norm": 1.3203125, + "learning_rate": 0.0003602938616458716, + "loss": 0.194, + "step": 256870 + }, + { + "epoch": 10.64, + "grad_norm": 0.8984375, + "learning_rate": 0.0003602841288378819, + "loss": 0.2167, + "step": 256880 + }, + { + "epoch": 10.64, + "grad_norm": 2.21875, + "learning_rate": 0.00036027439582234945, + "loss": 0.2129, + "step": 256890 + }, + { + "epoch": 10.64, + "grad_norm": 1.0234375, + "learning_rate": 0.0003602646625992925, + "loss": 0.1881, + "step": 256900 + }, + { + "epoch": 10.64, + "grad_norm": 0.91796875, + "learning_rate": 0.0003602549291687293, + "loss": 0.2159, + "step": 256910 + }, + { + "epoch": 10.64, + "grad_norm": 0.462890625, + "learning_rate": 0.00036024519553067827, + "loss": 0.2014, + "step": 256920 + }, + { + "epoch": 10.64, + "grad_norm": 0.5234375, + "learning_rate": 0.00036023546168515764, + "loss": 0.2095, + "step": 256930 + }, + { + "epoch": 10.64, + "grad_norm": 1.265625, + "learning_rate": 0.0003602257276321858, + "loss": 0.1987, + "step": 256940 + }, + { + "epoch": 10.64, + "grad_norm": 1.015625, + "learning_rate": 0.0003602159933717811, + "loss": 0.1702, + "step": 256950 + }, + { + "epoch": 10.64, + "grad_norm": 0.99609375, + "learning_rate": 0.0003602062589039617, + "loss": 0.207, + "step": 256960 + }, + { + "epoch": 10.64, + "grad_norm": 0.75390625, + "learning_rate": 0.00036019652422874604, + "loss": 0.2034, + "step": 256970 + }, + { + "epoch": 10.64, + "grad_norm": 0.921875, + "learning_rate": 0.0003601867893461525, + "loss": 0.1662, + "step": 256980 + }, + { + "epoch": 10.64, + "grad_norm": 0.7578125, + "learning_rate": 0.0003601770542561993, + "loss": 0.2065, + "step": 256990 + }, + { + "epoch": 10.64, + "grad_norm": 0.953125, + "learning_rate": 0.00036016731895890475, + "loss": 0.1657, + "step": 257000 + }, + { + "epoch": 10.65, + "grad_norm": 0.6484375, + "learning_rate": 0.0003601575834542872, + "loss": 0.1415, + "step": 257010 + }, + { + "epoch": 10.65, + "grad_norm": 2.171875, + "learning_rate": 0.000360147847742365, + "loss": 0.2074, + "step": 257020 + }, + { + "epoch": 10.65, + "grad_norm": 0.48046875, + "learning_rate": 0.00036013811182315645, + "loss": 0.1871, + "step": 257030 + }, + { + "epoch": 10.65, + "grad_norm": 0.59375, + "learning_rate": 0.0003601283756966798, + "loss": 0.19, + "step": 257040 + }, + { + "epoch": 10.65, + "grad_norm": 0.9765625, + "learning_rate": 0.00036011863936295346, + "loss": 0.1919, + "step": 257050 + }, + { + "epoch": 10.65, + "grad_norm": 0.51171875, + "learning_rate": 0.0003601089028219958, + "loss": 0.2185, + "step": 257060 + }, + { + "epoch": 10.65, + "grad_norm": 0.447265625, + "learning_rate": 0.000360099166073825, + "loss": 0.2324, + "step": 257070 + }, + { + "epoch": 10.65, + "grad_norm": 0.462890625, + "learning_rate": 0.00036008942911845953, + "loss": 0.1848, + "step": 257080 + }, + { + "epoch": 10.65, + "grad_norm": 1.15625, + "learning_rate": 0.00036007969195591766, + "loss": 0.2207, + "step": 257090 + }, + { + "epoch": 10.65, + "grad_norm": 0.7734375, + "learning_rate": 0.0003600699545862176, + "loss": 0.1628, + "step": 257100 + }, + { + "epoch": 10.65, + "grad_norm": 0.451171875, + "learning_rate": 0.00036006021700937785, + "loss": 0.2201, + "step": 257110 + }, + { + "epoch": 10.65, + "grad_norm": 0.341796875, + "learning_rate": 0.00036005047922541656, + "loss": 0.1913, + "step": 257120 + }, + { + "epoch": 10.65, + "grad_norm": 1.515625, + "learning_rate": 0.0003600407412343523, + "loss": 0.1953, + "step": 257130 + }, + { + "epoch": 10.65, + "grad_norm": 0.000125885009765625, + "learning_rate": 0.0003600310030362032, + "loss": 0.2301, + "step": 257140 + }, + { + "epoch": 10.65, + "grad_norm": 1.96875, + "learning_rate": 0.0003600212646309875, + "loss": 0.2098, + "step": 257150 + }, + { + "epoch": 10.65, + "grad_norm": 1.3828125, + "learning_rate": 0.0003600115260187238, + "loss": 0.1814, + "step": 257160 + }, + { + "epoch": 10.65, + "grad_norm": 1.078125, + "learning_rate": 0.0003600017871994303, + "loss": 0.2018, + "step": 257170 + }, + { + "epoch": 10.65, + "grad_norm": 1.2265625, + "learning_rate": 0.0003599920481731252, + "loss": 0.1796, + "step": 257180 + }, + { + "epoch": 10.65, + "grad_norm": 0.365234375, + "learning_rate": 0.000359982308939827, + "loss": 0.1947, + "step": 257190 + }, + { + "epoch": 10.65, + "grad_norm": 0.56640625, + "learning_rate": 0.000359972569499554, + "loss": 0.1778, + "step": 257200 + }, + { + "epoch": 10.65, + "grad_norm": 0.275390625, + "learning_rate": 0.00035996282985232447, + "loss": 0.1817, + "step": 257210 + }, + { + "epoch": 10.65, + "grad_norm": 1.1953125, + "learning_rate": 0.0003599530899981568, + "loss": 0.1887, + "step": 257220 + }, + { + "epoch": 10.65, + "grad_norm": 1.1328125, + "learning_rate": 0.0003599433499370692, + "loss": 0.2397, + "step": 257230 + }, + { + "epoch": 10.65, + "grad_norm": 0.828125, + "learning_rate": 0.00035993360966908025, + "loss": 0.157, + "step": 257240 + }, + { + "epoch": 10.66, + "grad_norm": 0.92578125, + "learning_rate": 0.00035992386919420804, + "loss": 0.2014, + "step": 257250 + }, + { + "epoch": 10.66, + "grad_norm": 1.6875, + "learning_rate": 0.0003599141285124709, + "loss": 0.2073, + "step": 257260 + }, + { + "epoch": 10.66, + "grad_norm": 0.79296875, + "learning_rate": 0.00035990438762388734, + "loss": 0.1888, + "step": 257270 + }, + { + "epoch": 10.66, + "grad_norm": 0.8515625, + "learning_rate": 0.00035989464652847554, + "loss": 0.1755, + "step": 257280 + }, + { + "epoch": 10.66, + "grad_norm": 0.62109375, + "learning_rate": 0.0003598849052262539, + "loss": 0.1885, + "step": 257290 + }, + { + "epoch": 10.66, + "grad_norm": 0.9296875, + "learning_rate": 0.0003598751637172408, + "loss": 0.2391, + "step": 257300 + }, + { + "epoch": 10.66, + "grad_norm": 0.546875, + "learning_rate": 0.00035986542200145435, + "loss": 0.2033, + "step": 257310 + }, + { + "epoch": 10.66, + "grad_norm": 0.71875, + "learning_rate": 0.0003598556800789132, + "loss": 0.2253, + "step": 257320 + }, + { + "epoch": 10.66, + "grad_norm": 0.45703125, + "learning_rate": 0.0003598459379496354, + "loss": 0.2078, + "step": 257330 + }, + { + "epoch": 10.66, + "grad_norm": 1.046875, + "learning_rate": 0.0003598361956136394, + "loss": 0.1616, + "step": 257340 + }, + { + "epoch": 10.66, + "grad_norm": 0.306640625, + "learning_rate": 0.0003598264530709437, + "loss": 0.2, + "step": 257350 + }, + { + "epoch": 10.66, + "grad_norm": 1.453125, + "learning_rate": 0.0003598167103215664, + "loss": 0.2145, + "step": 257360 + }, + { + "epoch": 10.66, + "grad_norm": 1.3984375, + "learning_rate": 0.00035980696736552585, + "loss": 0.178, + "step": 257370 + }, + { + "epoch": 10.66, + "grad_norm": 0.53125, + "learning_rate": 0.0003597972242028405, + "loss": 0.1908, + "step": 257380 + }, + { + "epoch": 10.66, + "grad_norm": 1.0390625, + "learning_rate": 0.00035978748083352857, + "loss": 0.1686, + "step": 257390 + }, + { + "epoch": 10.66, + "grad_norm": 0.90625, + "learning_rate": 0.0003597777372576085, + "loss": 0.2105, + "step": 257400 + }, + { + "epoch": 10.66, + "grad_norm": 0.89453125, + "learning_rate": 0.00035976799347509856, + "loss": 0.1609, + "step": 257410 + }, + { + "epoch": 10.66, + "grad_norm": 0.404296875, + "learning_rate": 0.0003597582494860172, + "loss": 0.2029, + "step": 257420 + }, + { + "epoch": 10.66, + "grad_norm": 0.62109375, + "learning_rate": 0.0003597485052903826, + "loss": 0.1915, + "step": 257430 + }, + { + "epoch": 10.66, + "grad_norm": 0.404296875, + "learning_rate": 0.0003597387608882132, + "loss": 0.1608, + "step": 257440 + }, + { + "epoch": 10.66, + "grad_norm": 0.3828125, + "learning_rate": 0.00035972901627952724, + "loss": 0.1973, + "step": 257450 + }, + { + "epoch": 10.66, + "grad_norm": 0.62109375, + "learning_rate": 0.0003597192714643432, + "loss": 0.2281, + "step": 257460 + }, + { + "epoch": 10.66, + "grad_norm": 1.25, + "learning_rate": 0.00035970952644267916, + "loss": 0.174, + "step": 257470 + }, + { + "epoch": 10.66, + "grad_norm": 1.78125, + "learning_rate": 0.0003596997812145539, + "loss": 0.1689, + "step": 257480 + }, + { + "epoch": 10.67, + "grad_norm": 0.515625, + "learning_rate": 0.00035969003577998533, + "loss": 0.1767, + "step": 257490 + }, + { + "epoch": 10.67, + "grad_norm": 1.1796875, + "learning_rate": 0.00035968029013899196, + "loss": 0.1779, + "step": 257500 + }, + { + "epoch": 10.67, + "grad_norm": 0.81640625, + "learning_rate": 0.00035967054429159224, + "loss": 0.246, + "step": 257510 + }, + { + "epoch": 10.67, + "grad_norm": 1.125, + "learning_rate": 0.0003596607982378043, + "loss": 0.1859, + "step": 257520 + }, + { + "epoch": 10.67, + "grad_norm": 1.890625, + "learning_rate": 0.0003596510519776466, + "loss": 0.2171, + "step": 257530 + }, + { + "epoch": 10.67, + "grad_norm": 0.66015625, + "learning_rate": 0.00035964130551113745, + "loss": 0.243, + "step": 257540 + }, + { + "epoch": 10.67, + "grad_norm": 0.96484375, + "learning_rate": 0.0003596315588382952, + "loss": 0.1958, + "step": 257550 + }, + { + "epoch": 10.67, + "grad_norm": 0.984375, + "learning_rate": 0.00035962181195913824, + "loss": 0.2066, + "step": 257560 + }, + { + "epoch": 10.67, + "grad_norm": 1.71875, + "learning_rate": 0.0003596120648736848, + "loss": 0.211, + "step": 257570 + }, + { + "epoch": 10.67, + "grad_norm": 1.484375, + "learning_rate": 0.0003596023175819534, + "loss": 0.2035, + "step": 257580 + }, + { + "epoch": 10.67, + "grad_norm": 0.640625, + "learning_rate": 0.0003595925700839622, + "loss": 0.2013, + "step": 257590 + }, + { + "epoch": 10.67, + "grad_norm": 1.171875, + "learning_rate": 0.00035958282237972964, + "loss": 0.1704, + "step": 257600 + }, + { + "epoch": 10.67, + "grad_norm": 0.5625, + "learning_rate": 0.00035957307446927403, + "loss": 0.176, + "step": 257610 + }, + { + "epoch": 10.67, + "grad_norm": 0.57421875, + "learning_rate": 0.0003595633263526137, + "loss": 0.1847, + "step": 257620 + }, + { + "epoch": 10.67, + "grad_norm": 0.5390625, + "learning_rate": 0.00035955357802976704, + "loss": 0.1993, + "step": 257630 + }, + { + "epoch": 10.67, + "grad_norm": 1.1171875, + "learning_rate": 0.00035954382950075236, + "loss": 0.1478, + "step": 257640 + }, + { + "epoch": 10.67, + "grad_norm": 1.0078125, + "learning_rate": 0.00035953408076558807, + "loss": 0.1758, + "step": 257650 + }, + { + "epoch": 10.67, + "grad_norm": 0.74609375, + "learning_rate": 0.00035952433182429244, + "loss": 0.193, + "step": 257660 + }, + { + "epoch": 10.67, + "grad_norm": 1.2734375, + "learning_rate": 0.00035951458267688386, + "loss": 0.1826, + "step": 257670 + }, + { + "epoch": 10.67, + "grad_norm": 0.357421875, + "learning_rate": 0.0003595048333233807, + "loss": 0.212, + "step": 257680 + }, + { + "epoch": 10.67, + "grad_norm": 0.50390625, + "learning_rate": 0.00035949508376380117, + "loss": 0.1928, + "step": 257690 + }, + { + "epoch": 10.67, + "grad_norm": 1.6171875, + "learning_rate": 0.00035948533399816375, + "loss": 0.2461, + "step": 257700 + }, + { + "epoch": 10.67, + "grad_norm": 1.75, + "learning_rate": 0.0003594755840264868, + "loss": 0.2141, + "step": 257710 + }, + { + "epoch": 10.67, + "grad_norm": 1.9140625, + "learning_rate": 0.00035946583384878863, + "loss": 0.257, + "step": 257720 + }, + { + "epoch": 10.68, + "grad_norm": 0.83203125, + "learning_rate": 0.0003594560834650875, + "loss": 0.1544, + "step": 257730 + }, + { + "epoch": 10.68, + "grad_norm": 0.95703125, + "learning_rate": 0.0003594463328754019, + "loss": 0.138, + "step": 257740 + }, + { + "epoch": 10.68, + "grad_norm": 2.296875, + "learning_rate": 0.0003594365820797501, + "loss": 0.1966, + "step": 257750 + }, + { + "epoch": 10.68, + "grad_norm": 0.98828125, + "learning_rate": 0.0003594268310781505, + "loss": 0.2022, + "step": 257760 + }, + { + "epoch": 10.68, + "grad_norm": 0.93359375, + "learning_rate": 0.0003594170798706214, + "loss": 0.1815, + "step": 257770 + }, + { + "epoch": 10.68, + "grad_norm": 1.1640625, + "learning_rate": 0.00035940732845718117, + "loss": 0.1953, + "step": 257780 + }, + { + "epoch": 10.68, + "grad_norm": 0.75390625, + "learning_rate": 0.0003593975768378481, + "loss": 0.2058, + "step": 257790 + }, + { + "epoch": 10.68, + "grad_norm": 0.90234375, + "learning_rate": 0.0003593878250126408, + "loss": 0.1952, + "step": 257800 + }, + { + "epoch": 10.68, + "grad_norm": 1.3828125, + "learning_rate": 0.00035937807298157726, + "loss": 0.254, + "step": 257810 + }, + { + "epoch": 10.68, + "grad_norm": 0.4609375, + "learning_rate": 0.00035936832074467604, + "loss": 0.1905, + "step": 257820 + }, + { + "epoch": 10.68, + "grad_norm": 0.7265625, + "learning_rate": 0.0003593585683019554, + "loss": 0.1925, + "step": 257830 + }, + { + "epoch": 10.68, + "grad_norm": 0.6640625, + "learning_rate": 0.0003593488156534338, + "loss": 0.1586, + "step": 257840 + }, + { + "epoch": 10.68, + "grad_norm": 1.109375, + "learning_rate": 0.0003593390627991295, + "loss": 0.2082, + "step": 257850 + }, + { + "epoch": 10.68, + "grad_norm": 0.77734375, + "learning_rate": 0.000359329309739061, + "loss": 0.1592, + "step": 257860 + }, + { + "epoch": 10.68, + "grad_norm": 0.4140625, + "learning_rate": 0.0003593195564732464, + "loss": 0.2389, + "step": 257870 + }, + { + "epoch": 10.68, + "grad_norm": 1.125, + "learning_rate": 0.0003593098030017043, + "loss": 0.1994, + "step": 257880 + }, + { + "epoch": 10.68, + "grad_norm": 0.625, + "learning_rate": 0.00035930004932445295, + "loss": 0.165, + "step": 257890 + }, + { + "epoch": 10.68, + "grad_norm": 0.765625, + "learning_rate": 0.0003592902954415107, + "loss": 0.2485, + "step": 257900 + }, + { + "epoch": 10.68, + "grad_norm": 0.61328125, + "learning_rate": 0.00035928054135289585, + "loss": 0.1956, + "step": 257910 + }, + { + "epoch": 10.68, + "grad_norm": 1.484375, + "learning_rate": 0.00035927078705862684, + "loss": 0.2026, + "step": 257920 + }, + { + "epoch": 10.68, + "grad_norm": 1.2734375, + "learning_rate": 0.00035926103255872204, + "loss": 0.2501, + "step": 257930 + }, + { + "epoch": 10.68, + "grad_norm": 2.078125, + "learning_rate": 0.00035925127785319975, + "loss": 0.1942, + "step": 257940 + }, + { + "epoch": 10.68, + "grad_norm": 1.9140625, + "learning_rate": 0.0003592415229420784, + "loss": 0.2099, + "step": 257950 + }, + { + "epoch": 10.68, + "grad_norm": 1.0078125, + "learning_rate": 0.00035923176782537625, + "loss": 0.2194, + "step": 257960 + }, + { + "epoch": 10.69, + "grad_norm": 0.609375, + "learning_rate": 0.0003592220125031118, + "loss": 0.2178, + "step": 257970 + }, + { + "epoch": 10.69, + "grad_norm": 1.6328125, + "learning_rate": 0.00035921225697530326, + "loss": 0.1867, + "step": 257980 + }, + { + "epoch": 10.69, + "grad_norm": 1.7578125, + "learning_rate": 0.000359202501241969, + "loss": 0.1504, + "step": 257990 + }, + { + "epoch": 10.69, + "grad_norm": 0.349609375, + "learning_rate": 0.0003591927453031275, + "loss": 0.2508, + "step": 258000 + }, + { + "epoch": 10.69, + "grad_norm": 1.0, + "learning_rate": 0.000359182989158797, + "loss": 0.1695, + "step": 258010 + }, + { + "epoch": 10.69, + "grad_norm": 0.400390625, + "learning_rate": 0.00035917323280899594, + "loss": 0.204, + "step": 258020 + }, + { + "epoch": 10.69, + "grad_norm": 1.6171875, + "learning_rate": 0.00035916347625374257, + "loss": 0.1923, + "step": 258030 + }, + { + "epoch": 10.69, + "grad_norm": 0.5546875, + "learning_rate": 0.00035915371949305543, + "loss": 0.2079, + "step": 258040 + }, + { + "epoch": 10.69, + "grad_norm": 0.81640625, + "learning_rate": 0.0003591439625269527, + "loss": 0.1767, + "step": 258050 + }, + { + "epoch": 10.69, + "grad_norm": 0.0002155303955078125, + "learning_rate": 0.0003591342053554529, + "loss": 0.2135, + "step": 258060 + }, + { + "epoch": 10.69, + "grad_norm": 0.55078125, + "learning_rate": 0.0003591244479785742, + "loss": 0.1676, + "step": 258070 + }, + { + "epoch": 10.69, + "grad_norm": 0.5078125, + "learning_rate": 0.00035911469039633516, + "loss": 0.2305, + "step": 258080 + }, + { + "epoch": 10.69, + "grad_norm": 0.96484375, + "learning_rate": 0.000359104932608754, + "loss": 0.2059, + "step": 258090 + }, + { + "epoch": 10.69, + "grad_norm": 1.4140625, + "learning_rate": 0.00035909517461584915, + "loss": 0.242, + "step": 258100 + }, + { + "epoch": 10.69, + "grad_norm": 0.65625, + "learning_rate": 0.000359085416417639, + "loss": 0.2224, + "step": 258110 + }, + { + "epoch": 10.69, + "grad_norm": 0.87890625, + "learning_rate": 0.0003590756580141418, + "loss": 0.204, + "step": 258120 + }, + { + "epoch": 10.69, + "grad_norm": 0.74609375, + "learning_rate": 0.0003590658994053762, + "loss": 0.1761, + "step": 258130 + }, + { + "epoch": 10.69, + "grad_norm": 0.63671875, + "learning_rate": 0.0003590561405913601, + "loss": 0.2137, + "step": 258140 + }, + { + "epoch": 10.69, + "grad_norm": 2.015625, + "learning_rate": 0.00035904638157211225, + "loss": 0.2, + "step": 258150 + }, + { + "epoch": 10.69, + "grad_norm": 1.1875, + "learning_rate": 0.00035903662234765087, + "loss": 0.1915, + "step": 258160 + }, + { + "epoch": 10.69, + "grad_norm": 0.52734375, + "learning_rate": 0.00035902686291799425, + "loss": 0.186, + "step": 258170 + }, + { + "epoch": 10.69, + "grad_norm": 0.33984375, + "learning_rate": 0.00035901710328316094, + "loss": 0.1823, + "step": 258180 + }, + { + "epoch": 10.69, + "grad_norm": 0.92578125, + "learning_rate": 0.0003590073434431692, + "loss": 0.2185, + "step": 258190 + }, + { + "epoch": 10.69, + "grad_norm": 1.2421875, + "learning_rate": 0.00035899758339803746, + "loss": 0.1748, + "step": 258200 + }, + { + "epoch": 10.7, + "grad_norm": 1.484375, + "learning_rate": 0.000358987823147784, + "loss": 0.2116, + "step": 258210 + }, + { + "epoch": 10.7, + "grad_norm": 0.64453125, + "learning_rate": 0.00035897806269242724, + "loss": 0.1911, + "step": 258220 + }, + { + "epoch": 10.7, + "grad_norm": 1.2109375, + "learning_rate": 0.0003589683020319855, + "loss": 0.1875, + "step": 258230 + }, + { + "epoch": 10.7, + "grad_norm": 0.83984375, + "learning_rate": 0.0003589585411664772, + "loss": 0.2391, + "step": 258240 + }, + { + "epoch": 10.7, + "grad_norm": 0.71875, + "learning_rate": 0.00035894878009592063, + "loss": 0.1682, + "step": 258250 + }, + { + "epoch": 10.7, + "grad_norm": 1.0, + "learning_rate": 0.0003589390188203343, + "loss": 0.1881, + "step": 258260 + }, + { + "epoch": 10.7, + "grad_norm": 0.35546875, + "learning_rate": 0.00035892925733973645, + "loss": 0.2295, + "step": 258270 + }, + { + "epoch": 10.7, + "grad_norm": 1.203125, + "learning_rate": 0.00035891949565414557, + "loss": 0.1813, + "step": 258280 + }, + { + "epoch": 10.7, + "grad_norm": 0.78515625, + "learning_rate": 0.00035890973376357993, + "loss": 0.2374, + "step": 258290 + }, + { + "epoch": 10.7, + "grad_norm": 1.0390625, + "learning_rate": 0.0003588999716680579, + "loss": 0.197, + "step": 258300 + }, + { + "epoch": 10.7, + "grad_norm": 0.6875, + "learning_rate": 0.0003588902093675979, + "loss": 0.2206, + "step": 258310 + }, + { + "epoch": 10.7, + "grad_norm": 0.9296875, + "learning_rate": 0.00035888044686221835, + "loss": 0.1969, + "step": 258320 + }, + { + "epoch": 10.7, + "grad_norm": 0.443359375, + "learning_rate": 0.0003588706841519375, + "loss": 0.1693, + "step": 258330 + }, + { + "epoch": 10.7, + "grad_norm": 0.0, + "learning_rate": 0.00035886092123677373, + "loss": 0.2638, + "step": 258340 + }, + { + "epoch": 10.7, + "grad_norm": 0.7109375, + "learning_rate": 0.00035885115811674554, + "loss": 0.2179, + "step": 258350 + }, + { + "epoch": 10.7, + "grad_norm": 0.404296875, + "learning_rate": 0.00035884139479187117, + "loss": 0.136, + "step": 258360 + }, + { + "epoch": 10.7, + "grad_norm": 1.0390625, + "learning_rate": 0.0003588316312621691, + "loss": 0.2429, + "step": 258370 + }, + { + "epoch": 10.7, + "grad_norm": 1.8203125, + "learning_rate": 0.00035882186752765757, + "loss": 0.1814, + "step": 258380 + }, + { + "epoch": 10.7, + "grad_norm": 0.7734375, + "learning_rate": 0.00035881210358835514, + "loss": 0.1875, + "step": 258390 + }, + { + "epoch": 10.7, + "grad_norm": 1.546875, + "learning_rate": 0.00035880233944428006, + "loss": 0.1714, + "step": 258400 + }, + { + "epoch": 10.7, + "grad_norm": 0.96875, + "learning_rate": 0.0003587925750954507, + "loss": 0.201, + "step": 258410 + }, + { + "epoch": 10.7, + "grad_norm": 0.58203125, + "learning_rate": 0.00035878281054188544, + "loss": 0.1756, + "step": 258420 + }, + { + "epoch": 10.7, + "grad_norm": 0.412109375, + "learning_rate": 0.0003587730457836027, + "loss": 0.1894, + "step": 258430 + }, + { + "epoch": 10.7, + "grad_norm": 0.89453125, + "learning_rate": 0.00035876328082062083, + "loss": 0.1628, + "step": 258440 + }, + { + "epoch": 10.7, + "grad_norm": 0.8125, + "learning_rate": 0.0003587535156529583, + "loss": 0.1677, + "step": 258450 + }, + { + "epoch": 10.71, + "grad_norm": 0.494140625, + "learning_rate": 0.0003587437502806332, + "loss": 0.2003, + "step": 258460 + }, + { + "epoch": 10.71, + "grad_norm": 0.412109375, + "learning_rate": 0.0003587339847036643, + "loss": 0.1829, + "step": 258470 + }, + { + "epoch": 10.71, + "grad_norm": 0.63671875, + "learning_rate": 0.0003587242189220697, + "loss": 0.1754, + "step": 258480 + }, + { + "epoch": 10.71, + "grad_norm": 0.87109375, + "learning_rate": 0.00035871445293586784, + "loss": 0.1964, + "step": 258490 + }, + { + "epoch": 10.71, + "grad_norm": 0.5078125, + "learning_rate": 0.00035870468674507715, + "loss": 0.2303, + "step": 258500 + }, + { + "epoch": 10.71, + "grad_norm": 0.74609375, + "learning_rate": 0.00035869492034971604, + "loss": 0.2117, + "step": 258510 + }, + { + "epoch": 10.71, + "grad_norm": 0.625, + "learning_rate": 0.0003586851537498027, + "loss": 0.1926, + "step": 258520 + }, + { + "epoch": 10.71, + "grad_norm": 0.71484375, + "learning_rate": 0.0003586753869453557, + "loss": 0.1983, + "step": 258530 + }, + { + "epoch": 10.71, + "grad_norm": 0.7890625, + "learning_rate": 0.0003586656199363933, + "loss": 0.2023, + "step": 258540 + }, + { + "epoch": 10.71, + "grad_norm": 0.8046875, + "learning_rate": 0.0003586558527229341, + "loss": 0.1997, + "step": 258550 + }, + { + "epoch": 10.71, + "grad_norm": 0.341796875, + "learning_rate": 0.0003586460853049962, + "loss": 0.2024, + "step": 258560 + }, + { + "epoch": 10.71, + "grad_norm": 1.1953125, + "learning_rate": 0.00035863631768259807, + "loss": 0.1845, + "step": 258570 + }, + { + "epoch": 10.71, + "grad_norm": 0.458984375, + "learning_rate": 0.0003586265498557582, + "loss": 0.1571, + "step": 258580 + }, + { + "epoch": 10.71, + "grad_norm": 1.0859375, + "learning_rate": 0.00035861678182449487, + "loss": 0.1909, + "step": 258590 + }, + { + "epoch": 10.71, + "grad_norm": 1.8046875, + "learning_rate": 0.00035860701358882645, + "loss": 0.2129, + "step": 258600 + }, + { + "epoch": 10.71, + "grad_norm": 1.5390625, + "learning_rate": 0.00035859724514877145, + "loss": 0.1703, + "step": 258610 + }, + { + "epoch": 10.71, + "grad_norm": 0.87109375, + "learning_rate": 0.000358587476504348, + "loss": 0.1699, + "step": 258620 + }, + { + "epoch": 10.71, + "grad_norm": 0.7109375, + "learning_rate": 0.0003585777076555748, + "loss": 0.1551, + "step": 258630 + }, + { + "epoch": 10.71, + "grad_norm": 1.078125, + "learning_rate": 0.00035856793860247006, + "loss": 0.2036, + "step": 258640 + }, + { + "epoch": 10.71, + "grad_norm": 0.6796875, + "learning_rate": 0.00035855816934505206, + "loss": 0.1885, + "step": 258650 + }, + { + "epoch": 10.71, + "grad_norm": 0.416015625, + "learning_rate": 0.0003585483998833395, + "loss": 0.1857, + "step": 258660 + }, + { + "epoch": 10.71, + "grad_norm": 0.50390625, + "learning_rate": 0.00035853863021735046, + "loss": 0.238, + "step": 258670 + }, + { + "epoch": 10.71, + "grad_norm": 0.77734375, + "learning_rate": 0.0003585288603471034, + "loss": 0.2137, + "step": 258680 + }, + { + "epoch": 10.71, + "grad_norm": 1.0234375, + "learning_rate": 0.00035851909027261687, + "loss": 0.1366, + "step": 258690 + }, + { + "epoch": 10.72, + "grad_norm": 0.51171875, + "learning_rate": 0.00035850931999390894, + "loss": 0.2072, + "step": 258700 + }, + { + "epoch": 10.72, + "grad_norm": 1.3515625, + "learning_rate": 0.0003584995495109984, + "loss": 0.2065, + "step": 258710 + }, + { + "epoch": 10.72, + "grad_norm": 1.6796875, + "learning_rate": 0.00035848977882390335, + "loss": 0.1774, + "step": 258720 + }, + { + "epoch": 10.72, + "grad_norm": 0.443359375, + "learning_rate": 0.0003584800079326421, + "loss": 0.2202, + "step": 258730 + }, + { + "epoch": 10.72, + "grad_norm": 0.56640625, + "learning_rate": 0.0003584702368372334, + "loss": 0.1622, + "step": 258740 + }, + { + "epoch": 10.72, + "grad_norm": 0.671875, + "learning_rate": 0.00035846046553769537, + "loss": 0.161, + "step": 258750 + }, + { + "epoch": 10.72, + "grad_norm": 1.3984375, + "learning_rate": 0.0003584506940340464, + "loss": 0.1967, + "step": 258760 + }, + { + "epoch": 10.72, + "grad_norm": 0.83984375, + "learning_rate": 0.000358440922326305, + "loss": 0.2162, + "step": 258770 + }, + { + "epoch": 10.72, + "grad_norm": 0.1826171875, + "learning_rate": 0.00035843115041448944, + "loss": 0.2111, + "step": 258780 + }, + { + "epoch": 10.72, + "grad_norm": 0.458984375, + "learning_rate": 0.0003584213782986182, + "loss": 0.1709, + "step": 258790 + }, + { + "epoch": 10.72, + "grad_norm": 0.68359375, + "learning_rate": 0.0003584116059787096, + "loss": 0.2661, + "step": 258800 + }, + { + "epoch": 10.72, + "grad_norm": 0.80859375, + "learning_rate": 0.00035840183345478213, + "loss": 0.1834, + "step": 258810 + }, + { + "epoch": 10.72, + "grad_norm": 0.81640625, + "learning_rate": 0.0003583920607268541, + "loss": 0.1892, + "step": 258820 + }, + { + "epoch": 10.72, + "grad_norm": 1.0625, + "learning_rate": 0.0003583822877949439, + "loss": 0.1688, + "step": 258830 + }, + { + "epoch": 10.72, + "grad_norm": 0.640625, + "learning_rate": 0.0003583725146590699, + "loss": 0.1835, + "step": 258840 + }, + { + "epoch": 10.72, + "grad_norm": 1.34375, + "learning_rate": 0.0003583627413192505, + "loss": 0.1938, + "step": 258850 + }, + { + "epoch": 10.72, + "grad_norm": 0.7109375, + "learning_rate": 0.0003583529677755042, + "loss": 0.208, + "step": 258860 + }, + { + "epoch": 10.72, + "grad_norm": 1.078125, + "learning_rate": 0.0003583431940278493, + "loss": 0.1949, + "step": 258870 + }, + { + "epoch": 10.72, + "grad_norm": 0.453125, + "learning_rate": 0.0003583334200763042, + "loss": 0.166, + "step": 258880 + }, + { + "epoch": 10.72, + "grad_norm": 0.8828125, + "learning_rate": 0.0003583236459208873, + "loss": 0.2489, + "step": 258890 + }, + { + "epoch": 10.72, + "grad_norm": 0.671875, + "learning_rate": 0.0003583138715616171, + "loss": 0.22, + "step": 258900 + }, + { + "epoch": 10.72, + "grad_norm": 0.62109375, + "learning_rate": 0.00035830409699851175, + "loss": 0.178, + "step": 258910 + }, + { + "epoch": 10.72, + "grad_norm": 0.59765625, + "learning_rate": 0.0003582943222315899, + "loss": 0.1766, + "step": 258920 + }, + { + "epoch": 10.72, + "grad_norm": 0.671875, + "learning_rate": 0.00035828454726086965, + "loss": 0.2399, + "step": 258930 + }, + { + "epoch": 10.73, + "grad_norm": 1.1875, + "learning_rate": 0.0003582747720863697, + "loss": 0.2009, + "step": 258940 + }, + { + "epoch": 10.73, + "grad_norm": 0.4609375, + "learning_rate": 0.00035826499670810836, + "loss": 0.1599, + "step": 258950 + }, + { + "epoch": 10.73, + "grad_norm": 0.83203125, + "learning_rate": 0.00035825522112610395, + "loss": 0.2014, + "step": 258960 + }, + { + "epoch": 10.73, + "grad_norm": 0.734375, + "learning_rate": 0.00035824544534037486, + "loss": 0.231, + "step": 258970 + }, + { + "epoch": 10.73, + "grad_norm": 0.52734375, + "learning_rate": 0.00035823566935093957, + "loss": 0.2099, + "step": 258980 + }, + { + "epoch": 10.73, + "grad_norm": 0.5703125, + "learning_rate": 0.0003582258931578164, + "loss": 0.2308, + "step": 258990 + }, + { + "epoch": 10.73, + "grad_norm": 0.80859375, + "learning_rate": 0.0003582161167610238, + "loss": 0.1903, + "step": 259000 + }, + { + "epoch": 10.73, + "grad_norm": 0.76171875, + "learning_rate": 0.0003582063401605802, + "loss": 0.1749, + "step": 259010 + }, + { + "epoch": 10.73, + "grad_norm": 0.72265625, + "learning_rate": 0.000358196563356504, + "loss": 0.1973, + "step": 259020 + }, + { + "epoch": 10.73, + "grad_norm": 0.53515625, + "learning_rate": 0.00035818678634881346, + "loss": 0.1609, + "step": 259030 + }, + { + "epoch": 10.73, + "grad_norm": 1.0625, + "learning_rate": 0.0003581770091375271, + "loss": 0.213, + "step": 259040 + }, + { + "epoch": 10.73, + "grad_norm": 0.53125, + "learning_rate": 0.00035816723172266325, + "loss": 0.2416, + "step": 259050 + }, + { + "epoch": 10.73, + "grad_norm": 0.69921875, + "learning_rate": 0.0003581574541042404, + "loss": 0.1792, + "step": 259060 + }, + { + "epoch": 10.73, + "grad_norm": 0.84375, + "learning_rate": 0.00035814767628227683, + "loss": 0.1981, + "step": 259070 + }, + { + "epoch": 10.73, + "grad_norm": 1.9140625, + "learning_rate": 0.0003581378982567911, + "loss": 0.1708, + "step": 259080 + }, + { + "epoch": 10.73, + "grad_norm": 2.09375, + "learning_rate": 0.0003581281200278015, + "loss": 0.234, + "step": 259090 + }, + { + "epoch": 10.73, + "grad_norm": 0.91796875, + "learning_rate": 0.0003581183415953264, + "loss": 0.2193, + "step": 259100 + }, + { + "epoch": 10.73, + "grad_norm": 0.154296875, + "learning_rate": 0.0003581085629593844, + "loss": 0.2074, + "step": 259110 + }, + { + "epoch": 10.73, + "grad_norm": 0.73046875, + "learning_rate": 0.00035809878411999365, + "loss": 0.19, + "step": 259120 + }, + { + "epoch": 10.73, + "grad_norm": 0.88671875, + "learning_rate": 0.00035808900507717264, + "loss": 0.2063, + "step": 259130 + }, + { + "epoch": 10.73, + "grad_norm": 0.875, + "learning_rate": 0.0003580792258309398, + "loss": 0.1788, + "step": 259140 + }, + { + "epoch": 10.73, + "grad_norm": 0.8125, + "learning_rate": 0.00035806944638131356, + "loss": 0.1825, + "step": 259150 + }, + { + "epoch": 10.73, + "grad_norm": 0.83984375, + "learning_rate": 0.0003580596667283123, + "loss": 0.1576, + "step": 259160 + }, + { + "epoch": 10.73, + "grad_norm": 1.4296875, + "learning_rate": 0.0003580498868719544, + "loss": 0.2106, + "step": 259170 + }, + { + "epoch": 10.74, + "grad_norm": 1.1328125, + "learning_rate": 0.0003580401068122583, + "loss": 0.2237, + "step": 259180 + }, + { + "epoch": 10.74, + "grad_norm": 0.546875, + "learning_rate": 0.0003580303265492424, + "loss": 0.2314, + "step": 259190 + }, + { + "epoch": 10.74, + "grad_norm": 0.78125, + "learning_rate": 0.00035802054608292505, + "loss": 0.1887, + "step": 259200 + }, + { + "epoch": 10.74, + "grad_norm": 0.7890625, + "learning_rate": 0.00035801076541332465, + "loss": 0.1515, + "step": 259210 + }, + { + "epoch": 10.74, + "grad_norm": 1.078125, + "learning_rate": 0.00035800098454045977, + "loss": 0.1635, + "step": 259220 + }, + { + "epoch": 10.74, + "grad_norm": 0.55859375, + "learning_rate": 0.0003579912034643486, + "loss": 0.2011, + "step": 259230 + }, + { + "epoch": 10.74, + "grad_norm": 0.48046875, + "learning_rate": 0.0003579814221850097, + "loss": 0.2302, + "step": 259240 + }, + { + "epoch": 10.74, + "grad_norm": 0.62109375, + "learning_rate": 0.00035797164070246147, + "loss": 0.1751, + "step": 259250 + }, + { + "epoch": 10.74, + "grad_norm": 1.453125, + "learning_rate": 0.0003579618590167222, + "loss": 0.2412, + "step": 259260 + }, + { + "epoch": 10.74, + "grad_norm": 0.75390625, + "learning_rate": 0.00035795207712781035, + "loss": 0.1881, + "step": 259270 + }, + { + "epoch": 10.74, + "grad_norm": 0.79296875, + "learning_rate": 0.00035794229503574446, + "loss": 0.205, + "step": 259280 + }, + { + "epoch": 10.74, + "grad_norm": 0.65234375, + "learning_rate": 0.0003579325127405427, + "loss": 0.2229, + "step": 259290 + }, + { + "epoch": 10.74, + "grad_norm": 0.447265625, + "learning_rate": 0.00035792273024222367, + "loss": 0.1465, + "step": 259300 + }, + { + "epoch": 10.74, + "grad_norm": 0.77734375, + "learning_rate": 0.00035791294754080574, + "loss": 0.2113, + "step": 259310 + }, + { + "epoch": 10.74, + "grad_norm": 1.171875, + "learning_rate": 0.00035790316463630726, + "loss": 0.1964, + "step": 259320 + }, + { + "epoch": 10.74, + "grad_norm": 0.5546875, + "learning_rate": 0.00035789338152874666, + "loss": 0.2328, + "step": 259330 + }, + { + "epoch": 10.74, + "grad_norm": 0.1826171875, + "learning_rate": 0.0003578835982181424, + "loss": 0.161, + "step": 259340 + }, + { + "epoch": 10.74, + "grad_norm": 0.859375, + "learning_rate": 0.00035787381470451287, + "loss": 0.1918, + "step": 259350 + }, + { + "epoch": 10.74, + "grad_norm": 0.5625, + "learning_rate": 0.00035786403098787644, + "loss": 0.1817, + "step": 259360 + }, + { + "epoch": 10.74, + "grad_norm": 0.1884765625, + "learning_rate": 0.00035785424706825155, + "loss": 0.2072, + "step": 259370 + }, + { + "epoch": 10.74, + "grad_norm": 0.5859375, + "learning_rate": 0.00035784446294565665, + "loss": 0.1816, + "step": 259380 + }, + { + "epoch": 10.74, + "grad_norm": 1.1171875, + "learning_rate": 0.00035783467862011005, + "loss": 0.236, + "step": 259390 + }, + { + "epoch": 10.74, + "grad_norm": 1.0703125, + "learning_rate": 0.00035782489409163034, + "loss": 0.1462, + "step": 259400 + }, + { + "epoch": 10.74, + "grad_norm": 0.83203125, + "learning_rate": 0.0003578151093602357, + "loss": 0.2145, + "step": 259410 + }, + { + "epoch": 10.75, + "grad_norm": 0.9296875, + "learning_rate": 0.0003578053244259447, + "loss": 0.161, + "step": 259420 + }, + { + "epoch": 10.75, + "grad_norm": 0.81640625, + "learning_rate": 0.00035779553928877574, + "loss": 0.1677, + "step": 259430 + }, + { + "epoch": 10.75, + "grad_norm": 1.15625, + "learning_rate": 0.0003577857539487473, + "loss": 0.2272, + "step": 259440 + }, + { + "epoch": 10.75, + "grad_norm": 0.34375, + "learning_rate": 0.0003577759684058776, + "loss": 0.1853, + "step": 259450 + }, + { + "epoch": 10.75, + "grad_norm": 0.89453125, + "learning_rate": 0.0003577661826601852, + "loss": 0.1978, + "step": 259460 + }, + { + "epoch": 10.75, + "grad_norm": 1.09375, + "learning_rate": 0.0003577563967116885, + "loss": 0.1928, + "step": 259470 + }, + { + "epoch": 10.75, + "grad_norm": 0.9296875, + "learning_rate": 0.0003577466105604058, + "loss": 0.1832, + "step": 259480 + }, + { + "epoch": 10.75, + "grad_norm": 0.59375, + "learning_rate": 0.0003577368242063557, + "loss": 0.1421, + "step": 259490 + }, + { + "epoch": 10.75, + "grad_norm": 0.44140625, + "learning_rate": 0.00035772703764955655, + "loss": 0.1777, + "step": 259500 + }, + { + "epoch": 10.75, + "grad_norm": 2.125, + "learning_rate": 0.0003577172508900267, + "loss": 0.2175, + "step": 259510 + }, + { + "epoch": 10.75, + "grad_norm": 0.8203125, + "learning_rate": 0.00035770746392778467, + "loss": 0.2408, + "step": 259520 + }, + { + "epoch": 10.75, + "grad_norm": 1.015625, + "learning_rate": 0.0003576976767628488, + "loss": 0.1737, + "step": 259530 + }, + { + "epoch": 10.75, + "grad_norm": 0.91796875, + "learning_rate": 0.0003576878893952375, + "loss": 0.2085, + "step": 259540 + }, + { + "epoch": 10.75, + "grad_norm": 1.2578125, + "learning_rate": 0.00035767810182496925, + "loss": 0.2276, + "step": 259550 + }, + { + "epoch": 10.75, + "grad_norm": 1.5625, + "learning_rate": 0.0003576683140520624, + "loss": 0.1922, + "step": 259560 + }, + { + "epoch": 10.75, + "grad_norm": 0.71875, + "learning_rate": 0.0003576585260765355, + "loss": 0.208, + "step": 259570 + }, + { + "epoch": 10.75, + "grad_norm": 0.91015625, + "learning_rate": 0.0003576487378984068, + "loss": 0.2487, + "step": 259580 + }, + { + "epoch": 10.75, + "grad_norm": 1.1640625, + "learning_rate": 0.0003576389495176948, + "loss": 0.1722, + "step": 259590 + }, + { + "epoch": 10.75, + "grad_norm": 0.59765625, + "learning_rate": 0.000357629160934418, + "loss": 0.2234, + "step": 259600 + }, + { + "epoch": 10.75, + "grad_norm": 0.2294921875, + "learning_rate": 0.00035761937214859465, + "loss": 0.1987, + "step": 259610 + }, + { + "epoch": 10.75, + "grad_norm": 0.74609375, + "learning_rate": 0.00035760958316024335, + "loss": 0.2072, + "step": 259620 + }, + { + "epoch": 10.75, + "grad_norm": 0.71484375, + "learning_rate": 0.0003575997939693824, + "loss": 0.232, + "step": 259630 + }, + { + "epoch": 10.75, + "grad_norm": 0.9609375, + "learning_rate": 0.00035759000457603017, + "loss": 0.2069, + "step": 259640 + }, + { + "epoch": 10.75, + "grad_norm": 0.859375, + "learning_rate": 0.0003575802149802053, + "loss": 0.2167, + "step": 259650 + }, + { + "epoch": 10.76, + "grad_norm": 0.6484375, + "learning_rate": 0.00035757042518192605, + "loss": 0.1851, + "step": 259660 + }, + { + "epoch": 10.76, + "grad_norm": 1.1640625, + "learning_rate": 0.00035756063518121084, + "loss": 0.1533, + "step": 259670 + }, + { + "epoch": 10.76, + "grad_norm": 1.109375, + "learning_rate": 0.00035755084497807817, + "loss": 0.1924, + "step": 259680 + }, + { + "epoch": 10.76, + "grad_norm": 0.67578125, + "learning_rate": 0.0003575410545725463, + "loss": 0.2126, + "step": 259690 + }, + { + "epoch": 10.76, + "grad_norm": 0.7421875, + "learning_rate": 0.00035753126396463397, + "loss": 0.1978, + "step": 259700 + }, + { + "epoch": 10.76, + "grad_norm": 1.25, + "learning_rate": 0.0003575214731543593, + "loss": 0.2135, + "step": 259710 + }, + { + "epoch": 10.76, + "grad_norm": 0.040283203125, + "learning_rate": 0.00035751168214174085, + "loss": 0.2195, + "step": 259720 + }, + { + "epoch": 10.76, + "grad_norm": 0.6484375, + "learning_rate": 0.0003575018909267971, + "loss": 0.1847, + "step": 259730 + }, + { + "epoch": 10.76, + "grad_norm": 0.5078125, + "learning_rate": 0.00035749209950954627, + "loss": 0.1877, + "step": 259740 + }, + { + "epoch": 10.76, + "grad_norm": 1.1328125, + "learning_rate": 0.00035748230789000695, + "loss": 0.2095, + "step": 259750 + }, + { + "epoch": 10.76, + "grad_norm": 0.62890625, + "learning_rate": 0.00035747251606819757, + "loss": 0.1954, + "step": 259760 + }, + { + "epoch": 10.76, + "grad_norm": 0.51171875, + "learning_rate": 0.0003574627240441365, + "loss": 0.2185, + "step": 259770 + }, + { + "epoch": 10.76, + "grad_norm": 0.875, + "learning_rate": 0.0003574529318178422, + "loss": 0.2041, + "step": 259780 + }, + { + "epoch": 10.76, + "grad_norm": 0.88671875, + "learning_rate": 0.00035744313938933307, + "loss": 0.1824, + "step": 259790 + }, + { + "epoch": 10.76, + "grad_norm": 0.6640625, + "learning_rate": 0.00035743334675862755, + "loss": 0.1576, + "step": 259800 + }, + { + "epoch": 10.76, + "grad_norm": 0.60546875, + "learning_rate": 0.00035742355392574414, + "loss": 0.1817, + "step": 259810 + }, + { + "epoch": 10.76, + "grad_norm": 0.38671875, + "learning_rate": 0.0003574137608907011, + "loss": 0.1678, + "step": 259820 + }, + { + "epoch": 10.76, + "grad_norm": 0.421875, + "learning_rate": 0.000357403967653517, + "loss": 0.205, + "step": 259830 + }, + { + "epoch": 10.76, + "grad_norm": 0.859375, + "learning_rate": 0.0003573941742142103, + "loss": 0.2116, + "step": 259840 + }, + { + "epoch": 10.76, + "grad_norm": 0.75390625, + "learning_rate": 0.0003573843805727993, + "loss": 0.1852, + "step": 259850 + }, + { + "epoch": 10.76, + "grad_norm": 1.5546875, + "learning_rate": 0.0003573745867293025, + "loss": 0.2193, + "step": 259860 + }, + { + "epoch": 10.76, + "grad_norm": 0.5234375, + "learning_rate": 0.00035736479268373835, + "loss": 0.1792, + "step": 259870 + }, + { + "epoch": 10.76, + "grad_norm": 1.1640625, + "learning_rate": 0.00035735499843612517, + "loss": 0.2401, + "step": 259880 + }, + { + "epoch": 10.76, + "grad_norm": 0.56640625, + "learning_rate": 0.0003573452039864816, + "loss": 0.1943, + "step": 259890 + }, + { + "epoch": 10.77, + "grad_norm": 0.609375, + "learning_rate": 0.00035733540933482577, + "loss": 0.1836, + "step": 259900 + }, + { + "epoch": 10.77, + "grad_norm": 0.6953125, + "learning_rate": 0.0003573256144811765, + "loss": 0.1773, + "step": 259910 + }, + { + "epoch": 10.77, + "grad_norm": 0.76953125, + "learning_rate": 0.0003573158194255519, + "loss": 0.1782, + "step": 259920 + }, + { + "epoch": 10.77, + "grad_norm": 1.515625, + "learning_rate": 0.0003573060241679705, + "loss": 0.2471, + "step": 259930 + }, + { + "epoch": 10.77, + "grad_norm": 0.4609375, + "learning_rate": 0.0003572962287084508, + "loss": 0.1775, + "step": 259940 + }, + { + "epoch": 10.77, + "grad_norm": 0.95703125, + "learning_rate": 0.0003572864330470112, + "loss": 0.1661, + "step": 259950 + }, + { + "epoch": 10.77, + "grad_norm": 0.81640625, + "learning_rate": 0.00035727663718367004, + "loss": 0.1463, + "step": 259960 + }, + { + "epoch": 10.77, + "grad_norm": 0.94921875, + "learning_rate": 0.0003572668411184459, + "loss": 0.154, + "step": 259970 + }, + { + "epoch": 10.77, + "grad_norm": 2.234375, + "learning_rate": 0.00035725704485135715, + "loss": 0.1869, + "step": 259980 + }, + { + "epoch": 10.77, + "grad_norm": 0.67578125, + "learning_rate": 0.00035724724838242215, + "loss": 0.1788, + "step": 259990 + }, + { + "epoch": 10.77, + "grad_norm": 1.8125, + "learning_rate": 0.00035723745171165955, + "loss": 0.25, + "step": 260000 + }, + { + "epoch": 10.77, + "grad_norm": 1.8046875, + "learning_rate": 0.00035722765483908747, + "loss": 0.2001, + "step": 260010 + }, + { + "epoch": 10.77, + "grad_norm": 0.4375, + "learning_rate": 0.0003572178577647247, + "loss": 0.1739, + "step": 260020 + }, + { + "epoch": 10.77, + "grad_norm": 0.8125, + "learning_rate": 0.00035720806048858937, + "loss": 0.1759, + "step": 260030 + }, + { + "epoch": 10.77, + "grad_norm": 0.4765625, + "learning_rate": 0.00035719826301070006, + "loss": 0.2089, + "step": 260040 + }, + { + "epoch": 10.77, + "grad_norm": 0.5859375, + "learning_rate": 0.0003571884653310753, + "loss": 0.2441, + "step": 260050 + }, + { + "epoch": 10.77, + "grad_norm": 0.75390625, + "learning_rate": 0.0003571786674497333, + "loss": 0.1813, + "step": 260060 + }, + { + "epoch": 10.77, + "grad_norm": 0.47265625, + "learning_rate": 0.00035716886936669267, + "loss": 0.1876, + "step": 260070 + }, + { + "epoch": 10.77, + "grad_norm": 0.37109375, + "learning_rate": 0.00035715907108197187, + "loss": 0.1906, + "step": 260080 + }, + { + "epoch": 10.77, + "grad_norm": 0.26953125, + "learning_rate": 0.00035714927259558913, + "loss": 0.1401, + "step": 260090 + }, + { + "epoch": 10.77, + "grad_norm": 0.5234375, + "learning_rate": 0.00035713947390756317, + "loss": 0.1933, + "step": 260100 + }, + { + "epoch": 10.77, + "grad_norm": 0.80859375, + "learning_rate": 0.0003571296750179122, + "loss": 0.2252, + "step": 260110 + }, + { + "epoch": 10.77, + "grad_norm": 0.43359375, + "learning_rate": 0.00035711987592665476, + "loss": 0.2197, + "step": 260120 + }, + { + "epoch": 10.77, + "grad_norm": 2.125, + "learning_rate": 0.0003571100766338093, + "loss": 0.2464, + "step": 260130 + }, + { + "epoch": 10.77, + "grad_norm": 0.53515625, + "learning_rate": 0.0003571002771393942, + "loss": 0.2222, + "step": 260140 + }, + { + "epoch": 10.78, + "grad_norm": 0.63671875, + "learning_rate": 0.00035709047744342807, + "loss": 0.1751, + "step": 260150 + }, + { + "epoch": 10.78, + "grad_norm": 0.302734375, + "learning_rate": 0.00035708067754592916, + "loss": 0.1591, + "step": 260160 + }, + { + "epoch": 10.78, + "grad_norm": 0.984375, + "learning_rate": 0.00035707087744691593, + "loss": 0.2126, + "step": 260170 + }, + { + "epoch": 10.78, + "grad_norm": 1.4375, + "learning_rate": 0.00035706107714640687, + "loss": 0.2097, + "step": 260180 + }, + { + "epoch": 10.78, + "grad_norm": 0.7265625, + "learning_rate": 0.0003570512766444205, + "loss": 0.2106, + "step": 260190 + }, + { + "epoch": 10.78, + "grad_norm": 1.09375, + "learning_rate": 0.0003570414759409751, + "loss": 0.1931, + "step": 260200 + }, + { + "epoch": 10.78, + "grad_norm": 0.5, + "learning_rate": 0.00035703167503608926, + "loss": 0.1809, + "step": 260210 + }, + { + "epoch": 10.78, + "grad_norm": 0.42578125, + "learning_rate": 0.0003570218739297813, + "loss": 0.214, + "step": 260220 + }, + { + "epoch": 10.78, + "grad_norm": 1.7890625, + "learning_rate": 0.0003570120726220698, + "loss": 0.261, + "step": 260230 + }, + { + "epoch": 10.78, + "grad_norm": 0.42578125, + "learning_rate": 0.00035700227111297314, + "loss": 0.2616, + "step": 260240 + }, + { + "epoch": 10.78, + "grad_norm": 0.484375, + "learning_rate": 0.00035699246940250974, + "loss": 0.1699, + "step": 260250 + }, + { + "epoch": 10.78, + "grad_norm": 0.70703125, + "learning_rate": 0.00035698266749069807, + "loss": 0.1871, + "step": 260260 + }, + { + "epoch": 10.78, + "grad_norm": 0.72265625, + "learning_rate": 0.00035697286537755656, + "loss": 0.1762, + "step": 260270 + }, + { + "epoch": 10.78, + "grad_norm": 0.609375, + "learning_rate": 0.00035696306306310364, + "loss": 0.1511, + "step": 260280 + }, + { + "epoch": 10.78, + "grad_norm": 0.98828125, + "learning_rate": 0.0003569532605473579, + "loss": 0.222, + "step": 260290 + }, + { + "epoch": 10.78, + "grad_norm": 1.6640625, + "learning_rate": 0.0003569434578303375, + "loss": 0.2232, + "step": 260300 + }, + { + "epoch": 10.78, + "grad_norm": 1.2578125, + "learning_rate": 0.0003569336549120612, + "loss": 0.1871, + "step": 260310 + }, + { + "epoch": 10.78, + "grad_norm": 0.95703125, + "learning_rate": 0.0003569238517925473, + "loss": 0.1917, + "step": 260320 + }, + { + "epoch": 10.78, + "grad_norm": 0.52734375, + "learning_rate": 0.0003569140484718142, + "loss": 0.1814, + "step": 260330 + }, + { + "epoch": 10.78, + "grad_norm": 0.412109375, + "learning_rate": 0.00035690424494988043, + "loss": 0.2387, + "step": 260340 + }, + { + "epoch": 10.78, + "grad_norm": 0.68359375, + "learning_rate": 0.0003568944412267644, + "loss": 0.1954, + "step": 260350 + }, + { + "epoch": 10.78, + "grad_norm": 0.8515625, + "learning_rate": 0.0003568846373024846, + "loss": 0.2087, + "step": 260360 + }, + { + "epoch": 10.78, + "grad_norm": 0.62109375, + "learning_rate": 0.00035687483317705943, + "loss": 0.1925, + "step": 260370 + }, + { + "epoch": 10.78, + "grad_norm": 2.0625, + "learning_rate": 0.00035686502885050737, + "loss": 0.2355, + "step": 260380 + }, + { + "epoch": 10.79, + "grad_norm": 1.03125, + "learning_rate": 0.00035685522432284697, + "loss": 0.1991, + "step": 260390 + }, + { + "epoch": 10.79, + "grad_norm": 0.6953125, + "learning_rate": 0.00035684541959409645, + "loss": 0.2049, + "step": 260400 + }, + { + "epoch": 10.79, + "grad_norm": 1.1328125, + "learning_rate": 0.0003568356146642744, + "loss": 0.2122, + "step": 260410 + }, + { + "epoch": 10.79, + "grad_norm": 0.671875, + "learning_rate": 0.0003568258095333993, + "loss": 0.1755, + "step": 260420 + }, + { + "epoch": 10.79, + "grad_norm": 0.69140625, + "learning_rate": 0.00035681600420148956, + "loss": 0.2051, + "step": 260430 + }, + { + "epoch": 10.79, + "grad_norm": 0.435546875, + "learning_rate": 0.00035680619866856363, + "loss": 0.2074, + "step": 260440 + }, + { + "epoch": 10.79, + "grad_norm": 0.83203125, + "learning_rate": 0.00035679639293463995, + "loss": 0.2045, + "step": 260450 + }, + { + "epoch": 10.79, + "grad_norm": 0.177734375, + "learning_rate": 0.000356786586999737, + "loss": 0.2425, + "step": 260460 + }, + { + "epoch": 10.79, + "grad_norm": 0.7890625, + "learning_rate": 0.00035677678086387324, + "loss": 0.2227, + "step": 260470 + }, + { + "epoch": 10.79, + "grad_norm": 0.80078125, + "learning_rate": 0.00035676697452706707, + "loss": 0.2088, + "step": 260480 + }, + { + "epoch": 10.79, + "grad_norm": 0.6484375, + "learning_rate": 0.000356757167989337, + "loss": 0.218, + "step": 260490 + }, + { + "epoch": 10.79, + "grad_norm": 1.0078125, + "learning_rate": 0.00035674736125070153, + "loss": 0.2023, + "step": 260500 + }, + { + "epoch": 10.79, + "grad_norm": 0.796875, + "learning_rate": 0.00035673755431117897, + "loss": 0.1757, + "step": 260510 + }, + { + "epoch": 10.79, + "grad_norm": 0.921875, + "learning_rate": 0.00035672774717078796, + "loss": 0.2419, + "step": 260520 + }, + { + "epoch": 10.79, + "grad_norm": 0.765625, + "learning_rate": 0.0003567179398295467, + "loss": 0.2811, + "step": 260530 + }, + { + "epoch": 10.79, + "grad_norm": 0.64453125, + "learning_rate": 0.0003567081322874739, + "loss": 0.2336, + "step": 260540 + }, + { + "epoch": 10.79, + "grad_norm": 0.466796875, + "learning_rate": 0.00035669832454458795, + "loss": 0.1939, + "step": 260550 + }, + { + "epoch": 10.79, + "grad_norm": 0.43359375, + "learning_rate": 0.0003566885166009072, + "loss": 0.2066, + "step": 260560 + }, + { + "epoch": 10.79, + "grad_norm": 0.62890625, + "learning_rate": 0.0003566787084564502, + "loss": 0.1703, + "step": 260570 + }, + { + "epoch": 10.79, + "grad_norm": 1.8671875, + "learning_rate": 0.0003566689001112354, + "loss": 0.1764, + "step": 260580 + }, + { + "epoch": 10.79, + "grad_norm": 0.61328125, + "learning_rate": 0.0003566590915652813, + "loss": 0.2101, + "step": 260590 + }, + { + "epoch": 10.79, + "grad_norm": 1.0, + "learning_rate": 0.00035664928281860627, + "loss": 0.1952, + "step": 260600 + }, + { + "epoch": 10.79, + "grad_norm": 1.4609375, + "learning_rate": 0.0003566394738712288, + "loss": 0.2338, + "step": 260610 + }, + { + "epoch": 10.79, + "grad_norm": 1.1015625, + "learning_rate": 0.0003566296647231673, + "loss": 0.1932, + "step": 260620 + }, + { + "epoch": 10.8, + "grad_norm": 0.74609375, + "learning_rate": 0.0003566198553744404, + "loss": 0.1862, + "step": 260630 + }, + { + "epoch": 10.8, + "grad_norm": 0.625, + "learning_rate": 0.0003566100458250664, + "loss": 0.2035, + "step": 260640 + }, + { + "epoch": 10.8, + "grad_norm": 0.8046875, + "learning_rate": 0.00035660023607506373, + "loss": 0.1998, + "step": 260650 + }, + { + "epoch": 10.8, + "grad_norm": 0.734375, + "learning_rate": 0.00035659042612445105, + "loss": 0.2015, + "step": 260660 + }, + { + "epoch": 10.8, + "grad_norm": 0.59765625, + "learning_rate": 0.00035658061597324664, + "loss": 0.2312, + "step": 260670 + }, + { + "epoch": 10.8, + "grad_norm": 0.640625, + "learning_rate": 0.00035657080562146907, + "loss": 0.186, + "step": 260680 + }, + { + "epoch": 10.8, + "grad_norm": 0.7421875, + "learning_rate": 0.00035656099506913664, + "loss": 0.1958, + "step": 260690 + }, + { + "epoch": 10.8, + "grad_norm": 0.796875, + "learning_rate": 0.00035655118431626796, + "loss": 0.2276, + "step": 260700 + }, + { + "epoch": 10.8, + "grad_norm": 0.8125, + "learning_rate": 0.00035654137336288156, + "loss": 0.1754, + "step": 260710 + }, + { + "epoch": 10.8, + "grad_norm": 0.703125, + "learning_rate": 0.00035653156220899577, + "loss": 0.2182, + "step": 260720 + }, + { + "epoch": 10.8, + "grad_norm": 1.8515625, + "learning_rate": 0.000356521750854629, + "loss": 0.1841, + "step": 260730 + }, + { + "epoch": 10.8, + "grad_norm": 1.40625, + "learning_rate": 0.0003565119392997998, + "loss": 0.1792, + "step": 260740 + }, + { + "epoch": 10.8, + "grad_norm": 0.64453125, + "learning_rate": 0.0003565021275445267, + "loss": 0.1608, + "step": 260750 + }, + { + "epoch": 10.8, + "grad_norm": 0.62890625, + "learning_rate": 0.0003564923155888281, + "loss": 0.1716, + "step": 260760 + }, + { + "epoch": 10.8, + "grad_norm": 0.3828125, + "learning_rate": 0.0003564825034327224, + "loss": 0.2174, + "step": 260770 + }, + { + "epoch": 10.8, + "grad_norm": 0.515625, + "learning_rate": 0.00035647269107622813, + "loss": 0.2333, + "step": 260780 + }, + { + "epoch": 10.8, + "grad_norm": 0.9140625, + "learning_rate": 0.0003564628785193638, + "loss": 0.1792, + "step": 260790 + }, + { + "epoch": 10.8, + "grad_norm": 0.435546875, + "learning_rate": 0.00035645306576214785, + "loss": 0.1786, + "step": 260800 + }, + { + "epoch": 10.8, + "grad_norm": 0.376953125, + "learning_rate": 0.00035644325280459867, + "loss": 0.2126, + "step": 260810 + }, + { + "epoch": 10.8, + "grad_norm": 0.55859375, + "learning_rate": 0.0003564334396467348, + "loss": 0.2285, + "step": 260820 + }, + { + "epoch": 10.8, + "grad_norm": 1.3046875, + "learning_rate": 0.00035642362628857474, + "loss": 0.1741, + "step": 260830 + }, + { + "epoch": 10.8, + "grad_norm": 1.671875, + "learning_rate": 0.00035641381273013686, + "loss": 0.1982, + "step": 260840 + }, + { + "epoch": 10.8, + "grad_norm": 0.9921875, + "learning_rate": 0.0003564039989714397, + "loss": 0.2082, + "step": 260850 + }, + { + "epoch": 10.8, + "grad_norm": 1.0078125, + "learning_rate": 0.00035639418501250163, + "loss": 0.1766, + "step": 260860 + }, + { + "epoch": 10.81, + "grad_norm": 0.0, + "learning_rate": 0.00035638437085334126, + "loss": 0.2151, + "step": 260870 + }, + { + "epoch": 10.81, + "grad_norm": 0.7109375, + "learning_rate": 0.000356374556493977, + "loss": 0.2125, + "step": 260880 + }, + { + "epoch": 10.81, + "grad_norm": 1.4921875, + "learning_rate": 0.00035636474193442727, + "loss": 0.164, + "step": 260890 + }, + { + "epoch": 10.81, + "grad_norm": 0.4609375, + "learning_rate": 0.0003563549271747106, + "loss": 0.193, + "step": 260900 + }, + { + "epoch": 10.81, + "grad_norm": 0.14453125, + "learning_rate": 0.0003563451122148454, + "loss": 0.1574, + "step": 260910 + }, + { + "epoch": 10.81, + "grad_norm": 0.84765625, + "learning_rate": 0.0003563352970548503, + "loss": 0.1908, + "step": 260920 + }, + { + "epoch": 10.81, + "grad_norm": 0.6328125, + "learning_rate": 0.00035632548169474356, + "loss": 0.2219, + "step": 260930 + }, + { + "epoch": 10.81, + "grad_norm": 0.8203125, + "learning_rate": 0.00035631566613454374, + "loss": 0.2224, + "step": 260940 + }, + { + "epoch": 10.81, + "grad_norm": 1.0390625, + "learning_rate": 0.0003563058503742694, + "loss": 0.1844, + "step": 260950 + }, + { + "epoch": 10.81, + "grad_norm": 0.88671875, + "learning_rate": 0.00035629603441393885, + "loss": 0.1624, + "step": 260960 + }, + { + "epoch": 10.81, + "grad_norm": 0.92578125, + "learning_rate": 0.00035628621825357064, + "loss": 0.1859, + "step": 260970 + }, + { + "epoch": 10.81, + "grad_norm": 0.71484375, + "learning_rate": 0.0003562764018931833, + "loss": 0.2146, + "step": 260980 + }, + { + "epoch": 10.81, + "grad_norm": 0.5078125, + "learning_rate": 0.0003562665853327952, + "loss": 0.1684, + "step": 260990 + }, + { + "epoch": 10.81, + "grad_norm": 0.62890625, + "learning_rate": 0.0003562567685724249, + "loss": 0.1461, + "step": 261000 + }, + { + "epoch": 10.81, + "grad_norm": 0.76953125, + "learning_rate": 0.0003562469516120909, + "loss": 0.1744, + "step": 261010 + }, + { + "epoch": 10.81, + "grad_norm": 0.640625, + "learning_rate": 0.00035623713445181147, + "loss": 0.2038, + "step": 261020 + }, + { + "epoch": 10.81, + "grad_norm": 1.8515625, + "learning_rate": 0.0003562273170916053, + "loss": 0.2084, + "step": 261030 + }, + { + "epoch": 10.81, + "grad_norm": 1.5625, + "learning_rate": 0.0003562174995314908, + "loss": 0.1784, + "step": 261040 + }, + { + "epoch": 10.81, + "grad_norm": 1.4765625, + "learning_rate": 0.00035620768177148637, + "loss": 0.227, + "step": 261050 + }, + { + "epoch": 10.81, + "grad_norm": 1.0078125, + "learning_rate": 0.00035619786381161056, + "loss": 0.2013, + "step": 261060 + }, + { + "epoch": 10.81, + "grad_norm": 0.69140625, + "learning_rate": 0.0003561880456518819, + "loss": 0.2074, + "step": 261070 + }, + { + "epoch": 10.81, + "grad_norm": 1.21875, + "learning_rate": 0.0003561782272923188, + "loss": 0.1949, + "step": 261080 + }, + { + "epoch": 10.81, + "grad_norm": 0.62890625, + "learning_rate": 0.00035616840873293977, + "loss": 0.2217, + "step": 261090 + }, + { + "epoch": 10.81, + "grad_norm": 1.0, + "learning_rate": 0.0003561585899737632, + "loss": 0.1578, + "step": 261100 + }, + { + "epoch": 10.82, + "grad_norm": 0.52734375, + "learning_rate": 0.0003561487710148076, + "loss": 0.1927, + "step": 261110 + }, + { + "epoch": 10.82, + "grad_norm": 0.5390625, + "learning_rate": 0.0003561389518560916, + "loss": 0.1864, + "step": 261120 + }, + { + "epoch": 10.82, + "grad_norm": 0.91796875, + "learning_rate": 0.00035612913249763337, + "loss": 0.2089, + "step": 261130 + }, + { + "epoch": 10.82, + "grad_norm": 0.58984375, + "learning_rate": 0.0003561193129394517, + "loss": 0.2155, + "step": 261140 + }, + { + "epoch": 10.82, + "grad_norm": 0.890625, + "learning_rate": 0.00035610949318156494, + "loss": 0.2076, + "step": 261150 + }, + { + "epoch": 10.82, + "grad_norm": 0.5546875, + "learning_rate": 0.0003560996732239915, + "loss": 0.2634, + "step": 261160 + }, + { + "epoch": 10.82, + "grad_norm": 0.9921875, + "learning_rate": 0.00035608985306675, + "loss": 0.248, + "step": 261170 + }, + { + "epoch": 10.82, + "grad_norm": 0.75, + "learning_rate": 0.00035608003270985885, + "loss": 0.1652, + "step": 261180 + }, + { + "epoch": 10.82, + "grad_norm": 0.416015625, + "learning_rate": 0.0003560702121533365, + "loss": 0.2027, + "step": 261190 + }, + { + "epoch": 10.82, + "grad_norm": 0.81640625, + "learning_rate": 0.00035606039139720154, + "loss": 0.2173, + "step": 261200 + }, + { + "epoch": 10.82, + "grad_norm": 0.439453125, + "learning_rate": 0.0003560505704414723, + "loss": 0.2233, + "step": 261210 + }, + { + "epoch": 10.82, + "grad_norm": 1.1796875, + "learning_rate": 0.00035604074928616736, + "loss": 0.2106, + "step": 261220 + }, + { + "epoch": 10.82, + "grad_norm": 1.9375, + "learning_rate": 0.00035603092793130516, + "loss": 0.1523, + "step": 261230 + }, + { + "epoch": 10.82, + "grad_norm": 1.3203125, + "learning_rate": 0.00035602110637690423, + "loss": 0.2644, + "step": 261240 + }, + { + "epoch": 10.82, + "grad_norm": 0.97265625, + "learning_rate": 0.00035601128462298305, + "loss": 0.2182, + "step": 261250 + }, + { + "epoch": 10.82, + "grad_norm": 0.75390625, + "learning_rate": 0.00035600146266956, + "loss": 0.1948, + "step": 261260 + }, + { + "epoch": 10.82, + "grad_norm": 0.6640625, + "learning_rate": 0.0003559916405166537, + "loss": 0.186, + "step": 261270 + }, + { + "epoch": 10.82, + "grad_norm": 0.98046875, + "learning_rate": 0.00035598181816428264, + "loss": 0.1689, + "step": 261280 + }, + { + "epoch": 10.82, + "grad_norm": 1.6171875, + "learning_rate": 0.00035597199561246515, + "loss": 0.2338, + "step": 261290 + }, + { + "epoch": 10.82, + "grad_norm": 0.435546875, + "learning_rate": 0.00035596217286121984, + "loss": 0.1735, + "step": 261300 + }, + { + "epoch": 10.82, + "grad_norm": 3.0625, + "learning_rate": 0.00035595234991056513, + "loss": 0.2047, + "step": 261310 + }, + { + "epoch": 10.82, + "grad_norm": 0.73828125, + "learning_rate": 0.0003559425267605196, + "loss": 0.2175, + "step": 261320 + }, + { + "epoch": 10.82, + "grad_norm": 0.6875, + "learning_rate": 0.00035593270341110166, + "loss": 0.1758, + "step": 261330 + }, + { + "epoch": 10.82, + "grad_norm": 1.5078125, + "learning_rate": 0.0003559228798623298, + "loss": 0.174, + "step": 261340 + }, + { + "epoch": 10.83, + "grad_norm": 1.125, + "learning_rate": 0.00035591305611422255, + "loss": 0.2279, + "step": 261350 + }, + { + "epoch": 10.83, + "grad_norm": 0.73046875, + "learning_rate": 0.00035590323216679834, + "loss": 0.1803, + "step": 261360 + }, + { + "epoch": 10.83, + "grad_norm": 0.7734375, + "learning_rate": 0.00035589340802007566, + "loss": 0.2274, + "step": 261370 + }, + { + "epoch": 10.83, + "grad_norm": 0.69140625, + "learning_rate": 0.00035588358367407303, + "loss": 0.2024, + "step": 261380 + }, + { + "epoch": 10.83, + "grad_norm": 0.81640625, + "learning_rate": 0.00035587375912880895, + "loss": 0.2136, + "step": 261390 + }, + { + "epoch": 10.83, + "grad_norm": 0.96875, + "learning_rate": 0.00035586393438430186, + "loss": 0.1906, + "step": 261400 + }, + { + "epoch": 10.83, + "grad_norm": 0.447265625, + "learning_rate": 0.00035585410944057033, + "loss": 0.2076, + "step": 261410 + }, + { + "epoch": 10.83, + "grad_norm": 0.76953125, + "learning_rate": 0.0003558442842976327, + "loss": 0.1867, + "step": 261420 + }, + { + "epoch": 10.83, + "grad_norm": 0.75390625, + "learning_rate": 0.0003558344589555076, + "loss": 0.1932, + "step": 261430 + }, + { + "epoch": 10.83, + "grad_norm": 0.63671875, + "learning_rate": 0.0003558246334142136, + "loss": 0.223, + "step": 261440 + }, + { + "epoch": 10.83, + "grad_norm": 1.2734375, + "learning_rate": 0.0003558148076737689, + "loss": 0.2029, + "step": 261450 + }, + { + "epoch": 10.83, + "grad_norm": 0.86328125, + "learning_rate": 0.00035580498173419224, + "loss": 0.1951, + "step": 261460 + }, + { + "epoch": 10.83, + "grad_norm": 0.34375, + "learning_rate": 0.00035579515559550204, + "loss": 0.2117, + "step": 261470 + }, + { + "epoch": 10.83, + "grad_norm": 1.4765625, + "learning_rate": 0.00035578532925771675, + "loss": 0.1609, + "step": 261480 + }, + { + "epoch": 10.83, + "grad_norm": 1.1953125, + "learning_rate": 0.0003557755027208549, + "loss": 0.1552, + "step": 261490 + }, + { + "epoch": 10.83, + "grad_norm": 2.09375, + "learning_rate": 0.00035576567598493493, + "loss": 0.1855, + "step": 261500 + }, + { + "epoch": 10.83, + "grad_norm": 0.62109375, + "learning_rate": 0.0003557558490499755, + "loss": 0.2196, + "step": 261510 + }, + { + "epoch": 10.83, + "grad_norm": 0.97265625, + "learning_rate": 0.00035574602191599494, + "loss": 0.2628, + "step": 261520 + }, + { + "epoch": 10.83, + "grad_norm": 0.6640625, + "learning_rate": 0.00035573619458301165, + "loss": 0.1914, + "step": 261530 + }, + { + "epoch": 10.83, + "grad_norm": 0.6796875, + "learning_rate": 0.00035572636705104445, + "loss": 0.2196, + "step": 261540 + }, + { + "epoch": 10.83, + "grad_norm": 0.78515625, + "learning_rate": 0.00035571653932011147, + "loss": 0.1846, + "step": 261550 + }, + { + "epoch": 10.83, + "grad_norm": 0.73828125, + "learning_rate": 0.00035570671139023146, + "loss": 0.1639, + "step": 261560 + }, + { + "epoch": 10.83, + "grad_norm": 0.275390625, + "learning_rate": 0.0003556968832614229, + "loss": 0.2213, + "step": 261570 + }, + { + "epoch": 10.83, + "grad_norm": 0.5546875, + "learning_rate": 0.0003556870549337041, + "loss": 0.2112, + "step": 261580 + }, + { + "epoch": 10.84, + "grad_norm": 0.69921875, + "learning_rate": 0.0003556772264070938, + "loss": 0.2319, + "step": 261590 + }, + { + "epoch": 10.84, + "grad_norm": 1.109375, + "learning_rate": 0.0003556673976816104, + "loss": 0.2032, + "step": 261600 + }, + { + "epoch": 10.84, + "grad_norm": 0.74609375, + "learning_rate": 0.0003556575687572722, + "loss": 0.1936, + "step": 261610 + }, + { + "epoch": 10.84, + "grad_norm": 0.73046875, + "learning_rate": 0.00035564773963409795, + "loss": 0.2009, + "step": 261620 + }, + { + "epoch": 10.84, + "grad_norm": 1.3515625, + "learning_rate": 0.0003556379103121061, + "loss": 0.2068, + "step": 261630 + }, + { + "epoch": 10.84, + "grad_norm": 0.419921875, + "learning_rate": 0.000355628080791315, + "loss": 0.158, + "step": 261640 + }, + { + "epoch": 10.84, + "grad_norm": 1.125, + "learning_rate": 0.0003556182510717434, + "loss": 0.2077, + "step": 261650 + }, + { + "epoch": 10.84, + "grad_norm": 1.2578125, + "learning_rate": 0.0003556084211534095, + "loss": 0.1682, + "step": 261660 + }, + { + "epoch": 10.84, + "grad_norm": 0.65625, + "learning_rate": 0.00035559859103633206, + "loss": 0.1706, + "step": 261670 + }, + { + "epoch": 10.84, + "grad_norm": 0.232421875, + "learning_rate": 0.00035558876072052954, + "loss": 0.1857, + "step": 261680 + }, + { + "epoch": 10.84, + "grad_norm": 0.478515625, + "learning_rate": 0.00035557893020602023, + "loss": 0.2055, + "step": 261690 + }, + { + "epoch": 10.84, + "grad_norm": 2.125, + "learning_rate": 0.0003555690994928229, + "loss": 0.1955, + "step": 261700 + }, + { + "epoch": 10.84, + "grad_norm": 0.66796875, + "learning_rate": 0.00035555926858095584, + "loss": 0.2371, + "step": 261710 + }, + { + "epoch": 10.84, + "grad_norm": 0.8984375, + "learning_rate": 0.0003555494374704377, + "loss": 0.1419, + "step": 261720 + }, + { + "epoch": 10.84, + "grad_norm": 0.65234375, + "learning_rate": 0.0003555396061612869, + "loss": 0.2371, + "step": 261730 + }, + { + "epoch": 10.84, + "grad_norm": 0.66796875, + "learning_rate": 0.0003555297746535218, + "loss": 0.1427, + "step": 261740 + }, + { + "epoch": 10.84, + "grad_norm": 1.0859375, + "learning_rate": 0.00035551994294716127, + "loss": 0.2019, + "step": 261750 + }, + { + "epoch": 10.84, + "grad_norm": 0.45703125, + "learning_rate": 0.0003555101110422235, + "loss": 0.2035, + "step": 261760 + }, + { + "epoch": 10.84, + "grad_norm": 1.0234375, + "learning_rate": 0.0003555002789387271, + "loss": 0.191, + "step": 261770 + }, + { + "epoch": 10.84, + "grad_norm": 0.984375, + "learning_rate": 0.00035549044663669066, + "loss": 0.2066, + "step": 261780 + }, + { + "epoch": 10.84, + "grad_norm": 0.63671875, + "learning_rate": 0.00035548061413613255, + "loss": 0.1887, + "step": 261790 + }, + { + "epoch": 10.84, + "grad_norm": 1.25, + "learning_rate": 0.00035547078143707123, + "loss": 0.1672, + "step": 261800 + }, + { + "epoch": 10.84, + "grad_norm": 0.87109375, + "learning_rate": 0.0003554609485395254, + "loss": 0.2316, + "step": 261810 + }, + { + "epoch": 10.84, + "grad_norm": 0.8046875, + "learning_rate": 0.0003554511154435133, + "loss": 0.2272, + "step": 261820 + }, + { + "epoch": 10.84, + "grad_norm": 1.0390625, + "learning_rate": 0.00035544128214905376, + "loss": 0.1876, + "step": 261830 + }, + { + "epoch": 10.85, + "grad_norm": 1.765625, + "learning_rate": 0.00035543144865616503, + "loss": 0.1578, + "step": 261840 + }, + { + "epoch": 10.85, + "grad_norm": 1.125, + "learning_rate": 0.0003554216149648657, + "loss": 0.202, + "step": 261850 + }, + { + "epoch": 10.85, + "grad_norm": 0.59375, + "learning_rate": 0.0003554117810751743, + "loss": 0.2341, + "step": 261860 + }, + { + "epoch": 10.85, + "grad_norm": 0.5390625, + "learning_rate": 0.0003554019469871093, + "loss": 0.2258, + "step": 261870 + }, + { + "epoch": 10.85, + "grad_norm": 1.28125, + "learning_rate": 0.0003553921127006892, + "loss": 0.2055, + "step": 261880 + }, + { + "epoch": 10.85, + "grad_norm": 0.68359375, + "learning_rate": 0.00035538227821593256, + "loss": 0.2278, + "step": 261890 + }, + { + "epoch": 10.85, + "grad_norm": 0.60546875, + "learning_rate": 0.0003553724435328578, + "loss": 0.2127, + "step": 261900 + }, + { + "epoch": 10.85, + "grad_norm": 0.5, + "learning_rate": 0.00035536260865148356, + "loss": 0.1948, + "step": 261910 + }, + { + "epoch": 10.85, + "grad_norm": 0.51171875, + "learning_rate": 0.0003553527735718282, + "loss": 0.2209, + "step": 261920 + }, + { + "epoch": 10.85, + "grad_norm": 0.65625, + "learning_rate": 0.0003553429382939103, + "loss": 0.1842, + "step": 261930 + }, + { + "epoch": 10.85, + "grad_norm": 0.69921875, + "learning_rate": 0.00035533310281774845, + "loss": 0.2521, + "step": 261940 + }, + { + "epoch": 10.85, + "grad_norm": 0.6953125, + "learning_rate": 0.000355323267143361, + "loss": 0.1806, + "step": 261950 + }, + { + "epoch": 10.85, + "grad_norm": 0.482421875, + "learning_rate": 0.00035531343127076655, + "loss": 0.1767, + "step": 261960 + }, + { + "epoch": 10.85, + "grad_norm": 0.99609375, + "learning_rate": 0.00035530359519998355, + "loss": 0.2122, + "step": 261970 + }, + { + "epoch": 10.85, + "grad_norm": 0.283203125, + "learning_rate": 0.0003552937589310306, + "loss": 0.1908, + "step": 261980 + }, + { + "epoch": 10.85, + "grad_norm": 1.0234375, + "learning_rate": 0.00035528392246392623, + "loss": 0.2086, + "step": 261990 + }, + { + "epoch": 10.85, + "grad_norm": 0.9453125, + "learning_rate": 0.00035527408579868883, + "loss": 0.2022, + "step": 262000 + }, + { + "epoch": 10.85, + "grad_norm": 0.6328125, + "learning_rate": 0.0003552642489353369, + "loss": 0.2042, + "step": 262010 + }, + { + "epoch": 10.85, + "grad_norm": 1.140625, + "learning_rate": 0.0003552544118738891, + "loss": 0.2193, + "step": 262020 + }, + { + "epoch": 10.85, + "grad_norm": 0.34765625, + "learning_rate": 0.0003552445746143639, + "loss": 0.1568, + "step": 262030 + }, + { + "epoch": 10.85, + "grad_norm": 0.56640625, + "learning_rate": 0.0003552347371567797, + "loss": 0.1483, + "step": 262040 + }, + { + "epoch": 10.85, + "grad_norm": 0.83984375, + "learning_rate": 0.00035522489950115505, + "loss": 0.1493, + "step": 262050 + }, + { + "epoch": 10.85, + "grad_norm": 0.9921875, + "learning_rate": 0.00035521506164750853, + "loss": 0.1903, + "step": 262060 + }, + { + "epoch": 10.85, + "grad_norm": 1.5, + "learning_rate": 0.0003552052235958587, + "loss": 0.1982, + "step": 262070 + }, + { + "epoch": 10.86, + "grad_norm": 1.0703125, + "learning_rate": 0.000355195385346224, + "loss": 0.1892, + "step": 262080 + }, + { + "epoch": 10.86, + "grad_norm": 0.9921875, + "learning_rate": 0.00035518554689862285, + "loss": 0.1915, + "step": 262090 + }, + { + "epoch": 10.86, + "grad_norm": 0.8125, + "learning_rate": 0.0003551757082530739, + "loss": 0.2143, + "step": 262100 + }, + { + "epoch": 10.86, + "grad_norm": 0.59765625, + "learning_rate": 0.0003551658694095957, + "loss": 0.1665, + "step": 262110 + }, + { + "epoch": 10.86, + "grad_norm": 0.73046875, + "learning_rate": 0.0003551560303682067, + "loss": 0.1727, + "step": 262120 + }, + { + "epoch": 10.86, + "grad_norm": 0.70703125, + "learning_rate": 0.00035514619112892527, + "loss": 0.2528, + "step": 262130 + }, + { + "epoch": 10.86, + "grad_norm": 0.90234375, + "learning_rate": 0.0003551363516917701, + "loss": 0.2102, + "step": 262140 + }, + { + "epoch": 10.86, + "grad_norm": 0.87890625, + "learning_rate": 0.0003551265120567597, + "loss": 0.1621, + "step": 262150 + }, + { + "epoch": 10.86, + "grad_norm": 0.53125, + "learning_rate": 0.0003551166722239126, + "loss": 0.1974, + "step": 262160 + }, + { + "epoch": 10.86, + "grad_norm": 0.75390625, + "learning_rate": 0.00035510683219324725, + "loss": 0.2276, + "step": 262170 + }, + { + "epoch": 10.86, + "grad_norm": 1.859375, + "learning_rate": 0.00035509699196478213, + "loss": 0.1771, + "step": 262180 + }, + { + "epoch": 10.86, + "grad_norm": 0.50390625, + "learning_rate": 0.00035508715153853587, + "loss": 0.1938, + "step": 262190 + }, + { + "epoch": 10.86, + "grad_norm": 0.88671875, + "learning_rate": 0.00035507731091452696, + "loss": 0.2198, + "step": 262200 + }, + { + "epoch": 10.86, + "grad_norm": 0.431640625, + "learning_rate": 0.0003550674700927738, + "loss": 0.1542, + "step": 262210 + }, + { + "epoch": 10.86, + "grad_norm": 0.63671875, + "learning_rate": 0.0003550576290732951, + "loss": 0.1932, + "step": 262220 + }, + { + "epoch": 10.86, + "grad_norm": 0.87109375, + "learning_rate": 0.00035504778785610934, + "loss": 0.1659, + "step": 262230 + }, + { + "epoch": 10.86, + "grad_norm": 0.4140625, + "learning_rate": 0.0003550379464412349, + "loss": 0.2252, + "step": 262240 + }, + { + "epoch": 10.86, + "grad_norm": 0.93359375, + "learning_rate": 0.0003550281048286904, + "loss": 0.1991, + "step": 262250 + }, + { + "epoch": 10.86, + "grad_norm": 1.078125, + "learning_rate": 0.0003550182630184943, + "loss": 0.2401, + "step": 262260 + }, + { + "epoch": 10.86, + "grad_norm": 1.265625, + "learning_rate": 0.00035500842101066527, + "loss": 0.1901, + "step": 262270 + }, + { + "epoch": 10.86, + "grad_norm": 0.48828125, + "learning_rate": 0.00035499857880522167, + "loss": 0.2107, + "step": 262280 + }, + { + "epoch": 10.86, + "grad_norm": 0.78125, + "learning_rate": 0.0003549887364021821, + "loss": 0.2046, + "step": 262290 + }, + { + "epoch": 10.86, + "grad_norm": 0.76953125, + "learning_rate": 0.0003549788938015651, + "loss": 0.2014, + "step": 262300 + }, + { + "epoch": 10.86, + "grad_norm": 1.2734375, + "learning_rate": 0.0003549690510033891, + "loss": 0.2322, + "step": 262310 + }, + { + "epoch": 10.87, + "grad_norm": 0.53515625, + "learning_rate": 0.0003549592080076727, + "loss": 0.2264, + "step": 262320 + }, + { + "epoch": 10.87, + "grad_norm": 1.421875, + "learning_rate": 0.00035494936481443443, + "loss": 0.1948, + "step": 262330 + }, + { + "epoch": 10.87, + "grad_norm": 0.76171875, + "learning_rate": 0.0003549395214236927, + "loss": 0.1668, + "step": 262340 + }, + { + "epoch": 10.87, + "grad_norm": 0.2216796875, + "learning_rate": 0.0003549296778354663, + "loss": 0.2007, + "step": 262350 + }, + { + "epoch": 10.87, + "grad_norm": 0.466796875, + "learning_rate": 0.0003549198340497734, + "loss": 0.1974, + "step": 262360 + }, + { + "epoch": 10.87, + "grad_norm": 0.52734375, + "learning_rate": 0.0003549099900666328, + "loss": 0.1848, + "step": 262370 + }, + { + "epoch": 10.87, + "grad_norm": 0.76171875, + "learning_rate": 0.00035490014588606284, + "loss": 0.18, + "step": 262380 + }, + { + "epoch": 10.87, + "grad_norm": 1.0234375, + "learning_rate": 0.00035489030150808224, + "loss": 0.1895, + "step": 262390 + }, + { + "epoch": 10.87, + "grad_norm": 0.703125, + "learning_rate": 0.00035488045693270935, + "loss": 0.1943, + "step": 262400 + }, + { + "epoch": 10.87, + "grad_norm": 2.234375, + "learning_rate": 0.0003548706121599628, + "loss": 0.2013, + "step": 262410 + }, + { + "epoch": 10.87, + "grad_norm": 0.63671875, + "learning_rate": 0.00035486076718986104, + "loss": 0.2236, + "step": 262420 + }, + { + "epoch": 10.87, + "grad_norm": 0.70703125, + "learning_rate": 0.00035485092202242275, + "loss": 0.2061, + "step": 262430 + }, + { + "epoch": 10.87, + "grad_norm": 1.015625, + "learning_rate": 0.0003548410766576662, + "loss": 0.1738, + "step": 262440 + }, + { + "epoch": 10.87, + "grad_norm": 0.7890625, + "learning_rate": 0.00035483123109561016, + "loss": 0.2223, + "step": 262450 + }, + { + "epoch": 10.87, + "grad_norm": 0.32421875, + "learning_rate": 0.000354821385336273, + "loss": 0.1907, + "step": 262460 + }, + { + "epoch": 10.87, + "grad_norm": 1.1953125, + "learning_rate": 0.0003548115393796734, + "loss": 0.2068, + "step": 262470 + }, + { + "epoch": 10.87, + "grad_norm": 2.1875, + "learning_rate": 0.0003548016932258298, + "loss": 0.1777, + "step": 262480 + }, + { + "epoch": 10.87, + "grad_norm": 0.42578125, + "learning_rate": 0.0003547918468747606, + "loss": 0.1559, + "step": 262490 + }, + { + "epoch": 10.87, + "grad_norm": 0.4765625, + "learning_rate": 0.00035478200032648455, + "loss": 0.1936, + "step": 262500 + }, + { + "epoch": 10.87, + "grad_norm": 0.294921875, + "learning_rate": 0.00035477215358102013, + "loss": 0.1767, + "step": 262510 + }, + { + "epoch": 10.87, + "grad_norm": 1.015625, + "learning_rate": 0.0003547623066383857, + "loss": 0.1589, + "step": 262520 + }, + { + "epoch": 10.87, + "grad_norm": 0.515625, + "learning_rate": 0.0003547524594986001, + "loss": 0.197, + "step": 262530 + }, + { + "epoch": 10.87, + "grad_norm": 0.6796875, + "learning_rate": 0.0003547426121616816, + "loss": 0.1981, + "step": 262540 + }, + { + "epoch": 10.87, + "grad_norm": 0.89453125, + "learning_rate": 0.00035473276462764876, + "loss": 0.2181, + "step": 262550 + }, + { + "epoch": 10.88, + "grad_norm": 1.125, + "learning_rate": 0.00035472291689652027, + "loss": 0.2137, + "step": 262560 + }, + { + "epoch": 10.88, + "grad_norm": 0.4375, + "learning_rate": 0.00035471306896831445, + "loss": 0.1758, + "step": 262570 + }, + { + "epoch": 10.88, + "grad_norm": 1.2109375, + "learning_rate": 0.00035470322084305, + "loss": 0.2032, + "step": 262580 + }, + { + "epoch": 10.88, + "grad_norm": 0.66015625, + "learning_rate": 0.0003546933725207454, + "loss": 0.1819, + "step": 262590 + }, + { + "epoch": 10.88, + "grad_norm": 0.94140625, + "learning_rate": 0.00035468352400141914, + "loss": 0.2168, + "step": 262600 + }, + { + "epoch": 10.88, + "grad_norm": 1.2109375, + "learning_rate": 0.00035467367528508985, + "loss": 0.1892, + "step": 262610 + }, + { + "epoch": 10.88, + "grad_norm": 0.40625, + "learning_rate": 0.000354663826371776, + "loss": 0.1795, + "step": 262620 + }, + { + "epoch": 10.88, + "grad_norm": 0.91796875, + "learning_rate": 0.0003546539772614961, + "loss": 0.1602, + "step": 262630 + }, + { + "epoch": 10.88, + "grad_norm": 0.94140625, + "learning_rate": 0.0003546441279542687, + "loss": 0.2319, + "step": 262640 + }, + { + "epoch": 10.88, + "grad_norm": 0.625, + "learning_rate": 0.00035463427845011235, + "loss": 0.2023, + "step": 262650 + }, + { + "epoch": 10.88, + "grad_norm": 0.6953125, + "learning_rate": 0.00035462442874904565, + "loss": 0.2035, + "step": 262660 + }, + { + "epoch": 10.88, + "grad_norm": 1.375, + "learning_rate": 0.0003546145788510871, + "loss": 0.206, + "step": 262670 + }, + { + "epoch": 10.88, + "grad_norm": 0.46484375, + "learning_rate": 0.00035460472875625506, + "loss": 0.2379, + "step": 262680 + }, + { + "epoch": 10.88, + "grad_norm": 1.0546875, + "learning_rate": 0.0003545948784645684, + "loss": 0.1534, + "step": 262690 + }, + { + "epoch": 10.88, + "grad_norm": 0.96875, + "learning_rate": 0.0003545850279760453, + "loss": 0.1785, + "step": 262700 + }, + { + "epoch": 10.88, + "grad_norm": 0.69921875, + "learning_rate": 0.00035457517729070456, + "loss": 0.197, + "step": 262710 + }, + { + "epoch": 10.88, + "grad_norm": 1.0234375, + "learning_rate": 0.00035456532640856466, + "loss": 0.1782, + "step": 262720 + }, + { + "epoch": 10.88, + "grad_norm": 0.328125, + "learning_rate": 0.0003545554753296441, + "loss": 0.1602, + "step": 262730 + }, + { + "epoch": 10.88, + "grad_norm": 0.5390625, + "learning_rate": 0.00035454562405396134, + "loss": 0.1833, + "step": 262740 + }, + { + "epoch": 10.88, + "grad_norm": 0.54296875, + "learning_rate": 0.0003545357725815351, + "loss": 0.1979, + "step": 262750 + }, + { + "epoch": 10.88, + "grad_norm": 0.92578125, + "learning_rate": 0.00035452592091238374, + "loss": 0.2156, + "step": 262760 + }, + { + "epoch": 10.88, + "grad_norm": 0.703125, + "learning_rate": 0.000354516069046526, + "loss": 0.2004, + "step": 262770 + }, + { + "epoch": 10.88, + "grad_norm": 0.2197265625, + "learning_rate": 0.00035450621698398025, + "loss": 0.2269, + "step": 262780 + }, + { + "epoch": 10.88, + "grad_norm": 0.53125, + "learning_rate": 0.00035449636472476497, + "loss": 0.2089, + "step": 262790 + }, + { + "epoch": 10.89, + "grad_norm": 1.171875, + "learning_rate": 0.000354486512268899, + "loss": 0.1785, + "step": 262800 + }, + { + "epoch": 10.89, + "grad_norm": 0.80859375, + "learning_rate": 0.0003544766596164005, + "loss": 0.1823, + "step": 262810 + }, + { + "epoch": 10.89, + "grad_norm": 0.921875, + "learning_rate": 0.00035446680676728845, + "loss": 0.2119, + "step": 262820 + }, + { + "epoch": 10.89, + "grad_norm": 0.5390625, + "learning_rate": 0.000354456953721581, + "loss": 0.1866, + "step": 262830 + }, + { + "epoch": 10.89, + "grad_norm": 0.52734375, + "learning_rate": 0.00035444710047929684, + "loss": 0.1856, + "step": 262840 + }, + { + "epoch": 10.89, + "grad_norm": 1.1015625, + "learning_rate": 0.00035443724704045453, + "loss": 0.1986, + "step": 262850 + }, + { + "epoch": 10.89, + "grad_norm": 0.94140625, + "learning_rate": 0.00035442739340507266, + "loss": 0.1812, + "step": 262860 + }, + { + "epoch": 10.89, + "grad_norm": 0.69140625, + "learning_rate": 0.0003544175395731696, + "loss": 0.1749, + "step": 262870 + }, + { + "epoch": 10.89, + "grad_norm": 0.86328125, + "learning_rate": 0.0003544076855447642, + "loss": 0.1965, + "step": 262880 + }, + { + "epoch": 10.89, + "grad_norm": 0.97265625, + "learning_rate": 0.0003543978313198746, + "loss": 0.1853, + "step": 262890 + }, + { + "epoch": 10.89, + "grad_norm": 0.83984375, + "learning_rate": 0.00035438797689851966, + "loss": 0.169, + "step": 262900 + }, + { + "epoch": 10.89, + "grad_norm": 0.90625, + "learning_rate": 0.0003543781222807178, + "loss": 0.2144, + "step": 262910 + }, + { + "epoch": 10.89, + "grad_norm": 0.82421875, + "learning_rate": 0.0003543682674664875, + "loss": 0.1462, + "step": 262920 + }, + { + "epoch": 10.89, + "grad_norm": 0.765625, + "learning_rate": 0.0003543584124558476, + "loss": 0.2695, + "step": 262930 + }, + { + "epoch": 10.89, + "grad_norm": 0.80859375, + "learning_rate": 0.0003543485572488163, + "loss": 0.213, + "step": 262940 + }, + { + "epoch": 10.89, + "grad_norm": 1.078125, + "learning_rate": 0.0003543387018454123, + "loss": 0.1694, + "step": 262950 + }, + { + "epoch": 10.89, + "grad_norm": 1.3203125, + "learning_rate": 0.00035432884624565417, + "loss": 0.1752, + "step": 262960 + }, + { + "epoch": 10.89, + "grad_norm": 0.88671875, + "learning_rate": 0.00035431899044956026, + "loss": 0.1946, + "step": 262970 + }, + { + "epoch": 10.89, + "grad_norm": 0.859375, + "learning_rate": 0.0003543091344571495, + "loss": 0.2068, + "step": 262980 + }, + { + "epoch": 10.89, + "grad_norm": 0.314453125, + "learning_rate": 0.00035429927826844006, + "loss": 0.1804, + "step": 262990 + }, + { + "epoch": 10.89, + "grad_norm": 0.5390625, + "learning_rate": 0.0003542894218834507, + "loss": 0.174, + "step": 263000 + }, + { + "epoch": 10.89, + "grad_norm": 0.447265625, + "learning_rate": 0.0003542795653021999, + "loss": 0.1908, + "step": 263010 + }, + { + "epoch": 10.89, + "grad_norm": 0.52734375, + "learning_rate": 0.0003542697085247062, + "loss": 0.1828, + "step": 263020 + }, + { + "epoch": 10.89, + "grad_norm": 0.97265625, + "learning_rate": 0.00035425985155098814, + "loss": 0.1737, + "step": 263030 + }, + { + "epoch": 10.9, + "grad_norm": 1.1015625, + "learning_rate": 0.0003542499943810644, + "loss": 0.2187, + "step": 263040 + }, + { + "epoch": 10.9, + "grad_norm": 0.5390625, + "learning_rate": 0.00035424013701495333, + "loss": 0.2187, + "step": 263050 + }, + { + "epoch": 10.9, + "grad_norm": 0.53125, + "learning_rate": 0.00035423027945267365, + "loss": 0.1738, + "step": 263060 + }, + { + "epoch": 10.9, + "grad_norm": 0.5703125, + "learning_rate": 0.00035422042169424385, + "loss": 0.1288, + "step": 263070 + }, + { + "epoch": 10.9, + "grad_norm": 0.7890625, + "learning_rate": 0.0003542105637396824, + "loss": 0.2045, + "step": 263080 + }, + { + "epoch": 10.9, + "grad_norm": 0.4921875, + "learning_rate": 0.000354200705589008, + "loss": 0.2018, + "step": 263090 + }, + { + "epoch": 10.9, + "grad_norm": 0.69921875, + "learning_rate": 0.00035419084724223903, + "loss": 0.1925, + "step": 263100 + }, + { + "epoch": 10.9, + "grad_norm": 1.25, + "learning_rate": 0.00035418098869939417, + "loss": 0.1846, + "step": 263110 + }, + { + "epoch": 10.9, + "grad_norm": 1.21875, + "learning_rate": 0.00035417112996049204, + "loss": 0.2002, + "step": 263120 + }, + { + "epoch": 10.9, + "grad_norm": 1.1171875, + "learning_rate": 0.00035416127102555093, + "loss": 0.1725, + "step": 263130 + }, + { + "epoch": 10.9, + "grad_norm": 0.8671875, + "learning_rate": 0.0003541514118945897, + "loss": 0.1348, + "step": 263140 + }, + { + "epoch": 10.9, + "grad_norm": 0.9140625, + "learning_rate": 0.0003541415525676267, + "loss": 0.2019, + "step": 263150 + }, + { + "epoch": 10.9, + "grad_norm": 0.283203125, + "learning_rate": 0.0003541316930446805, + "loss": 0.1951, + "step": 263160 + }, + { + "epoch": 10.9, + "grad_norm": 1.0625, + "learning_rate": 0.0003541218333257698, + "loss": 0.1417, + "step": 263170 + }, + { + "epoch": 10.9, + "grad_norm": 0.408203125, + "learning_rate": 0.000354111973410913, + "loss": 0.1511, + "step": 263180 + }, + { + "epoch": 10.9, + "grad_norm": 0.38671875, + "learning_rate": 0.00035410211330012863, + "loss": 0.2095, + "step": 263190 + }, + { + "epoch": 10.9, + "grad_norm": 0.53125, + "learning_rate": 0.0003540922529934355, + "loss": 0.2165, + "step": 263200 + }, + { + "epoch": 10.9, + "grad_norm": 0.7421875, + "learning_rate": 0.00035408239249085186, + "loss": 0.1773, + "step": 263210 + }, + { + "epoch": 10.9, + "grad_norm": 0.55859375, + "learning_rate": 0.0003540725317923964, + "loss": 0.1698, + "step": 263220 + }, + { + "epoch": 10.9, + "grad_norm": 1.8515625, + "learning_rate": 0.00035406267089808767, + "loss": 0.1847, + "step": 263230 + }, + { + "epoch": 10.9, + "grad_norm": 0.5625, + "learning_rate": 0.00035405280980794427, + "loss": 0.2279, + "step": 263240 + }, + { + "epoch": 10.9, + "grad_norm": 0.2470703125, + "learning_rate": 0.00035404294852198475, + "loss": 0.226, + "step": 263250 + }, + { + "epoch": 10.9, + "grad_norm": 1.3046875, + "learning_rate": 0.0003540330870402276, + "loss": 0.2287, + "step": 263260 + }, + { + "epoch": 10.9, + "grad_norm": 0.5078125, + "learning_rate": 0.00035402322536269137, + "loss": 0.212, + "step": 263270 + }, + { + "epoch": 10.91, + "grad_norm": 0.765625, + "learning_rate": 0.0003540133634893947, + "loss": 0.174, + "step": 263280 + }, + { + "epoch": 10.91, + "grad_norm": 0.75, + "learning_rate": 0.00035400350142035617, + "loss": 0.1985, + "step": 263290 + }, + { + "epoch": 10.91, + "grad_norm": 0.478515625, + "learning_rate": 0.0003539936391555942, + "loss": 0.1396, + "step": 263300 + }, + { + "epoch": 10.91, + "grad_norm": 1.125, + "learning_rate": 0.0003539837766951275, + "loss": 0.198, + "step": 263310 + }, + { + "epoch": 10.91, + "grad_norm": 1.0, + "learning_rate": 0.0003539739140389745, + "loss": 0.2209, + "step": 263320 + }, + { + "epoch": 10.91, + "grad_norm": 0.0, + "learning_rate": 0.0003539640511871539, + "loss": 0.1938, + "step": 263330 + }, + { + "epoch": 10.91, + "grad_norm": 0.8671875, + "learning_rate": 0.00035395418813968403, + "loss": 0.2165, + "step": 263340 + }, + { + "epoch": 10.91, + "grad_norm": 0.89453125, + "learning_rate": 0.0003539443248965838, + "loss": 0.1294, + "step": 263350 + }, + { + "epoch": 10.91, + "grad_norm": 1.171875, + "learning_rate": 0.00035393446145787146, + "loss": 0.153, + "step": 263360 + }, + { + "epoch": 10.91, + "grad_norm": 0.3828125, + "learning_rate": 0.0003539245978235657, + "loss": 0.1386, + "step": 263370 + }, + { + "epoch": 10.91, + "grad_norm": 1.1875, + "learning_rate": 0.00035391473399368505, + "loss": 0.1968, + "step": 263380 + }, + { + "epoch": 10.91, + "grad_norm": 0.302734375, + "learning_rate": 0.0003539048699682481, + "loss": 0.1813, + "step": 263390 + }, + { + "epoch": 10.91, + "grad_norm": 1.703125, + "learning_rate": 0.00035389500574727344, + "loss": 0.1986, + "step": 263400 + }, + { + "epoch": 10.91, + "grad_norm": 0.66796875, + "learning_rate": 0.0003538851413307796, + "loss": 0.2174, + "step": 263410 + }, + { + "epoch": 10.91, + "grad_norm": 1.2109375, + "learning_rate": 0.00035387527671878506, + "loss": 0.1508, + "step": 263420 + }, + { + "epoch": 10.91, + "grad_norm": 0.9296875, + "learning_rate": 0.0003538654119113086, + "loss": 0.2265, + "step": 263430 + }, + { + "epoch": 10.91, + "grad_norm": 1.0390625, + "learning_rate": 0.00035385554690836856, + "loss": 0.1877, + "step": 263440 + }, + { + "epoch": 10.91, + "grad_norm": 0.60546875, + "learning_rate": 0.0003538456817099836, + "loss": 0.2703, + "step": 263450 + }, + { + "epoch": 10.91, + "grad_norm": 1.3359375, + "learning_rate": 0.00035383581631617236, + "loss": 0.2219, + "step": 263460 + }, + { + "epoch": 10.91, + "grad_norm": 0.64453125, + "learning_rate": 0.0003538259507269532, + "loss": 0.1937, + "step": 263470 + }, + { + "epoch": 10.91, + "grad_norm": 0.828125, + "learning_rate": 0.0003538160849423449, + "loss": 0.2365, + "step": 263480 + }, + { + "epoch": 10.91, + "grad_norm": 0.44140625, + "learning_rate": 0.00035380621896236595, + "loss": 0.1965, + "step": 263490 + }, + { + "epoch": 10.91, + "grad_norm": 0.921875, + "learning_rate": 0.0003537963527870348, + "loss": 0.1768, + "step": 263500 + }, + { + "epoch": 10.91, + "grad_norm": 0.53125, + "learning_rate": 0.00035378648641637027, + "loss": 0.1979, + "step": 263510 + }, + { + "epoch": 10.91, + "grad_norm": 0.96484375, + "learning_rate": 0.00035377661985039067, + "loss": 0.1547, + "step": 263520 + }, + { + "epoch": 10.92, + "grad_norm": 0.87890625, + "learning_rate": 0.00035376675308911476, + "loss": 0.2202, + "step": 263530 + }, + { + "epoch": 10.92, + "grad_norm": 0.65234375, + "learning_rate": 0.00035375688613256104, + "loss": 0.1971, + "step": 263540 + }, + { + "epoch": 10.92, + "grad_norm": 0.53125, + "learning_rate": 0.000353747018980748, + "loss": 0.1542, + "step": 263550 + }, + { + "epoch": 10.92, + "grad_norm": 0.87109375, + "learning_rate": 0.0003537371516336942, + "loss": 0.1832, + "step": 263560 + }, + { + "epoch": 10.92, + "grad_norm": 0.5546875, + "learning_rate": 0.00035372728409141844, + "loss": 0.236, + "step": 263570 + }, + { + "epoch": 10.92, + "grad_norm": 0.703125, + "learning_rate": 0.000353717416353939, + "loss": 0.2302, + "step": 263580 + }, + { + "epoch": 10.92, + "grad_norm": 1.7265625, + "learning_rate": 0.0003537075484212747, + "loss": 0.1785, + "step": 263590 + }, + { + "epoch": 10.92, + "grad_norm": 1.4921875, + "learning_rate": 0.0003536976802934439, + "loss": 0.1959, + "step": 263600 + }, + { + "epoch": 10.92, + "grad_norm": 0.67578125, + "learning_rate": 0.0003536878119704653, + "loss": 0.195, + "step": 263610 + }, + { + "epoch": 10.92, + "grad_norm": 2.015625, + "learning_rate": 0.00035367794345235736, + "loss": 0.2555, + "step": 263620 + }, + { + "epoch": 10.92, + "grad_norm": 0.7265625, + "learning_rate": 0.0003536680747391389, + "loss": 0.1873, + "step": 263630 + }, + { + "epoch": 10.92, + "grad_norm": 0.359375, + "learning_rate": 0.0003536582058308282, + "loss": 0.1908, + "step": 263640 + }, + { + "epoch": 10.92, + "grad_norm": 0.6796875, + "learning_rate": 0.00035364833672744397, + "loss": 0.1718, + "step": 263650 + }, + { + "epoch": 10.92, + "grad_norm": 0.9375, + "learning_rate": 0.0003536384674290047, + "loss": 0.1827, + "step": 263660 + }, + { + "epoch": 10.92, + "grad_norm": 0.765625, + "learning_rate": 0.00035362859793552914, + "loss": 0.1813, + "step": 263670 + }, + { + "epoch": 10.92, + "grad_norm": 0.7265625, + "learning_rate": 0.00035361872824703574, + "loss": 0.1819, + "step": 263680 + }, + { + "epoch": 10.92, + "grad_norm": 0.498046875, + "learning_rate": 0.000353608858363543, + "loss": 0.2011, + "step": 263690 + }, + { + "epoch": 10.92, + "grad_norm": 1.5546875, + "learning_rate": 0.0003535989882850696, + "loss": 0.2022, + "step": 263700 + }, + { + "epoch": 10.92, + "grad_norm": 0.703125, + "learning_rate": 0.0003535891180116342, + "loss": 0.2902, + "step": 263710 + }, + { + "epoch": 10.92, + "grad_norm": 0.8125, + "learning_rate": 0.0003535792475432551, + "loss": 0.2147, + "step": 263720 + }, + { + "epoch": 10.92, + "grad_norm": 0.390625, + "learning_rate": 0.00035356937687995116, + "loss": 0.1912, + "step": 263730 + }, + { + "epoch": 10.92, + "grad_norm": 1.15625, + "learning_rate": 0.00035355950602174077, + "loss": 0.1652, + "step": 263740 + }, + { + "epoch": 10.92, + "grad_norm": 1.0078125, + "learning_rate": 0.0003535496349686426, + "loss": 0.1191, + "step": 263750 + }, + { + "epoch": 10.92, + "grad_norm": 1.3671875, + "learning_rate": 0.0003535397637206752, + "loss": 0.1811, + "step": 263760 + }, + { + "epoch": 10.93, + "grad_norm": 0.796875, + "learning_rate": 0.0003535298922778571, + "loss": 0.1857, + "step": 263770 + }, + { + "epoch": 10.93, + "grad_norm": 0.8125, + "learning_rate": 0.000353520020640207, + "loss": 0.1589, + "step": 263780 + }, + { + "epoch": 10.93, + "grad_norm": 0.70703125, + "learning_rate": 0.0003535101488077434, + "loss": 0.2015, + "step": 263790 + }, + { + "epoch": 10.93, + "grad_norm": 0.71484375, + "learning_rate": 0.00035350027678048484, + "loss": 0.178, + "step": 263800 + }, + { + "epoch": 10.93, + "grad_norm": 2.359375, + "learning_rate": 0.0003534904045584499, + "loss": 0.2141, + "step": 263810 + }, + { + "epoch": 10.93, + "grad_norm": 0.99609375, + "learning_rate": 0.0003534805321416573, + "loss": 0.186, + "step": 263820 + }, + { + "epoch": 10.93, + "grad_norm": 0.8671875, + "learning_rate": 0.00035347065953012543, + "loss": 0.1698, + "step": 263830 + }, + { + "epoch": 10.93, + "grad_norm": 0.9296875, + "learning_rate": 0.000353460786723873, + "loss": 0.1966, + "step": 263840 + }, + { + "epoch": 10.93, + "grad_norm": 0.447265625, + "learning_rate": 0.0003534509137229185, + "loss": 0.2365, + "step": 263850 + }, + { + "epoch": 10.93, + "grad_norm": 0.9140625, + "learning_rate": 0.00035344104052728055, + "loss": 0.214, + "step": 263860 + }, + { + "epoch": 10.93, + "grad_norm": 0.5546875, + "learning_rate": 0.00035343116713697784, + "loss": 0.2211, + "step": 263870 + }, + { + "epoch": 10.93, + "grad_norm": 0.0, + "learning_rate": 0.0003534212935520287, + "loss": 0.2601, + "step": 263880 + }, + { + "epoch": 10.93, + "grad_norm": 0.515625, + "learning_rate": 0.0003534114197724519, + "loss": 0.1752, + "step": 263890 + }, + { + "epoch": 10.93, + "grad_norm": 0.51171875, + "learning_rate": 0.000353401545798266, + "loss": 0.2216, + "step": 263900 + }, + { + "epoch": 10.93, + "grad_norm": 0.61328125, + "learning_rate": 0.0003533916716294895, + "loss": 0.1631, + "step": 263910 + }, + { + "epoch": 10.93, + "grad_norm": 1.1640625, + "learning_rate": 0.0003533817972661411, + "loss": 0.2141, + "step": 263920 + }, + { + "epoch": 10.93, + "grad_norm": 0.66015625, + "learning_rate": 0.0003533719227082393, + "loss": 0.1942, + "step": 263930 + }, + { + "epoch": 10.93, + "grad_norm": 1.7421875, + "learning_rate": 0.0003533620479558027, + "loss": 0.2467, + "step": 263940 + }, + { + "epoch": 10.93, + "grad_norm": 0.60546875, + "learning_rate": 0.0003533521730088499, + "loss": 0.1652, + "step": 263950 + }, + { + "epoch": 10.93, + "grad_norm": 0.578125, + "learning_rate": 0.00035334229786739945, + "loss": 0.1692, + "step": 263960 + }, + { + "epoch": 10.93, + "grad_norm": 0.61328125, + "learning_rate": 0.00035333242253147003, + "loss": 0.2112, + "step": 263970 + }, + { + "epoch": 10.93, + "grad_norm": 0.72265625, + "learning_rate": 0.00035332254700108, + "loss": 0.1606, + "step": 263980 + }, + { + "epoch": 10.93, + "grad_norm": 0.828125, + "learning_rate": 0.00035331267127624823, + "loss": 0.2161, + "step": 263990 + }, + { + "epoch": 10.93, + "grad_norm": 0.55859375, + "learning_rate": 0.00035330279535699316, + "loss": 0.2129, + "step": 264000 + }, + { + "epoch": 10.94, + "grad_norm": 0.458984375, + "learning_rate": 0.00035329291924333337, + "loss": 0.1693, + "step": 264010 + }, + { + "epoch": 10.94, + "grad_norm": 0.9140625, + "learning_rate": 0.0003532830429352874, + "loss": 0.1677, + "step": 264020 + }, + { + "epoch": 10.94, + "grad_norm": 0.41796875, + "learning_rate": 0.00035327316643287397, + "loss": 0.1782, + "step": 264030 + }, + { + "epoch": 10.94, + "grad_norm": 0.5078125, + "learning_rate": 0.0003532632897361115, + "loss": 0.1798, + "step": 264040 + }, + { + "epoch": 10.94, + "grad_norm": 0.734375, + "learning_rate": 0.0003532534128450188, + "loss": 0.2237, + "step": 264050 + }, + { + "epoch": 10.94, + "grad_norm": 0.384765625, + "learning_rate": 0.0003532435357596142, + "loss": 0.2044, + "step": 264060 + }, + { + "epoch": 10.94, + "grad_norm": 0.41796875, + "learning_rate": 0.00035323365847991653, + "loss": 0.1876, + "step": 264070 + }, + { + "epoch": 10.94, + "grad_norm": 0.83203125, + "learning_rate": 0.0003532237810059442, + "loss": 0.1921, + "step": 264080 + }, + { + "epoch": 10.94, + "grad_norm": 0.81640625, + "learning_rate": 0.00035321390333771584, + "loss": 0.203, + "step": 264090 + }, + { + "epoch": 10.94, + "grad_norm": 0.443359375, + "learning_rate": 0.0003532040254752501, + "loss": 0.2254, + "step": 264100 + }, + { + "epoch": 10.94, + "grad_norm": 0.7578125, + "learning_rate": 0.00035319414741856556, + "loss": 0.1904, + "step": 264110 + }, + { + "epoch": 10.94, + "grad_norm": 0.458984375, + "learning_rate": 0.00035318426916768064, + "loss": 0.2125, + "step": 264120 + }, + { + "epoch": 10.94, + "grad_norm": 0.74609375, + "learning_rate": 0.0003531743907226142, + "loss": 0.1908, + "step": 264130 + }, + { + "epoch": 10.94, + "grad_norm": 0.95703125, + "learning_rate": 0.0003531645120833847, + "loss": 0.2035, + "step": 264140 + }, + { + "epoch": 10.94, + "grad_norm": 1.0234375, + "learning_rate": 0.00035315463325001064, + "loss": 0.2146, + "step": 264150 + }, + { + "epoch": 10.94, + "grad_norm": 1.28125, + "learning_rate": 0.0003531447542225108, + "loss": 0.1977, + "step": 264160 + }, + { + "epoch": 10.94, + "grad_norm": 1.703125, + "learning_rate": 0.00035313487500090356, + "loss": 0.2126, + "step": 264170 + }, + { + "epoch": 10.94, + "grad_norm": 0.58984375, + "learning_rate": 0.0003531249955852077, + "loss": 0.1797, + "step": 264180 + }, + { + "epoch": 10.94, + "grad_norm": 1.296875, + "learning_rate": 0.00035311511597544174, + "loss": 0.1812, + "step": 264190 + }, + { + "epoch": 10.94, + "grad_norm": 1.109375, + "learning_rate": 0.00035310523617162424, + "loss": 0.181, + "step": 264200 + }, + { + "epoch": 10.94, + "grad_norm": 1.0078125, + "learning_rate": 0.00035309535617377384, + "loss": 0.1854, + "step": 264210 + }, + { + "epoch": 10.94, + "grad_norm": 1.5390625, + "learning_rate": 0.0003530854759819091, + "loss": 0.2139, + "step": 264220 + }, + { + "epoch": 10.94, + "grad_norm": 0.7890625, + "learning_rate": 0.0003530755955960486, + "loss": 0.2003, + "step": 264230 + }, + { + "epoch": 10.94, + "grad_norm": 0.462890625, + "learning_rate": 0.000353065715016211, + "loss": 0.1531, + "step": 264240 + }, + { + "epoch": 10.95, + "grad_norm": 0.77734375, + "learning_rate": 0.00035305583424241485, + "loss": 0.2232, + "step": 264250 + }, + { + "epoch": 10.95, + "grad_norm": 1.0390625, + "learning_rate": 0.0003530459532746788, + "loss": 0.2303, + "step": 264260 + }, + { + "epoch": 10.95, + "grad_norm": 0.51953125, + "learning_rate": 0.0003530360721130213, + "loss": 0.2003, + "step": 264270 + }, + { + "epoch": 10.95, + "grad_norm": 0.166015625, + "learning_rate": 0.00035302619075746103, + "loss": 0.1465, + "step": 264280 + }, + { + "epoch": 10.95, + "grad_norm": 0.92578125, + "learning_rate": 0.0003530163092080167, + "loss": 0.1707, + "step": 264290 + }, + { + "epoch": 10.95, + "grad_norm": 1.2734375, + "learning_rate": 0.0003530064274647067, + "loss": 0.232, + "step": 264300 + }, + { + "epoch": 10.95, + "grad_norm": 0.75390625, + "learning_rate": 0.0003529965455275498, + "loss": 0.1768, + "step": 264310 + }, + { + "epoch": 10.95, + "grad_norm": 0.36328125, + "learning_rate": 0.0003529866633965645, + "loss": 0.1842, + "step": 264320 + }, + { + "epoch": 10.95, + "grad_norm": 0.5234375, + "learning_rate": 0.00035297678107176935, + "loss": 0.2402, + "step": 264330 + }, + { + "epoch": 10.95, + "grad_norm": 0.79296875, + "learning_rate": 0.0003529668985531831, + "loss": 0.1818, + "step": 264340 + }, + { + "epoch": 10.95, + "grad_norm": 1.09375, + "learning_rate": 0.0003529570158408243, + "loss": 0.2199, + "step": 264350 + }, + { + "epoch": 10.95, + "grad_norm": 0.9140625, + "learning_rate": 0.00035294713293471136, + "loss": 0.2311, + "step": 264360 + }, + { + "epoch": 10.95, + "grad_norm": 0.64453125, + "learning_rate": 0.0003529372498348632, + "loss": 0.17, + "step": 264370 + }, + { + "epoch": 10.95, + "grad_norm": 0.609375, + "learning_rate": 0.0003529273665412982, + "loss": 0.1949, + "step": 264380 + }, + { + "epoch": 10.95, + "grad_norm": 0.33984375, + "learning_rate": 0.000352917483054035, + "loss": 0.1852, + "step": 264390 + }, + { + "epoch": 10.95, + "grad_norm": 0.83984375, + "learning_rate": 0.00035290759937309224, + "loss": 0.2426, + "step": 264400 + }, + { + "epoch": 10.95, + "grad_norm": 0.51953125, + "learning_rate": 0.0003528977154984884, + "loss": 0.1332, + "step": 264410 + }, + { + "epoch": 10.95, + "grad_norm": 0.6796875, + "learning_rate": 0.0003528878314302423, + "loss": 0.2168, + "step": 264420 + }, + { + "epoch": 10.95, + "grad_norm": 1.28125, + "learning_rate": 0.0003528779471683723, + "loss": 0.1708, + "step": 264430 + }, + { + "epoch": 10.95, + "grad_norm": 0.95703125, + "learning_rate": 0.0003528680627128971, + "loss": 0.178, + "step": 264440 + }, + { + "epoch": 10.95, + "grad_norm": 0.57421875, + "learning_rate": 0.0003528581780638355, + "loss": 0.1689, + "step": 264450 + }, + { + "epoch": 10.95, + "grad_norm": 0.71484375, + "learning_rate": 0.00035284829322120573, + "loss": 0.1832, + "step": 264460 + }, + { + "epoch": 10.95, + "grad_norm": 0.94140625, + "learning_rate": 0.0003528384081850266, + "loss": 0.2187, + "step": 264470 + }, + { + "epoch": 10.95, + "grad_norm": 0.6640625, + "learning_rate": 0.00035282852295531676, + "loss": 0.2148, + "step": 264480 + }, + { + "epoch": 10.96, + "grad_norm": 1.1640625, + "learning_rate": 0.00035281863753209463, + "loss": 0.2122, + "step": 264490 + }, + { + "epoch": 10.96, + "grad_norm": 0.431640625, + "learning_rate": 0.0003528087519153791, + "loss": 0.185, + "step": 264500 + }, + { + "epoch": 10.96, + "grad_norm": 0.93359375, + "learning_rate": 0.0003527988661051885, + "loss": 0.2408, + "step": 264510 + }, + { + "epoch": 10.96, + "grad_norm": 0.92578125, + "learning_rate": 0.0003527889801015415, + "loss": 0.2385, + "step": 264520 + }, + { + "epoch": 10.96, + "grad_norm": 0.9140625, + "learning_rate": 0.00035277909390445686, + "loss": 0.235, + "step": 264530 + }, + { + "epoch": 10.96, + "grad_norm": 0.83984375, + "learning_rate": 0.00035276920751395297, + "loss": 0.1467, + "step": 264540 + }, + { + "epoch": 10.96, + "grad_norm": 0.64453125, + "learning_rate": 0.00035275932093004856, + "loss": 0.2537, + "step": 264550 + }, + { + "epoch": 10.96, + "grad_norm": 0.68359375, + "learning_rate": 0.00035274943415276216, + "loss": 0.2209, + "step": 264560 + }, + { + "epoch": 10.96, + "grad_norm": 0.875, + "learning_rate": 0.0003527395471821124, + "loss": 0.2057, + "step": 264570 + }, + { + "epoch": 10.96, + "grad_norm": 1.125, + "learning_rate": 0.000352729660018118, + "loss": 0.1322, + "step": 264580 + }, + { + "epoch": 10.96, + "grad_norm": 1.9296875, + "learning_rate": 0.00035271977266079747, + "loss": 0.2609, + "step": 264590 + }, + { + "epoch": 10.96, + "grad_norm": 1.53125, + "learning_rate": 0.0003527098851101694, + "loss": 0.181, + "step": 264600 + }, + { + "epoch": 10.96, + "grad_norm": 0.73046875, + "learning_rate": 0.0003526999973662524, + "loss": 0.2104, + "step": 264610 + }, + { + "epoch": 10.96, + "grad_norm": 2.265625, + "learning_rate": 0.0003526901094290651, + "loss": 0.214, + "step": 264620 + }, + { + "epoch": 10.96, + "grad_norm": 0.3828125, + "learning_rate": 0.0003526802212986261, + "loss": 0.1904, + "step": 264630 + }, + { + "epoch": 10.96, + "grad_norm": 0.30859375, + "learning_rate": 0.00035267033297495406, + "loss": 0.206, + "step": 264640 + }, + { + "epoch": 10.96, + "grad_norm": 1.3515625, + "learning_rate": 0.00035266044445806744, + "loss": 0.2306, + "step": 264650 + }, + { + "epoch": 10.96, + "grad_norm": 0.65625, + "learning_rate": 0.00035265055574798505, + "loss": 0.1982, + "step": 264660 + }, + { + "epoch": 10.96, + "grad_norm": 0.53125, + "learning_rate": 0.00035264066684472537, + "loss": 0.2304, + "step": 264670 + }, + { + "epoch": 10.96, + "grad_norm": 0.72265625, + "learning_rate": 0.000352630777748307, + "loss": 0.1851, + "step": 264680 + }, + { + "epoch": 10.96, + "grad_norm": 0.77734375, + "learning_rate": 0.0003526208884587486, + "loss": 0.2128, + "step": 264690 + }, + { + "epoch": 10.96, + "grad_norm": 1.015625, + "learning_rate": 0.0003526109989760688, + "loss": 0.205, + "step": 264700 + }, + { + "epoch": 10.96, + "grad_norm": 0.69140625, + "learning_rate": 0.0003526011093002862, + "loss": 0.1882, + "step": 264710 + }, + { + "epoch": 10.96, + "grad_norm": 1.2265625, + "learning_rate": 0.0003525912194314193, + "loss": 0.1765, + "step": 264720 + }, + { + "epoch": 10.97, + "grad_norm": 0.91015625, + "learning_rate": 0.00035258132936948685, + "loss": 0.2055, + "step": 264730 + }, + { + "epoch": 10.97, + "grad_norm": 0.80078125, + "learning_rate": 0.0003525714391145075, + "loss": 0.2019, + "step": 264740 + }, + { + "epoch": 10.97, + "grad_norm": 0.83203125, + "learning_rate": 0.0003525615486664997, + "loss": 0.2139, + "step": 264750 + }, + { + "epoch": 10.97, + "grad_norm": 0.8359375, + "learning_rate": 0.00035255165802548213, + "loss": 0.1823, + "step": 264760 + }, + { + "epoch": 10.97, + "grad_norm": 0.546875, + "learning_rate": 0.0003525417671914735, + "loss": 0.2006, + "step": 264770 + }, + { + "epoch": 10.97, + "grad_norm": 1.0625, + "learning_rate": 0.00035253187616449225, + "loss": 0.1379, + "step": 264780 + }, + { + "epoch": 10.97, + "grad_norm": 0.400390625, + "learning_rate": 0.0003525219849445571, + "loss": 0.1593, + "step": 264790 + }, + { + "epoch": 10.97, + "grad_norm": 0.474609375, + "learning_rate": 0.00035251209353168655, + "loss": 0.2512, + "step": 264800 + }, + { + "epoch": 10.97, + "grad_norm": 0.3359375, + "learning_rate": 0.00035250220192589943, + "loss": 0.1332, + "step": 264810 + }, + { + "epoch": 10.97, + "grad_norm": 0.70703125, + "learning_rate": 0.00035249231012721426, + "loss": 0.205, + "step": 264820 + }, + { + "epoch": 10.97, + "grad_norm": 0.796875, + "learning_rate": 0.0003524824181356496, + "loss": 0.1702, + "step": 264830 + }, + { + "epoch": 10.97, + "grad_norm": 0.28515625, + "learning_rate": 0.00035247252595122403, + "loss": 0.2153, + "step": 264840 + }, + { + "epoch": 10.97, + "grad_norm": 0.61328125, + "learning_rate": 0.0003524626335739564, + "loss": 0.1935, + "step": 264850 + }, + { + "epoch": 10.97, + "grad_norm": 0.396484375, + "learning_rate": 0.00035245274100386494, + "loss": 0.1473, + "step": 264860 + }, + { + "epoch": 10.97, + "grad_norm": 0.8359375, + "learning_rate": 0.00035244284824096863, + "loss": 0.1501, + "step": 264870 + }, + { + "epoch": 10.97, + "grad_norm": 0.921875, + "learning_rate": 0.00035243295528528584, + "loss": 0.2107, + "step": 264880 + }, + { + "epoch": 10.97, + "grad_norm": 0.2333984375, + "learning_rate": 0.0003524230621368353, + "loss": 0.2323, + "step": 264890 + }, + { + "epoch": 10.97, + "grad_norm": 0.34375, + "learning_rate": 0.00035241316879563577, + "loss": 0.2564, + "step": 264900 + }, + { + "epoch": 10.97, + "grad_norm": 1.0390625, + "learning_rate": 0.00035240327526170557, + "loss": 0.166, + "step": 264910 + }, + { + "epoch": 10.97, + "grad_norm": 0.66015625, + "learning_rate": 0.0003523933815350635, + "loss": 0.1841, + "step": 264920 + }, + { + "epoch": 10.97, + "grad_norm": 0.95703125, + "learning_rate": 0.0003523834876157281, + "loss": 0.1925, + "step": 264930 + }, + { + "epoch": 10.97, + "grad_norm": 0.859375, + "learning_rate": 0.0003523735935037182, + "loss": 0.2045, + "step": 264940 + }, + { + "epoch": 10.97, + "grad_norm": 0.921875, + "learning_rate": 0.0003523636991990521, + "loss": 0.2233, + "step": 264950 + }, + { + "epoch": 10.97, + "grad_norm": 1.03125, + "learning_rate": 0.0003523538047017486, + "loss": 0.1936, + "step": 264960 + }, + { + "epoch": 10.98, + "grad_norm": 0.83984375, + "learning_rate": 0.0003523439100118263, + "loss": 0.1861, + "step": 264970 + }, + { + "epoch": 10.98, + "grad_norm": 1.03125, + "learning_rate": 0.00035233401512930383, + "loss": 0.1755, + "step": 264980 + }, + { + "epoch": 10.98, + "grad_norm": 0.54296875, + "learning_rate": 0.00035232412005419976, + "loss": 0.1998, + "step": 264990 + }, + { + "epoch": 10.98, + "grad_norm": 1.1015625, + "learning_rate": 0.00035231422478653276, + "loss": 0.2056, + "step": 265000 + }, + { + "epoch": 10.98, + "grad_norm": 0.6875, + "learning_rate": 0.00035230432932632143, + "loss": 0.2059, + "step": 265010 + }, + { + "epoch": 10.98, + "grad_norm": 1.359375, + "learning_rate": 0.00035229443367358447, + "loss": 0.144, + "step": 265020 + }, + { + "epoch": 10.98, + "grad_norm": 0.93359375, + "learning_rate": 0.00035228453782834037, + "loss": 0.1998, + "step": 265030 + }, + { + "epoch": 10.98, + "grad_norm": 0.484375, + "learning_rate": 0.0003522746417906078, + "loss": 0.1686, + "step": 265040 + }, + { + "epoch": 10.98, + "grad_norm": 1.5859375, + "learning_rate": 0.0003522647455604054, + "loss": 0.2454, + "step": 265050 + }, + { + "epoch": 10.98, + "grad_norm": 0.91015625, + "learning_rate": 0.00035225484913775184, + "loss": 0.1971, + "step": 265060 + }, + { + "epoch": 10.98, + "grad_norm": 1.03125, + "learning_rate": 0.00035224495252266565, + "loss": 0.186, + "step": 265070 + }, + { + "epoch": 10.98, + "grad_norm": 2.265625, + "learning_rate": 0.00035223505571516556, + "loss": 0.1608, + "step": 265080 + }, + { + "epoch": 10.98, + "grad_norm": 1.8203125, + "learning_rate": 0.00035222515871527005, + "loss": 0.1969, + "step": 265090 + }, + { + "epoch": 10.98, + "grad_norm": 0.65234375, + "learning_rate": 0.0003522152615229979, + "loss": 0.209, + "step": 265100 + }, + { + "epoch": 10.98, + "grad_norm": 0.232421875, + "learning_rate": 0.0003522053641383677, + "loss": 0.1735, + "step": 265110 + }, + { + "epoch": 10.98, + "grad_norm": 0.55078125, + "learning_rate": 0.0003521954665613979, + "loss": 0.1865, + "step": 265120 + }, + { + "epoch": 10.98, + "grad_norm": 0.7265625, + "learning_rate": 0.0003521855687921074, + "loss": 0.2147, + "step": 265130 + }, + { + "epoch": 10.98, + "grad_norm": 0.75, + "learning_rate": 0.0003521756708305146, + "loss": 0.1972, + "step": 265140 + }, + { + "epoch": 10.98, + "grad_norm": 0.80859375, + "learning_rate": 0.0003521657726766383, + "loss": 0.1371, + "step": 265150 + }, + { + "epoch": 10.98, + "grad_norm": 1.15625, + "learning_rate": 0.000352155874330497, + "loss": 0.1828, + "step": 265160 + }, + { + "epoch": 10.98, + "grad_norm": 0.0, + "learning_rate": 0.0003521459757921094, + "loss": 0.2027, + "step": 265170 + }, + { + "epoch": 10.98, + "grad_norm": 0.51953125, + "learning_rate": 0.0003521360770614941, + "loss": 0.1939, + "step": 265180 + }, + { + "epoch": 10.98, + "grad_norm": 0.380859375, + "learning_rate": 0.00035212617813866975, + "loss": 0.1852, + "step": 265190 + }, + { + "epoch": 10.98, + "grad_norm": 1.21875, + "learning_rate": 0.00035211627902365496, + "loss": 0.2119, + "step": 265200 + }, + { + "epoch": 10.98, + "grad_norm": 0.546875, + "learning_rate": 0.0003521063797164683, + "loss": 0.2218, + "step": 265210 + }, + { + "epoch": 10.99, + "grad_norm": 0.921875, + "learning_rate": 0.00035209648021712847, + "loss": 0.1643, + "step": 265220 + }, + { + "epoch": 10.99, + "grad_norm": 0.62109375, + "learning_rate": 0.0003520865805256542, + "loss": 0.2331, + "step": 265230 + }, + { + "epoch": 10.99, + "grad_norm": 0.32421875, + "learning_rate": 0.00035207668064206386, + "loss": 0.2172, + "step": 265240 + }, + { + "epoch": 10.99, + "grad_norm": 0.5234375, + "learning_rate": 0.0003520667805663763, + "loss": 0.169, + "step": 265250 + }, + { + "epoch": 10.99, + "grad_norm": 1.1328125, + "learning_rate": 0.0003520568802986101, + "loss": 0.1819, + "step": 265260 + }, + { + "epoch": 10.99, + "grad_norm": 1.1640625, + "learning_rate": 0.0003520469798387839, + "loss": 0.1888, + "step": 265270 + }, + { + "epoch": 10.99, + "grad_norm": 1.3359375, + "learning_rate": 0.0003520370791869162, + "loss": 0.1875, + "step": 265280 + }, + { + "epoch": 10.99, + "grad_norm": 0.6484375, + "learning_rate": 0.0003520271783430258, + "loss": 0.1551, + "step": 265290 + }, + { + "epoch": 10.99, + "grad_norm": 0.875, + "learning_rate": 0.0003520172773071313, + "loss": 0.2674, + "step": 265300 + }, + { + "epoch": 10.99, + "grad_norm": 0.74609375, + "learning_rate": 0.00035200737607925134, + "loss": 0.1658, + "step": 265310 + }, + { + "epoch": 10.99, + "grad_norm": 0.8359375, + "learning_rate": 0.0003519974746594044, + "loss": 0.1347, + "step": 265320 + }, + { + "epoch": 10.99, + "grad_norm": 0.734375, + "learning_rate": 0.00035198757304760926, + "loss": 0.1951, + "step": 265330 + }, + { + "epoch": 10.99, + "grad_norm": 1.125, + "learning_rate": 0.0003519776712438845, + "loss": 0.2176, + "step": 265340 + }, + { + "epoch": 10.99, + "grad_norm": 1.9296875, + "learning_rate": 0.0003519677692482488, + "loss": 0.2108, + "step": 265350 + }, + { + "epoch": 10.99, + "grad_norm": 1.0625, + "learning_rate": 0.0003519578670607208, + "loss": 0.2099, + "step": 265360 + }, + { + "epoch": 10.99, + "grad_norm": 0.322265625, + "learning_rate": 0.00035194796468131906, + "loss": 0.1826, + "step": 265370 + }, + { + "epoch": 10.99, + "grad_norm": 0.640625, + "learning_rate": 0.0003519380621100623, + "loss": 0.2153, + "step": 265380 + }, + { + "epoch": 10.99, + "grad_norm": 0.74609375, + "learning_rate": 0.0003519281593469691, + "loss": 0.1622, + "step": 265390 + }, + { + "epoch": 10.99, + "grad_norm": 0.68359375, + "learning_rate": 0.0003519182563920581, + "loss": 0.2288, + "step": 265400 + }, + { + "epoch": 10.99, + "grad_norm": 0.3515625, + "learning_rate": 0.00035190835324534796, + "loss": 0.2263, + "step": 265410 + }, + { + "epoch": 10.99, + "grad_norm": 3.109375, + "learning_rate": 0.0003518984499068573, + "loss": 0.1895, + "step": 265420 + }, + { + "epoch": 10.99, + "grad_norm": 2.34375, + "learning_rate": 0.00035188854637660473, + "loss": 0.1348, + "step": 265430 + }, + { + "epoch": 10.99, + "grad_norm": 1.453125, + "learning_rate": 0.000351878642654609, + "loss": 0.2091, + "step": 265440 + }, + { + "epoch": 10.99, + "grad_norm": 0.41796875, + "learning_rate": 0.0003518687387408886, + "loss": 0.1671, + "step": 265450 + }, + { + "epoch": 11.0, + "grad_norm": 0.64453125, + "learning_rate": 0.0003518588346354622, + "loss": 0.1682, + "step": 265460 + }, + { + "epoch": 11.0, + "grad_norm": 1.671875, + "learning_rate": 0.0003518489303383486, + "loss": 0.2119, + "step": 265470 + }, + { + "epoch": 11.0, + "grad_norm": 1.109375, + "learning_rate": 0.0003518390258495662, + "loss": 0.22, + "step": 265480 + }, + { + "epoch": 11.0, + "grad_norm": 2.40625, + "learning_rate": 0.0003518291211691338, + "loss": 0.2039, + "step": 265490 + }, + { + "epoch": 11.0, + "grad_norm": 0.515625, + "learning_rate": 0.0003518192162970699, + "loss": 0.1981, + "step": 265500 + }, + { + "epoch": 11.0, + "grad_norm": 0.50390625, + "learning_rate": 0.00035180931123339335, + "loss": 0.1771, + "step": 265510 + }, + { + "epoch": 11.0, + "grad_norm": 0.83203125, + "learning_rate": 0.00035179940597812267, + "loss": 0.1771, + "step": 265520 + }, + { + "epoch": 11.0, + "grad_norm": 0.83203125, + "learning_rate": 0.0003517895005312764, + "loss": 0.1486, + "step": 265530 + }, + { + "epoch": 11.0, + "grad_norm": 0.55078125, + "learning_rate": 0.0003517795948928734, + "loss": 0.1846, + "step": 265540 + }, + { + "epoch": 11.0, + "grad_norm": 0.6015625, + "learning_rate": 0.00035176968906293213, + "loss": 0.1766, + "step": 265550 + }, + { + "epoch": 11.0, + "grad_norm": 0.451171875, + "learning_rate": 0.00035175978304147123, + "loss": 0.2014, + "step": 265560 + }, + { + "epoch": 11.0, + "grad_norm": 0.5703125, + "learning_rate": 0.0003517498768285095, + "loss": 0.1637, + "step": 265570 + }, + { + "epoch": 11.0, + "grad_norm": 0.9375, + "learning_rate": 0.0003517399704240655, + "loss": 0.225, + "step": 265580 + }, + { + "epoch": 11.0, + "grad_norm": 0.4453125, + "learning_rate": 0.00035173006382815773, + "loss": 0.1653, + "step": 265590 + }, + { + "epoch": 11.0, + "grad_norm": 1.171875, + "learning_rate": 0.0003517201570408052, + "loss": 0.2512, + "step": 265600 + }, + { + "epoch": 11.0, + "grad_norm": 0.61328125, + "learning_rate": 0.00035171025006202613, + "loss": 0.194, + "step": 265610 + }, + { + "epoch": 11.0, + "grad_norm": 1.359375, + "learning_rate": 0.00035170034289183936, + "loss": 0.1708, + "step": 265620 + }, + { + "epoch": 11.0, + "grad_norm": 0.3203125, + "learning_rate": 0.00035169043553026364, + "loss": 0.1912, + "step": 265630 + }, + { + "epoch": 11.0, + "grad_norm": 0.7734375, + "learning_rate": 0.00035168052797731735, + "loss": 0.1872, + "step": 265640 + }, + { + "epoch": 11.0, + "grad_norm": 0.318359375, + "learning_rate": 0.00035167062023301946, + "loss": 0.1992, + "step": 265650 + }, + { + "epoch": 11.0, + "grad_norm": 1.296875, + "learning_rate": 0.00035166071229738834, + "loss": 0.1745, + "step": 265660 + }, + { + "epoch": 11.0, + "grad_norm": 0.6171875, + "learning_rate": 0.0003516508041704427, + "loss": 0.1863, + "step": 265670 + }, + { + "epoch": 11.0, + "grad_norm": 1.703125, + "learning_rate": 0.0003516408958522013, + "loss": 0.1846, + "step": 265680 + }, + { + "epoch": 11.0, + "grad_norm": 1.09375, + "learning_rate": 0.0003516309873426827, + "loss": 0.1863, + "step": 265690 + }, + { + "epoch": 11.01, + "grad_norm": 1.671875, + "learning_rate": 0.00035162107864190553, + "loss": 0.1664, + "step": 265700 + }, + { + "epoch": 11.01, + "grad_norm": 0.92578125, + "learning_rate": 0.00035161116974988856, + "loss": 0.1976, + "step": 265710 + }, + { + "epoch": 11.01, + "grad_norm": 0.63671875, + "learning_rate": 0.00035160126066665015, + "loss": 0.1794, + "step": 265720 + }, + { + "epoch": 11.01, + "grad_norm": 1.1953125, + "learning_rate": 0.00035159135139220933, + "loss": 0.1595, + "step": 265730 + }, + { + "epoch": 11.01, + "grad_norm": 0.53515625, + "learning_rate": 0.00035158144192658447, + "loss": 0.1909, + "step": 265740 + }, + { + "epoch": 11.01, + "grad_norm": 2.453125, + "learning_rate": 0.0003515715322697943, + "loss": 0.178, + "step": 265750 + }, + { + "epoch": 11.01, + "grad_norm": 0.8515625, + "learning_rate": 0.0003515616224218575, + "loss": 0.1822, + "step": 265760 + }, + { + "epoch": 11.01, + "grad_norm": 0.76953125, + "learning_rate": 0.0003515517123827926, + "loss": 0.1863, + "step": 265770 + }, + { + "epoch": 11.01, + "grad_norm": 0.80078125, + "learning_rate": 0.0003515418021526185, + "loss": 0.2057, + "step": 265780 + }, + { + "epoch": 11.01, + "grad_norm": 2.28125, + "learning_rate": 0.0003515318917313536, + "loss": 0.1788, + "step": 265790 + }, + { + "epoch": 11.01, + "grad_norm": 0.349609375, + "learning_rate": 0.0003515219811190167, + "loss": 0.177, + "step": 265800 + }, + { + "epoch": 11.01, + "grad_norm": 0.5625, + "learning_rate": 0.0003515120703156264, + "loss": 0.1956, + "step": 265810 + }, + { + "epoch": 11.01, + "grad_norm": 1.3515625, + "learning_rate": 0.0003515021593212012, + "loss": 0.1559, + "step": 265820 + }, + { + "epoch": 11.01, + "grad_norm": 0.70703125, + "learning_rate": 0.00035149224813576006, + "loss": 0.1819, + "step": 265830 + }, + { + "epoch": 11.01, + "grad_norm": 0.48046875, + "learning_rate": 0.0003514823367593214, + "loss": 0.1665, + "step": 265840 + }, + { + "epoch": 11.01, + "grad_norm": 1.03125, + "learning_rate": 0.0003514724251919039, + "loss": 0.2022, + "step": 265850 + }, + { + "epoch": 11.01, + "grad_norm": 1.203125, + "learning_rate": 0.00035146251343352636, + "loss": 0.1689, + "step": 265860 + }, + { + "epoch": 11.01, + "grad_norm": 2.046875, + "learning_rate": 0.0003514526014842073, + "loss": 0.2262, + "step": 265870 + }, + { + "epoch": 11.01, + "grad_norm": 0.494140625, + "learning_rate": 0.00035144268934396536, + "loss": 0.1678, + "step": 265880 + }, + { + "epoch": 11.01, + "grad_norm": 0.7578125, + "learning_rate": 0.00035143277701281926, + "loss": 0.1838, + "step": 265890 + }, + { + "epoch": 11.01, + "grad_norm": 0.62109375, + "learning_rate": 0.0003514228644907876, + "loss": 0.2154, + "step": 265900 + }, + { + "epoch": 11.01, + "grad_norm": 0.7421875, + "learning_rate": 0.0003514129517778891, + "loss": 0.2358, + "step": 265910 + }, + { + "epoch": 11.01, + "grad_norm": 0.88671875, + "learning_rate": 0.00035140303887414237, + "loss": 0.185, + "step": 265920 + }, + { + "epoch": 11.01, + "grad_norm": 0.80859375, + "learning_rate": 0.00035139312577956594, + "loss": 0.1851, + "step": 265930 + }, + { + "epoch": 11.02, + "grad_norm": 0.66796875, + "learning_rate": 0.0003513832124941788, + "loss": 0.2001, + "step": 265940 + }, + { + "epoch": 11.02, + "grad_norm": 0.9453125, + "learning_rate": 0.0003513732990179993, + "loss": 0.1895, + "step": 265950 + }, + { + "epoch": 11.02, + "grad_norm": 0.74609375, + "learning_rate": 0.00035136338535104615, + "loss": 0.2138, + "step": 265960 + }, + { + "epoch": 11.02, + "grad_norm": 0.82421875, + "learning_rate": 0.0003513534714933382, + "loss": 0.2156, + "step": 265970 + }, + { + "epoch": 11.02, + "grad_norm": 0.80078125, + "learning_rate": 0.0003513435574448939, + "loss": 0.1779, + "step": 265980 + }, + { + "epoch": 11.02, + "grad_norm": 0.703125, + "learning_rate": 0.0003513336432057319, + "loss": 0.2055, + "step": 265990 + }, + { + "epoch": 11.02, + "grad_norm": 1.59375, + "learning_rate": 0.00035132372877587104, + "loss": 0.1835, + "step": 266000 + }, + { + "epoch": 11.02, + "grad_norm": 0.466796875, + "learning_rate": 0.0003513138141553297, + "loss": 0.2003, + "step": 266010 + }, + { + "epoch": 11.02, + "grad_norm": 0.70703125, + "learning_rate": 0.00035130389934412686, + "loss": 0.2063, + "step": 266020 + }, + { + "epoch": 11.02, + "grad_norm": 0.341796875, + "learning_rate": 0.000351293984342281, + "loss": 0.181, + "step": 266030 + }, + { + "epoch": 11.02, + "grad_norm": 1.03125, + "learning_rate": 0.0003512840691498107, + "loss": 0.1597, + "step": 266040 + }, + { + "epoch": 11.02, + "grad_norm": 0.73828125, + "learning_rate": 0.0003512741537667348, + "loss": 0.1511, + "step": 266050 + }, + { + "epoch": 11.02, + "grad_norm": 1.859375, + "learning_rate": 0.0003512642381930718, + "loss": 0.2074, + "step": 266060 + }, + { + "epoch": 11.02, + "grad_norm": 0.84765625, + "learning_rate": 0.0003512543224288406, + "loss": 0.1696, + "step": 266070 + }, + { + "epoch": 11.02, + "grad_norm": 0.9296875, + "learning_rate": 0.0003512444064740596, + "loss": 0.1793, + "step": 266080 + }, + { + "epoch": 11.02, + "grad_norm": 0.26171875, + "learning_rate": 0.0003512344903287475, + "loss": 0.1923, + "step": 266090 + }, + { + "epoch": 11.02, + "grad_norm": 1.546875, + "learning_rate": 0.00035122457399292315, + "loss": 0.2095, + "step": 266100 + }, + { + "epoch": 11.02, + "grad_norm": 0.72265625, + "learning_rate": 0.00035121465746660495, + "loss": 0.1791, + "step": 266110 + }, + { + "epoch": 11.02, + "grad_norm": 0.6875, + "learning_rate": 0.0003512047407498118, + "loss": 0.2063, + "step": 266120 + }, + { + "epoch": 11.02, + "grad_norm": 0.87109375, + "learning_rate": 0.00035119482384256224, + "loss": 0.2205, + "step": 266130 + }, + { + "epoch": 11.02, + "grad_norm": 0.32421875, + "learning_rate": 0.0003511849067448749, + "loss": 0.1693, + "step": 266140 + }, + { + "epoch": 11.02, + "grad_norm": 1.5234375, + "learning_rate": 0.0003511749894567685, + "loss": 0.1753, + "step": 266150 + }, + { + "epoch": 11.02, + "grad_norm": 0.361328125, + "learning_rate": 0.00035116507197826175, + "loss": 0.2035, + "step": 266160 + }, + { + "epoch": 11.02, + "grad_norm": 0.52734375, + "learning_rate": 0.0003511551543093732, + "loss": 0.197, + "step": 266170 + }, + { + "epoch": 11.03, + "grad_norm": 0.7421875, + "learning_rate": 0.00035114523645012166, + "loss": 0.1612, + "step": 266180 + }, + { + "epoch": 11.03, + "grad_norm": 1.359375, + "learning_rate": 0.0003511353184005256, + "loss": 0.2522, + "step": 266190 + }, + { + "epoch": 11.03, + "grad_norm": 1.125, + "learning_rate": 0.00035112540016060385, + "loss": 0.1851, + "step": 266200 + }, + { + "epoch": 11.03, + "grad_norm": 0.7734375, + "learning_rate": 0.000351115481730375, + "loss": 0.2044, + "step": 266210 + }, + { + "epoch": 11.03, + "grad_norm": 0.82421875, + "learning_rate": 0.00035110556310985773, + "loss": 0.1935, + "step": 266220 + }, + { + "epoch": 11.03, + "grad_norm": 1.0, + "learning_rate": 0.0003510956442990707, + "loss": 0.2111, + "step": 266230 + }, + { + "epoch": 11.03, + "grad_norm": 1.1171875, + "learning_rate": 0.00035108572529803263, + "loss": 0.1905, + "step": 266240 + }, + { + "epoch": 11.03, + "grad_norm": 0.734375, + "learning_rate": 0.0003510758061067621, + "loss": 0.1821, + "step": 266250 + }, + { + "epoch": 11.03, + "grad_norm": 1.0859375, + "learning_rate": 0.00035106588672527785, + "loss": 0.1759, + "step": 266260 + }, + { + "epoch": 11.03, + "grad_norm": 0.640625, + "learning_rate": 0.0003510559671535985, + "loss": 0.1753, + "step": 266270 + }, + { + "epoch": 11.03, + "grad_norm": 0.52734375, + "learning_rate": 0.00035104604739174275, + "loss": 0.1849, + "step": 266280 + }, + { + "epoch": 11.03, + "grad_norm": 0.9140625, + "learning_rate": 0.0003510361274397292, + "loss": 0.1837, + "step": 266290 + }, + { + "epoch": 11.03, + "grad_norm": 1.9453125, + "learning_rate": 0.0003510262072975766, + "loss": 0.2209, + "step": 266300 + }, + { + "epoch": 11.03, + "grad_norm": 0.59765625, + "learning_rate": 0.0003510162869653036, + "loss": 0.1632, + "step": 266310 + }, + { + "epoch": 11.03, + "grad_norm": 0.68359375, + "learning_rate": 0.00035100636644292885, + "loss": 0.1544, + "step": 266320 + }, + { + "epoch": 11.03, + "grad_norm": 0.92578125, + "learning_rate": 0.000350996445730471, + "loss": 0.1928, + "step": 266330 + }, + { + "epoch": 11.03, + "grad_norm": 0.7109375, + "learning_rate": 0.0003509865248279488, + "loss": 0.1878, + "step": 266340 + }, + { + "epoch": 11.03, + "grad_norm": 0.66796875, + "learning_rate": 0.0003509766037353809, + "loss": 0.2056, + "step": 266350 + }, + { + "epoch": 11.03, + "grad_norm": 1.75, + "learning_rate": 0.00035096668245278577, + "loss": 0.2095, + "step": 266360 + }, + { + "epoch": 11.03, + "grad_norm": 0.86328125, + "learning_rate": 0.00035095676098018244, + "loss": 0.2002, + "step": 266370 + }, + { + "epoch": 11.03, + "grad_norm": 1.5703125, + "learning_rate": 0.00035094683931758926, + "loss": 0.2074, + "step": 266380 + }, + { + "epoch": 11.03, + "grad_norm": 0.7734375, + "learning_rate": 0.00035093691746502506, + "loss": 0.1952, + "step": 266390 + }, + { + "epoch": 11.03, + "grad_norm": 0.54296875, + "learning_rate": 0.0003509269954225085, + "loss": 0.1565, + "step": 266400 + }, + { + "epoch": 11.03, + "grad_norm": 0.6484375, + "learning_rate": 0.00035091707319005814, + "loss": 0.1783, + "step": 266410 + }, + { + "epoch": 11.04, + "grad_norm": 1.1171875, + "learning_rate": 0.0003509071507676929, + "loss": 0.1733, + "step": 266420 + }, + { + "epoch": 11.04, + "grad_norm": 2.421875, + "learning_rate": 0.00035089722815543126, + "loss": 0.2062, + "step": 266430 + }, + { + "epoch": 11.04, + "grad_norm": 0.8828125, + "learning_rate": 0.00035088730535329184, + "loss": 0.1907, + "step": 266440 + }, + { + "epoch": 11.04, + "grad_norm": 0.9453125, + "learning_rate": 0.0003508773823612935, + "loss": 0.2086, + "step": 266450 + }, + { + "epoch": 11.04, + "grad_norm": 1.0234375, + "learning_rate": 0.0003508674591794548, + "loss": 0.1526, + "step": 266460 + }, + { + "epoch": 11.04, + "grad_norm": 0.48046875, + "learning_rate": 0.0003508575358077944, + "loss": 0.1587, + "step": 266470 + }, + { + "epoch": 11.04, + "grad_norm": 1.125, + "learning_rate": 0.000350847612246331, + "loss": 0.1775, + "step": 266480 + }, + { + "epoch": 11.04, + "grad_norm": 0.380859375, + "learning_rate": 0.0003508376884950833, + "loss": 0.1594, + "step": 266490 + }, + { + "epoch": 11.04, + "grad_norm": 1.65625, + "learning_rate": 0.00035082776455407, + "loss": 0.1856, + "step": 266500 + }, + { + "epoch": 11.04, + "grad_norm": 1.2109375, + "learning_rate": 0.00035081784042330973, + "loss": 0.173, + "step": 266510 + }, + { + "epoch": 11.04, + "grad_norm": 0.34375, + "learning_rate": 0.00035080791610282117, + "loss": 0.1795, + "step": 266520 + }, + { + "epoch": 11.04, + "grad_norm": 0.6015625, + "learning_rate": 0.0003507979915926229, + "loss": 0.1542, + "step": 266530 + }, + { + "epoch": 11.04, + "grad_norm": 0.671875, + "learning_rate": 0.00035078806689273385, + "loss": 0.256, + "step": 266540 + }, + { + "epoch": 11.04, + "grad_norm": 1.5703125, + "learning_rate": 0.00035077814200317245, + "loss": 0.2025, + "step": 266550 + }, + { + "epoch": 11.04, + "grad_norm": 0.640625, + "learning_rate": 0.0003507682169239574, + "loss": 0.1625, + "step": 266560 + }, + { + "epoch": 11.04, + "grad_norm": 0.419921875, + "learning_rate": 0.00035075829165510757, + "loss": 0.271, + "step": 266570 + }, + { + "epoch": 11.04, + "grad_norm": 1.3203125, + "learning_rate": 0.00035074836619664154, + "loss": 0.1561, + "step": 266580 + }, + { + "epoch": 11.04, + "grad_norm": 1.28125, + "learning_rate": 0.0003507384405485779, + "loss": 0.2212, + "step": 266590 + }, + { + "epoch": 11.04, + "grad_norm": 0.34375, + "learning_rate": 0.00035072851471093535, + "loss": 0.219, + "step": 266600 + }, + { + "epoch": 11.04, + "grad_norm": 1.0234375, + "learning_rate": 0.0003507185886837327, + "loss": 0.2247, + "step": 266610 + }, + { + "epoch": 11.04, + "grad_norm": 1.765625, + "learning_rate": 0.0003507086624669885, + "loss": 0.2081, + "step": 266620 + }, + { + "epoch": 11.04, + "grad_norm": 1.0625, + "learning_rate": 0.00035069873606072147, + "loss": 0.1855, + "step": 266630 + }, + { + "epoch": 11.04, + "grad_norm": 0.5390625, + "learning_rate": 0.0003506888094649503, + "loss": 0.2035, + "step": 266640 + }, + { + "epoch": 11.04, + "grad_norm": 0.953125, + "learning_rate": 0.0003506788826796936, + "loss": 0.1713, + "step": 266650 + }, + { + "epoch": 11.05, + "grad_norm": 0.37890625, + "learning_rate": 0.0003506689557049702, + "loss": 0.1629, + "step": 266660 + }, + { + "epoch": 11.05, + "grad_norm": 0.0966796875, + "learning_rate": 0.0003506590285407987, + "loss": 0.1702, + "step": 266670 + }, + { + "epoch": 11.05, + "grad_norm": 1.3125, + "learning_rate": 0.0003506491011871978, + "loss": 0.1723, + "step": 266680 + }, + { + "epoch": 11.05, + "grad_norm": 0.486328125, + "learning_rate": 0.0003506391736441861, + "loss": 0.193, + "step": 266690 + }, + { + "epoch": 11.05, + "grad_norm": 0.65234375, + "learning_rate": 0.00035062924591178234, + "loss": 0.2232, + "step": 266700 + }, + { + "epoch": 11.05, + "grad_norm": 0.87109375, + "learning_rate": 0.00035061931799000524, + "loss": 0.1802, + "step": 266710 + }, + { + "epoch": 11.05, + "grad_norm": 0.6796875, + "learning_rate": 0.0003506093898788734, + "loss": 0.214, + "step": 266720 + }, + { + "epoch": 11.05, + "grad_norm": 0.625, + "learning_rate": 0.0003505994615784056, + "loss": 0.2485, + "step": 266730 + }, + { + "epoch": 11.05, + "grad_norm": 0.0167236328125, + "learning_rate": 0.00035058953308862045, + "loss": 0.168, + "step": 266740 + }, + { + "epoch": 11.05, + "grad_norm": 0.765625, + "learning_rate": 0.00035057960440953675, + "loss": 0.2022, + "step": 266750 + }, + { + "epoch": 11.05, + "grad_norm": 1.484375, + "learning_rate": 0.000350569675541173, + "loss": 0.2036, + "step": 266760 + }, + { + "epoch": 11.05, + "grad_norm": 0.76171875, + "learning_rate": 0.00035055974648354795, + "loss": 0.1679, + "step": 266770 + }, + { + "epoch": 11.05, + "grad_norm": 1.9921875, + "learning_rate": 0.0003505498172366804, + "loss": 0.2079, + "step": 266780 + }, + { + "epoch": 11.05, + "grad_norm": 0.890625, + "learning_rate": 0.00035053988780058893, + "loss": 0.183, + "step": 266790 + }, + { + "epoch": 11.05, + "grad_norm": 2.90625, + "learning_rate": 0.00035052995817529223, + "loss": 0.224, + "step": 266800 + }, + { + "epoch": 11.05, + "grad_norm": 0.400390625, + "learning_rate": 0.000350520028360809, + "loss": 0.1827, + "step": 266810 + }, + { + "epoch": 11.05, + "grad_norm": 0.478515625, + "learning_rate": 0.000350510098357158, + "loss": 0.1483, + "step": 266820 + }, + { + "epoch": 11.05, + "grad_norm": 0.60546875, + "learning_rate": 0.00035050016816435783, + "loss": 0.1683, + "step": 266830 + }, + { + "epoch": 11.05, + "grad_norm": 0.890625, + "learning_rate": 0.0003504902377824271, + "loss": 0.1813, + "step": 266840 + }, + { + "epoch": 11.05, + "grad_norm": 0.9453125, + "learning_rate": 0.0003504803072113846, + "loss": 0.1725, + "step": 266850 + }, + { + "epoch": 11.05, + "grad_norm": 1.7421875, + "learning_rate": 0.0003504703764512491, + "loss": 0.2148, + "step": 266860 + }, + { + "epoch": 11.05, + "grad_norm": 2.90625, + "learning_rate": 0.00035046044550203915, + "loss": 0.199, + "step": 266870 + }, + { + "epoch": 11.05, + "grad_norm": 0.60546875, + "learning_rate": 0.00035045051436377356, + "loss": 0.2239, + "step": 266880 + }, + { + "epoch": 11.05, + "grad_norm": 1.234375, + "learning_rate": 0.00035044058303647087, + "loss": 0.2077, + "step": 266890 + }, + { + "epoch": 11.05, + "grad_norm": 1.6015625, + "learning_rate": 0.0003504306515201499, + "loss": 0.1875, + "step": 266900 + }, + { + "epoch": 11.06, + "grad_norm": 0.59765625, + "learning_rate": 0.0003504207198148293, + "loss": 0.1695, + "step": 266910 + }, + { + "epoch": 11.06, + "grad_norm": 0.498046875, + "learning_rate": 0.0003504107879205277, + "loss": 0.1993, + "step": 266920 + }, + { + "epoch": 11.06, + "grad_norm": 0.98046875, + "learning_rate": 0.0003504008558372639, + "loss": 0.2351, + "step": 266930 + }, + { + "epoch": 11.06, + "grad_norm": 1.46875, + "learning_rate": 0.0003503909235650565, + "loss": 0.214, + "step": 266940 + }, + { + "epoch": 11.06, + "grad_norm": 1.46875, + "learning_rate": 0.00035038099110392415, + "loss": 0.1961, + "step": 266950 + }, + { + "epoch": 11.06, + "grad_norm": 0.61328125, + "learning_rate": 0.0003503710584538857, + "loss": 0.1866, + "step": 266960 + }, + { + "epoch": 11.06, + "grad_norm": 0.458984375, + "learning_rate": 0.0003503611256149598, + "loss": 0.2002, + "step": 266970 + }, + { + "epoch": 11.06, + "grad_norm": 1.2421875, + "learning_rate": 0.00035035119258716496, + "loss": 0.1991, + "step": 266980 + }, + { + "epoch": 11.06, + "grad_norm": 1.09375, + "learning_rate": 0.0003503412593705201, + "loss": 0.2412, + "step": 266990 + }, + { + "epoch": 11.06, + "grad_norm": 0.26953125, + "learning_rate": 0.0003503313259650438, + "loss": 0.1959, + "step": 267000 + }, + { + "epoch": 11.06, + "grad_norm": 0.5703125, + "learning_rate": 0.0003503213923707548, + "loss": 0.2248, + "step": 267010 + }, + { + "epoch": 11.06, + "grad_norm": 0.74609375, + "learning_rate": 0.00035031145858767176, + "loss": 0.1729, + "step": 267020 + }, + { + "epoch": 11.06, + "grad_norm": 1.296875, + "learning_rate": 0.00035030152461581343, + "loss": 0.1615, + "step": 267030 + }, + { + "epoch": 11.06, + "grad_norm": 1.25, + "learning_rate": 0.00035029159045519843, + "loss": 0.2465, + "step": 267040 + }, + { + "epoch": 11.06, + "grad_norm": 0.7734375, + "learning_rate": 0.0003502816561058455, + "loss": 0.2144, + "step": 267050 + }, + { + "epoch": 11.06, + "grad_norm": 0.455078125, + "learning_rate": 0.00035027172156777334, + "loss": 0.1704, + "step": 267060 + }, + { + "epoch": 11.06, + "grad_norm": 1.015625, + "learning_rate": 0.0003502617868410006, + "loss": 0.1612, + "step": 267070 + }, + { + "epoch": 11.06, + "grad_norm": 0.875, + "learning_rate": 0.00035025185192554594, + "loss": 0.2028, + "step": 267080 + }, + { + "epoch": 11.06, + "grad_norm": 1.1875, + "learning_rate": 0.0003502419168214282, + "loss": 0.1731, + "step": 267090 + }, + { + "epoch": 11.06, + "grad_norm": 0.47265625, + "learning_rate": 0.00035023198152866596, + "loss": 0.1444, + "step": 267100 + }, + { + "epoch": 11.06, + "grad_norm": 0.46875, + "learning_rate": 0.000350222046047278, + "loss": 0.168, + "step": 267110 + }, + { + "epoch": 11.06, + "grad_norm": 1.5625, + "learning_rate": 0.00035021211037728297, + "loss": 0.2245, + "step": 267120 + }, + { + "epoch": 11.06, + "grad_norm": 1.0703125, + "learning_rate": 0.0003502021745186995, + "loss": 0.2062, + "step": 267130 + }, + { + "epoch": 11.06, + "grad_norm": 1.328125, + "learning_rate": 0.0003501922384715464, + "loss": 0.175, + "step": 267140 + }, + { + "epoch": 11.07, + "grad_norm": 0.57421875, + "learning_rate": 0.00035018230223584235, + "loss": 0.1936, + "step": 267150 + }, + { + "epoch": 11.07, + "grad_norm": 0.431640625, + "learning_rate": 0.000350172365811606, + "loss": 0.2359, + "step": 267160 + }, + { + "epoch": 11.07, + "grad_norm": 0.79296875, + "learning_rate": 0.0003501624291988561, + "loss": 0.1977, + "step": 267170 + }, + { + "epoch": 11.07, + "grad_norm": 1.03125, + "learning_rate": 0.0003501524923976113, + "loss": 0.2085, + "step": 267180 + }, + { + "epoch": 11.07, + "grad_norm": 0.62890625, + "learning_rate": 0.0003501425554078903, + "loss": 0.1725, + "step": 267190 + }, + { + "epoch": 11.07, + "grad_norm": 2.3125, + "learning_rate": 0.0003501326182297119, + "loss": 0.1789, + "step": 267200 + }, + { + "epoch": 11.07, + "grad_norm": 0.5546875, + "learning_rate": 0.00035012268086309466, + "loss": 0.1985, + "step": 267210 + }, + { + "epoch": 11.07, + "grad_norm": 0.9921875, + "learning_rate": 0.00035011274330805744, + "loss": 0.2024, + "step": 267220 + }, + { + "epoch": 11.07, + "grad_norm": 0.30078125, + "learning_rate": 0.0003501028055646188, + "loss": 0.1693, + "step": 267230 + }, + { + "epoch": 11.07, + "grad_norm": 0.84765625, + "learning_rate": 0.00035009286763279735, + "loss": 0.2004, + "step": 267240 + }, + { + "epoch": 11.07, + "grad_norm": 0.95703125, + "learning_rate": 0.0003500829295126121, + "loss": 0.2352, + "step": 267250 + }, + { + "epoch": 11.07, + "grad_norm": 0.69921875, + "learning_rate": 0.00035007299120408153, + "loss": 0.1824, + "step": 267260 + }, + { + "epoch": 11.07, + "grad_norm": 0.859375, + "learning_rate": 0.0003500630527072244, + "loss": 0.1999, + "step": 267270 + }, + { + "epoch": 11.07, + "grad_norm": 0.6796875, + "learning_rate": 0.00035005311402205944, + "loss": 0.2267, + "step": 267280 + }, + { + "epoch": 11.07, + "grad_norm": 0.9296875, + "learning_rate": 0.0003500431751486053, + "loss": 0.2489, + "step": 267290 + }, + { + "epoch": 11.07, + "grad_norm": 0.6953125, + "learning_rate": 0.0003500332360868807, + "loss": 0.2087, + "step": 267300 + }, + { + "epoch": 11.07, + "grad_norm": 0.6328125, + "learning_rate": 0.00035002329683690443, + "loss": 0.216, + "step": 267310 + }, + { + "epoch": 11.07, + "grad_norm": 0.6484375, + "learning_rate": 0.000350013357398695, + "loss": 0.1894, + "step": 267320 + }, + { + "epoch": 11.07, + "grad_norm": 0.7734375, + "learning_rate": 0.0003500034177722713, + "loss": 0.1978, + "step": 267330 + }, + { + "epoch": 11.07, + "grad_norm": 1.953125, + "learning_rate": 0.0003499934779576519, + "loss": 0.2123, + "step": 267340 + }, + { + "epoch": 11.07, + "grad_norm": 0.62109375, + "learning_rate": 0.00034998353795485563, + "loss": 0.1747, + "step": 267350 + }, + { + "epoch": 11.07, + "grad_norm": 1.2734375, + "learning_rate": 0.00034997359776390115, + "loss": 0.1704, + "step": 267360 + }, + { + "epoch": 11.07, + "grad_norm": 0.6484375, + "learning_rate": 0.00034996365738480714, + "loss": 0.1515, + "step": 267370 + }, + { + "epoch": 11.07, + "grad_norm": 1.0, + "learning_rate": 0.0003499537168175923, + "loss": 0.2251, + "step": 267380 + }, + { + "epoch": 11.08, + "grad_norm": 0.625, + "learning_rate": 0.0003499437760622754, + "loss": 0.2373, + "step": 267390 + }, + { + "epoch": 11.08, + "grad_norm": 1.2734375, + "learning_rate": 0.000349933835118875, + "loss": 0.2359, + "step": 267400 + }, + { + "epoch": 11.08, + "grad_norm": 0.85546875, + "learning_rate": 0.0003499238939874101, + "loss": 0.2106, + "step": 267410 + }, + { + "epoch": 11.08, + "grad_norm": 0.79296875, + "learning_rate": 0.00034991395266789903, + "loss": 0.209, + "step": 267420 + }, + { + "epoch": 11.08, + "grad_norm": 0.8046875, + "learning_rate": 0.0003499040111603608, + "loss": 0.1685, + "step": 267430 + }, + { + "epoch": 11.08, + "grad_norm": 1.0625, + "learning_rate": 0.00034989406946481406, + "loss": 0.2022, + "step": 267440 + }, + { + "epoch": 11.08, + "grad_norm": 1.25, + "learning_rate": 0.00034988412758127737, + "loss": 0.1922, + "step": 267450 + }, + { + "epoch": 11.08, + "grad_norm": 0.26953125, + "learning_rate": 0.0003498741855097695, + "loss": 0.1623, + "step": 267460 + }, + { + "epoch": 11.08, + "grad_norm": 0.70703125, + "learning_rate": 0.00034986424325030935, + "loss": 0.2072, + "step": 267470 + }, + { + "epoch": 11.08, + "grad_norm": 0.59375, + "learning_rate": 0.0003498543008029153, + "loss": 0.1688, + "step": 267480 + }, + { + "epoch": 11.08, + "grad_norm": 0.6015625, + "learning_rate": 0.0003498443581676064, + "loss": 0.1512, + "step": 267490 + }, + { + "epoch": 11.08, + "grad_norm": 0.96875, + "learning_rate": 0.00034983441534440106, + "loss": 0.1599, + "step": 267500 + }, + { + "epoch": 11.08, + "grad_norm": 1.1328125, + "learning_rate": 0.0003498244723333183, + "loss": 0.2161, + "step": 267510 + }, + { + "epoch": 11.08, + "grad_norm": 0.0002460479736328125, + "learning_rate": 0.00034981452913437655, + "loss": 0.1546, + "step": 267520 + }, + { + "epoch": 11.08, + "grad_norm": 0.84765625, + "learning_rate": 0.0003498045857475945, + "loss": 0.179, + "step": 267530 + }, + { + "epoch": 11.08, + "grad_norm": 0.298828125, + "learning_rate": 0.0003497946421729913, + "loss": 0.2043, + "step": 267540 + }, + { + "epoch": 11.08, + "grad_norm": 0.625, + "learning_rate": 0.00034978469841058523, + "loss": 0.1873, + "step": 267550 + }, + { + "epoch": 11.08, + "grad_norm": 0.640625, + "learning_rate": 0.00034977475446039505, + "loss": 0.1826, + "step": 267560 + }, + { + "epoch": 11.08, + "grad_norm": 0.59375, + "learning_rate": 0.00034976481032243965, + "loss": 0.1548, + "step": 267570 + }, + { + "epoch": 11.08, + "grad_norm": 0.7109375, + "learning_rate": 0.0003497548659967376, + "loss": 0.2038, + "step": 267580 + }, + { + "epoch": 11.08, + "grad_norm": 0.81640625, + "learning_rate": 0.00034974492148330766, + "loss": 0.1864, + "step": 267590 + }, + { + "epoch": 11.08, + "grad_norm": 0.68359375, + "learning_rate": 0.00034973497678216864, + "loss": 0.1676, + "step": 267600 + }, + { + "epoch": 11.08, + "grad_norm": 0.80078125, + "learning_rate": 0.00034972503189333905, + "loss": 0.1712, + "step": 267610 + }, + { + "epoch": 11.08, + "grad_norm": 0.51171875, + "learning_rate": 0.0003497150868168378, + "loss": 0.2033, + "step": 267620 + }, + { + "epoch": 11.09, + "grad_norm": 2.40625, + "learning_rate": 0.00034970514155268343, + "loss": 0.1966, + "step": 267630 + }, + { + "epoch": 11.09, + "grad_norm": 0.3203125, + "learning_rate": 0.0003496951961008948, + "loss": 0.193, + "step": 267640 + }, + { + "epoch": 11.09, + "grad_norm": 2.625, + "learning_rate": 0.0003496852504614907, + "loss": 0.177, + "step": 267650 + }, + { + "epoch": 11.09, + "grad_norm": 1.109375, + "learning_rate": 0.00034967530463448954, + "loss": 0.1827, + "step": 267660 + }, + { + "epoch": 11.09, + "grad_norm": 0.63671875, + "learning_rate": 0.00034966535861991033, + "loss": 0.187, + "step": 267670 + }, + { + "epoch": 11.09, + "grad_norm": 0.7421875, + "learning_rate": 0.00034965541241777163, + "loss": 0.1997, + "step": 267680 + }, + { + "epoch": 11.09, + "grad_norm": 0.1962890625, + "learning_rate": 0.00034964546602809214, + "loss": 0.2083, + "step": 267690 + }, + { + "epoch": 11.09, + "grad_norm": 0.90234375, + "learning_rate": 0.00034963551945089077, + "loss": 0.1968, + "step": 267700 + }, + { + "epoch": 11.09, + "grad_norm": 0.337890625, + "learning_rate": 0.0003496255726861861, + "loss": 0.1871, + "step": 267710 + }, + { + "epoch": 11.09, + "grad_norm": 0.515625, + "learning_rate": 0.0003496156257339968, + "loss": 0.162, + "step": 267720 + }, + { + "epoch": 11.09, + "grad_norm": 1.1015625, + "learning_rate": 0.0003496056785943417, + "loss": 0.2484, + "step": 267730 + }, + { + "epoch": 11.09, + "grad_norm": 0.703125, + "learning_rate": 0.00034959573126723943, + "loss": 0.1714, + "step": 267740 + }, + { + "epoch": 11.09, + "grad_norm": 0.62109375, + "learning_rate": 0.00034958578375270875, + "loss": 0.2492, + "step": 267750 + }, + { + "epoch": 11.09, + "grad_norm": 0.82421875, + "learning_rate": 0.00034957583605076836, + "loss": 0.174, + "step": 267760 + }, + { + "epoch": 11.09, + "grad_norm": 1.1796875, + "learning_rate": 0.00034956588816143706, + "loss": 0.1835, + "step": 267770 + }, + { + "epoch": 11.09, + "grad_norm": 1.1875, + "learning_rate": 0.0003495559400847335, + "loss": 0.1758, + "step": 267780 + }, + { + "epoch": 11.09, + "grad_norm": 0.404296875, + "learning_rate": 0.0003495459918206764, + "loss": 0.1437, + "step": 267790 + }, + { + "epoch": 11.09, + "grad_norm": 0.8125, + "learning_rate": 0.00034953604336928446, + "loss": 0.1733, + "step": 267800 + }, + { + "epoch": 11.09, + "grad_norm": 1.34375, + "learning_rate": 0.0003495260947305765, + "loss": 0.177, + "step": 267810 + }, + { + "epoch": 11.09, + "grad_norm": 0.9375, + "learning_rate": 0.00034951614590457116, + "loss": 0.2065, + "step": 267820 + }, + { + "epoch": 11.09, + "grad_norm": 0.5078125, + "learning_rate": 0.00034950619689128715, + "loss": 0.1789, + "step": 267830 + }, + { + "epoch": 11.09, + "grad_norm": 0.5859375, + "learning_rate": 0.0003494962476907432, + "loss": 0.1663, + "step": 267840 + }, + { + "epoch": 11.09, + "grad_norm": 0.48046875, + "learning_rate": 0.0003494862983029581, + "loss": 0.1837, + "step": 267850 + }, + { + "epoch": 11.09, + "grad_norm": 0.796875, + "learning_rate": 0.00034947634872795053, + "loss": 0.2082, + "step": 267860 + }, + { + "epoch": 11.1, + "grad_norm": 1.1171875, + "learning_rate": 0.00034946639896573923, + "loss": 0.1797, + "step": 267870 + }, + { + "epoch": 11.1, + "grad_norm": 0.50390625, + "learning_rate": 0.0003494564490163429, + "loss": 0.2355, + "step": 267880 + }, + { + "epoch": 11.1, + "grad_norm": 0.69921875, + "learning_rate": 0.00034944649887978026, + "loss": 0.207, + "step": 267890 + }, + { + "epoch": 11.1, + "grad_norm": 0.8359375, + "learning_rate": 0.00034943654855607004, + "loss": 0.171, + "step": 267900 + }, + { + "epoch": 11.1, + "grad_norm": 0.7890625, + "learning_rate": 0.00034942659804523097, + "loss": 0.2101, + "step": 267910 + }, + { + "epoch": 11.1, + "grad_norm": 1.0390625, + "learning_rate": 0.00034941664734728175, + "loss": 0.1771, + "step": 267920 + }, + { + "epoch": 11.1, + "grad_norm": 0.6875, + "learning_rate": 0.0003494066964622412, + "loss": 0.2037, + "step": 267930 + }, + { + "epoch": 11.1, + "grad_norm": 0.6875, + "learning_rate": 0.000349396745390128, + "loss": 0.1934, + "step": 267940 + }, + { + "epoch": 11.1, + "grad_norm": 1.6171875, + "learning_rate": 0.00034938679413096073, + "loss": 0.2023, + "step": 267950 + }, + { + "epoch": 11.1, + "grad_norm": 0.625, + "learning_rate": 0.0003493768426847584, + "loss": 0.2203, + "step": 267960 + }, + { + "epoch": 11.1, + "grad_norm": 0.7109375, + "learning_rate": 0.0003493668910515395, + "loss": 0.1989, + "step": 267970 + }, + { + "epoch": 11.1, + "grad_norm": 0.90234375, + "learning_rate": 0.00034935693923132286, + "loss": 0.1608, + "step": 267980 + }, + { + "epoch": 11.1, + "grad_norm": 0.953125, + "learning_rate": 0.00034934698722412715, + "loss": 0.2011, + "step": 267990 + }, + { + "epoch": 11.1, + "grad_norm": 0.921875, + "learning_rate": 0.00034933703502997124, + "loss": 0.2051, + "step": 268000 + }, + { + "epoch": 11.1, + "grad_norm": 0.3515625, + "learning_rate": 0.00034932708264887366, + "loss": 0.1579, + "step": 268010 + }, + { + "epoch": 11.1, + "grad_norm": 0.625, + "learning_rate": 0.0003493171300808533, + "loss": 0.219, + "step": 268020 + }, + { + "epoch": 11.1, + "grad_norm": 0.99609375, + "learning_rate": 0.00034930717732592877, + "loss": 0.1734, + "step": 268030 + }, + { + "epoch": 11.1, + "grad_norm": 0.96875, + "learning_rate": 0.00034929722438411886, + "loss": 0.1674, + "step": 268040 + }, + { + "epoch": 11.1, + "grad_norm": 0.859375, + "learning_rate": 0.00034928727125544236, + "loss": 0.2017, + "step": 268050 + }, + { + "epoch": 11.1, + "grad_norm": 1.5625, + "learning_rate": 0.0003492773179399179, + "loss": 0.2245, + "step": 268060 + }, + { + "epoch": 11.1, + "grad_norm": 0.77734375, + "learning_rate": 0.0003492673644375642, + "loss": 0.1997, + "step": 268070 + }, + { + "epoch": 11.1, + "grad_norm": 0.8515625, + "learning_rate": 0.00034925741074840014, + "loss": 0.1906, + "step": 268080 + }, + { + "epoch": 11.1, + "grad_norm": 0.62109375, + "learning_rate": 0.0003492474568724443, + "loss": 0.2305, + "step": 268090 + }, + { + "epoch": 11.1, + "grad_norm": 1.09375, + "learning_rate": 0.0003492375028097155, + "loss": 0.1831, + "step": 268100 + }, + { + "epoch": 11.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00034922754856023236, + "loss": 0.1938, + "step": 268110 + }, + { + "epoch": 11.11, + "grad_norm": 1.421875, + "learning_rate": 0.00034921759412401374, + "loss": 0.2046, + "step": 268120 + }, + { + "epoch": 11.11, + "grad_norm": 0.55078125, + "learning_rate": 0.00034920763950107826, + "loss": 0.249, + "step": 268130 + }, + { + "epoch": 11.11, + "grad_norm": 1.234375, + "learning_rate": 0.00034919768469144486, + "loss": 0.2042, + "step": 268140 + }, + { + "epoch": 11.11, + "grad_norm": 0.8046875, + "learning_rate": 0.000349187729695132, + "loss": 0.2288, + "step": 268150 + }, + { + "epoch": 11.11, + "grad_norm": 1.6640625, + "learning_rate": 0.0003491777745121586, + "loss": 0.1953, + "step": 268160 + }, + { + "epoch": 11.11, + "grad_norm": 0.3984375, + "learning_rate": 0.00034916781914254324, + "loss": 0.1755, + "step": 268170 + }, + { + "epoch": 11.11, + "grad_norm": 0.640625, + "learning_rate": 0.00034915786358630485, + "loss": 0.1933, + "step": 268180 + }, + { + "epoch": 11.11, + "grad_norm": 1.3125, + "learning_rate": 0.00034914790784346215, + "loss": 0.2249, + "step": 268190 + }, + { + "epoch": 11.11, + "grad_norm": 0.52734375, + "learning_rate": 0.0003491379519140336, + "loss": 0.2024, + "step": 268200 + }, + { + "epoch": 11.11, + "grad_norm": 0.6015625, + "learning_rate": 0.00034912799579803823, + "loss": 0.2087, + "step": 268210 + }, + { + "epoch": 11.11, + "grad_norm": 1.578125, + "learning_rate": 0.00034911803949549475, + "loss": 0.2239, + "step": 268220 + }, + { + "epoch": 11.11, + "grad_norm": 0.349609375, + "learning_rate": 0.0003491080830064217, + "loss": 0.2083, + "step": 268230 + }, + { + "epoch": 11.11, + "grad_norm": 0.396484375, + "learning_rate": 0.000349098126330838, + "loss": 0.1884, + "step": 268240 + }, + { + "epoch": 11.11, + "grad_norm": 1.0234375, + "learning_rate": 0.0003490881694687623, + "loss": 0.2039, + "step": 268250 + }, + { + "epoch": 11.11, + "grad_norm": 1.046875, + "learning_rate": 0.00034907821242021337, + "loss": 0.2262, + "step": 268260 + }, + { + "epoch": 11.11, + "grad_norm": 0.890625, + "learning_rate": 0.00034906825518521, + "loss": 0.2335, + "step": 268270 + }, + { + "epoch": 11.11, + "grad_norm": 1.0, + "learning_rate": 0.0003490582977637708, + "loss": 0.179, + "step": 268280 + }, + { + "epoch": 11.11, + "grad_norm": 1.265625, + "learning_rate": 0.0003490483401559146, + "loss": 0.2107, + "step": 268290 + }, + { + "epoch": 11.11, + "grad_norm": 0.640625, + "learning_rate": 0.0003490383823616602, + "loss": 0.1949, + "step": 268300 + }, + { + "epoch": 11.11, + "grad_norm": 0.85546875, + "learning_rate": 0.0003490284243810262, + "loss": 0.2022, + "step": 268310 + }, + { + "epoch": 11.11, + "grad_norm": 0.578125, + "learning_rate": 0.00034901846621403134, + "loss": 0.2626, + "step": 268320 + }, + { + "epoch": 11.11, + "grad_norm": 1.015625, + "learning_rate": 0.00034900850786069446, + "loss": 0.1489, + "step": 268330 + }, + { + "epoch": 11.11, + "grad_norm": 0.7734375, + "learning_rate": 0.00034899854932103424, + "loss": 0.1603, + "step": 268340 + }, + { + "epoch": 11.12, + "grad_norm": 0.44140625, + "learning_rate": 0.0003489885905950695, + "loss": 0.1892, + "step": 268350 + }, + { + "epoch": 11.12, + "grad_norm": 0.85546875, + "learning_rate": 0.0003489786316828189, + "loss": 0.2036, + "step": 268360 + }, + { + "epoch": 11.12, + "grad_norm": 2.265625, + "learning_rate": 0.0003489686725843012, + "loss": 0.1965, + "step": 268370 + }, + { + "epoch": 11.12, + "grad_norm": 0.57421875, + "learning_rate": 0.0003489587132995352, + "loss": 0.2139, + "step": 268380 + }, + { + "epoch": 11.12, + "grad_norm": 0.60546875, + "learning_rate": 0.00034894875382853943, + "loss": 0.1518, + "step": 268390 + }, + { + "epoch": 11.12, + "grad_norm": 0.70703125, + "learning_rate": 0.0003489387941713329, + "loss": 0.2136, + "step": 268400 + }, + { + "epoch": 11.12, + "grad_norm": 0.68359375, + "learning_rate": 0.00034892883432793425, + "loss": 0.2567, + "step": 268410 + }, + { + "epoch": 11.12, + "grad_norm": 0.56640625, + "learning_rate": 0.00034891887429836215, + "loss": 0.2233, + "step": 268420 + }, + { + "epoch": 11.12, + "grad_norm": 0.59375, + "learning_rate": 0.0003489089140826355, + "loss": 0.215, + "step": 268430 + }, + { + "epoch": 11.12, + "grad_norm": 0.87890625, + "learning_rate": 0.00034889895368077296, + "loss": 0.1757, + "step": 268440 + }, + { + "epoch": 11.12, + "grad_norm": 1.0390625, + "learning_rate": 0.0003488889930927932, + "loss": 0.1737, + "step": 268450 + }, + { + "epoch": 11.12, + "grad_norm": 0.87109375, + "learning_rate": 0.0003488790323187151, + "loss": 0.1622, + "step": 268460 + }, + { + "epoch": 11.12, + "grad_norm": 0.98046875, + "learning_rate": 0.0003488690713585572, + "loss": 0.2053, + "step": 268470 + }, + { + "epoch": 11.12, + "grad_norm": 0.96875, + "learning_rate": 0.0003488591102123386, + "loss": 0.2176, + "step": 268480 + }, + { + "epoch": 11.12, + "grad_norm": 1.921875, + "learning_rate": 0.0003488491488800777, + "loss": 0.1585, + "step": 268490 + }, + { + "epoch": 11.12, + "grad_norm": 0.6640625, + "learning_rate": 0.00034883918736179333, + "loss": 0.24, + "step": 268500 + }, + { + "epoch": 11.12, + "grad_norm": 1.1640625, + "learning_rate": 0.0003488292256575044, + "loss": 0.1426, + "step": 268510 + }, + { + "epoch": 11.12, + "grad_norm": 1.1640625, + "learning_rate": 0.0003488192637672294, + "loss": 0.2016, + "step": 268520 + }, + { + "epoch": 11.12, + "grad_norm": 0.9453125, + "learning_rate": 0.00034880930169098735, + "loss": 0.2115, + "step": 268530 + }, + { + "epoch": 11.12, + "grad_norm": 0.95703125, + "learning_rate": 0.0003487993394287968, + "loss": 0.2023, + "step": 268540 + }, + { + "epoch": 11.12, + "grad_norm": 1.0703125, + "learning_rate": 0.0003487893769806766, + "loss": 0.2068, + "step": 268550 + }, + { + "epoch": 11.12, + "grad_norm": 1.2890625, + "learning_rate": 0.00034877941434664544, + "loss": 0.2113, + "step": 268560 + }, + { + "epoch": 11.12, + "grad_norm": 0.318359375, + "learning_rate": 0.00034876945152672207, + "loss": 0.231, + "step": 268570 + }, + { + "epoch": 11.12, + "grad_norm": 1.0234375, + "learning_rate": 0.0003487594885209253, + "loss": 0.1855, + "step": 268580 + }, + { + "epoch": 11.12, + "grad_norm": 0.361328125, + "learning_rate": 0.0003487495253292738, + "loss": 0.223, + "step": 268590 + }, + { + "epoch": 11.13, + "grad_norm": 0.77734375, + "learning_rate": 0.00034873956195178636, + "loss": 0.1867, + "step": 268600 + }, + { + "epoch": 11.13, + "grad_norm": 1.6796875, + "learning_rate": 0.00034872959838848176, + "loss": 0.1808, + "step": 268610 + }, + { + "epoch": 11.13, + "grad_norm": 0.54296875, + "learning_rate": 0.00034871963463937875, + "loss": 0.1492, + "step": 268620 + }, + { + "epoch": 11.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00034870967070449595, + "loss": 0.2218, + "step": 268630 + }, + { + "epoch": 11.13, + "grad_norm": 1.46875, + "learning_rate": 0.0003486997065838523, + "loss": 0.1903, + "step": 268640 + }, + { + "epoch": 11.13, + "grad_norm": 1.2109375, + "learning_rate": 0.00034868974227746633, + "loss": 0.2152, + "step": 268650 + }, + { + "epoch": 11.13, + "grad_norm": 0.796875, + "learning_rate": 0.000348679777785357, + "loss": 0.1825, + "step": 268660 + }, + { + "epoch": 11.13, + "grad_norm": 0.75, + "learning_rate": 0.00034866981310754307, + "loss": 0.1971, + "step": 268670 + }, + { + "epoch": 11.13, + "grad_norm": 0.4453125, + "learning_rate": 0.00034865984824404306, + "loss": 0.1758, + "step": 268680 + }, + { + "epoch": 11.13, + "grad_norm": 1.65625, + "learning_rate": 0.000348649883194876, + "loss": 0.209, + "step": 268690 + }, + { + "epoch": 11.13, + "grad_norm": 0.59765625, + "learning_rate": 0.0003486399179600605, + "loss": 0.1208, + "step": 268700 + }, + { + "epoch": 11.13, + "grad_norm": 0.5, + "learning_rate": 0.0003486299525396152, + "loss": 0.1597, + "step": 268710 + }, + { + "epoch": 11.13, + "grad_norm": 0.4921875, + "learning_rate": 0.0003486199869335591, + "loss": 0.213, + "step": 268720 + }, + { + "epoch": 11.13, + "grad_norm": 0.55859375, + "learning_rate": 0.00034861002114191077, + "loss": 0.1668, + "step": 268730 + }, + { + "epoch": 11.13, + "grad_norm": 0.94140625, + "learning_rate": 0.0003486000551646891, + "loss": 0.1538, + "step": 268740 + }, + { + "epoch": 11.13, + "grad_norm": 0.67578125, + "learning_rate": 0.00034859008900191274, + "loss": 0.1904, + "step": 268750 + }, + { + "epoch": 11.13, + "grad_norm": 0.70703125, + "learning_rate": 0.0003485801226536004, + "loss": 0.1739, + "step": 268760 + }, + { + "epoch": 11.13, + "grad_norm": 0.44140625, + "learning_rate": 0.000348570156119771, + "loss": 0.164, + "step": 268770 + }, + { + "epoch": 11.13, + "grad_norm": 0.220703125, + "learning_rate": 0.0003485601894004433, + "loss": 0.1706, + "step": 268780 + }, + { + "epoch": 11.13, + "grad_norm": 0.92578125, + "learning_rate": 0.00034855022249563573, + "loss": 0.1684, + "step": 268790 + }, + { + "epoch": 11.13, + "grad_norm": 1.3203125, + "learning_rate": 0.0003485402554053675, + "loss": 0.2436, + "step": 268800 + }, + { + "epoch": 11.13, + "grad_norm": 0.796875, + "learning_rate": 0.000348530288129657, + "loss": 0.1922, + "step": 268810 + }, + { + "epoch": 11.13, + "grad_norm": 4.46875, + "learning_rate": 0.00034852032066852326, + "loss": 0.1758, + "step": 268820 + }, + { + "epoch": 11.13, + "grad_norm": 0.96875, + "learning_rate": 0.00034851035302198486, + "loss": 0.1958, + "step": 268830 + }, + { + "epoch": 11.14, + "grad_norm": 0.380859375, + "learning_rate": 0.0003485003851900606, + "loss": 0.1851, + "step": 268840 + }, + { + "epoch": 11.14, + "grad_norm": 0.609375, + "learning_rate": 0.0003484904171727692, + "loss": 0.1798, + "step": 268850 + }, + { + "epoch": 11.14, + "grad_norm": 0.6640625, + "learning_rate": 0.0003484804489701296, + "loss": 0.2079, + "step": 268860 + }, + { + "epoch": 11.14, + "grad_norm": 0.474609375, + "learning_rate": 0.00034847048058216036, + "loss": 0.1649, + "step": 268870 + }, + { + "epoch": 11.14, + "grad_norm": 0.79296875, + "learning_rate": 0.0003484605120088803, + "loss": 0.1831, + "step": 268880 + }, + { + "epoch": 11.14, + "grad_norm": 0.49609375, + "learning_rate": 0.0003484505432503082, + "loss": 0.1959, + "step": 268890 + }, + { + "epoch": 11.14, + "grad_norm": 1.625, + "learning_rate": 0.0003484405743064628, + "loss": 0.1753, + "step": 268900 + }, + { + "epoch": 11.14, + "grad_norm": 0.59765625, + "learning_rate": 0.0003484306051773629, + "loss": 0.2171, + "step": 268910 + }, + { + "epoch": 11.14, + "grad_norm": 0.859375, + "learning_rate": 0.0003484206358630272, + "loss": 0.1814, + "step": 268920 + }, + { + "epoch": 11.14, + "grad_norm": 1.296875, + "learning_rate": 0.0003484106663634745, + "loss": 0.2098, + "step": 268930 + }, + { + "epoch": 11.14, + "grad_norm": 0.66015625, + "learning_rate": 0.0003484006966787236, + "loss": 0.2143, + "step": 268940 + }, + { + "epoch": 11.14, + "grad_norm": 0.79296875, + "learning_rate": 0.0003483907268087931, + "loss": 0.1788, + "step": 268950 + }, + { + "epoch": 11.14, + "grad_norm": 1.8828125, + "learning_rate": 0.00034838075675370204, + "loss": 0.1923, + "step": 268960 + }, + { + "epoch": 11.14, + "grad_norm": 0.52734375, + "learning_rate": 0.0003483707865134689, + "loss": 0.1921, + "step": 268970 + }, + { + "epoch": 11.14, + "grad_norm": 0.6796875, + "learning_rate": 0.0003483608160881126, + "loss": 0.2213, + "step": 268980 + }, + { + "epoch": 11.14, + "grad_norm": 1.0234375, + "learning_rate": 0.0003483508454776519, + "loss": 0.2306, + "step": 268990 + }, + { + "epoch": 11.14, + "grad_norm": 0.44921875, + "learning_rate": 0.00034834087468210543, + "loss": 0.1806, + "step": 269000 + }, + { + "epoch": 11.14, + "grad_norm": 0.765625, + "learning_rate": 0.00034833090370149216, + "loss": 0.1632, + "step": 269010 + }, + { + "epoch": 11.14, + "grad_norm": 1.40625, + "learning_rate": 0.00034832093253583067, + "loss": 0.2007, + "step": 269020 + }, + { + "epoch": 11.14, + "grad_norm": 0.703125, + "learning_rate": 0.00034831096118513984, + "loss": 0.1673, + "step": 269030 + }, + { + "epoch": 11.14, + "grad_norm": 0.54296875, + "learning_rate": 0.00034830098964943847, + "loss": 0.2554, + "step": 269040 + }, + { + "epoch": 11.14, + "grad_norm": 0.63671875, + "learning_rate": 0.0003482910179287451, + "loss": 0.2088, + "step": 269050 + }, + { + "epoch": 11.14, + "grad_norm": 0.94921875, + "learning_rate": 0.0003482810460230788, + "loss": 0.2377, + "step": 269060 + }, + { + "epoch": 11.14, + "grad_norm": 0.65234375, + "learning_rate": 0.0003482710739324581, + "loss": 0.2055, + "step": 269070 + }, + { + "epoch": 11.15, + "grad_norm": 0.78125, + "learning_rate": 0.0003482611016569018, + "loss": 0.2109, + "step": 269080 + }, + { + "epoch": 11.15, + "grad_norm": 0.63671875, + "learning_rate": 0.00034825112919642883, + "loss": 0.2163, + "step": 269090 + }, + { + "epoch": 11.15, + "grad_norm": 0.6796875, + "learning_rate": 0.00034824115655105776, + "loss": 0.1804, + "step": 269100 + }, + { + "epoch": 11.15, + "grad_norm": 0.6796875, + "learning_rate": 0.00034823118372080745, + "loss": 0.2547, + "step": 269110 + }, + { + "epoch": 11.15, + "grad_norm": 1.9375, + "learning_rate": 0.0003482212107056967, + "loss": 0.1877, + "step": 269120 + }, + { + "epoch": 11.15, + "grad_norm": 1.0234375, + "learning_rate": 0.00034821123750574426, + "loss": 0.1708, + "step": 269130 + }, + { + "epoch": 11.15, + "grad_norm": 0.423828125, + "learning_rate": 0.0003482012641209689, + "loss": 0.2001, + "step": 269140 + }, + { + "epoch": 11.15, + "grad_norm": 0.71484375, + "learning_rate": 0.0003481912905513892, + "loss": 0.1747, + "step": 269150 + }, + { + "epoch": 11.15, + "grad_norm": 1.03125, + "learning_rate": 0.00034818131679702425, + "loss": 0.1636, + "step": 269160 + }, + { + "epoch": 11.15, + "grad_norm": 0.92578125, + "learning_rate": 0.00034817134285789267, + "loss": 0.1862, + "step": 269170 + }, + { + "epoch": 11.15, + "grad_norm": 0.57421875, + "learning_rate": 0.0003481613687340131, + "loss": 0.19, + "step": 269180 + }, + { + "epoch": 11.15, + "grad_norm": 0.875, + "learning_rate": 0.0003481513944254045, + "loss": 0.2401, + "step": 269190 + }, + { + "epoch": 11.15, + "grad_norm": 0.48828125, + "learning_rate": 0.00034814141993208563, + "loss": 0.1573, + "step": 269200 + }, + { + "epoch": 11.15, + "grad_norm": 1.2734375, + "learning_rate": 0.00034813144525407515, + "loss": 0.2016, + "step": 269210 + }, + { + "epoch": 11.15, + "grad_norm": 0.50390625, + "learning_rate": 0.00034812147039139186, + "loss": 0.1852, + "step": 269220 + }, + { + "epoch": 11.15, + "grad_norm": 1.234375, + "learning_rate": 0.00034811149534405463, + "loss": 0.1568, + "step": 269230 + }, + { + "epoch": 11.15, + "grad_norm": 0.9609375, + "learning_rate": 0.00034810152011208206, + "loss": 0.2078, + "step": 269240 + }, + { + "epoch": 11.15, + "grad_norm": 0.6171875, + "learning_rate": 0.00034809154469549316, + "loss": 0.1834, + "step": 269250 + }, + { + "epoch": 11.15, + "grad_norm": 0.62890625, + "learning_rate": 0.0003480815690943065, + "loss": 0.2332, + "step": 269260 + }, + { + "epoch": 11.15, + "grad_norm": 1.1953125, + "learning_rate": 0.0003480715933085409, + "loss": 0.1842, + "step": 269270 + }, + { + "epoch": 11.15, + "grad_norm": 0.94140625, + "learning_rate": 0.0003480616173382151, + "loss": 0.2328, + "step": 269280 + }, + { + "epoch": 11.15, + "grad_norm": 1.203125, + "learning_rate": 0.0003480516411833481, + "loss": 0.2287, + "step": 269290 + }, + { + "epoch": 11.15, + "grad_norm": 0.6640625, + "learning_rate": 0.0003480416648439583, + "loss": 0.236, + "step": 269300 + }, + { + "epoch": 11.15, + "grad_norm": 0.7265625, + "learning_rate": 0.00034803168832006485, + "loss": 0.1855, + "step": 269310 + }, + { + "epoch": 11.16, + "grad_norm": 1.2578125, + "learning_rate": 0.00034802171161168614, + "loss": 0.1918, + "step": 269320 + }, + { + "epoch": 11.16, + "grad_norm": 0.265625, + "learning_rate": 0.0003480117347188414, + "loss": 0.1788, + "step": 269330 + }, + { + "epoch": 11.16, + "grad_norm": 0.50390625, + "learning_rate": 0.000348001757641549, + "loss": 0.1772, + "step": 269340 + }, + { + "epoch": 11.16, + "grad_norm": 0.9453125, + "learning_rate": 0.00034799178037982795, + "loss": 0.2064, + "step": 269350 + }, + { + "epoch": 11.16, + "grad_norm": 2.640625, + "learning_rate": 0.0003479818029336969, + "loss": 0.1989, + "step": 269360 + }, + { + "epoch": 11.16, + "grad_norm": 1.0625, + "learning_rate": 0.00034797182530317463, + "loss": 0.2036, + "step": 269370 + }, + { + "epoch": 11.16, + "grad_norm": 0.81640625, + "learning_rate": 0.00034796184748828006, + "loss": 0.1865, + "step": 269380 + }, + { + "epoch": 11.16, + "grad_norm": 1.3203125, + "learning_rate": 0.00034795186948903187, + "loss": 0.2056, + "step": 269390 + }, + { + "epoch": 11.16, + "grad_norm": 2.734375, + "learning_rate": 0.00034794189130544877, + "loss": 0.2303, + "step": 269400 + }, + { + "epoch": 11.16, + "grad_norm": 0.5546875, + "learning_rate": 0.00034793191293754967, + "loss": 0.1692, + "step": 269410 + }, + { + "epoch": 11.16, + "grad_norm": 0.64453125, + "learning_rate": 0.0003479219343853532, + "loss": 0.1725, + "step": 269420 + }, + { + "epoch": 11.16, + "grad_norm": 2.984375, + "learning_rate": 0.00034791195564887835, + "loss": 0.1697, + "step": 269430 + }, + { + "epoch": 11.16, + "grad_norm": 0.78125, + "learning_rate": 0.00034790197672814375, + "loss": 0.1872, + "step": 269440 + }, + { + "epoch": 11.16, + "grad_norm": 0.43359375, + "learning_rate": 0.00034789199762316807, + "loss": 0.1783, + "step": 269450 + }, + { + "epoch": 11.16, + "grad_norm": 0.384765625, + "learning_rate": 0.0003478820183339704, + "loss": 0.1659, + "step": 269460 + }, + { + "epoch": 11.16, + "grad_norm": 1.171875, + "learning_rate": 0.0003478720388605692, + "loss": 0.1486, + "step": 269470 + }, + { + "epoch": 11.16, + "grad_norm": 0.6171875, + "learning_rate": 0.00034786205920298344, + "loss": 0.1624, + "step": 269480 + }, + { + "epoch": 11.16, + "grad_norm": 0.80078125, + "learning_rate": 0.0003478520793612319, + "loss": 0.1576, + "step": 269490 + }, + { + "epoch": 11.16, + "grad_norm": 0.68359375, + "learning_rate": 0.00034784209933533326, + "loss": 0.1572, + "step": 269500 + }, + { + "epoch": 11.16, + "grad_norm": 0.478515625, + "learning_rate": 0.00034783211912530637, + "loss": 0.2117, + "step": 269510 + }, + { + "epoch": 11.16, + "grad_norm": 0.875, + "learning_rate": 0.00034782213873116996, + "loss": 0.2269, + "step": 269520 + }, + { + "epoch": 11.16, + "grad_norm": 1.0703125, + "learning_rate": 0.00034781215815294286, + "loss": 0.2363, + "step": 269530 + }, + { + "epoch": 11.16, + "grad_norm": 0.4765625, + "learning_rate": 0.000347802177390644, + "loss": 0.172, + "step": 269540 + }, + { + "epoch": 11.16, + "grad_norm": 0.671875, + "learning_rate": 0.00034779219644429186, + "loss": 0.2169, + "step": 269550 + }, + { + "epoch": 11.17, + "grad_norm": 0.8046875, + "learning_rate": 0.0003477822153139053, + "loss": 0.1742, + "step": 269560 + }, + { + "epoch": 11.17, + "grad_norm": 0.80078125, + "learning_rate": 0.00034777223399950325, + "loss": 0.2185, + "step": 269570 + }, + { + "epoch": 11.17, + "grad_norm": 0.240234375, + "learning_rate": 0.00034776225250110444, + "loss": 0.2233, + "step": 269580 + }, + { + "epoch": 11.17, + "grad_norm": 1.203125, + "learning_rate": 0.0003477522708187276, + "loss": 0.2167, + "step": 269590 + }, + { + "epoch": 11.17, + "grad_norm": 0.87890625, + "learning_rate": 0.00034774228895239153, + "loss": 0.1879, + "step": 269600 + }, + { + "epoch": 11.17, + "grad_norm": 0.80859375, + "learning_rate": 0.000347732306902115, + "loss": 0.205, + "step": 269610 + }, + { + "epoch": 11.17, + "grad_norm": 1.921875, + "learning_rate": 0.0003477223246679169, + "loss": 0.1965, + "step": 269620 + }, + { + "epoch": 11.17, + "grad_norm": 0.7890625, + "learning_rate": 0.0003477123422498159, + "loss": 0.1912, + "step": 269630 + }, + { + "epoch": 11.17, + "grad_norm": 1.1796875, + "learning_rate": 0.0003477023596478308, + "loss": 0.2187, + "step": 269640 + }, + { + "epoch": 11.17, + "grad_norm": 1.2734375, + "learning_rate": 0.0003476923768619804, + "loss": 0.1893, + "step": 269650 + }, + { + "epoch": 11.17, + "grad_norm": 1.0546875, + "learning_rate": 0.0003476823938922836, + "loss": 0.2106, + "step": 269660 + }, + { + "epoch": 11.17, + "grad_norm": 0.5546875, + "learning_rate": 0.000347672410738759, + "loss": 0.1826, + "step": 269670 + }, + { + "epoch": 11.17, + "grad_norm": 1.2890625, + "learning_rate": 0.0003476624274014255, + "loss": 0.2059, + "step": 269680 + }, + { + "epoch": 11.17, + "grad_norm": 0.59375, + "learning_rate": 0.0003476524438803018, + "loss": 0.2186, + "step": 269690 + }, + { + "epoch": 11.17, + "grad_norm": 0.94921875, + "learning_rate": 0.0003476424601754068, + "loss": 0.1967, + "step": 269700 + }, + { + "epoch": 11.17, + "grad_norm": 0.6484375, + "learning_rate": 0.0003476324762867593, + "loss": 0.1949, + "step": 269710 + }, + { + "epoch": 11.17, + "grad_norm": 0.6875, + "learning_rate": 0.00034762249221437794, + "loss": 0.1609, + "step": 269720 + }, + { + "epoch": 11.17, + "grad_norm": 0.66796875, + "learning_rate": 0.0003476125079582815, + "loss": 0.2179, + "step": 269730 + }, + { + "epoch": 11.17, + "grad_norm": 0.2333984375, + "learning_rate": 0.00034760252351848905, + "loss": 0.1863, + "step": 269740 + }, + { + "epoch": 11.17, + "grad_norm": 0.455078125, + "learning_rate": 0.0003475925388950191, + "loss": 0.2329, + "step": 269750 + }, + { + "epoch": 11.17, + "grad_norm": 1.6484375, + "learning_rate": 0.0003475825540878905, + "loss": 0.1941, + "step": 269760 + }, + { + "epoch": 11.17, + "grad_norm": 1.1171875, + "learning_rate": 0.0003475725690971221, + "loss": 0.2487, + "step": 269770 + }, + { + "epoch": 11.17, + "grad_norm": 0.390625, + "learning_rate": 0.00034756258392273264, + "loss": 0.1939, + "step": 269780 + }, + { + "epoch": 11.17, + "grad_norm": 1.3359375, + "learning_rate": 0.00034755259856474095, + "loss": 0.2115, + "step": 269790 + }, + { + "epoch": 11.18, + "grad_norm": 2.828125, + "learning_rate": 0.00034754261302316587, + "loss": 0.188, + "step": 269800 + }, + { + "epoch": 11.18, + "grad_norm": 1.15625, + "learning_rate": 0.00034753262729802603, + "loss": 0.209, + "step": 269810 + }, + { + "epoch": 11.18, + "grad_norm": 0.380859375, + "learning_rate": 0.0003475226413893404, + "loss": 0.1575, + "step": 269820 + }, + { + "epoch": 11.18, + "grad_norm": 0.265625, + "learning_rate": 0.0003475126552971276, + "loss": 0.1757, + "step": 269830 + }, + { + "epoch": 11.18, + "grad_norm": 1.2265625, + "learning_rate": 0.0003475026690214066, + "loss": 0.1617, + "step": 269840 + }, + { + "epoch": 11.18, + "grad_norm": 0.68359375, + "learning_rate": 0.000347492682562196, + "loss": 0.1732, + "step": 269850 + }, + { + "epoch": 11.18, + "grad_norm": 0.6796875, + "learning_rate": 0.00034748269591951475, + "loss": 0.2449, + "step": 269860 + }, + { + "epoch": 11.18, + "grad_norm": 3.109375, + "learning_rate": 0.0003474727090933817, + "loss": 0.1742, + "step": 269870 + }, + { + "epoch": 11.18, + "grad_norm": 0.66796875, + "learning_rate": 0.0003474627220838154, + "loss": 0.1384, + "step": 269880 + }, + { + "epoch": 11.18, + "grad_norm": 0.859375, + "learning_rate": 0.00034745273489083487, + "loss": 0.2248, + "step": 269890 + }, + { + "epoch": 11.18, + "grad_norm": 1.1015625, + "learning_rate": 0.0003474427475144588, + "loss": 0.1624, + "step": 269900 + }, + { + "epoch": 11.18, + "grad_norm": 0.8828125, + "learning_rate": 0.00034743275995470594, + "loss": 0.2122, + "step": 269910 + }, + { + "epoch": 11.18, + "grad_norm": 0.69921875, + "learning_rate": 0.0003474227722115952, + "loss": 0.2238, + "step": 269920 + }, + { + "epoch": 11.18, + "grad_norm": 0.53125, + "learning_rate": 0.0003474127842851453, + "loss": 0.2384, + "step": 269930 + }, + { + "epoch": 11.18, + "grad_norm": 0.9609375, + "learning_rate": 0.00034740279617537505, + "loss": 0.1647, + "step": 269940 + }, + { + "epoch": 11.18, + "grad_norm": 1.34375, + "learning_rate": 0.00034739280788230334, + "loss": 0.253, + "step": 269950 + }, + { + "epoch": 11.18, + "grad_norm": 0.66015625, + "learning_rate": 0.0003473828194059487, + "loss": 0.219, + "step": 269960 + }, + { + "epoch": 11.18, + "grad_norm": 0.80078125, + "learning_rate": 0.0003473728307463303, + "loss": 0.202, + "step": 269970 + }, + { + "epoch": 11.18, + "grad_norm": 0.55859375, + "learning_rate": 0.0003473628419034667, + "loss": 0.2324, + "step": 269980 + }, + { + "epoch": 11.18, + "grad_norm": 0.9453125, + "learning_rate": 0.0003473528528773767, + "loss": 0.1978, + "step": 269990 + }, + { + "epoch": 11.18, + "grad_norm": 1.875, + "learning_rate": 0.00034734286366807916, + "loss": 0.1858, + "step": 270000 + }, + { + "epoch": 11.18, + "grad_norm": 0.65234375, + "learning_rate": 0.00034733287427559284, + "loss": 0.2121, + "step": 270010 + }, + { + "epoch": 11.18, + "grad_norm": 0.6015625, + "learning_rate": 0.00034732288469993663, + "loss": 0.2536, + "step": 270020 + }, + { + "epoch": 11.18, + "grad_norm": 1.0234375, + "learning_rate": 0.00034731289494112924, + "loss": 0.1761, + "step": 270030 + }, + { + "epoch": 11.19, + "grad_norm": 2.46875, + "learning_rate": 0.0003473029049991894, + "loss": 0.2766, + "step": 270040 + }, + { + "epoch": 11.19, + "grad_norm": 0.470703125, + "learning_rate": 0.0003472929148741361, + "loss": 0.179, + "step": 270050 + }, + { + "epoch": 11.19, + "grad_norm": 0.3203125, + "learning_rate": 0.0003472829245659881, + "loss": 0.1707, + "step": 270060 + }, + { + "epoch": 11.19, + "grad_norm": 0.72265625, + "learning_rate": 0.00034727293407476397, + "loss": 0.2313, + "step": 270070 + }, + { + "epoch": 11.19, + "grad_norm": 0.365234375, + "learning_rate": 0.0003472629434004828, + "loss": 0.2287, + "step": 270080 + }, + { + "epoch": 11.19, + "grad_norm": 1.0078125, + "learning_rate": 0.00034725295254316326, + "loss": 0.1696, + "step": 270090 + }, + { + "epoch": 11.19, + "grad_norm": 0.5546875, + "learning_rate": 0.0003472429615028241, + "loss": 0.1822, + "step": 270100 + }, + { + "epoch": 11.19, + "grad_norm": 0.349609375, + "learning_rate": 0.0003472329702794843, + "loss": 0.196, + "step": 270110 + }, + { + "epoch": 11.19, + "grad_norm": 1.7578125, + "learning_rate": 0.0003472229788731624, + "loss": 0.2102, + "step": 270120 + }, + { + "epoch": 11.19, + "grad_norm": 0.80859375, + "learning_rate": 0.00034721298728387744, + "loss": 0.2393, + "step": 270130 + }, + { + "epoch": 11.19, + "grad_norm": 0.9453125, + "learning_rate": 0.0003472029955116482, + "loss": 0.1605, + "step": 270140 + }, + { + "epoch": 11.19, + "grad_norm": 0.6328125, + "learning_rate": 0.0003471930035564933, + "loss": 0.2204, + "step": 270150 + }, + { + "epoch": 11.19, + "grad_norm": 1.828125, + "learning_rate": 0.0003471830114184318, + "loss": 0.2381, + "step": 270160 + }, + { + "epoch": 11.19, + "grad_norm": 1.0546875, + "learning_rate": 0.0003471730190974822, + "loss": 0.1397, + "step": 270170 + }, + { + "epoch": 11.19, + "grad_norm": 0.40234375, + "learning_rate": 0.0003471630265936636, + "loss": 0.217, + "step": 270180 + }, + { + "epoch": 11.19, + "grad_norm": 0.46875, + "learning_rate": 0.00034715303390699467, + "loss": 0.1685, + "step": 270190 + }, + { + "epoch": 11.19, + "grad_norm": 0.6796875, + "learning_rate": 0.0003471430410374941, + "loss": 0.1957, + "step": 270200 + }, + { + "epoch": 11.19, + "grad_norm": 1.796875, + "learning_rate": 0.000347133047985181, + "loss": 0.2044, + "step": 270210 + }, + { + "epoch": 11.19, + "grad_norm": 1.03125, + "learning_rate": 0.0003471230547500739, + "loss": 0.2036, + "step": 270220 + }, + { + "epoch": 11.19, + "grad_norm": 0.65234375, + "learning_rate": 0.0003471130613321916, + "loss": 0.1842, + "step": 270230 + }, + { + "epoch": 11.19, + "grad_norm": 0.55859375, + "learning_rate": 0.00034710306773155314, + "loss": 0.1693, + "step": 270240 + }, + { + "epoch": 11.19, + "grad_norm": 0.66015625, + "learning_rate": 0.00034709307394817715, + "loss": 0.1681, + "step": 270250 + }, + { + "epoch": 11.19, + "grad_norm": 1.8515625, + "learning_rate": 0.00034708307998208246, + "loss": 0.2486, + "step": 270260 + }, + { + "epoch": 11.19, + "grad_norm": 0.189453125, + "learning_rate": 0.00034707308583328795, + "loss": 0.1726, + "step": 270270 + }, + { + "epoch": 11.19, + "grad_norm": 0.5859375, + "learning_rate": 0.0003470630915018123, + "loss": 0.1542, + "step": 270280 + }, + { + "epoch": 11.2, + "grad_norm": 1.28125, + "learning_rate": 0.0003470530969876745, + "loss": 0.1964, + "step": 270290 + }, + { + "epoch": 11.2, + "grad_norm": 0.357421875, + "learning_rate": 0.0003470431022908932, + "loss": 0.1573, + "step": 270300 + }, + { + "epoch": 11.2, + "grad_norm": 0.73046875, + "learning_rate": 0.0003470331074114872, + "loss": 0.198, + "step": 270310 + }, + { + "epoch": 11.2, + "grad_norm": 0.54296875, + "learning_rate": 0.0003470231123494755, + "loss": 0.1673, + "step": 270320 + }, + { + "epoch": 11.2, + "grad_norm": 0.33203125, + "learning_rate": 0.00034701311710487665, + "loss": 0.2061, + "step": 270330 + }, + { + "epoch": 11.2, + "grad_norm": 1.234375, + "learning_rate": 0.00034700312167770965, + "loss": 0.1758, + "step": 270340 + }, + { + "epoch": 11.2, + "grad_norm": 0.2158203125, + "learning_rate": 0.00034699312606799326, + "loss": 0.2004, + "step": 270350 + }, + { + "epoch": 11.2, + "grad_norm": 1.34375, + "learning_rate": 0.0003469831302757462, + "loss": 0.1912, + "step": 270360 + }, + { + "epoch": 11.2, + "grad_norm": 0.71484375, + "learning_rate": 0.00034697313430098747, + "loss": 0.1976, + "step": 270370 + }, + { + "epoch": 11.2, + "grad_norm": 0.267578125, + "learning_rate": 0.0003469631381437358, + "loss": 0.1965, + "step": 270380 + }, + { + "epoch": 11.2, + "grad_norm": 0.7734375, + "learning_rate": 0.00034695314180400983, + "loss": 0.1004, + "step": 270390 + }, + { + "epoch": 11.2, + "grad_norm": 1.125, + "learning_rate": 0.0003469431452818286, + "loss": 0.2057, + "step": 270400 + }, + { + "epoch": 11.2, + "grad_norm": 0.59375, + "learning_rate": 0.00034693314857721087, + "loss": 0.1698, + "step": 270410 + }, + { + "epoch": 11.2, + "grad_norm": 0.82421875, + "learning_rate": 0.00034692315169017535, + "loss": 0.1931, + "step": 270420 + }, + { + "epoch": 11.2, + "grad_norm": 0.486328125, + "learning_rate": 0.00034691315462074103, + "loss": 0.2146, + "step": 270430 + }, + { + "epoch": 11.2, + "grad_norm": 0.0, + "learning_rate": 0.00034690315736892645, + "loss": 0.2275, + "step": 270440 + }, + { + "epoch": 11.2, + "grad_norm": 1.1640625, + "learning_rate": 0.00034689315993475075, + "loss": 0.2615, + "step": 270450 + }, + { + "epoch": 11.2, + "grad_norm": 0.427734375, + "learning_rate": 0.0003468831623182325, + "loss": 0.2235, + "step": 270460 + }, + { + "epoch": 11.2, + "grad_norm": 0.89453125, + "learning_rate": 0.0003468731645193906, + "loss": 0.1954, + "step": 270470 + }, + { + "epoch": 11.2, + "grad_norm": 1.4609375, + "learning_rate": 0.00034686316653824396, + "loss": 0.1902, + "step": 270480 + }, + { + "epoch": 11.2, + "grad_norm": 1.0625, + "learning_rate": 0.0003468531683748112, + "loss": 0.1806, + "step": 270490 + }, + { + "epoch": 11.2, + "grad_norm": 0.40234375, + "learning_rate": 0.00034684317002911123, + "loss": 0.1907, + "step": 270500 + }, + { + "epoch": 11.2, + "grad_norm": 1.2421875, + "learning_rate": 0.000346833171501163, + "loss": 0.2521, + "step": 270510 + }, + { + "epoch": 11.2, + "grad_norm": 0.345703125, + "learning_rate": 0.0003468231727909851, + "loss": 0.1882, + "step": 270520 + }, + { + "epoch": 11.21, + "grad_norm": 2.34375, + "learning_rate": 0.00034681317389859644, + "loss": 0.2013, + "step": 270530 + }, + { + "epoch": 11.21, + "grad_norm": 0.5625, + "learning_rate": 0.00034680317482401585, + "loss": 0.1724, + "step": 270540 + }, + { + "epoch": 11.21, + "grad_norm": 0.369140625, + "learning_rate": 0.00034679317556726214, + "loss": 0.2107, + "step": 270550 + }, + { + "epoch": 11.21, + "grad_norm": 0.51953125, + "learning_rate": 0.0003467831761283542, + "loss": 0.1955, + "step": 270560 + }, + { + "epoch": 11.21, + "grad_norm": 0.4453125, + "learning_rate": 0.00034677317650731065, + "loss": 0.1662, + "step": 270570 + }, + { + "epoch": 11.21, + "grad_norm": 0.69140625, + "learning_rate": 0.0003467631767041505, + "loss": 0.2168, + "step": 270580 + }, + { + "epoch": 11.21, + "grad_norm": 1.515625, + "learning_rate": 0.0003467531767188925, + "loss": 0.2227, + "step": 270590 + }, + { + "epoch": 11.21, + "grad_norm": 0.69921875, + "learning_rate": 0.0003467431765515554, + "loss": 0.2178, + "step": 270600 + }, + { + "epoch": 11.21, + "grad_norm": 0.7421875, + "learning_rate": 0.0003467331762021582, + "loss": 0.1939, + "step": 270610 + }, + { + "epoch": 11.21, + "grad_norm": 0.6640625, + "learning_rate": 0.00034672317567071957, + "loss": 0.2015, + "step": 270620 + }, + { + "epoch": 11.21, + "grad_norm": 0.5234375, + "learning_rate": 0.00034671317495725833, + "loss": 0.1828, + "step": 270630 + }, + { + "epoch": 11.21, + "grad_norm": 0.79296875, + "learning_rate": 0.0003467031740617934, + "loss": 0.2104, + "step": 270640 + }, + { + "epoch": 11.21, + "grad_norm": 0.201171875, + "learning_rate": 0.00034669317298434356, + "loss": 0.202, + "step": 270650 + }, + { + "epoch": 11.21, + "grad_norm": 0.45703125, + "learning_rate": 0.00034668317172492755, + "loss": 0.2001, + "step": 270660 + }, + { + "epoch": 11.21, + "grad_norm": 0.80078125, + "learning_rate": 0.0003466731702835642, + "loss": 0.1833, + "step": 270670 + }, + { + "epoch": 11.21, + "grad_norm": 0.828125, + "learning_rate": 0.0003466631686602724, + "loss": 0.1964, + "step": 270680 + }, + { + "epoch": 11.21, + "grad_norm": 1.578125, + "learning_rate": 0.0003466531668550711, + "loss": 0.1411, + "step": 270690 + }, + { + "epoch": 11.21, + "grad_norm": 0.181640625, + "learning_rate": 0.00034664316486797886, + "loss": 0.184, + "step": 270700 + }, + { + "epoch": 11.21, + "grad_norm": 0.9765625, + "learning_rate": 0.0003466331626990146, + "loss": 0.1552, + "step": 270710 + }, + { + "epoch": 11.21, + "grad_norm": 0.734375, + "learning_rate": 0.00034662316034819724, + "loss": 0.1845, + "step": 270720 + }, + { + "epoch": 11.21, + "grad_norm": 0.78125, + "learning_rate": 0.00034661315781554547, + "loss": 0.2325, + "step": 270730 + }, + { + "epoch": 11.21, + "grad_norm": 0.5546875, + "learning_rate": 0.0003466031551010782, + "loss": 0.1956, + "step": 270740 + }, + { + "epoch": 11.21, + "grad_norm": 0.287109375, + "learning_rate": 0.0003465931522048142, + "loss": 0.1847, + "step": 270750 + }, + { + "epoch": 11.21, + "grad_norm": 0.9453125, + "learning_rate": 0.0003465831491267723, + "loss": 0.2544, + "step": 270760 + }, + { + "epoch": 11.22, + "grad_norm": 0.96484375, + "learning_rate": 0.0003465731458669714, + "loss": 0.1748, + "step": 270770 + }, + { + "epoch": 11.22, + "grad_norm": 0.546875, + "learning_rate": 0.00034656314242543026, + "loss": 0.1652, + "step": 270780 + }, + { + "epoch": 11.22, + "grad_norm": 0.66015625, + "learning_rate": 0.00034655313880216766, + "loss": 0.1655, + "step": 270790 + }, + { + "epoch": 11.22, + "grad_norm": 0.828125, + "learning_rate": 0.0003465431349972025, + "loss": 0.2255, + "step": 270800 + }, + { + "epoch": 11.22, + "grad_norm": 0.52734375, + "learning_rate": 0.00034653313101055364, + "loss": 0.2258, + "step": 270810 + }, + { + "epoch": 11.22, + "grad_norm": 0.578125, + "learning_rate": 0.0003465231268422398, + "loss": 0.157, + "step": 270820 + }, + { + "epoch": 11.22, + "grad_norm": 1.2265625, + "learning_rate": 0.00034651312249227984, + "loss": 0.2155, + "step": 270830 + }, + { + "epoch": 11.22, + "grad_norm": 0.51171875, + "learning_rate": 0.0003465031179606926, + "loss": 0.1703, + "step": 270840 + }, + { + "epoch": 11.22, + "grad_norm": 1.0234375, + "learning_rate": 0.000346493113247497, + "loss": 0.1898, + "step": 270850 + }, + { + "epoch": 11.22, + "grad_norm": 0.60546875, + "learning_rate": 0.00034648310835271175, + "loss": 0.1755, + "step": 270860 + }, + { + "epoch": 11.22, + "grad_norm": 0.625, + "learning_rate": 0.00034647310327635565, + "loss": 0.1725, + "step": 270870 + }, + { + "epoch": 11.22, + "grad_norm": 0.859375, + "learning_rate": 0.00034646309801844767, + "loss": 0.2268, + "step": 270880 + }, + { + "epoch": 11.22, + "grad_norm": 1.4140625, + "learning_rate": 0.0003464530925790065, + "loss": 0.2237, + "step": 270890 + }, + { + "epoch": 11.22, + "grad_norm": 2.09375, + "learning_rate": 0.00034644308695805105, + "loss": 0.2052, + "step": 270900 + }, + { + "epoch": 11.22, + "grad_norm": 0.625, + "learning_rate": 0.00034643308115560014, + "loss": 0.189, + "step": 270910 + }, + { + "epoch": 11.22, + "grad_norm": 1.0703125, + "learning_rate": 0.0003464230751716725, + "loss": 0.2091, + "step": 270920 + }, + { + "epoch": 11.22, + "grad_norm": 0.59375, + "learning_rate": 0.0003464130690062872, + "loss": 0.1444, + "step": 270930 + }, + { + "epoch": 11.22, + "grad_norm": 0.89453125, + "learning_rate": 0.00034640306265946275, + "loss": 0.1659, + "step": 270940 + }, + { + "epoch": 11.22, + "grad_norm": 0.87890625, + "learning_rate": 0.00034639305613121823, + "loss": 0.1892, + "step": 270950 + }, + { + "epoch": 11.22, + "grad_norm": 1.09375, + "learning_rate": 0.0003463830494215724, + "loss": 0.1665, + "step": 270960 + }, + { + "epoch": 11.22, + "grad_norm": 0.51953125, + "learning_rate": 0.000346373042530544, + "loss": 0.1909, + "step": 270970 + }, + { + "epoch": 11.22, + "grad_norm": 1.6171875, + "learning_rate": 0.00034636303545815207, + "loss": 0.1426, + "step": 270980 + }, + { + "epoch": 11.22, + "grad_norm": 1.0390625, + "learning_rate": 0.00034635302820441524, + "loss": 0.2498, + "step": 270990 + }, + { + "epoch": 11.22, + "grad_norm": 0.95703125, + "learning_rate": 0.00034634302076935237, + "loss": 0.2247, + "step": 271000 + }, + { + "epoch": 11.23, + "grad_norm": 0.94140625, + "learning_rate": 0.0003463330131529824, + "loss": 0.2097, + "step": 271010 + }, + { + "epoch": 11.23, + "grad_norm": 0.52734375, + "learning_rate": 0.00034632300535532415, + "loss": 0.1645, + "step": 271020 + }, + { + "epoch": 11.23, + "grad_norm": 0.83203125, + "learning_rate": 0.00034631299737639625, + "loss": 0.2185, + "step": 271030 + }, + { + "epoch": 11.23, + "grad_norm": 0.44140625, + "learning_rate": 0.00034630298921621783, + "loss": 0.1807, + "step": 271040 + }, + { + "epoch": 11.23, + "grad_norm": 0.69921875, + "learning_rate": 0.00034629298087480754, + "loss": 0.1922, + "step": 271050 + }, + { + "epoch": 11.23, + "grad_norm": 0.341796875, + "learning_rate": 0.00034628297235218423, + "loss": 0.1986, + "step": 271060 + }, + { + "epoch": 11.23, + "grad_norm": 0.435546875, + "learning_rate": 0.0003462729636483668, + "loss": 0.1683, + "step": 271070 + }, + { + "epoch": 11.23, + "grad_norm": 0.6953125, + "learning_rate": 0.00034626295476337406, + "loss": 0.1814, + "step": 271080 + }, + { + "epoch": 11.23, + "grad_norm": 0.91015625, + "learning_rate": 0.00034625294569722476, + "loss": 0.1791, + "step": 271090 + }, + { + "epoch": 11.23, + "grad_norm": 0.62109375, + "learning_rate": 0.0003462429364499379, + "loss": 0.1837, + "step": 271100 + }, + { + "epoch": 11.23, + "grad_norm": 0.8984375, + "learning_rate": 0.0003462329270215322, + "loss": 0.1819, + "step": 271110 + }, + { + "epoch": 11.23, + "grad_norm": 1.296875, + "learning_rate": 0.0003462229174120265, + "loss": 0.1596, + "step": 271120 + }, + { + "epoch": 11.23, + "grad_norm": 0.62890625, + "learning_rate": 0.00034621290762143974, + "loss": 0.2113, + "step": 271130 + }, + { + "epoch": 11.23, + "grad_norm": 1.03125, + "learning_rate": 0.0003462028976497906, + "loss": 0.1585, + "step": 271140 + }, + { + "epoch": 11.23, + "grad_norm": 0.50390625, + "learning_rate": 0.000346192887497098, + "loss": 0.2678, + "step": 271150 + }, + { + "epoch": 11.23, + "grad_norm": 0.875, + "learning_rate": 0.0003461828771633808, + "loss": 0.2143, + "step": 271160 + }, + { + "epoch": 11.23, + "grad_norm": 0.5859375, + "learning_rate": 0.0003461728666486578, + "loss": 0.1999, + "step": 271170 + }, + { + "epoch": 11.23, + "grad_norm": 0.7890625, + "learning_rate": 0.00034616285595294785, + "loss": 0.2294, + "step": 271180 + }, + { + "epoch": 11.23, + "grad_norm": 2.09375, + "learning_rate": 0.0003461528450762698, + "loss": 0.1531, + "step": 271190 + }, + { + "epoch": 11.23, + "grad_norm": 0.55078125, + "learning_rate": 0.0003461428340186424, + "loss": 0.1784, + "step": 271200 + }, + { + "epoch": 11.23, + "grad_norm": 1.7265625, + "learning_rate": 0.0003461328227800847, + "loss": 0.1795, + "step": 271210 + }, + { + "epoch": 11.23, + "grad_norm": 0.349609375, + "learning_rate": 0.0003461228113606152, + "loss": 0.1426, + "step": 271220 + }, + { + "epoch": 11.23, + "grad_norm": 1.5859375, + "learning_rate": 0.0003461127997602532, + "loss": 0.203, + "step": 271230 + }, + { + "epoch": 11.23, + "grad_norm": 0.65234375, + "learning_rate": 0.0003461027879790172, + "loss": 0.193, + "step": 271240 + }, + { + "epoch": 11.24, + "grad_norm": 0.8125, + "learning_rate": 0.00034609277601692606, + "loss": 0.1683, + "step": 271250 + }, + { + "epoch": 11.24, + "grad_norm": 0.494140625, + "learning_rate": 0.0003460827638739988, + "loss": 0.1996, + "step": 271260 + }, + { + "epoch": 11.24, + "grad_norm": 0.95703125, + "learning_rate": 0.0003460727515502541, + "loss": 0.1943, + "step": 271270 + }, + { + "epoch": 11.24, + "grad_norm": 0.45703125, + "learning_rate": 0.0003460627390457108, + "loss": 0.2071, + "step": 271280 + }, + { + "epoch": 11.24, + "grad_norm": 1.8125, + "learning_rate": 0.00034605272636038787, + "loss": 0.1719, + "step": 271290 + }, + { + "epoch": 11.24, + "grad_norm": 1.8984375, + "learning_rate": 0.0003460427134943041, + "loss": 0.2153, + "step": 271300 + }, + { + "epoch": 11.24, + "grad_norm": 0.466796875, + "learning_rate": 0.00034603270044747825, + "loss": 0.154, + "step": 271310 + }, + { + "epoch": 11.24, + "grad_norm": 0.6328125, + "learning_rate": 0.0003460226872199293, + "loss": 0.232, + "step": 271320 + }, + { + "epoch": 11.24, + "grad_norm": 1.265625, + "learning_rate": 0.0003460126738116759, + "loss": 0.1993, + "step": 271330 + }, + { + "epoch": 11.24, + "grad_norm": 1.5546875, + "learning_rate": 0.0003460026602227372, + "loss": 0.1964, + "step": 271340 + }, + { + "epoch": 11.24, + "grad_norm": 1.3203125, + "learning_rate": 0.00034599264645313174, + "loss": 0.1824, + "step": 271350 + }, + { + "epoch": 11.24, + "grad_norm": 1.2109375, + "learning_rate": 0.00034598263250287846, + "loss": 0.2246, + "step": 271360 + }, + { + "epoch": 11.24, + "grad_norm": 0.765625, + "learning_rate": 0.0003459726183719963, + "loss": 0.2142, + "step": 271370 + }, + { + "epoch": 11.24, + "grad_norm": 0.79296875, + "learning_rate": 0.000345962604060504, + "loss": 0.2255, + "step": 271380 + }, + { + "epoch": 11.24, + "grad_norm": 1.1953125, + "learning_rate": 0.0003459525895684205, + "loss": 0.206, + "step": 271390 + }, + { + "epoch": 11.24, + "grad_norm": 1.6015625, + "learning_rate": 0.00034594257489576453, + "loss": 0.1547, + "step": 271400 + }, + { + "epoch": 11.24, + "grad_norm": 0.68359375, + "learning_rate": 0.000345932560042555, + "loss": 0.1644, + "step": 271410 + }, + { + "epoch": 11.24, + "grad_norm": 1.953125, + "learning_rate": 0.00034592254500881084, + "loss": 0.1939, + "step": 271420 + }, + { + "epoch": 11.24, + "grad_norm": 0.69140625, + "learning_rate": 0.0003459125297945506, + "loss": 0.1736, + "step": 271430 + }, + { + "epoch": 11.24, + "grad_norm": 1.375, + "learning_rate": 0.0003459025143997935, + "loss": 0.2106, + "step": 271440 + }, + { + "epoch": 11.24, + "grad_norm": 0.765625, + "learning_rate": 0.0003458924988245582, + "loss": 0.2013, + "step": 271450 + }, + { + "epoch": 11.24, + "grad_norm": 0.921875, + "learning_rate": 0.00034588248306886355, + "loss": 0.2122, + "step": 271460 + }, + { + "epoch": 11.24, + "grad_norm": 0.58984375, + "learning_rate": 0.00034587246713272844, + "loss": 0.1715, + "step": 271470 + }, + { + "epoch": 11.24, + "grad_norm": 0.73046875, + "learning_rate": 0.00034586245101617173, + "loss": 0.1431, + "step": 271480 + }, + { + "epoch": 11.25, + "grad_norm": 1.03125, + "learning_rate": 0.0003458524347192122, + "loss": 0.199, + "step": 271490 + }, + { + "epoch": 11.25, + "grad_norm": 0.9921875, + "learning_rate": 0.0003458424182418688, + "loss": 0.1847, + "step": 271500 + }, + { + "epoch": 11.25, + "grad_norm": 1.3359375, + "learning_rate": 0.0003458324015841602, + "loss": 0.2136, + "step": 271510 + }, + { + "epoch": 11.25, + "grad_norm": 0.62890625, + "learning_rate": 0.00034582238474610544, + "loss": 0.1826, + "step": 271520 + }, + { + "epoch": 11.25, + "grad_norm": 1.4921875, + "learning_rate": 0.0003458123677277233, + "loss": 0.1898, + "step": 271530 + }, + { + "epoch": 11.25, + "grad_norm": 0.6875, + "learning_rate": 0.00034580235052903265, + "loss": 0.1805, + "step": 271540 + }, + { + "epoch": 11.25, + "grad_norm": 0.75390625, + "learning_rate": 0.00034579233315005234, + "loss": 0.1808, + "step": 271550 + }, + { + "epoch": 11.25, + "grad_norm": 0.6796875, + "learning_rate": 0.0003457823155908011, + "loss": 0.191, + "step": 271560 + }, + { + "epoch": 11.25, + "grad_norm": 0.93359375, + "learning_rate": 0.00034577229785129793, + "loss": 0.2403, + "step": 271570 + }, + { + "epoch": 11.25, + "grad_norm": 0.75390625, + "learning_rate": 0.00034576227993156166, + "loss": 0.1993, + "step": 271580 + }, + { + "epoch": 11.25, + "grad_norm": 1.1171875, + "learning_rate": 0.0003457522618316111, + "loss": 0.1882, + "step": 271590 + }, + { + "epoch": 11.25, + "grad_norm": 0.90234375, + "learning_rate": 0.00034574224355146514, + "loss": 0.2142, + "step": 271600 + }, + { + "epoch": 11.25, + "grad_norm": 0.19140625, + "learning_rate": 0.0003457322250911426, + "loss": 0.1774, + "step": 271610 + }, + { + "epoch": 11.25, + "grad_norm": 0.80859375, + "learning_rate": 0.00034572220645066236, + "loss": 0.1846, + "step": 271620 + }, + { + "epoch": 11.25, + "grad_norm": 0.59765625, + "learning_rate": 0.0003457121876300433, + "loss": 0.1439, + "step": 271630 + }, + { + "epoch": 11.25, + "grad_norm": 0.408203125, + "learning_rate": 0.0003457021686293042, + "loss": 0.1654, + "step": 271640 + }, + { + "epoch": 11.25, + "grad_norm": 0.4296875, + "learning_rate": 0.00034569214944846395, + "loss": 0.1981, + "step": 271650 + }, + { + "epoch": 11.25, + "grad_norm": 0.875, + "learning_rate": 0.0003456821300875415, + "loss": 0.1933, + "step": 271660 + }, + { + "epoch": 11.25, + "grad_norm": 1.3984375, + "learning_rate": 0.00034567211054655546, + "loss": 0.2054, + "step": 271670 + }, + { + "epoch": 11.25, + "grad_norm": 1.0234375, + "learning_rate": 0.0003456620908255249, + "loss": 0.1758, + "step": 271680 + }, + { + "epoch": 11.25, + "grad_norm": 0.8203125, + "learning_rate": 0.00034565207092446867, + "loss": 0.2015, + "step": 271690 + }, + { + "epoch": 11.25, + "grad_norm": 0.87109375, + "learning_rate": 0.0003456420508434055, + "loss": 0.1862, + "step": 271700 + }, + { + "epoch": 11.25, + "grad_norm": 0.7578125, + "learning_rate": 0.00034563203058235437, + "loss": 0.2167, + "step": 271710 + }, + { + "epoch": 11.25, + "grad_norm": 0.2451171875, + "learning_rate": 0.0003456220101413341, + "loss": 0.1762, + "step": 271720 + }, + { + "epoch": 11.26, + "grad_norm": 0.54296875, + "learning_rate": 0.00034561198952036344, + "loss": 0.1497, + "step": 271730 + }, + { + "epoch": 11.26, + "grad_norm": 0.6328125, + "learning_rate": 0.0003456019687194615, + "loss": 0.1734, + "step": 271740 + }, + { + "epoch": 11.26, + "grad_norm": 0.3046875, + "learning_rate": 0.00034559194773864675, + "loss": 0.1673, + "step": 271750 + }, + { + "epoch": 11.26, + "grad_norm": 0.55078125, + "learning_rate": 0.0003455819265779385, + "loss": 0.1709, + "step": 271760 + }, + { + "epoch": 11.26, + "grad_norm": 0.98046875, + "learning_rate": 0.00034557190523735527, + "loss": 0.1988, + "step": 271770 + }, + { + "epoch": 11.26, + "grad_norm": 0.83203125, + "learning_rate": 0.00034556188371691603, + "loss": 0.1677, + "step": 271780 + }, + { + "epoch": 11.26, + "grad_norm": 0.85546875, + "learning_rate": 0.0003455518620166397, + "loss": 0.1882, + "step": 271790 + }, + { + "epoch": 11.26, + "grad_norm": 0.98828125, + "learning_rate": 0.000345541840136545, + "loss": 0.2124, + "step": 271800 + }, + { + "epoch": 11.26, + "grad_norm": 0.76953125, + "learning_rate": 0.00034553181807665096, + "loss": 0.1862, + "step": 271810 + }, + { + "epoch": 11.26, + "grad_norm": 0.72265625, + "learning_rate": 0.0003455217958369764, + "loss": 0.1886, + "step": 271820 + }, + { + "epoch": 11.26, + "grad_norm": 0.98828125, + "learning_rate": 0.00034551177341754, + "loss": 0.2297, + "step": 271830 + }, + { + "epoch": 11.26, + "grad_norm": 0.44921875, + "learning_rate": 0.00034550175081836084, + "loss": 0.1883, + "step": 271840 + }, + { + "epoch": 11.26, + "grad_norm": 0.75390625, + "learning_rate": 0.0003454917280394577, + "loss": 0.2039, + "step": 271850 + }, + { + "epoch": 11.26, + "grad_norm": 1.1015625, + "learning_rate": 0.00034548170508084936, + "loss": 0.2273, + "step": 271860 + }, + { + "epoch": 11.26, + "grad_norm": 0.2734375, + "learning_rate": 0.00034547168194255486, + "loss": 0.233, + "step": 271870 + }, + { + "epoch": 11.26, + "grad_norm": 1.28125, + "learning_rate": 0.0003454616586245929, + "loss": 0.1996, + "step": 271880 + }, + { + "epoch": 11.26, + "grad_norm": 0.0, + "learning_rate": 0.0003454516351269824, + "loss": 0.2115, + "step": 271890 + }, + { + "epoch": 11.26, + "grad_norm": 0.7265625, + "learning_rate": 0.00034544161144974233, + "loss": 0.1822, + "step": 271900 + }, + { + "epoch": 11.26, + "grad_norm": 0.75, + "learning_rate": 0.00034543158759289133, + "loss": 0.2409, + "step": 271910 + }, + { + "epoch": 11.26, + "grad_norm": 0.375, + "learning_rate": 0.00034542156355644847, + "loss": 0.1479, + "step": 271920 + }, + { + "epoch": 11.26, + "grad_norm": 1.0546875, + "learning_rate": 0.0003454115393404325, + "loss": 0.2077, + "step": 271930 + }, + { + "epoch": 11.26, + "grad_norm": 1.3828125, + "learning_rate": 0.0003454015149448623, + "loss": 0.2062, + "step": 271940 + }, + { + "epoch": 11.26, + "grad_norm": 0.68359375, + "learning_rate": 0.0003453914903697568, + "loss": 0.2003, + "step": 271950 + }, + { + "epoch": 11.26, + "grad_norm": 0.66015625, + "learning_rate": 0.00034538146561513474, + "loss": 0.2024, + "step": 271960 + }, + { + "epoch": 11.26, + "grad_norm": 0.75390625, + "learning_rate": 0.0003453714406810152, + "loss": 0.2491, + "step": 271970 + }, + { + "epoch": 11.27, + "grad_norm": 2.015625, + "learning_rate": 0.00034536141556741677, + "loss": 0.2161, + "step": 271980 + }, + { + "epoch": 11.27, + "grad_norm": 0.62890625, + "learning_rate": 0.0003453513902743585, + "loss": 0.1973, + "step": 271990 + }, + { + "epoch": 11.27, + "grad_norm": 1.296875, + "learning_rate": 0.0003453413648018592, + "loss": 0.2581, + "step": 272000 + }, + { + "epoch": 11.27, + "grad_norm": 0.484375, + "learning_rate": 0.00034533133914993783, + "loss": 0.2068, + "step": 272010 + }, + { + "epoch": 11.27, + "grad_norm": 0.69140625, + "learning_rate": 0.0003453213133186131, + "loss": 0.2118, + "step": 272020 + }, + { + "epoch": 11.27, + "grad_norm": 1.0625, + "learning_rate": 0.000345311287307904, + "loss": 0.1606, + "step": 272030 + }, + { + "epoch": 11.27, + "grad_norm": 1.2578125, + "learning_rate": 0.00034530126111782924, + "loss": 0.1765, + "step": 272040 + }, + { + "epoch": 11.27, + "grad_norm": 0.75390625, + "learning_rate": 0.00034529123474840784, + "loss": 0.2003, + "step": 272050 + }, + { + "epoch": 11.27, + "grad_norm": 0.447265625, + "learning_rate": 0.0003452812081996587, + "loss": 0.2179, + "step": 272060 + }, + { + "epoch": 11.27, + "grad_norm": 0.63671875, + "learning_rate": 0.0003452711814716005, + "loss": 0.1856, + "step": 272070 + }, + { + "epoch": 11.27, + "grad_norm": 0.68359375, + "learning_rate": 0.0003452611545642524, + "loss": 0.2, + "step": 272080 + }, + { + "epoch": 11.27, + "grad_norm": 0.5546875, + "learning_rate": 0.0003452511274776329, + "loss": 0.2173, + "step": 272090 + }, + { + "epoch": 11.27, + "grad_norm": 1.078125, + "learning_rate": 0.0003452411002117612, + "loss": 0.198, + "step": 272100 + }, + { + "epoch": 11.27, + "grad_norm": 0.466796875, + "learning_rate": 0.00034523107276665597, + "loss": 0.1861, + "step": 272110 + }, + { + "epoch": 11.27, + "grad_norm": 1.28125, + "learning_rate": 0.00034522104514233615, + "loss": 0.2071, + "step": 272120 + }, + { + "epoch": 11.27, + "grad_norm": 0.353515625, + "learning_rate": 0.00034521101733882064, + "loss": 0.1769, + "step": 272130 + }, + { + "epoch": 11.27, + "grad_norm": 0.78515625, + "learning_rate": 0.00034520098935612826, + "loss": 0.2076, + "step": 272140 + }, + { + "epoch": 11.27, + "grad_norm": 2.625, + "learning_rate": 0.0003451909611942779, + "loss": 0.1891, + "step": 272150 + }, + { + "epoch": 11.27, + "grad_norm": 0.46484375, + "learning_rate": 0.00034518093285328843, + "loss": 0.2051, + "step": 272160 + }, + { + "epoch": 11.27, + "grad_norm": 0.392578125, + "learning_rate": 0.0003451709043331787, + "loss": 0.1764, + "step": 272170 + }, + { + "epoch": 11.27, + "grad_norm": 0.60546875, + "learning_rate": 0.00034516087563396763, + "loss": 0.2088, + "step": 272180 + }, + { + "epoch": 11.27, + "grad_norm": 0.8046875, + "learning_rate": 0.00034515084675567406, + "loss": 0.1706, + "step": 272190 + }, + { + "epoch": 11.27, + "grad_norm": 1.875, + "learning_rate": 0.00034514081769831685, + "loss": 0.2442, + "step": 272200 + }, + { + "epoch": 11.27, + "grad_norm": 0.625, + "learning_rate": 0.000345130788461915, + "loss": 0.2065, + "step": 272210 + }, + { + "epoch": 11.28, + "grad_norm": 0.5234375, + "learning_rate": 0.00034512075904648715, + "loss": 0.1802, + "step": 272220 + }, + { + "epoch": 11.28, + "grad_norm": 1.1875, + "learning_rate": 0.00034511072945205234, + "loss": 0.2134, + "step": 272230 + }, + { + "epoch": 11.28, + "grad_norm": 0.6484375, + "learning_rate": 0.0003451006996786295, + "loss": 0.1492, + "step": 272240 + }, + { + "epoch": 11.28, + "grad_norm": 0.9921875, + "learning_rate": 0.00034509066972623724, + "loss": 0.2002, + "step": 272250 + }, + { + "epoch": 11.28, + "grad_norm": 0.50390625, + "learning_rate": 0.00034508063959489473, + "loss": 0.2049, + "step": 272260 + }, + { + "epoch": 11.28, + "grad_norm": 1.0546875, + "learning_rate": 0.0003450706092846207, + "loss": 0.2093, + "step": 272270 + }, + { + "epoch": 11.28, + "grad_norm": 0.5625, + "learning_rate": 0.000345060578795434, + "loss": 0.173, + "step": 272280 + }, + { + "epoch": 11.28, + "grad_norm": 0.921875, + "learning_rate": 0.0003450505481273537, + "loss": 0.1801, + "step": 272290 + }, + { + "epoch": 11.28, + "grad_norm": 0.6484375, + "learning_rate": 0.00034504051728039844, + "loss": 0.2305, + "step": 272300 + }, + { + "epoch": 11.28, + "grad_norm": 2.296875, + "learning_rate": 0.00034503048625458713, + "loss": 0.139, + "step": 272310 + }, + { + "epoch": 11.28, + "grad_norm": 0.61328125, + "learning_rate": 0.00034502045504993873, + "loss": 0.2555, + "step": 272320 + }, + { + "epoch": 11.28, + "grad_norm": 0.578125, + "learning_rate": 0.00034501042366647216, + "loss": 0.1965, + "step": 272330 + }, + { + "epoch": 11.28, + "grad_norm": 0.4140625, + "learning_rate": 0.0003450003921042062, + "loss": 0.1647, + "step": 272340 + }, + { + "epoch": 11.28, + "grad_norm": 2.078125, + "learning_rate": 0.00034499036036315977, + "loss": 0.2597, + "step": 272350 + }, + { + "epoch": 11.28, + "grad_norm": 0.84765625, + "learning_rate": 0.0003449803284433517, + "loss": 0.2039, + "step": 272360 + }, + { + "epoch": 11.28, + "grad_norm": 0.4375, + "learning_rate": 0.00034497029634480097, + "loss": 0.1654, + "step": 272370 + }, + { + "epoch": 11.28, + "grad_norm": 0.921875, + "learning_rate": 0.0003449602640675264, + "loss": 0.195, + "step": 272380 + }, + { + "epoch": 11.28, + "grad_norm": 2.15625, + "learning_rate": 0.0003449502316115468, + "loss": 0.1866, + "step": 272390 + }, + { + "epoch": 11.28, + "grad_norm": 1.1484375, + "learning_rate": 0.00034494019897688116, + "loss": 0.2276, + "step": 272400 + }, + { + "epoch": 11.28, + "grad_norm": 0.83203125, + "learning_rate": 0.0003449301661635483, + "loss": 0.19, + "step": 272410 + }, + { + "epoch": 11.28, + "grad_norm": 0.4296875, + "learning_rate": 0.00034492013317156717, + "loss": 0.1956, + "step": 272420 + }, + { + "epoch": 11.28, + "grad_norm": 0.8671875, + "learning_rate": 0.00034491010000095655, + "loss": 0.1578, + "step": 272430 + }, + { + "epoch": 11.28, + "grad_norm": 0.7421875, + "learning_rate": 0.0003449000666517353, + "loss": 0.1726, + "step": 272440 + }, + { + "epoch": 11.28, + "grad_norm": 0.8046875, + "learning_rate": 0.00034489003312392254, + "loss": 0.1758, + "step": 272450 + }, + { + "epoch": 11.29, + "grad_norm": 0.609375, + "learning_rate": 0.0003448799994175369, + "loss": 0.2073, + "step": 272460 + }, + { + "epoch": 11.29, + "grad_norm": 0.9609375, + "learning_rate": 0.0003448699655325974, + "loss": 0.178, + "step": 272470 + }, + { + "epoch": 11.29, + "grad_norm": 0.439453125, + "learning_rate": 0.00034485993146912274, + "loss": 0.2252, + "step": 272480 + }, + { + "epoch": 11.29, + "grad_norm": 0.890625, + "learning_rate": 0.00034484989722713205, + "loss": 0.1593, + "step": 272490 + }, + { + "epoch": 11.29, + "grad_norm": 0.73046875, + "learning_rate": 0.00034483986280664405, + "loss": 0.1634, + "step": 272500 + }, + { + "epoch": 11.29, + "grad_norm": 0.5703125, + "learning_rate": 0.0003448298282076777, + "loss": 0.2135, + "step": 272510 + }, + { + "epoch": 11.29, + "grad_norm": 0.671875, + "learning_rate": 0.00034481979343025184, + "loss": 0.1696, + "step": 272520 + }, + { + "epoch": 11.29, + "grad_norm": 1.03125, + "learning_rate": 0.00034480975847438536, + "loss": 0.1996, + "step": 272530 + }, + { + "epoch": 11.29, + "grad_norm": 0.486328125, + "learning_rate": 0.00034479972334009715, + "loss": 0.1726, + "step": 272540 + }, + { + "epoch": 11.29, + "grad_norm": 0.83203125, + "learning_rate": 0.00034478968802740607, + "loss": 0.2041, + "step": 272550 + }, + { + "epoch": 11.29, + "grad_norm": 0.83984375, + "learning_rate": 0.00034477965253633104, + "loss": 0.2159, + "step": 272560 + }, + { + "epoch": 11.29, + "grad_norm": 0.85546875, + "learning_rate": 0.000344769616866891, + "loss": 0.2573, + "step": 272570 + }, + { + "epoch": 11.29, + "grad_norm": 0.287109375, + "learning_rate": 0.0003447595810191048, + "loss": 0.2172, + "step": 272580 + }, + { + "epoch": 11.29, + "grad_norm": 2.40625, + "learning_rate": 0.00034474954499299115, + "loss": 0.211, + "step": 272590 + }, + { + "epoch": 11.29, + "grad_norm": 0.7578125, + "learning_rate": 0.0003447395087885692, + "loss": 0.1805, + "step": 272600 + }, + { + "epoch": 11.29, + "grad_norm": 0.48828125, + "learning_rate": 0.00034472947240585765, + "loss": 0.1714, + "step": 272610 + }, + { + "epoch": 11.29, + "grad_norm": 0.828125, + "learning_rate": 0.0003447194358448756, + "loss": 0.1938, + "step": 272620 + }, + { + "epoch": 11.29, + "grad_norm": 0.7265625, + "learning_rate": 0.0003447093991056417, + "loss": 0.217, + "step": 272630 + }, + { + "epoch": 11.29, + "grad_norm": 0.4375, + "learning_rate": 0.0003446993621881749, + "loss": 0.1774, + "step": 272640 + }, + { + "epoch": 11.29, + "grad_norm": 0.671875, + "learning_rate": 0.0003446893250924943, + "loss": 0.2117, + "step": 272650 + }, + { + "epoch": 11.29, + "grad_norm": 1.5625, + "learning_rate": 0.00034467928781861846, + "loss": 0.2577, + "step": 272660 + }, + { + "epoch": 11.29, + "grad_norm": 0.4375, + "learning_rate": 0.00034466925036656643, + "loss": 0.223, + "step": 272670 + }, + { + "epoch": 11.29, + "grad_norm": 0.59375, + "learning_rate": 0.00034465921273635716, + "loss": 0.183, + "step": 272680 + }, + { + "epoch": 11.29, + "grad_norm": 0.55078125, + "learning_rate": 0.0003446491749280094, + "loss": 0.1503, + "step": 272690 + }, + { + "epoch": 11.3, + "grad_norm": 0.78515625, + "learning_rate": 0.0003446391369415422, + "loss": 0.2099, + "step": 272700 + }, + { + "epoch": 11.3, + "grad_norm": 1.046875, + "learning_rate": 0.00034462909877697434, + "loss": 0.2299, + "step": 272710 + }, + { + "epoch": 11.3, + "grad_norm": 0.0, + "learning_rate": 0.00034461906043432465, + "loss": 0.255, + "step": 272720 + }, + { + "epoch": 11.3, + "grad_norm": 1.1640625, + "learning_rate": 0.0003446090219136122, + "loss": 0.2, + "step": 272730 + }, + { + "epoch": 11.3, + "grad_norm": 0.166015625, + "learning_rate": 0.0003445989832148557, + "loss": 0.1601, + "step": 272740 + }, + { + "epoch": 11.3, + "grad_norm": 2.53125, + "learning_rate": 0.00034458894433807427, + "loss": 0.2134, + "step": 272750 + }, + { + "epoch": 11.3, + "grad_norm": 1.1484375, + "learning_rate": 0.00034457890528328655, + "loss": 0.2396, + "step": 272760 + }, + { + "epoch": 11.3, + "grad_norm": 1.171875, + "learning_rate": 0.0003445688660505116, + "loss": 0.1945, + "step": 272770 + }, + { + "epoch": 11.3, + "grad_norm": 0.62890625, + "learning_rate": 0.0003445588266397682, + "loss": 0.276, + "step": 272780 + }, + { + "epoch": 11.3, + "grad_norm": 0.85546875, + "learning_rate": 0.0003445487870510754, + "loss": 0.1995, + "step": 272790 + }, + { + "epoch": 11.3, + "grad_norm": 0.62890625, + "learning_rate": 0.0003445387472844519, + "loss": 0.2198, + "step": 272800 + }, + { + "epoch": 11.3, + "grad_norm": 1.1953125, + "learning_rate": 0.0003445287073399167, + "loss": 0.1531, + "step": 272810 + }, + { + "epoch": 11.3, + "grad_norm": 1.15625, + "learning_rate": 0.00034451866721748865, + "loss": 0.1697, + "step": 272820 + }, + { + "epoch": 11.3, + "grad_norm": 1.1015625, + "learning_rate": 0.00034450862691718677, + "loss": 0.2284, + "step": 272830 + }, + { + "epoch": 11.3, + "grad_norm": 1.265625, + "learning_rate": 0.00034449858643902976, + "loss": 0.2005, + "step": 272840 + }, + { + "epoch": 11.3, + "grad_norm": 0.0, + "learning_rate": 0.00034448854578303666, + "loss": 0.2019, + "step": 272850 + }, + { + "epoch": 11.3, + "grad_norm": 0.6953125, + "learning_rate": 0.00034447850494922637, + "loss": 0.2326, + "step": 272860 + }, + { + "epoch": 11.3, + "grad_norm": 0.77734375, + "learning_rate": 0.00034446846393761766, + "loss": 0.2486, + "step": 272870 + }, + { + "epoch": 11.3, + "grad_norm": 0.5, + "learning_rate": 0.00034445842274822956, + "loss": 0.1904, + "step": 272880 + }, + { + "epoch": 11.3, + "grad_norm": 0.67578125, + "learning_rate": 0.0003444483813810809, + "loss": 0.1763, + "step": 272890 + }, + { + "epoch": 11.3, + "grad_norm": 1.3359375, + "learning_rate": 0.00034443833983619057, + "loss": 0.1389, + "step": 272900 + }, + { + "epoch": 11.3, + "grad_norm": 0.6953125, + "learning_rate": 0.0003444282981135775, + "loss": 0.1946, + "step": 272910 + }, + { + "epoch": 11.3, + "grad_norm": 1.125, + "learning_rate": 0.0003444182562132605, + "loss": 0.138, + "step": 272920 + }, + { + "epoch": 11.3, + "grad_norm": 1.625, + "learning_rate": 0.00034440821413525856, + "loss": 0.1861, + "step": 272930 + }, + { + "epoch": 11.31, + "grad_norm": 0.578125, + "learning_rate": 0.0003443981718795907, + "loss": 0.1946, + "step": 272940 + }, + { + "epoch": 11.31, + "grad_norm": 0.93359375, + "learning_rate": 0.00034438812944627547, + "loss": 0.2402, + "step": 272950 + }, + { + "epoch": 11.31, + "grad_norm": 0.205078125, + "learning_rate": 0.00034437808683533206, + "loss": 0.2554, + "step": 272960 + }, + { + "epoch": 11.31, + "grad_norm": 0.474609375, + "learning_rate": 0.00034436804404677936, + "loss": 0.1993, + "step": 272970 + }, + { + "epoch": 11.31, + "grad_norm": 0.35546875, + "learning_rate": 0.000344358001080636, + "loss": 0.2077, + "step": 272980 + }, + { + "epoch": 11.31, + "grad_norm": 1.1875, + "learning_rate": 0.00034434795793692125, + "loss": 0.1919, + "step": 272990 + }, + { + "epoch": 11.31, + "grad_norm": 0.90234375, + "learning_rate": 0.0003443379146156538, + "loss": 0.1908, + "step": 273000 + }, + { + "epoch": 11.31, + "grad_norm": 0.92578125, + "learning_rate": 0.0003443278711168525, + "loss": 0.2387, + "step": 273010 + }, + { + "epoch": 11.31, + "grad_norm": 0.95703125, + "learning_rate": 0.0003443178274405364, + "loss": 0.1732, + "step": 273020 + }, + { + "epoch": 11.31, + "grad_norm": 0.7734375, + "learning_rate": 0.00034430778358672434, + "loss": 0.207, + "step": 273030 + }, + { + "epoch": 11.31, + "grad_norm": 0.349609375, + "learning_rate": 0.0003442977395554352, + "loss": 0.1077, + "step": 273040 + }, + { + "epoch": 11.31, + "grad_norm": 0.77734375, + "learning_rate": 0.0003442876953466879, + "loss": 0.18, + "step": 273050 + }, + { + "epoch": 11.31, + "grad_norm": 1.125, + "learning_rate": 0.0003442776509605013, + "loss": 0.2243, + "step": 273060 + }, + { + "epoch": 11.31, + "grad_norm": 0.7890625, + "learning_rate": 0.00034426760639689444, + "loss": 0.1914, + "step": 273070 + }, + { + "epoch": 11.31, + "grad_norm": 0.984375, + "learning_rate": 0.000344257561655886, + "loss": 0.1793, + "step": 273080 + }, + { + "epoch": 11.31, + "grad_norm": 0.5390625, + "learning_rate": 0.0003442475167374951, + "loss": 0.1555, + "step": 273090 + }, + { + "epoch": 11.31, + "grad_norm": 0.40234375, + "learning_rate": 0.0003442374716417406, + "loss": 0.1984, + "step": 273100 + }, + { + "epoch": 11.31, + "grad_norm": 0.72265625, + "learning_rate": 0.00034422742636864114, + "loss": 0.1573, + "step": 273110 + }, + { + "epoch": 11.31, + "grad_norm": 0.6640625, + "learning_rate": 0.00034421738091821604, + "loss": 0.2162, + "step": 273120 + }, + { + "epoch": 11.31, + "grad_norm": 0.59375, + "learning_rate": 0.00034420733529048393, + "loss": 0.1837, + "step": 273130 + }, + { + "epoch": 11.31, + "grad_norm": 0.99609375, + "learning_rate": 0.00034419728948546373, + "loss": 0.1896, + "step": 273140 + }, + { + "epoch": 11.31, + "grad_norm": 0.99609375, + "learning_rate": 0.0003441872435031745, + "loss": 0.2336, + "step": 273150 + }, + { + "epoch": 11.31, + "grad_norm": 0.3125, + "learning_rate": 0.00034417719734363507, + "loss": 0.2051, + "step": 273160 + }, + { + "epoch": 11.31, + "grad_norm": 0.578125, + "learning_rate": 0.0003441671510068642, + "loss": 0.1895, + "step": 273170 + }, + { + "epoch": 11.32, + "grad_norm": 0.53125, + "learning_rate": 0.0003441571044928811, + "loss": 0.2843, + "step": 273180 + }, + { + "epoch": 11.32, + "grad_norm": 1.2421875, + "learning_rate": 0.0003441470578017043, + "loss": 0.1639, + "step": 273190 + }, + { + "epoch": 11.32, + "grad_norm": 0.453125, + "learning_rate": 0.000344137010933353, + "loss": 0.1674, + "step": 273200 + }, + { + "epoch": 11.32, + "grad_norm": 0.9921875, + "learning_rate": 0.00034412696388784605, + "loss": 0.194, + "step": 273210 + }, + { + "epoch": 11.32, + "grad_norm": 0.77734375, + "learning_rate": 0.0003441169166652023, + "loss": 0.1675, + "step": 273220 + }, + { + "epoch": 11.32, + "grad_norm": 0.76953125, + "learning_rate": 0.00034410686926544066, + "loss": 0.2257, + "step": 273230 + }, + { + "epoch": 11.32, + "grad_norm": 1.015625, + "learning_rate": 0.00034409682168858003, + "loss": 0.2172, + "step": 273240 + }, + { + "epoch": 11.32, + "grad_norm": 0.73046875, + "learning_rate": 0.0003440867739346393, + "loss": 0.2241, + "step": 273250 + }, + { + "epoch": 11.32, + "grad_norm": 0.6640625, + "learning_rate": 0.0003440767260036376, + "loss": 0.1832, + "step": 273260 + }, + { + "epoch": 11.32, + "grad_norm": 0.7265625, + "learning_rate": 0.0003440666778955934, + "loss": 0.2292, + "step": 273270 + }, + { + "epoch": 11.32, + "grad_norm": 0.51953125, + "learning_rate": 0.00034405662961052607, + "loss": 0.186, + "step": 273280 + }, + { + "epoch": 11.32, + "grad_norm": 0.7578125, + "learning_rate": 0.00034404658114845426, + "loss": 0.2307, + "step": 273290 + }, + { + "epoch": 11.32, + "grad_norm": 0.69921875, + "learning_rate": 0.0003440365325093969, + "loss": 0.2315, + "step": 273300 + }, + { + "epoch": 11.32, + "grad_norm": 0.6640625, + "learning_rate": 0.0003440264836933731, + "loss": 0.2161, + "step": 273310 + }, + { + "epoch": 11.32, + "grad_norm": 1.0859375, + "learning_rate": 0.00034401643470040143, + "loss": 0.1873, + "step": 273320 + }, + { + "epoch": 11.32, + "grad_norm": 0.921875, + "learning_rate": 0.0003440063855305011, + "loss": 0.204, + "step": 273330 + }, + { + "epoch": 11.32, + "grad_norm": 0.9453125, + "learning_rate": 0.00034399633618369086, + "loss": 0.179, + "step": 273340 + }, + { + "epoch": 11.32, + "grad_norm": 0.373046875, + "learning_rate": 0.0003439862866599896, + "loss": 0.1983, + "step": 273350 + }, + { + "epoch": 11.32, + "grad_norm": 0.875, + "learning_rate": 0.0003439762369594164, + "loss": 0.1884, + "step": 273360 + }, + { + "epoch": 11.32, + "grad_norm": 0.96484375, + "learning_rate": 0.00034396618708199, + "loss": 0.249, + "step": 273370 + }, + { + "epoch": 11.32, + "grad_norm": 0.392578125, + "learning_rate": 0.0003439561370277294, + "loss": 0.2037, + "step": 273380 + }, + { + "epoch": 11.32, + "grad_norm": 0.0076904296875, + "learning_rate": 0.00034394608679665355, + "loss": 0.2044, + "step": 273390 + }, + { + "epoch": 11.32, + "grad_norm": 0.8671875, + "learning_rate": 0.0003439360363887813, + "loss": 0.1911, + "step": 273400 + }, + { + "epoch": 11.32, + "grad_norm": 1.2578125, + "learning_rate": 0.0003439259858041315, + "loss": 0.1861, + "step": 273410 + }, + { + "epoch": 11.33, + "grad_norm": 0.76171875, + "learning_rate": 0.00034391593504272324, + "loss": 0.1735, + "step": 273420 + }, + { + "epoch": 11.33, + "grad_norm": 0.6640625, + "learning_rate": 0.0003439058841045752, + "loss": 0.1721, + "step": 273430 + }, + { + "epoch": 11.33, + "grad_norm": 1.1875, + "learning_rate": 0.00034389583298970655, + "loss": 0.2075, + "step": 273440 + }, + { + "epoch": 11.33, + "grad_norm": 1.21875, + "learning_rate": 0.000343885781698136, + "loss": 0.2046, + "step": 273450 + }, + { + "epoch": 11.33, + "grad_norm": 0.72265625, + "learning_rate": 0.00034387573022988263, + "loss": 0.1962, + "step": 273460 + }, + { + "epoch": 11.33, + "grad_norm": 0.8671875, + "learning_rate": 0.0003438656785849652, + "loss": 0.173, + "step": 273470 + }, + { + "epoch": 11.33, + "grad_norm": 0.6953125, + "learning_rate": 0.0003438556267634027, + "loss": 0.1966, + "step": 273480 + }, + { + "epoch": 11.33, + "grad_norm": 0.58984375, + "learning_rate": 0.00034384557476521406, + "loss": 0.1748, + "step": 273490 + }, + { + "epoch": 11.33, + "grad_norm": 0.7734375, + "learning_rate": 0.00034383552259041817, + "loss": 0.1882, + "step": 273500 + }, + { + "epoch": 11.33, + "grad_norm": 0.58203125, + "learning_rate": 0.000343825470239034, + "loss": 0.1921, + "step": 273510 + }, + { + "epoch": 11.33, + "grad_norm": 0.86328125, + "learning_rate": 0.00034381541771108043, + "loss": 0.2294, + "step": 273520 + }, + { + "epoch": 11.33, + "grad_norm": 1.09375, + "learning_rate": 0.0003438053650065763, + "loss": 0.1871, + "step": 273530 + }, + { + "epoch": 11.33, + "grad_norm": 0.5703125, + "learning_rate": 0.00034379531212554073, + "loss": 0.1822, + "step": 273540 + }, + { + "epoch": 11.33, + "grad_norm": 1.109375, + "learning_rate": 0.00034378525906799247, + "loss": 0.2186, + "step": 273550 + }, + { + "epoch": 11.33, + "grad_norm": 0.62109375, + "learning_rate": 0.00034377520583395046, + "loss": 0.1699, + "step": 273560 + }, + { + "epoch": 11.33, + "grad_norm": 0.83203125, + "learning_rate": 0.0003437651524234336, + "loss": 0.1933, + "step": 273570 + }, + { + "epoch": 11.33, + "grad_norm": 0.7578125, + "learning_rate": 0.00034375509883646083, + "loss": 0.1779, + "step": 273580 + }, + { + "epoch": 11.33, + "grad_norm": 0.6796875, + "learning_rate": 0.00034374504507305115, + "loss": 0.2133, + "step": 273590 + }, + { + "epoch": 11.33, + "grad_norm": 1.1796875, + "learning_rate": 0.00034373499113322345, + "loss": 0.167, + "step": 273600 + }, + { + "epoch": 11.33, + "grad_norm": 1.0625, + "learning_rate": 0.00034372493701699655, + "loss": 0.2032, + "step": 273610 + }, + { + "epoch": 11.33, + "grad_norm": 0.73828125, + "learning_rate": 0.00034371488272438947, + "loss": 0.2279, + "step": 273620 + }, + { + "epoch": 11.33, + "grad_norm": 0.69921875, + "learning_rate": 0.0003437048282554211, + "loss": 0.1743, + "step": 273630 + }, + { + "epoch": 11.33, + "grad_norm": 0.859375, + "learning_rate": 0.00034369477361011034, + "loss": 0.1968, + "step": 273640 + }, + { + "epoch": 11.33, + "grad_norm": 0.85546875, + "learning_rate": 0.0003436847187884762, + "loss": 0.1886, + "step": 273650 + }, + { + "epoch": 11.33, + "grad_norm": 0.54296875, + "learning_rate": 0.0003436746637905375, + "loss": 0.1998, + "step": 273660 + }, + { + "epoch": 11.34, + "grad_norm": 0.88671875, + "learning_rate": 0.0003436646086163131, + "loss": 0.1906, + "step": 273670 + }, + { + "epoch": 11.34, + "grad_norm": 2.90625, + "learning_rate": 0.0003436545532658222, + "loss": 0.2482, + "step": 273680 + }, + { + "epoch": 11.34, + "grad_norm": 1.1640625, + "learning_rate": 0.0003436444977390834, + "loss": 0.2406, + "step": 273690 + }, + { + "epoch": 11.34, + "grad_norm": 0.6953125, + "learning_rate": 0.0003436344420361158, + "loss": 0.2009, + "step": 273700 + }, + { + "epoch": 11.34, + "grad_norm": 1.046875, + "learning_rate": 0.0003436243861569383, + "loss": 0.2302, + "step": 273710 + }, + { + "epoch": 11.34, + "grad_norm": 0.87890625, + "learning_rate": 0.00034361433010156984, + "loss": 0.1952, + "step": 273720 + }, + { + "epoch": 11.34, + "grad_norm": 0.734375, + "learning_rate": 0.00034360427387002937, + "loss": 0.1843, + "step": 273730 + }, + { + "epoch": 11.34, + "grad_norm": 0.76171875, + "learning_rate": 0.00034359421746233565, + "loss": 0.2098, + "step": 273740 + }, + { + "epoch": 11.34, + "grad_norm": 1.2890625, + "learning_rate": 0.00034358416087850775, + "loss": 0.1867, + "step": 273750 + }, + { + "epoch": 11.34, + "grad_norm": 0.640625, + "learning_rate": 0.00034357410411856466, + "loss": 0.2097, + "step": 273760 + }, + { + "epoch": 11.34, + "grad_norm": 0.640625, + "learning_rate": 0.0003435640471825251, + "loss": 0.2364, + "step": 273770 + }, + { + "epoch": 11.34, + "grad_norm": 0.5625, + "learning_rate": 0.00034355399007040817, + "loss": 0.1849, + "step": 273780 + }, + { + "epoch": 11.34, + "grad_norm": 0.4375, + "learning_rate": 0.00034354393278223267, + "loss": 0.2187, + "step": 273790 + }, + { + "epoch": 11.34, + "grad_norm": 2.265625, + "learning_rate": 0.0003435338753180176, + "loss": 0.1931, + "step": 273800 + }, + { + "epoch": 11.34, + "grad_norm": 0.2119140625, + "learning_rate": 0.00034352381767778197, + "loss": 0.1969, + "step": 273810 + }, + { + "epoch": 11.34, + "grad_norm": 1.0546875, + "learning_rate": 0.00034351375986154455, + "loss": 0.2093, + "step": 273820 + }, + { + "epoch": 11.34, + "grad_norm": 0.62109375, + "learning_rate": 0.0003435037018693243, + "loss": 0.2041, + "step": 273830 + }, + { + "epoch": 11.34, + "grad_norm": 2.734375, + "learning_rate": 0.00034349364370114024, + "loss": 0.1701, + "step": 273840 + }, + { + "epoch": 11.34, + "grad_norm": 0.9921875, + "learning_rate": 0.00034348358535701124, + "loss": 0.2007, + "step": 273850 + }, + { + "epoch": 11.34, + "grad_norm": 0.7265625, + "learning_rate": 0.00034347352683695623, + "loss": 0.1701, + "step": 273860 + }, + { + "epoch": 11.34, + "grad_norm": 0.82421875, + "learning_rate": 0.00034346346814099403, + "loss": 0.1872, + "step": 273870 + }, + { + "epoch": 11.34, + "grad_norm": 1.140625, + "learning_rate": 0.0003434534092691438, + "loss": 0.1898, + "step": 273880 + }, + { + "epoch": 11.34, + "grad_norm": 1.2265625, + "learning_rate": 0.0003434433502214243, + "loss": 0.1943, + "step": 273890 + }, + { + "epoch": 11.34, + "grad_norm": 0.78125, + "learning_rate": 0.00034343329099785453, + "loss": 0.1784, + "step": 273900 + }, + { + "epoch": 11.35, + "grad_norm": 1.5078125, + "learning_rate": 0.00034342323159845334, + "loss": 0.2271, + "step": 273910 + }, + { + "epoch": 11.35, + "grad_norm": 0.71484375, + "learning_rate": 0.0003434131720232397, + "loss": 0.1873, + "step": 273920 + }, + { + "epoch": 11.35, + "grad_norm": 1.71875, + "learning_rate": 0.0003434031122722327, + "loss": 0.2231, + "step": 273930 + }, + { + "epoch": 11.35, + "grad_norm": 0.49609375, + "learning_rate": 0.00034339305234545105, + "loss": 0.2136, + "step": 273940 + }, + { + "epoch": 11.35, + "grad_norm": 0.546875, + "learning_rate": 0.0003433829922429137, + "loss": 0.2121, + "step": 273950 + }, + { + "epoch": 11.35, + "grad_norm": 1.2578125, + "learning_rate": 0.0003433729319646397, + "loss": 0.1884, + "step": 273960 + }, + { + "epoch": 11.35, + "grad_norm": 0.77734375, + "learning_rate": 0.00034336287151064793, + "loss": 0.1891, + "step": 273970 + }, + { + "epoch": 11.35, + "grad_norm": 0.86328125, + "learning_rate": 0.0003433528108809573, + "loss": 0.1435, + "step": 273980 + }, + { + "epoch": 11.35, + "grad_norm": 1.796875, + "learning_rate": 0.00034334275007558677, + "loss": 0.1792, + "step": 273990 + }, + { + "epoch": 11.35, + "grad_norm": 0.3125, + "learning_rate": 0.0003433326890945553, + "loss": 0.1899, + "step": 274000 + }, + { + "epoch": 11.35, + "grad_norm": 0.9296875, + "learning_rate": 0.0003433226279378817, + "loss": 0.1943, + "step": 274010 + }, + { + "epoch": 11.35, + "grad_norm": 0.86328125, + "learning_rate": 0.00034331256660558506, + "loss": 0.1795, + "step": 274020 + }, + { + "epoch": 11.35, + "grad_norm": 0.66015625, + "learning_rate": 0.00034330250509768425, + "loss": 0.161, + "step": 274030 + }, + { + "epoch": 11.35, + "grad_norm": 1.0234375, + "learning_rate": 0.0003432924434141982, + "loss": 0.1935, + "step": 274040 + }, + { + "epoch": 11.35, + "grad_norm": 0.392578125, + "learning_rate": 0.0003432823815551458, + "loss": 0.1588, + "step": 274050 + }, + { + "epoch": 11.35, + "grad_norm": 0.4765625, + "learning_rate": 0.000343272319520546, + "loss": 0.1833, + "step": 274060 + }, + { + "epoch": 11.35, + "grad_norm": 1.0078125, + "learning_rate": 0.00034326225731041787, + "loss": 0.1801, + "step": 274070 + }, + { + "epoch": 11.35, + "grad_norm": 0.71875, + "learning_rate": 0.0003432521949247802, + "loss": 0.1748, + "step": 274080 + }, + { + "epoch": 11.35, + "grad_norm": 0.51953125, + "learning_rate": 0.000343242132363652, + "loss": 0.182, + "step": 274090 + }, + { + "epoch": 11.35, + "grad_norm": 0.62890625, + "learning_rate": 0.0003432320696270521, + "loss": 0.1526, + "step": 274100 + }, + { + "epoch": 11.35, + "grad_norm": 0.251953125, + "learning_rate": 0.0003432220067149996, + "loss": 0.2148, + "step": 274110 + }, + { + "epoch": 11.35, + "grad_norm": 0.734375, + "learning_rate": 0.00034321194362751326, + "loss": 0.2376, + "step": 274120 + }, + { + "epoch": 11.35, + "grad_norm": 1.2578125, + "learning_rate": 0.00034320188036461217, + "loss": 0.1549, + "step": 274130 + }, + { + "epoch": 11.35, + "grad_norm": 0.6953125, + "learning_rate": 0.00034319181692631524, + "loss": 0.2042, + "step": 274140 + }, + { + "epoch": 11.36, + "grad_norm": 0.451171875, + "learning_rate": 0.0003431817533126413, + "loss": 0.1582, + "step": 274150 + }, + { + "epoch": 11.36, + "grad_norm": 0.98828125, + "learning_rate": 0.0003431716895236094, + "loss": 0.1454, + "step": 274160 + }, + { + "epoch": 11.36, + "grad_norm": 1.3984375, + "learning_rate": 0.00034316162555923845, + "loss": 0.2306, + "step": 274170 + }, + { + "epoch": 11.36, + "grad_norm": 1.1328125, + "learning_rate": 0.00034315156141954733, + "loss": 0.1735, + "step": 274180 + }, + { + "epoch": 11.36, + "grad_norm": 1.546875, + "learning_rate": 0.00034314149710455504, + "loss": 0.2235, + "step": 274190 + }, + { + "epoch": 11.36, + "grad_norm": 1.234375, + "learning_rate": 0.00034313143261428054, + "loss": 0.2049, + "step": 274200 + }, + { + "epoch": 11.36, + "grad_norm": 0.33984375, + "learning_rate": 0.0003431213679487427, + "loss": 0.2014, + "step": 274210 + }, + { + "epoch": 11.36, + "grad_norm": 0.9921875, + "learning_rate": 0.0003431113031079606, + "loss": 0.2137, + "step": 274220 + }, + { + "epoch": 11.36, + "grad_norm": 1.40625, + "learning_rate": 0.00034310123809195306, + "loss": 0.203, + "step": 274230 + }, + { + "epoch": 11.36, + "grad_norm": 2.296875, + "learning_rate": 0.00034309117290073893, + "loss": 0.2048, + "step": 274240 + }, + { + "epoch": 11.36, + "grad_norm": 0.78515625, + "learning_rate": 0.0003430811075343374, + "loss": 0.14, + "step": 274250 + }, + { + "epoch": 11.36, + "grad_norm": 0.75390625, + "learning_rate": 0.0003430710419927672, + "loss": 0.2051, + "step": 274260 + }, + { + "epoch": 11.36, + "grad_norm": 1.2265625, + "learning_rate": 0.0003430609762760474, + "loss": 0.2041, + "step": 274270 + }, + { + "epoch": 11.36, + "grad_norm": 1.140625, + "learning_rate": 0.00034305091038419676, + "loss": 0.1846, + "step": 274280 + }, + { + "epoch": 11.36, + "grad_norm": 1.546875, + "learning_rate": 0.0003430408443172345, + "loss": 0.2442, + "step": 274290 + }, + { + "epoch": 11.36, + "grad_norm": 0.95703125, + "learning_rate": 0.0003430307780751794, + "loss": 0.147, + "step": 274300 + }, + { + "epoch": 11.36, + "grad_norm": 1.3046875, + "learning_rate": 0.0003430207116580504, + "loss": 0.2036, + "step": 274310 + }, + { + "epoch": 11.36, + "grad_norm": 0.478515625, + "learning_rate": 0.00034301064506586643, + "loss": 0.2119, + "step": 274320 + }, + { + "epoch": 11.36, + "grad_norm": 0.98046875, + "learning_rate": 0.00034300057829864653, + "loss": 0.2108, + "step": 274330 + }, + { + "epoch": 11.36, + "grad_norm": 0.76953125, + "learning_rate": 0.0003429905113564095, + "loss": 0.1898, + "step": 274340 + }, + { + "epoch": 11.36, + "grad_norm": 0.23046875, + "learning_rate": 0.00034298044423917447, + "loss": 0.1749, + "step": 274350 + }, + { + "epoch": 11.36, + "grad_norm": 0.357421875, + "learning_rate": 0.0003429703769469602, + "loss": 0.2402, + "step": 274360 + }, + { + "epoch": 11.36, + "grad_norm": 1.0078125, + "learning_rate": 0.0003429603094797858, + "loss": 0.2139, + "step": 274370 + }, + { + "epoch": 11.36, + "grad_norm": 1.5859375, + "learning_rate": 0.0003429502418376701, + "loss": 0.2135, + "step": 274380 + }, + { + "epoch": 11.37, + "grad_norm": 0.298828125, + "learning_rate": 0.0003429401740206321, + "loss": 0.2181, + "step": 274390 + }, + { + "epoch": 11.37, + "grad_norm": 0.98046875, + "learning_rate": 0.0003429301060286907, + "loss": 0.1464, + "step": 274400 + }, + { + "epoch": 11.37, + "grad_norm": 1.234375, + "learning_rate": 0.00034292003786186493, + "loss": 0.1889, + "step": 274410 + }, + { + "epoch": 11.37, + "grad_norm": 1.1640625, + "learning_rate": 0.0003429099695201736, + "loss": 0.2612, + "step": 274420 + }, + { + "epoch": 11.37, + "grad_norm": 0.38671875, + "learning_rate": 0.00034289990100363577, + "loss": 0.1767, + "step": 274430 + }, + { + "epoch": 11.37, + "grad_norm": 1.2265625, + "learning_rate": 0.00034288983231227035, + "loss": 0.2035, + "step": 274440 + }, + { + "epoch": 11.37, + "grad_norm": 2.359375, + "learning_rate": 0.00034287976344609636, + "loss": 0.215, + "step": 274450 + }, + { + "epoch": 11.37, + "grad_norm": 0.7734375, + "learning_rate": 0.00034286969440513267, + "loss": 0.1433, + "step": 274460 + }, + { + "epoch": 11.37, + "grad_norm": 2.1875, + "learning_rate": 0.0003428596251893981, + "loss": 0.2369, + "step": 274470 + }, + { + "epoch": 11.37, + "grad_norm": 0.46875, + "learning_rate": 0.00034284955579891194, + "loss": 0.1907, + "step": 274480 + }, + { + "epoch": 11.37, + "grad_norm": 0.828125, + "learning_rate": 0.00034283948623369285, + "loss": 0.2612, + "step": 274490 + }, + { + "epoch": 11.37, + "grad_norm": 0.56640625, + "learning_rate": 0.0003428294164937599, + "loss": 0.1843, + "step": 274500 + }, + { + "epoch": 11.37, + "grad_norm": 0.69140625, + "learning_rate": 0.000342819346579132, + "loss": 0.2197, + "step": 274510 + }, + { + "epoch": 11.37, + "grad_norm": 0.4921875, + "learning_rate": 0.00034280927648982807, + "loss": 0.1839, + "step": 274520 + }, + { + "epoch": 11.37, + "grad_norm": 0.30078125, + "learning_rate": 0.00034279920622586713, + "loss": 0.2184, + "step": 274530 + }, + { + "epoch": 11.37, + "grad_norm": 2.203125, + "learning_rate": 0.0003427891357872682, + "loss": 0.2216, + "step": 274540 + }, + { + "epoch": 11.37, + "grad_norm": 1.4765625, + "learning_rate": 0.00034277906517405, + "loss": 0.1675, + "step": 274550 + }, + { + "epoch": 11.37, + "grad_norm": 0.212890625, + "learning_rate": 0.00034276899438623164, + "loss": 0.2305, + "step": 274560 + }, + { + "epoch": 11.37, + "grad_norm": 1.2265625, + "learning_rate": 0.00034275892342383216, + "loss": 0.1909, + "step": 274570 + }, + { + "epoch": 11.37, + "grad_norm": 1.125, + "learning_rate": 0.00034274885228687023, + "loss": 0.1819, + "step": 274580 + }, + { + "epoch": 11.37, + "grad_norm": 0.7578125, + "learning_rate": 0.00034273878097536514, + "loss": 0.1893, + "step": 274590 + }, + { + "epoch": 11.37, + "grad_norm": 0.78125, + "learning_rate": 0.00034272870948933565, + "loss": 0.1743, + "step": 274600 + }, + { + "epoch": 11.37, + "grad_norm": 1.3671875, + "learning_rate": 0.0003427186378288006, + "loss": 0.1691, + "step": 274610 + }, + { + "epoch": 11.37, + "grad_norm": 1.796875, + "learning_rate": 0.0003427085659937793, + "loss": 0.2145, + "step": 274620 + }, + { + "epoch": 11.38, + "grad_norm": 0.2421875, + "learning_rate": 0.0003426984939842903, + "loss": 0.1717, + "step": 274630 + }, + { + "epoch": 11.38, + "grad_norm": 1.125, + "learning_rate": 0.00034268842180035285, + "loss": 0.1472, + "step": 274640 + }, + { + "epoch": 11.38, + "grad_norm": 0.6640625, + "learning_rate": 0.00034267834944198583, + "loss": 0.2264, + "step": 274650 + }, + { + "epoch": 11.38, + "grad_norm": 1.203125, + "learning_rate": 0.0003426682769092081, + "loss": 0.1889, + "step": 274660 + }, + { + "epoch": 11.38, + "grad_norm": 0.74609375, + "learning_rate": 0.0003426582042020387, + "loss": 0.1693, + "step": 274670 + }, + { + "epoch": 11.38, + "grad_norm": 0.875, + "learning_rate": 0.00034264813132049655, + "loss": 0.187, + "step": 274680 + }, + { + "epoch": 11.38, + "grad_norm": 0.73828125, + "learning_rate": 0.0003426380582646006, + "loss": 0.1457, + "step": 274690 + }, + { + "epoch": 11.38, + "grad_norm": 0.82421875, + "learning_rate": 0.0003426279850343699, + "loss": 0.1807, + "step": 274700 + }, + { + "epoch": 11.38, + "grad_norm": 0.71484375, + "learning_rate": 0.00034261791162982326, + "loss": 0.2061, + "step": 274710 + }, + { + "epoch": 11.38, + "grad_norm": 0.4609375, + "learning_rate": 0.0003426078380509798, + "loss": 0.2076, + "step": 274720 + }, + { + "epoch": 11.38, + "grad_norm": 0.6640625, + "learning_rate": 0.0003425977642978584, + "loss": 0.1642, + "step": 274730 + }, + { + "epoch": 11.38, + "grad_norm": 0.2001953125, + "learning_rate": 0.00034258769037047784, + "loss": 0.1494, + "step": 274740 + }, + { + "epoch": 11.38, + "grad_norm": 1.4765625, + "learning_rate": 0.00034257761626885734, + "loss": 0.1923, + "step": 274750 + }, + { + "epoch": 11.38, + "grad_norm": 1.0625, + "learning_rate": 0.00034256754199301585, + "loss": 0.2404, + "step": 274760 + }, + { + "epoch": 11.38, + "grad_norm": 1.6015625, + "learning_rate": 0.0003425574675429721, + "loss": 0.2236, + "step": 274770 + }, + { + "epoch": 11.38, + "grad_norm": 1.2734375, + "learning_rate": 0.0003425473929187453, + "loss": 0.202, + "step": 274780 + }, + { + "epoch": 11.38, + "grad_norm": 0.7890625, + "learning_rate": 0.0003425373181203542, + "loss": 0.2121, + "step": 274790 + }, + { + "epoch": 11.38, + "grad_norm": 0.99609375, + "learning_rate": 0.000342527243147818, + "loss": 0.1879, + "step": 274800 + }, + { + "epoch": 11.38, + "grad_norm": 0.53125, + "learning_rate": 0.0003425171680011554, + "loss": 0.2283, + "step": 274810 + }, + { + "epoch": 11.38, + "grad_norm": 1.2265625, + "learning_rate": 0.0003425070926803855, + "loss": 0.2006, + "step": 274820 + }, + { + "epoch": 11.38, + "grad_norm": 0.76171875, + "learning_rate": 0.0003424970171855273, + "loss": 0.2058, + "step": 274830 + }, + { + "epoch": 11.38, + "grad_norm": 1.5625, + "learning_rate": 0.00034248694151659963, + "loss": 0.235, + "step": 274840 + }, + { + "epoch": 11.38, + "grad_norm": 0.81640625, + "learning_rate": 0.0003424768656736216, + "loss": 0.1904, + "step": 274850 + }, + { + "epoch": 11.38, + "grad_norm": 0.77734375, + "learning_rate": 0.00034246678965661206, + "loss": 0.1993, + "step": 274860 + }, + { + "epoch": 11.39, + "grad_norm": 0.92578125, + "learning_rate": 0.00034245671346558993, + "loss": 0.1793, + "step": 274870 + }, + { + "epoch": 11.39, + "grad_norm": 0.5, + "learning_rate": 0.00034244663710057433, + "loss": 0.2589, + "step": 274880 + }, + { + "epoch": 11.39, + "grad_norm": 0.26171875, + "learning_rate": 0.00034243656056158413, + "loss": 0.2158, + "step": 274890 + }, + { + "epoch": 11.39, + "grad_norm": 0.31640625, + "learning_rate": 0.0003424264838486383, + "loss": 0.2271, + "step": 274900 + }, + { + "epoch": 11.39, + "grad_norm": 1.6328125, + "learning_rate": 0.00034241640696175586, + "loss": 0.2003, + "step": 274910 + }, + { + "epoch": 11.39, + "grad_norm": 0.921875, + "learning_rate": 0.00034240632990095563, + "loss": 0.1982, + "step": 274920 + }, + { + "epoch": 11.39, + "grad_norm": 0.765625, + "learning_rate": 0.00034239625266625675, + "loss": 0.1788, + "step": 274930 + }, + { + "epoch": 11.39, + "grad_norm": 0.87109375, + "learning_rate": 0.0003423861752576781, + "loss": 0.2347, + "step": 274940 + }, + { + "epoch": 11.39, + "grad_norm": 4.125, + "learning_rate": 0.00034237609767523853, + "loss": 0.2045, + "step": 274950 + }, + { + "epoch": 11.39, + "grad_norm": 0.921875, + "learning_rate": 0.0003423660199189572, + "loss": 0.1673, + "step": 274960 + }, + { + "epoch": 11.39, + "grad_norm": 1.2890625, + "learning_rate": 0.000342355941988853, + "loss": 0.2152, + "step": 274970 + }, + { + "epoch": 11.39, + "grad_norm": 0.76171875, + "learning_rate": 0.0003423458638849449, + "loss": 0.1785, + "step": 274980 + }, + { + "epoch": 11.39, + "grad_norm": 1.3515625, + "learning_rate": 0.00034233578560725187, + "loss": 0.2432, + "step": 274990 + }, + { + "epoch": 11.39, + "grad_norm": 0.9140625, + "learning_rate": 0.0003423257071557928, + "loss": 0.158, + "step": 275000 + }, + { + "epoch": 11.39, + "grad_norm": 0.35546875, + "learning_rate": 0.0003423156285305868, + "loss": 0.1683, + "step": 275010 + }, + { + "epoch": 11.39, + "grad_norm": 0.8359375, + "learning_rate": 0.00034230554973165267, + "loss": 0.1864, + "step": 275020 + }, + { + "epoch": 11.39, + "grad_norm": 0.32421875, + "learning_rate": 0.00034229547075900947, + "loss": 0.1719, + "step": 275030 + }, + { + "epoch": 11.39, + "grad_norm": 1.25, + "learning_rate": 0.00034228539161267627, + "loss": 0.19, + "step": 275040 + }, + { + "epoch": 11.39, + "grad_norm": 0.56640625, + "learning_rate": 0.0003422753122926718, + "loss": 0.2111, + "step": 275050 + }, + { + "epoch": 11.39, + "grad_norm": 0.4140625, + "learning_rate": 0.0003422652327990152, + "loss": 0.2295, + "step": 275060 + }, + { + "epoch": 11.39, + "grad_norm": 0.66796875, + "learning_rate": 0.00034225515313172553, + "loss": 0.1716, + "step": 275070 + }, + { + "epoch": 11.39, + "grad_norm": 1.171875, + "learning_rate": 0.00034224507329082146, + "loss": 0.186, + "step": 275080 + }, + { + "epoch": 11.39, + "grad_norm": 0.8984375, + "learning_rate": 0.0003422349932763221, + "loss": 0.1667, + "step": 275090 + }, + { + "epoch": 11.39, + "grad_norm": 1.2265625, + "learning_rate": 0.00034222491308824653, + "loss": 0.2006, + "step": 275100 + }, + { + "epoch": 11.4, + "grad_norm": 0.73046875, + "learning_rate": 0.0003422148327266136, + "loss": 0.1511, + "step": 275110 + }, + { + "epoch": 11.4, + "grad_norm": 0.95703125, + "learning_rate": 0.00034220475219144236, + "loss": 0.2191, + "step": 275120 + }, + { + "epoch": 11.4, + "grad_norm": 0.39453125, + "learning_rate": 0.0003421946714827517, + "loss": 0.2009, + "step": 275130 + }, + { + "epoch": 11.4, + "grad_norm": 0.66015625, + "learning_rate": 0.00034218459060056063, + "loss": 0.1507, + "step": 275140 + }, + { + "epoch": 11.4, + "grad_norm": 0.50390625, + "learning_rate": 0.0003421745095448881, + "loss": 0.1765, + "step": 275150 + }, + { + "epoch": 11.4, + "grad_norm": 0.6640625, + "learning_rate": 0.00034216442831575314, + "loss": 0.1643, + "step": 275160 + }, + { + "epoch": 11.4, + "grad_norm": 1.25, + "learning_rate": 0.0003421543469131746, + "loss": 0.1806, + "step": 275170 + }, + { + "epoch": 11.4, + "grad_norm": 1.2109375, + "learning_rate": 0.00034214426533717157, + "loss": 0.2274, + "step": 275180 + }, + { + "epoch": 11.4, + "grad_norm": 0.4609375, + "learning_rate": 0.00034213418358776295, + "loss": 0.2032, + "step": 275190 + }, + { + "epoch": 11.4, + "grad_norm": 0.75390625, + "learning_rate": 0.0003421241016649678, + "loss": 0.1888, + "step": 275200 + }, + { + "epoch": 11.4, + "grad_norm": 2.953125, + "learning_rate": 0.00034211401956880504, + "loss": 0.222, + "step": 275210 + }, + { + "epoch": 11.4, + "grad_norm": 1.2421875, + "learning_rate": 0.00034210393729929364, + "loss": 0.1966, + "step": 275220 + }, + { + "epoch": 11.4, + "grad_norm": 0.9453125, + "learning_rate": 0.0003420938548564525, + "loss": 0.172, + "step": 275230 + }, + { + "epoch": 11.4, + "grad_norm": 0.8046875, + "learning_rate": 0.0003420837722403008, + "loss": 0.2349, + "step": 275240 + }, + { + "epoch": 11.4, + "grad_norm": 0.6484375, + "learning_rate": 0.00034207368945085733, + "loss": 0.2293, + "step": 275250 + }, + { + "epoch": 11.4, + "grad_norm": 1.4375, + "learning_rate": 0.00034206360648814105, + "loss": 0.2133, + "step": 275260 + }, + { + "epoch": 11.4, + "grad_norm": 0.73046875, + "learning_rate": 0.0003420535233521711, + "loss": 0.1693, + "step": 275270 + }, + { + "epoch": 11.4, + "grad_norm": 0.96875, + "learning_rate": 0.00034204344004296636, + "loss": 0.1931, + "step": 275280 + }, + { + "epoch": 11.4, + "grad_norm": 6.125, + "learning_rate": 0.00034203335656054573, + "loss": 0.212, + "step": 275290 + }, + { + "epoch": 11.4, + "grad_norm": 0.314453125, + "learning_rate": 0.0003420232729049283, + "loss": 0.2397, + "step": 275300 + }, + { + "epoch": 11.4, + "grad_norm": 0.94921875, + "learning_rate": 0.000342013189076133, + "loss": 0.1836, + "step": 275310 + }, + { + "epoch": 11.4, + "grad_norm": 1.15625, + "learning_rate": 0.00034200310507417886, + "loss": 0.1988, + "step": 275320 + }, + { + "epoch": 11.4, + "grad_norm": 0.7578125, + "learning_rate": 0.0003419930208990847, + "loss": 0.2093, + "step": 275330 + }, + { + "epoch": 11.4, + "grad_norm": 0.25390625, + "learning_rate": 0.0003419829365508697, + "loss": 0.1977, + "step": 275340 + }, + { + "epoch": 11.4, + "grad_norm": 1.3359375, + "learning_rate": 0.00034197285202955277, + "loss": 0.1889, + "step": 275350 + }, + { + "epoch": 11.41, + "grad_norm": 0.73046875, + "learning_rate": 0.00034196276733515274, + "loss": 0.1699, + "step": 275360 + }, + { + "epoch": 11.41, + "grad_norm": 0.70703125, + "learning_rate": 0.00034195268246768883, + "loss": 0.2637, + "step": 275370 + }, + { + "epoch": 11.41, + "grad_norm": 0.85546875, + "learning_rate": 0.0003419425974271799, + "loss": 0.232, + "step": 275380 + }, + { + "epoch": 11.41, + "grad_norm": 0.58203125, + "learning_rate": 0.00034193251221364485, + "loss": 0.1745, + "step": 275390 + }, + { + "epoch": 11.41, + "grad_norm": 1.0, + "learning_rate": 0.00034192242682710283, + "loss": 0.2244, + "step": 275400 + }, + { + "epoch": 11.41, + "grad_norm": 0.38671875, + "learning_rate": 0.0003419123412675727, + "loss": 0.1448, + "step": 275410 + }, + { + "epoch": 11.41, + "grad_norm": 0.69140625, + "learning_rate": 0.00034190225553507346, + "loss": 0.2018, + "step": 275420 + }, + { + "epoch": 11.41, + "grad_norm": 0.2734375, + "learning_rate": 0.0003418921696296241, + "loss": 0.1508, + "step": 275430 + }, + { + "epoch": 11.41, + "grad_norm": 0.435546875, + "learning_rate": 0.00034188208355124356, + "loss": 0.2082, + "step": 275440 + }, + { + "epoch": 11.41, + "grad_norm": 0.181640625, + "learning_rate": 0.00034187199729995095, + "loss": 0.1791, + "step": 275450 + }, + { + "epoch": 11.41, + "grad_norm": 1.0625, + "learning_rate": 0.0003418619108757651, + "loss": 0.1944, + "step": 275460 + }, + { + "epoch": 11.41, + "grad_norm": 0.77734375, + "learning_rate": 0.00034185182427870503, + "loss": 0.2961, + "step": 275470 + }, + { + "epoch": 11.41, + "grad_norm": 1.390625, + "learning_rate": 0.00034184173750878983, + "loss": 0.1785, + "step": 275480 + }, + { + "epoch": 11.41, + "grad_norm": 0.5390625, + "learning_rate": 0.00034183165056603834, + "loss": 0.2118, + "step": 275490 + }, + { + "epoch": 11.41, + "grad_norm": 0.83203125, + "learning_rate": 0.0003418215634504696, + "loss": 0.2113, + "step": 275500 + }, + { + "epoch": 11.41, + "grad_norm": 0.455078125, + "learning_rate": 0.0003418114761621026, + "loss": 0.182, + "step": 275510 + }, + { + "epoch": 11.41, + "grad_norm": 0.78125, + "learning_rate": 0.00034180138870095627, + "loss": 0.2186, + "step": 275520 + }, + { + "epoch": 11.41, + "grad_norm": 0.80859375, + "learning_rate": 0.00034179130106704974, + "loss": 0.2065, + "step": 275530 + }, + { + "epoch": 11.41, + "grad_norm": 0.8203125, + "learning_rate": 0.00034178121326040184, + "loss": 0.2171, + "step": 275540 + }, + { + "epoch": 11.41, + "grad_norm": 1.3046875, + "learning_rate": 0.0003417711252810316, + "loss": 0.1916, + "step": 275550 + }, + { + "epoch": 11.41, + "grad_norm": 0.96875, + "learning_rate": 0.000341761037128958, + "loss": 0.1805, + "step": 275560 + }, + { + "epoch": 11.41, + "grad_norm": 0.84765625, + "learning_rate": 0.0003417509488042001, + "loss": 0.2319, + "step": 275570 + }, + { + "epoch": 11.41, + "grad_norm": 0.3671875, + "learning_rate": 0.0003417408603067768, + "loss": 0.1738, + "step": 275580 + }, + { + "epoch": 11.41, + "grad_norm": 0.5859375, + "learning_rate": 0.00034173077163670706, + "loss": 0.2214, + "step": 275590 + }, + { + "epoch": 11.42, + "grad_norm": 0.53125, + "learning_rate": 0.00034172068279400997, + "loss": 0.1637, + "step": 275600 + }, + { + "epoch": 11.42, + "grad_norm": 0.494140625, + "learning_rate": 0.00034171059377870443, + "loss": 0.1446, + "step": 275610 + }, + { + "epoch": 11.42, + "grad_norm": 0.87109375, + "learning_rate": 0.00034170050459080947, + "loss": 0.2289, + "step": 275620 + }, + { + "epoch": 11.42, + "grad_norm": 0.40234375, + "learning_rate": 0.00034169041523034405, + "loss": 0.2158, + "step": 275630 + }, + { + "epoch": 11.42, + "grad_norm": 0.484375, + "learning_rate": 0.00034168032569732715, + "loss": 0.2083, + "step": 275640 + }, + { + "epoch": 11.42, + "grad_norm": 1.03125, + "learning_rate": 0.0003416702359917778, + "loss": 0.1917, + "step": 275650 + }, + { + "epoch": 11.42, + "grad_norm": 0.9453125, + "learning_rate": 0.000341660146113715, + "loss": 0.2002, + "step": 275660 + }, + { + "epoch": 11.42, + "grad_norm": 0.75390625, + "learning_rate": 0.0003416500560631577, + "loss": 0.174, + "step": 275670 + }, + { + "epoch": 11.42, + "grad_norm": 1.296875, + "learning_rate": 0.00034163996584012486, + "loss": 0.1563, + "step": 275680 + }, + { + "epoch": 11.42, + "grad_norm": 0.263671875, + "learning_rate": 0.00034162987544463554, + "loss": 0.2039, + "step": 275690 + }, + { + "epoch": 11.42, + "grad_norm": 0.2734375, + "learning_rate": 0.00034161978487670864, + "loss": 0.1753, + "step": 275700 + }, + { + "epoch": 11.42, + "grad_norm": 1.2265625, + "learning_rate": 0.0003416096941363632, + "loss": 0.2196, + "step": 275710 + }, + { + "epoch": 11.42, + "grad_norm": 1.015625, + "learning_rate": 0.00034159960322361824, + "loss": 0.1785, + "step": 275720 + }, + { + "epoch": 11.42, + "grad_norm": 0.8046875, + "learning_rate": 0.00034158951213849264, + "loss": 0.1847, + "step": 275730 + }, + { + "epoch": 11.42, + "grad_norm": 1.8046875, + "learning_rate": 0.00034157942088100556, + "loss": 0.1507, + "step": 275740 + }, + { + "epoch": 11.42, + "grad_norm": 0.640625, + "learning_rate": 0.0003415693294511759, + "loss": 0.196, + "step": 275750 + }, + { + "epoch": 11.42, + "grad_norm": 0.71875, + "learning_rate": 0.0003415592378490226, + "loss": 0.2126, + "step": 275760 + }, + { + "epoch": 11.42, + "grad_norm": 1.2890625, + "learning_rate": 0.0003415491460745648, + "loss": 0.1791, + "step": 275770 + }, + { + "epoch": 11.42, + "grad_norm": 0.70703125, + "learning_rate": 0.00034153905412782123, + "loss": 0.209, + "step": 275780 + }, + { + "epoch": 11.42, + "grad_norm": 0.94921875, + "learning_rate": 0.0003415289620088111, + "loss": 0.1904, + "step": 275790 + }, + { + "epoch": 11.42, + "grad_norm": 1.84375, + "learning_rate": 0.0003415188697175534, + "loss": 0.2187, + "step": 275800 + }, + { + "epoch": 11.42, + "grad_norm": 0.76953125, + "learning_rate": 0.000341508777254067, + "loss": 0.1813, + "step": 275810 + }, + { + "epoch": 11.42, + "grad_norm": 0.6640625, + "learning_rate": 0.0003414986846183711, + "loss": 0.1668, + "step": 275820 + }, + { + "epoch": 11.42, + "grad_norm": 0.5859375, + "learning_rate": 0.0003414885918104845, + "loss": 0.1926, + "step": 275830 + }, + { + "epoch": 11.43, + "grad_norm": 0.83984375, + "learning_rate": 0.00034147849883042617, + "loss": 0.1772, + "step": 275840 + }, + { + "epoch": 11.43, + "grad_norm": 0.66796875, + "learning_rate": 0.00034146840567821524, + "loss": 0.2314, + "step": 275850 + }, + { + "epoch": 11.43, + "grad_norm": 1.1875, + "learning_rate": 0.0003414583123538706, + "loss": 0.1761, + "step": 275860 + }, + { + "epoch": 11.43, + "grad_norm": 0.9609375, + "learning_rate": 0.0003414482188574114, + "loss": 0.1948, + "step": 275870 + }, + { + "epoch": 11.43, + "grad_norm": 1.2265625, + "learning_rate": 0.0003414381251888564, + "loss": 0.1333, + "step": 275880 + }, + { + "epoch": 11.43, + "grad_norm": 0.7109375, + "learning_rate": 0.0003414280313482248, + "loss": 0.1911, + "step": 275890 + }, + { + "epoch": 11.43, + "grad_norm": 0.5859375, + "learning_rate": 0.0003414179373355356, + "loss": 0.1968, + "step": 275900 + }, + { + "epoch": 11.43, + "grad_norm": 2.109375, + "learning_rate": 0.00034140784315080753, + "loss": 0.1462, + "step": 275910 + }, + { + "epoch": 11.43, + "grad_norm": 0.55078125, + "learning_rate": 0.00034139774879405986, + "loss": 0.1897, + "step": 275920 + }, + { + "epoch": 11.43, + "grad_norm": 0.9375, + "learning_rate": 0.00034138765426531154, + "loss": 0.1778, + "step": 275930 + }, + { + "epoch": 11.43, + "grad_norm": 0.47265625, + "learning_rate": 0.0003413775595645814, + "loss": 0.1745, + "step": 275940 + }, + { + "epoch": 11.43, + "grad_norm": 0.23828125, + "learning_rate": 0.0003413674646918887, + "loss": 0.2092, + "step": 275950 + }, + { + "epoch": 11.43, + "grad_norm": 1.203125, + "learning_rate": 0.00034135736964725224, + "loss": 0.1796, + "step": 275960 + }, + { + "epoch": 11.43, + "grad_norm": 0.6875, + "learning_rate": 0.00034134727443069105, + "loss": 0.2003, + "step": 275970 + }, + { + "epoch": 11.43, + "grad_norm": 2.03125, + "learning_rate": 0.00034133717904222423, + "loss": 0.207, + "step": 275980 + }, + { + "epoch": 11.43, + "grad_norm": 0.89453125, + "learning_rate": 0.00034132708348187066, + "loss": 0.2767, + "step": 275990 + }, + { + "epoch": 11.43, + "grad_norm": 0.71875, + "learning_rate": 0.0003413169877496493, + "loss": 0.1918, + "step": 276000 + }, + { + "epoch": 11.43, + "grad_norm": 0.859375, + "learning_rate": 0.0003413068918455794, + "loss": 0.1789, + "step": 276010 + }, + { + "epoch": 11.43, + "grad_norm": 0.9453125, + "learning_rate": 0.0003412967957696796, + "loss": 0.1965, + "step": 276020 + }, + { + "epoch": 11.43, + "grad_norm": 0.30859375, + "learning_rate": 0.0003412866995219692, + "loss": 0.1734, + "step": 276030 + }, + { + "epoch": 11.43, + "grad_norm": 1.21875, + "learning_rate": 0.00034127660310246706, + "loss": 0.1919, + "step": 276040 + }, + { + "epoch": 11.43, + "grad_norm": 1.6015625, + "learning_rate": 0.0003412665065111922, + "loss": 0.2342, + "step": 276050 + }, + { + "epoch": 11.43, + "grad_norm": 0.61328125, + "learning_rate": 0.00034125640974816374, + "loss": 0.242, + "step": 276060 + }, + { + "epoch": 11.43, + "grad_norm": 0.6640625, + "learning_rate": 0.0003412463128134004, + "loss": 0.2178, + "step": 276070 + }, + { + "epoch": 11.44, + "grad_norm": 1.6953125, + "learning_rate": 0.0003412362157069215, + "loss": 0.2265, + "step": 276080 + }, + { + "epoch": 11.44, + "grad_norm": 0.9765625, + "learning_rate": 0.00034122611842874584, + "loss": 0.1581, + "step": 276090 + }, + { + "epoch": 11.44, + "grad_norm": 0.482421875, + "learning_rate": 0.0003412160209788924, + "loss": 0.196, + "step": 276100 + }, + { + "epoch": 11.44, + "grad_norm": 2.296875, + "learning_rate": 0.0003412059233573804, + "loss": 0.2121, + "step": 276110 + }, + { + "epoch": 11.44, + "grad_norm": 0.953125, + "learning_rate": 0.00034119582556422855, + "loss": 0.2495, + "step": 276120 + }, + { + "epoch": 11.44, + "grad_norm": 1.03125, + "learning_rate": 0.0003411857275994561, + "loss": 0.1945, + "step": 276130 + }, + { + "epoch": 11.44, + "grad_norm": 0.45703125, + "learning_rate": 0.000341175629463082, + "loss": 0.1798, + "step": 276140 + }, + { + "epoch": 11.44, + "grad_norm": 0.7421875, + "learning_rate": 0.00034116553115512503, + "loss": 0.1874, + "step": 276150 + }, + { + "epoch": 11.44, + "grad_norm": 1.28125, + "learning_rate": 0.00034115543267560454, + "loss": 0.1948, + "step": 276160 + }, + { + "epoch": 11.44, + "grad_norm": 0.76171875, + "learning_rate": 0.0003411453340245394, + "loss": 0.1835, + "step": 276170 + }, + { + "epoch": 11.44, + "grad_norm": 0.83984375, + "learning_rate": 0.00034113523520194847, + "loss": 0.1967, + "step": 276180 + }, + { + "epoch": 11.44, + "grad_norm": 0.43359375, + "learning_rate": 0.00034112513620785094, + "loss": 0.218, + "step": 276190 + }, + { + "epoch": 11.44, + "grad_norm": 1.53125, + "learning_rate": 0.0003411150370422657, + "loss": 0.2025, + "step": 276200 + }, + { + "epoch": 11.44, + "grad_norm": 1.2890625, + "learning_rate": 0.00034110493770521177, + "loss": 0.2508, + "step": 276210 + }, + { + "epoch": 11.44, + "grad_norm": 1.1015625, + "learning_rate": 0.0003410948381967083, + "loss": 0.2074, + "step": 276220 + }, + { + "epoch": 11.44, + "grad_norm": 0.54296875, + "learning_rate": 0.00034108473851677406, + "loss": 0.155, + "step": 276230 + }, + { + "epoch": 11.44, + "grad_norm": 0.69140625, + "learning_rate": 0.00034107463866542824, + "loss": 0.2011, + "step": 276240 + }, + { + "epoch": 11.44, + "grad_norm": 0.640625, + "learning_rate": 0.00034106453864268983, + "loss": 0.2202, + "step": 276250 + }, + { + "epoch": 11.44, + "grad_norm": 0.486328125, + "learning_rate": 0.0003410544384485777, + "loss": 0.2094, + "step": 276260 + }, + { + "epoch": 11.44, + "grad_norm": 0.828125, + "learning_rate": 0.000341044338083111, + "loss": 0.153, + "step": 276270 + }, + { + "epoch": 11.44, + "grad_norm": 1.0703125, + "learning_rate": 0.0003410342375463087, + "loss": 0.2034, + "step": 276280 + }, + { + "epoch": 11.44, + "grad_norm": 0.0986328125, + "learning_rate": 0.0003410241368381897, + "loss": 0.1984, + "step": 276290 + }, + { + "epoch": 11.44, + "grad_norm": 1.234375, + "learning_rate": 0.00034101403595877323, + "loss": 0.1754, + "step": 276300 + }, + { + "epoch": 11.44, + "grad_norm": 0.0, + "learning_rate": 0.0003410039349080781, + "loss": 0.1582, + "step": 276310 + }, + { + "epoch": 11.45, + "grad_norm": 0.328125, + "learning_rate": 0.0003409938336861235, + "loss": 0.2029, + "step": 276320 + }, + { + "epoch": 11.45, + "grad_norm": 0.875, + "learning_rate": 0.00034098373229292817, + "loss": 0.2107, + "step": 276330 + }, + { + "epoch": 11.45, + "grad_norm": 0.77734375, + "learning_rate": 0.00034097363072851136, + "loss": 0.1819, + "step": 276340 + }, + { + "epoch": 11.45, + "grad_norm": 1.5859375, + "learning_rate": 0.00034096352899289203, + "loss": 0.2397, + "step": 276350 + }, + { + "epoch": 11.45, + "grad_norm": 0.58984375, + "learning_rate": 0.0003409534270860891, + "loss": 0.1878, + "step": 276360 + }, + { + "epoch": 11.45, + "grad_norm": 0.66015625, + "learning_rate": 0.0003409433250081216, + "loss": 0.1799, + "step": 276370 + }, + { + "epoch": 11.45, + "grad_norm": 1.1171875, + "learning_rate": 0.00034093322275900875, + "loss": 0.149, + "step": 276380 + }, + { + "epoch": 11.45, + "grad_norm": 0.71875, + "learning_rate": 0.00034092312033876925, + "loss": 0.2232, + "step": 276390 + }, + { + "epoch": 11.45, + "grad_norm": 0.201171875, + "learning_rate": 0.0003409130177474223, + "loss": 0.208, + "step": 276400 + }, + { + "epoch": 11.45, + "grad_norm": 0.80078125, + "learning_rate": 0.0003409029149849869, + "loss": 0.1979, + "step": 276410 + }, + { + "epoch": 11.45, + "grad_norm": 0.78125, + "learning_rate": 0.00034089281205148197, + "loss": 0.2011, + "step": 276420 + }, + { + "epoch": 11.45, + "grad_norm": 0.515625, + "learning_rate": 0.00034088270894692664, + "loss": 0.1705, + "step": 276430 + }, + { + "epoch": 11.45, + "grad_norm": 0.8984375, + "learning_rate": 0.0003408726056713399, + "loss": 0.199, + "step": 276440 + }, + { + "epoch": 11.45, + "grad_norm": 0.9765625, + "learning_rate": 0.0003408625022247406, + "loss": 0.2156, + "step": 276450 + }, + { + "epoch": 11.45, + "grad_norm": 0.470703125, + "learning_rate": 0.000340852398607148, + "loss": 0.1461, + "step": 276460 + }, + { + "epoch": 11.45, + "grad_norm": 0.90234375, + "learning_rate": 0.00034084229481858084, + "loss": 0.1814, + "step": 276470 + }, + { + "epoch": 11.45, + "grad_norm": 0.6171875, + "learning_rate": 0.0003408321908590585, + "loss": 0.221, + "step": 276480 + }, + { + "epoch": 11.45, + "grad_norm": 0.953125, + "learning_rate": 0.00034082208672859974, + "loss": 0.2099, + "step": 276490 + }, + { + "epoch": 11.45, + "grad_norm": 0.333984375, + "learning_rate": 0.0003408119824272235, + "loss": 0.1884, + "step": 276500 + }, + { + "epoch": 11.45, + "grad_norm": 1.1953125, + "learning_rate": 0.000340801877954949, + "loss": 0.2009, + "step": 276510 + }, + { + "epoch": 11.45, + "grad_norm": 0.341796875, + "learning_rate": 0.0003407917733117952, + "loss": 0.1868, + "step": 276520 + }, + { + "epoch": 11.45, + "grad_norm": 0.58984375, + "learning_rate": 0.000340781668497781, + "loss": 0.1414, + "step": 276530 + }, + { + "epoch": 11.45, + "grad_norm": 0.474609375, + "learning_rate": 0.0003407715635129255, + "loss": 0.1635, + "step": 276540 + }, + { + "epoch": 11.45, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003407614583572478, + "loss": 0.191, + "step": 276550 + }, + { + "epoch": 11.46, + "grad_norm": 1.0703125, + "learning_rate": 0.0003407513530307668, + "loss": 0.1907, + "step": 276560 + }, + { + "epoch": 11.46, + "grad_norm": 0.86328125, + "learning_rate": 0.00034074124753350155, + "loss": 0.1821, + "step": 276570 + }, + { + "epoch": 11.46, + "grad_norm": 1.484375, + "learning_rate": 0.0003407311418654711, + "loss": 0.165, + "step": 276580 + }, + { + "epoch": 11.46, + "grad_norm": 1.125, + "learning_rate": 0.0003407210360266945, + "loss": 0.2136, + "step": 276590 + }, + { + "epoch": 11.46, + "grad_norm": 0.88671875, + "learning_rate": 0.0003407109300171906, + "loss": 0.2436, + "step": 276600 + }, + { + "epoch": 11.46, + "grad_norm": 0.51953125, + "learning_rate": 0.00034070082383697855, + "loss": 0.1594, + "step": 276610 + }, + { + "epoch": 11.46, + "grad_norm": 0.9296875, + "learning_rate": 0.00034069071748607727, + "loss": 0.1766, + "step": 276620 + }, + { + "epoch": 11.46, + "grad_norm": 1.828125, + "learning_rate": 0.000340680610964506, + "loss": 0.1747, + "step": 276630 + }, + { + "epoch": 11.46, + "grad_norm": 0.98828125, + "learning_rate": 0.00034067050427228355, + "loss": 0.1737, + "step": 276640 + }, + { + "epoch": 11.46, + "grad_norm": 0.294921875, + "learning_rate": 0.00034066039740942894, + "loss": 0.2229, + "step": 276650 + }, + { + "epoch": 11.46, + "grad_norm": 1.1328125, + "learning_rate": 0.0003406502903759613, + "loss": 0.182, + "step": 276660 + }, + { + "epoch": 11.46, + "grad_norm": 1.5234375, + "learning_rate": 0.00034064018317189965, + "loss": 0.165, + "step": 276670 + }, + { + "epoch": 11.46, + "grad_norm": 1.4453125, + "learning_rate": 0.0003406300757972629, + "loss": 0.1686, + "step": 276680 + }, + { + "epoch": 11.46, + "grad_norm": 1.453125, + "learning_rate": 0.0003406199682520702, + "loss": 0.1927, + "step": 276690 + }, + { + "epoch": 11.46, + "grad_norm": 1.671875, + "learning_rate": 0.0003406098605363404, + "loss": 0.2593, + "step": 276700 + }, + { + "epoch": 11.46, + "grad_norm": 0.3203125, + "learning_rate": 0.00034059975265009266, + "loss": 0.2471, + "step": 276710 + }, + { + "epoch": 11.46, + "grad_norm": 0.431640625, + "learning_rate": 0.00034058964459334607, + "loss": 0.2031, + "step": 276720 + }, + { + "epoch": 11.46, + "grad_norm": 1.703125, + "learning_rate": 0.00034057953636611947, + "loss": 0.2118, + "step": 276730 + }, + { + "epoch": 11.46, + "grad_norm": 0.9140625, + "learning_rate": 0.000340569427968432, + "loss": 0.2358, + "step": 276740 + }, + { + "epoch": 11.46, + "grad_norm": 0.5390625, + "learning_rate": 0.0003405593194003026, + "loss": 0.1877, + "step": 276750 + }, + { + "epoch": 11.46, + "grad_norm": 0.78515625, + "learning_rate": 0.00034054921066175035, + "loss": 0.1828, + "step": 276760 + }, + { + "epoch": 11.46, + "grad_norm": 0.0, + "learning_rate": 0.00034053910175279434, + "loss": 0.1804, + "step": 276770 + }, + { + "epoch": 11.46, + "grad_norm": 0.66796875, + "learning_rate": 0.00034052899267345346, + "loss": 0.1661, + "step": 276780 + }, + { + "epoch": 11.46, + "grad_norm": 0.88671875, + "learning_rate": 0.00034051888342374675, + "loss": 0.1504, + "step": 276790 + }, + { + "epoch": 11.47, + "grad_norm": 0.8828125, + "learning_rate": 0.00034050877400369333, + "loss": 0.1625, + "step": 276800 + }, + { + "epoch": 11.47, + "grad_norm": 0.80859375, + "learning_rate": 0.0003404986644133122, + "loss": 0.1705, + "step": 276810 + }, + { + "epoch": 11.47, + "grad_norm": 1.53125, + "learning_rate": 0.0003404885546526223, + "loss": 0.181, + "step": 276820 + }, + { + "epoch": 11.47, + "grad_norm": 0.98046875, + "learning_rate": 0.00034047844472164267, + "loss": 0.1761, + "step": 276830 + }, + { + "epoch": 11.47, + "grad_norm": 0.72265625, + "learning_rate": 0.0003404683346203925, + "loss": 0.2149, + "step": 276840 + }, + { + "epoch": 11.47, + "grad_norm": 0.62109375, + "learning_rate": 0.0003404582243488906, + "loss": 0.138, + "step": 276850 + }, + { + "epoch": 11.47, + "grad_norm": 1.2109375, + "learning_rate": 0.00034044811390715615, + "loss": 0.2273, + "step": 276860 + }, + { + "epoch": 11.47, + "grad_norm": 0.53515625, + "learning_rate": 0.000340438003295208, + "loss": 0.1998, + "step": 276870 + }, + { + "epoch": 11.47, + "grad_norm": 0.82421875, + "learning_rate": 0.0003404278925130654, + "loss": 0.1892, + "step": 276880 + }, + { + "epoch": 11.47, + "grad_norm": 0.68359375, + "learning_rate": 0.0003404177815607473, + "loss": 0.1782, + "step": 276890 + }, + { + "epoch": 11.47, + "grad_norm": 1.328125, + "learning_rate": 0.00034040767043827264, + "loss": 0.1698, + "step": 276900 + }, + { + "epoch": 11.47, + "grad_norm": 1.375, + "learning_rate": 0.00034039755914566053, + "loss": 0.2485, + "step": 276910 + }, + { + "epoch": 11.47, + "grad_norm": 1.203125, + "learning_rate": 0.00034038744768292994, + "loss": 0.1826, + "step": 276920 + }, + { + "epoch": 11.47, + "grad_norm": 1.296875, + "learning_rate": 0.00034037733605009996, + "loss": 0.2201, + "step": 276930 + }, + { + "epoch": 11.47, + "grad_norm": 1.65625, + "learning_rate": 0.00034036722424718956, + "loss": 0.2373, + "step": 276940 + }, + { + "epoch": 11.47, + "grad_norm": 1.0625, + "learning_rate": 0.0003403571122742178, + "loss": 0.1912, + "step": 276950 + }, + { + "epoch": 11.47, + "grad_norm": 0.62109375, + "learning_rate": 0.00034034700013120376, + "loss": 0.2142, + "step": 276960 + }, + { + "epoch": 11.47, + "grad_norm": 0.2392578125, + "learning_rate": 0.0003403368878181665, + "loss": 0.1394, + "step": 276970 + }, + { + "epoch": 11.47, + "grad_norm": 0.6953125, + "learning_rate": 0.00034032677533512483, + "loss": 0.1948, + "step": 276980 + }, + { + "epoch": 11.47, + "grad_norm": 0.39453125, + "learning_rate": 0.000340316662682098, + "loss": 0.1928, + "step": 276990 + }, + { + "epoch": 11.47, + "grad_norm": 1.203125, + "learning_rate": 0.000340306549859105, + "loss": 0.2003, + "step": 277000 + }, + { + "epoch": 11.47, + "grad_norm": 0.63671875, + "learning_rate": 0.00034029643686616474, + "loss": 0.1779, + "step": 277010 + }, + { + "epoch": 11.47, + "grad_norm": 1.40625, + "learning_rate": 0.0003402863237032963, + "loss": 0.1866, + "step": 277020 + }, + { + "epoch": 11.47, + "grad_norm": 1.2265625, + "learning_rate": 0.00034027621037051884, + "loss": 0.2054, + "step": 277030 + }, + { + "epoch": 11.47, + "grad_norm": 1.3671875, + "learning_rate": 0.00034026609686785124, + "loss": 0.1839, + "step": 277040 + }, + { + "epoch": 11.48, + "grad_norm": 0.61328125, + "learning_rate": 0.0003402559831953127, + "loss": 0.2066, + "step": 277050 + }, + { + "epoch": 11.48, + "grad_norm": 0.89453125, + "learning_rate": 0.000340245869352922, + "loss": 0.227, + "step": 277060 + }, + { + "epoch": 11.48, + "grad_norm": 0.63671875, + "learning_rate": 0.0003402357553406984, + "loss": 0.2591, + "step": 277070 + }, + { + "epoch": 11.48, + "grad_norm": 0.341796875, + "learning_rate": 0.0003402256411586609, + "loss": 0.1759, + "step": 277080 + }, + { + "epoch": 11.48, + "grad_norm": 0.6328125, + "learning_rate": 0.00034021552680682835, + "loss": 0.2254, + "step": 277090 + }, + { + "epoch": 11.48, + "grad_norm": 0.671875, + "learning_rate": 0.00034020541228522007, + "loss": 0.1863, + "step": 277100 + }, + { + "epoch": 11.48, + "grad_norm": 3.828125, + "learning_rate": 0.00034019529759385484, + "loss": 0.1525, + "step": 277110 + }, + { + "epoch": 11.48, + "grad_norm": 0.57421875, + "learning_rate": 0.00034018518273275185, + "loss": 0.2438, + "step": 277120 + }, + { + "epoch": 11.48, + "grad_norm": 1.328125, + "learning_rate": 0.00034017506770193007, + "loss": 0.2153, + "step": 277130 + }, + { + "epoch": 11.48, + "grad_norm": 0.59375, + "learning_rate": 0.00034016495250140853, + "loss": 0.2518, + "step": 277140 + }, + { + "epoch": 11.48, + "grad_norm": 0.376953125, + "learning_rate": 0.0003401548371312063, + "loss": 0.1981, + "step": 277150 + }, + { + "epoch": 11.48, + "grad_norm": 0.6328125, + "learning_rate": 0.00034014472159134247, + "loss": 0.1878, + "step": 277160 + }, + { + "epoch": 11.48, + "grad_norm": 0.44921875, + "learning_rate": 0.00034013460588183593, + "loss": 0.1907, + "step": 277170 + }, + { + "epoch": 11.48, + "grad_norm": 1.640625, + "learning_rate": 0.00034012449000270586, + "loss": 0.1673, + "step": 277180 + }, + { + "epoch": 11.48, + "grad_norm": 1.9375, + "learning_rate": 0.00034011437395397117, + "loss": 0.2408, + "step": 277190 + }, + { + "epoch": 11.48, + "grad_norm": 0.64453125, + "learning_rate": 0.00034010425773565095, + "loss": 0.1911, + "step": 277200 + }, + { + "epoch": 11.48, + "grad_norm": 0.6171875, + "learning_rate": 0.00034009414134776426, + "loss": 0.1496, + "step": 277210 + }, + { + "epoch": 11.48, + "grad_norm": 1.2734375, + "learning_rate": 0.0003400840247903302, + "loss": 0.1947, + "step": 277220 + }, + { + "epoch": 11.48, + "grad_norm": 0.78125, + "learning_rate": 0.00034007390806336764, + "loss": 0.2154, + "step": 277230 + }, + { + "epoch": 11.48, + "grad_norm": 1.59375, + "learning_rate": 0.00034006379116689576, + "loss": 0.1844, + "step": 277240 + }, + { + "epoch": 11.48, + "grad_norm": 0.2734375, + "learning_rate": 0.00034005367410093354, + "loss": 0.2128, + "step": 277250 + }, + { + "epoch": 11.48, + "grad_norm": 0.423828125, + "learning_rate": 0.00034004355686550005, + "loss": 0.1709, + "step": 277260 + }, + { + "epoch": 11.48, + "grad_norm": 0.7265625, + "learning_rate": 0.0003400334394606143, + "loss": 0.1972, + "step": 277270 + }, + { + "epoch": 11.48, + "grad_norm": 2.15625, + "learning_rate": 0.00034002332188629524, + "loss": 0.1835, + "step": 277280 + }, + { + "epoch": 11.49, + "grad_norm": 0.59375, + "learning_rate": 0.0003400132041425622, + "loss": 0.219, + "step": 277290 + }, + { + "epoch": 11.49, + "grad_norm": 1.0234375, + "learning_rate": 0.0003400030862294338, + "loss": 0.1824, + "step": 277300 + }, + { + "epoch": 11.49, + "grad_norm": 2.015625, + "learning_rate": 0.00033999296814692955, + "loss": 0.2228, + "step": 277310 + }, + { + "epoch": 11.49, + "grad_norm": 0.421875, + "learning_rate": 0.0003399828498950682, + "loss": 0.1599, + "step": 277320 + }, + { + "epoch": 11.49, + "grad_norm": 0.82421875, + "learning_rate": 0.00033997273147386864, + "loss": 0.1931, + "step": 277330 + }, + { + "epoch": 11.49, + "grad_norm": 1.1328125, + "learning_rate": 0.0003399626128833503, + "loss": 0.1632, + "step": 277340 + }, + { + "epoch": 11.49, + "grad_norm": 0.28125, + "learning_rate": 0.000339952494123532, + "loss": 0.2417, + "step": 277350 + }, + { + "epoch": 11.49, + "grad_norm": 0.3203125, + "learning_rate": 0.00033994237519443285, + "loss": 0.1672, + "step": 277360 + }, + { + "epoch": 11.49, + "grad_norm": 1.2578125, + "learning_rate": 0.00033993225609607185, + "loss": 0.1881, + "step": 277370 + }, + { + "epoch": 11.49, + "grad_norm": 0.73046875, + "learning_rate": 0.000339922136828468, + "loss": 0.217, + "step": 277380 + }, + { + "epoch": 11.49, + "grad_norm": 0.359375, + "learning_rate": 0.0003399120173916405, + "loss": 0.216, + "step": 277390 + }, + { + "epoch": 11.49, + "grad_norm": 0.5546875, + "learning_rate": 0.00033990189778560816, + "loss": 0.2215, + "step": 277400 + }, + { + "epoch": 11.49, + "grad_norm": 0.65234375, + "learning_rate": 0.0003398917780103902, + "loss": 0.1924, + "step": 277410 + }, + { + "epoch": 11.49, + "grad_norm": 0.462890625, + "learning_rate": 0.0003398816580660057, + "loss": 0.2179, + "step": 277420 + }, + { + "epoch": 11.49, + "grad_norm": 0.5546875, + "learning_rate": 0.0003398715379524736, + "loss": 0.2413, + "step": 277430 + }, + { + "epoch": 11.49, + "grad_norm": 1.078125, + "learning_rate": 0.0003398614176698129, + "loss": 0.1885, + "step": 277440 + }, + { + "epoch": 11.49, + "grad_norm": 0.69921875, + "learning_rate": 0.00033985129721804283, + "loss": 0.2246, + "step": 277450 + }, + { + "epoch": 11.49, + "grad_norm": 0.53515625, + "learning_rate": 0.00033984117659718215, + "loss": 0.1983, + "step": 277460 + }, + { + "epoch": 11.49, + "grad_norm": 0.369140625, + "learning_rate": 0.0003398310558072503, + "loss": 0.1891, + "step": 277470 + }, + { + "epoch": 11.49, + "grad_norm": 1.1953125, + "learning_rate": 0.0003398209348482659, + "loss": 0.2539, + "step": 277480 + }, + { + "epoch": 11.49, + "grad_norm": 1.65625, + "learning_rate": 0.0003398108137202483, + "loss": 0.2406, + "step": 277490 + }, + { + "epoch": 11.49, + "grad_norm": 1.0078125, + "learning_rate": 0.00033980069242321644, + "loss": 0.1973, + "step": 277500 + }, + { + "epoch": 11.49, + "grad_norm": 0.76953125, + "learning_rate": 0.0003397905709571894, + "loss": 0.1475, + "step": 277510 + }, + { + "epoch": 11.49, + "grad_norm": 0.640625, + "learning_rate": 0.0003397804493221861, + "loss": 0.183, + "step": 277520 + }, + { + "epoch": 11.5, + "grad_norm": 0.365234375, + "learning_rate": 0.00033977032751822585, + "loss": 0.1471, + "step": 277530 + }, + { + "epoch": 11.5, + "grad_norm": 0.765625, + "learning_rate": 0.0003397602055453274, + "loss": 0.1828, + "step": 277540 + }, + { + "epoch": 11.5, + "grad_norm": 0.55078125, + "learning_rate": 0.00033975008340351, + "loss": 0.1739, + "step": 277550 + }, + { + "epoch": 11.5, + "grad_norm": 0.462890625, + "learning_rate": 0.00033973996109279257, + "loss": 0.1467, + "step": 277560 + }, + { + "epoch": 11.5, + "grad_norm": 0.92578125, + "learning_rate": 0.0003397298386131943, + "loss": 0.1853, + "step": 277570 + }, + { + "epoch": 11.5, + "grad_norm": 1.6171875, + "learning_rate": 0.00033971971596473416, + "loss": 0.1983, + "step": 277580 + }, + { + "epoch": 11.5, + "grad_norm": 0.65625, + "learning_rate": 0.0003397095931474312, + "loss": 0.2436, + "step": 277590 + }, + { + "epoch": 11.5, + "grad_norm": 0.90625, + "learning_rate": 0.0003396994701613044, + "loss": 0.1353, + "step": 277600 + }, + { + "epoch": 11.5, + "grad_norm": 0.359375, + "learning_rate": 0.00033968934700637297, + "loss": 0.2059, + "step": 277610 + }, + { + "epoch": 11.5, + "grad_norm": 0.78125, + "learning_rate": 0.0003396792236826558, + "loss": 0.1742, + "step": 277620 + }, + { + "epoch": 11.5, + "grad_norm": 0.71484375, + "learning_rate": 0.00033966910019017204, + "loss": 0.2026, + "step": 277630 + }, + { + "epoch": 11.5, + "grad_norm": 0.98828125, + "learning_rate": 0.0003396589765289407, + "loss": 0.226, + "step": 277640 + }, + { + "epoch": 11.5, + "grad_norm": 0.349609375, + "learning_rate": 0.00033964885269898093, + "loss": 0.1968, + "step": 277650 + }, + { + "epoch": 11.5, + "grad_norm": 0.57421875, + "learning_rate": 0.0003396387287003117, + "loss": 0.2057, + "step": 277660 + }, + { + "epoch": 11.5, + "grad_norm": 0.3984375, + "learning_rate": 0.000339628604532952, + "loss": 0.2136, + "step": 277670 + }, + { + "epoch": 11.5, + "grad_norm": 0.62109375, + "learning_rate": 0.00033961848019692094, + "loss": 0.1785, + "step": 277680 + }, + { + "epoch": 11.5, + "grad_norm": 0.40234375, + "learning_rate": 0.00033960835569223763, + "loss": 0.1775, + "step": 277690 + }, + { + "epoch": 11.5, + "grad_norm": 0.86328125, + "learning_rate": 0.000339598231018921, + "loss": 0.2024, + "step": 277700 + }, + { + "epoch": 11.5, + "grad_norm": 0.78515625, + "learning_rate": 0.00033958810617699026, + "loss": 0.176, + "step": 277710 + }, + { + "epoch": 11.5, + "grad_norm": 0.859375, + "learning_rate": 0.0003395779811664643, + "loss": 0.1574, + "step": 277720 + }, + { + "epoch": 11.5, + "grad_norm": 1.2734375, + "learning_rate": 0.0003395678559873623, + "loss": 0.2065, + "step": 277730 + }, + { + "epoch": 11.5, + "grad_norm": 0.640625, + "learning_rate": 0.0003395577306397033, + "loss": 0.1861, + "step": 277740 + }, + { + "epoch": 11.5, + "grad_norm": 0.921875, + "learning_rate": 0.00033954760512350626, + "loss": 0.2253, + "step": 277750 + }, + { + "epoch": 11.5, + "grad_norm": 0.52734375, + "learning_rate": 0.0003395374794387903, + "loss": 0.1577, + "step": 277760 + }, + { + "epoch": 11.51, + "grad_norm": 0.671875, + "learning_rate": 0.0003395273535855745, + "loss": 0.1767, + "step": 277770 + }, + { + "epoch": 11.51, + "grad_norm": 0.703125, + "learning_rate": 0.0003395172275638778, + "loss": 0.2127, + "step": 277780 + }, + { + "epoch": 11.51, + "grad_norm": 0.65625, + "learning_rate": 0.0003395071013737195, + "loss": 0.1757, + "step": 277790 + }, + { + "epoch": 11.51, + "grad_norm": 0.59375, + "learning_rate": 0.0003394969750151184, + "loss": 0.1732, + "step": 277800 + }, + { + "epoch": 11.51, + "grad_norm": 0.84375, + "learning_rate": 0.00033948684848809365, + "loss": 0.177, + "step": 277810 + }, + { + "epoch": 11.51, + "grad_norm": 0.2890625, + "learning_rate": 0.00033947672179266444, + "loss": 0.1967, + "step": 277820 + }, + { + "epoch": 11.51, + "grad_norm": 0.9921875, + "learning_rate": 0.0003394665949288496, + "loss": 0.2105, + "step": 277830 + }, + { + "epoch": 11.51, + "grad_norm": 0.984375, + "learning_rate": 0.0003394564678966682, + "loss": 0.1732, + "step": 277840 + }, + { + "epoch": 11.51, + "grad_norm": 2.09375, + "learning_rate": 0.0003394463406961395, + "loss": 0.2044, + "step": 277850 + }, + { + "epoch": 11.51, + "grad_norm": 0.9140625, + "learning_rate": 0.0003394362133272824, + "loss": 0.2032, + "step": 277860 + }, + { + "epoch": 11.51, + "grad_norm": 1.6953125, + "learning_rate": 0.00033942608579011615, + "loss": 0.2153, + "step": 277870 + }, + { + "epoch": 11.51, + "grad_norm": 0.70703125, + "learning_rate": 0.00033941595808465954, + "loss": 0.2001, + "step": 277880 + }, + { + "epoch": 11.51, + "grad_norm": 0.83984375, + "learning_rate": 0.0003394058302109317, + "loss": 0.2226, + "step": 277890 + }, + { + "epoch": 11.51, + "grad_norm": 0.01123046875, + "learning_rate": 0.0003393957021689519, + "loss": 0.1863, + "step": 277900 + }, + { + "epoch": 11.51, + "grad_norm": 2.578125, + "learning_rate": 0.0003393855739587389, + "loss": 0.2237, + "step": 277910 + }, + { + "epoch": 11.51, + "grad_norm": 0.7109375, + "learning_rate": 0.0003393754455803119, + "loss": 0.145, + "step": 277920 + }, + { + "epoch": 11.51, + "grad_norm": 0.7734375, + "learning_rate": 0.00033936531703369, + "loss": 0.1824, + "step": 277930 + }, + { + "epoch": 11.51, + "grad_norm": 0.54296875, + "learning_rate": 0.0003393551883188922, + "loss": 0.1752, + "step": 277940 + }, + { + "epoch": 11.51, + "grad_norm": 1.1015625, + "learning_rate": 0.00033934505943593774, + "loss": 0.2468, + "step": 277950 + }, + { + "epoch": 11.51, + "grad_norm": 0.9140625, + "learning_rate": 0.00033933493038484534, + "loss": 0.1941, + "step": 277960 + }, + { + "epoch": 11.51, + "grad_norm": 0.6171875, + "learning_rate": 0.0003393248011656344, + "loss": 0.2074, + "step": 277970 + }, + { + "epoch": 11.51, + "grad_norm": 0.056884765625, + "learning_rate": 0.00033931467177832373, + "loss": 0.1805, + "step": 277980 + }, + { + "epoch": 11.51, + "grad_norm": 0.63671875, + "learning_rate": 0.0003393045422229324, + "loss": 0.1744, + "step": 277990 + }, + { + "epoch": 11.51, + "grad_norm": 1.4765625, + "learning_rate": 0.0003392944124994798, + "loss": 0.1537, + "step": 278000 + }, + { + "epoch": 11.52, + "grad_norm": 0.8046875, + "learning_rate": 0.0003392842826079846, + "loss": 0.1867, + "step": 278010 + }, + { + "epoch": 11.52, + "grad_norm": 0.7421875, + "learning_rate": 0.0003392741525484661, + "loss": 0.1591, + "step": 278020 + }, + { + "epoch": 11.52, + "grad_norm": 1.15625, + "learning_rate": 0.0003392640223209432, + "loss": 0.1998, + "step": 278030 + }, + { + "epoch": 11.52, + "grad_norm": 0.97265625, + "learning_rate": 0.0003392538919254351, + "loss": 0.1757, + "step": 278040 + }, + { + "epoch": 11.52, + "grad_norm": 0.61328125, + "learning_rate": 0.00033924376136196076, + "loss": 0.2076, + "step": 278050 + }, + { + "epoch": 11.52, + "grad_norm": 0.65625, + "learning_rate": 0.00033923363063053936, + "loss": 0.1983, + "step": 278060 + }, + { + "epoch": 11.52, + "grad_norm": 1.1953125, + "learning_rate": 0.0003392234997311899, + "loss": 0.1781, + "step": 278070 + }, + { + "epoch": 11.52, + "grad_norm": 0.419921875, + "learning_rate": 0.0003392133686639314, + "loss": 0.1892, + "step": 278080 + }, + { + "epoch": 11.52, + "grad_norm": 0.95703125, + "learning_rate": 0.00033920323742878307, + "loss": 0.1922, + "step": 278090 + }, + { + "epoch": 11.52, + "grad_norm": 0.99609375, + "learning_rate": 0.0003391931060257638, + "loss": 0.2082, + "step": 278100 + }, + { + "epoch": 11.52, + "grad_norm": 1.1484375, + "learning_rate": 0.00033918297445489275, + "loss": 0.2019, + "step": 278110 + }, + { + "epoch": 11.52, + "grad_norm": 0.9453125, + "learning_rate": 0.000339172842716189, + "loss": 0.1829, + "step": 278120 + }, + { + "epoch": 11.52, + "grad_norm": 0.765625, + "learning_rate": 0.00033916271080967154, + "loss": 0.2081, + "step": 278130 + }, + { + "epoch": 11.52, + "grad_norm": 0.490234375, + "learning_rate": 0.00033915257873535956, + "loss": 0.151, + "step": 278140 + }, + { + "epoch": 11.52, + "grad_norm": 0.46875, + "learning_rate": 0.00033914244649327195, + "loss": 0.2334, + "step": 278150 + }, + { + "epoch": 11.52, + "grad_norm": 0.5234375, + "learning_rate": 0.000339132314083428, + "loss": 0.1662, + "step": 278160 + }, + { + "epoch": 11.52, + "grad_norm": 0.2578125, + "learning_rate": 0.0003391221815058466, + "loss": 0.1759, + "step": 278170 + }, + { + "epoch": 11.52, + "grad_norm": 0.88671875, + "learning_rate": 0.0003391120487605469, + "loss": 0.1579, + "step": 278180 + }, + { + "epoch": 11.52, + "grad_norm": 1.15625, + "learning_rate": 0.00033910191584754794, + "loss": 0.1931, + "step": 278190 + }, + { + "epoch": 11.52, + "grad_norm": 0.408203125, + "learning_rate": 0.0003390917827668688, + "loss": 0.244, + "step": 278200 + }, + { + "epoch": 11.52, + "grad_norm": 0.91796875, + "learning_rate": 0.0003390816495185285, + "loss": 0.1809, + "step": 278210 + }, + { + "epoch": 11.52, + "grad_norm": 1.9296875, + "learning_rate": 0.00033907151610254615, + "loss": 0.2077, + "step": 278220 + }, + { + "epoch": 11.52, + "grad_norm": 1.265625, + "learning_rate": 0.00033906138251894093, + "loss": 0.1859, + "step": 278230 + }, + { + "epoch": 11.52, + "grad_norm": 0.50390625, + "learning_rate": 0.0003390512487677317, + "loss": 0.1994, + "step": 278240 + }, + { + "epoch": 11.53, + "grad_norm": 0.515625, + "learning_rate": 0.0003390411148489377, + "loss": 0.2053, + "step": 278250 + }, + { + "epoch": 11.53, + "grad_norm": 0.74609375, + "learning_rate": 0.00033903098076257785, + "loss": 0.2322, + "step": 278260 + }, + { + "epoch": 11.53, + "grad_norm": 0.56640625, + "learning_rate": 0.0003390208465086714, + "loss": 0.1812, + "step": 278270 + }, + { + "epoch": 11.53, + "grad_norm": 0.90625, + "learning_rate": 0.0003390107120872373, + "loss": 0.2104, + "step": 278280 + }, + { + "epoch": 11.53, + "grad_norm": 0.8359375, + "learning_rate": 0.00033900057749829474, + "loss": 0.2159, + "step": 278290 + }, + { + "epoch": 11.53, + "grad_norm": 0.4453125, + "learning_rate": 0.0003389904427418626, + "loss": 0.1906, + "step": 278300 + }, + { + "epoch": 11.53, + "grad_norm": 0.79296875, + "learning_rate": 0.00033898030781796004, + "loss": 0.1655, + "step": 278310 + }, + { + "epoch": 11.53, + "grad_norm": 0.48046875, + "learning_rate": 0.00033897017272660625, + "loss": 0.1873, + "step": 278320 + }, + { + "epoch": 11.53, + "grad_norm": 0.41015625, + "learning_rate": 0.0003389600374678201, + "loss": 0.2207, + "step": 278330 + }, + { + "epoch": 11.53, + "grad_norm": 0.609375, + "learning_rate": 0.00033894990204162077, + "loss": 0.204, + "step": 278340 + }, + { + "epoch": 11.53, + "grad_norm": 0.71484375, + "learning_rate": 0.0003389397664480274, + "loss": 0.2214, + "step": 278350 + }, + { + "epoch": 11.53, + "grad_norm": 0.68359375, + "learning_rate": 0.00033892963068705897, + "loss": 0.1694, + "step": 278360 + }, + { + "epoch": 11.53, + "grad_norm": 0.7890625, + "learning_rate": 0.00033891949475873455, + "loss": 0.1814, + "step": 278370 + }, + { + "epoch": 11.53, + "grad_norm": 0.62890625, + "learning_rate": 0.0003389093586630733, + "loss": 0.2552, + "step": 278380 + }, + { + "epoch": 11.53, + "grad_norm": 0.80078125, + "learning_rate": 0.0003388992224000942, + "loss": 0.1742, + "step": 278390 + }, + { + "epoch": 11.53, + "grad_norm": 0.6171875, + "learning_rate": 0.0003388890859698163, + "loss": 0.1749, + "step": 278400 + }, + { + "epoch": 11.53, + "grad_norm": 1.34375, + "learning_rate": 0.00033887894937225884, + "loss": 0.1934, + "step": 278410 + }, + { + "epoch": 11.53, + "grad_norm": 0.68359375, + "learning_rate": 0.00033886881260744074, + "loss": 0.2389, + "step": 278420 + }, + { + "epoch": 11.53, + "grad_norm": 1.2890625, + "learning_rate": 0.00033885867567538114, + "loss": 0.1869, + "step": 278430 + }, + { + "epoch": 11.53, + "grad_norm": 0.341796875, + "learning_rate": 0.0003388485385760992, + "loss": 0.2159, + "step": 278440 + }, + { + "epoch": 11.53, + "grad_norm": 0.97265625, + "learning_rate": 0.0003388384013096138, + "loss": 0.2038, + "step": 278450 + }, + { + "epoch": 11.53, + "grad_norm": 0.60546875, + "learning_rate": 0.0003388282638759441, + "loss": 0.1894, + "step": 278460 + }, + { + "epoch": 11.53, + "grad_norm": 0.61328125, + "learning_rate": 0.00033881812627510925, + "loss": 0.1976, + "step": 278470 + }, + { + "epoch": 11.53, + "grad_norm": 0.490234375, + "learning_rate": 0.0003388079885071283, + "loss": 0.2147, + "step": 278480 + }, + { + "epoch": 11.54, + "grad_norm": 0.58203125, + "learning_rate": 0.00033879785057202026, + "loss": 0.1714, + "step": 278490 + }, + { + "epoch": 11.54, + "grad_norm": 0.63671875, + "learning_rate": 0.00033878771246980435, + "loss": 0.2279, + "step": 278500 + }, + { + "epoch": 11.54, + "grad_norm": 0.84375, + "learning_rate": 0.00033877757420049944, + "loss": 0.1907, + "step": 278510 + }, + { + "epoch": 11.54, + "grad_norm": 0.7109375, + "learning_rate": 0.0003387674357641248, + "loss": 0.193, + "step": 278520 + }, + { + "epoch": 11.54, + "grad_norm": 0.5625, + "learning_rate": 0.00033875729716069934, + "loss": 0.1931, + "step": 278530 + }, + { + "epoch": 11.54, + "grad_norm": 0.435546875, + "learning_rate": 0.0003387471583902423, + "loss": 0.1966, + "step": 278540 + }, + { + "epoch": 11.54, + "grad_norm": 0.69140625, + "learning_rate": 0.0003387370194527727, + "loss": 0.2299, + "step": 278550 + }, + { + "epoch": 11.54, + "grad_norm": 1.25, + "learning_rate": 0.0003387268803483096, + "loss": 0.1917, + "step": 278560 + }, + { + "epoch": 11.54, + "grad_norm": 1.390625, + "learning_rate": 0.00033871674107687204, + "loss": 0.1506, + "step": 278570 + }, + { + "epoch": 11.54, + "grad_norm": 1.4296875, + "learning_rate": 0.0003387066016384792, + "loss": 0.1858, + "step": 278580 + }, + { + "epoch": 11.54, + "grad_norm": 1.1953125, + "learning_rate": 0.00033869646203315006, + "loss": 0.2227, + "step": 278590 + }, + { + "epoch": 11.54, + "grad_norm": 0.25390625, + "learning_rate": 0.0003386863222609039, + "loss": 0.2008, + "step": 278600 + }, + { + "epoch": 11.54, + "grad_norm": 0.796875, + "learning_rate": 0.0003386761823217595, + "loss": 0.2327, + "step": 278610 + }, + { + "epoch": 11.54, + "grad_norm": 0.6875, + "learning_rate": 0.0003386660422157362, + "loss": 0.1677, + "step": 278620 + }, + { + "epoch": 11.54, + "grad_norm": 0.00531005859375, + "learning_rate": 0.00033865590194285297, + "loss": 0.1653, + "step": 278630 + }, + { + "epoch": 11.54, + "grad_norm": 0.72265625, + "learning_rate": 0.0003386457615031289, + "loss": 0.2058, + "step": 278640 + }, + { + "epoch": 11.54, + "grad_norm": 0.3515625, + "learning_rate": 0.00033863562089658306, + "loss": 0.126, + "step": 278650 + }, + { + "epoch": 11.54, + "grad_norm": 0.76953125, + "learning_rate": 0.0003386254801232346, + "loss": 0.1626, + "step": 278660 + }, + { + "epoch": 11.54, + "grad_norm": 0.240234375, + "learning_rate": 0.00033861533918310246, + "loss": 0.2332, + "step": 278670 + }, + { + "epoch": 11.54, + "grad_norm": 0.78125, + "learning_rate": 0.00033860519807620595, + "loss": 0.1595, + "step": 278680 + }, + { + "epoch": 11.54, + "grad_norm": 0.43359375, + "learning_rate": 0.00033859505680256387, + "loss": 0.1667, + "step": 278690 + }, + { + "epoch": 11.54, + "grad_norm": 0.828125, + "learning_rate": 0.0003385849153621956, + "loss": 0.1779, + "step": 278700 + }, + { + "epoch": 11.54, + "grad_norm": 1.1875, + "learning_rate": 0.00033857477375512, + "loss": 0.2207, + "step": 278710 + }, + { + "epoch": 11.54, + "grad_norm": 0.85546875, + "learning_rate": 0.0003385646319813562, + "loss": 0.179, + "step": 278720 + }, + { + "epoch": 11.54, + "grad_norm": 0.76953125, + "learning_rate": 0.0003385544900409235, + "loss": 0.1994, + "step": 278730 + }, + { + "epoch": 11.55, + "grad_norm": 0.6640625, + "learning_rate": 0.0003385443479338407, + "loss": 0.1577, + "step": 278740 + }, + { + "epoch": 11.55, + "grad_norm": 1.0625, + "learning_rate": 0.000338534205660127, + "loss": 0.1885, + "step": 278750 + }, + { + "epoch": 11.55, + "grad_norm": 0.53515625, + "learning_rate": 0.00033852406321980156, + "loss": 0.1597, + "step": 278760 + }, + { + "epoch": 11.55, + "grad_norm": 2.109375, + "learning_rate": 0.0003385139206128832, + "loss": 0.1381, + "step": 278770 + }, + { + "epoch": 11.55, + "grad_norm": 0.37109375, + "learning_rate": 0.0003385037778393914, + "loss": 0.195, + "step": 278780 + }, + { + "epoch": 11.55, + "grad_norm": 0.6484375, + "learning_rate": 0.000338493634899345, + "loss": 0.2072, + "step": 278790 + }, + { + "epoch": 11.55, + "grad_norm": 1.1796875, + "learning_rate": 0.0003384834917927631, + "loss": 0.205, + "step": 278800 + }, + { + "epoch": 11.55, + "grad_norm": 1.109375, + "learning_rate": 0.00033847334851966484, + "loss": 0.1957, + "step": 278810 + }, + { + "epoch": 11.55, + "grad_norm": 0.78515625, + "learning_rate": 0.00033846320508006925, + "loss": 0.1387, + "step": 278820 + }, + { + "epoch": 11.55, + "grad_norm": 1.53125, + "learning_rate": 0.0003384530614739955, + "loss": 0.2167, + "step": 278830 + }, + { + "epoch": 11.55, + "grad_norm": 0.76171875, + "learning_rate": 0.00033844291770146267, + "loss": 0.1681, + "step": 278840 + }, + { + "epoch": 11.55, + "grad_norm": 0.87890625, + "learning_rate": 0.0003384327737624898, + "loss": 0.1956, + "step": 278850 + }, + { + "epoch": 11.55, + "grad_norm": 0.287109375, + "learning_rate": 0.00033842262965709595, + "loss": 0.173, + "step": 278860 + }, + { + "epoch": 11.55, + "grad_norm": 1.0, + "learning_rate": 0.0003384124853853003, + "loss": 0.2005, + "step": 278870 + }, + { + "epoch": 11.55, + "grad_norm": 1.6875, + "learning_rate": 0.0003384023409471219, + "loss": 0.2159, + "step": 278880 + }, + { + "epoch": 11.55, + "grad_norm": 0.35546875, + "learning_rate": 0.0003383921963425799, + "loss": 0.1832, + "step": 278890 + }, + { + "epoch": 11.55, + "grad_norm": 0.41796875, + "learning_rate": 0.00033838205157169323, + "loss": 0.2004, + "step": 278900 + }, + { + "epoch": 11.55, + "grad_norm": 0.5546875, + "learning_rate": 0.00033837190663448116, + "loss": 0.2468, + "step": 278910 + }, + { + "epoch": 11.55, + "grad_norm": 0.9140625, + "learning_rate": 0.0003383617615309627, + "loss": 0.2605, + "step": 278920 + }, + { + "epoch": 11.55, + "grad_norm": 0.6484375, + "learning_rate": 0.0003383516162611568, + "loss": 0.2087, + "step": 278930 + }, + { + "epoch": 11.55, + "grad_norm": 0.56640625, + "learning_rate": 0.00033834147082508295, + "loss": 0.227, + "step": 278940 + }, + { + "epoch": 11.55, + "grad_norm": 0.2314453125, + "learning_rate": 0.0003383313252227598, + "loss": 0.1463, + "step": 278950 + }, + { + "epoch": 11.55, + "grad_norm": 0.458984375, + "learning_rate": 0.0003383211794542067, + "loss": 0.1401, + "step": 278960 + }, + { + "epoch": 11.55, + "grad_norm": 1.4296875, + "learning_rate": 0.0003383110335194427, + "loss": 0.2366, + "step": 278970 + }, + { + "epoch": 11.56, + "grad_norm": 0.82421875, + "learning_rate": 0.0003383008874184868, + "loss": 0.2372, + "step": 278980 + }, + { + "epoch": 11.56, + "grad_norm": 0.87890625, + "learning_rate": 0.0003382907411513582, + "loss": 0.1629, + "step": 278990 + }, + { + "epoch": 11.56, + "grad_norm": 0.5859375, + "learning_rate": 0.00033828059471807605, + "loss": 0.2022, + "step": 279000 + }, + { + "epoch": 11.56, + "grad_norm": 1.25, + "learning_rate": 0.0003382704481186592, + "loss": 0.1795, + "step": 279010 + }, + { + "epoch": 11.56, + "grad_norm": 0.99609375, + "learning_rate": 0.000338260301353127, + "loss": 0.1869, + "step": 279020 + }, + { + "epoch": 11.56, + "grad_norm": 0.1689453125, + "learning_rate": 0.00033825015442149844, + "loss": 0.2316, + "step": 279030 + }, + { + "epoch": 11.56, + "grad_norm": 0.65625, + "learning_rate": 0.0003382400073237926, + "loss": 0.1635, + "step": 279040 + }, + { + "epoch": 11.56, + "grad_norm": 0.482421875, + "learning_rate": 0.00033822986006002857, + "loss": 0.1997, + "step": 279050 + }, + { + "epoch": 11.56, + "grad_norm": 1.15625, + "learning_rate": 0.00033821971263022546, + "loss": 0.2315, + "step": 279060 + }, + { + "epoch": 11.56, + "grad_norm": 0.62109375, + "learning_rate": 0.00033820956503440247, + "loss": 0.175, + "step": 279070 + }, + { + "epoch": 11.56, + "grad_norm": 0.451171875, + "learning_rate": 0.0003381994172725786, + "loss": 0.1933, + "step": 279080 + }, + { + "epoch": 11.56, + "grad_norm": 0.58984375, + "learning_rate": 0.0003381892693447728, + "loss": 0.1817, + "step": 279090 + }, + { + "epoch": 11.56, + "grad_norm": 1.9921875, + "learning_rate": 0.0003381791212510045, + "loss": 0.1572, + "step": 279100 + }, + { + "epoch": 11.56, + "grad_norm": 1.0546875, + "learning_rate": 0.0003381689729912925, + "loss": 0.233, + "step": 279110 + }, + { + "epoch": 11.56, + "grad_norm": 1.5546875, + "learning_rate": 0.000338158824565656, + "loss": 0.2316, + "step": 279120 + }, + { + "epoch": 11.56, + "grad_norm": 0.6875, + "learning_rate": 0.0003381486759741142, + "loss": 0.161, + "step": 279130 + }, + { + "epoch": 11.56, + "grad_norm": 0.66015625, + "learning_rate": 0.00033813852721668603, + "loss": 0.2333, + "step": 279140 + }, + { + "epoch": 11.56, + "grad_norm": 0.7421875, + "learning_rate": 0.00033812837829339074, + "loss": 0.1982, + "step": 279150 + }, + { + "epoch": 11.56, + "grad_norm": 0.4609375, + "learning_rate": 0.0003381182292042473, + "loss": 0.2228, + "step": 279160 + }, + { + "epoch": 11.56, + "grad_norm": 0.609375, + "learning_rate": 0.0003381080799492749, + "loss": 0.1782, + "step": 279170 + }, + { + "epoch": 11.56, + "grad_norm": 0.4140625, + "learning_rate": 0.0003380979305284926, + "loss": 0.1659, + "step": 279180 + }, + { + "epoch": 11.56, + "grad_norm": 0.76171875, + "learning_rate": 0.0003380877809419196, + "loss": 0.194, + "step": 279190 + }, + { + "epoch": 11.56, + "grad_norm": 0.7265625, + "learning_rate": 0.0003380776311895748, + "loss": 0.181, + "step": 279200 + }, + { + "epoch": 11.56, + "grad_norm": 1.5234375, + "learning_rate": 0.0003380674812714775, + "loss": 0.2051, + "step": 279210 + }, + { + "epoch": 11.57, + "grad_norm": 0.63671875, + "learning_rate": 0.00033805733118764654, + "loss": 0.2309, + "step": 279220 + }, + { + "epoch": 11.57, + "grad_norm": 0.83203125, + "learning_rate": 0.0003380471809381013, + "loss": 0.1553, + "step": 279230 + }, + { + "epoch": 11.57, + "grad_norm": 0.388671875, + "learning_rate": 0.0003380370305228608, + "loss": 0.2909, + "step": 279240 + }, + { + "epoch": 11.57, + "grad_norm": 0.82421875, + "learning_rate": 0.000338026879941944, + "loss": 0.2, + "step": 279250 + }, + { + "epoch": 11.57, + "grad_norm": 1.1015625, + "learning_rate": 0.0003380167291953703, + "loss": 0.137, + "step": 279260 + }, + { + "epoch": 11.57, + "grad_norm": 0.7109375, + "learning_rate": 0.0003380065782831585, + "loss": 0.1714, + "step": 279270 + }, + { + "epoch": 11.57, + "grad_norm": 0.66015625, + "learning_rate": 0.0003379964272053279, + "loss": 0.1838, + "step": 279280 + }, + { + "epoch": 11.57, + "grad_norm": 0.427734375, + "learning_rate": 0.00033798627596189746, + "loss": 0.182, + "step": 279290 + }, + { + "epoch": 11.57, + "grad_norm": 0.8203125, + "learning_rate": 0.00033797612455288627, + "loss": 0.1747, + "step": 279300 + }, + { + "epoch": 11.57, + "grad_norm": 0.83984375, + "learning_rate": 0.00033796597297831366, + "loss": 0.1974, + "step": 279310 + }, + { + "epoch": 11.57, + "grad_norm": 0.70703125, + "learning_rate": 0.00033795582123819857, + "loss": 0.2491, + "step": 279320 + }, + { + "epoch": 11.57, + "grad_norm": 0.373046875, + "learning_rate": 0.00033794566933256006, + "loss": 0.2088, + "step": 279330 + }, + { + "epoch": 11.57, + "grad_norm": 0.7109375, + "learning_rate": 0.0003379355172614174, + "loss": 0.2028, + "step": 279340 + }, + { + "epoch": 11.57, + "grad_norm": 0.875, + "learning_rate": 0.00033792536502478946, + "loss": 0.1699, + "step": 279350 + }, + { + "epoch": 11.57, + "grad_norm": 0.83984375, + "learning_rate": 0.00033791521262269554, + "loss": 0.1816, + "step": 279360 + }, + { + "epoch": 11.57, + "grad_norm": 1.078125, + "learning_rate": 0.00033790506005515464, + "loss": 0.1796, + "step": 279370 + }, + { + "epoch": 11.57, + "grad_norm": 0.291015625, + "learning_rate": 0.00033789490732218596, + "loss": 0.1381, + "step": 279380 + }, + { + "epoch": 11.57, + "grad_norm": 0.0, + "learning_rate": 0.00033788475442380863, + "loss": 0.2084, + "step": 279390 + }, + { + "epoch": 11.57, + "grad_norm": 0.65234375, + "learning_rate": 0.00033787460136004157, + "loss": 0.2067, + "step": 279400 + }, + { + "epoch": 11.57, + "grad_norm": 0.466796875, + "learning_rate": 0.000337864448130904, + "loss": 0.1885, + "step": 279410 + }, + { + "epoch": 11.57, + "grad_norm": 0.26171875, + "learning_rate": 0.0003378542947364151, + "loss": 0.1678, + "step": 279420 + }, + { + "epoch": 11.57, + "grad_norm": 0.71484375, + "learning_rate": 0.00033784414117659387, + "loss": 0.1999, + "step": 279430 + }, + { + "epoch": 11.57, + "grad_norm": 0.33203125, + "learning_rate": 0.0003378339874514594, + "loss": 0.2033, + "step": 279440 + }, + { + "epoch": 11.57, + "grad_norm": 0.62890625, + "learning_rate": 0.0003378238335610309, + "loss": 0.1884, + "step": 279450 + }, + { + "epoch": 11.58, + "grad_norm": 0.400390625, + "learning_rate": 0.00033781367950532745, + "loss": 0.1874, + "step": 279460 + }, + { + "epoch": 11.58, + "grad_norm": 0.8046875, + "learning_rate": 0.00033780352528436816, + "loss": 0.2398, + "step": 279470 + }, + { + "epoch": 11.58, + "grad_norm": 0.91796875, + "learning_rate": 0.000337793370898172, + "loss": 0.1879, + "step": 279480 + }, + { + "epoch": 11.58, + "grad_norm": 1.65625, + "learning_rate": 0.0003377832163467583, + "loss": 0.1926, + "step": 279490 + }, + { + "epoch": 11.58, + "grad_norm": 0.453125, + "learning_rate": 0.00033777306163014606, + "loss": 0.1825, + "step": 279500 + }, + { + "epoch": 11.58, + "grad_norm": 0.85546875, + "learning_rate": 0.0003377629067483544, + "loss": 0.2423, + "step": 279510 + }, + { + "epoch": 11.58, + "grad_norm": 0.64453125, + "learning_rate": 0.0003377527517014023, + "loss": 0.1759, + "step": 279520 + }, + { + "epoch": 11.58, + "grad_norm": 0.7109375, + "learning_rate": 0.00033774259648930915, + "loss": 0.1491, + "step": 279530 + }, + { + "epoch": 11.58, + "grad_norm": 0.236328125, + "learning_rate": 0.00033773244111209383, + "loss": 0.151, + "step": 279540 + }, + { + "epoch": 11.58, + "grad_norm": 0.8515625, + "learning_rate": 0.00033772228556977555, + "loss": 0.1705, + "step": 279550 + }, + { + "epoch": 11.58, + "grad_norm": 1.2421875, + "learning_rate": 0.00033771212986237345, + "loss": 0.1321, + "step": 279560 + }, + { + "epoch": 11.58, + "grad_norm": 0.56640625, + "learning_rate": 0.00033770197398990656, + "loss": 0.1699, + "step": 279570 + }, + { + "epoch": 11.58, + "grad_norm": 0.6328125, + "learning_rate": 0.00033769181795239396, + "loss": 0.1661, + "step": 279580 + }, + { + "epoch": 11.58, + "grad_norm": 0.890625, + "learning_rate": 0.00033768166174985494, + "loss": 0.2059, + "step": 279590 + }, + { + "epoch": 11.58, + "grad_norm": 3.8125, + "learning_rate": 0.00033767150538230843, + "loss": 0.1902, + "step": 279600 + }, + { + "epoch": 11.58, + "grad_norm": 0.6953125, + "learning_rate": 0.0003376613488497736, + "loss": 0.2314, + "step": 279610 + }, + { + "epoch": 11.58, + "grad_norm": 0.59765625, + "learning_rate": 0.00033765119215226964, + "loss": 0.1826, + "step": 279620 + }, + { + "epoch": 11.58, + "grad_norm": 0.78125, + "learning_rate": 0.00033764103528981563, + "loss": 0.2182, + "step": 279630 + }, + { + "epoch": 11.58, + "grad_norm": 0.8203125, + "learning_rate": 0.00033763087826243056, + "loss": 0.1797, + "step": 279640 + }, + { + "epoch": 11.58, + "grad_norm": 1.078125, + "learning_rate": 0.0003376207210701336, + "loss": 0.1744, + "step": 279650 + }, + { + "epoch": 11.58, + "grad_norm": 0.58984375, + "learning_rate": 0.000337610563712944, + "loss": 0.1884, + "step": 279660 + }, + { + "epoch": 11.58, + "grad_norm": 1.3203125, + "learning_rate": 0.00033760040619088086, + "loss": 0.178, + "step": 279670 + }, + { + "epoch": 11.58, + "grad_norm": 0.578125, + "learning_rate": 0.0003375902485039631, + "loss": 0.1759, + "step": 279680 + }, + { + "epoch": 11.58, + "grad_norm": 0.54296875, + "learning_rate": 0.00033758009065220994, + "loss": 0.2152, + "step": 279690 + }, + { + "epoch": 11.59, + "grad_norm": 0.5390625, + "learning_rate": 0.0003375699326356405, + "loss": 0.1783, + "step": 279700 + }, + { + "epoch": 11.59, + "grad_norm": 0.67578125, + "learning_rate": 0.000337559774454274, + "loss": 0.2367, + "step": 279710 + }, + { + "epoch": 11.59, + "grad_norm": 0.42578125, + "learning_rate": 0.0003375496161081294, + "loss": 0.162, + "step": 279720 + }, + { + "epoch": 11.59, + "grad_norm": 0.82421875, + "learning_rate": 0.0003375394575972259, + "loss": 0.1748, + "step": 279730 + }, + { + "epoch": 11.59, + "grad_norm": 0.9921875, + "learning_rate": 0.0003375292989215825, + "loss": 0.2114, + "step": 279740 + }, + { + "epoch": 11.59, + "grad_norm": 0.93359375, + "learning_rate": 0.00033751914008121853, + "loss": 0.2216, + "step": 279750 + }, + { + "epoch": 11.59, + "grad_norm": 1.1640625, + "learning_rate": 0.00033750898107615297, + "loss": 0.1739, + "step": 279760 + }, + { + "epoch": 11.59, + "grad_norm": 1.7421875, + "learning_rate": 0.0003374988219064049, + "loss": 0.1987, + "step": 279770 + }, + { + "epoch": 11.59, + "grad_norm": 0.65625, + "learning_rate": 0.0003374886625719935, + "loss": 0.1854, + "step": 279780 + }, + { + "epoch": 11.59, + "grad_norm": 0.8515625, + "learning_rate": 0.0003374785030729379, + "loss": 0.2219, + "step": 279790 + }, + { + "epoch": 11.59, + "grad_norm": 0.70703125, + "learning_rate": 0.00033746834340925725, + "loss": 0.1943, + "step": 279800 + }, + { + "epoch": 11.59, + "grad_norm": 1.1328125, + "learning_rate": 0.0003374581835809706, + "loss": 0.1748, + "step": 279810 + }, + { + "epoch": 11.59, + "grad_norm": 0.77734375, + "learning_rate": 0.00033744802358809704, + "loss": 0.2275, + "step": 279820 + }, + { + "epoch": 11.59, + "grad_norm": 1.0078125, + "learning_rate": 0.0003374378634306558, + "loss": 0.2147, + "step": 279830 + }, + { + "epoch": 11.59, + "grad_norm": 0.4296875, + "learning_rate": 0.00033742770310866597, + "loss": 0.1771, + "step": 279840 + }, + { + "epoch": 11.59, + "grad_norm": 1.0078125, + "learning_rate": 0.0003374175426221466, + "loss": 0.1481, + "step": 279850 + }, + { + "epoch": 11.59, + "grad_norm": 0.98828125, + "learning_rate": 0.00033740738197111686, + "loss": 0.1859, + "step": 279860 + }, + { + "epoch": 11.59, + "grad_norm": 0.78125, + "learning_rate": 0.00033739722115559587, + "loss": 0.1872, + "step": 279870 + }, + { + "epoch": 11.59, + "grad_norm": 0.89453125, + "learning_rate": 0.0003373870601756027, + "loss": 0.2032, + "step": 279880 + }, + { + "epoch": 11.59, + "grad_norm": 0.47265625, + "learning_rate": 0.0003373768990311566, + "loss": 0.1963, + "step": 279890 + }, + { + "epoch": 11.59, + "grad_norm": 0.6953125, + "learning_rate": 0.00033736673772227657, + "loss": 0.2068, + "step": 279900 + }, + { + "epoch": 11.59, + "grad_norm": 1.6796875, + "learning_rate": 0.0003373565762489818, + "loss": 0.2207, + "step": 279910 + }, + { + "epoch": 11.59, + "grad_norm": 0.75390625, + "learning_rate": 0.00033734641461129135, + "loss": 0.1692, + "step": 279920 + }, + { + "epoch": 11.59, + "grad_norm": 0.388671875, + "learning_rate": 0.00033733625280922444, + "loss": 0.1882, + "step": 279930 + }, + { + "epoch": 11.6, + "grad_norm": 0.58984375, + "learning_rate": 0.00033732609084280007, + "loss": 0.19, + "step": 279940 + }, + { + "epoch": 11.6, + "grad_norm": 0.9765625, + "learning_rate": 0.00033731592871203743, + "loss": 0.1698, + "step": 279950 + }, + { + "epoch": 11.6, + "grad_norm": 1.2734375, + "learning_rate": 0.00033730576641695565, + "loss": 0.1465, + "step": 279960 + }, + { + "epoch": 11.6, + "grad_norm": 0.470703125, + "learning_rate": 0.0003372956039575739, + "loss": 0.2087, + "step": 279970 + }, + { + "epoch": 11.6, + "grad_norm": 0.7890625, + "learning_rate": 0.00033728544133391114, + "loss": 0.1645, + "step": 279980 + }, + { + "epoch": 11.6, + "grad_norm": 1.5234375, + "learning_rate": 0.00033727527854598673, + "loss": 0.2264, + "step": 279990 + }, + { + "epoch": 11.6, + "grad_norm": 1.46875, + "learning_rate": 0.00033726511559381957, + "loss": 0.1963, + "step": 280000 + }, + { + "epoch": 11.6, + "grad_norm": 0.85546875, + "learning_rate": 0.000337254952477429, + "loss": 0.1715, + "step": 280010 + }, + { + "epoch": 11.6, + "grad_norm": 0.75390625, + "learning_rate": 0.00033724478919683386, + "loss": 0.2055, + "step": 280020 + }, + { + "epoch": 11.6, + "grad_norm": 0.9453125, + "learning_rate": 0.0003372346257520536, + "loss": 0.2146, + "step": 280030 + }, + { + "epoch": 11.6, + "grad_norm": 0.466796875, + "learning_rate": 0.0003372244621431072, + "loss": 0.2113, + "step": 280040 + }, + { + "epoch": 11.6, + "grad_norm": 0.88671875, + "learning_rate": 0.0003372142983700137, + "loss": 0.2247, + "step": 280050 + }, + { + "epoch": 11.6, + "grad_norm": 0.7734375, + "learning_rate": 0.0003372041344327923, + "loss": 0.2727, + "step": 280060 + }, + { + "epoch": 11.6, + "grad_norm": 0.71484375, + "learning_rate": 0.00033719397033146216, + "loss": 0.216, + "step": 280070 + }, + { + "epoch": 11.6, + "grad_norm": 1.2578125, + "learning_rate": 0.0003371838060660424, + "loss": 0.1768, + "step": 280080 + }, + { + "epoch": 11.6, + "grad_norm": 0.59375, + "learning_rate": 0.0003371736416365522, + "loss": 0.2387, + "step": 280090 + }, + { + "epoch": 11.6, + "grad_norm": 0.625, + "learning_rate": 0.0003371634770430105, + "loss": 0.2335, + "step": 280100 + }, + { + "epoch": 11.6, + "grad_norm": 1.1953125, + "learning_rate": 0.0003371533122854365, + "loss": 0.2057, + "step": 280110 + }, + { + "epoch": 11.6, + "grad_norm": 0.94140625, + "learning_rate": 0.0003371431473638496, + "loss": 0.201, + "step": 280120 + }, + { + "epoch": 11.6, + "grad_norm": 1.3046875, + "learning_rate": 0.0003371329822782685, + "loss": 0.2448, + "step": 280130 + }, + { + "epoch": 11.6, + "grad_norm": 1.0234375, + "learning_rate": 0.00033712281702871267, + "loss": 0.2007, + "step": 280140 + }, + { + "epoch": 11.6, + "grad_norm": 0.9140625, + "learning_rate": 0.00033711265161520107, + "loss": 0.1599, + "step": 280150 + }, + { + "epoch": 11.6, + "grad_norm": 0.8203125, + "learning_rate": 0.0003371024860377528, + "loss": 0.222, + "step": 280160 + }, + { + "epoch": 11.6, + "grad_norm": 1.3984375, + "learning_rate": 0.00033709232029638717, + "loss": 0.1924, + "step": 280170 + }, + { + "epoch": 11.61, + "grad_norm": 0.6171875, + "learning_rate": 0.0003370821543911231, + "loss": 0.1987, + "step": 280180 + }, + { + "epoch": 11.61, + "grad_norm": 0.7890625, + "learning_rate": 0.00033707198832197985, + "loss": 0.2259, + "step": 280190 + }, + { + "epoch": 11.61, + "grad_norm": 0.53125, + "learning_rate": 0.0003370618220889766, + "loss": 0.1898, + "step": 280200 + }, + { + "epoch": 11.61, + "grad_norm": 0.55078125, + "learning_rate": 0.00033705165569213226, + "loss": 0.2182, + "step": 280210 + }, + { + "epoch": 11.61, + "grad_norm": 0.703125, + "learning_rate": 0.0003370414891314662, + "loss": 0.1614, + "step": 280220 + }, + { + "epoch": 11.61, + "grad_norm": 0.81640625, + "learning_rate": 0.0003370313224069974, + "loss": 0.2021, + "step": 280230 + }, + { + "epoch": 11.61, + "grad_norm": 1.21875, + "learning_rate": 0.00033702115551874514, + "loss": 0.1774, + "step": 280240 + }, + { + "epoch": 11.61, + "grad_norm": 0.6171875, + "learning_rate": 0.0003370109884667284, + "loss": 0.1723, + "step": 280250 + }, + { + "epoch": 11.61, + "grad_norm": 1.125, + "learning_rate": 0.0003370008212509664, + "loss": 0.236, + "step": 280260 + }, + { + "epoch": 11.61, + "grad_norm": 0.91015625, + "learning_rate": 0.0003369906538714782, + "loss": 0.2016, + "step": 280270 + }, + { + "epoch": 11.61, + "grad_norm": 0.45703125, + "learning_rate": 0.00033698048632828305, + "loss": 0.1534, + "step": 280280 + }, + { + "epoch": 11.61, + "grad_norm": 1.15625, + "learning_rate": 0.00033697031862139996, + "loss": 0.2028, + "step": 280290 + }, + { + "epoch": 11.61, + "grad_norm": 0.5234375, + "learning_rate": 0.00033696015075084814, + "loss": 0.1935, + "step": 280300 + }, + { + "epoch": 11.61, + "grad_norm": 0.6171875, + "learning_rate": 0.0003369499827166467, + "loss": 0.1829, + "step": 280310 + }, + { + "epoch": 11.61, + "grad_norm": 0.94140625, + "learning_rate": 0.0003369398145188148, + "loss": 0.1915, + "step": 280320 + }, + { + "epoch": 11.61, + "grad_norm": 0.78515625, + "learning_rate": 0.00033692964615737156, + "loss": 0.2058, + "step": 280330 + }, + { + "epoch": 11.61, + "grad_norm": 0.9375, + "learning_rate": 0.0003369194776323361, + "loss": 0.2327, + "step": 280340 + }, + { + "epoch": 11.61, + "grad_norm": 0.1962890625, + "learning_rate": 0.00033690930894372757, + "loss": 0.2566, + "step": 280350 + }, + { + "epoch": 11.61, + "grad_norm": 1.171875, + "learning_rate": 0.0003368991400915651, + "loss": 0.2313, + "step": 280360 + }, + { + "epoch": 11.61, + "grad_norm": 0.68359375, + "learning_rate": 0.0003368889710758678, + "loss": 0.175, + "step": 280370 + }, + { + "epoch": 11.61, + "grad_norm": 0.5546875, + "learning_rate": 0.00033687880189665496, + "loss": 0.1847, + "step": 280380 + }, + { + "epoch": 11.61, + "grad_norm": 1.0546875, + "learning_rate": 0.0003368686325539455, + "loss": 0.213, + "step": 280390 + }, + { + "epoch": 11.61, + "grad_norm": 0.90234375, + "learning_rate": 0.0003368584630477587, + "loss": 0.1998, + "step": 280400 + }, + { + "epoch": 11.61, + "grad_norm": 0.40234375, + "learning_rate": 0.00033684829337811363, + "loss": 0.1919, + "step": 280410 + }, + { + "epoch": 11.61, + "grad_norm": 0.77734375, + "learning_rate": 0.0003368381235450294, + "loss": 0.1638, + "step": 280420 + }, + { + "epoch": 11.62, + "grad_norm": 0.609375, + "learning_rate": 0.00033682795354852523, + "loss": 0.1853, + "step": 280430 + }, + { + "epoch": 11.62, + "grad_norm": 0.671875, + "learning_rate": 0.00033681778338862036, + "loss": 0.1946, + "step": 280440 + }, + { + "epoch": 11.62, + "grad_norm": 0.57421875, + "learning_rate": 0.0003368076130653336, + "loss": 0.1919, + "step": 280450 + }, + { + "epoch": 11.62, + "grad_norm": 0.44921875, + "learning_rate": 0.0003367974425786844, + "loss": 0.2286, + "step": 280460 + }, + { + "epoch": 11.62, + "grad_norm": 0.90625, + "learning_rate": 0.00033678727192869176, + "loss": 0.1623, + "step": 280470 + }, + { + "epoch": 11.62, + "grad_norm": 0.54296875, + "learning_rate": 0.00033677710111537477, + "loss": 0.1721, + "step": 280480 + }, + { + "epoch": 11.62, + "grad_norm": 0.671875, + "learning_rate": 0.0003367669301387528, + "loss": 0.1789, + "step": 280490 + }, + { + "epoch": 11.62, + "grad_norm": 1.2109375, + "learning_rate": 0.00033675675899884474, + "loss": 0.2026, + "step": 280500 + }, + { + "epoch": 11.62, + "grad_norm": 1.96875, + "learning_rate": 0.00033674658769566987, + "loss": 0.209, + "step": 280510 + }, + { + "epoch": 11.62, + "grad_norm": 0.79296875, + "learning_rate": 0.00033673641622924727, + "loss": 0.1916, + "step": 280520 + }, + { + "epoch": 11.62, + "grad_norm": 0.97265625, + "learning_rate": 0.00033672624459959603, + "loss": 0.1769, + "step": 280530 + }, + { + "epoch": 11.62, + "grad_norm": 0.31640625, + "learning_rate": 0.00033671607280673545, + "loss": 0.1657, + "step": 280540 + }, + { + "epoch": 11.62, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003367059008506846, + "loss": 0.1838, + "step": 280550 + }, + { + "epoch": 11.62, + "grad_norm": 0.6015625, + "learning_rate": 0.0003366957287314625, + "loss": 0.1963, + "step": 280560 + }, + { + "epoch": 11.62, + "grad_norm": 0.546875, + "learning_rate": 0.00033668555644908855, + "loss": 0.1467, + "step": 280570 + }, + { + "epoch": 11.62, + "grad_norm": 1.2890625, + "learning_rate": 0.00033667538400358163, + "loss": 0.2029, + "step": 280580 + }, + { + "epoch": 11.62, + "grad_norm": 1.015625, + "learning_rate": 0.00033666521139496104, + "loss": 0.1818, + "step": 280590 + }, + { + "epoch": 11.62, + "grad_norm": 0.62890625, + "learning_rate": 0.0003366550386232459, + "loss": 0.1983, + "step": 280600 + }, + { + "epoch": 11.62, + "grad_norm": 1.0078125, + "learning_rate": 0.00033664486568845526, + "loss": 0.1516, + "step": 280610 + }, + { + "epoch": 11.62, + "grad_norm": 0.64453125, + "learning_rate": 0.0003366346925906084, + "loss": 0.18, + "step": 280620 + }, + { + "epoch": 11.62, + "grad_norm": 0.0, + "learning_rate": 0.00033662451932972446, + "loss": 0.1742, + "step": 280630 + }, + { + "epoch": 11.62, + "grad_norm": 0.455078125, + "learning_rate": 0.00033661434590582243, + "loss": 0.1952, + "step": 280640 + }, + { + "epoch": 11.62, + "grad_norm": 0.91796875, + "learning_rate": 0.0003366041723189216, + "loss": 0.2029, + "step": 280650 + }, + { + "epoch": 11.62, + "grad_norm": 1.0859375, + "learning_rate": 0.00033659399856904103, + "loss": 0.1872, + "step": 280660 + }, + { + "epoch": 11.63, + "grad_norm": 0.69921875, + "learning_rate": 0.0003365838246562, + "loss": 0.1765, + "step": 280670 + }, + { + "epoch": 11.63, + "grad_norm": 0.60546875, + "learning_rate": 0.00033657365058041745, + "loss": 0.2032, + "step": 280680 + }, + { + "epoch": 11.63, + "grad_norm": 1.671875, + "learning_rate": 0.0003365634763417127, + "loss": 0.2249, + "step": 280690 + }, + { + "epoch": 11.63, + "grad_norm": 0.197265625, + "learning_rate": 0.0003365533019401048, + "loss": 0.2251, + "step": 280700 + }, + { + "epoch": 11.63, + "grad_norm": 0.984375, + "learning_rate": 0.000336543127375613, + "loss": 0.198, + "step": 280710 + }, + { + "epoch": 11.63, + "grad_norm": 1.734375, + "learning_rate": 0.0003365329526482563, + "loss": 0.2343, + "step": 280720 + }, + { + "epoch": 11.63, + "grad_norm": 0.35546875, + "learning_rate": 0.000336522777758054, + "loss": 0.1892, + "step": 280730 + }, + { + "epoch": 11.63, + "grad_norm": 0.9765625, + "learning_rate": 0.00033651260270502507, + "loss": 0.2118, + "step": 280740 + }, + { + "epoch": 11.63, + "grad_norm": 0.58203125, + "learning_rate": 0.00033650242748918893, + "loss": 0.1785, + "step": 280750 + }, + { + "epoch": 11.63, + "grad_norm": 0.63671875, + "learning_rate": 0.0003364922521105644, + "loss": 0.2545, + "step": 280760 + }, + { + "epoch": 11.63, + "grad_norm": 0.96484375, + "learning_rate": 0.00033648207656917085, + "loss": 0.1753, + "step": 280770 + }, + { + "epoch": 11.63, + "grad_norm": 1.3515625, + "learning_rate": 0.00033647190086502746, + "loss": 0.1668, + "step": 280780 + }, + { + "epoch": 11.63, + "grad_norm": 0.40625, + "learning_rate": 0.0003364617249981531, + "loss": 0.2475, + "step": 280790 + }, + { + "epoch": 11.63, + "grad_norm": 0.4765625, + "learning_rate": 0.00033645154896856725, + "loss": 0.2225, + "step": 280800 + }, + { + "epoch": 11.63, + "grad_norm": 0.78515625, + "learning_rate": 0.00033644137277628894, + "loss": 0.1979, + "step": 280810 + }, + { + "epoch": 11.63, + "grad_norm": 0.482421875, + "learning_rate": 0.0003364311964213372, + "loss": 0.1668, + "step": 280820 + }, + { + "epoch": 11.63, + "grad_norm": 0.64453125, + "learning_rate": 0.0003364210199037314, + "loss": 0.1508, + "step": 280830 + }, + { + "epoch": 11.63, + "grad_norm": 0.6875, + "learning_rate": 0.0003364108432234905, + "loss": 0.2472, + "step": 280840 + }, + { + "epoch": 11.63, + "grad_norm": 1.6171875, + "learning_rate": 0.0003364006663806337, + "loss": 0.1868, + "step": 280850 + }, + { + "epoch": 11.63, + "grad_norm": 0.859375, + "learning_rate": 0.00033639048937518024, + "loss": 0.2315, + "step": 280860 + }, + { + "epoch": 11.63, + "grad_norm": 0.55859375, + "learning_rate": 0.00033638031220714913, + "loss": 0.2201, + "step": 280870 + }, + { + "epoch": 11.63, + "grad_norm": 1.1953125, + "learning_rate": 0.00033637013487655963, + "loss": 0.1868, + "step": 280880 + }, + { + "epoch": 11.63, + "grad_norm": 0.6328125, + "learning_rate": 0.0003363599573834309, + "loss": 0.1785, + "step": 280890 + }, + { + "epoch": 11.63, + "grad_norm": 0.8359375, + "learning_rate": 0.000336349779727782, + "loss": 0.2064, + "step": 280900 + }, + { + "epoch": 11.64, + "grad_norm": 1.109375, + "learning_rate": 0.0003363396019096323, + "loss": 0.1912, + "step": 280910 + }, + { + "epoch": 11.64, + "grad_norm": 0.73828125, + "learning_rate": 0.0003363294239290006, + "loss": 0.2044, + "step": 280920 + }, + { + "epoch": 11.64, + "grad_norm": 0.2080078125, + "learning_rate": 0.00033631924578590635, + "loss": 0.1914, + "step": 280930 + }, + { + "epoch": 11.64, + "grad_norm": 0.734375, + "learning_rate": 0.00033630906748036854, + "loss": 0.2136, + "step": 280940 + }, + { + "epoch": 11.64, + "grad_norm": 0.96484375, + "learning_rate": 0.0003362988890124064, + "loss": 0.2077, + "step": 280950 + }, + { + "epoch": 11.64, + "grad_norm": 0.8828125, + "learning_rate": 0.0003362887103820391, + "loss": 0.1597, + "step": 280960 + }, + { + "epoch": 11.64, + "grad_norm": 0.65234375, + "learning_rate": 0.00033627853158928574, + "loss": 0.2, + "step": 280970 + }, + { + "epoch": 11.64, + "grad_norm": 0.515625, + "learning_rate": 0.0003362683526341655, + "loss": 0.1703, + "step": 280980 + }, + { + "epoch": 11.64, + "grad_norm": 0.7265625, + "learning_rate": 0.00033625817351669757, + "loss": 0.1909, + "step": 280990 + }, + { + "epoch": 11.64, + "grad_norm": 0.4765625, + "learning_rate": 0.00033624799423690105, + "loss": 0.198, + "step": 281000 + }, + { + "epoch": 11.64, + "grad_norm": 0.65625, + "learning_rate": 0.0003362378147947951, + "loss": 0.2026, + "step": 281010 + }, + { + "epoch": 11.64, + "grad_norm": 1.1875, + "learning_rate": 0.00033622763519039895, + "loss": 0.2191, + "step": 281020 + }, + { + "epoch": 11.64, + "grad_norm": 0.83203125, + "learning_rate": 0.00033621745542373164, + "loss": 0.2296, + "step": 281030 + }, + { + "epoch": 11.64, + "grad_norm": 0.640625, + "learning_rate": 0.00033620727549481244, + "loss": 0.2188, + "step": 281040 + }, + { + "epoch": 11.64, + "grad_norm": 0.89453125, + "learning_rate": 0.0003361970954036604, + "loss": 0.19, + "step": 281050 + }, + { + "epoch": 11.64, + "grad_norm": 0.43359375, + "learning_rate": 0.0003361869151502948, + "loss": 0.1849, + "step": 281060 + }, + { + "epoch": 11.64, + "grad_norm": 0.3671875, + "learning_rate": 0.0003361767347347347, + "loss": 0.2338, + "step": 281070 + }, + { + "epoch": 11.64, + "grad_norm": 0.9375, + "learning_rate": 0.00033616655415699925, + "loss": 0.1689, + "step": 281080 + }, + { + "epoch": 11.64, + "grad_norm": 0.6171875, + "learning_rate": 0.00033615637341710767, + "loss": 0.1533, + "step": 281090 + }, + { + "epoch": 11.64, + "grad_norm": 0.46875, + "learning_rate": 0.0003361461925150792, + "loss": 0.2027, + "step": 281100 + }, + { + "epoch": 11.64, + "grad_norm": 1.3359375, + "learning_rate": 0.0003361360114509328, + "loss": 0.1974, + "step": 281110 + }, + { + "epoch": 11.64, + "grad_norm": 0.515625, + "learning_rate": 0.0003361258302246877, + "loss": 0.1903, + "step": 281120 + }, + { + "epoch": 11.64, + "grad_norm": 0.87109375, + "learning_rate": 0.00033611564883636317, + "loss": 0.2161, + "step": 281130 + }, + { + "epoch": 11.64, + "grad_norm": 0.87109375, + "learning_rate": 0.00033610546728597826, + "loss": 0.233, + "step": 281140 + }, + { + "epoch": 11.65, + "grad_norm": 1.7578125, + "learning_rate": 0.0003360952855735522, + "loss": 0.2034, + "step": 281150 + }, + { + "epoch": 11.65, + "grad_norm": 0.609375, + "learning_rate": 0.00033608510369910406, + "loss": 0.205, + "step": 281160 + }, + { + "epoch": 11.65, + "grad_norm": 0.51171875, + "learning_rate": 0.000336074921662653, + "loss": 0.1525, + "step": 281170 + }, + { + "epoch": 11.65, + "grad_norm": 0.53125, + "learning_rate": 0.0003360647394642183, + "loss": 0.1922, + "step": 281180 + }, + { + "epoch": 11.65, + "grad_norm": 1.59375, + "learning_rate": 0.00033605455710381904, + "loss": 0.1633, + "step": 281190 + }, + { + "epoch": 11.65, + "grad_norm": 0.9140625, + "learning_rate": 0.0003360443745814744, + "loss": 0.1844, + "step": 281200 + }, + { + "epoch": 11.65, + "grad_norm": 0.5703125, + "learning_rate": 0.0003360341918972035, + "loss": 0.1695, + "step": 281210 + }, + { + "epoch": 11.65, + "grad_norm": 0.98828125, + "learning_rate": 0.0003360240090510256, + "loss": 0.1409, + "step": 281220 + }, + { + "epoch": 11.65, + "grad_norm": 0.52734375, + "learning_rate": 0.0003360138260429598, + "loss": 0.1729, + "step": 281230 + }, + { + "epoch": 11.65, + "grad_norm": 0.828125, + "learning_rate": 0.00033600364287302524, + "loss": 0.2033, + "step": 281240 + }, + { + "epoch": 11.65, + "grad_norm": 0.64453125, + "learning_rate": 0.00033599345954124114, + "loss": 0.1777, + "step": 281250 + }, + { + "epoch": 11.65, + "grad_norm": 0.369140625, + "learning_rate": 0.00033598327604762664, + "loss": 0.2022, + "step": 281260 + }, + { + "epoch": 11.65, + "grad_norm": 2.109375, + "learning_rate": 0.0003359730923922009, + "loss": 0.1348, + "step": 281270 + }, + { + "epoch": 11.65, + "grad_norm": 0.7421875, + "learning_rate": 0.0003359629085749831, + "loss": 0.2043, + "step": 281280 + }, + { + "epoch": 11.65, + "grad_norm": 0.216796875, + "learning_rate": 0.0003359527245959923, + "loss": 0.1884, + "step": 281290 + }, + { + "epoch": 11.65, + "grad_norm": 1.6328125, + "learning_rate": 0.0003359425404552478, + "loss": 0.1656, + "step": 281300 + }, + { + "epoch": 11.65, + "grad_norm": 0.953125, + "learning_rate": 0.00033593235615276873, + "loss": 0.2103, + "step": 281310 + }, + { + "epoch": 11.65, + "grad_norm": 0.625, + "learning_rate": 0.0003359221716885743, + "loss": 0.1848, + "step": 281320 + }, + { + "epoch": 11.65, + "grad_norm": 0.248046875, + "learning_rate": 0.0003359119870626835, + "loss": 0.1987, + "step": 281330 + }, + { + "epoch": 11.65, + "grad_norm": 0.8515625, + "learning_rate": 0.0003359018022751157, + "loss": 0.1699, + "step": 281340 + }, + { + "epoch": 11.65, + "grad_norm": 0.5, + "learning_rate": 0.00033589161732589, + "loss": 0.1696, + "step": 281350 + }, + { + "epoch": 11.65, + "grad_norm": 0.1728515625, + "learning_rate": 0.00033588143221502555, + "loss": 0.1984, + "step": 281360 + }, + { + "epoch": 11.65, + "grad_norm": 0.8515625, + "learning_rate": 0.00033587124694254146, + "loss": 0.1829, + "step": 281370 + }, + { + "epoch": 11.65, + "grad_norm": 0.69921875, + "learning_rate": 0.000335861061508457, + "loss": 0.23, + "step": 281380 + }, + { + "epoch": 11.66, + "grad_norm": 0.78515625, + "learning_rate": 0.00033585087591279125, + "loss": 0.175, + "step": 281390 + }, + { + "epoch": 11.66, + "grad_norm": 0.6015625, + "learning_rate": 0.00033584069015556346, + "loss": 0.1945, + "step": 281400 + }, + { + "epoch": 11.66, + "grad_norm": 1.1015625, + "learning_rate": 0.00033583050423679274, + "loss": 0.1853, + "step": 281410 + }, + { + "epoch": 11.66, + "grad_norm": 1.0859375, + "learning_rate": 0.0003358203181564983, + "loss": 0.1912, + "step": 281420 + }, + { + "epoch": 11.66, + "grad_norm": 1.1796875, + "learning_rate": 0.00033581013191469935, + "loss": 0.1774, + "step": 281430 + }, + { + "epoch": 11.66, + "grad_norm": 1.0390625, + "learning_rate": 0.0003357999455114148, + "loss": 0.1763, + "step": 281440 + }, + { + "epoch": 11.66, + "grad_norm": 1.015625, + "learning_rate": 0.0003357897589466642, + "loss": 0.1853, + "step": 281450 + }, + { + "epoch": 11.66, + "grad_norm": 1.546875, + "learning_rate": 0.0003357795722204665, + "loss": 0.2187, + "step": 281460 + }, + { + "epoch": 11.66, + "grad_norm": 0.609375, + "learning_rate": 0.00033576938533284086, + "loss": 0.2067, + "step": 281470 + }, + { + "epoch": 11.66, + "grad_norm": 1.3984375, + "learning_rate": 0.00033575919828380656, + "loss": 0.2503, + "step": 281480 + }, + { + "epoch": 11.66, + "grad_norm": 1.109375, + "learning_rate": 0.0003357490110733827, + "loss": 0.2234, + "step": 281490 + }, + { + "epoch": 11.66, + "grad_norm": 0.490234375, + "learning_rate": 0.0003357388237015884, + "loss": 0.189, + "step": 281500 + }, + { + "epoch": 11.66, + "grad_norm": 1.1640625, + "learning_rate": 0.000335728636168443, + "loss": 0.2025, + "step": 281510 + }, + { + "epoch": 11.66, + "grad_norm": 1.109375, + "learning_rate": 0.0003357184484739654, + "loss": 0.1389, + "step": 281520 + }, + { + "epoch": 11.66, + "grad_norm": 0.69921875, + "learning_rate": 0.0003357082606181751, + "loss": 0.2044, + "step": 281530 + }, + { + "epoch": 11.66, + "grad_norm": 0.2490234375, + "learning_rate": 0.0003356980726010911, + "loss": 0.2034, + "step": 281540 + }, + { + "epoch": 11.66, + "grad_norm": 0.6875, + "learning_rate": 0.00033568788442273246, + "loss": 0.1675, + "step": 281550 + }, + { + "epoch": 11.66, + "grad_norm": 0.74609375, + "learning_rate": 0.0003356776960831186, + "loss": 0.2142, + "step": 281560 + }, + { + "epoch": 11.66, + "grad_norm": 0.482421875, + "learning_rate": 0.0003356675075822685, + "loss": 0.2025, + "step": 281570 + }, + { + "epoch": 11.66, + "grad_norm": 0.83984375, + "learning_rate": 0.0003356573189202014, + "loss": 0.2101, + "step": 281580 + }, + { + "epoch": 11.66, + "grad_norm": 0.484375, + "learning_rate": 0.00033564713009693654, + "loss": 0.2095, + "step": 281590 + }, + { + "epoch": 11.66, + "grad_norm": 0.796875, + "learning_rate": 0.00033563694111249295, + "loss": 0.1716, + "step": 281600 + }, + { + "epoch": 11.66, + "grad_norm": 1.40625, + "learning_rate": 0.00033562675196688996, + "loss": 0.1682, + "step": 281610 + }, + { + "epoch": 11.66, + "grad_norm": 0.9921875, + "learning_rate": 0.0003356165626601466, + "loss": 0.2153, + "step": 281620 + }, + { + "epoch": 11.67, + "grad_norm": 0.8515625, + "learning_rate": 0.00033560637319228216, + "loss": 0.207, + "step": 281630 + }, + { + "epoch": 11.67, + "grad_norm": 0.828125, + "learning_rate": 0.00033559618356331577, + "loss": 0.2035, + "step": 281640 + }, + { + "epoch": 11.67, + "grad_norm": 1.1484375, + "learning_rate": 0.0003355859937732665, + "loss": 0.2432, + "step": 281650 + }, + { + "epoch": 11.67, + "grad_norm": 0.9765625, + "learning_rate": 0.0003355758038221538, + "loss": 0.2348, + "step": 281660 + }, + { + "epoch": 11.67, + "grad_norm": 0.62109375, + "learning_rate": 0.0003355656137099966, + "loss": 0.1558, + "step": 281670 + }, + { + "epoch": 11.67, + "grad_norm": 1.90625, + "learning_rate": 0.0003355554234368141, + "loss": 0.2055, + "step": 281680 + }, + { + "epoch": 11.67, + "grad_norm": 0.384765625, + "learning_rate": 0.00033554523300262563, + "loss": 0.1837, + "step": 281690 + }, + { + "epoch": 11.67, + "grad_norm": 0.93359375, + "learning_rate": 0.00033553504240745025, + "loss": 0.2156, + "step": 281700 + }, + { + "epoch": 11.67, + "grad_norm": 0.90234375, + "learning_rate": 0.00033552485165130706, + "loss": 0.2243, + "step": 281710 + }, + { + "epoch": 11.67, + "grad_norm": 0.515625, + "learning_rate": 0.00033551466073421547, + "loss": 0.1928, + "step": 281720 + }, + { + "epoch": 11.67, + "grad_norm": 1.0546875, + "learning_rate": 0.0003355044696561944, + "loss": 0.2096, + "step": 281730 + }, + { + "epoch": 11.67, + "grad_norm": 0.455078125, + "learning_rate": 0.00033549427841726324, + "loss": 0.2694, + "step": 281740 + }, + { + "epoch": 11.67, + "grad_norm": 0.765625, + "learning_rate": 0.00033548408701744104, + "loss": 0.2472, + "step": 281750 + }, + { + "epoch": 11.67, + "grad_norm": 1.2421875, + "learning_rate": 0.00033547389545674703, + "loss": 0.1854, + "step": 281760 + }, + { + "epoch": 11.67, + "grad_norm": 0.3203125, + "learning_rate": 0.0003354637037352004, + "loss": 0.1645, + "step": 281770 + }, + { + "epoch": 11.67, + "grad_norm": 0.8671875, + "learning_rate": 0.00033545351185282037, + "loss": 0.1593, + "step": 281780 + }, + { + "epoch": 11.67, + "grad_norm": 1.5234375, + "learning_rate": 0.00033544331980962593, + "loss": 0.2094, + "step": 281790 + }, + { + "epoch": 11.67, + "grad_norm": 0.373046875, + "learning_rate": 0.0003354331276056365, + "loss": 0.1405, + "step": 281800 + }, + { + "epoch": 11.67, + "grad_norm": 1.6015625, + "learning_rate": 0.00033542293524087095, + "loss": 0.2497, + "step": 281810 + }, + { + "epoch": 11.67, + "grad_norm": 0.80078125, + "learning_rate": 0.0003354127427153489, + "loss": 0.1635, + "step": 281820 + }, + { + "epoch": 11.67, + "grad_norm": 0.6875, + "learning_rate": 0.00033540255002908917, + "loss": 0.1531, + "step": 281830 + }, + { + "epoch": 11.67, + "grad_norm": 1.484375, + "learning_rate": 0.0003353923571821111, + "loss": 0.2178, + "step": 281840 + }, + { + "epoch": 11.67, + "grad_norm": 1.3671875, + "learning_rate": 0.0003353821641744338, + "loss": 0.2204, + "step": 281850 + }, + { + "epoch": 11.67, + "grad_norm": 0.93359375, + "learning_rate": 0.00033537197100607654, + "loss": 0.2161, + "step": 281860 + }, + { + "epoch": 11.68, + "grad_norm": 0.859375, + "learning_rate": 0.0003353617776770584, + "loss": 0.1836, + "step": 281870 + }, + { + "epoch": 11.68, + "grad_norm": 1.0390625, + "learning_rate": 0.0003353515841873987, + "loss": 0.212, + "step": 281880 + }, + { + "epoch": 11.68, + "grad_norm": 1.0390625, + "learning_rate": 0.0003353413905371164, + "loss": 0.2248, + "step": 281890 + }, + { + "epoch": 11.68, + "grad_norm": 0.625, + "learning_rate": 0.0003353311967262309, + "loss": 0.2106, + "step": 281900 + }, + { + "epoch": 11.68, + "grad_norm": 0.87890625, + "learning_rate": 0.0003353210027547613, + "loss": 0.1804, + "step": 281910 + }, + { + "epoch": 11.68, + "grad_norm": 0.5859375, + "learning_rate": 0.0003353108086227268, + "loss": 0.2319, + "step": 281920 + }, + { + "epoch": 11.68, + "grad_norm": 0.6328125, + "learning_rate": 0.0003353006143301465, + "loss": 0.2317, + "step": 281930 + }, + { + "epoch": 11.68, + "grad_norm": 2.09375, + "learning_rate": 0.00033529041987703974, + "loss": 0.1824, + "step": 281940 + }, + { + "epoch": 11.68, + "grad_norm": 0.142578125, + "learning_rate": 0.00033528022526342556, + "loss": 0.196, + "step": 281950 + }, + { + "epoch": 11.68, + "grad_norm": 0.609375, + "learning_rate": 0.00033527003048932323, + "loss": 0.206, + "step": 281960 + }, + { + "epoch": 11.68, + "grad_norm": 0.9609375, + "learning_rate": 0.0003352598355547519, + "loss": 0.252, + "step": 281970 + }, + { + "epoch": 11.68, + "grad_norm": 0.54296875, + "learning_rate": 0.00033524964045973085, + "loss": 0.1757, + "step": 281980 + }, + { + "epoch": 11.68, + "grad_norm": 0.4375, + "learning_rate": 0.0003352394452042791, + "loss": 0.1702, + "step": 281990 + }, + { + "epoch": 11.68, + "grad_norm": 0.89453125, + "learning_rate": 0.00033522924978841585, + "loss": 0.1761, + "step": 282000 + }, + { + "epoch": 11.68, + "grad_norm": 1.703125, + "learning_rate": 0.00033521905421216047, + "loss": 0.2153, + "step": 282010 + }, + { + "epoch": 11.68, + "grad_norm": 0.76171875, + "learning_rate": 0.000335208858475532, + "loss": 0.1984, + "step": 282020 + }, + { + "epoch": 11.68, + "grad_norm": 1.0078125, + "learning_rate": 0.00033519866257854963, + "loss": 0.2265, + "step": 282030 + }, + { + "epoch": 11.68, + "grad_norm": 1.7265625, + "learning_rate": 0.0003351884665212327, + "loss": 0.1916, + "step": 282040 + }, + { + "epoch": 11.68, + "grad_norm": 0.23828125, + "learning_rate": 0.0003351782703036001, + "loss": 0.1766, + "step": 282050 + }, + { + "epoch": 11.68, + "grad_norm": 0.41796875, + "learning_rate": 0.0003351680739256713, + "loss": 0.2087, + "step": 282060 + }, + { + "epoch": 11.68, + "grad_norm": 1.359375, + "learning_rate": 0.00033515787738746535, + "loss": 0.217, + "step": 282070 + }, + { + "epoch": 11.68, + "grad_norm": 0.578125, + "learning_rate": 0.00033514768068900147, + "loss": 0.2046, + "step": 282080 + }, + { + "epoch": 11.68, + "grad_norm": 0.8984375, + "learning_rate": 0.00033513748383029885, + "loss": 0.2019, + "step": 282090 + }, + { + "epoch": 11.68, + "grad_norm": 0.69921875, + "learning_rate": 0.00033512728681137674, + "loss": 0.2033, + "step": 282100 + }, + { + "epoch": 11.68, + "grad_norm": 1.3125, + "learning_rate": 0.00033511708963225416, + "loss": 0.1482, + "step": 282110 + }, + { + "epoch": 11.69, + "grad_norm": 0.58984375, + "learning_rate": 0.0003351068922929506, + "loss": 0.219, + "step": 282120 + }, + { + "epoch": 11.69, + "grad_norm": 0.71484375, + "learning_rate": 0.0003350966947934848, + "loss": 0.2583, + "step": 282130 + }, + { + "epoch": 11.69, + "grad_norm": 0.8046875, + "learning_rate": 0.00033508649713387634, + "loss": 0.1768, + "step": 282140 + }, + { + "epoch": 11.69, + "grad_norm": 0.57421875, + "learning_rate": 0.0003350762993141443, + "loss": 0.2079, + "step": 282150 + }, + { + "epoch": 11.69, + "grad_norm": 1.453125, + "learning_rate": 0.00033506610133430783, + "loss": 0.1946, + "step": 282160 + }, + { + "epoch": 11.69, + "grad_norm": 0.77734375, + "learning_rate": 0.00033505590319438615, + "loss": 0.1772, + "step": 282170 + }, + { + "epoch": 11.69, + "grad_norm": 0.67578125, + "learning_rate": 0.00033504570489439845, + "loss": 0.1932, + "step": 282180 + }, + { + "epoch": 11.69, + "grad_norm": 0.390625, + "learning_rate": 0.00033503550643436387, + "loss": 0.1792, + "step": 282190 + }, + { + "epoch": 11.69, + "grad_norm": 0.5625, + "learning_rate": 0.0003350253078143017, + "loss": 0.2067, + "step": 282200 + }, + { + "epoch": 11.69, + "grad_norm": 0.61328125, + "learning_rate": 0.00033501510903423106, + "loss": 0.214, + "step": 282210 + }, + { + "epoch": 11.69, + "grad_norm": 0.73828125, + "learning_rate": 0.00033500491009417116, + "loss": 0.1863, + "step": 282220 + }, + { + "epoch": 11.69, + "grad_norm": 0.9140625, + "learning_rate": 0.00033499471099414125, + "loss": 0.1626, + "step": 282230 + }, + { + "epoch": 11.69, + "grad_norm": 0.921875, + "learning_rate": 0.0003349845117341604, + "loss": 0.1439, + "step": 282240 + }, + { + "epoch": 11.69, + "grad_norm": 1.1640625, + "learning_rate": 0.0003349743123142479, + "loss": 0.217, + "step": 282250 + }, + { + "epoch": 11.69, + "grad_norm": 1.8828125, + "learning_rate": 0.0003349641127344229, + "loss": 0.2041, + "step": 282260 + }, + { + "epoch": 11.69, + "grad_norm": 0.3203125, + "learning_rate": 0.00033495391299470466, + "loss": 0.1842, + "step": 282270 + }, + { + "epoch": 11.69, + "grad_norm": 0.78125, + "learning_rate": 0.00033494371309511235, + "loss": 0.1587, + "step": 282280 + }, + { + "epoch": 11.69, + "grad_norm": 0.72265625, + "learning_rate": 0.0003349335130356651, + "loss": 0.2379, + "step": 282290 + }, + { + "epoch": 11.69, + "grad_norm": 0.5390625, + "learning_rate": 0.0003349233128163822, + "loss": 0.1964, + "step": 282300 + }, + { + "epoch": 11.69, + "grad_norm": 0.435546875, + "learning_rate": 0.00033491311243728273, + "loss": 0.1711, + "step": 282310 + }, + { + "epoch": 11.69, + "grad_norm": 1.09375, + "learning_rate": 0.00033490291189838594, + "loss": 0.2347, + "step": 282320 + }, + { + "epoch": 11.69, + "grad_norm": 0.46484375, + "learning_rate": 0.00033489271119971115, + "loss": 0.1942, + "step": 282330 + }, + { + "epoch": 11.69, + "grad_norm": 1.0546875, + "learning_rate": 0.0003348825103412773, + "loss": 0.2024, + "step": 282340 + }, + { + "epoch": 11.69, + "grad_norm": 2.140625, + "learning_rate": 0.00033487230932310386, + "loss": 0.2336, + "step": 282350 + }, + { + "epoch": 11.7, + "grad_norm": 1.578125, + "learning_rate": 0.00033486210814520986, + "loss": 0.233, + "step": 282360 + }, + { + "epoch": 11.7, + "grad_norm": 0.5234375, + "learning_rate": 0.00033485190680761445, + "loss": 0.189, + "step": 282370 + }, + { + "epoch": 11.7, + "grad_norm": 0.8046875, + "learning_rate": 0.0003348417053103371, + "loss": 0.1898, + "step": 282380 + }, + { + "epoch": 11.7, + "grad_norm": 0.51953125, + "learning_rate": 0.00033483150365339666, + "loss": 0.1869, + "step": 282390 + }, + { + "epoch": 11.7, + "grad_norm": 0.15625, + "learning_rate": 0.00033482130183681255, + "loss": 0.2063, + "step": 282400 + }, + { + "epoch": 11.7, + "grad_norm": 1.5078125, + "learning_rate": 0.00033481109986060385, + "loss": 0.2179, + "step": 282410 + }, + { + "epoch": 11.7, + "grad_norm": 0.419921875, + "learning_rate": 0.0003348008977247899, + "loss": 0.2132, + "step": 282420 + }, + { + "epoch": 11.7, + "grad_norm": 0.7734375, + "learning_rate": 0.00033479069542938976, + "loss": 0.2219, + "step": 282430 + }, + { + "epoch": 11.7, + "grad_norm": 0.84765625, + "learning_rate": 0.00033478049297442275, + "loss": 0.2157, + "step": 282440 + }, + { + "epoch": 11.7, + "grad_norm": 2.984375, + "learning_rate": 0.00033477029035990796, + "loss": 0.2033, + "step": 282450 + }, + { + "epoch": 11.7, + "grad_norm": 0.93359375, + "learning_rate": 0.0003347600875858647, + "loss": 0.2109, + "step": 282460 + }, + { + "epoch": 11.7, + "grad_norm": 0.7734375, + "learning_rate": 0.00033474988465231203, + "loss": 0.2136, + "step": 282470 + }, + { + "epoch": 11.7, + "grad_norm": 0.76171875, + "learning_rate": 0.0003347396815592693, + "loss": 0.1824, + "step": 282480 + }, + { + "epoch": 11.7, + "grad_norm": 0.76171875, + "learning_rate": 0.00033472947830675556, + "loss": 0.2188, + "step": 282490 + }, + { + "epoch": 11.7, + "grad_norm": 0.64453125, + "learning_rate": 0.00033471927489479007, + "loss": 0.1595, + "step": 282500 + }, + { + "epoch": 11.7, + "grad_norm": 0.55859375, + "learning_rate": 0.00033470907132339216, + "loss": 0.2104, + "step": 282510 + }, + { + "epoch": 11.7, + "grad_norm": 0.7109375, + "learning_rate": 0.00033469886759258087, + "loss": 0.1589, + "step": 282520 + }, + { + "epoch": 11.7, + "grad_norm": 0.5390625, + "learning_rate": 0.0003346886637023755, + "loss": 0.1842, + "step": 282530 + }, + { + "epoch": 11.7, + "grad_norm": 0.6484375, + "learning_rate": 0.0003346784596527952, + "loss": 0.2247, + "step": 282540 + }, + { + "epoch": 11.7, + "grad_norm": 0.7109375, + "learning_rate": 0.00033466825544385914, + "loss": 0.1814, + "step": 282550 + }, + { + "epoch": 11.7, + "grad_norm": 1.1875, + "learning_rate": 0.00033465805107558657, + "loss": 0.2225, + "step": 282560 + }, + { + "epoch": 11.7, + "grad_norm": 4.75, + "learning_rate": 0.0003346478465479967, + "loss": 0.1947, + "step": 282570 + }, + { + "epoch": 11.7, + "grad_norm": 1.328125, + "learning_rate": 0.00033463764186110874, + "loss": 0.2282, + "step": 282580 + }, + { + "epoch": 11.7, + "grad_norm": 0.63671875, + "learning_rate": 0.0003346274370149419, + "loss": 0.2532, + "step": 282590 + }, + { + "epoch": 11.71, + "grad_norm": 0.73046875, + "learning_rate": 0.00033461723200951533, + "loss": 0.1873, + "step": 282600 + }, + { + "epoch": 11.71, + "grad_norm": 0.5234375, + "learning_rate": 0.00033460702684484823, + "loss": 0.193, + "step": 282610 + }, + { + "epoch": 11.71, + "grad_norm": 0.73828125, + "learning_rate": 0.00033459682152095987, + "loss": 0.2278, + "step": 282620 + }, + { + "epoch": 11.71, + "grad_norm": 1.03125, + "learning_rate": 0.0003345866160378695, + "loss": 0.1999, + "step": 282630 + }, + { + "epoch": 11.71, + "grad_norm": 1.234375, + "learning_rate": 0.00033457641039559614, + "loss": 0.1134, + "step": 282640 + }, + { + "epoch": 11.71, + "grad_norm": 1.203125, + "learning_rate": 0.0003345662045941592, + "loss": 0.1794, + "step": 282650 + }, + { + "epoch": 11.71, + "grad_norm": 0.63671875, + "learning_rate": 0.0003345559986335778, + "loss": 0.1889, + "step": 282660 + }, + { + "epoch": 11.71, + "grad_norm": 1.4296875, + "learning_rate": 0.00033454579251387105, + "loss": 0.1859, + "step": 282670 + }, + { + "epoch": 11.71, + "grad_norm": 0.427734375, + "learning_rate": 0.00033453558623505834, + "loss": 0.1609, + "step": 282680 + }, + { + "epoch": 11.71, + "grad_norm": 0.6484375, + "learning_rate": 0.0003345253797971587, + "loss": 0.179, + "step": 282690 + }, + { + "epoch": 11.71, + "grad_norm": 0.75390625, + "learning_rate": 0.00033451517320019146, + "loss": 0.1866, + "step": 282700 + }, + { + "epoch": 11.71, + "grad_norm": 0.82421875, + "learning_rate": 0.00033450496644417583, + "loss": 0.196, + "step": 282710 + }, + { + "epoch": 11.71, + "grad_norm": 0.474609375, + "learning_rate": 0.00033449475952913095, + "loss": 0.2443, + "step": 282720 + }, + { + "epoch": 11.71, + "grad_norm": 0.91796875, + "learning_rate": 0.000334484552455076, + "loss": 0.2047, + "step": 282730 + }, + { + "epoch": 11.71, + "grad_norm": 0.6640625, + "learning_rate": 0.00033447434522203035, + "loss": 0.1944, + "step": 282740 + }, + { + "epoch": 11.71, + "grad_norm": 1.21875, + "learning_rate": 0.00033446413783001306, + "loss": 0.1504, + "step": 282750 + }, + { + "epoch": 11.71, + "grad_norm": 0.48828125, + "learning_rate": 0.0003344539302790434, + "loss": 0.2271, + "step": 282760 + }, + { + "epoch": 11.71, + "grad_norm": 0.4765625, + "learning_rate": 0.0003344437225691405, + "loss": 0.2313, + "step": 282770 + }, + { + "epoch": 11.71, + "grad_norm": 0.39453125, + "learning_rate": 0.0003344335147003237, + "loss": 0.1629, + "step": 282780 + }, + { + "epoch": 11.71, + "grad_norm": 1.1875, + "learning_rate": 0.0003344233066726121, + "loss": 0.2054, + "step": 282790 + }, + { + "epoch": 11.71, + "grad_norm": 0.58984375, + "learning_rate": 0.000334413098486025, + "loss": 0.1913, + "step": 282800 + }, + { + "epoch": 11.71, + "grad_norm": 1.1015625, + "learning_rate": 0.0003344028901405815, + "loss": 0.2458, + "step": 282810 + }, + { + "epoch": 11.71, + "grad_norm": 0.380859375, + "learning_rate": 0.00033439268163630093, + "loss": 0.1916, + "step": 282820 + }, + { + "epoch": 11.71, + "grad_norm": 0.796875, + "learning_rate": 0.0003343824729732024, + "loss": 0.1631, + "step": 282830 + }, + { + "epoch": 11.72, + "grad_norm": 0.359375, + "learning_rate": 0.0003343722641513052, + "loss": 0.1173, + "step": 282840 + }, + { + "epoch": 11.72, + "grad_norm": 0.63671875, + "learning_rate": 0.0003343620551706286, + "loss": 0.1766, + "step": 282850 + }, + { + "epoch": 11.72, + "grad_norm": 0.6796875, + "learning_rate": 0.00033435184603119154, + "loss": 0.144, + "step": 282860 + }, + { + "epoch": 11.72, + "grad_norm": 0.83203125, + "learning_rate": 0.0003343416367330135, + "loss": 0.1419, + "step": 282870 + }, + { + "epoch": 11.72, + "grad_norm": 0.400390625, + "learning_rate": 0.00033433142727611363, + "loss": 0.1553, + "step": 282880 + }, + { + "epoch": 11.72, + "grad_norm": 0.44921875, + "learning_rate": 0.00033432121766051114, + "loss": 0.1654, + "step": 282890 + }, + { + "epoch": 11.72, + "grad_norm": 0.37109375, + "learning_rate": 0.00033431100788622515, + "loss": 0.1701, + "step": 282900 + }, + { + "epoch": 11.72, + "grad_norm": 0.765625, + "learning_rate": 0.000334300797953275, + "loss": 0.2004, + "step": 282910 + }, + { + "epoch": 11.72, + "grad_norm": 0.419921875, + "learning_rate": 0.00033429058786167985, + "loss": 0.2071, + "step": 282920 + }, + { + "epoch": 11.72, + "grad_norm": 0.5625, + "learning_rate": 0.0003342803776114589, + "loss": 0.242, + "step": 282930 + }, + { + "epoch": 11.72, + "grad_norm": 1.203125, + "learning_rate": 0.00033427016720263136, + "loss": 0.1569, + "step": 282940 + }, + { + "epoch": 11.72, + "grad_norm": 0.7421875, + "learning_rate": 0.00033425995663521657, + "loss": 0.1864, + "step": 282950 + }, + { + "epoch": 11.72, + "grad_norm": 1.046875, + "learning_rate": 0.0003342497459092335, + "loss": 0.2184, + "step": 282960 + }, + { + "epoch": 11.72, + "grad_norm": 0.8125, + "learning_rate": 0.00033423953502470164, + "loss": 0.2437, + "step": 282970 + }, + { + "epoch": 11.72, + "grad_norm": 0.55859375, + "learning_rate": 0.00033422932398164, + "loss": 0.1885, + "step": 282980 + }, + { + "epoch": 11.72, + "grad_norm": 0.9140625, + "learning_rate": 0.0003342191127800678, + "loss": 0.2011, + "step": 282990 + }, + { + "epoch": 11.72, + "grad_norm": 0.93359375, + "learning_rate": 0.0003342089014200044, + "loss": 0.1988, + "step": 283000 + }, + { + "epoch": 11.72, + "grad_norm": 0.7265625, + "learning_rate": 0.000334198689901469, + "loss": 0.2019, + "step": 283010 + }, + { + "epoch": 11.72, + "grad_norm": 0.515625, + "learning_rate": 0.0003341884782244806, + "loss": 0.2319, + "step": 283020 + }, + { + "epoch": 11.72, + "grad_norm": 0.279296875, + "learning_rate": 0.00033417826638905873, + "loss": 0.1636, + "step": 283030 + }, + { + "epoch": 11.72, + "grad_norm": 0.380859375, + "learning_rate": 0.00033416805439522235, + "loss": 0.2299, + "step": 283040 + }, + { + "epoch": 11.72, + "grad_norm": 0.6796875, + "learning_rate": 0.00033415784224299086, + "loss": 0.1831, + "step": 283050 + }, + { + "epoch": 11.72, + "grad_norm": 0.359375, + "learning_rate": 0.00033414762993238335, + "loss": 0.2243, + "step": 283060 + }, + { + "epoch": 11.72, + "grad_norm": 0.515625, + "learning_rate": 0.00033413741746341906, + "loss": 0.2379, + "step": 283070 + }, + { + "epoch": 11.73, + "grad_norm": 1.0625, + "learning_rate": 0.00033412720483611734, + "loss": 0.1811, + "step": 283080 + }, + { + "epoch": 11.73, + "grad_norm": 1.1171875, + "learning_rate": 0.0003341169920504972, + "loss": 0.1946, + "step": 283090 + }, + { + "epoch": 11.73, + "grad_norm": 0.5, + "learning_rate": 0.00033410677910657806, + "loss": 0.1649, + "step": 283100 + }, + { + "epoch": 11.73, + "grad_norm": 0.67578125, + "learning_rate": 0.000334096566004379, + "loss": 0.2035, + "step": 283110 + }, + { + "epoch": 11.73, + "grad_norm": 0.4765625, + "learning_rate": 0.0003340863527439192, + "loss": 0.2154, + "step": 283120 + }, + { + "epoch": 11.73, + "grad_norm": 0.2197265625, + "learning_rate": 0.0003340761393252181, + "loss": 0.1984, + "step": 283130 + }, + { + "epoch": 11.73, + "grad_norm": 0.25, + "learning_rate": 0.0003340659257482947, + "loss": 0.1421, + "step": 283140 + }, + { + "epoch": 11.73, + "grad_norm": 0.76171875, + "learning_rate": 0.0003340557120131683, + "loss": 0.2256, + "step": 283150 + }, + { + "epoch": 11.73, + "grad_norm": 0.6875, + "learning_rate": 0.00033404549811985823, + "loss": 0.1523, + "step": 283160 + }, + { + "epoch": 11.73, + "grad_norm": 1.140625, + "learning_rate": 0.00033403528406838345, + "loss": 0.1805, + "step": 283170 + }, + { + "epoch": 11.73, + "grad_norm": 0.86328125, + "learning_rate": 0.00033402506985876347, + "loss": 0.1768, + "step": 283180 + }, + { + "epoch": 11.73, + "grad_norm": 0.8359375, + "learning_rate": 0.0003340148554910174, + "loss": 0.186, + "step": 283190 + }, + { + "epoch": 11.73, + "grad_norm": 1.0390625, + "learning_rate": 0.0003340046409651643, + "loss": 0.2308, + "step": 283200 + }, + { + "epoch": 11.73, + "grad_norm": 0.89453125, + "learning_rate": 0.0003339944262812237, + "loss": 0.1808, + "step": 283210 + }, + { + "epoch": 11.73, + "grad_norm": 1.1796875, + "learning_rate": 0.00033398421143921456, + "loss": 0.1572, + "step": 283220 + }, + { + "epoch": 11.73, + "grad_norm": 0.53125, + "learning_rate": 0.0003339739964391562, + "loss": 0.2617, + "step": 283230 + }, + { + "epoch": 11.73, + "grad_norm": 1.109375, + "learning_rate": 0.0003339637812810679, + "loss": 0.173, + "step": 283240 + }, + { + "epoch": 11.73, + "grad_norm": 0.5, + "learning_rate": 0.00033395356596496874, + "loss": 0.1624, + "step": 283250 + }, + { + "epoch": 11.73, + "grad_norm": 0.50390625, + "learning_rate": 0.00033394335049087814, + "loss": 0.1934, + "step": 283260 + }, + { + "epoch": 11.73, + "grad_norm": 0.73046875, + "learning_rate": 0.0003339331348588152, + "loss": 0.2056, + "step": 283270 + }, + { + "epoch": 11.73, + "grad_norm": 0.7109375, + "learning_rate": 0.0003339229190687991, + "loss": 0.1691, + "step": 283280 + }, + { + "epoch": 11.73, + "grad_norm": 0.26953125, + "learning_rate": 0.0003339127031208492, + "loss": 0.196, + "step": 283290 + }, + { + "epoch": 11.73, + "grad_norm": 1.0, + "learning_rate": 0.0003339024870149846, + "loss": 0.162, + "step": 283300 + }, + { + "epoch": 11.73, + "grad_norm": 1.390625, + "learning_rate": 0.00033389227075122463, + "loss": 0.1751, + "step": 283310 + }, + { + "epoch": 11.74, + "grad_norm": 0.671875, + "learning_rate": 0.0003338820543295885, + "loss": 0.1764, + "step": 283320 + }, + { + "epoch": 11.74, + "grad_norm": 1.1796875, + "learning_rate": 0.0003338718377500953, + "loss": 0.2678, + "step": 283330 + }, + { + "epoch": 11.74, + "grad_norm": 0.57421875, + "learning_rate": 0.0003338616210127644, + "loss": 0.1889, + "step": 283340 + }, + { + "epoch": 11.74, + "grad_norm": 0.84765625, + "learning_rate": 0.00033385140411761505, + "loss": 0.2311, + "step": 283350 + }, + { + "epoch": 11.74, + "grad_norm": 0.6171875, + "learning_rate": 0.00033384118706466625, + "loss": 0.2252, + "step": 283360 + }, + { + "epoch": 11.74, + "grad_norm": 0.82421875, + "learning_rate": 0.0003338309698539376, + "loss": 0.244, + "step": 283370 + }, + { + "epoch": 11.74, + "grad_norm": 0.40625, + "learning_rate": 0.00033382075248544795, + "loss": 0.1801, + "step": 283380 + }, + { + "epoch": 11.74, + "grad_norm": 0.7890625, + "learning_rate": 0.00033381053495921677, + "loss": 0.2191, + "step": 283390 + }, + { + "epoch": 11.74, + "grad_norm": 1.4375, + "learning_rate": 0.00033380031727526323, + "loss": 0.2016, + "step": 283400 + }, + { + "epoch": 11.74, + "grad_norm": 0.50390625, + "learning_rate": 0.00033379009943360646, + "loss": 0.2164, + "step": 283410 + }, + { + "epoch": 11.74, + "grad_norm": 0.578125, + "learning_rate": 0.00033377988143426583, + "loss": 0.1661, + "step": 283420 + }, + { + "epoch": 11.74, + "grad_norm": 1.171875, + "learning_rate": 0.00033376966327726057, + "loss": 0.2103, + "step": 283430 + }, + { + "epoch": 11.74, + "grad_norm": 0.734375, + "learning_rate": 0.00033375944496260966, + "loss": 0.1872, + "step": 283440 + }, + { + "epoch": 11.74, + "grad_norm": 1.1640625, + "learning_rate": 0.0003337492264903327, + "loss": 0.1761, + "step": 283450 + }, + { + "epoch": 11.74, + "grad_norm": 0.474609375, + "learning_rate": 0.00033373900786044867, + "loss": 0.1541, + "step": 283460 + }, + { + "epoch": 11.74, + "grad_norm": 1.6328125, + "learning_rate": 0.00033372878907297687, + "loss": 0.1597, + "step": 283470 + }, + { + "epoch": 11.74, + "grad_norm": 1.140625, + "learning_rate": 0.0003337185701279366, + "loss": 0.2027, + "step": 283480 + }, + { + "epoch": 11.74, + "grad_norm": 1.34375, + "learning_rate": 0.0003337083510253469, + "loss": 0.1764, + "step": 283490 + }, + { + "epoch": 11.74, + "grad_norm": 0.7265625, + "learning_rate": 0.0003336981317652272, + "loss": 0.1782, + "step": 283500 + }, + { + "epoch": 11.74, + "grad_norm": 0.640625, + "learning_rate": 0.0003336879123475967, + "loss": 0.2069, + "step": 283510 + }, + { + "epoch": 11.74, + "grad_norm": 1.0859375, + "learning_rate": 0.00033367769277247453, + "loss": 0.2148, + "step": 283520 + }, + { + "epoch": 11.74, + "grad_norm": 0.984375, + "learning_rate": 0.00033366747303988, + "loss": 0.1411, + "step": 283530 + }, + { + "epoch": 11.74, + "grad_norm": 0.6171875, + "learning_rate": 0.00033365725314983227, + "loss": 0.1917, + "step": 283540 + }, + { + "epoch": 11.74, + "grad_norm": 0.9140625, + "learning_rate": 0.0003336470331023507, + "loss": 0.1459, + "step": 283550 + }, + { + "epoch": 11.75, + "grad_norm": 1.2265625, + "learning_rate": 0.0003336368128974544, + "loss": 0.25, + "step": 283560 + }, + { + "epoch": 11.75, + "grad_norm": 0.369140625, + "learning_rate": 0.0003336265925351626, + "loss": 0.1797, + "step": 283570 + }, + { + "epoch": 11.75, + "grad_norm": 0.56640625, + "learning_rate": 0.00033361637201549465, + "loss": 0.1929, + "step": 283580 + }, + { + "epoch": 11.75, + "grad_norm": 0.88671875, + "learning_rate": 0.00033360615133846966, + "loss": 0.213, + "step": 283590 + }, + { + "epoch": 11.75, + "grad_norm": 0.3671875, + "learning_rate": 0.000333595930504107, + "loss": 0.2342, + "step": 283600 + }, + { + "epoch": 11.75, + "grad_norm": 0.43359375, + "learning_rate": 0.0003335857095124258, + "loss": 0.207, + "step": 283610 + }, + { + "epoch": 11.75, + "grad_norm": 0.6640625, + "learning_rate": 0.0003335754883634453, + "loss": 0.2073, + "step": 283620 + }, + { + "epoch": 11.75, + "grad_norm": 0.71875, + "learning_rate": 0.00033356526705718477, + "loss": 0.1515, + "step": 283630 + }, + { + "epoch": 11.75, + "grad_norm": 0.5390625, + "learning_rate": 0.0003335550455936635, + "loss": 0.2261, + "step": 283640 + }, + { + "epoch": 11.75, + "grad_norm": 0.8515625, + "learning_rate": 0.0003335448239729005, + "loss": 0.1963, + "step": 283650 + }, + { + "epoch": 11.75, + "grad_norm": 0.9453125, + "learning_rate": 0.00033353460219491527, + "loss": 0.1666, + "step": 283660 + }, + { + "epoch": 11.75, + "grad_norm": 2.1875, + "learning_rate": 0.00033352438025972695, + "loss": 0.1981, + "step": 283670 + }, + { + "epoch": 11.75, + "grad_norm": 0.65625, + "learning_rate": 0.0003335141581673547, + "loss": 0.1728, + "step": 283680 + }, + { + "epoch": 11.75, + "grad_norm": 0.4375, + "learning_rate": 0.00033350393591781793, + "loss": 0.1516, + "step": 283690 + }, + { + "epoch": 11.75, + "grad_norm": 1.5625, + "learning_rate": 0.0003334937135111357, + "loss": 0.2133, + "step": 283700 + }, + { + "epoch": 11.75, + "grad_norm": 1.7265625, + "learning_rate": 0.0003334834909473272, + "loss": 0.2432, + "step": 283710 + }, + { + "epoch": 11.75, + "grad_norm": 0.625, + "learning_rate": 0.00033347326822641193, + "loss": 0.1506, + "step": 283720 + }, + { + "epoch": 11.75, + "grad_norm": 1.03125, + "learning_rate": 0.000333463045348409, + "loss": 0.2099, + "step": 283730 + }, + { + "epoch": 11.75, + "grad_norm": 1.21875, + "learning_rate": 0.00033345282231333757, + "loss": 0.2041, + "step": 283740 + }, + { + "epoch": 11.75, + "grad_norm": 0.5625, + "learning_rate": 0.000333442599121217, + "loss": 0.2361, + "step": 283750 + }, + { + "epoch": 11.75, + "grad_norm": 0.7890625, + "learning_rate": 0.0003334323757720664, + "loss": 0.2014, + "step": 283760 + }, + { + "epoch": 11.75, + "grad_norm": 1.03125, + "learning_rate": 0.0003334221522659051, + "loss": 0.2185, + "step": 283770 + }, + { + "epoch": 11.75, + "grad_norm": 1.390625, + "learning_rate": 0.00033341192860275235, + "loss": 0.192, + "step": 283780 + }, + { + "epoch": 11.75, + "grad_norm": 1.3359375, + "learning_rate": 0.0003334017047826273, + "loss": 0.2108, + "step": 283790 + }, + { + "epoch": 11.75, + "grad_norm": 0.51171875, + "learning_rate": 0.0003333914808055493, + "loss": 0.2319, + "step": 283800 + }, + { + "epoch": 11.76, + "grad_norm": 0.478515625, + "learning_rate": 0.00033338125667153754, + "loss": 0.2226, + "step": 283810 + }, + { + "epoch": 11.76, + "grad_norm": 0.69140625, + "learning_rate": 0.0003333710323806112, + "loss": 0.1964, + "step": 283820 + }, + { + "epoch": 11.76, + "grad_norm": 0.74609375, + "learning_rate": 0.0003333608079327896, + "loss": 0.2038, + "step": 283830 + }, + { + "epoch": 11.76, + "grad_norm": 0.09765625, + "learning_rate": 0.000333350583328092, + "loss": 0.2144, + "step": 283840 + }, + { + "epoch": 11.76, + "grad_norm": 1.609375, + "learning_rate": 0.0003333403585665377, + "loss": 0.1883, + "step": 283850 + }, + { + "epoch": 11.76, + "grad_norm": 0.80859375, + "learning_rate": 0.0003333301336481456, + "loss": 0.2002, + "step": 283860 + }, + { + "epoch": 11.76, + "grad_norm": 0.84765625, + "learning_rate": 0.00033331990857293536, + "loss": 0.1927, + "step": 283870 + }, + { + "epoch": 11.76, + "grad_norm": 0.70703125, + "learning_rate": 0.000333309683340926, + "loss": 0.1591, + "step": 283880 + }, + { + "epoch": 11.76, + "grad_norm": 0.7578125, + "learning_rate": 0.0003332994579521368, + "loss": 0.2033, + "step": 283890 + }, + { + "epoch": 11.76, + "grad_norm": 2.484375, + "learning_rate": 0.00033328923240658704, + "loss": 0.2066, + "step": 283900 + }, + { + "epoch": 11.76, + "grad_norm": 3.15625, + "learning_rate": 0.00033327900670429594, + "loss": 0.1957, + "step": 283910 + }, + { + "epoch": 11.76, + "grad_norm": 0.50390625, + "learning_rate": 0.0003332687808452827, + "loss": 0.1999, + "step": 283920 + }, + { + "epoch": 11.76, + "grad_norm": 0.78515625, + "learning_rate": 0.0003332585548295667, + "loss": 0.1773, + "step": 283930 + }, + { + "epoch": 11.76, + "grad_norm": 0.73046875, + "learning_rate": 0.00033324832865716704, + "loss": 0.2282, + "step": 283940 + }, + { + "epoch": 11.76, + "grad_norm": 0.91015625, + "learning_rate": 0.00033323810232810305, + "loss": 0.1679, + "step": 283950 + }, + { + "epoch": 11.76, + "grad_norm": 0.9453125, + "learning_rate": 0.00033322787584239385, + "loss": 0.2002, + "step": 283960 + }, + { + "epoch": 11.76, + "grad_norm": 1.0625, + "learning_rate": 0.0003332176492000588, + "loss": 0.2105, + "step": 283970 + }, + { + "epoch": 11.76, + "grad_norm": 1.375, + "learning_rate": 0.0003332074224011172, + "loss": 0.2098, + "step": 283980 + }, + { + "epoch": 11.76, + "grad_norm": 0.88671875, + "learning_rate": 0.00033319719544558813, + "loss": 0.2217, + "step": 283990 + }, + { + "epoch": 11.76, + "grad_norm": 0.55859375, + "learning_rate": 0.00033318696833349094, + "loss": 0.197, + "step": 284000 + }, + { + "epoch": 11.76, + "grad_norm": 0.1826171875, + "learning_rate": 0.0003331767410648449, + "loss": 0.1604, + "step": 284010 + }, + { + "epoch": 11.76, + "grad_norm": 1.0625, + "learning_rate": 0.00033316651363966924, + "loss": 0.185, + "step": 284020 + }, + { + "epoch": 11.76, + "grad_norm": 0.53515625, + "learning_rate": 0.0003331562860579831, + "loss": 0.2197, + "step": 284030 + }, + { + "epoch": 11.76, + "grad_norm": 0.55078125, + "learning_rate": 0.00033314605831980584, + "loss": 0.2047, + "step": 284040 + }, + { + "epoch": 11.77, + "grad_norm": 0.2001953125, + "learning_rate": 0.0003331358304251567, + "loss": 0.1831, + "step": 284050 + }, + { + "epoch": 11.77, + "grad_norm": 0.80078125, + "learning_rate": 0.00033312560237405486, + "loss": 0.2187, + "step": 284060 + }, + { + "epoch": 11.77, + "grad_norm": 0.56640625, + "learning_rate": 0.00033311537416651964, + "loss": 0.2019, + "step": 284070 + }, + { + "epoch": 11.77, + "grad_norm": 0.92578125, + "learning_rate": 0.00033310514580257026, + "loss": 0.1833, + "step": 284080 + }, + { + "epoch": 11.77, + "grad_norm": 0.65625, + "learning_rate": 0.00033309491728222596, + "loss": 0.1821, + "step": 284090 + }, + { + "epoch": 11.77, + "grad_norm": 0.671875, + "learning_rate": 0.000333084688605506, + "loss": 0.2168, + "step": 284100 + }, + { + "epoch": 11.77, + "grad_norm": 0.62109375, + "learning_rate": 0.0003330744597724297, + "loss": 0.2018, + "step": 284110 + }, + { + "epoch": 11.77, + "grad_norm": 0.91015625, + "learning_rate": 0.0003330642307830161, + "loss": 0.2131, + "step": 284120 + }, + { + "epoch": 11.77, + "grad_norm": 0.71875, + "learning_rate": 0.00033305400163728465, + "loss": 0.2, + "step": 284130 + }, + { + "epoch": 11.77, + "grad_norm": 0.8359375, + "learning_rate": 0.00033304377233525455, + "loss": 0.2067, + "step": 284140 + }, + { + "epoch": 11.77, + "grad_norm": 0.76953125, + "learning_rate": 0.0003330335428769451, + "loss": 0.235, + "step": 284150 + }, + { + "epoch": 11.77, + "grad_norm": 0.50390625, + "learning_rate": 0.00033302331326237546, + "loss": 0.2111, + "step": 284160 + }, + { + "epoch": 11.77, + "grad_norm": 1.15625, + "learning_rate": 0.00033301308349156485, + "loss": 0.1834, + "step": 284170 + }, + { + "epoch": 11.77, + "grad_norm": 0.3984375, + "learning_rate": 0.0003330028535645326, + "loss": 0.2504, + "step": 284180 + }, + { + "epoch": 11.77, + "grad_norm": 1.1484375, + "learning_rate": 0.000332992623481298, + "loss": 0.2102, + "step": 284190 + }, + { + "epoch": 11.77, + "grad_norm": 1.0234375, + "learning_rate": 0.0003329823932418802, + "loss": 0.1625, + "step": 284200 + }, + { + "epoch": 11.77, + "grad_norm": 1.21875, + "learning_rate": 0.00033297216284629847, + "loss": 0.1924, + "step": 284210 + }, + { + "epoch": 11.77, + "grad_norm": 0.98828125, + "learning_rate": 0.00033296193229457205, + "loss": 0.1975, + "step": 284220 + }, + { + "epoch": 11.77, + "grad_norm": 0.490234375, + "learning_rate": 0.00033295170158672036, + "loss": 0.1095, + "step": 284230 + }, + { + "epoch": 11.77, + "grad_norm": 1.03125, + "learning_rate": 0.0003329414707227625, + "loss": 0.1456, + "step": 284240 + }, + { + "epoch": 11.77, + "grad_norm": 1.640625, + "learning_rate": 0.00033293123970271765, + "loss": 0.1822, + "step": 284250 + }, + { + "epoch": 11.77, + "grad_norm": 0.62890625, + "learning_rate": 0.0003329210085266052, + "loss": 0.2151, + "step": 284260 + }, + { + "epoch": 11.77, + "grad_norm": 0.57421875, + "learning_rate": 0.00033291077719444444, + "loss": 0.1839, + "step": 284270 + }, + { + "epoch": 11.77, + "grad_norm": 0.94921875, + "learning_rate": 0.0003329005457062545, + "loss": 0.1625, + "step": 284280 + }, + { + "epoch": 11.78, + "grad_norm": 1.2890625, + "learning_rate": 0.00033289031406205465, + "loss": 0.1973, + "step": 284290 + }, + { + "epoch": 11.78, + "grad_norm": 0.55859375, + "learning_rate": 0.00033288008226186423, + "loss": 0.1803, + "step": 284300 + }, + { + "epoch": 11.78, + "grad_norm": 0.380859375, + "learning_rate": 0.0003328698503057025, + "loss": 0.1962, + "step": 284310 + }, + { + "epoch": 11.78, + "grad_norm": 0.609375, + "learning_rate": 0.00033285961819358857, + "loss": 0.2261, + "step": 284320 + }, + { + "epoch": 11.78, + "grad_norm": 0.9140625, + "learning_rate": 0.0003328493859255418, + "loss": 0.2173, + "step": 284330 + }, + { + "epoch": 11.78, + "grad_norm": 0.85546875, + "learning_rate": 0.00033283915350158143, + "loss": 0.1737, + "step": 284340 + }, + { + "epoch": 11.78, + "grad_norm": 0.65234375, + "learning_rate": 0.00033282892092172675, + "loss": 0.212, + "step": 284350 + }, + { + "epoch": 11.78, + "grad_norm": 0.353515625, + "learning_rate": 0.000332818688185997, + "loss": 0.1559, + "step": 284360 + }, + { + "epoch": 11.78, + "grad_norm": 0.51171875, + "learning_rate": 0.0003328084552944114, + "loss": 0.1521, + "step": 284370 + }, + { + "epoch": 11.78, + "grad_norm": 0.87890625, + "learning_rate": 0.0003327982222469892, + "loss": 0.1782, + "step": 284380 + }, + { + "epoch": 11.78, + "grad_norm": 0.69140625, + "learning_rate": 0.00033278798904374976, + "loss": 0.2074, + "step": 284390 + }, + { + "epoch": 11.78, + "grad_norm": 0.5390625, + "learning_rate": 0.00033277775568471216, + "loss": 0.2141, + "step": 284400 + }, + { + "epoch": 11.78, + "grad_norm": 0.87890625, + "learning_rate": 0.0003327675221698958, + "loss": 0.166, + "step": 284410 + }, + { + "epoch": 11.78, + "grad_norm": 0.50390625, + "learning_rate": 0.0003327572884993199, + "loss": 0.1746, + "step": 284420 + }, + { + "epoch": 11.78, + "grad_norm": 0.66796875, + "learning_rate": 0.00033274705467300375, + "loss": 0.2, + "step": 284430 + }, + { + "epoch": 11.78, + "grad_norm": 0.640625, + "learning_rate": 0.0003327368206909666, + "loss": 0.1504, + "step": 284440 + }, + { + "epoch": 11.78, + "grad_norm": 0.482421875, + "learning_rate": 0.00033272658655322763, + "loss": 0.1677, + "step": 284450 + }, + { + "epoch": 11.78, + "grad_norm": 0.75, + "learning_rate": 0.00033271635225980617, + "loss": 0.2204, + "step": 284460 + }, + { + "epoch": 11.78, + "grad_norm": 0.38671875, + "learning_rate": 0.0003327061178107215, + "loss": 0.2413, + "step": 284470 + }, + { + "epoch": 11.78, + "grad_norm": 0.9296875, + "learning_rate": 0.0003326958832059928, + "loss": 0.1954, + "step": 284480 + }, + { + "epoch": 11.78, + "grad_norm": 0.50390625, + "learning_rate": 0.0003326856484456394, + "loss": 0.2062, + "step": 284490 + }, + { + "epoch": 11.78, + "grad_norm": 0.55078125, + "learning_rate": 0.00033267541352968055, + "loss": 0.1973, + "step": 284500 + }, + { + "epoch": 11.78, + "grad_norm": 0.474609375, + "learning_rate": 0.0003326651784581355, + "loss": 0.1556, + "step": 284510 + }, + { + "epoch": 11.78, + "grad_norm": 1.0, + "learning_rate": 0.00033265494323102355, + "loss": 0.1893, + "step": 284520 + }, + { + "epoch": 11.79, + "grad_norm": 0.91796875, + "learning_rate": 0.00033264470784836385, + "loss": 0.1971, + "step": 284530 + }, + { + "epoch": 11.79, + "grad_norm": 0.0, + "learning_rate": 0.0003326344723101758, + "loss": 0.2048, + "step": 284540 + }, + { + "epoch": 11.79, + "grad_norm": 0.494140625, + "learning_rate": 0.0003326242366164786, + "loss": 0.1891, + "step": 284550 + }, + { + "epoch": 11.79, + "grad_norm": 0.91015625, + "learning_rate": 0.00033261400076729135, + "loss": 0.1811, + "step": 284560 + }, + { + "epoch": 11.79, + "grad_norm": 0.8828125, + "learning_rate": 0.00033260376476263365, + "loss": 0.2012, + "step": 284570 + }, + { + "epoch": 11.79, + "grad_norm": 0.75, + "learning_rate": 0.00033259352860252455, + "loss": 0.1969, + "step": 284580 + }, + { + "epoch": 11.79, + "grad_norm": 0.5859375, + "learning_rate": 0.0003325832922869833, + "loss": 0.1831, + "step": 284590 + }, + { + "epoch": 11.79, + "grad_norm": 0.81640625, + "learning_rate": 0.0003325730558160293, + "loss": 0.1961, + "step": 284600 + }, + { + "epoch": 11.79, + "grad_norm": 1.6640625, + "learning_rate": 0.00033256281918968164, + "loss": 0.2196, + "step": 284610 + }, + { + "epoch": 11.79, + "grad_norm": 0.765625, + "learning_rate": 0.00033255258240795965, + "loss": 0.1666, + "step": 284620 + }, + { + "epoch": 11.79, + "grad_norm": 0.58203125, + "learning_rate": 0.0003325423454708827, + "loss": 0.208, + "step": 284630 + }, + { + "epoch": 11.79, + "grad_norm": 1.1171875, + "learning_rate": 0.0003325321083784699, + "loss": 0.2408, + "step": 284640 + }, + { + "epoch": 11.79, + "grad_norm": 0.6015625, + "learning_rate": 0.00033252187113074063, + "loss": 0.2086, + "step": 284650 + }, + { + "epoch": 11.79, + "grad_norm": 0.5234375, + "learning_rate": 0.0003325116337277141, + "loss": 0.1707, + "step": 284660 + }, + { + "epoch": 11.79, + "grad_norm": 0.60546875, + "learning_rate": 0.0003325013961694095, + "loss": 0.1987, + "step": 284670 + }, + { + "epoch": 11.79, + "grad_norm": 0.30859375, + "learning_rate": 0.0003324911584558464, + "loss": 0.197, + "step": 284680 + }, + { + "epoch": 11.79, + "grad_norm": 0.53125, + "learning_rate": 0.0003324809205870436, + "loss": 0.2464, + "step": 284690 + }, + { + "epoch": 11.79, + "grad_norm": 0.62109375, + "learning_rate": 0.00033247068256302077, + "loss": 0.2019, + "step": 284700 + }, + { + "epoch": 11.79, + "grad_norm": 1.171875, + "learning_rate": 0.00033246044438379704, + "loss": 0.1434, + "step": 284710 + }, + { + "epoch": 11.79, + "grad_norm": 1.0234375, + "learning_rate": 0.0003324502060493915, + "loss": 0.1907, + "step": 284720 + }, + { + "epoch": 11.79, + "grad_norm": 0.62890625, + "learning_rate": 0.00033243996755982373, + "loss": 0.1933, + "step": 284730 + }, + { + "epoch": 11.79, + "grad_norm": 1.0859375, + "learning_rate": 0.00033242972891511276, + "loss": 0.1885, + "step": 284740 + }, + { + "epoch": 11.79, + "grad_norm": 0.412109375, + "learning_rate": 0.00033241949011527793, + "loss": 0.151, + "step": 284750 + }, + { + "epoch": 11.79, + "grad_norm": 1.4453125, + "learning_rate": 0.0003324092511603386, + "loss": 0.1653, + "step": 284760 + }, + { + "epoch": 11.8, + "grad_norm": 0.4453125, + "learning_rate": 0.00033239901205031386, + "loss": 0.1642, + "step": 284770 + }, + { + "epoch": 11.8, + "grad_norm": 0.69921875, + "learning_rate": 0.00033238877278522316, + "loss": 0.172, + "step": 284780 + }, + { + "epoch": 11.8, + "grad_norm": 0.58203125, + "learning_rate": 0.00033237853336508575, + "loss": 0.2095, + "step": 284790 + }, + { + "epoch": 11.8, + "grad_norm": 0.58203125, + "learning_rate": 0.00033236829378992065, + "loss": 0.1511, + "step": 284800 + }, + { + "epoch": 11.8, + "grad_norm": 0.96875, + "learning_rate": 0.00033235805405974745, + "loss": 0.1401, + "step": 284810 + }, + { + "epoch": 11.8, + "grad_norm": 0.53125, + "learning_rate": 0.00033234781417458526, + "loss": 0.2176, + "step": 284820 + }, + { + "epoch": 11.8, + "grad_norm": 0.97265625, + "learning_rate": 0.0003323375741344533, + "loss": 0.2452, + "step": 284830 + }, + { + "epoch": 11.8, + "grad_norm": 2.453125, + "learning_rate": 0.0003323273339393711, + "loss": 0.2193, + "step": 284840 + }, + { + "epoch": 11.8, + "grad_norm": 0.73828125, + "learning_rate": 0.0003323170935893575, + "loss": 0.1688, + "step": 284850 + }, + { + "epoch": 11.8, + "grad_norm": 0.921875, + "learning_rate": 0.0003323068530844322, + "loss": 0.2298, + "step": 284860 + }, + { + "epoch": 11.8, + "grad_norm": 0.287109375, + "learning_rate": 0.00033229661242461427, + "loss": 0.1494, + "step": 284870 + }, + { + "epoch": 11.8, + "grad_norm": 3.171875, + "learning_rate": 0.0003322863716099229, + "loss": 0.2416, + "step": 284880 + }, + { + "epoch": 11.8, + "grad_norm": 1.3984375, + "learning_rate": 0.00033227613064037764, + "loss": 0.1958, + "step": 284890 + }, + { + "epoch": 11.8, + "grad_norm": 0.7109375, + "learning_rate": 0.00033226588951599747, + "loss": 0.2171, + "step": 284900 + }, + { + "epoch": 11.8, + "grad_norm": 0.453125, + "learning_rate": 0.00033225564823680174, + "loss": 0.1708, + "step": 284910 + }, + { + "epoch": 11.8, + "grad_norm": 0.6796875, + "learning_rate": 0.0003322454068028099, + "loss": 0.1945, + "step": 284920 + }, + { + "epoch": 11.8, + "grad_norm": 0.51953125, + "learning_rate": 0.0003322351652140409, + "loss": 0.2008, + "step": 284930 + }, + { + "epoch": 11.8, + "grad_norm": 0.29296875, + "learning_rate": 0.00033222492347051436, + "loss": 0.2073, + "step": 284940 + }, + { + "epoch": 11.8, + "grad_norm": 1.265625, + "learning_rate": 0.0003322146815722494, + "loss": 0.212, + "step": 284950 + }, + { + "epoch": 11.8, + "grad_norm": 0.87890625, + "learning_rate": 0.0003322044395192652, + "loss": 0.2305, + "step": 284960 + }, + { + "epoch": 11.8, + "grad_norm": 0.44140625, + "learning_rate": 0.00033219419731158115, + "loss": 0.216, + "step": 284970 + }, + { + "epoch": 11.8, + "grad_norm": 0.61328125, + "learning_rate": 0.0003321839549492165, + "loss": 0.1597, + "step": 284980 + }, + { + "epoch": 11.8, + "grad_norm": 0.76171875, + "learning_rate": 0.00033217371243219045, + "loss": 0.2057, + "step": 284990 + }, + { + "epoch": 11.8, + "grad_norm": 1.3671875, + "learning_rate": 0.0003321634697605225, + "loss": 0.2386, + "step": 285000 + }, + { + "epoch": 11.81, + "grad_norm": 0.6328125, + "learning_rate": 0.00033215322693423157, + "loss": 0.1398, + "step": 285010 + }, + { + "epoch": 11.81, + "grad_norm": 0.84765625, + "learning_rate": 0.0003321429839533373, + "loss": 0.1765, + "step": 285020 + }, + { + "epoch": 11.81, + "grad_norm": 0.55078125, + "learning_rate": 0.00033213274081785884, + "loss": 0.133, + "step": 285030 + }, + { + "epoch": 11.81, + "grad_norm": 0.498046875, + "learning_rate": 0.00033212249752781524, + "loss": 0.1955, + "step": 285040 + }, + { + "epoch": 11.81, + "grad_norm": 0.98046875, + "learning_rate": 0.00033211225408322616, + "loss": 0.1632, + "step": 285050 + }, + { + "epoch": 11.81, + "grad_norm": 0.8984375, + "learning_rate": 0.00033210201048411057, + "loss": 0.1784, + "step": 285060 + }, + { + "epoch": 11.81, + "grad_norm": 0.28515625, + "learning_rate": 0.0003320917667304879, + "loss": 0.1728, + "step": 285070 + }, + { + "epoch": 11.81, + "grad_norm": 0.64453125, + "learning_rate": 0.0003320815228223774, + "loss": 0.2132, + "step": 285080 + }, + { + "epoch": 11.81, + "grad_norm": 0.8828125, + "learning_rate": 0.00033207127875979826, + "loss": 0.2335, + "step": 285090 + }, + { + "epoch": 11.81, + "grad_norm": 1.5625, + "learning_rate": 0.00033206103454276993, + "loss": 0.1901, + "step": 285100 + }, + { + "epoch": 11.81, + "grad_norm": 0.62890625, + "learning_rate": 0.00033205079017131154, + "loss": 0.1958, + "step": 285110 + }, + { + "epoch": 11.81, + "grad_norm": 0.73046875, + "learning_rate": 0.0003320405456454425, + "loss": 0.1789, + "step": 285120 + }, + { + "epoch": 11.81, + "grad_norm": 0.50390625, + "learning_rate": 0.00033203030096518195, + "loss": 0.1882, + "step": 285130 + }, + { + "epoch": 11.81, + "grad_norm": 1.1796875, + "learning_rate": 0.00033202005613054925, + "loss": 0.2179, + "step": 285140 + }, + { + "epoch": 11.81, + "grad_norm": 0.69921875, + "learning_rate": 0.00033200981114156367, + "loss": 0.1793, + "step": 285150 + }, + { + "epoch": 11.81, + "grad_norm": 0.796875, + "learning_rate": 0.0003319995659982445, + "loss": 0.181, + "step": 285160 + }, + { + "epoch": 11.81, + "grad_norm": 0.64453125, + "learning_rate": 0.0003319893207006109, + "loss": 0.1821, + "step": 285170 + }, + { + "epoch": 11.81, + "grad_norm": 1.515625, + "learning_rate": 0.0003319790752486824, + "loss": 0.1768, + "step": 285180 + }, + { + "epoch": 11.81, + "grad_norm": 0.96875, + "learning_rate": 0.000331968829642478, + "loss": 0.1992, + "step": 285190 + }, + { + "epoch": 11.81, + "grad_norm": 1.359375, + "learning_rate": 0.00033195858388201715, + "loss": 0.1712, + "step": 285200 + }, + { + "epoch": 11.81, + "grad_norm": 0.66015625, + "learning_rate": 0.00033194833796731916, + "loss": 0.17, + "step": 285210 + }, + { + "epoch": 11.81, + "grad_norm": 0.042724609375, + "learning_rate": 0.00033193809189840316, + "loss": 0.1694, + "step": 285220 + }, + { + "epoch": 11.81, + "grad_norm": 0.54296875, + "learning_rate": 0.00033192784567528857, + "loss": 0.1918, + "step": 285230 + }, + { + "epoch": 11.81, + "grad_norm": 1.4609375, + "learning_rate": 0.00033191759929799457, + "loss": 0.177, + "step": 285240 + }, + { + "epoch": 11.82, + "grad_norm": 0.96875, + "learning_rate": 0.0003319073527665406, + "loss": 0.2009, + "step": 285250 + }, + { + "epoch": 11.82, + "grad_norm": 0.74609375, + "learning_rate": 0.00033189710608094584, + "loss": 0.2051, + "step": 285260 + }, + { + "epoch": 11.82, + "grad_norm": 0.396484375, + "learning_rate": 0.0003318868592412294, + "loss": 0.1712, + "step": 285270 + }, + { + "epoch": 11.82, + "grad_norm": 0.94140625, + "learning_rate": 0.0003318766122474109, + "loss": 0.1994, + "step": 285280 + }, + { + "epoch": 11.82, + "grad_norm": 0.83203125, + "learning_rate": 0.0003318663650995094, + "loss": 0.2271, + "step": 285290 + }, + { + "epoch": 11.82, + "grad_norm": 1.1171875, + "learning_rate": 0.00033185611779754424, + "loss": 0.1777, + "step": 285300 + }, + { + "epoch": 11.82, + "grad_norm": 1.03125, + "learning_rate": 0.0003318458703415347, + "loss": 0.161, + "step": 285310 + }, + { + "epoch": 11.82, + "grad_norm": 1.3046875, + "learning_rate": 0.00033183562273150004, + "loss": 0.2203, + "step": 285320 + }, + { + "epoch": 11.82, + "grad_norm": 0.349609375, + "learning_rate": 0.00033182537496745964, + "loss": 0.1899, + "step": 285330 + }, + { + "epoch": 11.82, + "grad_norm": 1.9921875, + "learning_rate": 0.0003318151270494327, + "loss": 0.1928, + "step": 285340 + }, + { + "epoch": 11.82, + "grad_norm": 1.7265625, + "learning_rate": 0.0003318048789774385, + "loss": 0.1762, + "step": 285350 + }, + { + "epoch": 11.82, + "grad_norm": 0.451171875, + "learning_rate": 0.0003317946307514964, + "loss": 0.2205, + "step": 285360 + }, + { + "epoch": 11.82, + "grad_norm": 0.68359375, + "learning_rate": 0.00033178438237162563, + "loss": 0.1382, + "step": 285370 + }, + { + "epoch": 11.82, + "grad_norm": 1.3203125, + "learning_rate": 0.00033177413383784544, + "loss": 0.2119, + "step": 285380 + }, + { + "epoch": 11.82, + "grad_norm": 0.625, + "learning_rate": 0.0003317638851501752, + "loss": 0.2002, + "step": 285390 + }, + { + "epoch": 11.82, + "grad_norm": 0.5859375, + "learning_rate": 0.0003317536363086341, + "loss": 0.2026, + "step": 285400 + }, + { + "epoch": 11.82, + "grad_norm": 1.0546875, + "learning_rate": 0.0003317433873132415, + "loss": 0.2505, + "step": 285410 + }, + { + "epoch": 11.82, + "grad_norm": 0.796875, + "learning_rate": 0.0003317331381640168, + "loss": 0.1881, + "step": 285420 + }, + { + "epoch": 11.82, + "grad_norm": 0.462890625, + "learning_rate": 0.000331722888860979, + "loss": 0.1881, + "step": 285430 + }, + { + "epoch": 11.82, + "grad_norm": 0.875, + "learning_rate": 0.00033171263940414764, + "loss": 0.1604, + "step": 285440 + }, + { + "epoch": 11.82, + "grad_norm": 0.8671875, + "learning_rate": 0.00033170238979354186, + "loss": 0.2006, + "step": 285450 + }, + { + "epoch": 11.82, + "grad_norm": 0.9765625, + "learning_rate": 0.0003316921400291811, + "loss": 0.2638, + "step": 285460 + }, + { + "epoch": 11.82, + "grad_norm": 0.59375, + "learning_rate": 0.0003316818901110844, + "loss": 0.2076, + "step": 285470 + }, + { + "epoch": 11.82, + "grad_norm": 0.296875, + "learning_rate": 0.00033167164003927133, + "loss": 0.1753, + "step": 285480 + }, + { + "epoch": 11.82, + "grad_norm": 1.71875, + "learning_rate": 0.000331661389813761, + "loss": 0.2023, + "step": 285490 + }, + { + "epoch": 11.83, + "grad_norm": 0.48046875, + "learning_rate": 0.0003316511394345728, + "loss": 0.1631, + "step": 285500 + }, + { + "epoch": 11.83, + "grad_norm": 1.03125, + "learning_rate": 0.00033164088890172595, + "loss": 0.21, + "step": 285510 + }, + { + "epoch": 11.83, + "grad_norm": 0.53515625, + "learning_rate": 0.0003316306382152397, + "loss": 0.1583, + "step": 285520 + }, + { + "epoch": 11.83, + "grad_norm": 0.625, + "learning_rate": 0.00033162038737513346, + "loss": 0.1577, + "step": 285530 + }, + { + "epoch": 11.83, + "grad_norm": 0.59375, + "learning_rate": 0.00033161013638142653, + "loss": 0.1998, + "step": 285540 + }, + { + "epoch": 11.83, + "grad_norm": 0.41015625, + "learning_rate": 0.000331599885234138, + "loss": 0.159, + "step": 285550 + }, + { + "epoch": 11.83, + "grad_norm": 1.3359375, + "learning_rate": 0.00033158963393328735, + "loss": 0.2071, + "step": 285560 + }, + { + "epoch": 11.83, + "grad_norm": 1.1015625, + "learning_rate": 0.00033157938247889386, + "loss": 0.1518, + "step": 285570 + }, + { + "epoch": 11.83, + "grad_norm": 1.09375, + "learning_rate": 0.0003315691308709768, + "loss": 0.1872, + "step": 285580 + }, + { + "epoch": 11.83, + "grad_norm": 0.96484375, + "learning_rate": 0.0003315588791095554, + "loss": 0.1726, + "step": 285590 + }, + { + "epoch": 11.83, + "grad_norm": 0.6953125, + "learning_rate": 0.00033154862719464895, + "loss": 0.1964, + "step": 285600 + }, + { + "epoch": 11.83, + "grad_norm": 0.9453125, + "learning_rate": 0.0003315383751262768, + "loss": 0.1361, + "step": 285610 + }, + { + "epoch": 11.83, + "grad_norm": 0.23828125, + "learning_rate": 0.00033152812290445835, + "loss": 0.1754, + "step": 285620 + }, + { + "epoch": 11.83, + "grad_norm": 0.73828125, + "learning_rate": 0.00033151787052921266, + "loss": 0.198, + "step": 285630 + }, + { + "epoch": 11.83, + "grad_norm": 1.3046875, + "learning_rate": 0.0003315076180005592, + "loss": 0.2194, + "step": 285640 + }, + { + "epoch": 11.83, + "grad_norm": 0.5859375, + "learning_rate": 0.0003314973653185172, + "loss": 0.2044, + "step": 285650 + }, + { + "epoch": 11.83, + "grad_norm": 0.69140625, + "learning_rate": 0.00033148711248310594, + "loss": 0.1794, + "step": 285660 + }, + { + "epoch": 11.83, + "grad_norm": 1.3984375, + "learning_rate": 0.00033147685949434476, + "loss": 0.2319, + "step": 285670 + }, + { + "epoch": 11.83, + "grad_norm": 0.62109375, + "learning_rate": 0.0003314666063522529, + "loss": 0.183, + "step": 285680 + }, + { + "epoch": 11.83, + "grad_norm": 0.77734375, + "learning_rate": 0.0003314563530568497, + "loss": 0.1975, + "step": 285690 + }, + { + "epoch": 11.83, + "grad_norm": 0.333984375, + "learning_rate": 0.0003314460996081544, + "loss": 0.174, + "step": 285700 + }, + { + "epoch": 11.83, + "grad_norm": 2.65625, + "learning_rate": 0.00033143584600618637, + "loss": 0.215, + "step": 285710 + }, + { + "epoch": 11.83, + "grad_norm": 0.78125, + "learning_rate": 0.0003314255922509649, + "loss": 0.1817, + "step": 285720 + }, + { + "epoch": 11.83, + "grad_norm": 0.7265625, + "learning_rate": 0.00033141533834250917, + "loss": 0.2044, + "step": 285730 + }, + { + "epoch": 11.84, + "grad_norm": 0.0, + "learning_rate": 0.00033140508428083866, + "loss": 0.187, + "step": 285740 + }, + { + "epoch": 11.84, + "grad_norm": 0.384765625, + "learning_rate": 0.0003313948300659726, + "loss": 0.1449, + "step": 285750 + }, + { + "epoch": 11.84, + "grad_norm": 0.4140625, + "learning_rate": 0.00033138457569793013, + "loss": 0.2491, + "step": 285760 + }, + { + "epoch": 11.84, + "grad_norm": 0.53515625, + "learning_rate": 0.0003313743211767307, + "loss": 0.1748, + "step": 285770 + }, + { + "epoch": 11.84, + "grad_norm": 0.28125, + "learning_rate": 0.0003313640665023937, + "loss": 0.2026, + "step": 285780 + }, + { + "epoch": 11.84, + "grad_norm": 0.890625, + "learning_rate": 0.0003313538116749382, + "loss": 0.2035, + "step": 285790 + }, + { + "epoch": 11.84, + "grad_norm": 2.8125, + "learning_rate": 0.0003313435566943837, + "loss": 0.2039, + "step": 285800 + }, + { + "epoch": 11.84, + "grad_norm": 1.1484375, + "learning_rate": 0.00033133330156074936, + "loss": 0.162, + "step": 285810 + }, + { + "epoch": 11.84, + "grad_norm": 1.9140625, + "learning_rate": 0.00033132304627405453, + "loss": 0.1993, + "step": 285820 + }, + { + "epoch": 11.84, + "grad_norm": 1.109375, + "learning_rate": 0.00033131279083431855, + "loss": 0.1947, + "step": 285830 + }, + { + "epoch": 11.84, + "grad_norm": 0.232421875, + "learning_rate": 0.0003313025352415606, + "loss": 0.2471, + "step": 285840 + }, + { + "epoch": 11.84, + "grad_norm": 0.86328125, + "learning_rate": 0.0003312922794958001, + "loss": 0.2136, + "step": 285850 + }, + { + "epoch": 11.84, + "grad_norm": 1.296875, + "learning_rate": 0.0003312820235970564, + "loss": 0.1598, + "step": 285860 + }, + { + "epoch": 11.84, + "grad_norm": 1.03125, + "learning_rate": 0.00033127176754534847, + "loss": 0.2199, + "step": 285870 + }, + { + "epoch": 11.84, + "grad_norm": 1.9296875, + "learning_rate": 0.0003312615113406961, + "loss": 0.1793, + "step": 285880 + }, + { + "epoch": 11.84, + "grad_norm": 1.9921875, + "learning_rate": 0.0003312512549831182, + "loss": 0.216, + "step": 285890 + }, + { + "epoch": 11.84, + "grad_norm": 0.671875, + "learning_rate": 0.0003312409984726342, + "loss": 0.243, + "step": 285900 + }, + { + "epoch": 11.84, + "grad_norm": 1.4609375, + "learning_rate": 0.00033123074180926355, + "loss": 0.1679, + "step": 285910 + }, + { + "epoch": 11.84, + "grad_norm": 2.0625, + "learning_rate": 0.0003312204849930253, + "loss": 0.1894, + "step": 285920 + }, + { + "epoch": 11.84, + "grad_norm": 0.5078125, + "learning_rate": 0.0003312102280239389, + "loss": 0.1741, + "step": 285930 + }, + { + "epoch": 11.84, + "grad_norm": 0.51953125, + "learning_rate": 0.0003311999709020237, + "loss": 0.1756, + "step": 285940 + }, + { + "epoch": 11.84, + "grad_norm": 0.84375, + "learning_rate": 0.00033118971362729876, + "loss": 0.181, + "step": 285950 + }, + { + "epoch": 11.84, + "grad_norm": 0.66796875, + "learning_rate": 0.0003311794561997837, + "loss": 0.2138, + "step": 285960 + }, + { + "epoch": 11.84, + "grad_norm": 0.431640625, + "learning_rate": 0.00033116919861949756, + "loss": 0.171, + "step": 285970 + }, + { + "epoch": 11.85, + "grad_norm": 1.046875, + "learning_rate": 0.0003311589408864598, + "loss": 0.1854, + "step": 285980 + }, + { + "epoch": 11.85, + "grad_norm": 0.69140625, + "learning_rate": 0.0003311486830006897, + "loss": 0.2566, + "step": 285990 + }, + { + "epoch": 11.85, + "grad_norm": 1.1171875, + "learning_rate": 0.00033113842496220646, + "loss": 0.211, + "step": 286000 + }, + { + "epoch": 11.85, + "grad_norm": 0.60546875, + "learning_rate": 0.0003311281667710296, + "loss": 0.1915, + "step": 286010 + }, + { + "epoch": 11.85, + "grad_norm": 0.83203125, + "learning_rate": 0.0003311179084271782, + "loss": 0.2002, + "step": 286020 + }, + { + "epoch": 11.85, + "grad_norm": 1.140625, + "learning_rate": 0.00033110764993067165, + "loss": 0.1787, + "step": 286030 + }, + { + "epoch": 11.85, + "grad_norm": 0.8671875, + "learning_rate": 0.00033109739128152934, + "loss": 0.1916, + "step": 286040 + }, + { + "epoch": 11.85, + "grad_norm": 0.578125, + "learning_rate": 0.0003310871324797704, + "loss": 0.2008, + "step": 286050 + }, + { + "epoch": 11.85, + "grad_norm": 0.64453125, + "learning_rate": 0.00033107687352541425, + "loss": 0.1537, + "step": 286060 + }, + { + "epoch": 11.85, + "grad_norm": 0.9375, + "learning_rate": 0.00033106661441848026, + "loss": 0.1879, + "step": 286070 + }, + { + "epoch": 11.85, + "grad_norm": 1.2890625, + "learning_rate": 0.00033105635515898754, + "loss": 0.2099, + "step": 286080 + }, + { + "epoch": 11.85, + "grad_norm": 0.98828125, + "learning_rate": 0.0003310460957469556, + "loss": 0.2204, + "step": 286090 + }, + { + "epoch": 11.85, + "grad_norm": 0.51953125, + "learning_rate": 0.0003310358361824036, + "loss": 0.1381, + "step": 286100 + }, + { + "epoch": 11.85, + "grad_norm": 0.76171875, + "learning_rate": 0.0003310255764653509, + "loss": 0.2278, + "step": 286110 + }, + { + "epoch": 11.85, + "grad_norm": 0.44140625, + "learning_rate": 0.000331015316595817, + "loss": 0.1611, + "step": 286120 + }, + { + "epoch": 11.85, + "grad_norm": 0.91015625, + "learning_rate": 0.0003310050565738208, + "loss": 0.1843, + "step": 286130 + }, + { + "epoch": 11.85, + "grad_norm": 0.5859375, + "learning_rate": 0.0003309947963993819, + "loss": 0.1345, + "step": 286140 + }, + { + "epoch": 11.85, + "grad_norm": 1.5078125, + "learning_rate": 0.00033098453607251953, + "loss": 0.2503, + "step": 286150 + }, + { + "epoch": 11.85, + "grad_norm": 0.6953125, + "learning_rate": 0.000330974275593253, + "loss": 0.1881, + "step": 286160 + }, + { + "epoch": 11.85, + "grad_norm": 1.1015625, + "learning_rate": 0.0003309640149616017, + "loss": 0.1827, + "step": 286170 + }, + { + "epoch": 11.85, + "grad_norm": 0.369140625, + "learning_rate": 0.00033095375417758484, + "loss": 0.1719, + "step": 286180 + }, + { + "epoch": 11.85, + "grad_norm": 0.97265625, + "learning_rate": 0.0003309434932412217, + "loss": 0.1942, + "step": 286190 + }, + { + "epoch": 11.85, + "grad_norm": 0.25390625, + "learning_rate": 0.00033093323215253173, + "loss": 0.215, + "step": 286200 + }, + { + "epoch": 11.85, + "grad_norm": 0.291015625, + "learning_rate": 0.0003309229709115341, + "loss": 0.1885, + "step": 286210 + }, + { + "epoch": 11.86, + "grad_norm": 1.09375, + "learning_rate": 0.0003309127095182482, + "loss": 0.1651, + "step": 286220 + }, + { + "epoch": 11.86, + "grad_norm": 0.2138671875, + "learning_rate": 0.00033090244797269333, + "loss": 0.1876, + "step": 286230 + }, + { + "epoch": 11.86, + "grad_norm": 0.435546875, + "learning_rate": 0.0003308921862748887, + "loss": 0.1655, + "step": 286240 + }, + { + "epoch": 11.86, + "grad_norm": 0.48046875, + "learning_rate": 0.0003308819244248538, + "loss": 0.2157, + "step": 286250 + }, + { + "epoch": 11.86, + "grad_norm": 0.265625, + "learning_rate": 0.00033087166242260787, + "loss": 0.1871, + "step": 286260 + }, + { + "epoch": 11.86, + "grad_norm": 0.953125, + "learning_rate": 0.0003308614002681701, + "loss": 0.1997, + "step": 286270 + }, + { + "epoch": 11.86, + "grad_norm": 0.90625, + "learning_rate": 0.0003308511379615601, + "loss": 0.2045, + "step": 286280 + }, + { + "epoch": 11.86, + "grad_norm": 0.92578125, + "learning_rate": 0.0003308408755027968, + "loss": 0.2061, + "step": 286290 + }, + { + "epoch": 11.86, + "grad_norm": 0.70703125, + "learning_rate": 0.0003308306128918998, + "loss": 0.1844, + "step": 286300 + }, + { + "epoch": 11.86, + "grad_norm": 0.6484375, + "learning_rate": 0.0003308203501288883, + "loss": 0.2154, + "step": 286310 + }, + { + "epoch": 11.86, + "grad_norm": 0.97265625, + "learning_rate": 0.0003308100872137815, + "loss": 0.205, + "step": 286320 + }, + { + "epoch": 11.86, + "grad_norm": 0.61328125, + "learning_rate": 0.00033079982414659906, + "loss": 0.2179, + "step": 286330 + }, + { + "epoch": 11.86, + "grad_norm": 0.9453125, + "learning_rate": 0.0003307895609273599, + "loss": 0.1948, + "step": 286340 + }, + { + "epoch": 11.86, + "grad_norm": 1.6640625, + "learning_rate": 0.0003307792975560836, + "loss": 0.1682, + "step": 286350 + }, + { + "epoch": 11.86, + "grad_norm": 1.15625, + "learning_rate": 0.00033076903403278935, + "loss": 0.1873, + "step": 286360 + }, + { + "epoch": 11.86, + "grad_norm": 0.9296875, + "learning_rate": 0.0003307587703574965, + "loss": 0.1966, + "step": 286370 + }, + { + "epoch": 11.86, + "grad_norm": 0.58984375, + "learning_rate": 0.0003307485065302244, + "loss": 0.2145, + "step": 286380 + }, + { + "epoch": 11.86, + "grad_norm": 0.427734375, + "learning_rate": 0.0003307382425509924, + "loss": 0.1964, + "step": 286390 + }, + { + "epoch": 11.86, + "grad_norm": 0.6015625, + "learning_rate": 0.00033072797841981956, + "loss": 0.2089, + "step": 286400 + }, + { + "epoch": 11.86, + "grad_norm": 0.484375, + "learning_rate": 0.0003307177141367255, + "loss": 0.1539, + "step": 286410 + }, + { + "epoch": 11.86, + "grad_norm": 0.6796875, + "learning_rate": 0.0003307074497017294, + "loss": 0.2044, + "step": 286420 + }, + { + "epoch": 11.86, + "grad_norm": 2.609375, + "learning_rate": 0.0003306971851148506, + "loss": 0.1545, + "step": 286430 + }, + { + "epoch": 11.86, + "grad_norm": 0.78125, + "learning_rate": 0.0003306869203761084, + "loss": 0.2203, + "step": 286440 + }, + { + "epoch": 11.86, + "grad_norm": 0.6171875, + "learning_rate": 0.00033067665548552216, + "loss": 0.1929, + "step": 286450 + }, + { + "epoch": 11.87, + "grad_norm": 0.59375, + "learning_rate": 0.00033066639044311113, + "loss": 0.1822, + "step": 286460 + }, + { + "epoch": 11.87, + "grad_norm": 0.83984375, + "learning_rate": 0.0003306561252488947, + "loss": 0.169, + "step": 286470 + }, + { + "epoch": 11.87, + "grad_norm": 0.7109375, + "learning_rate": 0.00033064585990289207, + "loss": 0.1633, + "step": 286480 + }, + { + "epoch": 11.87, + "grad_norm": 0.384765625, + "learning_rate": 0.00033063559440512275, + "loss": 0.1358, + "step": 286490 + }, + { + "epoch": 11.87, + "grad_norm": 1.0234375, + "learning_rate": 0.00033062532875560594, + "loss": 0.1964, + "step": 286500 + }, + { + "epoch": 11.87, + "grad_norm": 0.86328125, + "learning_rate": 0.00033061506295436083, + "loss": 0.236, + "step": 286510 + }, + { + "epoch": 11.87, + "grad_norm": 0.48828125, + "learning_rate": 0.00033060479700140706, + "loss": 0.1792, + "step": 286520 + }, + { + "epoch": 11.87, + "grad_norm": 1.3515625, + "learning_rate": 0.0003305945308967636, + "loss": 0.2339, + "step": 286530 + }, + { + "epoch": 11.87, + "grad_norm": 0.408203125, + "learning_rate": 0.00033058426464045005, + "loss": 0.2255, + "step": 286540 + }, + { + "epoch": 11.87, + "grad_norm": 0.5546875, + "learning_rate": 0.00033057399823248557, + "loss": 0.2187, + "step": 286550 + }, + { + "epoch": 11.87, + "grad_norm": 0.8671875, + "learning_rate": 0.00033056373167288957, + "loss": 0.1954, + "step": 286560 + }, + { + "epoch": 11.87, + "grad_norm": 0.8984375, + "learning_rate": 0.00033055346496168133, + "loss": 0.2347, + "step": 286570 + }, + { + "epoch": 11.87, + "grad_norm": 0.462890625, + "learning_rate": 0.0003305431980988801, + "loss": 0.186, + "step": 286580 + }, + { + "epoch": 11.87, + "grad_norm": 0.7578125, + "learning_rate": 0.0003305329310845053, + "loss": 0.1636, + "step": 286590 + }, + { + "epoch": 11.87, + "grad_norm": 0.5546875, + "learning_rate": 0.0003305226639185763, + "loss": 0.2043, + "step": 286600 + }, + { + "epoch": 11.87, + "grad_norm": 1.4375, + "learning_rate": 0.0003305123966011122, + "loss": 0.211, + "step": 286610 + }, + { + "epoch": 11.87, + "grad_norm": 0.8671875, + "learning_rate": 0.0003305021291321325, + "loss": 0.1821, + "step": 286620 + }, + { + "epoch": 11.87, + "grad_norm": 0.953125, + "learning_rate": 0.00033049186151165653, + "loss": 0.1708, + "step": 286630 + }, + { + "epoch": 11.87, + "grad_norm": 1.0078125, + "learning_rate": 0.00033048159373970354, + "loss": 0.1858, + "step": 286640 + }, + { + "epoch": 11.87, + "grad_norm": 0.515625, + "learning_rate": 0.000330471325816293, + "loss": 0.1772, + "step": 286650 + }, + { + "epoch": 11.87, + "grad_norm": 0.703125, + "learning_rate": 0.000330461057741444, + "loss": 0.1804, + "step": 286660 + }, + { + "epoch": 11.87, + "grad_norm": 0.98046875, + "learning_rate": 0.00033045078951517593, + "loss": 0.1764, + "step": 286670 + }, + { + "epoch": 11.87, + "grad_norm": 0.90234375, + "learning_rate": 0.0003304405211375083, + "loss": 0.2199, + "step": 286680 + }, + { + "epoch": 11.87, + "grad_norm": 0.93359375, + "learning_rate": 0.0003304302526084602, + "loss": 0.164, + "step": 286690 + }, + { + "epoch": 11.88, + "grad_norm": 0.8515625, + "learning_rate": 0.00033041998392805105, + "loss": 0.1615, + "step": 286700 + }, + { + "epoch": 11.88, + "grad_norm": 1.125, + "learning_rate": 0.0003304097150963002, + "loss": 0.1798, + "step": 286710 + }, + { + "epoch": 11.88, + "grad_norm": 1.109375, + "learning_rate": 0.0003303994461132269, + "loss": 0.1366, + "step": 286720 + }, + { + "epoch": 11.88, + "grad_norm": 0.5546875, + "learning_rate": 0.0003303891769788506, + "loss": 0.1164, + "step": 286730 + }, + { + "epoch": 11.88, + "grad_norm": 1.75, + "learning_rate": 0.00033037890769319056, + "loss": 0.2113, + "step": 286740 + }, + { + "epoch": 11.88, + "grad_norm": 0.85546875, + "learning_rate": 0.00033036863825626603, + "loss": 0.2133, + "step": 286750 + }, + { + "epoch": 11.88, + "grad_norm": 1.0546875, + "learning_rate": 0.00033035836866809643, + "loss": 0.2024, + "step": 286760 + }, + { + "epoch": 11.88, + "grad_norm": 0.69921875, + "learning_rate": 0.00033034809892870106, + "loss": 0.1706, + "step": 286770 + }, + { + "epoch": 11.88, + "grad_norm": 1.6328125, + "learning_rate": 0.0003303378290380993, + "loss": 0.1858, + "step": 286780 + }, + { + "epoch": 11.88, + "grad_norm": 0.96875, + "learning_rate": 0.0003303275589963103, + "loss": 0.175, + "step": 286790 + }, + { + "epoch": 11.88, + "grad_norm": 0.75390625, + "learning_rate": 0.0003303172888033535, + "loss": 0.2091, + "step": 286800 + }, + { + "epoch": 11.88, + "grad_norm": 0.78515625, + "learning_rate": 0.00033030701845924835, + "loss": 0.1827, + "step": 286810 + }, + { + "epoch": 11.88, + "grad_norm": 0.640625, + "learning_rate": 0.000330296747964014, + "loss": 0.2159, + "step": 286820 + }, + { + "epoch": 11.88, + "grad_norm": 1.0234375, + "learning_rate": 0.00033028647731766993, + "loss": 0.1273, + "step": 286830 + }, + { + "epoch": 11.88, + "grad_norm": 0.376953125, + "learning_rate": 0.0003302762065202353, + "loss": 0.211, + "step": 286840 + }, + { + "epoch": 11.88, + "grad_norm": 0.6875, + "learning_rate": 0.0003302659355717295, + "loss": 0.1905, + "step": 286850 + }, + { + "epoch": 11.88, + "grad_norm": 0.443359375, + "learning_rate": 0.00033025566447217195, + "loss": 0.1381, + "step": 286860 + }, + { + "epoch": 11.88, + "grad_norm": 0.671875, + "learning_rate": 0.0003302453932215819, + "loss": 0.1654, + "step": 286870 + }, + { + "epoch": 11.88, + "grad_norm": 1.0078125, + "learning_rate": 0.0003302351218199786, + "loss": 0.2006, + "step": 286880 + }, + { + "epoch": 11.88, + "grad_norm": 0.734375, + "learning_rate": 0.00033022485026738157, + "loss": 0.1933, + "step": 286890 + }, + { + "epoch": 11.88, + "grad_norm": 0.62109375, + "learning_rate": 0.00033021457856380993, + "loss": 0.1978, + "step": 286900 + }, + { + "epoch": 11.88, + "grad_norm": 0.65625, + "learning_rate": 0.00033020430670928315, + "loss": 0.2022, + "step": 286910 + }, + { + "epoch": 11.88, + "grad_norm": 0.6015625, + "learning_rate": 0.0003301940347038205, + "loss": 0.1728, + "step": 286920 + }, + { + "epoch": 11.88, + "grad_norm": 0.73046875, + "learning_rate": 0.00033018376254744143, + "loss": 0.235, + "step": 286930 + }, + { + "epoch": 11.89, + "grad_norm": 0.412109375, + "learning_rate": 0.0003301734902401652, + "loss": 0.158, + "step": 286940 + }, + { + "epoch": 11.89, + "grad_norm": 1.1328125, + "learning_rate": 0.000330163217782011, + "loss": 0.1489, + "step": 286950 + }, + { + "epoch": 11.89, + "grad_norm": 0.4609375, + "learning_rate": 0.0003301529451729983, + "loss": 0.185, + "step": 286960 + }, + { + "epoch": 11.89, + "grad_norm": 0.357421875, + "learning_rate": 0.00033014267241314644, + "loss": 0.1585, + "step": 286970 + }, + { + "epoch": 11.89, + "grad_norm": 0.375, + "learning_rate": 0.00033013239950247474, + "loss": 0.2215, + "step": 286980 + }, + { + "epoch": 11.89, + "grad_norm": 1.71875, + "learning_rate": 0.00033012212644100255, + "loss": 0.2091, + "step": 286990 + }, + { + "epoch": 11.89, + "grad_norm": 1.0390625, + "learning_rate": 0.0003301118532287491, + "loss": 0.1615, + "step": 287000 + }, + { + "epoch": 11.89, + "grad_norm": 0.87109375, + "learning_rate": 0.00033010157986573373, + "loss": 0.2164, + "step": 287010 + }, + { + "epoch": 11.89, + "grad_norm": 2.0625, + "learning_rate": 0.00033009130635197604, + "loss": 0.1977, + "step": 287020 + }, + { + "epoch": 11.89, + "grad_norm": 0.65234375, + "learning_rate": 0.000330081032687495, + "loss": 0.1879, + "step": 287030 + }, + { + "epoch": 11.89, + "grad_norm": 0.64453125, + "learning_rate": 0.0003300707588723101, + "loss": 0.1802, + "step": 287040 + }, + { + "epoch": 11.89, + "grad_norm": 0.30078125, + "learning_rate": 0.00033006048490644076, + "loss": 0.1367, + "step": 287050 + }, + { + "epoch": 11.89, + "grad_norm": 0.54296875, + "learning_rate": 0.0003300502107899063, + "loss": 0.195, + "step": 287060 + }, + { + "epoch": 11.89, + "grad_norm": 0.486328125, + "learning_rate": 0.0003300399365227258, + "loss": 0.1782, + "step": 287070 + }, + { + "epoch": 11.89, + "grad_norm": 0.54296875, + "learning_rate": 0.0003300296621049189, + "loss": 0.1958, + "step": 287080 + }, + { + "epoch": 11.89, + "grad_norm": 2.640625, + "learning_rate": 0.00033001938753650476, + "loss": 0.1656, + "step": 287090 + }, + { + "epoch": 11.89, + "grad_norm": 1.328125, + "learning_rate": 0.0003300091128175028, + "loss": 0.2264, + "step": 287100 + }, + { + "epoch": 11.89, + "grad_norm": 0.61328125, + "learning_rate": 0.0003299988379479324, + "loss": 0.1903, + "step": 287110 + }, + { + "epoch": 11.89, + "grad_norm": 0.55078125, + "learning_rate": 0.00032998856292781273, + "loss": 0.1973, + "step": 287120 + }, + { + "epoch": 11.89, + "grad_norm": 0.4296875, + "learning_rate": 0.00032997828775716324, + "loss": 0.1927, + "step": 287130 + }, + { + "epoch": 11.89, + "grad_norm": 0.60546875, + "learning_rate": 0.00032996801243600327, + "loss": 0.2287, + "step": 287140 + }, + { + "epoch": 11.89, + "grad_norm": 0.462890625, + "learning_rate": 0.00032995773696435215, + "loss": 0.2163, + "step": 287150 + }, + { + "epoch": 11.89, + "grad_norm": 1.3671875, + "learning_rate": 0.0003299474613422292, + "loss": 0.211, + "step": 287160 + }, + { + "epoch": 11.89, + "grad_norm": 0.6640625, + "learning_rate": 0.0003299371855696537, + "loss": 0.171, + "step": 287170 + }, + { + "epoch": 11.89, + "grad_norm": 0.84375, + "learning_rate": 0.0003299269096466451, + "loss": 0.1674, + "step": 287180 + }, + { + "epoch": 11.9, + "grad_norm": 0.36328125, + "learning_rate": 0.0003299166335732227, + "loss": 0.2247, + "step": 287190 + }, + { + "epoch": 11.9, + "grad_norm": 0.470703125, + "learning_rate": 0.0003299063573494058, + "loss": 0.1959, + "step": 287200 + }, + { + "epoch": 11.9, + "grad_norm": 0.55078125, + "learning_rate": 0.0003298960809752137, + "loss": 0.1644, + "step": 287210 + }, + { + "epoch": 11.9, + "grad_norm": 0.83984375, + "learning_rate": 0.00032988580445066594, + "loss": 0.1883, + "step": 287220 + }, + { + "epoch": 11.9, + "grad_norm": 0.85546875, + "learning_rate": 0.00032987552777578166, + "loss": 0.2104, + "step": 287230 + }, + { + "epoch": 11.9, + "grad_norm": 1.0, + "learning_rate": 0.0003298652509505803, + "loss": 0.2391, + "step": 287240 + }, + { + "epoch": 11.9, + "grad_norm": 0.71875, + "learning_rate": 0.00032985497397508106, + "loss": 0.1539, + "step": 287250 + }, + { + "epoch": 11.9, + "grad_norm": 0.2197265625, + "learning_rate": 0.00032984469684930345, + "loss": 0.1828, + "step": 287260 + }, + { + "epoch": 11.9, + "grad_norm": 1.3359375, + "learning_rate": 0.00032983441957326674, + "loss": 0.2204, + "step": 287270 + }, + { + "epoch": 11.9, + "grad_norm": 1.2109375, + "learning_rate": 0.0003298241421469903, + "loss": 0.1849, + "step": 287280 + }, + { + "epoch": 11.9, + "grad_norm": 0.9140625, + "learning_rate": 0.0003298138645704934, + "loss": 0.1963, + "step": 287290 + }, + { + "epoch": 11.9, + "grad_norm": 0.9921875, + "learning_rate": 0.00032980358684379544, + "loss": 0.2398, + "step": 287300 + }, + { + "epoch": 11.9, + "grad_norm": 1.171875, + "learning_rate": 0.00032979330896691574, + "loss": 0.2341, + "step": 287310 + }, + { + "epoch": 11.9, + "grad_norm": 0.244140625, + "learning_rate": 0.0003297830309398737, + "loss": 0.2078, + "step": 287320 + }, + { + "epoch": 11.9, + "grad_norm": 0.59375, + "learning_rate": 0.00032977275276268854, + "loss": 0.1863, + "step": 287330 + }, + { + "epoch": 11.9, + "grad_norm": 0.55859375, + "learning_rate": 0.0003297624744353797, + "loss": 0.2025, + "step": 287340 + }, + { + "epoch": 11.9, + "grad_norm": 0.96484375, + "learning_rate": 0.0003297521959579666, + "loss": 0.1781, + "step": 287350 + }, + { + "epoch": 11.9, + "grad_norm": 0.515625, + "learning_rate": 0.0003297419173304683, + "loss": 0.2248, + "step": 287360 + }, + { + "epoch": 11.9, + "grad_norm": 2.390625, + "learning_rate": 0.0003297316385529044, + "loss": 0.208, + "step": 287370 + }, + { + "epoch": 11.9, + "grad_norm": 1.21875, + "learning_rate": 0.0003297213596252942, + "loss": 0.2057, + "step": 287380 + }, + { + "epoch": 11.9, + "grad_norm": 0.8125, + "learning_rate": 0.0003297110805476569, + "loss": 0.1902, + "step": 287390 + }, + { + "epoch": 11.9, + "grad_norm": 0.5390625, + "learning_rate": 0.0003297008013200121, + "loss": 0.1848, + "step": 287400 + }, + { + "epoch": 11.9, + "grad_norm": 0.61328125, + "learning_rate": 0.0003296905219423789, + "loss": 0.2201, + "step": 287410 + }, + { + "epoch": 11.9, + "grad_norm": 1.0, + "learning_rate": 0.0003296802424147768, + "loss": 0.1187, + "step": 287420 + }, + { + "epoch": 11.91, + "grad_norm": 1.5390625, + "learning_rate": 0.0003296699627372251, + "loss": 0.1253, + "step": 287430 + }, + { + "epoch": 11.91, + "grad_norm": 0.427734375, + "learning_rate": 0.00032965968290974313, + "loss": 0.1831, + "step": 287440 + }, + { + "epoch": 11.91, + "grad_norm": 0.1806640625, + "learning_rate": 0.0003296494029323502, + "loss": 0.1943, + "step": 287450 + }, + { + "epoch": 11.91, + "grad_norm": 0.44140625, + "learning_rate": 0.0003296391228050657, + "loss": 0.2066, + "step": 287460 + }, + { + "epoch": 11.91, + "grad_norm": 0.58203125, + "learning_rate": 0.0003296288425279089, + "loss": 0.2119, + "step": 287470 + }, + { + "epoch": 11.91, + "grad_norm": 0.294921875, + "learning_rate": 0.00032961856210089936, + "loss": 0.2004, + "step": 287480 + }, + { + "epoch": 11.91, + "grad_norm": 0.71484375, + "learning_rate": 0.0003296082815240562, + "loss": 0.1863, + "step": 287490 + }, + { + "epoch": 11.91, + "grad_norm": 1.5234375, + "learning_rate": 0.0003295980007973989, + "loss": 0.1891, + "step": 287500 + }, + { + "epoch": 11.91, + "grad_norm": 1.15625, + "learning_rate": 0.0003295877199209467, + "loss": 0.1701, + "step": 287510 + }, + { + "epoch": 11.91, + "grad_norm": 0.75, + "learning_rate": 0.000329577438894719, + "loss": 0.1584, + "step": 287520 + }, + { + "epoch": 11.91, + "grad_norm": 0.84375, + "learning_rate": 0.0003295671577187352, + "loss": 0.1886, + "step": 287530 + }, + { + "epoch": 11.91, + "grad_norm": 1.21875, + "learning_rate": 0.0003295568763930147, + "loss": 0.1594, + "step": 287540 + }, + { + "epoch": 11.91, + "grad_norm": 0.63671875, + "learning_rate": 0.0003295465949175765, + "loss": 0.2044, + "step": 287550 + }, + { + "epoch": 11.91, + "grad_norm": 0.921875, + "learning_rate": 0.0003295363132924404, + "loss": 0.1737, + "step": 287560 + }, + { + "epoch": 11.91, + "grad_norm": 1.4609375, + "learning_rate": 0.0003295260315176255, + "loss": 0.224, + "step": 287570 + }, + { + "epoch": 11.91, + "grad_norm": 0.59375, + "learning_rate": 0.0003295157495931512, + "loss": 0.1869, + "step": 287580 + }, + { + "epoch": 11.91, + "grad_norm": 0.62890625, + "learning_rate": 0.00032950546751903684, + "loss": 0.2188, + "step": 287590 + }, + { + "epoch": 11.91, + "grad_norm": 0.60546875, + "learning_rate": 0.00032949518529530163, + "loss": 0.2022, + "step": 287600 + }, + { + "epoch": 11.91, + "grad_norm": 0.06787109375, + "learning_rate": 0.00032948490292196534, + "loss": 0.1426, + "step": 287610 + }, + { + "epoch": 11.91, + "grad_norm": 0.87890625, + "learning_rate": 0.0003294746203990469, + "loss": 0.1867, + "step": 287620 + }, + { + "epoch": 11.91, + "grad_norm": 0.8203125, + "learning_rate": 0.00032946433772656575, + "loss": 0.1758, + "step": 287630 + }, + { + "epoch": 11.91, + "grad_norm": 1.453125, + "learning_rate": 0.0003294540549045414, + "loss": 0.1608, + "step": 287640 + }, + { + "epoch": 11.91, + "grad_norm": 0.69921875, + "learning_rate": 0.000329443771932993, + "loss": 0.1879, + "step": 287650 + }, + { + "epoch": 11.91, + "grad_norm": 1.0703125, + "learning_rate": 0.00032943348881194005, + "loss": 0.1791, + "step": 287660 + }, + { + "epoch": 11.92, + "grad_norm": 0.625, + "learning_rate": 0.00032942320554140195, + "loss": 0.1899, + "step": 287670 + }, + { + "epoch": 11.92, + "grad_norm": 0.66015625, + "learning_rate": 0.0003294129221213978, + "loss": 0.2268, + "step": 287680 + }, + { + "epoch": 11.92, + "grad_norm": 0.953125, + "learning_rate": 0.0003294026385519472, + "loss": 0.2123, + "step": 287690 + }, + { + "epoch": 11.92, + "grad_norm": 0.73046875, + "learning_rate": 0.00032939235483306947, + "loss": 0.1847, + "step": 287700 + }, + { + "epoch": 11.92, + "grad_norm": 0.41796875, + "learning_rate": 0.0003293820709647837, + "loss": 0.2122, + "step": 287710 + }, + { + "epoch": 11.92, + "grad_norm": 0.90234375, + "learning_rate": 0.0003293717869471096, + "loss": 0.299, + "step": 287720 + }, + { + "epoch": 11.92, + "grad_norm": 0.45703125, + "learning_rate": 0.0003293615027800663, + "loss": 0.2375, + "step": 287730 + }, + { + "epoch": 11.92, + "grad_norm": 0.7265625, + "learning_rate": 0.0003293512184636733, + "loss": 0.1814, + "step": 287740 + }, + { + "epoch": 11.92, + "grad_norm": 0.78125, + "learning_rate": 0.00032934093399794985, + "loss": 0.2015, + "step": 287750 + }, + { + "epoch": 11.92, + "grad_norm": 0.03515625, + "learning_rate": 0.00032933064938291525, + "loss": 0.1643, + "step": 287760 + }, + { + "epoch": 11.92, + "grad_norm": 0.96875, + "learning_rate": 0.000329320364618589, + "loss": 0.2225, + "step": 287770 + }, + { + "epoch": 11.92, + "grad_norm": 0.484375, + "learning_rate": 0.0003293100797049904, + "loss": 0.1615, + "step": 287780 + }, + { + "epoch": 11.92, + "grad_norm": 0.640625, + "learning_rate": 0.00032929979464213873, + "loss": 0.2223, + "step": 287790 + }, + { + "epoch": 11.92, + "grad_norm": 0.953125, + "learning_rate": 0.0003292895094300535, + "loss": 0.179, + "step": 287800 + }, + { + "epoch": 11.92, + "grad_norm": 0.5546875, + "learning_rate": 0.0003292792240687539, + "loss": 0.2031, + "step": 287810 + }, + { + "epoch": 11.92, + "grad_norm": 1.4140625, + "learning_rate": 0.0003292689385582594, + "loss": 0.1677, + "step": 287820 + }, + { + "epoch": 11.92, + "grad_norm": 0.6640625, + "learning_rate": 0.0003292586528985894, + "loss": 0.1798, + "step": 287830 + }, + { + "epoch": 11.92, + "grad_norm": 0.640625, + "learning_rate": 0.00032924836708976297, + "loss": 0.1803, + "step": 287840 + }, + { + "epoch": 11.92, + "grad_norm": 0.396484375, + "learning_rate": 0.0003292380811317998, + "loss": 0.2112, + "step": 287850 + }, + { + "epoch": 11.92, + "grad_norm": 0.400390625, + "learning_rate": 0.0003292277950247191, + "loss": 0.1757, + "step": 287860 + }, + { + "epoch": 11.92, + "grad_norm": 1.09375, + "learning_rate": 0.0003292175087685403, + "loss": 0.1913, + "step": 287870 + }, + { + "epoch": 11.92, + "grad_norm": 1.6953125, + "learning_rate": 0.00032920722236328266, + "loss": 0.2451, + "step": 287880 + }, + { + "epoch": 11.92, + "grad_norm": 1.0625, + "learning_rate": 0.00032919693580896555, + "loss": 0.2036, + "step": 287890 + }, + { + "epoch": 11.92, + "grad_norm": 0.0, + "learning_rate": 0.0003291866491056084, + "loss": 0.2126, + "step": 287900 + }, + { + "epoch": 11.93, + "grad_norm": 0.400390625, + "learning_rate": 0.00032917636225323054, + "loss": 0.1447, + "step": 287910 + }, + { + "epoch": 11.93, + "grad_norm": 1.828125, + "learning_rate": 0.00032916607525185123, + "loss": 0.2427, + "step": 287920 + }, + { + "epoch": 11.93, + "grad_norm": 0.42578125, + "learning_rate": 0.00032915578810149, + "loss": 0.1662, + "step": 287930 + }, + { + "epoch": 11.93, + "grad_norm": 1.375, + "learning_rate": 0.0003291455008021661, + "loss": 0.2045, + "step": 287940 + }, + { + "epoch": 11.93, + "grad_norm": 0.484375, + "learning_rate": 0.0003291352133538989, + "loss": 0.1719, + "step": 287950 + }, + { + "epoch": 11.93, + "grad_norm": 0.431640625, + "learning_rate": 0.0003291249257567078, + "loss": 0.1923, + "step": 287960 + }, + { + "epoch": 11.93, + "grad_norm": 0.58984375, + "learning_rate": 0.00032911463801061215, + "loss": 0.2312, + "step": 287970 + }, + { + "epoch": 11.93, + "grad_norm": 0.84765625, + "learning_rate": 0.00032910435011563124, + "loss": 0.2062, + "step": 287980 + }, + { + "epoch": 11.93, + "grad_norm": 0.443359375, + "learning_rate": 0.00032909406207178455, + "loss": 0.1981, + "step": 287990 + }, + { + "epoch": 11.93, + "grad_norm": 0.400390625, + "learning_rate": 0.00032908377387909127, + "loss": 0.1814, + "step": 288000 + }, + { + "epoch": 11.93, + "grad_norm": 0.54296875, + "learning_rate": 0.000329073485537571, + "loss": 0.1871, + "step": 288010 + }, + { + "epoch": 11.93, + "grad_norm": 0.74609375, + "learning_rate": 0.0003290631970472429, + "loss": 0.1924, + "step": 288020 + }, + { + "epoch": 11.93, + "grad_norm": 0.9375, + "learning_rate": 0.00032905290840812637, + "loss": 0.1642, + "step": 288030 + }, + { + "epoch": 11.93, + "grad_norm": 0.75, + "learning_rate": 0.00032904261962024084, + "loss": 0.1905, + "step": 288040 + }, + { + "epoch": 11.93, + "grad_norm": 0.71875, + "learning_rate": 0.00032903233068360565, + "loss": 0.1978, + "step": 288050 + }, + { + "epoch": 11.93, + "grad_norm": 0.408203125, + "learning_rate": 0.0003290220415982401, + "loss": 0.2307, + "step": 288060 + }, + { + "epoch": 11.93, + "grad_norm": 0.58203125, + "learning_rate": 0.00032901175236416363, + "loss": 0.2125, + "step": 288070 + }, + { + "epoch": 11.93, + "grad_norm": 0.55859375, + "learning_rate": 0.0003290014629813956, + "loss": 0.2711, + "step": 288080 + }, + { + "epoch": 11.93, + "grad_norm": 1.109375, + "learning_rate": 0.00032899117344995535, + "loss": 0.2168, + "step": 288090 + }, + { + "epoch": 11.93, + "grad_norm": 0.70703125, + "learning_rate": 0.00032898088376986224, + "loss": 0.2188, + "step": 288100 + }, + { + "epoch": 11.93, + "grad_norm": 0.333984375, + "learning_rate": 0.0003289705939411356, + "loss": 0.1641, + "step": 288110 + }, + { + "epoch": 11.93, + "grad_norm": 0.41796875, + "learning_rate": 0.00032896030396379483, + "loss": 0.2206, + "step": 288120 + }, + { + "epoch": 11.93, + "grad_norm": 1.1796875, + "learning_rate": 0.00032895001383785935, + "loss": 0.1881, + "step": 288130 + }, + { + "epoch": 11.93, + "grad_norm": 0.30078125, + "learning_rate": 0.0003289397235633484, + "loss": 0.2116, + "step": 288140 + }, + { + "epoch": 11.94, + "grad_norm": 0.5, + "learning_rate": 0.00032892943314028145, + "loss": 0.1869, + "step": 288150 + }, + { + "epoch": 11.94, + "grad_norm": 1.8359375, + "learning_rate": 0.00032891914256867784, + "loss": 0.2092, + "step": 288160 + }, + { + "epoch": 11.94, + "grad_norm": 0.7109375, + "learning_rate": 0.0003289088518485569, + "loss": 0.1874, + "step": 288170 + }, + { + "epoch": 11.94, + "grad_norm": 0.94921875, + "learning_rate": 0.000328898560979938, + "loss": 0.2432, + "step": 288180 + }, + { + "epoch": 11.94, + "grad_norm": 0.640625, + "learning_rate": 0.0003288882699628406, + "loss": 0.1985, + "step": 288190 + }, + { + "epoch": 11.94, + "grad_norm": 0.84375, + "learning_rate": 0.000328877978797284, + "loss": 0.2476, + "step": 288200 + }, + { + "epoch": 11.94, + "grad_norm": 1.203125, + "learning_rate": 0.00032886768748328747, + "loss": 0.2136, + "step": 288210 + }, + { + "epoch": 11.94, + "grad_norm": 2.125, + "learning_rate": 0.00032885739602087056, + "loss": 0.2322, + "step": 288220 + }, + { + "epoch": 11.94, + "grad_norm": 1.109375, + "learning_rate": 0.0003288471044100525, + "loss": 0.1502, + "step": 288230 + }, + { + "epoch": 11.94, + "grad_norm": 0.5859375, + "learning_rate": 0.0003288368126508527, + "loss": 0.1761, + "step": 288240 + }, + { + "epoch": 11.94, + "grad_norm": 0.6953125, + "learning_rate": 0.00032882652074329056, + "loss": 0.208, + "step": 288250 + }, + { + "epoch": 11.94, + "grad_norm": 0.302734375, + "learning_rate": 0.0003288162286873854, + "loss": 0.2344, + "step": 288260 + }, + { + "epoch": 11.94, + "grad_norm": 0.453125, + "learning_rate": 0.0003288059364831566, + "loss": 0.2153, + "step": 288270 + }, + { + "epoch": 11.94, + "grad_norm": 1.015625, + "learning_rate": 0.0003287956441306235, + "loss": 0.1701, + "step": 288280 + }, + { + "epoch": 11.94, + "grad_norm": 0.67578125, + "learning_rate": 0.0003287853516298056, + "loss": 0.1655, + "step": 288290 + }, + { + "epoch": 11.94, + "grad_norm": 0.83984375, + "learning_rate": 0.0003287750589807221, + "loss": 0.2143, + "step": 288300 + }, + { + "epoch": 11.94, + "grad_norm": 0.765625, + "learning_rate": 0.0003287647661833925, + "loss": 0.184, + "step": 288310 + }, + { + "epoch": 11.94, + "grad_norm": 1.0078125, + "learning_rate": 0.00032875447323783613, + "loss": 0.197, + "step": 288320 + }, + { + "epoch": 11.94, + "grad_norm": 0.3046875, + "learning_rate": 0.0003287441801440723, + "loss": 0.1477, + "step": 288330 + }, + { + "epoch": 11.94, + "grad_norm": 0.6640625, + "learning_rate": 0.0003287338869021205, + "loss": 0.2026, + "step": 288340 + }, + { + "epoch": 11.94, + "grad_norm": 0.765625, + "learning_rate": 0.0003287235935119999, + "loss": 0.1796, + "step": 288350 + }, + { + "epoch": 11.94, + "grad_norm": 0.41015625, + "learning_rate": 0.00032871329997373005, + "loss": 0.2083, + "step": 288360 + }, + { + "epoch": 11.94, + "grad_norm": 1.0859375, + "learning_rate": 0.0003287030062873303, + "loss": 0.1986, + "step": 288370 + }, + { + "epoch": 11.94, + "grad_norm": 0.341796875, + "learning_rate": 0.00032869271245282, + "loss": 0.1893, + "step": 288380 + }, + { + "epoch": 11.95, + "grad_norm": 0.76171875, + "learning_rate": 0.0003286824184702185, + "loss": 0.19, + "step": 288390 + }, + { + "epoch": 11.95, + "grad_norm": 0.76171875, + "learning_rate": 0.00032867212433954513, + "loss": 0.2058, + "step": 288400 + }, + { + "epoch": 11.95, + "grad_norm": 0.87890625, + "learning_rate": 0.0003286618300608194, + "loss": 0.2365, + "step": 288410 + }, + { + "epoch": 11.95, + "grad_norm": 0.53125, + "learning_rate": 0.0003286515356340606, + "loss": 0.1935, + "step": 288420 + }, + { + "epoch": 11.95, + "grad_norm": 1.1875, + "learning_rate": 0.00032864124105928805, + "loss": 0.1732, + "step": 288430 + }, + { + "epoch": 11.95, + "grad_norm": 0.6640625, + "learning_rate": 0.00032863094633652114, + "loss": 0.2233, + "step": 288440 + }, + { + "epoch": 11.95, + "grad_norm": 0.51953125, + "learning_rate": 0.00032862065146577933, + "loss": 0.1803, + "step": 288450 + }, + { + "epoch": 11.95, + "grad_norm": 1.1796875, + "learning_rate": 0.000328610356447082, + "loss": 0.2491, + "step": 288460 + }, + { + "epoch": 11.95, + "grad_norm": 1.171875, + "learning_rate": 0.0003286000612804484, + "loss": 0.1782, + "step": 288470 + }, + { + "epoch": 11.95, + "grad_norm": 0.4765625, + "learning_rate": 0.000328589765965898, + "loss": 0.2006, + "step": 288480 + }, + { + "epoch": 11.95, + "grad_norm": 0.6015625, + "learning_rate": 0.0003285794705034501, + "loss": 0.2009, + "step": 288490 + }, + { + "epoch": 11.95, + "grad_norm": 0.61328125, + "learning_rate": 0.0003285691748931242, + "loss": 0.2087, + "step": 288500 + }, + { + "epoch": 11.95, + "grad_norm": 0.80859375, + "learning_rate": 0.00032855887913493955, + "loss": 0.1858, + "step": 288510 + }, + { + "epoch": 11.95, + "grad_norm": 0.66015625, + "learning_rate": 0.0003285485832289156, + "loss": 0.2156, + "step": 288520 + }, + { + "epoch": 11.95, + "grad_norm": 1.8203125, + "learning_rate": 0.0003285382871750716, + "loss": 0.1837, + "step": 288530 + }, + { + "epoch": 11.95, + "grad_norm": 0.396484375, + "learning_rate": 0.0003285279909734272, + "loss": 0.2495, + "step": 288540 + }, + { + "epoch": 11.95, + "grad_norm": 0.8828125, + "learning_rate": 0.00032851769462400145, + "loss": 0.1734, + "step": 288550 + }, + { + "epoch": 11.95, + "grad_norm": 0.73046875, + "learning_rate": 0.000328507398126814, + "loss": 0.1966, + "step": 288560 + }, + { + "epoch": 11.95, + "grad_norm": 0.90625, + "learning_rate": 0.00032849710148188395, + "loss": 0.2494, + "step": 288570 + }, + { + "epoch": 11.95, + "grad_norm": 1.4453125, + "learning_rate": 0.00032848680468923095, + "loss": 0.1708, + "step": 288580 + }, + { + "epoch": 11.95, + "grad_norm": 0.53125, + "learning_rate": 0.0003284765077488742, + "loss": 0.1999, + "step": 288590 + }, + { + "epoch": 11.95, + "grad_norm": 0.2470703125, + "learning_rate": 0.00032846621066083315, + "loss": 0.2396, + "step": 288600 + }, + { + "epoch": 11.95, + "grad_norm": 0.52734375, + "learning_rate": 0.0003284559134251272, + "loss": 0.1785, + "step": 288610 + }, + { + "epoch": 11.95, + "grad_norm": 0.6796875, + "learning_rate": 0.0003284456160417757, + "loss": 0.2532, + "step": 288620 + }, + { + "epoch": 11.96, + "grad_norm": 1.9375, + "learning_rate": 0.000328435318510798, + "loss": 0.2156, + "step": 288630 + }, + { + "epoch": 11.96, + "grad_norm": 1.0625, + "learning_rate": 0.0003284250208322135, + "loss": 0.2156, + "step": 288640 + }, + { + "epoch": 11.96, + "grad_norm": 0.5859375, + "learning_rate": 0.0003284147230060416, + "loss": 0.1981, + "step": 288650 + }, + { + "epoch": 11.96, + "grad_norm": 0.671875, + "learning_rate": 0.0003284044250323016, + "loss": 0.1992, + "step": 288660 + }, + { + "epoch": 11.96, + "grad_norm": 0.2216796875, + "learning_rate": 0.000328394126911013, + "loss": 0.191, + "step": 288670 + }, + { + "epoch": 11.96, + "grad_norm": 1.25, + "learning_rate": 0.0003283838286421951, + "loss": 0.1803, + "step": 288680 + }, + { + "epoch": 11.96, + "grad_norm": 0.38671875, + "learning_rate": 0.00032837353022586725, + "loss": 0.21, + "step": 288690 + }, + { + "epoch": 11.96, + "grad_norm": 0.7421875, + "learning_rate": 0.0003283632316620489, + "loss": 0.1779, + "step": 288700 + }, + { + "epoch": 11.96, + "grad_norm": 0.65625, + "learning_rate": 0.0003283529329507595, + "loss": 0.1846, + "step": 288710 + }, + { + "epoch": 11.96, + "grad_norm": 1.0703125, + "learning_rate": 0.0003283426340920183, + "loss": 0.1894, + "step": 288720 + }, + { + "epoch": 11.96, + "grad_norm": 1.578125, + "learning_rate": 0.0003283323350858446, + "loss": 0.191, + "step": 288730 + }, + { + "epoch": 11.96, + "grad_norm": 0.8046875, + "learning_rate": 0.00032832203593225807, + "loss": 0.1945, + "step": 288740 + }, + { + "epoch": 11.96, + "grad_norm": 1.8203125, + "learning_rate": 0.0003283117366312779, + "loss": 0.198, + "step": 288750 + }, + { + "epoch": 11.96, + "grad_norm": 0.79296875, + "learning_rate": 0.00032830143718292335, + "loss": 0.1799, + "step": 288760 + }, + { + "epoch": 11.96, + "grad_norm": 2.65625, + "learning_rate": 0.00032829113758721406, + "loss": 0.2189, + "step": 288770 + }, + { + "epoch": 11.96, + "grad_norm": 0.73828125, + "learning_rate": 0.00032828083784416934, + "loss": 0.2192, + "step": 288780 + }, + { + "epoch": 11.96, + "grad_norm": 0.60546875, + "learning_rate": 0.0003282705379538086, + "loss": 0.2084, + "step": 288790 + }, + { + "epoch": 11.96, + "grad_norm": 0.765625, + "learning_rate": 0.000328260237916151, + "loss": 0.1978, + "step": 288800 + }, + { + "epoch": 11.96, + "grad_norm": 1.0390625, + "learning_rate": 0.00032824993773121616, + "loss": 0.2116, + "step": 288810 + }, + { + "epoch": 11.96, + "grad_norm": 0.37890625, + "learning_rate": 0.00032823963739902335, + "loss": 0.2004, + "step": 288820 + }, + { + "epoch": 11.96, + "grad_norm": 1.359375, + "learning_rate": 0.0003282293369195919, + "loss": 0.1721, + "step": 288830 + }, + { + "epoch": 11.96, + "grad_norm": 0.75, + "learning_rate": 0.00032821903629294145, + "loss": 0.2055, + "step": 288840 + }, + { + "epoch": 11.96, + "grad_norm": 0.51171875, + "learning_rate": 0.0003282087355190912, + "loss": 0.1758, + "step": 288850 + }, + { + "epoch": 11.96, + "grad_norm": 1.5703125, + "learning_rate": 0.00032819843459806053, + "loss": 0.2052, + "step": 288860 + }, + { + "epoch": 11.96, + "grad_norm": 1.0625, + "learning_rate": 0.0003281881335298689, + "loss": 0.179, + "step": 288870 + }, + { + "epoch": 11.97, + "grad_norm": 1.0078125, + "learning_rate": 0.00032817783231453557, + "loss": 0.1955, + "step": 288880 + }, + { + "epoch": 11.97, + "grad_norm": 0.66015625, + "learning_rate": 0.00032816753095208, + "loss": 0.1959, + "step": 288890 + }, + { + "epoch": 11.97, + "grad_norm": 0.5859375, + "learning_rate": 0.00032815722944252166, + "loss": 0.184, + "step": 288900 + }, + { + "epoch": 11.97, + "grad_norm": 0.53515625, + "learning_rate": 0.0003281469277858797, + "loss": 0.1879, + "step": 288910 + }, + { + "epoch": 11.97, + "grad_norm": 0.361328125, + "learning_rate": 0.0003281366259821738, + "loss": 0.1835, + "step": 288920 + }, + { + "epoch": 11.97, + "grad_norm": 0.6015625, + "learning_rate": 0.00032812632403142315, + "loss": 0.2241, + "step": 288930 + }, + { + "epoch": 11.97, + "grad_norm": 0.51171875, + "learning_rate": 0.0003281160219336472, + "loss": 0.1788, + "step": 288940 + }, + { + "epoch": 11.97, + "grad_norm": 1.1640625, + "learning_rate": 0.0003281057196888654, + "loss": 0.234, + "step": 288950 + }, + { + "epoch": 11.97, + "grad_norm": 0.396484375, + "learning_rate": 0.00032809541729709703, + "loss": 0.2093, + "step": 288960 + }, + { + "epoch": 11.97, + "grad_norm": 1.1171875, + "learning_rate": 0.00032808511475836144, + "loss": 0.23, + "step": 288970 + }, + { + "epoch": 11.97, + "grad_norm": 0.484375, + "learning_rate": 0.0003280748120726782, + "loss": 0.1187, + "step": 288980 + }, + { + "epoch": 11.97, + "grad_norm": 0.92578125, + "learning_rate": 0.0003280645092400665, + "loss": 0.2374, + "step": 288990 + }, + { + "epoch": 11.97, + "grad_norm": 0.001983642578125, + "learning_rate": 0.00032805420626054596, + "loss": 0.1801, + "step": 289000 + }, + { + "epoch": 11.97, + "grad_norm": 0.98828125, + "learning_rate": 0.0003280439031341357, + "loss": 0.2424, + "step": 289010 + }, + { + "epoch": 11.97, + "grad_norm": 0.330078125, + "learning_rate": 0.0003280335998608553, + "loss": 0.2041, + "step": 289020 + }, + { + "epoch": 11.97, + "grad_norm": 0.81640625, + "learning_rate": 0.0003280232964407241, + "loss": 0.1493, + "step": 289030 + }, + { + "epoch": 11.97, + "grad_norm": 0.91796875, + "learning_rate": 0.00032801299287376143, + "loss": 0.2012, + "step": 289040 + }, + { + "epoch": 11.97, + "grad_norm": 0.6640625, + "learning_rate": 0.00032800268915998676, + "loss": 0.2153, + "step": 289050 + }, + { + "epoch": 11.97, + "grad_norm": 0.67578125, + "learning_rate": 0.0003279923852994195, + "loss": 0.2259, + "step": 289060 + }, + { + "epoch": 11.97, + "grad_norm": 0.921875, + "learning_rate": 0.0003279820812920789, + "loss": 0.2455, + "step": 289070 + }, + { + "epoch": 11.97, + "grad_norm": 1.0, + "learning_rate": 0.00032797177713798453, + "loss": 0.1669, + "step": 289080 + }, + { + "epoch": 11.97, + "grad_norm": 0.58203125, + "learning_rate": 0.00032796147283715565, + "loss": 0.1853, + "step": 289090 + }, + { + "epoch": 11.97, + "grad_norm": 1.1015625, + "learning_rate": 0.0003279511683896117, + "loss": 0.2256, + "step": 289100 + }, + { + "epoch": 11.97, + "grad_norm": 0.734375, + "learning_rate": 0.0003279408637953721, + "loss": 0.1628, + "step": 289110 + }, + { + "epoch": 11.98, + "grad_norm": 0.734375, + "learning_rate": 0.0003279305590544562, + "loss": 0.2171, + "step": 289120 + }, + { + "epoch": 11.98, + "grad_norm": 0.69921875, + "learning_rate": 0.00032792025416688344, + "loss": 0.2554, + "step": 289130 + }, + { + "epoch": 11.98, + "grad_norm": 0.640625, + "learning_rate": 0.00032790994913267315, + "loss": 0.1852, + "step": 289140 + }, + { + "epoch": 11.98, + "grad_norm": 0.59375, + "learning_rate": 0.00032789964395184463, + "loss": 0.2048, + "step": 289150 + }, + { + "epoch": 11.98, + "grad_norm": 0.0, + "learning_rate": 0.00032788933862441756, + "loss": 0.1341, + "step": 289160 + }, + { + "epoch": 11.98, + "grad_norm": 0.515625, + "learning_rate": 0.00032787903315041107, + "loss": 0.2042, + "step": 289170 + }, + { + "epoch": 11.98, + "grad_norm": 0.86328125, + "learning_rate": 0.0003278687275298446, + "loss": 0.2158, + "step": 289180 + }, + { + "epoch": 11.98, + "grad_norm": 1.03125, + "learning_rate": 0.0003278584217627378, + "loss": 0.1711, + "step": 289190 + }, + { + "epoch": 11.98, + "grad_norm": 0.625, + "learning_rate": 0.00032784811584910965, + "loss": 0.1957, + "step": 289200 + }, + { + "epoch": 11.98, + "grad_norm": 0.8125, + "learning_rate": 0.0003278378097889799, + "loss": 0.2167, + "step": 289210 + }, + { + "epoch": 11.98, + "grad_norm": 0.8984375, + "learning_rate": 0.00032782750358236774, + "loss": 0.212, + "step": 289220 + }, + { + "epoch": 11.98, + "grad_norm": 0.328125, + "learning_rate": 0.0003278171972292925, + "loss": 0.1534, + "step": 289230 + }, + { + "epoch": 11.98, + "grad_norm": 0.388671875, + "learning_rate": 0.0003278068907297739, + "loss": 0.1894, + "step": 289240 + }, + { + "epoch": 11.98, + "grad_norm": 0.8046875, + "learning_rate": 0.00032779658408383104, + "loss": 0.2102, + "step": 289250 + }, + { + "epoch": 11.98, + "grad_norm": 1.1171875, + "learning_rate": 0.0003277862772914834, + "loss": 0.2185, + "step": 289260 + }, + { + "epoch": 11.98, + "grad_norm": 1.2265625, + "learning_rate": 0.0003277759703527504, + "loss": 0.1768, + "step": 289270 + }, + { + "epoch": 11.98, + "grad_norm": 0.33984375, + "learning_rate": 0.0003277656632676514, + "loss": 0.2159, + "step": 289280 + }, + { + "epoch": 11.98, + "grad_norm": 0.86328125, + "learning_rate": 0.0003277553560362059, + "loss": 0.2179, + "step": 289290 + }, + { + "epoch": 11.98, + "grad_norm": 1.125, + "learning_rate": 0.0003277450486584332, + "loss": 0.2209, + "step": 289300 + }, + { + "epoch": 11.98, + "grad_norm": 1.3046875, + "learning_rate": 0.00032773474113435266, + "loss": 0.212, + "step": 289310 + }, + { + "epoch": 11.98, + "grad_norm": 0.8515625, + "learning_rate": 0.0003277244334639837, + "loss": 0.1947, + "step": 289320 + }, + { + "epoch": 11.98, + "grad_norm": 0.73046875, + "learning_rate": 0.0003277141256473458, + "loss": 0.2114, + "step": 289330 + }, + { + "epoch": 11.98, + "grad_norm": 1.0703125, + "learning_rate": 0.0003277038176844583, + "loss": 0.2131, + "step": 289340 + }, + { + "epoch": 11.98, + "grad_norm": 0.78125, + "learning_rate": 0.0003276935095753407, + "loss": 0.1936, + "step": 289350 + }, + { + "epoch": 11.99, + "grad_norm": 1.8671875, + "learning_rate": 0.0003276832013200121, + "loss": 0.2022, + "step": 289360 + }, + { + "epoch": 11.99, + "grad_norm": 0.61328125, + "learning_rate": 0.0003276728929184923, + "loss": 0.162, + "step": 289370 + }, + { + "epoch": 11.99, + "grad_norm": 0.85546875, + "learning_rate": 0.00032766258437080044, + "loss": 0.2139, + "step": 289380 + }, + { + "epoch": 11.99, + "grad_norm": 0.53125, + "learning_rate": 0.0003276522756769559, + "loss": 0.2066, + "step": 289390 + }, + { + "epoch": 11.99, + "grad_norm": 0.2294921875, + "learning_rate": 0.00032764196683697825, + "loss": 0.1697, + "step": 289400 + }, + { + "epoch": 11.99, + "grad_norm": 0.7265625, + "learning_rate": 0.0003276316578508868, + "loss": 0.1943, + "step": 289410 + }, + { + "epoch": 11.99, + "grad_norm": 0.85546875, + "learning_rate": 0.000327621348718701, + "loss": 0.1626, + "step": 289420 + }, + { + "epoch": 11.99, + "grad_norm": 0.71875, + "learning_rate": 0.0003276110394404401, + "loss": 0.2262, + "step": 289430 + }, + { + "epoch": 11.99, + "grad_norm": 0.703125, + "learning_rate": 0.0003276007300161236, + "loss": 0.2243, + "step": 289440 + }, + { + "epoch": 11.99, + "grad_norm": 2.15625, + "learning_rate": 0.000327590420445771, + "loss": 0.2059, + "step": 289450 + }, + { + "epoch": 11.99, + "grad_norm": 0.83984375, + "learning_rate": 0.0003275801107294015, + "loss": 0.1694, + "step": 289460 + }, + { + "epoch": 11.99, + "grad_norm": 0.74609375, + "learning_rate": 0.0003275698008670347, + "loss": 0.1714, + "step": 289470 + }, + { + "epoch": 11.99, + "grad_norm": 1.2890625, + "learning_rate": 0.0003275594908586899, + "loss": 0.2145, + "step": 289480 + }, + { + "epoch": 11.99, + "grad_norm": 0.73828125, + "learning_rate": 0.0003275491807043865, + "loss": 0.2237, + "step": 289490 + }, + { + "epoch": 11.99, + "grad_norm": 0.4609375, + "learning_rate": 0.00032753887040414394, + "loss": 0.2198, + "step": 289500 + }, + { + "epoch": 11.99, + "grad_norm": 1.1484375, + "learning_rate": 0.00032752855995798157, + "loss": 0.1396, + "step": 289510 + }, + { + "epoch": 11.99, + "grad_norm": 0.66796875, + "learning_rate": 0.00032751824936591876, + "loss": 0.2205, + "step": 289520 + }, + { + "epoch": 11.99, + "grad_norm": 1.109375, + "learning_rate": 0.00032750793862797514, + "loss": 0.2431, + "step": 289530 + }, + { + "epoch": 11.99, + "grad_norm": 0.90234375, + "learning_rate": 0.0003274976277441698, + "loss": 0.1657, + "step": 289540 + }, + { + "epoch": 11.99, + "grad_norm": 1.1875, + "learning_rate": 0.0003274873167145223, + "loss": 0.2339, + "step": 289550 + }, + { + "epoch": 11.99, + "grad_norm": 0.7890625, + "learning_rate": 0.00032747700553905214, + "loss": 0.2224, + "step": 289560 + }, + { + "epoch": 11.99, + "grad_norm": 0.90234375, + "learning_rate": 0.00032746669421777853, + "loss": 0.1531, + "step": 289570 + }, + { + "epoch": 11.99, + "grad_norm": 0.0, + "learning_rate": 0.000327456382750721, + "loss": 0.1603, + "step": 289580 + }, + { + "epoch": 11.99, + "grad_norm": 2.546875, + "learning_rate": 0.0003274460711378989, + "loss": 0.1944, + "step": 289590 + }, + { + "epoch": 12.0, + "grad_norm": 1.125, + "learning_rate": 0.00032743575937933166, + "loss": 0.2111, + "step": 289600 + }, + { + "epoch": 12.0, + "grad_norm": 0.765625, + "learning_rate": 0.0003274254474750388, + "loss": 0.1722, + "step": 289610 + }, + { + "epoch": 12.0, + "grad_norm": 0.76953125, + "learning_rate": 0.00032741513542503946, + "loss": 0.217, + "step": 289620 + }, + { + "epoch": 12.0, + "grad_norm": 0.357421875, + "learning_rate": 0.0003274048232293533, + "loss": 0.1867, + "step": 289630 + }, + { + "epoch": 12.0, + "grad_norm": 0.49609375, + "learning_rate": 0.00032739451088799955, + "loss": 0.1823, + "step": 289640 + }, + { + "epoch": 12.0, + "grad_norm": 0.451171875, + "learning_rate": 0.0003273841984009977, + "loss": 0.1837, + "step": 289650 + }, + { + "epoch": 12.0, + "grad_norm": 1.1875, + "learning_rate": 0.0003273738857683671, + "loss": 0.1709, + "step": 289660 + }, + { + "epoch": 12.0, + "grad_norm": 0.44921875, + "learning_rate": 0.0003273635729901273, + "loss": 0.2498, + "step": 289670 + }, + { + "epoch": 12.0, + "grad_norm": 0.5546875, + "learning_rate": 0.00032735326006629755, + "loss": 0.1871, + "step": 289680 + }, + { + "epoch": 12.0, + "grad_norm": 2.15625, + "learning_rate": 0.00032734294699689735, + "loss": 0.1879, + "step": 289690 + }, + { + "epoch": 12.0, + "grad_norm": 0.93359375, + "learning_rate": 0.00032733263378194604, + "loss": 0.2316, + "step": 289700 + }, + { + "epoch": 12.0, + "grad_norm": 0.2890625, + "learning_rate": 0.0003273223204214631, + "loss": 0.2323, + "step": 289710 + }, + { + "epoch": 12.0, + "grad_norm": 0.8359375, + "learning_rate": 0.00032731200691546794, + "loss": 0.2035, + "step": 289720 + }, + { + "epoch": 12.0, + "grad_norm": 0.8515625, + "learning_rate": 0.00032730169326397986, + "loss": 0.1696, + "step": 289730 + }, + { + "epoch": 12.0, + "grad_norm": 1.390625, + "learning_rate": 0.00032729137946701836, + "loss": 0.2162, + "step": 289740 + }, + { + "epoch": 12.0, + "grad_norm": 0.53125, + "learning_rate": 0.0003272810655246028, + "loss": 0.1495, + "step": 289750 + }, + { + "epoch": 12.0, + "grad_norm": 1.2890625, + "learning_rate": 0.00032727075143675267, + "loss": 0.1842, + "step": 289760 + }, + { + "epoch": 12.0, + "grad_norm": 0.298828125, + "learning_rate": 0.0003272604372034873, + "loss": 0.1879, + "step": 289770 + }, + { + "epoch": 12.0, + "grad_norm": 0.6875, + "learning_rate": 0.0003272501228248261, + "loss": 0.1568, + "step": 289780 + }, + { + "epoch": 12.0, + "grad_norm": 0.625, + "learning_rate": 0.0003272398083007886, + "loss": 0.2204, + "step": 289790 + }, + { + "epoch": 12.0, + "grad_norm": 0.9453125, + "learning_rate": 0.00032722949363139407, + "loss": 0.189, + "step": 289800 + }, + { + "epoch": 12.0, + "grad_norm": 0.8828125, + "learning_rate": 0.000327219178816662, + "loss": 0.1772, + "step": 289810 + }, + { + "epoch": 12.0, + "grad_norm": 1.109375, + "learning_rate": 0.00032720886385661173, + "loss": 0.1706, + "step": 289820 + }, + { + "epoch": 12.0, + "grad_norm": 0.93359375, + "learning_rate": 0.00032719854875126275, + "loss": 0.2117, + "step": 289830 + }, + { + "epoch": 12.01, + "grad_norm": 0.482421875, + "learning_rate": 0.0003271882335006344, + "loss": 0.2128, + "step": 289840 + }, + { + "epoch": 12.01, + "grad_norm": 0.8984375, + "learning_rate": 0.00032717791810474626, + "loss": 0.2623, + "step": 289850 + }, + { + "epoch": 12.01, + "grad_norm": 0.49609375, + "learning_rate": 0.00032716760256361744, + "loss": 0.1588, + "step": 289860 + }, + { + "epoch": 12.01, + "grad_norm": 0.1015625, + "learning_rate": 0.0003271572868772676, + "loss": 0.2184, + "step": 289870 + }, + { + "epoch": 12.01, + "grad_norm": 1.4140625, + "learning_rate": 0.00032714697104571606, + "loss": 0.1546, + "step": 289880 + }, + { + "epoch": 12.01, + "grad_norm": 0.66796875, + "learning_rate": 0.0003271366550689823, + "loss": 0.1916, + "step": 289890 + }, + { + "epoch": 12.01, + "grad_norm": 0.6875, + "learning_rate": 0.00032712633894708563, + "loss": 0.1817, + "step": 289900 + }, + { + "epoch": 12.01, + "grad_norm": 0.63671875, + "learning_rate": 0.0003271160226800455, + "loss": 0.2249, + "step": 289910 + }, + { + "epoch": 12.01, + "grad_norm": 0.51171875, + "learning_rate": 0.00032710570626788137, + "loss": 0.1914, + "step": 289920 + }, + { + "epoch": 12.01, + "grad_norm": 0.6171875, + "learning_rate": 0.0003270953897106127, + "loss": 0.2321, + "step": 289930 + }, + { + "epoch": 12.01, + "grad_norm": 0.5234375, + "learning_rate": 0.0003270850730082588, + "loss": 0.1961, + "step": 289940 + }, + { + "epoch": 12.01, + "grad_norm": 0.6875, + "learning_rate": 0.00032707475616083903, + "loss": 0.1722, + "step": 289950 + }, + { + "epoch": 12.01, + "grad_norm": 1.0546875, + "learning_rate": 0.00032706443916837296, + "loss": 0.1577, + "step": 289960 + }, + { + "epoch": 12.01, + "grad_norm": 0.8515625, + "learning_rate": 0.00032705412203088, + "loss": 0.197, + "step": 289970 + }, + { + "epoch": 12.01, + "grad_norm": 0.87109375, + "learning_rate": 0.00032704380474837943, + "loss": 0.1574, + "step": 289980 + }, + { + "epoch": 12.01, + "grad_norm": 0.36328125, + "learning_rate": 0.00032703348732089074, + "loss": 0.1637, + "step": 289990 + }, + { + "epoch": 12.01, + "grad_norm": 0.41796875, + "learning_rate": 0.00032702316974843326, + "loss": 0.1872, + "step": 290000 + }, + { + "epoch": 12.01, + "grad_norm": 0.66796875, + "learning_rate": 0.0003270128520310266, + "loss": 0.183, + "step": 290010 + }, + { + "epoch": 12.01, + "grad_norm": 0.8203125, + "learning_rate": 0.0003270025341686901, + "loss": 0.1924, + "step": 290020 + }, + { + "epoch": 12.01, + "grad_norm": 1.171875, + "learning_rate": 0.0003269922161614431, + "loss": 0.1999, + "step": 290030 + }, + { + "epoch": 12.01, + "grad_norm": 1.2578125, + "learning_rate": 0.000326981898009305, + "loss": 0.1836, + "step": 290040 + }, + { + "epoch": 12.01, + "grad_norm": 1.0078125, + "learning_rate": 0.00032697157971229543, + "loss": 0.2015, + "step": 290050 + }, + { + "epoch": 12.01, + "grad_norm": 1.0625, + "learning_rate": 0.0003269612612704336, + "loss": 0.2205, + "step": 290060 + }, + { + "epoch": 12.01, + "grad_norm": 1.0859375, + "learning_rate": 0.00032695094268373885, + "loss": 0.1937, + "step": 290070 + }, + { + "epoch": 12.02, + "grad_norm": 0.2314453125, + "learning_rate": 0.0003269406239522309, + "loss": 0.2064, + "step": 290080 + }, + { + "epoch": 12.02, + "grad_norm": 0.55859375, + "learning_rate": 0.0003269303050759289, + "loss": 0.1983, + "step": 290090 + }, + { + "epoch": 12.02, + "grad_norm": 0.8984375, + "learning_rate": 0.00032691998605485247, + "loss": 0.1682, + "step": 290100 + }, + { + "epoch": 12.02, + "grad_norm": 0.60546875, + "learning_rate": 0.00032690966688902085, + "loss": 0.1904, + "step": 290110 + }, + { + "epoch": 12.02, + "grad_norm": 0.54296875, + "learning_rate": 0.00032689934757845356, + "loss": 0.1556, + "step": 290120 + }, + { + "epoch": 12.02, + "grad_norm": 0.98828125, + "learning_rate": 0.0003268890281231701, + "loss": 0.1831, + "step": 290130 + }, + { + "epoch": 12.02, + "grad_norm": 2.265625, + "learning_rate": 0.0003268787085231897, + "loss": 0.2013, + "step": 290140 + }, + { + "epoch": 12.02, + "grad_norm": 0.90625, + "learning_rate": 0.00032686838877853184, + "loss": 0.2034, + "step": 290150 + }, + { + "epoch": 12.02, + "grad_norm": 0.79296875, + "learning_rate": 0.00032685806888921597, + "loss": 0.175, + "step": 290160 + }, + { + "epoch": 12.02, + "grad_norm": 1.0703125, + "learning_rate": 0.0003268477488552616, + "loss": 0.2139, + "step": 290170 + }, + { + "epoch": 12.02, + "grad_norm": 0.345703125, + "learning_rate": 0.0003268374286766881, + "loss": 0.1972, + "step": 290180 + }, + { + "epoch": 12.02, + "grad_norm": 0.86328125, + "learning_rate": 0.00032682710835351477, + "loss": 0.1843, + "step": 290190 + }, + { + "epoch": 12.02, + "grad_norm": 0.87109375, + "learning_rate": 0.0003268167878857611, + "loss": 0.1769, + "step": 290200 + }, + { + "epoch": 12.02, + "grad_norm": 0.80859375, + "learning_rate": 0.0003268064672734466, + "loss": 0.1614, + "step": 290210 + }, + { + "epoch": 12.02, + "grad_norm": 0.546875, + "learning_rate": 0.0003267961465165905, + "loss": 0.1803, + "step": 290220 + }, + { + "epoch": 12.02, + "grad_norm": 1.625, + "learning_rate": 0.0003267858256152125, + "loss": 0.2128, + "step": 290230 + }, + { + "epoch": 12.02, + "grad_norm": 0.390625, + "learning_rate": 0.0003267755045693318, + "loss": 0.1948, + "step": 290240 + }, + { + "epoch": 12.02, + "grad_norm": 1.953125, + "learning_rate": 0.0003267651833789679, + "loss": 0.2207, + "step": 290250 + }, + { + "epoch": 12.02, + "grad_norm": 0.40625, + "learning_rate": 0.0003267548620441402, + "loss": 0.1955, + "step": 290260 + }, + { + "epoch": 12.02, + "grad_norm": 0.57421875, + "learning_rate": 0.0003267445405648681, + "loss": 0.1915, + "step": 290270 + }, + { + "epoch": 12.02, + "grad_norm": 0.578125, + "learning_rate": 0.00032673421894117115, + "loss": 0.1367, + "step": 290280 + }, + { + "epoch": 12.02, + "grad_norm": 1.1484375, + "learning_rate": 0.0003267238971730686, + "loss": 0.1862, + "step": 290290 + }, + { + "epoch": 12.02, + "grad_norm": 0.62890625, + "learning_rate": 0.00032671357526057996, + "loss": 0.231, + "step": 290300 + }, + { + "epoch": 12.02, + "grad_norm": 0.6640625, + "learning_rate": 0.00032670325320372477, + "loss": 0.2291, + "step": 290310 + }, + { + "epoch": 12.03, + "grad_norm": 1.078125, + "learning_rate": 0.00032669293100252226, + "loss": 0.1923, + "step": 290320 + }, + { + "epoch": 12.03, + "grad_norm": 0.56640625, + "learning_rate": 0.00032668260865699195, + "loss": 0.1825, + "step": 290330 + }, + { + "epoch": 12.03, + "grad_norm": 0.44140625, + "learning_rate": 0.00032667228616715325, + "loss": 0.1686, + "step": 290340 + }, + { + "epoch": 12.03, + "grad_norm": 1.03125, + "learning_rate": 0.00032666196353302556, + "loss": 0.2424, + "step": 290350 + }, + { + "epoch": 12.03, + "grad_norm": 0.6171875, + "learning_rate": 0.0003266516407546284, + "loss": 0.2003, + "step": 290360 + }, + { + "epoch": 12.03, + "grad_norm": 0.326171875, + "learning_rate": 0.0003266413178319811, + "loss": 0.2415, + "step": 290370 + }, + { + "epoch": 12.03, + "grad_norm": 0.275390625, + "learning_rate": 0.00032663099476510304, + "loss": 0.1828, + "step": 290380 + }, + { + "epoch": 12.03, + "grad_norm": 0.90234375, + "learning_rate": 0.00032662067155401385, + "loss": 0.2043, + "step": 290390 + }, + { + "epoch": 12.03, + "grad_norm": 0.40625, + "learning_rate": 0.00032661034819873283, + "loss": 0.2032, + "step": 290400 + }, + { + "epoch": 12.03, + "grad_norm": 0.59375, + "learning_rate": 0.0003266000246992793, + "loss": 0.2022, + "step": 290410 + }, + { + "epoch": 12.03, + "grad_norm": 1.1640625, + "learning_rate": 0.0003265897010556729, + "loss": 0.22, + "step": 290420 + }, + { + "epoch": 12.03, + "grad_norm": 0.8984375, + "learning_rate": 0.0003265793772679328, + "loss": 0.1909, + "step": 290430 + }, + { + "epoch": 12.03, + "grad_norm": 1.140625, + "learning_rate": 0.00032656905333607874, + "loss": 0.2475, + "step": 290440 + }, + { + "epoch": 12.03, + "grad_norm": 0.333984375, + "learning_rate": 0.00032655872926012984, + "loss": 0.2317, + "step": 290450 + }, + { + "epoch": 12.03, + "grad_norm": 1.21875, + "learning_rate": 0.0003265484050401058, + "loss": 0.1987, + "step": 290460 + }, + { + "epoch": 12.03, + "grad_norm": 0.70703125, + "learning_rate": 0.000326538080676026, + "loss": 0.2027, + "step": 290470 + }, + { + "epoch": 12.03, + "grad_norm": 1.390625, + "learning_rate": 0.00032652775616790966, + "loss": 0.1823, + "step": 290480 + }, + { + "epoch": 12.03, + "grad_norm": 1.7890625, + "learning_rate": 0.00032651743151577636, + "loss": 0.2311, + "step": 290490 + }, + { + "epoch": 12.03, + "grad_norm": 2.0, + "learning_rate": 0.00032650710671964557, + "loss": 0.1831, + "step": 290500 + }, + { + "epoch": 12.03, + "grad_norm": 1.0546875, + "learning_rate": 0.0003264967817795366, + "loss": 0.1475, + "step": 290510 + }, + { + "epoch": 12.03, + "grad_norm": 0.80859375, + "learning_rate": 0.00032648645669546895, + "loss": 0.1818, + "step": 290520 + }, + { + "epoch": 12.03, + "grad_norm": 0.8828125, + "learning_rate": 0.0003264761314674621, + "loss": 0.1684, + "step": 290530 + }, + { + "epoch": 12.03, + "grad_norm": 0.43359375, + "learning_rate": 0.0003264658060955354, + "loss": 0.2159, + "step": 290540 + }, + { + "epoch": 12.03, + "grad_norm": 0.0, + "learning_rate": 0.00032645548057970837, + "loss": 0.201, + "step": 290550 + }, + { + "epoch": 12.03, + "grad_norm": 0.6328125, + "learning_rate": 0.0003264451549200003, + "loss": 0.2175, + "step": 290560 + }, + { + "epoch": 12.04, + "grad_norm": 0.671875, + "learning_rate": 0.00032643482911643074, + "loss": 0.182, + "step": 290570 + }, + { + "epoch": 12.04, + "grad_norm": 0.421875, + "learning_rate": 0.0003264245031690191, + "loss": 0.1945, + "step": 290580 + }, + { + "epoch": 12.04, + "grad_norm": 0.82421875, + "learning_rate": 0.00032641417707778475, + "loss": 0.1983, + "step": 290590 + }, + { + "epoch": 12.04, + "grad_norm": 2.3125, + "learning_rate": 0.0003264038508427472, + "loss": 0.1834, + "step": 290600 + }, + { + "epoch": 12.04, + "grad_norm": 0.75390625, + "learning_rate": 0.0003263935244639259, + "loss": 0.1728, + "step": 290610 + }, + { + "epoch": 12.04, + "grad_norm": 0.53515625, + "learning_rate": 0.00032638319794134015, + "loss": 0.1757, + "step": 290620 + }, + { + "epoch": 12.04, + "grad_norm": 0.46875, + "learning_rate": 0.00032637287127500956, + "loss": 0.2147, + "step": 290630 + }, + { + "epoch": 12.04, + "grad_norm": 0.8125, + "learning_rate": 0.0003263625444649534, + "loss": 0.2034, + "step": 290640 + }, + { + "epoch": 12.04, + "grad_norm": 1.3046875, + "learning_rate": 0.0003263522175111912, + "loss": 0.2083, + "step": 290650 + }, + { + "epoch": 12.04, + "grad_norm": 0.94921875, + "learning_rate": 0.0003263418904137424, + "loss": 0.2319, + "step": 290660 + }, + { + "epoch": 12.04, + "grad_norm": 0.671875, + "learning_rate": 0.00032633156317262633, + "loss": 0.182, + "step": 290670 + }, + { + "epoch": 12.04, + "grad_norm": 0.66015625, + "learning_rate": 0.0003263212357878626, + "loss": 0.1887, + "step": 290680 + }, + { + "epoch": 12.04, + "grad_norm": 0.8203125, + "learning_rate": 0.00032631090825947047, + "loss": 0.1548, + "step": 290690 + }, + { + "epoch": 12.04, + "grad_norm": 1.1640625, + "learning_rate": 0.0003263005805874695, + "loss": 0.2415, + "step": 290700 + }, + { + "epoch": 12.04, + "grad_norm": 1.203125, + "learning_rate": 0.0003262902527718791, + "loss": 0.1944, + "step": 290710 + }, + { + "epoch": 12.04, + "grad_norm": 0.7109375, + "learning_rate": 0.0003262799248127186, + "loss": 0.1859, + "step": 290720 + }, + { + "epoch": 12.04, + "grad_norm": 0.91796875, + "learning_rate": 0.00032626959671000754, + "loss": 0.1539, + "step": 290730 + }, + { + "epoch": 12.04, + "grad_norm": 1.6171875, + "learning_rate": 0.0003262592684637654, + "loss": 0.2004, + "step": 290740 + }, + { + "epoch": 12.04, + "grad_norm": 0.78125, + "learning_rate": 0.0003262489400740115, + "loss": 0.2147, + "step": 290750 + }, + { + "epoch": 12.04, + "grad_norm": 1.375, + "learning_rate": 0.00032623861154076536, + "loss": 0.1936, + "step": 290760 + }, + { + "epoch": 12.04, + "grad_norm": 0.455078125, + "learning_rate": 0.00032622828286404633, + "loss": 0.1964, + "step": 290770 + }, + { + "epoch": 12.04, + "grad_norm": 1.0078125, + "learning_rate": 0.0003262179540438739, + "loss": 0.2135, + "step": 290780 + }, + { + "epoch": 12.04, + "grad_norm": 0.95703125, + "learning_rate": 0.00032620762508026756, + "loss": 0.2062, + "step": 290790 + }, + { + "epoch": 12.04, + "grad_norm": 1.4296875, + "learning_rate": 0.00032619729597324664, + "loss": 0.2143, + "step": 290800 + }, + { + "epoch": 12.05, + "grad_norm": 1.421875, + "learning_rate": 0.00032618696672283076, + "loss": 0.2066, + "step": 290810 + }, + { + "epoch": 12.05, + "grad_norm": 0.34765625, + "learning_rate": 0.00032617663732903917, + "loss": 0.1634, + "step": 290820 + }, + { + "epoch": 12.05, + "grad_norm": 0.369140625, + "learning_rate": 0.00032616630779189133, + "loss": 0.1785, + "step": 290830 + }, + { + "epoch": 12.05, + "grad_norm": 0.291015625, + "learning_rate": 0.00032615597811140677, + "loss": 0.1648, + "step": 290840 + }, + { + "epoch": 12.05, + "grad_norm": 0.40625, + "learning_rate": 0.0003261456482876049, + "loss": 0.2033, + "step": 290850 + }, + { + "epoch": 12.05, + "grad_norm": 0.7578125, + "learning_rate": 0.00032613531832050507, + "loss": 0.1219, + "step": 290860 + }, + { + "epoch": 12.05, + "grad_norm": 0.193359375, + "learning_rate": 0.0003261249882101269, + "loss": 0.1662, + "step": 290870 + }, + { + "epoch": 12.05, + "grad_norm": 0.625, + "learning_rate": 0.0003261146579564896, + "loss": 0.1354, + "step": 290880 + }, + { + "epoch": 12.05, + "grad_norm": 1.1328125, + "learning_rate": 0.0003261043275596128, + "loss": 0.1627, + "step": 290890 + }, + { + "epoch": 12.05, + "grad_norm": 0.83984375, + "learning_rate": 0.0003260939970195159, + "loss": 0.1326, + "step": 290900 + }, + { + "epoch": 12.05, + "grad_norm": 0.4765625, + "learning_rate": 0.00032608366633621826, + "loss": 0.1422, + "step": 290910 + }, + { + "epoch": 12.05, + "grad_norm": 1.015625, + "learning_rate": 0.0003260733355097394, + "loss": 0.175, + "step": 290920 + }, + { + "epoch": 12.05, + "grad_norm": 1.5234375, + "learning_rate": 0.00032606300454009874, + "loss": 0.2131, + "step": 290930 + }, + { + "epoch": 12.05, + "grad_norm": 1.234375, + "learning_rate": 0.0003260526734273157, + "loss": 0.1923, + "step": 290940 + }, + { + "epoch": 12.05, + "grad_norm": 0.373046875, + "learning_rate": 0.0003260423421714098, + "loss": 0.1594, + "step": 290950 + }, + { + "epoch": 12.05, + "grad_norm": 1.2109375, + "learning_rate": 0.00032603201077240036, + "loss": 0.1577, + "step": 290960 + }, + { + "epoch": 12.05, + "grad_norm": 0.9296875, + "learning_rate": 0.00032602167923030694, + "loss": 0.1758, + "step": 290970 + }, + { + "epoch": 12.05, + "grad_norm": 0.59375, + "learning_rate": 0.0003260113475451489, + "loss": 0.201, + "step": 290980 + }, + { + "epoch": 12.05, + "grad_norm": 0.5859375, + "learning_rate": 0.0003260010157169457, + "loss": 0.1983, + "step": 290990 + }, + { + "epoch": 12.05, + "grad_norm": 4.78125, + "learning_rate": 0.00032599068374571685, + "loss": 0.2055, + "step": 291000 + }, + { + "epoch": 12.05, + "grad_norm": 1.0859375, + "learning_rate": 0.00032598035163148166, + "loss": 0.2095, + "step": 291010 + }, + { + "epoch": 12.05, + "grad_norm": 0.87890625, + "learning_rate": 0.0003259700193742597, + "loss": 0.2199, + "step": 291020 + }, + { + "epoch": 12.05, + "grad_norm": 2.265625, + "learning_rate": 0.0003259596869740704, + "loss": 0.19, + "step": 291030 + }, + { + "epoch": 12.05, + "grad_norm": 0.96484375, + "learning_rate": 0.00032594935443093303, + "loss": 0.2066, + "step": 291040 + }, + { + "epoch": 12.06, + "grad_norm": 1.1015625, + "learning_rate": 0.0003259390217448673, + "loss": 0.2051, + "step": 291050 + }, + { + "epoch": 12.06, + "grad_norm": 0.7421875, + "learning_rate": 0.00032592868891589255, + "loss": 0.2519, + "step": 291060 + }, + { + "epoch": 12.06, + "grad_norm": 1.3046875, + "learning_rate": 0.0003259183559440281, + "loss": 0.2104, + "step": 291070 + }, + { + "epoch": 12.06, + "grad_norm": 0.91796875, + "learning_rate": 0.00032590802282929366, + "loss": 0.1884, + "step": 291080 + }, + { + "epoch": 12.06, + "grad_norm": 0.87109375, + "learning_rate": 0.0003258976895717084, + "loss": 0.2289, + "step": 291090 + }, + { + "epoch": 12.06, + "grad_norm": 0.55859375, + "learning_rate": 0.0003258873561712919, + "loss": 0.1861, + "step": 291100 + }, + { + "epoch": 12.06, + "grad_norm": 0.3046875, + "learning_rate": 0.00032587702262806356, + "loss": 0.1863, + "step": 291110 + }, + { + "epoch": 12.06, + "grad_norm": 0.8828125, + "learning_rate": 0.0003258666889420429, + "loss": 0.2048, + "step": 291120 + }, + { + "epoch": 12.06, + "grad_norm": 1.046875, + "learning_rate": 0.0003258563551132494, + "loss": 0.2259, + "step": 291130 + }, + { + "epoch": 12.06, + "grad_norm": 0.92578125, + "learning_rate": 0.0003258460211417023, + "loss": 0.1812, + "step": 291140 + }, + { + "epoch": 12.06, + "grad_norm": 0.79296875, + "learning_rate": 0.00032583568702742124, + "loss": 0.2362, + "step": 291150 + }, + { + "epoch": 12.06, + "grad_norm": 0.859375, + "learning_rate": 0.00032582535277042563, + "loss": 0.2422, + "step": 291160 + }, + { + "epoch": 12.06, + "grad_norm": 1.109375, + "learning_rate": 0.00032581501837073486, + "loss": 0.1745, + "step": 291170 + }, + { + "epoch": 12.06, + "grad_norm": 0.87109375, + "learning_rate": 0.0003258046838283684, + "loss": 0.1584, + "step": 291180 + }, + { + "epoch": 12.06, + "grad_norm": 0.2353515625, + "learning_rate": 0.0003257943491433457, + "loss": 0.1953, + "step": 291190 + }, + { + "epoch": 12.06, + "grad_norm": 0.7265625, + "learning_rate": 0.00032578401431568625, + "loss": 0.2028, + "step": 291200 + }, + { + "epoch": 12.06, + "grad_norm": 0.73046875, + "learning_rate": 0.00032577367934540953, + "loss": 0.1989, + "step": 291210 + }, + { + "epoch": 12.06, + "grad_norm": 1.3671875, + "learning_rate": 0.00032576334423253486, + "loss": 0.2555, + "step": 291220 + }, + { + "epoch": 12.06, + "grad_norm": 1.7109375, + "learning_rate": 0.0003257530089770817, + "loss": 0.2057, + "step": 291230 + }, + { + "epoch": 12.06, + "grad_norm": 0.5, + "learning_rate": 0.0003257426735790697, + "loss": 0.1992, + "step": 291240 + }, + { + "epoch": 12.06, + "grad_norm": 0.408203125, + "learning_rate": 0.0003257323380385181, + "loss": 0.2227, + "step": 291250 + }, + { + "epoch": 12.06, + "grad_norm": 0.46875, + "learning_rate": 0.0003257220023554464, + "loss": 0.2202, + "step": 291260 + }, + { + "epoch": 12.06, + "grad_norm": 0.49609375, + "learning_rate": 0.00032571166652987406, + "loss": 0.1935, + "step": 291270 + }, + { + "epoch": 12.06, + "grad_norm": 0.6875, + "learning_rate": 0.0003257013305618205, + "loss": 0.1998, + "step": 291280 + }, + { + "epoch": 12.07, + "grad_norm": 0.92578125, + "learning_rate": 0.0003256909944513053, + "loss": 0.2335, + "step": 291290 + }, + { + "epoch": 12.07, + "grad_norm": 1.3828125, + "learning_rate": 0.0003256806581983478, + "loss": 0.1859, + "step": 291300 + }, + { + "epoch": 12.07, + "grad_norm": 1.9765625, + "learning_rate": 0.0003256703218029675, + "loss": 0.1809, + "step": 291310 + }, + { + "epoch": 12.07, + "grad_norm": 0.62109375, + "learning_rate": 0.00032565998526518374, + "loss": 0.1647, + "step": 291320 + }, + { + "epoch": 12.07, + "grad_norm": 0.98828125, + "learning_rate": 0.0003256496485850161, + "loss": 0.1563, + "step": 291330 + }, + { + "epoch": 12.07, + "grad_norm": 1.40625, + "learning_rate": 0.000325639311762484, + "loss": 0.2055, + "step": 291340 + }, + { + "epoch": 12.07, + "grad_norm": 0.53125, + "learning_rate": 0.0003256289747976069, + "loss": 0.1698, + "step": 291350 + }, + { + "epoch": 12.07, + "grad_norm": 0.80859375, + "learning_rate": 0.00032561863769040424, + "loss": 0.1987, + "step": 291360 + }, + { + "epoch": 12.07, + "grad_norm": 0.64453125, + "learning_rate": 0.00032560830044089543, + "loss": 0.1888, + "step": 291370 + }, + { + "epoch": 12.07, + "grad_norm": 1.84375, + "learning_rate": 0.0003255979630491, + "loss": 0.2115, + "step": 291380 + }, + { + "epoch": 12.07, + "grad_norm": 0.91796875, + "learning_rate": 0.00032558762551503735, + "loss": 0.148, + "step": 291390 + }, + { + "epoch": 12.07, + "grad_norm": 0.6171875, + "learning_rate": 0.00032557728783872694, + "loss": 0.1541, + "step": 291400 + }, + { + "epoch": 12.07, + "grad_norm": 0.9921875, + "learning_rate": 0.0003255669500201883, + "loss": 0.2216, + "step": 291410 + }, + { + "epoch": 12.07, + "grad_norm": 0.640625, + "learning_rate": 0.00032555661205944074, + "loss": 0.1601, + "step": 291420 + }, + { + "epoch": 12.07, + "grad_norm": 0.89453125, + "learning_rate": 0.0003255462739565038, + "loss": 0.227, + "step": 291430 + }, + { + "epoch": 12.07, + "grad_norm": 1.0, + "learning_rate": 0.00032553593571139694, + "loss": 0.154, + "step": 291440 + }, + { + "epoch": 12.07, + "grad_norm": 1.7265625, + "learning_rate": 0.00032552559732413965, + "loss": 0.1776, + "step": 291450 + }, + { + "epoch": 12.07, + "grad_norm": 0.2236328125, + "learning_rate": 0.0003255152587947513, + "loss": 0.1938, + "step": 291460 + }, + { + "epoch": 12.07, + "grad_norm": 1.7421875, + "learning_rate": 0.0003255049201232514, + "loss": 0.1622, + "step": 291470 + }, + { + "epoch": 12.07, + "grad_norm": 0.8671875, + "learning_rate": 0.0003254945813096594, + "loss": 0.1974, + "step": 291480 + }, + { + "epoch": 12.07, + "grad_norm": 1.1953125, + "learning_rate": 0.00032548424235399474, + "loss": 0.2106, + "step": 291490 + }, + { + "epoch": 12.07, + "grad_norm": 1.109375, + "learning_rate": 0.0003254739032562769, + "loss": 0.219, + "step": 291500 + }, + { + "epoch": 12.07, + "grad_norm": 0.7734375, + "learning_rate": 0.00032546356401652534, + "loss": 0.2102, + "step": 291510 + }, + { + "epoch": 12.07, + "grad_norm": 0.80859375, + "learning_rate": 0.0003254532246347595, + "loss": 0.2396, + "step": 291520 + }, + { + "epoch": 12.08, + "grad_norm": 1.296875, + "learning_rate": 0.0003254428851109987, + "loss": 0.1902, + "step": 291530 + }, + { + "epoch": 12.08, + "grad_norm": 2.28125, + "learning_rate": 0.00032543254544526275, + "loss": 0.2198, + "step": 291540 + }, + { + "epoch": 12.08, + "grad_norm": 1.0234375, + "learning_rate": 0.0003254222056375708, + "loss": 0.1826, + "step": 291550 + }, + { + "epoch": 12.08, + "grad_norm": 1.6484375, + "learning_rate": 0.0003254118656879424, + "loss": 0.1977, + "step": 291560 + }, + { + "epoch": 12.08, + "grad_norm": 0.77734375, + "learning_rate": 0.00032540152559639704, + "loss": 0.2411, + "step": 291570 + }, + { + "epoch": 12.08, + "grad_norm": 0.46484375, + "learning_rate": 0.0003253911853629541, + "loss": 0.139, + "step": 291580 + }, + { + "epoch": 12.08, + "grad_norm": 0.578125, + "learning_rate": 0.0003253808449876331, + "loss": 0.1507, + "step": 291590 + }, + { + "epoch": 12.08, + "grad_norm": 1.125, + "learning_rate": 0.00032537050447045354, + "loss": 0.1804, + "step": 291600 + }, + { + "epoch": 12.08, + "grad_norm": 0.55078125, + "learning_rate": 0.00032536016381143476, + "loss": 0.1609, + "step": 291610 + }, + { + "epoch": 12.08, + "grad_norm": 1.5859375, + "learning_rate": 0.00032534982301059636, + "loss": 0.1486, + "step": 291620 + }, + { + "epoch": 12.08, + "grad_norm": 0.921875, + "learning_rate": 0.0003253394820679577, + "loss": 0.1888, + "step": 291630 + }, + { + "epoch": 12.08, + "grad_norm": 0.8125, + "learning_rate": 0.0003253291409835383, + "loss": 0.2015, + "step": 291640 + }, + { + "epoch": 12.08, + "grad_norm": 0.84375, + "learning_rate": 0.00032531879975735757, + "loss": 0.2152, + "step": 291650 + }, + { + "epoch": 12.08, + "grad_norm": 0.55859375, + "learning_rate": 0.000325308458389435, + "loss": 0.1187, + "step": 291660 + }, + { + "epoch": 12.08, + "grad_norm": 0.8671875, + "learning_rate": 0.00032529811687979, + "loss": 0.1946, + "step": 291670 + }, + { + "epoch": 12.08, + "grad_norm": 0.6015625, + "learning_rate": 0.0003252877752284421, + "loss": 0.2077, + "step": 291680 + }, + { + "epoch": 12.08, + "grad_norm": 0.8203125, + "learning_rate": 0.00032527743343541073, + "loss": 0.2012, + "step": 291690 + }, + { + "epoch": 12.08, + "grad_norm": 1.15625, + "learning_rate": 0.0003252670915007154, + "loss": 0.1893, + "step": 291700 + }, + { + "epoch": 12.08, + "grad_norm": 1.25, + "learning_rate": 0.0003252567494243755, + "loss": 0.22, + "step": 291710 + }, + { + "epoch": 12.08, + "grad_norm": 0.423828125, + "learning_rate": 0.00032524640720641053, + "loss": 0.169, + "step": 291720 + }, + { + "epoch": 12.08, + "grad_norm": 0.8203125, + "learning_rate": 0.00032523606484684, + "loss": 0.1983, + "step": 291730 + }, + { + "epoch": 12.08, + "grad_norm": 0.59765625, + "learning_rate": 0.00032522572234568316, + "loss": 0.1779, + "step": 291740 + }, + { + "epoch": 12.08, + "grad_norm": 1.1328125, + "learning_rate": 0.0003252153797029598, + "loss": 0.2356, + "step": 291750 + }, + { + "epoch": 12.08, + "grad_norm": 0.76953125, + "learning_rate": 0.00032520503691868915, + "loss": 0.1837, + "step": 291760 + }, + { + "epoch": 12.09, + "grad_norm": 0.92578125, + "learning_rate": 0.0003251946939928907, + "loss": 0.1501, + "step": 291770 + }, + { + "epoch": 12.09, + "grad_norm": 0.7734375, + "learning_rate": 0.00032518435092558407, + "loss": 0.1901, + "step": 291780 + }, + { + "epoch": 12.09, + "grad_norm": 1.015625, + "learning_rate": 0.0003251740077167885, + "loss": 0.2066, + "step": 291790 + }, + { + "epoch": 12.09, + "grad_norm": 1.140625, + "learning_rate": 0.0003251636643665236, + "loss": 0.2295, + "step": 291800 + }, + { + "epoch": 12.09, + "grad_norm": 0.70703125, + "learning_rate": 0.0003251533208748089, + "loss": 0.1771, + "step": 291810 + }, + { + "epoch": 12.09, + "grad_norm": 0.77734375, + "learning_rate": 0.00032514297724166357, + "loss": 0.2416, + "step": 291820 + }, + { + "epoch": 12.09, + "grad_norm": 0.419921875, + "learning_rate": 0.00032513263346710744, + "loss": 0.226, + "step": 291830 + }, + { + "epoch": 12.09, + "grad_norm": 0.703125, + "learning_rate": 0.00032512228955115975, + "loss": 0.1951, + "step": 291840 + }, + { + "epoch": 12.09, + "grad_norm": 0.8671875, + "learning_rate": 0.00032511194549384, + "loss": 0.1782, + "step": 291850 + }, + { + "epoch": 12.09, + "grad_norm": 0.57421875, + "learning_rate": 0.00032510160129516774, + "loss": 0.1991, + "step": 291860 + }, + { + "epoch": 12.09, + "grad_norm": 1.015625, + "learning_rate": 0.0003250912569551623, + "loss": 0.1589, + "step": 291870 + }, + { + "epoch": 12.09, + "grad_norm": 1.2578125, + "learning_rate": 0.0003250809124738433, + "loss": 0.1694, + "step": 291880 + }, + { + "epoch": 12.09, + "grad_norm": 1.375, + "learning_rate": 0.0003250705678512301, + "loss": 0.2277, + "step": 291890 + }, + { + "epoch": 12.09, + "grad_norm": 0.89453125, + "learning_rate": 0.0003250602230873421, + "loss": 0.1875, + "step": 291900 + }, + { + "epoch": 12.09, + "grad_norm": 0.6640625, + "learning_rate": 0.00032504987818219905, + "loss": 0.1868, + "step": 291910 + }, + { + "epoch": 12.09, + "grad_norm": 0.8671875, + "learning_rate": 0.0003250395331358201, + "loss": 0.1831, + "step": 291920 + }, + { + "epoch": 12.09, + "grad_norm": 0.6875, + "learning_rate": 0.0003250291879482249, + "loss": 0.19, + "step": 291930 + }, + { + "epoch": 12.09, + "grad_norm": 0.9921875, + "learning_rate": 0.00032501884261943294, + "loss": 0.2254, + "step": 291940 + }, + { + "epoch": 12.09, + "grad_norm": 0.83203125, + "learning_rate": 0.00032500849714946347, + "loss": 0.2149, + "step": 291950 + }, + { + "epoch": 12.09, + "grad_norm": 2.484375, + "learning_rate": 0.0003249981515383362, + "loss": 0.1974, + "step": 291960 + }, + { + "epoch": 12.09, + "grad_norm": 1.5625, + "learning_rate": 0.0003249878057860705, + "loss": 0.1553, + "step": 291970 + }, + { + "epoch": 12.09, + "grad_norm": 0.54296875, + "learning_rate": 0.0003249774598926858, + "loss": 0.2016, + "step": 291980 + }, + { + "epoch": 12.09, + "grad_norm": 1.1328125, + "learning_rate": 0.0003249671138582017, + "loss": 0.217, + "step": 291990 + }, + { + "epoch": 12.09, + "grad_norm": 0.7578125, + "learning_rate": 0.0003249567676826375, + "loss": 0.22, + "step": 292000 + }, + { + "epoch": 12.1, + "grad_norm": 1.0234375, + "learning_rate": 0.00032494642136601283, + "loss": 0.2204, + "step": 292010 + }, + { + "epoch": 12.1, + "grad_norm": 0.55859375, + "learning_rate": 0.0003249360749083471, + "loss": 0.2064, + "step": 292020 + }, + { + "epoch": 12.1, + "grad_norm": 0.2490234375, + "learning_rate": 0.0003249257283096596, + "loss": 0.1686, + "step": 292030 + }, + { + "epoch": 12.1, + "grad_norm": 1.6640625, + "learning_rate": 0.0003249153815699702, + "loss": 0.2741, + "step": 292040 + }, + { + "epoch": 12.1, + "grad_norm": 1.3984375, + "learning_rate": 0.00032490503468929804, + "loss": 0.2177, + "step": 292050 + }, + { + "epoch": 12.1, + "grad_norm": 0.98046875, + "learning_rate": 0.00032489468766766266, + "loss": 0.2014, + "step": 292060 + }, + { + "epoch": 12.1, + "grad_norm": 0.6796875, + "learning_rate": 0.00032488434050508366, + "loss": 0.1768, + "step": 292070 + }, + { + "epoch": 12.1, + "grad_norm": 0.95703125, + "learning_rate": 0.00032487399320158027, + "loss": 0.1562, + "step": 292080 + }, + { + "epoch": 12.1, + "grad_norm": 0.412109375, + "learning_rate": 0.0003248636457571722, + "loss": 0.2148, + "step": 292090 + }, + { + "epoch": 12.1, + "grad_norm": 0.9375, + "learning_rate": 0.0003248532981718789, + "loss": 0.1754, + "step": 292100 + }, + { + "epoch": 12.1, + "grad_norm": 2.265625, + "learning_rate": 0.0003248429504457197, + "loss": 0.2409, + "step": 292110 + }, + { + "epoch": 12.1, + "grad_norm": 1.125, + "learning_rate": 0.0003248326025787142, + "loss": 0.2295, + "step": 292120 + }, + { + "epoch": 12.1, + "grad_norm": 0.91796875, + "learning_rate": 0.00032482225457088174, + "loss": 0.1707, + "step": 292130 + }, + { + "epoch": 12.1, + "grad_norm": 1.4453125, + "learning_rate": 0.0003248119064222419, + "loss": 0.1963, + "step": 292140 + }, + { + "epoch": 12.1, + "grad_norm": 0.6875, + "learning_rate": 0.0003248015581328142, + "loss": 0.2409, + "step": 292150 + }, + { + "epoch": 12.1, + "grad_norm": 0.5859375, + "learning_rate": 0.00032479120970261796, + "loss": 0.2116, + "step": 292160 + }, + { + "epoch": 12.1, + "grad_norm": 0.71875, + "learning_rate": 0.00032478086113167274, + "loss": 0.1714, + "step": 292170 + }, + { + "epoch": 12.1, + "grad_norm": 0.66796875, + "learning_rate": 0.0003247705124199981, + "loss": 0.1997, + "step": 292180 + }, + { + "epoch": 12.1, + "grad_norm": 0.8359375, + "learning_rate": 0.00032476016356761334, + "loss": 0.1447, + "step": 292190 + }, + { + "epoch": 12.1, + "grad_norm": 0.875, + "learning_rate": 0.00032474981457453813, + "loss": 0.1976, + "step": 292200 + }, + { + "epoch": 12.1, + "grad_norm": 0.95703125, + "learning_rate": 0.0003247394654407917, + "loss": 0.2036, + "step": 292210 + }, + { + "epoch": 12.1, + "grad_norm": 0.5078125, + "learning_rate": 0.0003247291161663938, + "loss": 0.1882, + "step": 292220 + }, + { + "epoch": 12.1, + "grad_norm": 1.203125, + "learning_rate": 0.00032471876675136367, + "loss": 0.1717, + "step": 292230 + }, + { + "epoch": 12.1, + "grad_norm": 1.0859375, + "learning_rate": 0.000324708417195721, + "loss": 0.196, + "step": 292240 + }, + { + "epoch": 12.1, + "grad_norm": 0.416015625, + "learning_rate": 0.00032469806749948506, + "loss": 0.2127, + "step": 292250 + }, + { + "epoch": 12.11, + "grad_norm": 0.95703125, + "learning_rate": 0.0003246877176626755, + "loss": 0.1658, + "step": 292260 + }, + { + "epoch": 12.11, + "grad_norm": 0.8984375, + "learning_rate": 0.0003246773676853116, + "loss": 0.2056, + "step": 292270 + }, + { + "epoch": 12.11, + "grad_norm": 0.5703125, + "learning_rate": 0.0003246670175674131, + "loss": 0.2118, + "step": 292280 + }, + { + "epoch": 12.11, + "grad_norm": 0.984375, + "learning_rate": 0.0003246566673089992, + "loss": 0.2019, + "step": 292290 + }, + { + "epoch": 12.11, + "grad_norm": 0.90625, + "learning_rate": 0.00032464631691008964, + "loss": 0.1994, + "step": 292300 + }, + { + "epoch": 12.11, + "grad_norm": 1.6328125, + "learning_rate": 0.00032463596637070377, + "loss": 0.1943, + "step": 292310 + }, + { + "epoch": 12.11, + "grad_norm": 1.078125, + "learning_rate": 0.00032462561569086097, + "loss": 0.2021, + "step": 292320 + }, + { + "epoch": 12.11, + "grad_norm": 0.58203125, + "learning_rate": 0.00032461526487058087, + "loss": 0.1534, + "step": 292330 + }, + { + "epoch": 12.11, + "grad_norm": 0.291015625, + "learning_rate": 0.0003246049139098829, + "loss": 0.1867, + "step": 292340 + }, + { + "epoch": 12.11, + "grad_norm": 0.796875, + "learning_rate": 0.00032459456280878654, + "loss": 0.2279, + "step": 292350 + }, + { + "epoch": 12.11, + "grad_norm": 0.92578125, + "learning_rate": 0.00032458421156731124, + "loss": 0.2833, + "step": 292360 + }, + { + "epoch": 12.11, + "grad_norm": 0.80078125, + "learning_rate": 0.0003245738601854766, + "loss": 0.2424, + "step": 292370 + }, + { + "epoch": 12.11, + "grad_norm": 0.93359375, + "learning_rate": 0.00032456350866330184, + "loss": 0.2116, + "step": 292380 + }, + { + "epoch": 12.11, + "grad_norm": 0.55859375, + "learning_rate": 0.00032455315700080677, + "loss": 0.2074, + "step": 292390 + }, + { + "epoch": 12.11, + "grad_norm": 0.87890625, + "learning_rate": 0.0003245428051980107, + "loss": 0.1803, + "step": 292400 + }, + { + "epoch": 12.11, + "grad_norm": 1.5390625, + "learning_rate": 0.00032453245325493303, + "loss": 0.1921, + "step": 292410 + }, + { + "epoch": 12.11, + "grad_norm": 0.6484375, + "learning_rate": 0.0003245221011715934, + "loss": 0.2196, + "step": 292420 + }, + { + "epoch": 12.11, + "grad_norm": 0.703125, + "learning_rate": 0.0003245117489480112, + "loss": 0.2398, + "step": 292430 + }, + { + "epoch": 12.11, + "grad_norm": 0.5625, + "learning_rate": 0.00032450139658420595, + "loss": 0.1475, + "step": 292440 + }, + { + "epoch": 12.11, + "grad_norm": 0.625, + "learning_rate": 0.00032449104408019706, + "loss": 0.1926, + "step": 292450 + }, + { + "epoch": 12.11, + "grad_norm": 0.51171875, + "learning_rate": 0.00032448069143600414, + "loss": 0.2192, + "step": 292460 + }, + { + "epoch": 12.11, + "grad_norm": 0.6796875, + "learning_rate": 0.0003244703386516466, + "loss": 0.196, + "step": 292470 + }, + { + "epoch": 12.11, + "grad_norm": 0.2890625, + "learning_rate": 0.0003244599857271439, + "loss": 0.1563, + "step": 292480 + }, + { + "epoch": 12.11, + "grad_norm": 0.62109375, + "learning_rate": 0.00032444963266251546, + "loss": 0.202, + "step": 292490 + }, + { + "epoch": 12.12, + "grad_norm": 0.7578125, + "learning_rate": 0.00032443927945778096, + "loss": 0.204, + "step": 292500 + }, + { + "epoch": 12.12, + "grad_norm": 1.09375, + "learning_rate": 0.00032442892611295974, + "loss": 0.2069, + "step": 292510 + }, + { + "epoch": 12.12, + "grad_norm": 0.8984375, + "learning_rate": 0.0003244185726280714, + "loss": 0.1927, + "step": 292520 + }, + { + "epoch": 12.12, + "grad_norm": 0.8515625, + "learning_rate": 0.0003244082190031352, + "loss": 0.1949, + "step": 292530 + }, + { + "epoch": 12.12, + "grad_norm": 0.4609375, + "learning_rate": 0.00032439786523817086, + "loss": 0.1972, + "step": 292540 + }, + { + "epoch": 12.12, + "grad_norm": 0.7421875, + "learning_rate": 0.00032438751133319777, + "loss": 0.2314, + "step": 292550 + }, + { + "epoch": 12.12, + "grad_norm": 1.3984375, + "learning_rate": 0.00032437715728823543, + "loss": 0.2045, + "step": 292560 + }, + { + "epoch": 12.12, + "grad_norm": 0.53515625, + "learning_rate": 0.00032436680310330324, + "loss": 0.2498, + "step": 292570 + }, + { + "epoch": 12.12, + "grad_norm": 1.078125, + "learning_rate": 0.0003243564487784208, + "loss": 0.1922, + "step": 292580 + }, + { + "epoch": 12.12, + "grad_norm": 0.87890625, + "learning_rate": 0.00032434609431360753, + "loss": 0.2051, + "step": 292590 + }, + { + "epoch": 12.12, + "grad_norm": 1.0234375, + "learning_rate": 0.000324335739708883, + "loss": 0.2095, + "step": 292600 + }, + { + "epoch": 12.12, + "grad_norm": 0.482421875, + "learning_rate": 0.00032432538496426656, + "loss": 0.1913, + "step": 292610 + }, + { + "epoch": 12.12, + "grad_norm": 0.5078125, + "learning_rate": 0.0003243150300797778, + "loss": 0.1796, + "step": 292620 + }, + { + "epoch": 12.12, + "grad_norm": 1.265625, + "learning_rate": 0.0003243046750554362, + "loss": 0.1753, + "step": 292630 + }, + { + "epoch": 12.12, + "grad_norm": 0.71875, + "learning_rate": 0.00032429431989126125, + "loss": 0.2067, + "step": 292640 + }, + { + "epoch": 12.12, + "grad_norm": 0.93359375, + "learning_rate": 0.0003242839645872724, + "loss": 0.2151, + "step": 292650 + }, + { + "epoch": 12.12, + "grad_norm": 1.40625, + "learning_rate": 0.0003242736091434891, + "loss": 0.2091, + "step": 292660 + }, + { + "epoch": 12.12, + "grad_norm": 0.80078125, + "learning_rate": 0.00032426325355993085, + "loss": 0.1952, + "step": 292670 + }, + { + "epoch": 12.12, + "grad_norm": 0.79296875, + "learning_rate": 0.00032425289783661725, + "loss": 0.1938, + "step": 292680 + }, + { + "epoch": 12.12, + "grad_norm": 0.3984375, + "learning_rate": 0.0003242425419735677, + "loss": 0.1743, + "step": 292690 + }, + { + "epoch": 12.12, + "grad_norm": 0.53515625, + "learning_rate": 0.00032423218597080173, + "loss": 0.2221, + "step": 292700 + }, + { + "epoch": 12.12, + "grad_norm": 0.859375, + "learning_rate": 0.00032422182982833883, + "loss": 0.2241, + "step": 292710 + }, + { + "epoch": 12.12, + "grad_norm": 0.796875, + "learning_rate": 0.0003242114735461983, + "loss": 0.1659, + "step": 292720 + }, + { + "epoch": 12.12, + "grad_norm": 0.74609375, + "learning_rate": 0.0003242011171244, + "loss": 0.2074, + "step": 292730 + }, + { + "epoch": 12.13, + "grad_norm": 1.78125, + "learning_rate": 0.0003241907605629631, + "loss": 0.1812, + "step": 292740 + }, + { + "epoch": 12.13, + "grad_norm": 0.953125, + "learning_rate": 0.00032418040386190716, + "loss": 0.2064, + "step": 292750 + }, + { + "epoch": 12.13, + "grad_norm": 0.88671875, + "learning_rate": 0.00032417004702125184, + "loss": 0.1705, + "step": 292760 + }, + { + "epoch": 12.13, + "grad_norm": 0.61328125, + "learning_rate": 0.00032415969004101643, + "loss": 0.2439, + "step": 292770 + }, + { + "epoch": 12.13, + "grad_norm": 0.86328125, + "learning_rate": 0.00032414933292122046, + "loss": 0.2109, + "step": 292780 + }, + { + "epoch": 12.13, + "grad_norm": 0.62890625, + "learning_rate": 0.0003241389756618835, + "loss": 0.2091, + "step": 292790 + }, + { + "epoch": 12.13, + "grad_norm": 0.984375, + "learning_rate": 0.00032412861826302495, + "loss": 0.1991, + "step": 292800 + }, + { + "epoch": 12.13, + "grad_norm": 0.59765625, + "learning_rate": 0.0003241182607246644, + "loss": 0.1885, + "step": 292810 + }, + { + "epoch": 12.13, + "grad_norm": 1.3515625, + "learning_rate": 0.00032410790304682126, + "loss": 0.1914, + "step": 292820 + }, + { + "epoch": 12.13, + "grad_norm": 0.69140625, + "learning_rate": 0.000324097545229515, + "loss": 0.1754, + "step": 292830 + }, + { + "epoch": 12.13, + "grad_norm": 0.66015625, + "learning_rate": 0.0003240871872727652, + "loss": 0.2044, + "step": 292840 + }, + { + "epoch": 12.13, + "grad_norm": 0.5078125, + "learning_rate": 0.0003240768291765914, + "loss": 0.2069, + "step": 292850 + }, + { + "epoch": 12.13, + "grad_norm": 0.58984375, + "learning_rate": 0.00032406647094101294, + "loss": 0.1866, + "step": 292860 + }, + { + "epoch": 12.13, + "grad_norm": 1.2578125, + "learning_rate": 0.00032405611256604934, + "loss": 0.1995, + "step": 292870 + }, + { + "epoch": 12.13, + "grad_norm": 1.0234375, + "learning_rate": 0.00032404575405172017, + "loss": 0.1866, + "step": 292880 + }, + { + "epoch": 12.13, + "grad_norm": 0.85546875, + "learning_rate": 0.0003240353953980449, + "loss": 0.1962, + "step": 292890 + }, + { + "epoch": 12.13, + "grad_norm": 0.73828125, + "learning_rate": 0.000324025036605043, + "loss": 0.2014, + "step": 292900 + }, + { + "epoch": 12.13, + "grad_norm": 0.412109375, + "learning_rate": 0.000324014677672734, + "loss": 0.1785, + "step": 292910 + }, + { + "epoch": 12.13, + "grad_norm": 0.73828125, + "learning_rate": 0.0003240043186011372, + "loss": 0.1765, + "step": 292920 + }, + { + "epoch": 12.13, + "grad_norm": 0.79296875, + "learning_rate": 0.0003239939593902725, + "loss": 0.1865, + "step": 292930 + }, + { + "epoch": 12.13, + "grad_norm": 0.447265625, + "learning_rate": 0.000323983600040159, + "loss": 0.1833, + "step": 292940 + }, + { + "epoch": 12.13, + "grad_norm": 0.92578125, + "learning_rate": 0.00032397324055081643, + "loss": 0.1621, + "step": 292950 + }, + { + "epoch": 12.13, + "grad_norm": 1.171875, + "learning_rate": 0.00032396288092226424, + "loss": 0.1728, + "step": 292960 + }, + { + "epoch": 12.13, + "grad_norm": 0.60546875, + "learning_rate": 0.00032395252115452185, + "loss": 0.1706, + "step": 292970 + }, + { + "epoch": 12.14, + "grad_norm": 0.77734375, + "learning_rate": 0.00032394216124760877, + "loss": 0.2432, + "step": 292980 + }, + { + "epoch": 12.14, + "grad_norm": 0.35546875, + "learning_rate": 0.0003239318012015446, + "loss": 0.1629, + "step": 292990 + }, + { + "epoch": 12.14, + "grad_norm": 0.8046875, + "learning_rate": 0.0003239214410163487, + "loss": 0.1744, + "step": 293000 + }, + { + "epoch": 12.14, + "grad_norm": 0.671875, + "learning_rate": 0.0003239110806920407, + "loss": 0.1426, + "step": 293010 + }, + { + "epoch": 12.14, + "grad_norm": 1.5, + "learning_rate": 0.0003239007202286399, + "loss": 0.1748, + "step": 293020 + }, + { + "epoch": 12.14, + "grad_norm": 0.435546875, + "learning_rate": 0.00032389035962616605, + "loss": 0.1951, + "step": 293030 + }, + { + "epoch": 12.14, + "grad_norm": 0.5234375, + "learning_rate": 0.0003238799988846385, + "loss": 0.2185, + "step": 293040 + }, + { + "epoch": 12.14, + "grad_norm": 0.6953125, + "learning_rate": 0.0003238696380040767, + "loss": 0.2414, + "step": 293050 + }, + { + "epoch": 12.14, + "grad_norm": 0.67578125, + "learning_rate": 0.0003238592769845003, + "loss": 0.1693, + "step": 293060 + }, + { + "epoch": 12.14, + "grad_norm": 0.91015625, + "learning_rate": 0.00032384891582592866, + "loss": 0.2171, + "step": 293070 + }, + { + "epoch": 12.14, + "grad_norm": 0.69140625, + "learning_rate": 0.00032383855452838134, + "loss": 0.2016, + "step": 293080 + }, + { + "epoch": 12.14, + "grad_norm": 0.81640625, + "learning_rate": 0.00032382819309187795, + "loss": 0.2281, + "step": 293090 + }, + { + "epoch": 12.14, + "grad_norm": 0.62890625, + "learning_rate": 0.00032381783151643774, + "loss": 0.1796, + "step": 293100 + }, + { + "epoch": 12.14, + "grad_norm": 0.0, + "learning_rate": 0.0003238074698020804, + "loss": 0.2109, + "step": 293110 + }, + { + "epoch": 12.14, + "grad_norm": 0.89453125, + "learning_rate": 0.00032379710794882535, + "loss": 0.2282, + "step": 293120 + }, + { + "epoch": 12.14, + "grad_norm": 0.7578125, + "learning_rate": 0.00032378674595669204, + "loss": 0.2227, + "step": 293130 + }, + { + "epoch": 12.14, + "grad_norm": 0.6796875, + "learning_rate": 0.0003237763838257002, + "loss": 0.1946, + "step": 293140 + }, + { + "epoch": 12.14, + "grad_norm": 0.48828125, + "learning_rate": 0.00032376602155586903, + "loss": 0.1527, + "step": 293150 + }, + { + "epoch": 12.14, + "grad_norm": 0.7890625, + "learning_rate": 0.00032375565914721826, + "loss": 0.1881, + "step": 293160 + }, + { + "epoch": 12.14, + "grad_norm": 0.71875, + "learning_rate": 0.00032374529659976733, + "loss": 0.1628, + "step": 293170 + }, + { + "epoch": 12.14, + "grad_norm": 0.6640625, + "learning_rate": 0.00032373493391353565, + "loss": 0.1762, + "step": 293180 + }, + { + "epoch": 12.14, + "grad_norm": 0.8359375, + "learning_rate": 0.0003237245710885427, + "loss": 0.1832, + "step": 293190 + }, + { + "epoch": 12.14, + "grad_norm": 0.7890625, + "learning_rate": 0.00032371420812480825, + "loss": 0.1684, + "step": 293200 + }, + { + "epoch": 12.14, + "grad_norm": 0.9921875, + "learning_rate": 0.0003237038450223515, + "loss": 0.1617, + "step": 293210 + }, + { + "epoch": 12.15, + "grad_norm": 1.6875, + "learning_rate": 0.0003236934817811922, + "loss": 0.1428, + "step": 293220 + }, + { + "epoch": 12.15, + "grad_norm": 0.671875, + "learning_rate": 0.00032368311840134955, + "loss": 0.1521, + "step": 293230 + }, + { + "epoch": 12.15, + "grad_norm": 0.56640625, + "learning_rate": 0.0003236727548828433, + "loss": 0.1396, + "step": 293240 + }, + { + "epoch": 12.15, + "grad_norm": 2.515625, + "learning_rate": 0.00032366239122569296, + "loss": 0.1776, + "step": 293250 + }, + { + "epoch": 12.15, + "grad_norm": 0.95703125, + "learning_rate": 0.00032365202742991783, + "loss": 0.184, + "step": 293260 + }, + { + "epoch": 12.15, + "grad_norm": 0.546875, + "learning_rate": 0.0003236416634955377, + "loss": 0.1471, + "step": 293270 + }, + { + "epoch": 12.15, + "grad_norm": 1.2265625, + "learning_rate": 0.00032363129942257176, + "loss": 0.1482, + "step": 293280 + }, + { + "epoch": 12.15, + "grad_norm": 0.77734375, + "learning_rate": 0.00032362093521103974, + "loss": 0.1729, + "step": 293290 + }, + { + "epoch": 12.15, + "grad_norm": 0.78515625, + "learning_rate": 0.0003236105708609611, + "loss": 0.1893, + "step": 293300 + }, + { + "epoch": 12.15, + "grad_norm": 0.8359375, + "learning_rate": 0.0003236002063723552, + "loss": 0.1996, + "step": 293310 + }, + { + "epoch": 12.15, + "grad_norm": 0.640625, + "learning_rate": 0.00032358984174524175, + "loss": 0.2144, + "step": 293320 + }, + { + "epoch": 12.15, + "grad_norm": 1.734375, + "learning_rate": 0.0003235794769796402, + "loss": 0.2444, + "step": 293330 + }, + { + "epoch": 12.15, + "grad_norm": 0.5859375, + "learning_rate": 0.0003235691120755698, + "loss": 0.2456, + "step": 293340 + }, + { + "epoch": 12.15, + "grad_norm": 0.79296875, + "learning_rate": 0.00032355874703305054, + "loss": 0.1486, + "step": 293350 + }, + { + "epoch": 12.15, + "grad_norm": 1.1328125, + "learning_rate": 0.00032354838185210153, + "loss": 0.1886, + "step": 293360 + }, + { + "epoch": 12.15, + "grad_norm": 1.1484375, + "learning_rate": 0.0003235380165327424, + "loss": 0.1966, + "step": 293370 + }, + { + "epoch": 12.15, + "grad_norm": 0.498046875, + "learning_rate": 0.00032352765107499274, + "loss": 0.2028, + "step": 293380 + }, + { + "epoch": 12.15, + "grad_norm": 0.263671875, + "learning_rate": 0.00032351728547887193, + "loss": 0.178, + "step": 293390 + }, + { + "epoch": 12.15, + "grad_norm": 2.0, + "learning_rate": 0.00032350691974439955, + "loss": 0.1999, + "step": 293400 + }, + { + "epoch": 12.15, + "grad_norm": 0.236328125, + "learning_rate": 0.0003234965538715951, + "loss": 0.1536, + "step": 293410 + }, + { + "epoch": 12.15, + "grad_norm": 0.84375, + "learning_rate": 0.000323486187860478, + "loss": 0.1549, + "step": 293420 + }, + { + "epoch": 12.15, + "grad_norm": 1.328125, + "learning_rate": 0.00032347582171106794, + "loss": 0.2263, + "step": 293430 + }, + { + "epoch": 12.15, + "grad_norm": 0.80859375, + "learning_rate": 0.0003234654554233842, + "loss": 0.2061, + "step": 293440 + }, + { + "epoch": 12.15, + "grad_norm": 0.84765625, + "learning_rate": 0.00032345508899744646, + "loss": 0.2257, + "step": 293450 + }, + { + "epoch": 12.16, + "grad_norm": 2.53125, + "learning_rate": 0.0003234447224332742, + "loss": 0.1884, + "step": 293460 + }, + { + "epoch": 12.16, + "grad_norm": 0.3984375, + "learning_rate": 0.00032343435573088685, + "loss": 0.1771, + "step": 293470 + }, + { + "epoch": 12.16, + "grad_norm": 1.046875, + "learning_rate": 0.000323423988890304, + "loss": 0.1553, + "step": 293480 + }, + { + "epoch": 12.16, + "grad_norm": 0.89453125, + "learning_rate": 0.0003234136219115452, + "loss": 0.1545, + "step": 293490 + }, + { + "epoch": 12.16, + "grad_norm": 1.0703125, + "learning_rate": 0.0003234032547946297, + "loss": 0.1696, + "step": 293500 + }, + { + "epoch": 12.16, + "grad_norm": 0.546875, + "learning_rate": 0.0003233928875395774, + "loss": 0.2529, + "step": 293510 + }, + { + "epoch": 12.16, + "grad_norm": 0.94140625, + "learning_rate": 0.00032338252014640754, + "loss": 0.171, + "step": 293520 + }, + { + "epoch": 12.16, + "grad_norm": 1.4765625, + "learning_rate": 0.0003233721526151397, + "loss": 0.1892, + "step": 293530 + }, + { + "epoch": 12.16, + "grad_norm": 0.498046875, + "learning_rate": 0.00032336178494579346, + "loss": 0.2069, + "step": 293540 + }, + { + "epoch": 12.16, + "grad_norm": 0.81640625, + "learning_rate": 0.0003233514171383881, + "loss": 0.2299, + "step": 293550 + }, + { + "epoch": 12.16, + "grad_norm": 0.462890625, + "learning_rate": 0.00032334104919294344, + "loss": 0.1512, + "step": 293560 + }, + { + "epoch": 12.16, + "grad_norm": 1.0625, + "learning_rate": 0.0003233306811094788, + "loss": 0.2119, + "step": 293570 + }, + { + "epoch": 12.16, + "grad_norm": 0.7265625, + "learning_rate": 0.0003233203128880137, + "loss": 0.1972, + "step": 293580 + }, + { + "epoch": 12.16, + "grad_norm": 1.1953125, + "learning_rate": 0.00032330994452856775, + "loss": 0.2232, + "step": 293590 + }, + { + "epoch": 12.16, + "grad_norm": 0.88671875, + "learning_rate": 0.0003232995760311604, + "loss": 0.2106, + "step": 293600 + }, + { + "epoch": 12.16, + "grad_norm": 0.7734375, + "learning_rate": 0.0003232892073958111, + "loss": 0.2147, + "step": 293610 + }, + { + "epoch": 12.16, + "grad_norm": 1.5078125, + "learning_rate": 0.0003232788386225395, + "loss": 0.2004, + "step": 293620 + }, + { + "epoch": 12.16, + "grad_norm": 0.703125, + "learning_rate": 0.00032326846971136495, + "loss": 0.1981, + "step": 293630 + }, + { + "epoch": 12.16, + "grad_norm": 0.53515625, + "learning_rate": 0.00032325810066230714, + "loss": 0.2247, + "step": 293640 + }, + { + "epoch": 12.16, + "grad_norm": 0.267578125, + "learning_rate": 0.0003232477314753855, + "loss": 0.106, + "step": 293650 + }, + { + "epoch": 12.16, + "grad_norm": 1.09375, + "learning_rate": 0.0003232373621506194, + "loss": 0.1898, + "step": 293660 + }, + { + "epoch": 12.16, + "grad_norm": 0.80078125, + "learning_rate": 0.0003232269926880287, + "loss": 0.234, + "step": 293670 + }, + { + "epoch": 12.16, + "grad_norm": 1.0625, + "learning_rate": 0.00032321662308763257, + "loss": 0.2124, + "step": 293680 + }, + { + "epoch": 12.16, + "grad_norm": 1.65625, + "learning_rate": 0.00032320625334945067, + "loss": 0.1666, + "step": 293690 + }, + { + "epoch": 12.17, + "grad_norm": 0.68359375, + "learning_rate": 0.0003231958834735026, + "loss": 0.1663, + "step": 293700 + }, + { + "epoch": 12.17, + "grad_norm": 0.369140625, + "learning_rate": 0.00032318551345980763, + "loss": 0.1883, + "step": 293710 + }, + { + "epoch": 12.17, + "grad_norm": 1.5, + "learning_rate": 0.00032317514330838554, + "loss": 0.2043, + "step": 293720 + }, + { + "epoch": 12.17, + "grad_norm": 0.1640625, + "learning_rate": 0.00032316477301925573, + "loss": 0.2379, + "step": 293730 + }, + { + "epoch": 12.17, + "grad_norm": 1.046875, + "learning_rate": 0.0003231544025924376, + "loss": 0.2062, + "step": 293740 + }, + { + "epoch": 12.17, + "grad_norm": 0.494140625, + "learning_rate": 0.00032314403202795096, + "loss": 0.2012, + "step": 293750 + }, + { + "epoch": 12.17, + "grad_norm": 0.6015625, + "learning_rate": 0.000323133661325815, + "loss": 0.1537, + "step": 293760 + }, + { + "epoch": 12.17, + "grad_norm": 0.87109375, + "learning_rate": 0.00032312329048604947, + "loss": 0.2026, + "step": 293770 + }, + { + "epoch": 12.17, + "grad_norm": 0.328125, + "learning_rate": 0.0003231129195086738, + "loss": 0.1915, + "step": 293780 + }, + { + "epoch": 12.17, + "grad_norm": 1.4296875, + "learning_rate": 0.0003231025483937075, + "loss": 0.1825, + "step": 293790 + }, + { + "epoch": 12.17, + "grad_norm": 0.79296875, + "learning_rate": 0.00032309217714117014, + "loss": 0.1892, + "step": 293800 + }, + { + "epoch": 12.17, + "grad_norm": 0.27734375, + "learning_rate": 0.0003230818057510811, + "loss": 0.2128, + "step": 293810 + }, + { + "epoch": 12.17, + "grad_norm": 0.51171875, + "learning_rate": 0.00032307143422346, + "loss": 0.2176, + "step": 293820 + }, + { + "epoch": 12.17, + "grad_norm": 0.796875, + "learning_rate": 0.00032306106255832644, + "loss": 0.2179, + "step": 293830 + }, + { + "epoch": 12.17, + "grad_norm": 0.87890625, + "learning_rate": 0.00032305069075569984, + "loss": 0.2228, + "step": 293840 + }, + { + "epoch": 12.17, + "grad_norm": 0.58984375, + "learning_rate": 0.00032304031881559965, + "loss": 0.1406, + "step": 293850 + }, + { + "epoch": 12.17, + "grad_norm": 0.5625, + "learning_rate": 0.00032302994673804556, + "loss": 0.1996, + "step": 293860 + }, + { + "epoch": 12.17, + "grad_norm": 1.3515625, + "learning_rate": 0.00032301957452305686, + "loss": 0.1692, + "step": 293870 + }, + { + "epoch": 12.17, + "grad_norm": 2.109375, + "learning_rate": 0.00032300920217065334, + "loss": 0.1753, + "step": 293880 + }, + { + "epoch": 12.17, + "grad_norm": 0.53515625, + "learning_rate": 0.00032299882968085436, + "loss": 0.1769, + "step": 293890 + }, + { + "epoch": 12.17, + "grad_norm": 0.71484375, + "learning_rate": 0.00032298845705367943, + "loss": 0.2062, + "step": 293900 + }, + { + "epoch": 12.17, + "grad_norm": 0.431640625, + "learning_rate": 0.00032297808428914817, + "loss": 0.1973, + "step": 293910 + }, + { + "epoch": 12.17, + "grad_norm": 0.88671875, + "learning_rate": 0.00032296771138728, + "loss": 0.2066, + "step": 293920 + }, + { + "epoch": 12.17, + "grad_norm": 1.328125, + "learning_rate": 0.0003229573383480944, + "loss": 0.202, + "step": 293930 + }, + { + "epoch": 12.17, + "grad_norm": 1.4609375, + "learning_rate": 0.0003229469651716111, + "loss": 0.2001, + "step": 293940 + }, + { + "epoch": 12.18, + "grad_norm": 0.87890625, + "learning_rate": 0.00032293659185784936, + "loss": 0.1648, + "step": 293950 + }, + { + "epoch": 12.18, + "grad_norm": 0.7578125, + "learning_rate": 0.0003229262184068289, + "loss": 0.2098, + "step": 293960 + }, + { + "epoch": 12.18, + "grad_norm": 0.64453125, + "learning_rate": 0.00032291584481856917, + "loss": 0.1789, + "step": 293970 + }, + { + "epoch": 12.18, + "grad_norm": 2.484375, + "learning_rate": 0.0003229054710930896, + "loss": 0.233, + "step": 293980 + }, + { + "epoch": 12.18, + "grad_norm": 1.046875, + "learning_rate": 0.00032289509723041, + "loss": 0.2774, + "step": 293990 + }, + { + "epoch": 12.18, + "grad_norm": 0.6484375, + "learning_rate": 0.0003228847232305496, + "loss": 0.1503, + "step": 294000 + }, + { + "epoch": 12.18, + "grad_norm": 0.80859375, + "learning_rate": 0.000322874349093528, + "loss": 0.2278, + "step": 294010 + }, + { + "epoch": 12.18, + "grad_norm": 0.58203125, + "learning_rate": 0.00032286397481936477, + "loss": 0.202, + "step": 294020 + }, + { + "epoch": 12.18, + "grad_norm": 0.55859375, + "learning_rate": 0.0003228536004080794, + "loss": 0.2023, + "step": 294030 + }, + { + "epoch": 12.18, + "grad_norm": 0.48046875, + "learning_rate": 0.0003228432258596914, + "loss": 0.1582, + "step": 294040 + }, + { + "epoch": 12.18, + "grad_norm": 0.94921875, + "learning_rate": 0.00032283285117422036, + "loss": 0.1781, + "step": 294050 + }, + { + "epoch": 12.18, + "grad_norm": 0.57421875, + "learning_rate": 0.0003228224763516857, + "loss": 0.2035, + "step": 294060 + }, + { + "epoch": 12.18, + "grad_norm": 0.9140625, + "learning_rate": 0.00032281210139210707, + "loss": 0.1606, + "step": 294070 + }, + { + "epoch": 12.18, + "grad_norm": 0.494140625, + "learning_rate": 0.00032280172629550394, + "loss": 0.1537, + "step": 294080 + }, + { + "epoch": 12.18, + "grad_norm": 1.515625, + "learning_rate": 0.00032279135106189576, + "loss": 0.1701, + "step": 294090 + }, + { + "epoch": 12.18, + "grad_norm": 0.84765625, + "learning_rate": 0.0003227809756913021, + "loss": 0.2429, + "step": 294100 + }, + { + "epoch": 12.18, + "grad_norm": 0.9453125, + "learning_rate": 0.00032277060018374255, + "loss": 0.1829, + "step": 294110 + }, + { + "epoch": 12.18, + "grad_norm": 0.6171875, + "learning_rate": 0.00032276022453923666, + "loss": 0.1934, + "step": 294120 + }, + { + "epoch": 12.18, + "grad_norm": 0.515625, + "learning_rate": 0.0003227498487578038, + "loss": 0.224, + "step": 294130 + }, + { + "epoch": 12.18, + "grad_norm": 0.55078125, + "learning_rate": 0.00032273947283946355, + "loss": 0.1851, + "step": 294140 + }, + { + "epoch": 12.18, + "grad_norm": 1.65625, + "learning_rate": 0.00032272909678423547, + "loss": 0.1413, + "step": 294150 + }, + { + "epoch": 12.18, + "grad_norm": 0.78515625, + "learning_rate": 0.0003227187205921391, + "loss": 0.2035, + "step": 294160 + }, + { + "epoch": 12.18, + "grad_norm": 1.2421875, + "learning_rate": 0.000322708344263194, + "loss": 0.1891, + "step": 294170 + }, + { + "epoch": 12.18, + "grad_norm": 1.0234375, + "learning_rate": 0.00032269796779741967, + "loss": 0.2861, + "step": 294180 + }, + { + "epoch": 12.19, + "grad_norm": 0.9140625, + "learning_rate": 0.0003226875911948355, + "loss": 0.2232, + "step": 294190 + }, + { + "epoch": 12.19, + "grad_norm": 0.99609375, + "learning_rate": 0.00032267721445546126, + "loss": 0.1568, + "step": 294200 + }, + { + "epoch": 12.19, + "grad_norm": 3.09375, + "learning_rate": 0.0003226668375793163, + "loss": 0.1608, + "step": 294210 + }, + { + "epoch": 12.19, + "grad_norm": 1.359375, + "learning_rate": 0.0003226564605664201, + "loss": 0.1892, + "step": 294220 + }, + { + "epoch": 12.19, + "grad_norm": 0.462890625, + "learning_rate": 0.0003226460834167924, + "loss": 0.1772, + "step": 294230 + }, + { + "epoch": 12.19, + "grad_norm": 0.69921875, + "learning_rate": 0.0003226357061304526, + "loss": 0.2063, + "step": 294240 + }, + { + "epoch": 12.19, + "grad_norm": 0.72265625, + "learning_rate": 0.0003226253287074202, + "loss": 0.2236, + "step": 294250 + }, + { + "epoch": 12.19, + "grad_norm": 2.25, + "learning_rate": 0.00032261495114771483, + "loss": 0.1962, + "step": 294260 + }, + { + "epoch": 12.19, + "grad_norm": 0.458984375, + "learning_rate": 0.00032260457345135594, + "loss": 0.2046, + "step": 294270 + }, + { + "epoch": 12.19, + "grad_norm": 1.15625, + "learning_rate": 0.00032259419561836314, + "loss": 0.1731, + "step": 294280 + }, + { + "epoch": 12.19, + "grad_norm": 0.671875, + "learning_rate": 0.0003225838176487558, + "loss": 0.1757, + "step": 294290 + }, + { + "epoch": 12.19, + "grad_norm": 0.54296875, + "learning_rate": 0.00032257343954255366, + "loss": 0.2353, + "step": 294300 + }, + { + "epoch": 12.19, + "grad_norm": 1.1640625, + "learning_rate": 0.000322563061299776, + "loss": 0.2219, + "step": 294310 + }, + { + "epoch": 12.19, + "grad_norm": 0.4921875, + "learning_rate": 0.0003225526829204427, + "loss": 0.2077, + "step": 294320 + }, + { + "epoch": 12.19, + "grad_norm": 0.263671875, + "learning_rate": 0.0003225423044045729, + "loss": 0.142, + "step": 294330 + }, + { + "epoch": 12.19, + "grad_norm": 0.37890625, + "learning_rate": 0.00032253192575218637, + "loss": 0.1868, + "step": 294340 + }, + { + "epoch": 12.19, + "grad_norm": 0.796875, + "learning_rate": 0.0003225215469633026, + "loss": 0.1952, + "step": 294350 + }, + { + "epoch": 12.19, + "grad_norm": 0.63671875, + "learning_rate": 0.00032251116803794114, + "loss": 0.2038, + "step": 294360 + }, + { + "epoch": 12.19, + "grad_norm": 0.578125, + "learning_rate": 0.00032250078897612145, + "loss": 0.1941, + "step": 294370 + }, + { + "epoch": 12.19, + "grad_norm": 2.90625, + "learning_rate": 0.00032249040977786316, + "loss": 0.2098, + "step": 294380 + }, + { + "epoch": 12.19, + "grad_norm": 0.80078125, + "learning_rate": 0.0003224800304431856, + "loss": 0.2013, + "step": 294390 + }, + { + "epoch": 12.19, + "grad_norm": 1.5625, + "learning_rate": 0.00032246965097210866, + "loss": 0.19, + "step": 294400 + }, + { + "epoch": 12.19, + "grad_norm": 0.83203125, + "learning_rate": 0.00032245927136465154, + "loss": 0.1704, + "step": 294410 + }, + { + "epoch": 12.19, + "grad_norm": 0.423828125, + "learning_rate": 0.0003224488916208339, + "loss": 0.1708, + "step": 294420 + }, + { + "epoch": 12.2, + "grad_norm": 1.0859375, + "learning_rate": 0.0003224385117406753, + "loss": 0.2107, + "step": 294430 + }, + { + "epoch": 12.2, + "grad_norm": 0.59765625, + "learning_rate": 0.00032242813172419523, + "loss": 0.1847, + "step": 294440 + }, + { + "epoch": 12.2, + "grad_norm": 2.859375, + "learning_rate": 0.0003224177515714132, + "loss": 0.1614, + "step": 294450 + }, + { + "epoch": 12.2, + "grad_norm": 1.390625, + "learning_rate": 0.00032240737128234886, + "loss": 0.1931, + "step": 294460 + }, + { + "epoch": 12.2, + "grad_norm": 0.52734375, + "learning_rate": 0.0003223969908570216, + "loss": 0.1529, + "step": 294470 + }, + { + "epoch": 12.2, + "grad_norm": 0.76171875, + "learning_rate": 0.0003223866102954511, + "loss": 0.192, + "step": 294480 + }, + { + "epoch": 12.2, + "grad_norm": 0.79296875, + "learning_rate": 0.0003223762295976568, + "loss": 0.2078, + "step": 294490 + }, + { + "epoch": 12.2, + "grad_norm": 1.5390625, + "learning_rate": 0.0003223658487636582, + "loss": 0.2663, + "step": 294500 + }, + { + "epoch": 12.2, + "grad_norm": 0.3984375, + "learning_rate": 0.0003223554677934749, + "loss": 0.1685, + "step": 294510 + }, + { + "epoch": 12.2, + "grad_norm": 1.2890625, + "learning_rate": 0.00032234508668712637, + "loss": 0.1659, + "step": 294520 + }, + { + "epoch": 12.2, + "grad_norm": 0.64453125, + "learning_rate": 0.00032233470544463226, + "loss": 0.2257, + "step": 294530 + }, + { + "epoch": 12.2, + "grad_norm": 1.28125, + "learning_rate": 0.000322324324066012, + "loss": 0.2529, + "step": 294540 + }, + { + "epoch": 12.2, + "grad_norm": 0.6484375, + "learning_rate": 0.00032231394255128525, + "loss": 0.1585, + "step": 294550 + }, + { + "epoch": 12.2, + "grad_norm": 0.62890625, + "learning_rate": 0.00032230356090047144, + "loss": 0.1759, + "step": 294560 + }, + { + "epoch": 12.2, + "grad_norm": 0.53515625, + "learning_rate": 0.00032229317911359014, + "loss": 0.2088, + "step": 294570 + }, + { + "epoch": 12.2, + "grad_norm": 0.7265625, + "learning_rate": 0.0003222827971906609, + "loss": 0.1649, + "step": 294580 + }, + { + "epoch": 12.2, + "grad_norm": 0.734375, + "learning_rate": 0.0003222724151317032, + "loss": 0.2363, + "step": 294590 + }, + { + "epoch": 12.2, + "grad_norm": 2.03125, + "learning_rate": 0.0003222620329367366, + "loss": 0.2241, + "step": 294600 + }, + { + "epoch": 12.2, + "grad_norm": 2.046875, + "learning_rate": 0.0003222516506057807, + "loss": 0.1982, + "step": 294610 + }, + { + "epoch": 12.2, + "grad_norm": 0.5546875, + "learning_rate": 0.000322241268138855, + "loss": 0.2152, + "step": 294620 + }, + { + "epoch": 12.2, + "grad_norm": 0.765625, + "learning_rate": 0.00032223088553597903, + "loss": 0.1874, + "step": 294630 + }, + { + "epoch": 12.2, + "grad_norm": 1.1796875, + "learning_rate": 0.0003222205027971723, + "loss": 0.1762, + "step": 294640 + }, + { + "epoch": 12.2, + "grad_norm": 0.60546875, + "learning_rate": 0.0003222101199224544, + "loss": 0.1804, + "step": 294650 + }, + { + "epoch": 12.2, + "grad_norm": 3.828125, + "learning_rate": 0.0003221997369118449, + "loss": 0.2024, + "step": 294660 + }, + { + "epoch": 12.21, + "grad_norm": 1.1484375, + "learning_rate": 0.0003221893537653632, + "loss": 0.224, + "step": 294670 + }, + { + "epoch": 12.21, + "grad_norm": 0.486328125, + "learning_rate": 0.000322178970483029, + "loss": 0.1623, + "step": 294680 + }, + { + "epoch": 12.21, + "grad_norm": 0.85546875, + "learning_rate": 0.0003221685870648618, + "loss": 0.2067, + "step": 294690 + }, + { + "epoch": 12.21, + "grad_norm": 1.28125, + "learning_rate": 0.0003221582035108811, + "loss": 0.1765, + "step": 294700 + }, + { + "epoch": 12.21, + "grad_norm": 0.345703125, + "learning_rate": 0.0003221478198211064, + "loss": 0.1674, + "step": 294710 + }, + { + "epoch": 12.21, + "grad_norm": 1.234375, + "learning_rate": 0.00032213743599555727, + "loss": 0.1695, + "step": 294720 + }, + { + "epoch": 12.21, + "grad_norm": 1.078125, + "learning_rate": 0.0003221270520342533, + "loss": 0.1791, + "step": 294730 + }, + { + "epoch": 12.21, + "grad_norm": 0.53125, + "learning_rate": 0.0003221166679372141, + "loss": 0.1611, + "step": 294740 + }, + { + "epoch": 12.21, + "grad_norm": 0.78515625, + "learning_rate": 0.000322106283704459, + "loss": 0.1932, + "step": 294750 + }, + { + "epoch": 12.21, + "grad_norm": 0.7421875, + "learning_rate": 0.00032209589933600774, + "loss": 0.1505, + "step": 294760 + }, + { + "epoch": 12.21, + "grad_norm": 0.27734375, + "learning_rate": 0.0003220855148318798, + "loss": 0.1514, + "step": 294770 + }, + { + "epoch": 12.21, + "grad_norm": 0.640625, + "learning_rate": 0.0003220751301920946, + "loss": 0.2053, + "step": 294780 + }, + { + "epoch": 12.21, + "grad_norm": 0.80078125, + "learning_rate": 0.00032206474541667185, + "loss": 0.1705, + "step": 294790 + }, + { + "epoch": 12.21, + "grad_norm": 0.7265625, + "learning_rate": 0.00032205436050563105, + "loss": 0.1682, + "step": 294800 + }, + { + "epoch": 12.21, + "grad_norm": 1.890625, + "learning_rate": 0.0003220439754589917, + "loss": 0.2216, + "step": 294810 + }, + { + "epoch": 12.21, + "grad_norm": 0.57421875, + "learning_rate": 0.00032203359027677337, + "loss": 0.1413, + "step": 294820 + }, + { + "epoch": 12.21, + "grad_norm": 0.7578125, + "learning_rate": 0.00032202320495899563, + "loss": 0.1879, + "step": 294830 + }, + { + "epoch": 12.21, + "grad_norm": 2.59375, + "learning_rate": 0.00032201281950567794, + "loss": 0.1535, + "step": 294840 + }, + { + "epoch": 12.21, + "grad_norm": 1.3359375, + "learning_rate": 0.00032200243391683996, + "loss": 0.2178, + "step": 294850 + }, + { + "epoch": 12.21, + "grad_norm": 0.75390625, + "learning_rate": 0.0003219920481925011, + "loss": 0.1471, + "step": 294860 + }, + { + "epoch": 12.21, + "grad_norm": 1.390625, + "learning_rate": 0.00032198166233268104, + "loss": 0.1733, + "step": 294870 + }, + { + "epoch": 12.21, + "grad_norm": 1.0625, + "learning_rate": 0.00032197127633739925, + "loss": 0.188, + "step": 294880 + }, + { + "epoch": 12.21, + "grad_norm": 0.392578125, + "learning_rate": 0.0003219608902066753, + "loss": 0.2032, + "step": 294890 + }, + { + "epoch": 12.21, + "grad_norm": 0.89453125, + "learning_rate": 0.0003219505039405288, + "loss": 0.189, + "step": 294900 + }, + { + "epoch": 12.22, + "grad_norm": 1.28125, + "learning_rate": 0.0003219401175389791, + "loss": 0.1911, + "step": 294910 + }, + { + "epoch": 12.22, + "grad_norm": 0.53125, + "learning_rate": 0.0003219297310020459, + "loss": 0.1914, + "step": 294920 + }, + { + "epoch": 12.22, + "grad_norm": 0.3828125, + "learning_rate": 0.00032191934432974873, + "loss": 0.1489, + "step": 294930 + }, + { + "epoch": 12.22, + "grad_norm": 0.984375, + "learning_rate": 0.00032190895752210705, + "loss": 0.1674, + "step": 294940 + }, + { + "epoch": 12.22, + "grad_norm": 1.0546875, + "learning_rate": 0.00032189857057914056, + "loss": 0.1749, + "step": 294950 + }, + { + "epoch": 12.22, + "grad_norm": 0.92578125, + "learning_rate": 0.0003218881835008687, + "loss": 0.2575, + "step": 294960 + }, + { + "epoch": 12.22, + "grad_norm": 0.466796875, + "learning_rate": 0.0003218777962873111, + "loss": 0.199, + "step": 294970 + }, + { + "epoch": 12.22, + "grad_norm": 0.388671875, + "learning_rate": 0.0003218674089384872, + "loss": 0.1891, + "step": 294980 + }, + { + "epoch": 12.22, + "grad_norm": 1.2734375, + "learning_rate": 0.0003218570214544165, + "loss": 0.2233, + "step": 294990 + }, + { + "epoch": 12.22, + "grad_norm": 0.7109375, + "learning_rate": 0.0003218466338351188, + "loss": 0.1704, + "step": 295000 + }, + { + "epoch": 12.22, + "grad_norm": 0.90625, + "learning_rate": 0.00032183624608061345, + "loss": 0.1925, + "step": 295010 + }, + { + "epoch": 12.22, + "grad_norm": 1.5, + "learning_rate": 0.00032182585819092, + "loss": 0.1905, + "step": 295020 + }, + { + "epoch": 12.22, + "grad_norm": 0.921875, + "learning_rate": 0.00032181547016605803, + "loss": 0.1448, + "step": 295030 + }, + { + "epoch": 12.22, + "grad_norm": 1.0546875, + "learning_rate": 0.0003218050820060472, + "loss": 0.191, + "step": 295040 + }, + { + "epoch": 12.22, + "grad_norm": 1.0078125, + "learning_rate": 0.00032179469371090684, + "loss": 0.2455, + "step": 295050 + }, + { + "epoch": 12.22, + "grad_norm": 0.55078125, + "learning_rate": 0.00032178430528065675, + "loss": 0.2349, + "step": 295060 + }, + { + "epoch": 12.22, + "grad_norm": 0.5234375, + "learning_rate": 0.0003217739167153162, + "loss": 0.1872, + "step": 295070 + }, + { + "epoch": 12.22, + "grad_norm": 0.9296875, + "learning_rate": 0.000321763528014905, + "loss": 0.1298, + "step": 295080 + }, + { + "epoch": 12.22, + "grad_norm": 0.9609375, + "learning_rate": 0.0003217531391794426, + "loss": 0.2225, + "step": 295090 + }, + { + "epoch": 12.22, + "grad_norm": 0.95703125, + "learning_rate": 0.0003217427502089484, + "loss": 0.2251, + "step": 295100 + }, + { + "epoch": 12.22, + "grad_norm": 0.859375, + "learning_rate": 0.0003217323611034422, + "loss": 0.1722, + "step": 295110 + }, + { + "epoch": 12.22, + "grad_norm": 0.671875, + "learning_rate": 0.00032172197186294344, + "loss": 0.1907, + "step": 295120 + }, + { + "epoch": 12.22, + "grad_norm": 0.388671875, + "learning_rate": 0.0003217115824874717, + "loss": 0.1775, + "step": 295130 + }, + { + "epoch": 12.22, + "grad_norm": 1.1640625, + "learning_rate": 0.0003217011929770465, + "loss": 0.19, + "step": 295140 + }, + { + "epoch": 12.23, + "grad_norm": 1.2421875, + "learning_rate": 0.00032169080333168727, + "loss": 0.2261, + "step": 295150 + }, + { + "epoch": 12.23, + "grad_norm": 0.37109375, + "learning_rate": 0.0003216804135514138, + "loss": 0.1655, + "step": 295160 + }, + { + "epoch": 12.23, + "grad_norm": 1.203125, + "learning_rate": 0.0003216700236362456, + "loss": 0.1886, + "step": 295170 + }, + { + "epoch": 12.23, + "grad_norm": 0.380859375, + "learning_rate": 0.00032165963358620197, + "loss": 0.2162, + "step": 295180 + }, + { + "epoch": 12.23, + "grad_norm": 1.1484375, + "learning_rate": 0.0003216492434013028, + "loss": 0.1912, + "step": 295190 + }, + { + "epoch": 12.23, + "grad_norm": 0.5703125, + "learning_rate": 0.0003216388530815675, + "loss": 0.1958, + "step": 295200 + }, + { + "epoch": 12.23, + "grad_norm": 0.82421875, + "learning_rate": 0.0003216284626270155, + "loss": 0.1497, + "step": 295210 + }, + { + "epoch": 12.23, + "grad_norm": 0.5625, + "learning_rate": 0.0003216180720376666, + "loss": 0.1812, + "step": 295220 + }, + { + "epoch": 12.23, + "grad_norm": 1.109375, + "learning_rate": 0.00032160768131354005, + "loss": 0.1448, + "step": 295230 + }, + { + "epoch": 12.23, + "grad_norm": 0.765625, + "learning_rate": 0.0003215972904546557, + "loss": 0.1868, + "step": 295240 + }, + { + "epoch": 12.23, + "grad_norm": 2.296875, + "learning_rate": 0.00032158689946103306, + "loss": 0.2249, + "step": 295250 + }, + { + "epoch": 12.23, + "grad_norm": 0.65625, + "learning_rate": 0.00032157650833269144, + "loss": 0.1619, + "step": 295260 + }, + { + "epoch": 12.23, + "grad_norm": 1.203125, + "learning_rate": 0.0003215661170696506, + "loss": 0.1893, + "step": 295270 + }, + { + "epoch": 12.23, + "grad_norm": 0.38671875, + "learning_rate": 0.0003215557256719301, + "loss": 0.202, + "step": 295280 + }, + { + "epoch": 12.23, + "grad_norm": 1.8671875, + "learning_rate": 0.0003215453341395494, + "loss": 0.1897, + "step": 295290 + }, + { + "epoch": 12.23, + "grad_norm": 0.76953125, + "learning_rate": 0.0003215349424725282, + "loss": 0.2018, + "step": 295300 + }, + { + "epoch": 12.23, + "grad_norm": 0.71484375, + "learning_rate": 0.0003215245506708858, + "loss": 0.1565, + "step": 295310 + }, + { + "epoch": 12.23, + "grad_norm": 0.80078125, + "learning_rate": 0.0003215141587346421, + "loss": 0.1905, + "step": 295320 + }, + { + "epoch": 12.23, + "grad_norm": 0.8359375, + "learning_rate": 0.00032150376666381636, + "loss": 0.2019, + "step": 295330 + }, + { + "epoch": 12.23, + "grad_norm": 1.1328125, + "learning_rate": 0.0003214933744584283, + "loss": 0.1513, + "step": 295340 + }, + { + "epoch": 12.23, + "grad_norm": 0.9375, + "learning_rate": 0.00032148298211849747, + "loss": 0.2452, + "step": 295350 + }, + { + "epoch": 12.23, + "grad_norm": 1.296875, + "learning_rate": 0.00032147258964404334, + "loss": 0.1813, + "step": 295360 + }, + { + "epoch": 12.23, + "grad_norm": 0.82421875, + "learning_rate": 0.0003214621970350855, + "loss": 0.1963, + "step": 295370 + }, + { + "epoch": 12.23, + "grad_norm": 0.447265625, + "learning_rate": 0.00032145180429164354, + "loss": 0.2218, + "step": 295380 + }, + { + "epoch": 12.24, + "grad_norm": 0.84765625, + "learning_rate": 0.00032144141141373696, + "loss": 0.2385, + "step": 295390 + }, + { + "epoch": 12.24, + "grad_norm": 1.1015625, + "learning_rate": 0.00032143101840138546, + "loss": 0.217, + "step": 295400 + }, + { + "epoch": 12.24, + "grad_norm": 0.7109375, + "learning_rate": 0.0003214206252546084, + "loss": 0.2134, + "step": 295410 + }, + { + "epoch": 12.24, + "grad_norm": 0.498046875, + "learning_rate": 0.00032141023197342544, + "loss": 0.2006, + "step": 295420 + }, + { + "epoch": 12.24, + "grad_norm": 1.9921875, + "learning_rate": 0.00032139983855785623, + "loss": 0.1541, + "step": 295430 + }, + { + "epoch": 12.24, + "grad_norm": 0.3515625, + "learning_rate": 0.0003213894450079201, + "loss": 0.1446, + "step": 295440 + }, + { + "epoch": 12.24, + "grad_norm": 0.0, + "learning_rate": 0.00032137905132363686, + "loss": 0.1924, + "step": 295450 + }, + { + "epoch": 12.24, + "grad_norm": 0.69921875, + "learning_rate": 0.0003213686575050259, + "loss": 0.2121, + "step": 295460 + }, + { + "epoch": 12.24, + "grad_norm": 0.388671875, + "learning_rate": 0.0003213582635521068, + "loss": 0.2177, + "step": 295470 + }, + { + "epoch": 12.24, + "grad_norm": 0.62109375, + "learning_rate": 0.00032134786946489926, + "loss": 0.1505, + "step": 295480 + }, + { + "epoch": 12.24, + "grad_norm": 0.7109375, + "learning_rate": 0.0003213374752434226, + "loss": 0.2199, + "step": 295490 + }, + { + "epoch": 12.24, + "grad_norm": 0.5390625, + "learning_rate": 0.0003213270808876966, + "loss": 0.2255, + "step": 295500 + }, + { + "epoch": 12.24, + "grad_norm": 0.81640625, + "learning_rate": 0.00032131668639774077, + "loss": 0.1521, + "step": 295510 + }, + { + "epoch": 12.24, + "grad_norm": 0.42578125, + "learning_rate": 0.0003213062917735746, + "loss": 0.1856, + "step": 295520 + }, + { + "epoch": 12.24, + "grad_norm": 0.75390625, + "learning_rate": 0.00032129589701521767, + "loss": 0.1935, + "step": 295530 + }, + { + "epoch": 12.24, + "grad_norm": 1.3125, + "learning_rate": 0.00032128550212268955, + "loss": 0.2388, + "step": 295540 + }, + { + "epoch": 12.24, + "grad_norm": 0.90234375, + "learning_rate": 0.0003212751070960098, + "loss": 0.1884, + "step": 295550 + }, + { + "epoch": 12.24, + "grad_norm": 0.80859375, + "learning_rate": 0.00032126471193519806, + "loss": 0.2247, + "step": 295560 + }, + { + "epoch": 12.24, + "grad_norm": 0.328125, + "learning_rate": 0.00032125431664027377, + "loss": 0.1808, + "step": 295570 + }, + { + "epoch": 12.24, + "grad_norm": 0.50390625, + "learning_rate": 0.00032124392121125656, + "loss": 0.1898, + "step": 295580 + }, + { + "epoch": 12.24, + "grad_norm": 0.490234375, + "learning_rate": 0.00032123352564816603, + "loss": 0.1851, + "step": 295590 + }, + { + "epoch": 12.24, + "grad_norm": 0.734375, + "learning_rate": 0.00032122312995102166, + "loss": 0.2157, + "step": 295600 + }, + { + "epoch": 12.24, + "grad_norm": 0.578125, + "learning_rate": 0.00032121273411984307, + "loss": 0.1329, + "step": 295610 + }, + { + "epoch": 12.24, + "grad_norm": 1.25, + "learning_rate": 0.00032120233815464977, + "loss": 0.2667, + "step": 295620 + }, + { + "epoch": 12.24, + "grad_norm": 1.7578125, + "learning_rate": 0.0003211919420554614, + "loss": 0.1842, + "step": 295630 + }, + { + "epoch": 12.25, + "grad_norm": 2.28125, + "learning_rate": 0.0003211815458222975, + "loss": 0.1699, + "step": 295640 + }, + { + "epoch": 12.25, + "grad_norm": 0.56640625, + "learning_rate": 0.00032117114945517754, + "loss": 0.1898, + "step": 295650 + }, + { + "epoch": 12.25, + "grad_norm": 0.578125, + "learning_rate": 0.00032116075295412117, + "loss": 0.2288, + "step": 295660 + }, + { + "epoch": 12.25, + "grad_norm": 1.328125, + "learning_rate": 0.00032115035631914804, + "loss": 0.1661, + "step": 295670 + }, + { + "epoch": 12.25, + "grad_norm": 1.1953125, + "learning_rate": 0.0003211399595502776, + "loss": 0.2003, + "step": 295680 + }, + { + "epoch": 12.25, + "grad_norm": 1.1953125, + "learning_rate": 0.00032112956264752934, + "loss": 0.1962, + "step": 295690 + }, + { + "epoch": 12.25, + "grad_norm": 0.5625, + "learning_rate": 0.00032111916561092295, + "loss": 0.1578, + "step": 295700 + }, + { + "epoch": 12.25, + "grad_norm": 0.39453125, + "learning_rate": 0.00032110876844047804, + "loss": 0.2473, + "step": 295710 + }, + { + "epoch": 12.25, + "grad_norm": 1.5859375, + "learning_rate": 0.0003210983711362141, + "loss": 0.198, + "step": 295720 + }, + { + "epoch": 12.25, + "grad_norm": 0.3125, + "learning_rate": 0.00032108797369815066, + "loss": 0.1765, + "step": 295730 + }, + { + "epoch": 12.25, + "grad_norm": 0.5234375, + "learning_rate": 0.0003210775761263073, + "loss": 0.182, + "step": 295740 + }, + { + "epoch": 12.25, + "grad_norm": 2.296875, + "learning_rate": 0.0003210671784207037, + "loss": 0.2323, + "step": 295750 + }, + { + "epoch": 12.25, + "grad_norm": 1.328125, + "learning_rate": 0.0003210567805813593, + "loss": 0.2042, + "step": 295760 + }, + { + "epoch": 12.25, + "grad_norm": 0.462890625, + "learning_rate": 0.00032104638260829375, + "loss": 0.2108, + "step": 295770 + }, + { + "epoch": 12.25, + "grad_norm": 0.8125, + "learning_rate": 0.0003210359845015266, + "loss": 0.1462, + "step": 295780 + }, + { + "epoch": 12.25, + "grad_norm": 0.65625, + "learning_rate": 0.0003210255862610773, + "loss": 0.2193, + "step": 295790 + }, + { + "epoch": 12.25, + "grad_norm": 3.46875, + "learning_rate": 0.0003210151878869656, + "loss": 0.1778, + "step": 295800 + }, + { + "epoch": 12.25, + "grad_norm": 1.078125, + "learning_rate": 0.000321004789379211, + "loss": 0.2217, + "step": 295810 + }, + { + "epoch": 12.25, + "grad_norm": 0.39453125, + "learning_rate": 0.000320994390737833, + "loss": 0.1654, + "step": 295820 + }, + { + "epoch": 12.25, + "grad_norm": 0.55859375, + "learning_rate": 0.00032098399196285116, + "loss": 0.1628, + "step": 295830 + }, + { + "epoch": 12.25, + "grad_norm": 0.75, + "learning_rate": 0.00032097359305428526, + "loss": 0.2176, + "step": 295840 + }, + { + "epoch": 12.25, + "grad_norm": 1.125, + "learning_rate": 0.0003209631940121546, + "loss": 0.1936, + "step": 295850 + }, + { + "epoch": 12.25, + "grad_norm": 0.40234375, + "learning_rate": 0.0003209527948364789, + "loss": 0.1864, + "step": 295860 + }, + { + "epoch": 12.25, + "grad_norm": 0.53125, + "learning_rate": 0.0003209423955272777, + "loss": 0.1979, + "step": 295870 + }, + { + "epoch": 12.26, + "grad_norm": 1.3515625, + "learning_rate": 0.0003209319960845706, + "loss": 0.1834, + "step": 295880 + }, + { + "epoch": 12.26, + "grad_norm": 1.5390625, + "learning_rate": 0.0003209215965083772, + "loss": 0.2144, + "step": 295890 + }, + { + "epoch": 12.26, + "grad_norm": 0.734375, + "learning_rate": 0.0003209111967987169, + "loss": 0.1835, + "step": 295900 + }, + { + "epoch": 12.26, + "grad_norm": 1.265625, + "learning_rate": 0.00032090079695560946, + "loss": 0.1841, + "step": 295910 + }, + { + "epoch": 12.26, + "grad_norm": 0.9921875, + "learning_rate": 0.0003208903969790744, + "loss": 0.2039, + "step": 295920 + }, + { + "epoch": 12.26, + "grad_norm": 0.8984375, + "learning_rate": 0.00032087999686913123, + "loss": 0.2131, + "step": 295930 + }, + { + "epoch": 12.26, + "grad_norm": 0.96484375, + "learning_rate": 0.0003208695966257995, + "loss": 0.1386, + "step": 295940 + }, + { + "epoch": 12.26, + "grad_norm": 0.7578125, + "learning_rate": 0.0003208591962490989, + "loss": 0.2521, + "step": 295950 + }, + { + "epoch": 12.26, + "grad_norm": 0.5703125, + "learning_rate": 0.0003208487957390489, + "loss": 0.2043, + "step": 295960 + }, + { + "epoch": 12.26, + "grad_norm": 0.50390625, + "learning_rate": 0.0003208383950956693, + "loss": 0.1817, + "step": 295970 + }, + { + "epoch": 12.26, + "grad_norm": 0.447265625, + "learning_rate": 0.00032082799431897924, + "loss": 0.1977, + "step": 295980 + }, + { + "epoch": 12.26, + "grad_norm": 0.96484375, + "learning_rate": 0.0003208175934089987, + "loss": 0.2036, + "step": 295990 + }, + { + "epoch": 12.26, + "grad_norm": 1.3046875, + "learning_rate": 0.0003208071923657471, + "loss": 0.2079, + "step": 296000 + }, + { + "epoch": 12.26, + "grad_norm": 1.2109375, + "learning_rate": 0.0003207967911892439, + "loss": 0.2013, + "step": 296010 + }, + { + "epoch": 12.26, + "grad_norm": 0.314453125, + "learning_rate": 0.0003207863898795088, + "loss": 0.1848, + "step": 296020 + }, + { + "epoch": 12.26, + "grad_norm": 0.8515625, + "learning_rate": 0.0003207759884365614, + "loss": 0.1754, + "step": 296030 + }, + { + "epoch": 12.26, + "grad_norm": 0.9609375, + "learning_rate": 0.00032076558686042124, + "loss": 0.1977, + "step": 296040 + }, + { + "epoch": 12.26, + "grad_norm": 0.451171875, + "learning_rate": 0.0003207551851511079, + "loss": 0.2072, + "step": 296050 + }, + { + "epoch": 12.26, + "grad_norm": 0.4765625, + "learning_rate": 0.0003207447833086409, + "loss": 0.2007, + "step": 296060 + }, + { + "epoch": 12.26, + "grad_norm": 0.8203125, + "learning_rate": 0.0003207343813330399, + "loss": 0.2646, + "step": 296070 + }, + { + "epoch": 12.26, + "grad_norm": 0.671875, + "learning_rate": 0.0003207239792243245, + "loss": 0.1612, + "step": 296080 + }, + { + "epoch": 12.26, + "grad_norm": 0.474609375, + "learning_rate": 0.00032071357698251404, + "loss": 0.2363, + "step": 296090 + }, + { + "epoch": 12.26, + "grad_norm": 1.3671875, + "learning_rate": 0.00032070317460762836, + "loss": 0.2291, + "step": 296100 + }, + { + "epoch": 12.26, + "grad_norm": 0.71875, + "learning_rate": 0.00032069277209968695, + "loss": 0.1917, + "step": 296110 + }, + { + "epoch": 12.27, + "grad_norm": 0.91015625, + "learning_rate": 0.0003206823694587093, + "loss": 0.2112, + "step": 296120 + }, + { + "epoch": 12.27, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003206719666847152, + "loss": 0.1826, + "step": 296130 + }, + { + "epoch": 12.27, + "grad_norm": 0.7421875, + "learning_rate": 0.00032066156377772397, + "loss": 0.1834, + "step": 296140 + }, + { + "epoch": 12.27, + "grad_norm": 0.49609375, + "learning_rate": 0.00032065116073775533, + "loss": 0.1827, + "step": 296150 + }, + { + "epoch": 12.27, + "grad_norm": 0.77734375, + "learning_rate": 0.0003206407575648289, + "loss": 0.1693, + "step": 296160 + }, + { + "epoch": 12.27, + "grad_norm": 2.765625, + "learning_rate": 0.0003206303542589641, + "loss": 0.1415, + "step": 296170 + }, + { + "epoch": 12.27, + "grad_norm": 1.390625, + "learning_rate": 0.0003206199508201807, + "loss": 0.1866, + "step": 296180 + }, + { + "epoch": 12.27, + "grad_norm": 0.9140625, + "learning_rate": 0.0003206095472484981, + "loss": 0.1659, + "step": 296190 + }, + { + "epoch": 12.27, + "grad_norm": 0.59765625, + "learning_rate": 0.000320599143543936, + "loss": 0.2148, + "step": 296200 + }, + { + "epoch": 12.27, + "grad_norm": 1.1328125, + "learning_rate": 0.00032058873970651394, + "loss": 0.206, + "step": 296210 + }, + { + "epoch": 12.27, + "grad_norm": 0.4765625, + "learning_rate": 0.00032057833573625143, + "loss": 0.2248, + "step": 296220 + }, + { + "epoch": 12.27, + "grad_norm": 1.1640625, + "learning_rate": 0.0003205679316331682, + "loss": 0.2119, + "step": 296230 + }, + { + "epoch": 12.27, + "grad_norm": 0.94921875, + "learning_rate": 0.00032055752739728374, + "loss": 0.214, + "step": 296240 + }, + { + "epoch": 12.27, + "grad_norm": 0.84375, + "learning_rate": 0.0003205471230286175, + "loss": 0.2101, + "step": 296250 + }, + { + "epoch": 12.27, + "grad_norm": 0.515625, + "learning_rate": 0.00032053671852718936, + "loss": 0.1668, + "step": 296260 + }, + { + "epoch": 12.27, + "grad_norm": 0.31640625, + "learning_rate": 0.00032052631389301863, + "loss": 0.1812, + "step": 296270 + }, + { + "epoch": 12.27, + "grad_norm": 0.60546875, + "learning_rate": 0.00032051590912612497, + "loss": 0.148, + "step": 296280 + }, + { + "epoch": 12.27, + "grad_norm": 0.73046875, + "learning_rate": 0.0003205055042265281, + "loss": 0.1674, + "step": 296290 + }, + { + "epoch": 12.27, + "grad_norm": 3.375, + "learning_rate": 0.00032049509919424734, + "loss": 0.2299, + "step": 296300 + }, + { + "epoch": 12.27, + "grad_norm": 0.56640625, + "learning_rate": 0.00032048469402930256, + "loss": 0.2445, + "step": 296310 + }, + { + "epoch": 12.27, + "grad_norm": 1.09375, + "learning_rate": 0.0003204742887317132, + "loss": 0.1941, + "step": 296320 + }, + { + "epoch": 12.27, + "grad_norm": 0.94921875, + "learning_rate": 0.0003204638833014987, + "loss": 0.1942, + "step": 296330 + }, + { + "epoch": 12.27, + "grad_norm": 0.74609375, + "learning_rate": 0.0003204534777386789, + "loss": 0.2044, + "step": 296340 + }, + { + "epoch": 12.27, + "grad_norm": 1.140625, + "learning_rate": 0.00032044307204327316, + "loss": 0.1615, + "step": 296350 + }, + { + "epoch": 12.28, + "grad_norm": 0.392578125, + "learning_rate": 0.00032043266621530124, + "loss": 0.1521, + "step": 296360 + }, + { + "epoch": 12.28, + "grad_norm": 0.392578125, + "learning_rate": 0.0003204222602547827, + "loss": 0.1294, + "step": 296370 + }, + { + "epoch": 12.28, + "grad_norm": 0.314453125, + "learning_rate": 0.0003204118541617369, + "loss": 0.194, + "step": 296380 + }, + { + "epoch": 12.28, + "grad_norm": 0.8125, + "learning_rate": 0.00032040144793618375, + "loss": 0.1571, + "step": 296390 + }, + { + "epoch": 12.28, + "grad_norm": 1.1328125, + "learning_rate": 0.0003203910415781426, + "loss": 0.1528, + "step": 296400 + }, + { + "epoch": 12.28, + "grad_norm": 1.203125, + "learning_rate": 0.00032038063508763314, + "loss": 0.1877, + "step": 296410 + }, + { + "epoch": 12.28, + "grad_norm": 1.46875, + "learning_rate": 0.00032037022846467496, + "loss": 0.1835, + "step": 296420 + }, + { + "epoch": 12.28, + "grad_norm": 1.09375, + "learning_rate": 0.00032035982170928757, + "loss": 0.1578, + "step": 296430 + }, + { + "epoch": 12.28, + "grad_norm": 1.140625, + "learning_rate": 0.0003203494148214906, + "loss": 0.2195, + "step": 296440 + }, + { + "epoch": 12.28, + "grad_norm": 1.0078125, + "learning_rate": 0.00032033900780130365, + "loss": 0.2002, + "step": 296450 + }, + { + "epoch": 12.28, + "grad_norm": 1.3125, + "learning_rate": 0.00032032860064874617, + "loss": 0.1175, + "step": 296460 + }, + { + "epoch": 12.28, + "grad_norm": 1.0625, + "learning_rate": 0.000320318193363838, + "loss": 0.2047, + "step": 296470 + }, + { + "epoch": 12.28, + "grad_norm": 1.21875, + "learning_rate": 0.00032030778594659853, + "loss": 0.2029, + "step": 296480 + }, + { + "epoch": 12.28, + "grad_norm": 1.0546875, + "learning_rate": 0.00032029737839704735, + "loss": 0.2086, + "step": 296490 + }, + { + "epoch": 12.28, + "grad_norm": 1.3046875, + "learning_rate": 0.00032028697071520427, + "loss": 0.1813, + "step": 296500 + }, + { + "epoch": 12.28, + "grad_norm": 0.97265625, + "learning_rate": 0.0003202765629010885, + "loss": 0.2045, + "step": 296510 + }, + { + "epoch": 12.28, + "grad_norm": 0.330078125, + "learning_rate": 0.0003202661549547199, + "loss": 0.1759, + "step": 296520 + }, + { + "epoch": 12.28, + "grad_norm": 0.98828125, + "learning_rate": 0.000320255746876118, + "loss": 0.169, + "step": 296530 + }, + { + "epoch": 12.28, + "grad_norm": 0.4296875, + "learning_rate": 0.0003202453386653024, + "loss": 0.2134, + "step": 296540 + }, + { + "epoch": 12.28, + "grad_norm": 0.84375, + "learning_rate": 0.00032023493032229253, + "loss": 0.17, + "step": 296550 + }, + { + "epoch": 12.28, + "grad_norm": 1.0859375, + "learning_rate": 0.00032022452184710825, + "loss": 0.1759, + "step": 296560 + }, + { + "epoch": 12.28, + "grad_norm": 0.38671875, + "learning_rate": 0.0003202141132397689, + "loss": 0.1619, + "step": 296570 + }, + { + "epoch": 12.28, + "grad_norm": 0.380859375, + "learning_rate": 0.0003202037045002943, + "loss": 0.205, + "step": 296580 + }, + { + "epoch": 12.28, + "grad_norm": 0.84765625, + "learning_rate": 0.0003201932956287038, + "loss": 0.1923, + "step": 296590 + }, + { + "epoch": 12.29, + "grad_norm": 0.94921875, + "learning_rate": 0.0003201828866250171, + "loss": 0.1949, + "step": 296600 + }, + { + "epoch": 12.29, + "grad_norm": 1.0234375, + "learning_rate": 0.0003201724774892539, + "loss": 0.2065, + "step": 296610 + }, + { + "epoch": 12.29, + "grad_norm": 1.046875, + "learning_rate": 0.0003201620682214335, + "loss": 0.1249, + "step": 296620 + }, + { + "epoch": 12.29, + "grad_norm": 0.828125, + "learning_rate": 0.0003201516588215758, + "loss": 0.2043, + "step": 296630 + }, + { + "epoch": 12.29, + "grad_norm": 0.72265625, + "learning_rate": 0.0003201412492897002, + "loss": 0.1901, + "step": 296640 + }, + { + "epoch": 12.29, + "grad_norm": 0.9375, + "learning_rate": 0.00032013083962582634, + "loss": 0.2247, + "step": 296650 + }, + { + "epoch": 12.29, + "grad_norm": 0.82421875, + "learning_rate": 0.0003201204298299739, + "loss": 0.203, + "step": 296660 + }, + { + "epoch": 12.29, + "grad_norm": 0.439453125, + "learning_rate": 0.00032011001990216224, + "loss": 0.1789, + "step": 296670 + }, + { + "epoch": 12.29, + "grad_norm": 0.67578125, + "learning_rate": 0.0003200996098424112, + "loss": 0.1688, + "step": 296680 + }, + { + "epoch": 12.29, + "grad_norm": 1.8203125, + "learning_rate": 0.00032008919965074024, + "loss": 0.164, + "step": 296690 + }, + { + "epoch": 12.29, + "grad_norm": 0.94921875, + "learning_rate": 0.0003200787893271689, + "loss": 0.1652, + "step": 296700 + }, + { + "epoch": 12.29, + "grad_norm": 0.9921875, + "learning_rate": 0.0003200683788717169, + "loss": 0.1899, + "step": 296710 + }, + { + "epoch": 12.29, + "grad_norm": 1.1953125, + "learning_rate": 0.00032005796828440383, + "loss": 0.2319, + "step": 296720 + }, + { + "epoch": 12.29, + "grad_norm": 0.12451171875, + "learning_rate": 0.00032004755756524913, + "loss": 0.1862, + "step": 296730 + }, + { + "epoch": 12.29, + "grad_norm": 0.75390625, + "learning_rate": 0.0003200371467142726, + "loss": 0.1984, + "step": 296740 + }, + { + "epoch": 12.29, + "grad_norm": 1.2890625, + "learning_rate": 0.00032002673573149363, + "loss": 0.1872, + "step": 296750 + }, + { + "epoch": 12.29, + "grad_norm": 0.73828125, + "learning_rate": 0.0003200163246169319, + "loss": 0.1795, + "step": 296760 + }, + { + "epoch": 12.29, + "grad_norm": 0.0, + "learning_rate": 0.0003200059133706071, + "loss": 0.1987, + "step": 296770 + }, + { + "epoch": 12.29, + "grad_norm": 2.640625, + "learning_rate": 0.00031999550199253863, + "loss": 0.1911, + "step": 296780 + }, + { + "epoch": 12.29, + "grad_norm": 0.91796875, + "learning_rate": 0.00031998509048274625, + "loss": 0.1359, + "step": 296790 + }, + { + "epoch": 12.29, + "grad_norm": 0.40625, + "learning_rate": 0.00031997467884124954, + "loss": 0.2146, + "step": 296800 + }, + { + "epoch": 12.29, + "grad_norm": 1.0234375, + "learning_rate": 0.0003199642670680679, + "loss": 0.1994, + "step": 296810 + }, + { + "epoch": 12.29, + "grad_norm": 3.0625, + "learning_rate": 0.0003199538551632212, + "loss": 0.2296, + "step": 296820 + }, + { + "epoch": 12.29, + "grad_norm": 0.546875, + "learning_rate": 0.00031994344312672883, + "loss": 0.1708, + "step": 296830 + }, + { + "epoch": 12.3, + "grad_norm": 1.3046875, + "learning_rate": 0.00031993303095861046, + "loss": 0.2112, + "step": 296840 + }, + { + "epoch": 12.3, + "grad_norm": 0.6171875, + "learning_rate": 0.00031992261865888567, + "loss": 0.2378, + "step": 296850 + }, + { + "epoch": 12.3, + "grad_norm": 0.7890625, + "learning_rate": 0.0003199122062275741, + "loss": 0.2182, + "step": 296860 + }, + { + "epoch": 12.3, + "grad_norm": 1.0625, + "learning_rate": 0.0003199017936646953, + "loss": 0.198, + "step": 296870 + }, + { + "epoch": 12.3, + "grad_norm": 0.98828125, + "learning_rate": 0.00031989138097026883, + "loss": 0.1999, + "step": 296880 + }, + { + "epoch": 12.3, + "grad_norm": 0.54296875, + "learning_rate": 0.0003198809681443143, + "loss": 0.2042, + "step": 296890 + }, + { + "epoch": 12.3, + "grad_norm": 0.50390625, + "learning_rate": 0.0003198705551868515, + "loss": 0.1987, + "step": 296900 + }, + { + "epoch": 12.3, + "grad_norm": 0.51953125, + "learning_rate": 0.00031986014209789973, + "loss": 0.1741, + "step": 296910 + }, + { + "epoch": 12.3, + "grad_norm": 0.65234375, + "learning_rate": 0.00031984972887747864, + "loss": 0.2122, + "step": 296920 + }, + { + "epoch": 12.3, + "grad_norm": 0.44921875, + "learning_rate": 0.000319839315525608, + "loss": 0.152, + "step": 296930 + }, + { + "epoch": 12.3, + "grad_norm": 1.3984375, + "learning_rate": 0.0003198289020423073, + "loss": 0.181, + "step": 296940 + }, + { + "epoch": 12.3, + "grad_norm": 0.8984375, + "learning_rate": 0.0003198184884275962, + "loss": 0.1727, + "step": 296950 + }, + { + "epoch": 12.3, + "grad_norm": 0.73046875, + "learning_rate": 0.0003198080746814942, + "loss": 0.1935, + "step": 296960 + }, + { + "epoch": 12.3, + "grad_norm": 1.0703125, + "learning_rate": 0.00031979766080402096, + "loss": 0.1478, + "step": 296970 + }, + { + "epoch": 12.3, + "grad_norm": 1.875, + "learning_rate": 0.0003197872467951961, + "loss": 0.2009, + "step": 296980 + }, + { + "epoch": 12.3, + "grad_norm": 0.76171875, + "learning_rate": 0.00031977683265503896, + "loss": 0.2257, + "step": 296990 + }, + { + "epoch": 12.3, + "grad_norm": 0.68359375, + "learning_rate": 0.0003197664183835696, + "loss": 0.2266, + "step": 297000 + }, + { + "epoch": 12.3, + "grad_norm": 0.61328125, + "learning_rate": 0.0003197560039808073, + "loss": 0.1782, + "step": 297010 + }, + { + "epoch": 12.3, + "grad_norm": 2.515625, + "learning_rate": 0.0003197455894467717, + "loss": 0.1873, + "step": 297020 + }, + { + "epoch": 12.3, + "grad_norm": 1.09375, + "learning_rate": 0.0003197351747814825, + "loss": 0.1806, + "step": 297030 + }, + { + "epoch": 12.3, + "grad_norm": 0.65625, + "learning_rate": 0.0003197247599849592, + "loss": 0.1595, + "step": 297040 + }, + { + "epoch": 12.3, + "grad_norm": 0.73046875, + "learning_rate": 0.00031971434505722137, + "loss": 0.2269, + "step": 297050 + }, + { + "epoch": 12.3, + "grad_norm": 1.7734375, + "learning_rate": 0.00031970392999828873, + "loss": 0.1749, + "step": 297060 + }, + { + "epoch": 12.3, + "grad_norm": 0.6875, + "learning_rate": 0.00031969351480818075, + "loss": 0.1796, + "step": 297070 + }, + { + "epoch": 12.31, + "grad_norm": 1.0390625, + "learning_rate": 0.0003196830994869172, + "loss": 0.1919, + "step": 297080 + }, + { + "epoch": 12.31, + "grad_norm": 1.9453125, + "learning_rate": 0.00031967268403451757, + "loss": 0.1654, + "step": 297090 + }, + { + "epoch": 12.31, + "grad_norm": 0.56640625, + "learning_rate": 0.00031966226845100146, + "loss": 0.1749, + "step": 297100 + }, + { + "epoch": 12.31, + "grad_norm": 0.2333984375, + "learning_rate": 0.00031965185273638845, + "loss": 0.1824, + "step": 297110 + }, + { + "epoch": 12.31, + "grad_norm": 0.33203125, + "learning_rate": 0.0003196414368906982, + "loss": 0.1923, + "step": 297120 + }, + { + "epoch": 12.31, + "grad_norm": 0.83203125, + "learning_rate": 0.0003196310209139503, + "loss": 0.1946, + "step": 297130 + }, + { + "epoch": 12.31, + "grad_norm": 1.7421875, + "learning_rate": 0.0003196206048061643, + "loss": 0.195, + "step": 297140 + }, + { + "epoch": 12.31, + "grad_norm": 0.78125, + "learning_rate": 0.00031961018856735987, + "loss": 0.1865, + "step": 297150 + }, + { + "epoch": 12.31, + "grad_norm": 0.65234375, + "learning_rate": 0.0003195997721975566, + "loss": 0.1935, + "step": 297160 + }, + { + "epoch": 12.31, + "grad_norm": 1.015625, + "learning_rate": 0.00031958935569677407, + "loss": 0.2393, + "step": 297170 + }, + { + "epoch": 12.31, + "grad_norm": 0.7578125, + "learning_rate": 0.00031957893906503184, + "loss": 0.2197, + "step": 297180 + }, + { + "epoch": 12.31, + "grad_norm": 0.58984375, + "learning_rate": 0.00031956852230234954, + "loss": 0.1973, + "step": 297190 + }, + { + "epoch": 12.31, + "grad_norm": 1.8203125, + "learning_rate": 0.00031955810540874684, + "loss": 0.2072, + "step": 297200 + }, + { + "epoch": 12.31, + "grad_norm": 0.68359375, + "learning_rate": 0.0003195476883842433, + "loss": 0.1668, + "step": 297210 + }, + { + "epoch": 12.31, + "grad_norm": 1.125, + "learning_rate": 0.00031953727122885855, + "loss": 0.2296, + "step": 297220 + }, + { + "epoch": 12.31, + "grad_norm": 0.96875, + "learning_rate": 0.0003195268539426121, + "loss": 0.1432, + "step": 297230 + }, + { + "epoch": 12.31, + "grad_norm": 0.921875, + "learning_rate": 0.0003195164365255237, + "loss": 0.165, + "step": 297240 + }, + { + "epoch": 12.31, + "grad_norm": 1.0703125, + "learning_rate": 0.0003195060189776128, + "loss": 0.1608, + "step": 297250 + }, + { + "epoch": 12.31, + "grad_norm": 0.765625, + "learning_rate": 0.0003194956012988991, + "loss": 0.2128, + "step": 297260 + }, + { + "epoch": 12.31, + "grad_norm": 1.4140625, + "learning_rate": 0.00031948518348940216, + "loss": 0.2076, + "step": 297270 + }, + { + "epoch": 12.31, + "grad_norm": 1.0, + "learning_rate": 0.0003194747655491417, + "loss": 0.2379, + "step": 297280 + }, + { + "epoch": 12.31, + "grad_norm": 0.83203125, + "learning_rate": 0.00031946434747813713, + "loss": 0.2119, + "step": 297290 + }, + { + "epoch": 12.31, + "grad_norm": 0.96484375, + "learning_rate": 0.0003194539292764082, + "loss": 0.2411, + "step": 297300 + }, + { + "epoch": 12.31, + "grad_norm": 1.75, + "learning_rate": 0.00031944351094397445, + "loss": 0.2085, + "step": 297310 + }, + { + "epoch": 12.31, + "grad_norm": 0.3359375, + "learning_rate": 0.0003194330924808556, + "loss": 0.1918, + "step": 297320 + }, + { + "epoch": 12.32, + "grad_norm": 0.97265625, + "learning_rate": 0.00031942267388707107, + "loss": 0.1902, + "step": 297330 + }, + { + "epoch": 12.32, + "grad_norm": 1.4921875, + "learning_rate": 0.0003194122551626406, + "loss": 0.2069, + "step": 297340 + }, + { + "epoch": 12.32, + "grad_norm": 0.90234375, + "learning_rate": 0.00031940183630758376, + "loss": 0.1929, + "step": 297350 + }, + { + "epoch": 12.32, + "grad_norm": 0.90234375, + "learning_rate": 0.00031939141732192016, + "loss": 0.1315, + "step": 297360 + }, + { + "epoch": 12.32, + "grad_norm": 0.1630859375, + "learning_rate": 0.0003193809982056694, + "loss": 0.1902, + "step": 297370 + }, + { + "epoch": 12.32, + "grad_norm": 0.98828125, + "learning_rate": 0.00031937057895885116, + "loss": 0.1876, + "step": 297380 + }, + { + "epoch": 12.32, + "grad_norm": 0.73828125, + "learning_rate": 0.00031936015958148483, + "loss": 0.1915, + "step": 297390 + }, + { + "epoch": 12.32, + "grad_norm": 0.66015625, + "learning_rate": 0.0003193497400735903, + "loss": 0.227, + "step": 297400 + }, + { + "epoch": 12.32, + "grad_norm": 0.87109375, + "learning_rate": 0.00031933932043518704, + "loss": 0.149, + "step": 297410 + }, + { + "epoch": 12.32, + "grad_norm": 1.1640625, + "learning_rate": 0.0003193289006662946, + "loss": 0.1918, + "step": 297420 + }, + { + "epoch": 12.32, + "grad_norm": 0.0, + "learning_rate": 0.0003193184807669327, + "loss": 0.1463, + "step": 297430 + }, + { + "epoch": 12.32, + "grad_norm": 1.6328125, + "learning_rate": 0.0003193080607371209, + "loss": 0.1855, + "step": 297440 + }, + { + "epoch": 12.32, + "grad_norm": 0.6640625, + "learning_rate": 0.00031929764057687883, + "loss": 0.1777, + "step": 297450 + }, + { + "epoch": 12.32, + "grad_norm": 1.40625, + "learning_rate": 0.000319287220286226, + "loss": 0.1765, + "step": 297460 + }, + { + "epoch": 12.32, + "grad_norm": 0.328125, + "learning_rate": 0.00031927679986518224, + "loss": 0.1853, + "step": 297470 + }, + { + "epoch": 12.32, + "grad_norm": 0.6328125, + "learning_rate": 0.0003192663793137669, + "loss": 0.1799, + "step": 297480 + }, + { + "epoch": 12.32, + "grad_norm": 1.2890625, + "learning_rate": 0.0003192559586319998, + "loss": 0.1795, + "step": 297490 + }, + { + "epoch": 12.32, + "grad_norm": 0.400390625, + "learning_rate": 0.00031924553781990044, + "loss": 0.228, + "step": 297500 + }, + { + "epoch": 12.32, + "grad_norm": 1.015625, + "learning_rate": 0.0003192351168774884, + "loss": 0.1946, + "step": 297510 + }, + { + "epoch": 12.32, + "grad_norm": 0.81640625, + "learning_rate": 0.0003192246958047835, + "loss": 0.1646, + "step": 297520 + }, + { + "epoch": 12.32, + "grad_norm": 0.765625, + "learning_rate": 0.00031921427460180506, + "loss": 0.2028, + "step": 297530 + }, + { + "epoch": 12.32, + "grad_norm": 0.71484375, + "learning_rate": 0.0003192038532685728, + "loss": 0.2079, + "step": 297540 + }, + { + "epoch": 12.32, + "grad_norm": 0.54296875, + "learning_rate": 0.00031919343180510643, + "loss": 0.2108, + "step": 297550 + }, + { + "epoch": 12.32, + "grad_norm": 2.125, + "learning_rate": 0.00031918301021142547, + "loss": 0.2249, + "step": 297560 + }, + { + "epoch": 12.33, + "grad_norm": 1.1328125, + "learning_rate": 0.00031917258848754965, + "loss": 0.2072, + "step": 297570 + }, + { + "epoch": 12.33, + "grad_norm": 1.265625, + "learning_rate": 0.0003191621666334984, + "loss": 0.2038, + "step": 297580 + }, + { + "epoch": 12.33, + "grad_norm": 0.74609375, + "learning_rate": 0.00031915174464929144, + "loss": 0.1355, + "step": 297590 + }, + { + "epoch": 12.33, + "grad_norm": 1.3203125, + "learning_rate": 0.0003191413225349484, + "loss": 0.242, + "step": 297600 + }, + { + "epoch": 12.33, + "grad_norm": 0.5546875, + "learning_rate": 0.00031913090029048874, + "loss": 0.2077, + "step": 297610 + }, + { + "epoch": 12.33, + "grad_norm": 0.62109375, + "learning_rate": 0.0003191204779159323, + "loss": 0.2361, + "step": 297620 + }, + { + "epoch": 12.33, + "grad_norm": 1.0, + "learning_rate": 0.0003191100554112985, + "loss": 0.2484, + "step": 297630 + }, + { + "epoch": 12.33, + "grad_norm": 0.87890625, + "learning_rate": 0.0003190996327766071, + "loss": 0.2129, + "step": 297640 + }, + { + "epoch": 12.33, + "grad_norm": 0.75390625, + "learning_rate": 0.0003190892100118777, + "loss": 0.2119, + "step": 297650 + }, + { + "epoch": 12.33, + "grad_norm": 0.51171875, + "learning_rate": 0.0003190787871171298, + "loss": 0.1784, + "step": 297660 + }, + { + "epoch": 12.33, + "grad_norm": 1.109375, + "learning_rate": 0.000319068364092383, + "loss": 0.2352, + "step": 297670 + }, + { + "epoch": 12.33, + "grad_norm": 1.1875, + "learning_rate": 0.00031905794093765714, + "loss": 0.1915, + "step": 297680 + }, + { + "epoch": 12.33, + "grad_norm": 0.68359375, + "learning_rate": 0.00031904751765297163, + "loss": 0.1917, + "step": 297690 + }, + { + "epoch": 12.33, + "grad_norm": 0.921875, + "learning_rate": 0.00031903709423834623, + "loss": 0.1875, + "step": 297700 + }, + { + "epoch": 12.33, + "grad_norm": 1.875, + "learning_rate": 0.0003190266706938004, + "loss": 0.1878, + "step": 297710 + }, + { + "epoch": 12.33, + "grad_norm": 0.9453125, + "learning_rate": 0.0003190162470193538, + "loss": 0.1524, + "step": 297720 + }, + { + "epoch": 12.33, + "grad_norm": 0.84375, + "learning_rate": 0.0003190058232150261, + "loss": 0.224, + "step": 297730 + }, + { + "epoch": 12.33, + "grad_norm": 0.69921875, + "learning_rate": 0.00031899539928083694, + "loss": 0.2023, + "step": 297740 + }, + { + "epoch": 12.33, + "grad_norm": 1.078125, + "learning_rate": 0.0003189849752168059, + "loss": 0.2213, + "step": 297750 + }, + { + "epoch": 12.33, + "grad_norm": 0.47265625, + "learning_rate": 0.00031897455102295255, + "loss": 0.1685, + "step": 297760 + }, + { + "epoch": 12.33, + "grad_norm": 1.359375, + "learning_rate": 0.00031896412669929643, + "loss": 0.2243, + "step": 297770 + }, + { + "epoch": 12.33, + "grad_norm": 1.34375, + "learning_rate": 0.0003189537022458574, + "loss": 0.1627, + "step": 297780 + }, + { + "epoch": 12.33, + "grad_norm": 0.921875, + "learning_rate": 0.00031894327766265494, + "loss": 0.195, + "step": 297790 + }, + { + "epoch": 12.33, + "grad_norm": 0.92578125, + "learning_rate": 0.00031893285294970865, + "loss": 0.239, + "step": 297800 + }, + { + "epoch": 12.34, + "grad_norm": 0.8125, + "learning_rate": 0.0003189224281070382, + "loss": 0.1736, + "step": 297810 + }, + { + "epoch": 12.34, + "grad_norm": 0.88671875, + "learning_rate": 0.00031891200313466313, + "loss": 0.1601, + "step": 297820 + }, + { + "epoch": 12.34, + "grad_norm": 0.51171875, + "learning_rate": 0.00031890157803260324, + "loss": 0.1644, + "step": 297830 + }, + { + "epoch": 12.34, + "grad_norm": 0.578125, + "learning_rate": 0.00031889115280087793, + "loss": 0.1872, + "step": 297840 + }, + { + "epoch": 12.34, + "grad_norm": 2.890625, + "learning_rate": 0.00031888072743950686, + "loss": 0.174, + "step": 297850 + }, + { + "epoch": 12.34, + "grad_norm": 0.5, + "learning_rate": 0.0003188703019485097, + "loss": 0.1627, + "step": 297860 + }, + { + "epoch": 12.34, + "grad_norm": 0.328125, + "learning_rate": 0.0003188598763279062, + "loss": 0.1771, + "step": 297870 + }, + { + "epoch": 12.34, + "grad_norm": 3.234375, + "learning_rate": 0.00031884945057771577, + "loss": 0.1884, + "step": 297880 + }, + { + "epoch": 12.34, + "grad_norm": 0.8046875, + "learning_rate": 0.0003188390246979581, + "loss": 0.1923, + "step": 297890 + }, + { + "epoch": 12.34, + "grad_norm": 1.25, + "learning_rate": 0.00031882859868865275, + "loss": 0.2155, + "step": 297900 + }, + { + "epoch": 12.34, + "grad_norm": 1.171875, + "learning_rate": 0.0003188181725498196, + "loss": 0.2088, + "step": 297910 + }, + { + "epoch": 12.34, + "grad_norm": 1.484375, + "learning_rate": 0.00031880774628147797, + "loss": 0.1371, + "step": 297920 + }, + { + "epoch": 12.34, + "grad_norm": 0.73828125, + "learning_rate": 0.0003187973198836475, + "loss": 0.1652, + "step": 297930 + }, + { + "epoch": 12.34, + "grad_norm": 0.5859375, + "learning_rate": 0.00031878689335634804, + "loss": 0.1618, + "step": 297940 + }, + { + "epoch": 12.34, + "grad_norm": 0.828125, + "learning_rate": 0.00031877646669959905, + "loss": 0.1802, + "step": 297950 + }, + { + "epoch": 12.34, + "grad_norm": 0.61328125, + "learning_rate": 0.0003187660399134201, + "loss": 0.1802, + "step": 297960 + }, + { + "epoch": 12.34, + "grad_norm": 0.9453125, + "learning_rate": 0.00031875561299783104, + "loss": 0.2019, + "step": 297970 + }, + { + "epoch": 12.34, + "grad_norm": 0.5390625, + "learning_rate": 0.0003187451859528512, + "loss": 0.1999, + "step": 297980 + }, + { + "epoch": 12.34, + "grad_norm": 0.734375, + "learning_rate": 0.0003187347587785004, + "loss": 0.2113, + "step": 297990 + }, + { + "epoch": 12.34, + "grad_norm": 1.453125, + "learning_rate": 0.00031872433147479823, + "loss": 0.2267, + "step": 298000 + }, + { + "epoch": 12.34, + "grad_norm": 0.8046875, + "learning_rate": 0.00031871390404176423, + "loss": 0.209, + "step": 298010 + }, + { + "epoch": 12.34, + "grad_norm": 0.55078125, + "learning_rate": 0.0003187034764794181, + "loss": 0.2126, + "step": 298020 + }, + { + "epoch": 12.34, + "grad_norm": 1.125, + "learning_rate": 0.0003186930487877794, + "loss": 0.2295, + "step": 298030 + }, + { + "epoch": 12.34, + "grad_norm": 1.21875, + "learning_rate": 0.0003186826209668679, + "loss": 0.1643, + "step": 298040 + }, + { + "epoch": 12.35, + "grad_norm": 0.41796875, + "learning_rate": 0.00031867219301670314, + "loss": 0.2296, + "step": 298050 + }, + { + "epoch": 12.35, + "grad_norm": 0.357421875, + "learning_rate": 0.0003186617649373046, + "loss": 0.2005, + "step": 298060 + }, + { + "epoch": 12.35, + "grad_norm": 1.28125, + "learning_rate": 0.0003186513367286921, + "loss": 0.1569, + "step": 298070 + }, + { + "epoch": 12.35, + "grad_norm": 0.5390625, + "learning_rate": 0.00031864090839088527, + "loss": 0.1498, + "step": 298080 + }, + { + "epoch": 12.35, + "grad_norm": 0.625, + "learning_rate": 0.00031863047992390355, + "loss": 0.2071, + "step": 298090 + }, + { + "epoch": 12.35, + "grad_norm": 0.73828125, + "learning_rate": 0.0003186200513277667, + "loss": 0.1916, + "step": 298100 + }, + { + "epoch": 12.35, + "grad_norm": 0.77734375, + "learning_rate": 0.0003186096226024944, + "loss": 0.1694, + "step": 298110 + }, + { + "epoch": 12.35, + "grad_norm": 0.703125, + "learning_rate": 0.0003185991937481061, + "loss": 0.1858, + "step": 298120 + }, + { + "epoch": 12.35, + "grad_norm": 1.4609375, + "learning_rate": 0.0003185887647646216, + "loss": 0.1696, + "step": 298130 + }, + { + "epoch": 12.35, + "grad_norm": 0.2431640625, + "learning_rate": 0.0003185783356520604, + "loss": 0.1975, + "step": 298140 + }, + { + "epoch": 12.35, + "grad_norm": 0.8671875, + "learning_rate": 0.0003185679064104422, + "loss": 0.224, + "step": 298150 + }, + { + "epoch": 12.35, + "grad_norm": 1.4375, + "learning_rate": 0.0003185574770397866, + "loss": 0.2169, + "step": 298160 + }, + { + "epoch": 12.35, + "grad_norm": 0.63671875, + "learning_rate": 0.00031854704754011326, + "loss": 0.217, + "step": 298170 + }, + { + "epoch": 12.35, + "grad_norm": 0.67578125, + "learning_rate": 0.0003185366179114418, + "loss": 0.147, + "step": 298180 + }, + { + "epoch": 12.35, + "grad_norm": 1.0625, + "learning_rate": 0.0003185261881537918, + "loss": 0.2082, + "step": 298190 + }, + { + "epoch": 12.35, + "grad_norm": 0.53125, + "learning_rate": 0.00031851575826718283, + "loss": 0.1977, + "step": 298200 + }, + { + "epoch": 12.35, + "grad_norm": 1.2734375, + "learning_rate": 0.0003185053282516347, + "loss": 0.24, + "step": 298210 + }, + { + "epoch": 12.35, + "grad_norm": 1.7890625, + "learning_rate": 0.00031849489810716687, + "loss": 0.2134, + "step": 298220 + }, + { + "epoch": 12.35, + "grad_norm": 1.078125, + "learning_rate": 0.00031848446783379916, + "loss": 0.1838, + "step": 298230 + }, + { + "epoch": 12.35, + "grad_norm": 1.2109375, + "learning_rate": 0.000318474037431551, + "loss": 0.1736, + "step": 298240 + }, + { + "epoch": 12.35, + "grad_norm": 0.83203125, + "learning_rate": 0.000318463606900442, + "loss": 0.2065, + "step": 298250 + }, + { + "epoch": 12.35, + "grad_norm": 1.1015625, + "learning_rate": 0.00031845317624049204, + "loss": 0.2094, + "step": 298260 + }, + { + "epoch": 12.35, + "grad_norm": 1.3984375, + "learning_rate": 0.00031844274545172053, + "loss": 0.261, + "step": 298270 + }, + { + "epoch": 12.35, + "grad_norm": 1.8828125, + "learning_rate": 0.0003184323145341471, + "loss": 0.1974, + "step": 298280 + }, + { + "epoch": 12.36, + "grad_norm": 0.5, + "learning_rate": 0.00031842188348779154, + "loss": 0.1443, + "step": 298290 + }, + { + "epoch": 12.36, + "grad_norm": 1.34375, + "learning_rate": 0.0003184114523126733, + "loss": 0.2247, + "step": 298300 + }, + { + "epoch": 12.36, + "grad_norm": 0.63671875, + "learning_rate": 0.00031840102100881226, + "loss": 0.1747, + "step": 298310 + }, + { + "epoch": 12.36, + "grad_norm": 0.37890625, + "learning_rate": 0.00031839058957622773, + "loss": 0.1613, + "step": 298320 + }, + { + "epoch": 12.36, + "grad_norm": 1.0078125, + "learning_rate": 0.0003183801580149395, + "loss": 0.2023, + "step": 298330 + }, + { + "epoch": 12.36, + "grad_norm": 0.99609375, + "learning_rate": 0.0003183697263249673, + "loss": 0.1846, + "step": 298340 + }, + { + "epoch": 12.36, + "grad_norm": 1.1171875, + "learning_rate": 0.0003183592945063306, + "loss": 0.2327, + "step": 298350 + }, + { + "epoch": 12.36, + "grad_norm": 0.59375, + "learning_rate": 0.00031834886255904907, + "loss": 0.2053, + "step": 298360 + }, + { + "epoch": 12.36, + "grad_norm": 0.890625, + "learning_rate": 0.0003183384304831424, + "loss": 0.1799, + "step": 298370 + }, + { + "epoch": 12.36, + "grad_norm": 0.75, + "learning_rate": 0.00031832799827863015, + "loss": 0.2037, + "step": 298380 + }, + { + "epoch": 12.36, + "grad_norm": 0.93359375, + "learning_rate": 0.00031831756594553196, + "loss": 0.1792, + "step": 298390 + }, + { + "epoch": 12.36, + "grad_norm": 1.078125, + "learning_rate": 0.00031830713348386755, + "loss": 0.2063, + "step": 298400 + }, + { + "epoch": 12.36, + "grad_norm": 0.66796875, + "learning_rate": 0.0003182967008936564, + "loss": 0.1487, + "step": 298410 + }, + { + "epoch": 12.36, + "grad_norm": 1.765625, + "learning_rate": 0.00031828626817491834, + "loss": 0.2153, + "step": 298420 + }, + { + "epoch": 12.36, + "grad_norm": 0.59765625, + "learning_rate": 0.0003182758353276729, + "loss": 0.1577, + "step": 298430 + }, + { + "epoch": 12.36, + "grad_norm": 0.92578125, + "learning_rate": 0.0003182654023519396, + "loss": 0.191, + "step": 298440 + }, + { + "epoch": 12.36, + "grad_norm": 0.466796875, + "learning_rate": 0.0003182549692477383, + "loss": 0.1916, + "step": 298450 + }, + { + "epoch": 12.36, + "grad_norm": 0.40234375, + "learning_rate": 0.00031824453601508846, + "loss": 0.1848, + "step": 298460 + }, + { + "epoch": 12.36, + "grad_norm": 0.67578125, + "learning_rate": 0.00031823410265400983, + "loss": 0.1913, + "step": 298470 + }, + { + "epoch": 12.36, + "grad_norm": 0.875, + "learning_rate": 0.0003182236691645219, + "loss": 0.2311, + "step": 298480 + }, + { + "epoch": 12.36, + "grad_norm": 0.77734375, + "learning_rate": 0.00031821323554664444, + "loss": 0.1707, + "step": 298490 + }, + { + "epoch": 12.36, + "grad_norm": 0.578125, + "learning_rate": 0.0003182028018003971, + "loss": 0.1967, + "step": 298500 + }, + { + "epoch": 12.36, + "grad_norm": 0.84765625, + "learning_rate": 0.00031819236792579933, + "loss": 0.1796, + "step": 298510 + }, + { + "epoch": 12.36, + "grad_norm": 0.8125, + "learning_rate": 0.0003181819339228709, + "loss": 0.2072, + "step": 298520 + }, + { + "epoch": 12.37, + "grad_norm": 0.7890625, + "learning_rate": 0.0003181714997916315, + "loss": 0.2235, + "step": 298530 + }, + { + "epoch": 12.37, + "grad_norm": 1.1875, + "learning_rate": 0.00031816106553210065, + "loss": 0.2175, + "step": 298540 + }, + { + "epoch": 12.37, + "grad_norm": 0.0, + "learning_rate": 0.0003181506311442981, + "loss": 0.1847, + "step": 298550 + }, + { + "epoch": 12.37, + "grad_norm": 0.73828125, + "learning_rate": 0.0003181401966282434, + "loss": 0.1991, + "step": 298560 + }, + { + "epoch": 12.37, + "grad_norm": 2.40625, + "learning_rate": 0.00031812976198395613, + "loss": 0.1957, + "step": 298570 + }, + { + "epoch": 12.37, + "grad_norm": 0.640625, + "learning_rate": 0.00031811932721145607, + "loss": 0.2211, + "step": 298580 + }, + { + "epoch": 12.37, + "grad_norm": 0.81640625, + "learning_rate": 0.0003181088923107628, + "loss": 0.2065, + "step": 298590 + }, + { + "epoch": 12.37, + "grad_norm": 0.87890625, + "learning_rate": 0.00031809845728189595, + "loss": 0.174, + "step": 298600 + }, + { + "epoch": 12.37, + "grad_norm": 1.171875, + "learning_rate": 0.00031808802212487513, + "loss": 0.155, + "step": 298610 + }, + { + "epoch": 12.37, + "grad_norm": 1.140625, + "learning_rate": 0.00031807758683972004, + "loss": 0.2166, + "step": 298620 + }, + { + "epoch": 12.37, + "grad_norm": 0.859375, + "learning_rate": 0.00031806715142645027, + "loss": 0.2079, + "step": 298630 + }, + { + "epoch": 12.37, + "grad_norm": 0.43359375, + "learning_rate": 0.00031805671588508547, + "loss": 0.165, + "step": 298640 + }, + { + "epoch": 12.37, + "grad_norm": 0.6875, + "learning_rate": 0.0003180462802156453, + "loss": 0.2093, + "step": 298650 + }, + { + "epoch": 12.37, + "grad_norm": 0.494140625, + "learning_rate": 0.0003180358444181493, + "loss": 0.1851, + "step": 298660 + }, + { + "epoch": 12.37, + "grad_norm": 0.453125, + "learning_rate": 0.0003180254084926173, + "loss": 0.235, + "step": 298670 + }, + { + "epoch": 12.37, + "grad_norm": 0.5625, + "learning_rate": 0.00031801497243906876, + "loss": 0.1801, + "step": 298680 + }, + { + "epoch": 12.37, + "grad_norm": 0.5234375, + "learning_rate": 0.00031800453625752335, + "loss": 0.1943, + "step": 298690 + }, + { + "epoch": 12.37, + "grad_norm": 0.78515625, + "learning_rate": 0.0003179940999480008, + "loss": 0.1743, + "step": 298700 + }, + { + "epoch": 12.37, + "grad_norm": 0.9609375, + "learning_rate": 0.00031798366351052065, + "loss": 0.2, + "step": 298710 + }, + { + "epoch": 12.37, + "grad_norm": 1.0, + "learning_rate": 0.0003179732269451027, + "loss": 0.1605, + "step": 298720 + }, + { + "epoch": 12.37, + "grad_norm": 1.3125, + "learning_rate": 0.0003179627902517664, + "loss": 0.1979, + "step": 298730 + }, + { + "epoch": 12.37, + "grad_norm": 0.73828125, + "learning_rate": 0.00031795235343053146, + "loss": 0.2076, + "step": 298740 + }, + { + "epoch": 12.37, + "grad_norm": 1.0546875, + "learning_rate": 0.0003179419164814176, + "loss": 0.161, + "step": 298750 + }, + { + "epoch": 12.37, + "grad_norm": 0.78515625, + "learning_rate": 0.0003179314794044443, + "loss": 0.2239, + "step": 298760 + }, + { + "epoch": 12.38, + "grad_norm": 0.68359375, + "learning_rate": 0.0003179210421996313, + "loss": 0.2024, + "step": 298770 + }, + { + "epoch": 12.38, + "grad_norm": 1.421875, + "learning_rate": 0.00031791060486699826, + "loss": 0.2006, + "step": 298780 + }, + { + "epoch": 12.38, + "grad_norm": 1.3046875, + "learning_rate": 0.0003179001674065648, + "loss": 0.2077, + "step": 298790 + }, + { + "epoch": 12.38, + "grad_norm": 0.86328125, + "learning_rate": 0.0003178897298183506, + "loss": 0.1445, + "step": 298800 + }, + { + "epoch": 12.38, + "grad_norm": 1.1953125, + "learning_rate": 0.00031787929210237514, + "loss": 0.2229, + "step": 298810 + }, + { + "epoch": 12.38, + "grad_norm": 0.76953125, + "learning_rate": 0.00031786885425865824, + "loss": 0.2109, + "step": 298820 + }, + { + "epoch": 12.38, + "grad_norm": 0.7421875, + "learning_rate": 0.0003178584162872195, + "loss": 0.1695, + "step": 298830 + }, + { + "epoch": 12.38, + "grad_norm": 0.359375, + "learning_rate": 0.0003178479781880786, + "loss": 0.1605, + "step": 298840 + }, + { + "epoch": 12.38, + "grad_norm": 1.125, + "learning_rate": 0.0003178375399612551, + "loss": 0.1591, + "step": 298850 + }, + { + "epoch": 12.38, + "grad_norm": 0.578125, + "learning_rate": 0.0003178271016067686, + "loss": 0.1854, + "step": 298860 + }, + { + "epoch": 12.38, + "grad_norm": 0.6484375, + "learning_rate": 0.00031781666312463887, + "loss": 0.1632, + "step": 298870 + }, + { + "epoch": 12.38, + "grad_norm": 1.0546875, + "learning_rate": 0.00031780622451488554, + "loss": 0.1888, + "step": 298880 + }, + { + "epoch": 12.38, + "grad_norm": 0.380859375, + "learning_rate": 0.00031779578577752817, + "loss": 0.1933, + "step": 298890 + }, + { + "epoch": 12.38, + "grad_norm": 0.66015625, + "learning_rate": 0.0003177853469125865, + "loss": 0.1763, + "step": 298900 + }, + { + "epoch": 12.38, + "grad_norm": 0.419921875, + "learning_rate": 0.00031777490792008, + "loss": 0.2189, + "step": 298910 + }, + { + "epoch": 12.38, + "grad_norm": 0.6015625, + "learning_rate": 0.0003177644688000286, + "loss": 0.2091, + "step": 298920 + }, + { + "epoch": 12.38, + "grad_norm": 1.0, + "learning_rate": 0.00031775402955245175, + "loss": 0.2084, + "step": 298930 + }, + { + "epoch": 12.38, + "grad_norm": 1.1640625, + "learning_rate": 0.0003177435901773691, + "loss": 0.2013, + "step": 298940 + }, + { + "epoch": 12.38, + "grad_norm": 0.46875, + "learning_rate": 0.0003177331506748003, + "loss": 0.1286, + "step": 298950 + }, + { + "epoch": 12.38, + "grad_norm": 1.109375, + "learning_rate": 0.00031772271104476516, + "loss": 0.2174, + "step": 298960 + }, + { + "epoch": 12.38, + "grad_norm": 0.8359375, + "learning_rate": 0.00031771227128728305, + "loss": 0.1855, + "step": 298970 + }, + { + "epoch": 12.38, + "grad_norm": 0.734375, + "learning_rate": 0.00031770183140237374, + "loss": 0.2194, + "step": 298980 + }, + { + "epoch": 12.38, + "grad_norm": 0.59375, + "learning_rate": 0.00031769139139005697, + "loss": 0.1842, + "step": 298990 + }, + { + "epoch": 12.38, + "grad_norm": 1.359375, + "learning_rate": 0.0003176809512503523, + "loss": 0.1779, + "step": 299000 + }, + { + "epoch": 12.38, + "grad_norm": 0.83984375, + "learning_rate": 0.00031767051098327944, + "loss": 0.1921, + "step": 299010 + }, + { + "epoch": 12.39, + "grad_norm": 0.65234375, + "learning_rate": 0.0003176600705888579, + "loss": 0.1846, + "step": 299020 + }, + { + "epoch": 12.39, + "grad_norm": 0.84375, + "learning_rate": 0.00031764963006710746, + "loss": 0.2252, + "step": 299030 + }, + { + "epoch": 12.39, + "grad_norm": 0.8828125, + "learning_rate": 0.0003176391894180477, + "loss": 0.1481, + "step": 299040 + }, + { + "epoch": 12.39, + "grad_norm": 0.55859375, + "learning_rate": 0.0003176287486416983, + "loss": 0.1865, + "step": 299050 + }, + { + "epoch": 12.39, + "grad_norm": 0.90625, + "learning_rate": 0.00031761830773807887, + "loss": 0.1855, + "step": 299060 + }, + { + "epoch": 12.39, + "grad_norm": 0.8984375, + "learning_rate": 0.0003176078667072091, + "loss": 0.1724, + "step": 299070 + }, + { + "epoch": 12.39, + "grad_norm": 0.78125, + "learning_rate": 0.0003175974255491086, + "loss": 0.1653, + "step": 299080 + }, + { + "epoch": 12.39, + "grad_norm": 0.84765625, + "learning_rate": 0.00031758698426379717, + "loss": 0.1538, + "step": 299090 + }, + { + "epoch": 12.39, + "grad_norm": 1.25, + "learning_rate": 0.0003175765428512942, + "loss": 0.1827, + "step": 299100 + }, + { + "epoch": 12.39, + "grad_norm": 0.9296875, + "learning_rate": 0.00031756610131161955, + "loss": 0.2149, + "step": 299110 + }, + { + "epoch": 12.39, + "grad_norm": 0.8671875, + "learning_rate": 0.00031755565964479275, + "loss": 0.1693, + "step": 299120 + }, + { + "epoch": 12.39, + "grad_norm": 0.921875, + "learning_rate": 0.0003175452178508335, + "loss": 0.1819, + "step": 299130 + }, + { + "epoch": 12.39, + "grad_norm": 2.28125, + "learning_rate": 0.0003175347759297615, + "loss": 0.2166, + "step": 299140 + }, + { + "epoch": 12.39, + "grad_norm": 0.8515625, + "learning_rate": 0.00031752433388159627, + "loss": 0.1616, + "step": 299150 + }, + { + "epoch": 12.39, + "grad_norm": 1.078125, + "learning_rate": 0.0003175138917063575, + "loss": 0.1551, + "step": 299160 + }, + { + "epoch": 12.39, + "grad_norm": 0.90234375, + "learning_rate": 0.0003175034494040651, + "loss": 0.2086, + "step": 299170 + }, + { + "epoch": 12.39, + "grad_norm": 1.34375, + "learning_rate": 0.00031749300697473824, + "loss": 0.2413, + "step": 299180 + }, + { + "epoch": 12.39, + "grad_norm": 0.703125, + "learning_rate": 0.00031748256441839696, + "loss": 0.1753, + "step": 299190 + }, + { + "epoch": 12.39, + "grad_norm": 0.92578125, + "learning_rate": 0.0003174721217350608, + "loss": 0.1505, + "step": 299200 + }, + { + "epoch": 12.39, + "grad_norm": 1.21875, + "learning_rate": 0.00031746167892474927, + "loss": 0.1672, + "step": 299210 + }, + { + "epoch": 12.39, + "grad_norm": 0.484375, + "learning_rate": 0.0003174512359874822, + "loss": 0.1668, + "step": 299220 + }, + { + "epoch": 12.39, + "grad_norm": 0.80859375, + "learning_rate": 0.0003174407929232792, + "loss": 0.1609, + "step": 299230 + }, + { + "epoch": 12.39, + "grad_norm": 0.443359375, + "learning_rate": 0.0003174303497321599, + "loss": 0.1322, + "step": 299240 + }, + { + "epoch": 12.39, + "grad_norm": 1.2734375, + "learning_rate": 0.00031741990641414404, + "loss": 0.1892, + "step": 299250 + }, + { + "epoch": 12.4, + "grad_norm": 0.93359375, + "learning_rate": 0.0003174094629692511, + "loss": 0.2299, + "step": 299260 + }, + { + "epoch": 12.4, + "grad_norm": 1.6328125, + "learning_rate": 0.00031739901939750086, + "loss": 0.209, + "step": 299270 + }, + { + "epoch": 12.4, + "grad_norm": 0.42578125, + "learning_rate": 0.000317388575698913, + "loss": 0.1807, + "step": 299280 + }, + { + "epoch": 12.4, + "grad_norm": 0.36328125, + "learning_rate": 0.00031737813187350706, + "loss": 0.1354, + "step": 299290 + }, + { + "epoch": 12.4, + "grad_norm": 1.0234375, + "learning_rate": 0.00031736768792130275, + "loss": 0.1905, + "step": 299300 + }, + { + "epoch": 12.4, + "grad_norm": 1.734375, + "learning_rate": 0.00031735724384231975, + "loss": 0.1947, + "step": 299310 + }, + { + "epoch": 12.4, + "grad_norm": 0.5546875, + "learning_rate": 0.00031734679963657764, + "loss": 0.1815, + "step": 299320 + }, + { + "epoch": 12.4, + "grad_norm": 1.0625, + "learning_rate": 0.0003173363553040962, + "loss": 0.2372, + "step": 299330 + }, + { + "epoch": 12.4, + "grad_norm": 1.6875, + "learning_rate": 0.0003173259108448949, + "loss": 0.2044, + "step": 299340 + }, + { + "epoch": 12.4, + "grad_norm": 0.345703125, + "learning_rate": 0.00031731546625899363, + "loss": 0.1748, + "step": 299350 + }, + { + "epoch": 12.4, + "grad_norm": 1.4296875, + "learning_rate": 0.00031730502154641194, + "loss": 0.2018, + "step": 299360 + }, + { + "epoch": 12.4, + "grad_norm": 0.74609375, + "learning_rate": 0.0003172945767071693, + "loss": 0.2132, + "step": 299370 + }, + { + "epoch": 12.4, + "grad_norm": 0.5546875, + "learning_rate": 0.0003172841317412857, + "loss": 0.2433, + "step": 299380 + }, + { + "epoch": 12.4, + "grad_norm": 0.69921875, + "learning_rate": 0.0003172736866487806, + "loss": 0.1823, + "step": 299390 + }, + { + "epoch": 12.4, + "grad_norm": 0.72265625, + "learning_rate": 0.0003172632414296736, + "loss": 0.1754, + "step": 299400 + }, + { + "epoch": 12.4, + "grad_norm": 1.015625, + "learning_rate": 0.00031725279608398456, + "loss": 0.2391, + "step": 299410 + }, + { + "epoch": 12.4, + "grad_norm": 0.671875, + "learning_rate": 0.0003172423506117329, + "loss": 0.1987, + "step": 299420 + }, + { + "epoch": 12.4, + "grad_norm": 0.81640625, + "learning_rate": 0.00031723190501293846, + "loss": 0.1853, + "step": 299430 + }, + { + "epoch": 12.4, + "grad_norm": 0.86328125, + "learning_rate": 0.0003172214592876209, + "loss": 0.1711, + "step": 299440 + }, + { + "epoch": 12.4, + "grad_norm": 1.078125, + "learning_rate": 0.00031721101343579964, + "loss": 0.1884, + "step": 299450 + }, + { + "epoch": 12.4, + "grad_norm": 0.193359375, + "learning_rate": 0.0003172005674574947, + "loss": 0.1777, + "step": 299460 + }, + { + "epoch": 12.4, + "grad_norm": 0.294921875, + "learning_rate": 0.0003171901213527254, + "loss": 0.2178, + "step": 299470 + }, + { + "epoch": 12.4, + "grad_norm": 0.7421875, + "learning_rate": 0.00031717967512151165, + "loss": 0.1776, + "step": 299480 + }, + { + "epoch": 12.4, + "grad_norm": 0.65234375, + "learning_rate": 0.000317169228763873, + "loss": 0.2059, + "step": 299490 + }, + { + "epoch": 12.41, + "grad_norm": 1.1640625, + "learning_rate": 0.00031715878227982897, + "loss": 0.2225, + "step": 299500 + }, + { + "epoch": 12.41, + "grad_norm": 0.4140625, + "learning_rate": 0.00031714833566939956, + "loss": 0.1495, + "step": 299510 + }, + { + "epoch": 12.41, + "grad_norm": 0.4296875, + "learning_rate": 0.0003171378889326042, + "loss": 0.1875, + "step": 299520 + }, + { + "epoch": 12.41, + "grad_norm": 0.81640625, + "learning_rate": 0.0003171274420694624, + "loss": 0.197, + "step": 299530 + }, + { + "epoch": 12.41, + "grad_norm": 1.859375, + "learning_rate": 0.0003171169950799942, + "loss": 0.2602, + "step": 299540 + }, + { + "epoch": 12.41, + "grad_norm": 0.52734375, + "learning_rate": 0.000317106547964219, + "loss": 0.1884, + "step": 299550 + }, + { + "epoch": 12.41, + "grad_norm": 0.859375, + "learning_rate": 0.0003170961007221565, + "loss": 0.1431, + "step": 299560 + }, + { + "epoch": 12.41, + "grad_norm": 0.9765625, + "learning_rate": 0.00031708565335382644, + "loss": 0.2523, + "step": 299570 + }, + { + "epoch": 12.41, + "grad_norm": 1.109375, + "learning_rate": 0.0003170752058592483, + "loss": 0.2011, + "step": 299580 + }, + { + "epoch": 12.41, + "grad_norm": 1.28125, + "learning_rate": 0.00031706475823844207, + "loss": 0.2241, + "step": 299590 + }, + { + "epoch": 12.41, + "grad_norm": 0.4921875, + "learning_rate": 0.0003170543104914271, + "loss": 0.1846, + "step": 299600 + }, + { + "epoch": 12.41, + "grad_norm": 0.48828125, + "learning_rate": 0.0003170438626182231, + "loss": 0.2029, + "step": 299610 + }, + { + "epoch": 12.41, + "grad_norm": 0.53515625, + "learning_rate": 0.00031703341461884987, + "loss": 0.2021, + "step": 299620 + }, + { + "epoch": 12.41, + "grad_norm": 1.0, + "learning_rate": 0.00031702296649332697, + "loss": 0.2112, + "step": 299630 + }, + { + "epoch": 12.41, + "grad_norm": 1.5078125, + "learning_rate": 0.0003170125182416741, + "loss": 0.233, + "step": 299640 + }, + { + "epoch": 12.41, + "grad_norm": 0.9375, + "learning_rate": 0.00031700206986391087, + "loss": 0.1719, + "step": 299650 + }, + { + "epoch": 12.41, + "grad_norm": 0.515625, + "learning_rate": 0.00031699162136005697, + "loss": 0.1699, + "step": 299660 + }, + { + "epoch": 12.41, + "grad_norm": 0.77734375, + "learning_rate": 0.00031698117273013215, + "loss": 0.1445, + "step": 299670 + }, + { + "epoch": 12.41, + "grad_norm": 0.8671875, + "learning_rate": 0.00031697072397415593, + "loss": 0.1894, + "step": 299680 + }, + { + "epoch": 12.41, + "grad_norm": 0.53125, + "learning_rate": 0.00031696027509214804, + "loss": 0.2417, + "step": 299690 + }, + { + "epoch": 12.41, + "grad_norm": 0.890625, + "learning_rate": 0.0003169498260841282, + "loss": 0.1666, + "step": 299700 + }, + { + "epoch": 12.41, + "grad_norm": 2.6875, + "learning_rate": 0.000316939376950116, + "loss": 0.1962, + "step": 299710 + }, + { + "epoch": 12.41, + "grad_norm": 1.1015625, + "learning_rate": 0.00031692892769013105, + "loss": 0.2017, + "step": 299720 + }, + { + "epoch": 12.41, + "grad_norm": 0.82421875, + "learning_rate": 0.0003169184783041932, + "loss": 0.182, + "step": 299730 + }, + { + "epoch": 12.42, + "grad_norm": 1.4453125, + "learning_rate": 0.00031690802879232185, + "loss": 0.1608, + "step": 299740 + }, + { + "epoch": 12.42, + "grad_norm": 1.28125, + "learning_rate": 0.00031689757915453695, + "loss": 0.164, + "step": 299750 + }, + { + "epoch": 12.42, + "grad_norm": 0.51953125, + "learning_rate": 0.00031688712939085796, + "loss": 0.2463, + "step": 299760 + }, + { + "epoch": 12.42, + "grad_norm": 0.61328125, + "learning_rate": 0.0003168766795013046, + "loss": 0.205, + "step": 299770 + }, + { + "epoch": 12.42, + "grad_norm": 0.54296875, + "learning_rate": 0.00031686622948589666, + "loss": 0.1939, + "step": 299780 + }, + { + "epoch": 12.42, + "grad_norm": 0.0, + "learning_rate": 0.0003168557793446536, + "loss": 0.1993, + "step": 299790 + }, + { + "epoch": 12.42, + "grad_norm": 0.8515625, + "learning_rate": 0.0003168453290775952, + "loss": 0.2211, + "step": 299800 + }, + { + "epoch": 12.42, + "grad_norm": 0.75, + "learning_rate": 0.0003168348786847411, + "loss": 0.2685, + "step": 299810 + }, + { + "epoch": 12.42, + "grad_norm": 0.70703125, + "learning_rate": 0.000316824428166111, + "loss": 0.1826, + "step": 299820 + }, + { + "epoch": 12.42, + "grad_norm": 0.9296875, + "learning_rate": 0.0003168139775217246, + "loss": 0.1478, + "step": 299830 + }, + { + "epoch": 12.42, + "grad_norm": 0.7890625, + "learning_rate": 0.0003168035267516014, + "loss": 0.2061, + "step": 299840 + }, + { + "epoch": 12.42, + "grad_norm": 0.388671875, + "learning_rate": 0.00031679307585576123, + "loss": 0.2057, + "step": 299850 + }, + { + "epoch": 12.42, + "grad_norm": 0.44140625, + "learning_rate": 0.0003167826248342237, + "loss": 0.1681, + "step": 299860 + }, + { + "epoch": 12.42, + "grad_norm": 0.55078125, + "learning_rate": 0.0003167721736870085, + "loss": 0.1459, + "step": 299870 + }, + { + "epoch": 12.42, + "grad_norm": 1.1171875, + "learning_rate": 0.0003167617224141353, + "loss": 0.1813, + "step": 299880 + }, + { + "epoch": 12.42, + "grad_norm": 0.63671875, + "learning_rate": 0.0003167512710156236, + "loss": 0.1967, + "step": 299890 + }, + { + "epoch": 12.42, + "grad_norm": 0.62890625, + "learning_rate": 0.00031674081949149336, + "loss": 0.2142, + "step": 299900 + }, + { + "epoch": 12.42, + "grad_norm": 0.98828125, + "learning_rate": 0.00031673036784176414, + "loss": 0.2242, + "step": 299910 + }, + { + "epoch": 12.42, + "grad_norm": 0.71875, + "learning_rate": 0.00031671991606645547, + "loss": 0.1796, + "step": 299920 + }, + { + "epoch": 12.42, + "grad_norm": 0.416015625, + "learning_rate": 0.0003167094641655871, + "loss": 0.2154, + "step": 299930 + }, + { + "epoch": 12.42, + "grad_norm": 0.58984375, + "learning_rate": 0.00031669901213917886, + "loss": 0.204, + "step": 299940 + }, + { + "epoch": 12.42, + "grad_norm": 1.2734375, + "learning_rate": 0.00031668855998725023, + "loss": 0.1972, + "step": 299950 + }, + { + "epoch": 12.42, + "grad_norm": 0.271484375, + "learning_rate": 0.00031667810770982086, + "loss": 0.1709, + "step": 299960 + }, + { + "epoch": 12.42, + "grad_norm": 0.37890625, + "learning_rate": 0.0003166676553069105, + "loss": 0.1701, + "step": 299970 + }, + { + "epoch": 12.43, + "grad_norm": 1.109375, + "learning_rate": 0.00031665720277853883, + "loss": 0.2606, + "step": 299980 + }, + { + "epoch": 12.43, + "grad_norm": 1.6484375, + "learning_rate": 0.00031664675012472554, + "loss": 0.1747, + "step": 299990 + }, + { + "epoch": 12.43, + "grad_norm": 0.49609375, + "learning_rate": 0.00031663629734549027, + "loss": 0.1847, + "step": 300000 + }, + { + "epoch": 12.43, + "grad_norm": 0.5390625, + "learning_rate": 0.00031662584444085265, + "loss": 0.2369, + "step": 300010 + }, + { + "epoch": 12.43, + "grad_norm": 1.265625, + "learning_rate": 0.0003166153914108324, + "loss": 0.2065, + "step": 300020 + }, + { + "epoch": 12.43, + "grad_norm": 0.7734375, + "learning_rate": 0.00031660493825544914, + "loss": 0.2021, + "step": 300030 + }, + { + "epoch": 12.43, + "grad_norm": 1.2890625, + "learning_rate": 0.00031659448497472266, + "loss": 0.17, + "step": 300040 + }, + { + "epoch": 12.43, + "grad_norm": 1.4921875, + "learning_rate": 0.00031658403156867244, + "loss": 0.2024, + "step": 300050 + }, + { + "epoch": 12.43, + "grad_norm": 1.2109375, + "learning_rate": 0.0003165735780373183, + "loss": 0.1968, + "step": 300060 + }, + { + "epoch": 12.43, + "grad_norm": 1.3515625, + "learning_rate": 0.00031656312438068, + "loss": 0.2044, + "step": 300070 + }, + { + "epoch": 12.43, + "grad_norm": 0.416015625, + "learning_rate": 0.000316552670598777, + "loss": 0.2086, + "step": 300080 + }, + { + "epoch": 12.43, + "grad_norm": 0.53125, + "learning_rate": 0.00031654221669162895, + "loss": 0.1649, + "step": 300090 + }, + { + "epoch": 12.43, + "grad_norm": 1.375, + "learning_rate": 0.0003165317626592558, + "loss": 0.2184, + "step": 300100 + }, + { + "epoch": 12.43, + "grad_norm": 0.43359375, + "learning_rate": 0.000316521308501677, + "loss": 0.1639, + "step": 300110 + }, + { + "epoch": 12.43, + "grad_norm": 1.4296875, + "learning_rate": 0.00031651085421891223, + "loss": 0.2124, + "step": 300120 + }, + { + "epoch": 12.43, + "grad_norm": 0.59765625, + "learning_rate": 0.0003165003998109813, + "loss": 0.1593, + "step": 300130 + }, + { + "epoch": 12.43, + "grad_norm": 0.9609375, + "learning_rate": 0.0003164899452779037, + "loss": 0.1785, + "step": 300140 + }, + { + "epoch": 12.43, + "grad_norm": 1.15625, + "learning_rate": 0.00031647949061969926, + "loss": 0.2024, + "step": 300150 + }, + { + "epoch": 12.43, + "grad_norm": 1.3359375, + "learning_rate": 0.0003164690358363876, + "loss": 0.1242, + "step": 300160 + }, + { + "epoch": 12.43, + "grad_norm": 0.97265625, + "learning_rate": 0.0003164585809279884, + "loss": 0.1843, + "step": 300170 + }, + { + "epoch": 12.43, + "grad_norm": 0.546875, + "learning_rate": 0.0003164481258945213, + "loss": 0.1831, + "step": 300180 + }, + { + "epoch": 12.43, + "grad_norm": 1.2109375, + "learning_rate": 0.00031643767073600605, + "loss": 0.1689, + "step": 300190 + }, + { + "epoch": 12.43, + "grad_norm": 1.625, + "learning_rate": 0.0003164272154524622, + "loss": 0.1627, + "step": 300200 + }, + { + "epoch": 12.43, + "grad_norm": 0.53125, + "learning_rate": 0.00031641676004390956, + "loss": 0.1754, + "step": 300210 + }, + { + "epoch": 12.44, + "grad_norm": 0.6015625, + "learning_rate": 0.0003164063045103678, + "loss": 0.1595, + "step": 300220 + }, + { + "epoch": 12.44, + "grad_norm": 1.7109375, + "learning_rate": 0.00031639584885185645, + "loss": 0.1349, + "step": 300230 + }, + { + "epoch": 12.44, + "grad_norm": 0.77734375, + "learning_rate": 0.0003163853930683953, + "loss": 0.2197, + "step": 300240 + }, + { + "epoch": 12.44, + "grad_norm": 1.0078125, + "learning_rate": 0.000316374937160004, + "loss": 0.1922, + "step": 300250 + }, + { + "epoch": 12.44, + "grad_norm": 0.98046875, + "learning_rate": 0.0003163644811267023, + "loss": 0.2287, + "step": 300260 + }, + { + "epoch": 12.44, + "grad_norm": 1.265625, + "learning_rate": 0.00031635402496850987, + "loss": 0.1936, + "step": 300270 + }, + { + "epoch": 12.44, + "grad_norm": 1.359375, + "learning_rate": 0.0003163435686854462, + "loss": 0.1795, + "step": 300280 + }, + { + "epoch": 12.44, + "grad_norm": 0.8203125, + "learning_rate": 0.00031633311227753115, + "loss": 0.1881, + "step": 300290 + }, + { + "epoch": 12.44, + "grad_norm": 0.703125, + "learning_rate": 0.0003163226557447842, + "loss": 0.1989, + "step": 300300 + }, + { + "epoch": 12.44, + "grad_norm": 0.9453125, + "learning_rate": 0.00031631219908722535, + "loss": 0.2591, + "step": 300310 + }, + { + "epoch": 12.44, + "grad_norm": 0.5234375, + "learning_rate": 0.00031630174230487415, + "loss": 0.1642, + "step": 300320 + }, + { + "epoch": 12.44, + "grad_norm": 2.296875, + "learning_rate": 0.0003162912853977501, + "loss": 0.1922, + "step": 300330 + }, + { + "epoch": 12.44, + "grad_norm": 1.0625, + "learning_rate": 0.000316280828365873, + "loss": 0.1958, + "step": 300340 + }, + { + "epoch": 12.44, + "grad_norm": 1.1875, + "learning_rate": 0.00031627037120926266, + "loss": 0.2042, + "step": 300350 + }, + { + "epoch": 12.44, + "grad_norm": 0.984375, + "learning_rate": 0.0003162599139279386, + "loss": 0.1824, + "step": 300360 + }, + { + "epoch": 12.44, + "grad_norm": 1.3828125, + "learning_rate": 0.0003162494565219204, + "loss": 0.2342, + "step": 300370 + }, + { + "epoch": 12.44, + "grad_norm": 0.7265625, + "learning_rate": 0.00031623899899122806, + "loss": 0.1723, + "step": 300380 + }, + { + "epoch": 12.44, + "grad_norm": 0.408203125, + "learning_rate": 0.00031622854133588096, + "loss": 0.2001, + "step": 300390 + }, + { + "epoch": 12.44, + "grad_norm": 0.478515625, + "learning_rate": 0.00031621808355589896, + "loss": 0.2073, + "step": 300400 + }, + { + "epoch": 12.44, + "grad_norm": 1.109375, + "learning_rate": 0.0003162076256513017, + "loss": 0.1807, + "step": 300410 + }, + { + "epoch": 12.44, + "grad_norm": 0.703125, + "learning_rate": 0.0003161971676221087, + "loss": 0.2065, + "step": 300420 + }, + { + "epoch": 12.44, + "grad_norm": 2.84375, + "learning_rate": 0.00031618670946833996, + "loss": 0.2032, + "step": 300430 + }, + { + "epoch": 12.44, + "grad_norm": 0.5859375, + "learning_rate": 0.0003161762511900148, + "loss": 0.2098, + "step": 300440 + }, + { + "epoch": 12.44, + "grad_norm": 1.6875, + "learning_rate": 0.00031616579278715323, + "loss": 0.2242, + "step": 300450 + }, + { + "epoch": 12.45, + "grad_norm": 0.7421875, + "learning_rate": 0.00031615533425977474, + "loss": 0.1642, + "step": 300460 + }, + { + "epoch": 12.45, + "grad_norm": 0.6328125, + "learning_rate": 0.000316144875607899, + "loss": 0.2193, + "step": 300470 + }, + { + "epoch": 12.45, + "grad_norm": 1.2421875, + "learning_rate": 0.0003161344168315458, + "loss": 0.153, + "step": 300480 + }, + { + "epoch": 12.45, + "grad_norm": 0.59375, + "learning_rate": 0.00031612395793073476, + "loss": 0.1979, + "step": 300490 + }, + { + "epoch": 12.45, + "grad_norm": 0.984375, + "learning_rate": 0.0003161134989054856, + "loss": 0.2122, + "step": 300500 + }, + { + "epoch": 12.45, + "grad_norm": 0.55078125, + "learning_rate": 0.0003161030397558179, + "loss": 0.191, + "step": 300510 + }, + { + "epoch": 12.45, + "grad_norm": 0.625, + "learning_rate": 0.0003160925804817515, + "loss": 0.1282, + "step": 300520 + }, + { + "epoch": 12.45, + "grad_norm": 0.447265625, + "learning_rate": 0.00031608212108330595, + "loss": 0.2359, + "step": 300530 + }, + { + "epoch": 12.45, + "grad_norm": 0.0, + "learning_rate": 0.000316071661560501, + "loss": 0.1904, + "step": 300540 + }, + { + "epoch": 12.45, + "grad_norm": 0.69921875, + "learning_rate": 0.0003160612019133563, + "loss": 0.2253, + "step": 300550 + }, + { + "epoch": 12.45, + "grad_norm": 0.69140625, + "learning_rate": 0.0003160507421418916, + "loss": 0.1668, + "step": 300560 + }, + { + "epoch": 12.45, + "grad_norm": 0.515625, + "learning_rate": 0.0003160402822461265, + "loss": 0.1948, + "step": 300570 + }, + { + "epoch": 12.45, + "grad_norm": 0.53125, + "learning_rate": 0.00031602982222608074, + "loss": 0.2025, + "step": 300580 + }, + { + "epoch": 12.45, + "grad_norm": 0.98046875, + "learning_rate": 0.00031601936208177397, + "loss": 0.2377, + "step": 300590 + }, + { + "epoch": 12.45, + "grad_norm": 1.5625, + "learning_rate": 0.0003160089018132258, + "loss": 0.2078, + "step": 300600 + }, + { + "epoch": 12.45, + "grad_norm": 0.5234375, + "learning_rate": 0.0003159984414204562, + "loss": 0.1918, + "step": 300610 + }, + { + "epoch": 12.45, + "grad_norm": 1.1875, + "learning_rate": 0.0003159879809034845, + "loss": 0.1472, + "step": 300620 + }, + { + "epoch": 12.45, + "grad_norm": 0.921875, + "learning_rate": 0.0003159775202623306, + "loss": 0.2028, + "step": 300630 + }, + { + "epoch": 12.45, + "grad_norm": 0.77734375, + "learning_rate": 0.0003159670594970142, + "loss": 0.1804, + "step": 300640 + }, + { + "epoch": 12.45, + "grad_norm": 0.9609375, + "learning_rate": 0.00031595659860755474, + "loss": 0.1996, + "step": 300650 + }, + { + "epoch": 12.45, + "grad_norm": 1.3515625, + "learning_rate": 0.0003159461375939723, + "loss": 0.2432, + "step": 300660 + }, + { + "epoch": 12.45, + "grad_norm": 0.51953125, + "learning_rate": 0.00031593567645628616, + "loss": 0.1689, + "step": 300670 + }, + { + "epoch": 12.45, + "grad_norm": 1.1171875, + "learning_rate": 0.00031592521519451634, + "loss": 0.1508, + "step": 300680 + }, + { + "epoch": 12.45, + "grad_norm": 0.6953125, + "learning_rate": 0.00031591475380868234, + "loss": 0.2232, + "step": 300690 + }, + { + "epoch": 12.45, + "grad_norm": 0.63671875, + "learning_rate": 0.00031590429229880386, + "loss": 0.2227, + "step": 300700 + }, + { + "epoch": 12.46, + "grad_norm": 0.703125, + "learning_rate": 0.0003158938306649006, + "loss": 0.1991, + "step": 300710 + }, + { + "epoch": 12.46, + "grad_norm": 1.0234375, + "learning_rate": 0.0003158833689069924, + "loss": 0.2091, + "step": 300720 + }, + { + "epoch": 12.46, + "grad_norm": 0.56640625, + "learning_rate": 0.0003158729070250986, + "loss": 0.1371, + "step": 300730 + }, + { + "epoch": 12.46, + "grad_norm": 0.37890625, + "learning_rate": 0.00031586244501923927, + "loss": 0.2104, + "step": 300740 + }, + { + "epoch": 12.46, + "grad_norm": 0.455078125, + "learning_rate": 0.0003158519828894339, + "loss": 0.2437, + "step": 300750 + }, + { + "epoch": 12.46, + "grad_norm": 0.0, + "learning_rate": 0.00031584152063570217, + "loss": 0.1602, + "step": 300760 + }, + { + "epoch": 12.46, + "grad_norm": 1.125, + "learning_rate": 0.0003158310582580639, + "loss": 0.2133, + "step": 300770 + }, + { + "epoch": 12.46, + "grad_norm": 0.76953125, + "learning_rate": 0.0003158205957565386, + "loss": 0.2322, + "step": 300780 + }, + { + "epoch": 12.46, + "grad_norm": 0.875, + "learning_rate": 0.00031581013313114605, + "loss": 0.2749, + "step": 300790 + }, + { + "epoch": 12.46, + "grad_norm": 0.640625, + "learning_rate": 0.000315799670381906, + "loss": 0.2145, + "step": 300800 + }, + { + "epoch": 12.46, + "grad_norm": 0.6953125, + "learning_rate": 0.00031578920750883793, + "loss": 0.1974, + "step": 300810 + }, + { + "epoch": 12.46, + "grad_norm": 0.3046875, + "learning_rate": 0.0003157787445119618, + "loss": 0.1837, + "step": 300820 + }, + { + "epoch": 12.46, + "grad_norm": 1.5703125, + "learning_rate": 0.00031576828139129716, + "loss": 0.1729, + "step": 300830 + }, + { + "epoch": 12.46, + "grad_norm": 1.46875, + "learning_rate": 0.0003157578181468637, + "loss": 0.211, + "step": 300840 + }, + { + "epoch": 12.46, + "grad_norm": 1.4609375, + "learning_rate": 0.00031574735477868116, + "loss": 0.1764, + "step": 300850 + }, + { + "epoch": 12.46, + "grad_norm": 0.87890625, + "learning_rate": 0.0003157368912867692, + "loss": 0.1854, + "step": 300860 + }, + { + "epoch": 12.46, + "grad_norm": 0.9609375, + "learning_rate": 0.00031572642767114747, + "loss": 0.2435, + "step": 300870 + }, + { + "epoch": 12.46, + "grad_norm": 1.359375, + "learning_rate": 0.0003157159639318357, + "loss": 0.2422, + "step": 300880 + }, + { + "epoch": 12.46, + "grad_norm": 1.34375, + "learning_rate": 0.00031570550006885357, + "loss": 0.1741, + "step": 300890 + }, + { + "epoch": 12.46, + "grad_norm": 0.953125, + "learning_rate": 0.0003156950360822208, + "loss": 0.2065, + "step": 300900 + }, + { + "epoch": 12.46, + "grad_norm": 0.62109375, + "learning_rate": 0.0003156845719719571, + "loss": 0.2567, + "step": 300910 + }, + { + "epoch": 12.46, + "grad_norm": 0.6953125, + "learning_rate": 0.0003156741077380821, + "loss": 0.168, + "step": 300920 + }, + { + "epoch": 12.46, + "grad_norm": 0.52734375, + "learning_rate": 0.00031566364338061554, + "loss": 0.2041, + "step": 300930 + }, + { + "epoch": 12.46, + "grad_norm": 0.423828125, + "learning_rate": 0.0003156531788995771, + "loss": 0.1793, + "step": 300940 + }, + { + "epoch": 12.47, + "grad_norm": 0.88671875, + "learning_rate": 0.0003156427142949864, + "loss": 0.1519, + "step": 300950 + }, + { + "epoch": 12.47, + "grad_norm": 0.412109375, + "learning_rate": 0.0003156322495668633, + "loss": 0.1797, + "step": 300960 + }, + { + "epoch": 12.47, + "grad_norm": 0.70703125, + "learning_rate": 0.0003156217847152273, + "loss": 0.1697, + "step": 300970 + }, + { + "epoch": 12.47, + "grad_norm": 0.68359375, + "learning_rate": 0.00031561131974009826, + "loss": 0.1989, + "step": 300980 + }, + { + "epoch": 12.47, + "grad_norm": 1.15625, + "learning_rate": 0.0003156008546414958, + "loss": 0.1891, + "step": 300990 + }, + { + "epoch": 12.47, + "grad_norm": 1.03125, + "learning_rate": 0.0003155903894194395, + "loss": 0.2078, + "step": 301000 + }, + { + "epoch": 12.47, + "grad_norm": 0.74609375, + "learning_rate": 0.0003155799240739493, + "loss": 0.1482, + "step": 301010 + }, + { + "epoch": 12.47, + "grad_norm": 1.078125, + "learning_rate": 0.0003155694586050447, + "loss": 0.1857, + "step": 301020 + }, + { + "epoch": 12.47, + "grad_norm": 0.75390625, + "learning_rate": 0.0003155589930127455, + "loss": 0.2068, + "step": 301030 + }, + { + "epoch": 12.47, + "grad_norm": 1.6953125, + "learning_rate": 0.00031554852729707137, + "loss": 0.1833, + "step": 301040 + }, + { + "epoch": 12.47, + "grad_norm": 0.828125, + "learning_rate": 0.0003155380614580419, + "loss": 0.2113, + "step": 301050 + }, + { + "epoch": 12.47, + "grad_norm": 0.0, + "learning_rate": 0.000315527595495677, + "loss": 0.2075, + "step": 301060 + }, + { + "epoch": 12.47, + "grad_norm": 0.68359375, + "learning_rate": 0.00031551712940999614, + "loss": 0.2437, + "step": 301070 + }, + { + "epoch": 12.47, + "grad_norm": 0.8203125, + "learning_rate": 0.0003155066632010192, + "loss": 0.2157, + "step": 301080 + }, + { + "epoch": 12.47, + "grad_norm": 0.453125, + "learning_rate": 0.0003154961968687657, + "loss": 0.2028, + "step": 301090 + }, + { + "epoch": 12.47, + "grad_norm": 1.03125, + "learning_rate": 0.0003154857304132555, + "loss": 0.2228, + "step": 301100 + }, + { + "epoch": 12.47, + "grad_norm": 1.0546875, + "learning_rate": 0.0003154752638345082, + "loss": 0.1971, + "step": 301110 + }, + { + "epoch": 12.47, + "grad_norm": 0.62109375, + "learning_rate": 0.00031546479713254357, + "loss": 0.179, + "step": 301120 + }, + { + "epoch": 12.47, + "grad_norm": 0.65234375, + "learning_rate": 0.0003154543303073811, + "loss": 0.1931, + "step": 301130 + }, + { + "epoch": 12.47, + "grad_norm": 1.140625, + "learning_rate": 0.00031544386335904083, + "loss": 0.1818, + "step": 301140 + }, + { + "epoch": 12.47, + "grad_norm": 1.78125, + "learning_rate": 0.0003154333962875423, + "loss": 0.156, + "step": 301150 + }, + { + "epoch": 12.47, + "grad_norm": 0.67578125, + "learning_rate": 0.00031542292909290503, + "loss": 0.2229, + "step": 301160 + }, + { + "epoch": 12.47, + "grad_norm": 1.1875, + "learning_rate": 0.000315412461775149, + "loss": 0.219, + "step": 301170 + }, + { + "epoch": 12.47, + "grad_norm": 1.296875, + "learning_rate": 0.0003154019943342937, + "loss": 0.1976, + "step": 301180 + }, + { + "epoch": 12.48, + "grad_norm": 0.7890625, + "learning_rate": 0.00031539152677035893, + "loss": 0.2081, + "step": 301190 + }, + { + "epoch": 12.48, + "grad_norm": 0.68359375, + "learning_rate": 0.00031538105908336434, + "loss": 0.1764, + "step": 301200 + }, + { + "epoch": 12.48, + "grad_norm": 0.55859375, + "learning_rate": 0.00031537059127332965, + "loss": 0.175, + "step": 301210 + }, + { + "epoch": 12.48, + "grad_norm": 0.96484375, + "learning_rate": 0.00031536012334027467, + "loss": 0.1685, + "step": 301220 + }, + { + "epoch": 12.48, + "grad_norm": 0.95703125, + "learning_rate": 0.00031534965528421896, + "loss": 0.1628, + "step": 301230 + }, + { + "epoch": 12.48, + "grad_norm": 1.1171875, + "learning_rate": 0.0003153391871051822, + "loss": 0.1907, + "step": 301240 + }, + { + "epoch": 12.48, + "grad_norm": 0.9765625, + "learning_rate": 0.0003153287188031842, + "loss": 0.1803, + "step": 301250 + }, + { + "epoch": 12.48, + "grad_norm": 2.671875, + "learning_rate": 0.00031531825037824454, + "loss": 0.2517, + "step": 301260 + }, + { + "epoch": 12.48, + "grad_norm": 0.93359375, + "learning_rate": 0.00031530778183038306, + "loss": 0.1319, + "step": 301270 + }, + { + "epoch": 12.48, + "grad_norm": 1.0390625, + "learning_rate": 0.0003152973131596193, + "loss": 0.2232, + "step": 301280 + }, + { + "epoch": 12.48, + "grad_norm": 1.21875, + "learning_rate": 0.0003152868443659731, + "loss": 0.1803, + "step": 301290 + }, + { + "epoch": 12.48, + "grad_norm": 0.59375, + "learning_rate": 0.0003152763754494642, + "loss": 0.1767, + "step": 301300 + }, + { + "epoch": 12.48, + "grad_norm": 0.7734375, + "learning_rate": 0.0003152659064101121, + "loss": 0.2071, + "step": 301310 + }, + { + "epoch": 12.48, + "grad_norm": 0.7578125, + "learning_rate": 0.0003152554372479366, + "loss": 0.1849, + "step": 301320 + }, + { + "epoch": 12.48, + "grad_norm": 0.357421875, + "learning_rate": 0.00031524496796295746, + "loss": 0.1827, + "step": 301330 + }, + { + "epoch": 12.48, + "grad_norm": 1.015625, + "learning_rate": 0.00031523449855519426, + "loss": 0.1861, + "step": 301340 + }, + { + "epoch": 12.48, + "grad_norm": 0.8515625, + "learning_rate": 0.0003152240290246669, + "loss": 0.1607, + "step": 301350 + }, + { + "epoch": 12.48, + "grad_norm": 0.44921875, + "learning_rate": 0.0003152135593713949, + "loss": 0.1646, + "step": 301360 + }, + { + "epoch": 12.48, + "grad_norm": 0.72265625, + "learning_rate": 0.000315203089595398, + "loss": 0.2336, + "step": 301370 + }, + { + "epoch": 12.48, + "grad_norm": 1.9921875, + "learning_rate": 0.000315192619696696, + "loss": 0.1741, + "step": 301380 + }, + { + "epoch": 12.48, + "grad_norm": 0.890625, + "learning_rate": 0.00031518214967530845, + "loss": 0.1989, + "step": 301390 + }, + { + "epoch": 12.48, + "grad_norm": 0.7734375, + "learning_rate": 0.0003151716795312552, + "loss": 0.2047, + "step": 301400 + }, + { + "epoch": 12.48, + "grad_norm": 0.80859375, + "learning_rate": 0.0003151612092645558, + "loss": 0.1743, + "step": 301410 + }, + { + "epoch": 12.48, + "grad_norm": 0.5546875, + "learning_rate": 0.0003151507388752301, + "loss": 0.1788, + "step": 301420 + }, + { + "epoch": 12.49, + "grad_norm": 2.015625, + "learning_rate": 0.0003151402683632978, + "loss": 0.14, + "step": 301430 + }, + { + "epoch": 12.49, + "grad_norm": 0.73046875, + "learning_rate": 0.00031512979772877847, + "loss": 0.2188, + "step": 301440 + }, + { + "epoch": 12.49, + "grad_norm": 1.0703125, + "learning_rate": 0.0003151193269716919, + "loss": 0.2508, + "step": 301450 + }, + { + "epoch": 12.49, + "grad_norm": 0.9375, + "learning_rate": 0.00031510885609205785, + "loss": 0.1535, + "step": 301460 + }, + { + "epoch": 12.49, + "grad_norm": 0.76953125, + "learning_rate": 0.0003150983850898959, + "loss": 0.1492, + "step": 301470 + }, + { + "epoch": 12.49, + "grad_norm": 0.9375, + "learning_rate": 0.0003150879139652258, + "loss": 0.1488, + "step": 301480 + }, + { + "epoch": 12.49, + "grad_norm": 0.34765625, + "learning_rate": 0.0003150774427180673, + "loss": 0.1487, + "step": 301490 + }, + { + "epoch": 12.49, + "grad_norm": 0.80078125, + "learning_rate": 0.00031506697134844007, + "loss": 0.1668, + "step": 301500 + }, + { + "epoch": 12.49, + "grad_norm": 0.58984375, + "learning_rate": 0.0003150564998563639, + "loss": 0.2035, + "step": 301510 + }, + { + "epoch": 12.49, + "grad_norm": 0.451171875, + "learning_rate": 0.0003150460282418584, + "loss": 0.2401, + "step": 301520 + }, + { + "epoch": 12.49, + "grad_norm": 1.21875, + "learning_rate": 0.0003150355565049432, + "loss": 0.2139, + "step": 301530 + }, + { + "epoch": 12.49, + "grad_norm": 0.9609375, + "learning_rate": 0.00031502508464563824, + "loss": 0.1986, + "step": 301540 + }, + { + "epoch": 12.49, + "grad_norm": 0.640625, + "learning_rate": 0.00031501461266396304, + "loss": 0.2145, + "step": 301550 + }, + { + "epoch": 12.49, + "grad_norm": 2.546875, + "learning_rate": 0.0003150041405599373, + "loss": 0.1954, + "step": 301560 + }, + { + "epoch": 12.49, + "grad_norm": 0.53125, + "learning_rate": 0.0003149936683335808, + "loss": 0.1693, + "step": 301570 + }, + { + "epoch": 12.49, + "grad_norm": 0.244140625, + "learning_rate": 0.0003149831959849133, + "loss": 0.1916, + "step": 301580 + }, + { + "epoch": 12.49, + "grad_norm": 0.796875, + "learning_rate": 0.00031497272351395443, + "loss": 0.1631, + "step": 301590 + }, + { + "epoch": 12.49, + "grad_norm": 0.96875, + "learning_rate": 0.0003149622509207239, + "loss": 0.2242, + "step": 301600 + }, + { + "epoch": 12.49, + "grad_norm": 0.8515625, + "learning_rate": 0.0003149517782052414, + "loss": 0.1611, + "step": 301610 + }, + { + "epoch": 12.49, + "grad_norm": 1.0703125, + "learning_rate": 0.0003149413053675267, + "loss": 0.1545, + "step": 301620 + }, + { + "epoch": 12.49, + "grad_norm": 1.21875, + "learning_rate": 0.0003149308324075995, + "loss": 0.2381, + "step": 301630 + }, + { + "epoch": 12.49, + "grad_norm": 0.58984375, + "learning_rate": 0.0003149203593254795, + "loss": 0.1974, + "step": 301640 + }, + { + "epoch": 12.49, + "grad_norm": 1.7109375, + "learning_rate": 0.00031490988612118624, + "loss": 0.2415, + "step": 301650 + }, + { + "epoch": 12.49, + "grad_norm": 1.0390625, + "learning_rate": 0.00031489941279473974, + "loss": 0.1711, + "step": 301660 + }, + { + "epoch": 12.5, + "grad_norm": 0.494140625, + "learning_rate": 0.00031488893934615954, + "loss": 0.2099, + "step": 301670 + }, + { + "epoch": 12.5, + "grad_norm": 0.8203125, + "learning_rate": 0.00031487846577546533, + "loss": 0.2105, + "step": 301680 + }, + { + "epoch": 12.5, + "grad_norm": 0.609375, + "learning_rate": 0.0003148679920826768, + "loss": 0.1658, + "step": 301690 + }, + { + "epoch": 12.5, + "grad_norm": 1.625, + "learning_rate": 0.00031485751826781374, + "loss": 0.221, + "step": 301700 + }, + { + "epoch": 12.5, + "grad_norm": 0.62890625, + "learning_rate": 0.00031484704433089593, + "loss": 0.1972, + "step": 301710 + }, + { + "epoch": 12.5, + "grad_norm": 0.365234375, + "learning_rate": 0.00031483657027194293, + "loss": 0.2004, + "step": 301720 + }, + { + "epoch": 12.5, + "grad_norm": 1.21875, + "learning_rate": 0.00031482609609097446, + "loss": 0.2032, + "step": 301730 + }, + { + "epoch": 12.5, + "grad_norm": 0.439453125, + "learning_rate": 0.00031481562178801027, + "loss": 0.1564, + "step": 301740 + }, + { + "epoch": 12.5, + "grad_norm": 0.67578125, + "learning_rate": 0.0003148051473630701, + "loss": 0.2511, + "step": 301750 + }, + { + "epoch": 12.5, + "grad_norm": 0.87109375, + "learning_rate": 0.0003147946728161737, + "loss": 0.1831, + "step": 301760 + }, + { + "epoch": 12.5, + "grad_norm": 1.0078125, + "learning_rate": 0.00031478419814734074, + "loss": 0.2038, + "step": 301770 + }, + { + "epoch": 12.5, + "grad_norm": 0.5234375, + "learning_rate": 0.00031477372335659084, + "loss": 0.1819, + "step": 301780 + }, + { + "epoch": 12.5, + "grad_norm": 0.99609375, + "learning_rate": 0.00031476324844394377, + "loss": 0.1491, + "step": 301790 + }, + { + "epoch": 12.5, + "grad_norm": 1.0703125, + "learning_rate": 0.0003147527734094193, + "loss": 0.1807, + "step": 301800 + }, + { + "epoch": 12.5, + "grad_norm": 2.109375, + "learning_rate": 0.00031474229825303714, + "loss": 0.2043, + "step": 301810 + }, + { + "epoch": 12.5, + "grad_norm": 0.74609375, + "learning_rate": 0.0003147318229748169, + "loss": 0.1776, + "step": 301820 + }, + { + "epoch": 12.5, + "grad_norm": 1.1328125, + "learning_rate": 0.0003147213475747783, + "loss": 0.2214, + "step": 301830 + }, + { + "epoch": 12.5, + "grad_norm": 0.91015625, + "learning_rate": 0.0003147108720529413, + "loss": 0.2006, + "step": 301840 + }, + { + "epoch": 12.5, + "grad_norm": 0.34375, + "learning_rate": 0.0003147003964093253, + "loss": 0.176, + "step": 301850 + }, + { + "epoch": 12.5, + "grad_norm": 0.625, + "learning_rate": 0.0003146899206439502, + "loss": 0.1798, + "step": 301860 + }, + { + "epoch": 12.5, + "grad_norm": 0.8828125, + "learning_rate": 0.00031467944475683564, + "loss": 0.1569, + "step": 301870 + }, + { + "epoch": 12.5, + "grad_norm": 0.79296875, + "learning_rate": 0.00031466896874800135, + "loss": 0.1449, + "step": 301880 + }, + { + "epoch": 12.5, + "grad_norm": 0.6484375, + "learning_rate": 0.00031465849261746694, + "loss": 0.2113, + "step": 301890 + }, + { + "epoch": 12.5, + "grad_norm": 0.765625, + "learning_rate": 0.0003146480163652523, + "loss": 0.2075, + "step": 301900 + }, + { + "epoch": 12.51, + "grad_norm": 0.73046875, + "learning_rate": 0.0003146375399913771, + "loss": 0.2197, + "step": 301910 + }, + { + "epoch": 12.51, + "grad_norm": 2.046875, + "learning_rate": 0.00031462706349586107, + "loss": 0.2235, + "step": 301920 + }, + { + "epoch": 12.51, + "grad_norm": 0.66015625, + "learning_rate": 0.0003146165868787238, + "loss": 0.1911, + "step": 301930 + }, + { + "epoch": 12.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0003146061101399851, + "loss": 0.1636, + "step": 301940 + }, + { + "epoch": 12.51, + "grad_norm": 2.171875, + "learning_rate": 0.0003145956332796648, + "loss": 0.1611, + "step": 301950 + }, + { + "epoch": 12.51, + "grad_norm": 0.71484375, + "learning_rate": 0.00031458515629778226, + "loss": 0.2309, + "step": 301960 + }, + { + "epoch": 12.51, + "grad_norm": 0.73828125, + "learning_rate": 0.0003145746791943577, + "loss": 0.1522, + "step": 301970 + }, + { + "epoch": 12.51, + "grad_norm": 1.171875, + "learning_rate": 0.00031456420196941036, + "loss": 0.1757, + "step": 301980 + }, + { + "epoch": 12.51, + "grad_norm": 0.95703125, + "learning_rate": 0.0003145537246229603, + "loss": 0.1752, + "step": 301990 + }, + { + "epoch": 12.51, + "grad_norm": 0.380859375, + "learning_rate": 0.00031454324715502705, + "loss": 0.1788, + "step": 302000 + }, + { + "epoch": 12.51, + "grad_norm": 0.7578125, + "learning_rate": 0.00031453276956563037, + "loss": 0.2121, + "step": 302010 + }, + { + "epoch": 12.51, + "grad_norm": 0.5625, + "learning_rate": 0.00031452229185479, + "loss": 0.1438, + "step": 302020 + }, + { + "epoch": 12.51, + "grad_norm": 0.77734375, + "learning_rate": 0.0003145118140225256, + "loss": 0.187, + "step": 302030 + }, + { + "epoch": 12.51, + "grad_norm": 0.376953125, + "learning_rate": 0.0003145013360688569, + "loss": 0.2208, + "step": 302040 + }, + { + "epoch": 12.51, + "grad_norm": 1.1484375, + "learning_rate": 0.0003144908579938038, + "loss": 0.207, + "step": 302050 + }, + { + "epoch": 12.51, + "grad_norm": 0.326171875, + "learning_rate": 0.00031448037979738577, + "loss": 0.1432, + "step": 302060 + }, + { + "epoch": 12.51, + "grad_norm": 0.6640625, + "learning_rate": 0.0003144699014796226, + "loss": 0.2179, + "step": 302070 + }, + { + "epoch": 12.51, + "grad_norm": 0.3359375, + "learning_rate": 0.00031445942304053414, + "loss": 0.1905, + "step": 302080 + }, + { + "epoch": 12.51, + "grad_norm": 0.57421875, + "learning_rate": 0.0003144489444801399, + "loss": 0.1702, + "step": 302090 + }, + { + "epoch": 12.51, + "grad_norm": 1.84375, + "learning_rate": 0.0003144384657984598, + "loss": 0.1827, + "step": 302100 + }, + { + "epoch": 12.51, + "grad_norm": 1.3046875, + "learning_rate": 0.00031442798699551347, + "loss": 0.2232, + "step": 302110 + }, + { + "epoch": 12.51, + "grad_norm": 1.09375, + "learning_rate": 0.00031441750807132047, + "loss": 0.2306, + "step": 302120 + }, + { + "epoch": 12.51, + "grad_norm": 0.4375, + "learning_rate": 0.0003144070290259008, + "loss": 0.1931, + "step": 302130 + }, + { + "epoch": 12.51, + "grad_norm": 0.5703125, + "learning_rate": 0.0003143965498592741, + "loss": 0.2204, + "step": 302140 + }, + { + "epoch": 12.52, + "grad_norm": 1.4921875, + "learning_rate": 0.00031438607057145997, + "loss": 0.1727, + "step": 302150 + }, + { + "epoch": 12.52, + "grad_norm": 0.91015625, + "learning_rate": 0.0003143755911624782, + "loss": 0.1861, + "step": 302160 + }, + { + "epoch": 12.52, + "grad_norm": 1.71875, + "learning_rate": 0.0003143651116323485, + "loss": 0.189, + "step": 302170 + }, + { + "epoch": 12.52, + "grad_norm": 0.42578125, + "learning_rate": 0.0003143546319810907, + "loss": 0.1959, + "step": 302180 + }, + { + "epoch": 12.52, + "grad_norm": 0.953125, + "learning_rate": 0.0003143441522087244, + "loss": 0.1809, + "step": 302190 + }, + { + "epoch": 12.52, + "grad_norm": 1.546875, + "learning_rate": 0.00031433367231526935, + "loss": 0.1823, + "step": 302200 + }, + { + "epoch": 12.52, + "grad_norm": 0.859375, + "learning_rate": 0.00031432319230074524, + "loss": 0.1766, + "step": 302210 + }, + { + "epoch": 12.52, + "grad_norm": 1.1640625, + "learning_rate": 0.00031431271216517185, + "loss": 0.1701, + "step": 302220 + }, + { + "epoch": 12.52, + "grad_norm": 0.85546875, + "learning_rate": 0.0003143022319085689, + "loss": 0.1567, + "step": 302230 + }, + { + "epoch": 12.52, + "grad_norm": 1.28125, + "learning_rate": 0.00031429175153095614, + "loss": 0.1901, + "step": 302240 + }, + { + "epoch": 12.52, + "grad_norm": 0.494140625, + "learning_rate": 0.0003142812710323531, + "loss": 0.1801, + "step": 302250 + }, + { + "epoch": 12.52, + "grad_norm": 0.58203125, + "learning_rate": 0.0003142707904127798, + "loss": 0.1731, + "step": 302260 + }, + { + "epoch": 12.52, + "grad_norm": 0.796875, + "learning_rate": 0.0003142603096722558, + "loss": 0.1535, + "step": 302270 + }, + { + "epoch": 12.52, + "grad_norm": 0.921875, + "learning_rate": 0.00031424982881080065, + "loss": 0.1777, + "step": 302280 + }, + { + "epoch": 12.52, + "grad_norm": 1.1953125, + "learning_rate": 0.0003142393478284345, + "loss": 0.1497, + "step": 302290 + }, + { + "epoch": 12.52, + "grad_norm": 1.0078125, + "learning_rate": 0.0003142288667251767, + "loss": 0.2027, + "step": 302300 + }, + { + "epoch": 12.52, + "grad_norm": 0.5390625, + "learning_rate": 0.00031421838550104716, + "loss": 0.1888, + "step": 302310 + }, + { + "epoch": 12.52, + "grad_norm": 0.275390625, + "learning_rate": 0.00031420790415606555, + "loss": 0.2273, + "step": 302320 + }, + { + "epoch": 12.52, + "grad_norm": 0.306640625, + "learning_rate": 0.0003141974226902515, + "loss": 0.2192, + "step": 302330 + }, + { + "epoch": 12.52, + "grad_norm": 0.59375, + "learning_rate": 0.000314186941103625, + "loss": 0.2215, + "step": 302340 + }, + { + "epoch": 12.52, + "grad_norm": 0.89453125, + "learning_rate": 0.00031417645939620553, + "loss": 0.1933, + "step": 302350 + }, + { + "epoch": 12.52, + "grad_norm": 1.2734375, + "learning_rate": 0.00031416597756801285, + "loss": 0.2272, + "step": 302360 + }, + { + "epoch": 12.52, + "grad_norm": 0.609375, + "learning_rate": 0.00031415549561906683, + "loss": 0.213, + "step": 302370 + }, + { + "epoch": 12.52, + "grad_norm": 2.296875, + "learning_rate": 0.000314145013549387, + "loss": 0.2242, + "step": 302380 + }, + { + "epoch": 12.52, + "grad_norm": 0.294921875, + "learning_rate": 0.0003141345313589932, + "loss": 0.2062, + "step": 302390 + }, + { + "epoch": 12.53, + "grad_norm": 1.109375, + "learning_rate": 0.00031412404904790516, + "loss": 0.156, + "step": 302400 + }, + { + "epoch": 12.53, + "grad_norm": 0.5390625, + "learning_rate": 0.00031411356661614255, + "loss": 0.1912, + "step": 302410 + }, + { + "epoch": 12.53, + "grad_norm": 1.0703125, + "learning_rate": 0.00031410308406372515, + "loss": 0.1807, + "step": 302420 + }, + { + "epoch": 12.53, + "grad_norm": 0.578125, + "learning_rate": 0.00031409260139067275, + "loss": 0.1949, + "step": 302430 + }, + { + "epoch": 12.53, + "grad_norm": 0.90625, + "learning_rate": 0.00031408211859700486, + "loss": 0.208, + "step": 302440 + }, + { + "epoch": 12.53, + "grad_norm": 1.1640625, + "learning_rate": 0.00031407163568274144, + "loss": 0.1603, + "step": 302450 + }, + { + "epoch": 12.53, + "grad_norm": 1.15625, + "learning_rate": 0.0003140611526479021, + "loss": 0.1848, + "step": 302460 + }, + { + "epoch": 12.53, + "grad_norm": 1.2109375, + "learning_rate": 0.00031405066949250653, + "loss": 0.21, + "step": 302470 + }, + { + "epoch": 12.53, + "grad_norm": 0.578125, + "learning_rate": 0.00031404018621657457, + "loss": 0.2495, + "step": 302480 + }, + { + "epoch": 12.53, + "grad_norm": 1.0625, + "learning_rate": 0.0003140297028201259, + "loss": 0.2238, + "step": 302490 + }, + { + "epoch": 12.53, + "grad_norm": 0.67578125, + "learning_rate": 0.00031401921930318023, + "loss": 0.2111, + "step": 302500 + }, + { + "epoch": 12.53, + "grad_norm": 1.015625, + "learning_rate": 0.00031400873566575727, + "loss": 0.2041, + "step": 302510 + }, + { + "epoch": 12.53, + "grad_norm": 0.63671875, + "learning_rate": 0.00031399825190787684, + "loss": 0.1422, + "step": 302520 + }, + { + "epoch": 12.53, + "grad_norm": 0.8828125, + "learning_rate": 0.0003139877680295586, + "loss": 0.214, + "step": 302530 + }, + { + "epoch": 12.53, + "grad_norm": 1.390625, + "learning_rate": 0.00031397728403082225, + "loss": 0.2034, + "step": 302540 + }, + { + "epoch": 12.53, + "grad_norm": 0.1181640625, + "learning_rate": 0.00031396679991168766, + "loss": 0.1921, + "step": 302550 + }, + { + "epoch": 12.53, + "grad_norm": 0.6484375, + "learning_rate": 0.0003139563156721744, + "loss": 0.19, + "step": 302560 + }, + { + "epoch": 12.53, + "grad_norm": 1.171875, + "learning_rate": 0.00031394583131230214, + "loss": 0.1734, + "step": 302570 + }, + { + "epoch": 12.53, + "grad_norm": 1.171875, + "learning_rate": 0.00031393534683209093, + "loss": 0.197, + "step": 302580 + }, + { + "epoch": 12.53, + "grad_norm": 0.53125, + "learning_rate": 0.00031392486223156017, + "loss": 0.171, + "step": 302590 + }, + { + "epoch": 12.53, + "grad_norm": 1.6875, + "learning_rate": 0.00031391437751072975, + "loss": 0.206, + "step": 302600 + }, + { + "epoch": 12.53, + "grad_norm": 0.7265625, + "learning_rate": 0.00031390389266961946, + "loss": 0.1932, + "step": 302610 + }, + { + "epoch": 12.53, + "grad_norm": 0.609375, + "learning_rate": 0.0003138934077082488, + "loss": 0.2052, + "step": 302620 + }, + { + "epoch": 12.53, + "grad_norm": 0.63671875, + "learning_rate": 0.00031388292262663777, + "loss": 0.1545, + "step": 302630 + }, + { + "epoch": 12.54, + "grad_norm": 0.98046875, + "learning_rate": 0.00031387243742480594, + "loss": 0.1631, + "step": 302640 + }, + { + "epoch": 12.54, + "grad_norm": 0.84375, + "learning_rate": 0.000313861952102773, + "loss": 0.1557, + "step": 302650 + }, + { + "epoch": 12.54, + "grad_norm": 0.87890625, + "learning_rate": 0.00031385146666055886, + "loss": 0.1939, + "step": 302660 + }, + { + "epoch": 12.54, + "grad_norm": 0.81640625, + "learning_rate": 0.0003138409810981831, + "loss": 0.1945, + "step": 302670 + }, + { + "epoch": 12.54, + "grad_norm": 1.0546875, + "learning_rate": 0.00031383049541566555, + "loss": 0.1863, + "step": 302680 + }, + { + "epoch": 12.54, + "grad_norm": 0.8203125, + "learning_rate": 0.00031382000961302595, + "loss": 0.1771, + "step": 302690 + }, + { + "epoch": 12.54, + "grad_norm": 0.5703125, + "learning_rate": 0.00031380952369028386, + "loss": 0.2194, + "step": 302700 + }, + { + "epoch": 12.54, + "grad_norm": 0.6796875, + "learning_rate": 0.00031379903764745923, + "loss": 0.2265, + "step": 302710 + }, + { + "epoch": 12.54, + "grad_norm": 1.2109375, + "learning_rate": 0.0003137885514845716, + "loss": 0.1695, + "step": 302720 + }, + { + "epoch": 12.54, + "grad_norm": 1.03125, + "learning_rate": 0.00031377806520164087, + "loss": 0.2434, + "step": 302730 + }, + { + "epoch": 12.54, + "grad_norm": 0.7109375, + "learning_rate": 0.0003137675787986868, + "loss": 0.2113, + "step": 302740 + }, + { + "epoch": 12.54, + "grad_norm": 0.66015625, + "learning_rate": 0.0003137570922757289, + "loss": 0.1817, + "step": 302750 + }, + { + "epoch": 12.54, + "grad_norm": 0.302734375, + "learning_rate": 0.0003137466056327871, + "loss": 0.1784, + "step": 302760 + }, + { + "epoch": 12.54, + "grad_norm": 0.94921875, + "learning_rate": 0.00031373611886988107, + "loss": 0.1949, + "step": 302770 + }, + { + "epoch": 12.54, + "grad_norm": 0.91796875, + "learning_rate": 0.0003137256319870305, + "loss": 0.2172, + "step": 302780 + }, + { + "epoch": 12.54, + "grad_norm": 0.69140625, + "learning_rate": 0.0003137151449842552, + "loss": 0.1505, + "step": 302790 + }, + { + "epoch": 12.54, + "grad_norm": 0.9453125, + "learning_rate": 0.0003137046578615749, + "loss": 0.2033, + "step": 302800 + }, + { + "epoch": 12.54, + "grad_norm": 0.859375, + "learning_rate": 0.00031369417061900926, + "loss": 0.1555, + "step": 302810 + }, + { + "epoch": 12.54, + "grad_norm": 0.85546875, + "learning_rate": 0.00031368368325657815, + "loss": 0.2175, + "step": 302820 + }, + { + "epoch": 12.54, + "grad_norm": 0.75390625, + "learning_rate": 0.00031367319577430117, + "loss": 0.197, + "step": 302830 + }, + { + "epoch": 12.54, + "grad_norm": 0.318359375, + "learning_rate": 0.0003136627081721981, + "loss": 0.1431, + "step": 302840 + }, + { + "epoch": 12.54, + "grad_norm": 0.4765625, + "learning_rate": 0.00031365222045028876, + "loss": 0.1976, + "step": 302850 + }, + { + "epoch": 12.54, + "grad_norm": 1.1484375, + "learning_rate": 0.00031364173260859274, + "loss": 0.1979, + "step": 302860 + }, + { + "epoch": 12.54, + "grad_norm": 1.1328125, + "learning_rate": 0.00031363124464712984, + "loss": 0.1453, + "step": 302870 + }, + { + "epoch": 12.55, + "grad_norm": 1.546875, + "learning_rate": 0.0003136207565659198, + "loss": 0.1956, + "step": 302880 + }, + { + "epoch": 12.55, + "grad_norm": 1.1171875, + "learning_rate": 0.0003136102683649824, + "loss": 0.1945, + "step": 302890 + }, + { + "epoch": 12.55, + "grad_norm": 0.71484375, + "learning_rate": 0.0003135997800443374, + "loss": 0.1604, + "step": 302900 + }, + { + "epoch": 12.55, + "grad_norm": 0.8984375, + "learning_rate": 0.0003135892916040045, + "loss": 0.2, + "step": 302910 + }, + { + "epoch": 12.55, + "grad_norm": 0.322265625, + "learning_rate": 0.0003135788030440033, + "loss": 0.1455, + "step": 302920 + }, + { + "epoch": 12.55, + "grad_norm": 0.53125, + "learning_rate": 0.00031356831436435374, + "loss": 0.222, + "step": 302930 + }, + { + "epoch": 12.55, + "grad_norm": 0.953125, + "learning_rate": 0.00031355782556507546, + "loss": 0.2932, + "step": 302940 + }, + { + "epoch": 12.55, + "grad_norm": 0.55859375, + "learning_rate": 0.00031354733664618815, + "loss": 0.1819, + "step": 302950 + }, + { + "epoch": 12.55, + "grad_norm": 0.55859375, + "learning_rate": 0.0003135368476077117, + "loss": 0.1618, + "step": 302960 + }, + { + "epoch": 12.55, + "grad_norm": 1.0703125, + "learning_rate": 0.0003135263584496657, + "loss": 0.179, + "step": 302970 + }, + { + "epoch": 12.55, + "grad_norm": 0.439453125, + "learning_rate": 0.0003135158691720701, + "loss": 0.1946, + "step": 302980 + }, + { + "epoch": 12.55, + "grad_norm": 0.8046875, + "learning_rate": 0.0003135053797749443, + "loss": 0.1863, + "step": 302990 + }, + { + "epoch": 12.55, + "grad_norm": 0.40234375, + "learning_rate": 0.00031349489025830836, + "loss": 0.1685, + "step": 303000 + }, + { + "epoch": 12.55, + "grad_norm": 0.5234375, + "learning_rate": 0.00031348440062218185, + "loss": 0.1707, + "step": 303010 + }, + { + "epoch": 12.55, + "grad_norm": 1.328125, + "learning_rate": 0.0003134739108665846, + "loss": 0.2094, + "step": 303020 + }, + { + "epoch": 12.55, + "grad_norm": 0.6640625, + "learning_rate": 0.0003134634209915362, + "loss": 0.2222, + "step": 303030 + }, + { + "epoch": 12.55, + "grad_norm": 0.78515625, + "learning_rate": 0.00031345293099705655, + "loss": 0.1907, + "step": 303040 + }, + { + "epoch": 12.55, + "grad_norm": 0.79296875, + "learning_rate": 0.0003134424408831653, + "loss": 0.1751, + "step": 303050 + }, + { + "epoch": 12.55, + "grad_norm": 0.65625, + "learning_rate": 0.00031343195064988236, + "loss": 0.184, + "step": 303060 + }, + { + "epoch": 12.55, + "grad_norm": 0.72265625, + "learning_rate": 0.00031342146029722723, + "loss": 0.195, + "step": 303070 + }, + { + "epoch": 12.55, + "grad_norm": 0.59375, + "learning_rate": 0.0003134109698252198, + "loss": 0.2474, + "step": 303080 + }, + { + "epoch": 12.55, + "grad_norm": 1.0546875, + "learning_rate": 0.0003134004792338797, + "loss": 0.2638, + "step": 303090 + }, + { + "epoch": 12.55, + "grad_norm": 0.625, + "learning_rate": 0.0003133899885232268, + "loss": 0.1659, + "step": 303100 + }, + { + "epoch": 12.55, + "grad_norm": 1.25, + "learning_rate": 0.00031337949769328076, + "loss": 0.2265, + "step": 303110 + }, + { + "epoch": 12.56, + "grad_norm": 0.78125, + "learning_rate": 0.00031336900674406145, + "loss": 0.2115, + "step": 303120 + }, + { + "epoch": 12.56, + "grad_norm": 0.9140625, + "learning_rate": 0.00031335851567558835, + "loss": 0.1456, + "step": 303130 + }, + { + "epoch": 12.56, + "grad_norm": 1.859375, + "learning_rate": 0.00031334802448788147, + "loss": 0.1772, + "step": 303140 + }, + { + "epoch": 12.56, + "grad_norm": 0.0, + "learning_rate": 0.0003133375331809605, + "loss": 0.1569, + "step": 303150 + }, + { + "epoch": 12.56, + "grad_norm": 0.59375, + "learning_rate": 0.00031332704175484504, + "loss": 0.1618, + "step": 303160 + }, + { + "epoch": 12.56, + "grad_norm": 0.55859375, + "learning_rate": 0.0003133165502095549, + "loss": 0.1868, + "step": 303170 + }, + { + "epoch": 12.56, + "grad_norm": 1.046875, + "learning_rate": 0.0003133060585451099, + "loss": 0.2158, + "step": 303180 + }, + { + "epoch": 12.56, + "grad_norm": 0.91796875, + "learning_rate": 0.00031329556676152974, + "loss": 0.1926, + "step": 303190 + }, + { + "epoch": 12.56, + "grad_norm": 0.7734375, + "learning_rate": 0.0003132850748588342, + "loss": 0.1978, + "step": 303200 + }, + { + "epoch": 12.56, + "grad_norm": 0.1484375, + "learning_rate": 0.0003132745828370429, + "loss": 0.1836, + "step": 303210 + }, + { + "epoch": 12.56, + "grad_norm": 0.427734375, + "learning_rate": 0.00031326409069617567, + "loss": 0.2192, + "step": 303220 + }, + { + "epoch": 12.56, + "grad_norm": 0.28515625, + "learning_rate": 0.00031325359843625237, + "loss": 0.1964, + "step": 303230 + }, + { + "epoch": 12.56, + "grad_norm": 0.9609375, + "learning_rate": 0.0003132431060572925, + "loss": 0.1567, + "step": 303240 + }, + { + "epoch": 12.56, + "grad_norm": 0.91015625, + "learning_rate": 0.00031323261355931597, + "loss": 0.2082, + "step": 303250 + }, + { + "epoch": 12.56, + "grad_norm": 1.2890625, + "learning_rate": 0.0003132221209423425, + "loss": 0.2509, + "step": 303260 + }, + { + "epoch": 12.56, + "grad_norm": 1.234375, + "learning_rate": 0.00031321162820639183, + "loss": 0.1987, + "step": 303270 + }, + { + "epoch": 12.56, + "grad_norm": 0.87890625, + "learning_rate": 0.0003132011353514837, + "loss": 0.1875, + "step": 303280 + }, + { + "epoch": 12.56, + "grad_norm": 0.73046875, + "learning_rate": 0.0003131906423776378, + "loss": 0.1705, + "step": 303290 + }, + { + "epoch": 12.56, + "grad_norm": 1.2109375, + "learning_rate": 0.00031318014928487405, + "loss": 0.1711, + "step": 303300 + }, + { + "epoch": 12.56, + "grad_norm": 0.5703125, + "learning_rate": 0.000313169656073212, + "loss": 0.1868, + "step": 303310 + }, + { + "epoch": 12.56, + "grad_norm": 1.078125, + "learning_rate": 0.0003131591627426715, + "loss": 0.2352, + "step": 303320 + }, + { + "epoch": 12.56, + "grad_norm": 1.6015625, + "learning_rate": 0.0003131486692932723, + "loss": 0.21, + "step": 303330 + }, + { + "epoch": 12.56, + "grad_norm": 0.58203125, + "learning_rate": 0.0003131381757250341, + "loss": 0.181, + "step": 303340 + }, + { + "epoch": 12.56, + "grad_norm": 0.3828125, + "learning_rate": 0.00031312768203797667, + "loss": 0.1595, + "step": 303350 + }, + { + "epoch": 12.57, + "grad_norm": 0.55859375, + "learning_rate": 0.00031311718823211974, + "loss": 0.2029, + "step": 303360 + }, + { + "epoch": 12.57, + "grad_norm": 1.0078125, + "learning_rate": 0.0003131066943074831, + "loss": 0.196, + "step": 303370 + }, + { + "epoch": 12.57, + "grad_norm": 0.83203125, + "learning_rate": 0.00031309620026408647, + "loss": 0.1928, + "step": 303380 + }, + { + "epoch": 12.57, + "grad_norm": 1.2109375, + "learning_rate": 0.00031308570610194964, + "loss": 0.1796, + "step": 303390 + }, + { + "epoch": 12.57, + "grad_norm": 0.2060546875, + "learning_rate": 0.00031307521182109236, + "loss": 0.1683, + "step": 303400 + }, + { + "epoch": 12.57, + "grad_norm": 0.462890625, + "learning_rate": 0.0003130647174215342, + "loss": 0.2216, + "step": 303410 + }, + { + "epoch": 12.57, + "grad_norm": 1.1484375, + "learning_rate": 0.0003130542229032951, + "loss": 0.1907, + "step": 303420 + }, + { + "epoch": 12.57, + "grad_norm": 0.96484375, + "learning_rate": 0.0003130437282663948, + "loss": 0.1598, + "step": 303430 + }, + { + "epoch": 12.57, + "grad_norm": 0.56640625, + "learning_rate": 0.00031303323351085307, + "loss": 0.2172, + "step": 303440 + }, + { + "epoch": 12.57, + "grad_norm": 0.0, + "learning_rate": 0.0003130227386366895, + "loss": 0.1702, + "step": 303450 + }, + { + "epoch": 12.57, + "grad_norm": 1.4375, + "learning_rate": 0.000313012243643924, + "loss": 0.2025, + "step": 303460 + }, + { + "epoch": 12.57, + "grad_norm": 0.46484375, + "learning_rate": 0.00031300174853257623, + "loss": 0.2157, + "step": 303470 + }, + { + "epoch": 12.57, + "grad_norm": 0.671875, + "learning_rate": 0.00031299125330266594, + "loss": 0.1968, + "step": 303480 + }, + { + "epoch": 12.57, + "grad_norm": 0.298828125, + "learning_rate": 0.000312980757954213, + "loss": 0.1635, + "step": 303490 + }, + { + "epoch": 12.57, + "grad_norm": 0.6015625, + "learning_rate": 0.00031297026248723706, + "loss": 0.1599, + "step": 303500 + }, + { + "epoch": 12.57, + "grad_norm": 1.1015625, + "learning_rate": 0.0003129597669017578, + "loss": 0.1323, + "step": 303510 + }, + { + "epoch": 12.57, + "grad_norm": 0.88671875, + "learning_rate": 0.00031294927119779515, + "loss": 0.1467, + "step": 303520 + }, + { + "epoch": 12.57, + "grad_norm": 0.79296875, + "learning_rate": 0.00031293877537536875, + "loss": 0.2389, + "step": 303530 + }, + { + "epoch": 12.57, + "grad_norm": 0.875, + "learning_rate": 0.0003129282794344983, + "loss": 0.2208, + "step": 303540 + }, + { + "epoch": 12.57, + "grad_norm": 2.234375, + "learning_rate": 0.0003129177833752037, + "loss": 0.1576, + "step": 303550 + }, + { + "epoch": 12.57, + "grad_norm": 1.2421875, + "learning_rate": 0.00031290728719750457, + "loss": 0.191, + "step": 303560 + }, + { + "epoch": 12.57, + "grad_norm": 0.78515625, + "learning_rate": 0.0003128967909014208, + "loss": 0.182, + "step": 303570 + }, + { + "epoch": 12.57, + "grad_norm": 0.1806640625, + "learning_rate": 0.000312886294486972, + "loss": 0.2243, + "step": 303580 + }, + { + "epoch": 12.57, + "grad_norm": 0.7421875, + "learning_rate": 0.000312875797954178, + "loss": 0.217, + "step": 303590 + }, + { + "epoch": 12.58, + "grad_norm": 0.67578125, + "learning_rate": 0.0003128653013030586, + "loss": 0.2064, + "step": 303600 + }, + { + "epoch": 12.58, + "grad_norm": 0.87890625, + "learning_rate": 0.00031285480453363334, + "loss": 0.2106, + "step": 303610 + }, + { + "epoch": 12.58, + "grad_norm": 1.0859375, + "learning_rate": 0.00031284430764592225, + "loss": 0.1808, + "step": 303620 + }, + { + "epoch": 12.58, + "grad_norm": 0.85546875, + "learning_rate": 0.00031283381063994497, + "loss": 0.1717, + "step": 303630 + }, + { + "epoch": 12.58, + "grad_norm": 0.494140625, + "learning_rate": 0.00031282331351572114, + "loss": 0.1835, + "step": 303640 + }, + { + "epoch": 12.58, + "grad_norm": 1.3359375, + "learning_rate": 0.00031281281627327075, + "loss": 0.1734, + "step": 303650 + }, + { + "epoch": 12.58, + "grad_norm": 0.58203125, + "learning_rate": 0.00031280231891261327, + "loss": 0.1495, + "step": 303660 + }, + { + "epoch": 12.58, + "grad_norm": 1.296875, + "learning_rate": 0.0003127918214337687, + "loss": 0.194, + "step": 303670 + }, + { + "epoch": 12.58, + "grad_norm": 1.03125, + "learning_rate": 0.0003127813238367567, + "loss": 0.1694, + "step": 303680 + }, + { + "epoch": 12.58, + "grad_norm": 1.375, + "learning_rate": 0.000312770826121597, + "loss": 0.2071, + "step": 303690 + }, + { + "epoch": 12.58, + "grad_norm": 0.94140625, + "learning_rate": 0.00031276032828830944, + "loss": 0.1964, + "step": 303700 + }, + { + "epoch": 12.58, + "grad_norm": 0.3359375, + "learning_rate": 0.0003127498303369137, + "loss": 0.2198, + "step": 303710 + }, + { + "epoch": 12.58, + "grad_norm": 0.4453125, + "learning_rate": 0.00031273933226742953, + "loss": 0.2116, + "step": 303720 + }, + { + "epoch": 12.58, + "grad_norm": 1.9375, + "learning_rate": 0.00031272883407987674, + "loss": 0.2329, + "step": 303730 + }, + { + "epoch": 12.58, + "grad_norm": 0.8046875, + "learning_rate": 0.000312718335774275, + "loss": 0.1757, + "step": 303740 + }, + { + "epoch": 12.58, + "grad_norm": 0.640625, + "learning_rate": 0.0003127078373506442, + "loss": 0.1881, + "step": 303750 + }, + { + "epoch": 12.58, + "grad_norm": 0.69140625, + "learning_rate": 0.000312697338809004, + "loss": 0.1499, + "step": 303760 + }, + { + "epoch": 12.58, + "grad_norm": 0.609375, + "learning_rate": 0.0003126868401493741, + "loss": 0.1679, + "step": 303770 + }, + { + "epoch": 12.58, + "grad_norm": 0.5625, + "learning_rate": 0.0003126763413717744, + "loss": 0.1611, + "step": 303780 + }, + { + "epoch": 12.58, + "grad_norm": 0.53125, + "learning_rate": 0.00031266584247622464, + "loss": 0.2068, + "step": 303790 + }, + { + "epoch": 12.58, + "grad_norm": 0.26171875, + "learning_rate": 0.00031265534346274447, + "loss": 0.1312, + "step": 303800 + }, + { + "epoch": 12.58, + "grad_norm": 0.451171875, + "learning_rate": 0.00031264484433135375, + "loss": 0.1937, + "step": 303810 + }, + { + "epoch": 12.58, + "grad_norm": 0.66015625, + "learning_rate": 0.0003126343450820722, + "loss": 0.1791, + "step": 303820 + }, + { + "epoch": 12.58, + "grad_norm": 0.6640625, + "learning_rate": 0.00031262384571491953, + "loss": 0.1858, + "step": 303830 + }, + { + "epoch": 12.59, + "grad_norm": 0.78515625, + "learning_rate": 0.0003126133462299156, + "loss": 0.2474, + "step": 303840 + }, + { + "epoch": 12.59, + "grad_norm": 1.5390625, + "learning_rate": 0.00031260284662708, + "loss": 0.2019, + "step": 303850 + }, + { + "epoch": 12.59, + "grad_norm": 0.55859375, + "learning_rate": 0.0003125923469064327, + "loss": 0.1868, + "step": 303860 + }, + { + "epoch": 12.59, + "grad_norm": 0.69140625, + "learning_rate": 0.0003125818470679934, + "loss": 0.1628, + "step": 303870 + }, + { + "epoch": 12.59, + "grad_norm": 0.87890625, + "learning_rate": 0.0003125713471117817, + "loss": 0.1817, + "step": 303880 + }, + { + "epoch": 12.59, + "grad_norm": 0.7578125, + "learning_rate": 0.00031256084703781763, + "loss": 0.2076, + "step": 303890 + }, + { + "epoch": 12.59, + "grad_norm": 0.67578125, + "learning_rate": 0.0003125503468461207, + "loss": 0.2234, + "step": 303900 + }, + { + "epoch": 12.59, + "grad_norm": 0.63671875, + "learning_rate": 0.00031253984653671076, + "loss": 0.1954, + "step": 303910 + }, + { + "epoch": 12.59, + "grad_norm": 0.70703125, + "learning_rate": 0.0003125293461096077, + "loss": 0.2152, + "step": 303920 + }, + { + "epoch": 12.59, + "grad_norm": 0.6875, + "learning_rate": 0.000312518845564831, + "loss": 0.2066, + "step": 303930 + }, + { + "epoch": 12.59, + "grad_norm": 0.62890625, + "learning_rate": 0.0003125083449024007, + "loss": 0.1819, + "step": 303940 + }, + { + "epoch": 12.59, + "grad_norm": 0.43359375, + "learning_rate": 0.00031249784412233644, + "loss": 0.2291, + "step": 303950 + }, + { + "epoch": 12.59, + "grad_norm": 0.73046875, + "learning_rate": 0.00031248734322465786, + "loss": 0.19, + "step": 303960 + }, + { + "epoch": 12.59, + "grad_norm": 1.4921875, + "learning_rate": 0.00031247684220938503, + "loss": 0.17, + "step": 303970 + }, + { + "epoch": 12.59, + "grad_norm": 0.6015625, + "learning_rate": 0.0003124663410765374, + "loss": 0.2076, + "step": 303980 + }, + { + "epoch": 12.59, + "grad_norm": 0.98828125, + "learning_rate": 0.0003124558398261349, + "loss": 0.1903, + "step": 303990 + }, + { + "epoch": 12.59, + "grad_norm": 1.234375, + "learning_rate": 0.00031244533845819735, + "loss": 0.1919, + "step": 304000 + }, + { + "epoch": 12.59, + "grad_norm": 0.412109375, + "learning_rate": 0.0003124348369727442, + "loss": 0.1907, + "step": 304010 + }, + { + "epoch": 12.59, + "grad_norm": 0.78125, + "learning_rate": 0.00031242433536979563, + "loss": 0.2565, + "step": 304020 + }, + { + "epoch": 12.59, + "grad_norm": 0.46875, + "learning_rate": 0.00031241383364937115, + "loss": 0.1873, + "step": 304030 + }, + { + "epoch": 12.59, + "grad_norm": 0.67578125, + "learning_rate": 0.0003124033318114905, + "loss": 0.1902, + "step": 304040 + }, + { + "epoch": 12.59, + "grad_norm": 0.435546875, + "learning_rate": 0.0003123928298561736, + "loss": 0.1638, + "step": 304050 + }, + { + "epoch": 12.59, + "grad_norm": 0.29296875, + "learning_rate": 0.00031238232778344014, + "loss": 0.1461, + "step": 304060 + }, + { + "epoch": 12.59, + "grad_norm": 1.140625, + "learning_rate": 0.0003123718255933098, + "loss": 0.1117, + "step": 304070 + }, + { + "epoch": 12.59, + "grad_norm": 1.1328125, + "learning_rate": 0.0003123613232858025, + "loss": 0.2152, + "step": 304080 + }, + { + "epoch": 12.6, + "grad_norm": 1.4765625, + "learning_rate": 0.0003123508208609378, + "loss": 0.2117, + "step": 304090 + }, + { + "epoch": 12.6, + "grad_norm": 0.5625, + "learning_rate": 0.0003123403183187357, + "loss": 0.226, + "step": 304100 + }, + { + "epoch": 12.6, + "grad_norm": 0.29296875, + "learning_rate": 0.00031232981565921587, + "loss": 0.1616, + "step": 304110 + }, + { + "epoch": 12.6, + "grad_norm": 1.359375, + "learning_rate": 0.00031231931288239797, + "loss": 0.1864, + "step": 304120 + }, + { + "epoch": 12.6, + "grad_norm": 0.78125, + "learning_rate": 0.000312308809988302, + "loss": 0.1241, + "step": 304130 + }, + { + "epoch": 12.6, + "grad_norm": 0.55078125, + "learning_rate": 0.0003122983069769474, + "loss": 0.1937, + "step": 304140 + }, + { + "epoch": 12.6, + "grad_norm": 0.78515625, + "learning_rate": 0.0003122878038483542, + "loss": 0.1874, + "step": 304150 + }, + { + "epoch": 12.6, + "grad_norm": 0.94921875, + "learning_rate": 0.0003122773006025421, + "loss": 0.1646, + "step": 304160 + }, + { + "epoch": 12.6, + "grad_norm": 1.09375, + "learning_rate": 0.0003122667972395307, + "loss": 0.2244, + "step": 304170 + }, + { + "epoch": 12.6, + "grad_norm": 0.2421875, + "learning_rate": 0.0003122562937593401, + "loss": 0.1823, + "step": 304180 + }, + { + "epoch": 12.6, + "grad_norm": 1.0859375, + "learning_rate": 0.00031224579016198977, + "loss": 0.221, + "step": 304190 + }, + { + "epoch": 12.6, + "grad_norm": 0.9921875, + "learning_rate": 0.0003122352864474997, + "loss": 0.2079, + "step": 304200 + }, + { + "epoch": 12.6, + "grad_norm": 0.53515625, + "learning_rate": 0.0003122247826158894, + "loss": 0.2475, + "step": 304210 + }, + { + "epoch": 12.6, + "grad_norm": 0.4453125, + "learning_rate": 0.0003122142786671789, + "loss": 0.1481, + "step": 304220 + }, + { + "epoch": 12.6, + "grad_norm": 0.8828125, + "learning_rate": 0.0003122037746013877, + "loss": 0.1942, + "step": 304230 + }, + { + "epoch": 12.6, + "grad_norm": 1.4140625, + "learning_rate": 0.00031219327041853587, + "loss": 0.181, + "step": 304240 + }, + { + "epoch": 12.6, + "grad_norm": 0.515625, + "learning_rate": 0.0003121827661186429, + "loss": 0.211, + "step": 304250 + }, + { + "epoch": 12.6, + "grad_norm": 2.015625, + "learning_rate": 0.00031217226170172875, + "loss": 0.2095, + "step": 304260 + }, + { + "epoch": 12.6, + "grad_norm": 0.9296875, + "learning_rate": 0.0003121617571678131, + "loss": 0.1972, + "step": 304270 + }, + { + "epoch": 12.6, + "grad_norm": 0.65234375, + "learning_rate": 0.00031215125251691573, + "loss": 0.2176, + "step": 304280 + }, + { + "epoch": 12.6, + "grad_norm": 0.6328125, + "learning_rate": 0.00031214074774905644, + "loss": 0.1781, + "step": 304290 + }, + { + "epoch": 12.6, + "grad_norm": 0.51953125, + "learning_rate": 0.00031213024286425495, + "loss": 0.1937, + "step": 304300 + }, + { + "epoch": 12.6, + "grad_norm": 1.3359375, + "learning_rate": 0.0003121197378625311, + "loss": 0.2011, + "step": 304310 + }, + { + "epoch": 12.6, + "grad_norm": 0.48828125, + "learning_rate": 0.00031210923274390453, + "loss": 0.1876, + "step": 304320 + }, + { + "epoch": 12.61, + "grad_norm": 0.8515625, + "learning_rate": 0.00031209872750839507, + "loss": 0.1572, + "step": 304330 + }, + { + "epoch": 12.61, + "grad_norm": 0.66796875, + "learning_rate": 0.0003120882221560226, + "loss": 0.2018, + "step": 304340 + }, + { + "epoch": 12.61, + "grad_norm": 2.09375, + "learning_rate": 0.0003120777166868068, + "loss": 0.1951, + "step": 304350 + }, + { + "epoch": 12.61, + "grad_norm": 0.423828125, + "learning_rate": 0.00031206721110076737, + "loss": 0.2323, + "step": 304360 + }, + { + "epoch": 12.61, + "grad_norm": 0.875, + "learning_rate": 0.00031205670539792427, + "loss": 0.1824, + "step": 304370 + }, + { + "epoch": 12.61, + "grad_norm": 1.375, + "learning_rate": 0.00031204619957829705, + "loss": 0.1899, + "step": 304380 + }, + { + "epoch": 12.61, + "grad_norm": 0.84375, + "learning_rate": 0.00031203569364190553, + "loss": 0.1861, + "step": 304390 + }, + { + "epoch": 12.61, + "grad_norm": 0.447265625, + "learning_rate": 0.0003120251875887696, + "loss": 0.2039, + "step": 304400 + }, + { + "epoch": 12.61, + "grad_norm": 0.5078125, + "learning_rate": 0.000312014681418909, + "loss": 0.225, + "step": 304410 + }, + { + "epoch": 12.61, + "grad_norm": 0.62109375, + "learning_rate": 0.00031200417513234347, + "loss": 0.1622, + "step": 304420 + }, + { + "epoch": 12.61, + "grad_norm": 1.2578125, + "learning_rate": 0.00031199366872909276, + "loss": 0.202, + "step": 304430 + }, + { + "epoch": 12.61, + "grad_norm": 0.1611328125, + "learning_rate": 0.0003119831622091766, + "loss": 0.1623, + "step": 304440 + }, + { + "epoch": 12.61, + "grad_norm": 1.0078125, + "learning_rate": 0.0003119726555726149, + "loss": 0.1816, + "step": 304450 + }, + { + "epoch": 12.61, + "grad_norm": 0.72265625, + "learning_rate": 0.00031196214881942737, + "loss": 0.2328, + "step": 304460 + }, + { + "epoch": 12.61, + "grad_norm": 0.51171875, + "learning_rate": 0.0003119516419496337, + "loss": 0.2102, + "step": 304470 + }, + { + "epoch": 12.61, + "grad_norm": 0.455078125, + "learning_rate": 0.0003119411349632537, + "loss": 0.1971, + "step": 304480 + }, + { + "epoch": 12.61, + "grad_norm": 1.0078125, + "learning_rate": 0.00031193062786030723, + "loss": 0.2072, + "step": 304490 + }, + { + "epoch": 12.61, + "grad_norm": 0.93359375, + "learning_rate": 0.00031192012064081405, + "loss": 0.1665, + "step": 304500 + }, + { + "epoch": 12.61, + "grad_norm": 1.0546875, + "learning_rate": 0.00031190961330479386, + "loss": 0.2019, + "step": 304510 + }, + { + "epoch": 12.61, + "grad_norm": 1.171875, + "learning_rate": 0.00031189910585226644, + "loss": 0.1804, + "step": 304520 + }, + { + "epoch": 12.61, + "grad_norm": 1.2578125, + "learning_rate": 0.0003118885982832516, + "loss": 0.1963, + "step": 304530 + }, + { + "epoch": 12.61, + "grad_norm": 0.88671875, + "learning_rate": 0.00031187809059776906, + "loss": 0.1891, + "step": 304540 + }, + { + "epoch": 12.61, + "grad_norm": 0.07080078125, + "learning_rate": 0.00031186758279583873, + "loss": 0.1953, + "step": 304550 + }, + { + "epoch": 12.61, + "grad_norm": 1.1796875, + "learning_rate": 0.00031185707487748016, + "loss": 0.1956, + "step": 304560 + }, + { + "epoch": 12.62, + "grad_norm": 0.56640625, + "learning_rate": 0.00031184656684271335, + "loss": 0.1851, + "step": 304570 + }, + { + "epoch": 12.62, + "grad_norm": 0.9296875, + "learning_rate": 0.000311836058691558, + "loss": 0.2137, + "step": 304580 + }, + { + "epoch": 12.62, + "grad_norm": 0.50390625, + "learning_rate": 0.0003118255504240338, + "loss": 0.1743, + "step": 304590 + }, + { + "epoch": 12.62, + "grad_norm": 0.57421875, + "learning_rate": 0.0003118150420401606, + "loss": 0.1436, + "step": 304600 + }, + { + "epoch": 12.62, + "grad_norm": 0.55859375, + "learning_rate": 0.00031180453353995814, + "loss": 0.1743, + "step": 304610 + }, + { + "epoch": 12.62, + "grad_norm": 1.796875, + "learning_rate": 0.00031179402492344634, + "loss": 0.2178, + "step": 304620 + }, + { + "epoch": 12.62, + "grad_norm": 0.53515625, + "learning_rate": 0.0003117835161906447, + "loss": 0.1912, + "step": 304630 + }, + { + "epoch": 12.62, + "grad_norm": 0.478515625, + "learning_rate": 0.00031177300734157326, + "loss": 0.2163, + "step": 304640 + }, + { + "epoch": 12.62, + "grad_norm": 1.6640625, + "learning_rate": 0.00031176249837625166, + "loss": 0.1731, + "step": 304650 + }, + { + "epoch": 12.62, + "grad_norm": 0.44140625, + "learning_rate": 0.00031175198929469974, + "loss": 0.2167, + "step": 304660 + }, + { + "epoch": 12.62, + "grad_norm": 0.7265625, + "learning_rate": 0.0003117414800969372, + "loss": 0.173, + "step": 304670 + }, + { + "epoch": 12.62, + "grad_norm": 1.234375, + "learning_rate": 0.0003117309707829839, + "loss": 0.182, + "step": 304680 + }, + { + "epoch": 12.62, + "grad_norm": 1.0, + "learning_rate": 0.00031172046135285954, + "loss": 0.2072, + "step": 304690 + }, + { + "epoch": 12.62, + "grad_norm": 0.35546875, + "learning_rate": 0.000311709951806584, + "loss": 0.1679, + "step": 304700 + }, + { + "epoch": 12.62, + "grad_norm": 0.7421875, + "learning_rate": 0.0003116994421441769, + "loss": 0.1688, + "step": 304710 + }, + { + "epoch": 12.62, + "grad_norm": 0.58203125, + "learning_rate": 0.00031168893236565816, + "loss": 0.191, + "step": 304720 + }, + { + "epoch": 12.62, + "grad_norm": 0.57421875, + "learning_rate": 0.0003116784224710475, + "loss": 0.226, + "step": 304730 + }, + { + "epoch": 12.62, + "grad_norm": 0.80078125, + "learning_rate": 0.00031166791246036476, + "loss": 0.163, + "step": 304740 + }, + { + "epoch": 12.62, + "grad_norm": 1.09375, + "learning_rate": 0.00031165740233362964, + "loss": 0.2038, + "step": 304750 + }, + { + "epoch": 12.62, + "grad_norm": 0.330078125, + "learning_rate": 0.00031164689209086196, + "loss": 0.1662, + "step": 304760 + }, + { + "epoch": 12.62, + "grad_norm": 1.2734375, + "learning_rate": 0.00031163638173208145, + "loss": 0.1801, + "step": 304770 + }, + { + "epoch": 12.62, + "grad_norm": 0.6796875, + "learning_rate": 0.00031162587125730794, + "loss": 0.1829, + "step": 304780 + }, + { + "epoch": 12.62, + "grad_norm": 0.474609375, + "learning_rate": 0.00031161536066656113, + "loss": 0.1704, + "step": 304790 + }, + { + "epoch": 12.62, + "grad_norm": 0.515625, + "learning_rate": 0.000311604849959861, + "loss": 0.2051, + "step": 304800 + }, + { + "epoch": 12.63, + "grad_norm": 1.53125, + "learning_rate": 0.00031159433913722714, + "loss": 0.1625, + "step": 304810 + }, + { + "epoch": 12.63, + "grad_norm": 0.3828125, + "learning_rate": 0.00031158382819867933, + "loss": 0.169, + "step": 304820 + }, + { + "epoch": 12.63, + "grad_norm": 0.44921875, + "learning_rate": 0.00031157331714423746, + "loss": 0.1972, + "step": 304830 + }, + { + "epoch": 12.63, + "grad_norm": 0.63671875, + "learning_rate": 0.0003115628059739212, + "loss": 0.195, + "step": 304840 + }, + { + "epoch": 12.63, + "grad_norm": 2.546875, + "learning_rate": 0.0003115522946877505, + "loss": 0.1792, + "step": 304850 + }, + { + "epoch": 12.63, + "grad_norm": 0.396484375, + "learning_rate": 0.00031154178328574494, + "loss": 0.1524, + "step": 304860 + }, + { + "epoch": 12.63, + "grad_norm": 0.60546875, + "learning_rate": 0.0003115312717679244, + "loss": 0.1784, + "step": 304870 + }, + { + "epoch": 12.63, + "grad_norm": 0.546875, + "learning_rate": 0.0003115207601343087, + "loss": 0.1831, + "step": 304880 + }, + { + "epoch": 12.63, + "grad_norm": 1.09375, + "learning_rate": 0.0003115102483849175, + "loss": 0.2224, + "step": 304890 + }, + { + "epoch": 12.63, + "grad_norm": 1.375, + "learning_rate": 0.0003114997365197707, + "loss": 0.1694, + "step": 304900 + }, + { + "epoch": 12.63, + "grad_norm": 0.89453125, + "learning_rate": 0.00031148922453888805, + "loss": 0.1718, + "step": 304910 + }, + { + "epoch": 12.63, + "grad_norm": 0.78125, + "learning_rate": 0.00031147871244228926, + "loss": 0.1915, + "step": 304920 + }, + { + "epoch": 12.63, + "grad_norm": 0.921875, + "learning_rate": 0.0003114682002299942, + "loss": 0.2177, + "step": 304930 + }, + { + "epoch": 12.63, + "grad_norm": 0.396484375, + "learning_rate": 0.0003114576879020227, + "loss": 0.1482, + "step": 304940 + }, + { + "epoch": 12.63, + "grad_norm": 1.46875, + "learning_rate": 0.00031144717545839436, + "loss": 0.1982, + "step": 304950 + }, + { + "epoch": 12.63, + "grad_norm": 0.7421875, + "learning_rate": 0.00031143666289912915, + "loss": 0.1824, + "step": 304960 + }, + { + "epoch": 12.63, + "grad_norm": 0.98828125, + "learning_rate": 0.0003114261502242467, + "loss": 0.2271, + "step": 304970 + }, + { + "epoch": 12.63, + "grad_norm": 1.609375, + "learning_rate": 0.0003114156374337669, + "loss": 0.1824, + "step": 304980 + }, + { + "epoch": 12.63, + "grad_norm": 1.3515625, + "learning_rate": 0.00031140512452770955, + "loss": 0.1969, + "step": 304990 + }, + { + "epoch": 12.63, + "grad_norm": 1.484375, + "learning_rate": 0.0003113946115060943, + "loss": 0.2466, + "step": 305000 + }, + { + "epoch": 12.63, + "grad_norm": 1.0546875, + "learning_rate": 0.0003113840983689411, + "loss": 0.1849, + "step": 305010 + }, + { + "epoch": 12.63, + "grad_norm": 0.56640625, + "learning_rate": 0.0003113735851162696, + "loss": 0.1863, + "step": 305020 + }, + { + "epoch": 12.63, + "grad_norm": 0.59375, + "learning_rate": 0.00031136307174809964, + "loss": 0.165, + "step": 305030 + }, + { + "epoch": 12.63, + "grad_norm": 1.140625, + "learning_rate": 0.0003113525582644511, + "loss": 0.1406, + "step": 305040 + }, + { + "epoch": 12.64, + "grad_norm": 0.0, + "learning_rate": 0.00031134204466534357, + "loss": 0.2199, + "step": 305050 + }, + { + "epoch": 12.64, + "grad_norm": 1.8203125, + "learning_rate": 0.000311331530950797, + "loss": 0.1627, + "step": 305060 + }, + { + "epoch": 12.64, + "grad_norm": 0.859375, + "learning_rate": 0.00031132101712083105, + "loss": 0.2416, + "step": 305070 + }, + { + "epoch": 12.64, + "grad_norm": 0.55078125, + "learning_rate": 0.00031131050317546557, + "loss": 0.2105, + "step": 305080 + }, + { + "epoch": 12.64, + "grad_norm": 1.3046875, + "learning_rate": 0.00031129998911472043, + "loss": 0.1799, + "step": 305090 + }, + { + "epoch": 12.64, + "grad_norm": 0.69140625, + "learning_rate": 0.00031128947493861526, + "loss": 0.1766, + "step": 305100 + }, + { + "epoch": 12.64, + "grad_norm": 1.0703125, + "learning_rate": 0.0003112789606471699, + "loss": 0.2176, + "step": 305110 + }, + { + "epoch": 12.64, + "grad_norm": 0.48828125, + "learning_rate": 0.00031126844624040424, + "loss": 0.1833, + "step": 305120 + }, + { + "epoch": 12.64, + "grad_norm": 0.453125, + "learning_rate": 0.0003112579317183379, + "loss": 0.1947, + "step": 305130 + }, + { + "epoch": 12.64, + "grad_norm": 0.83984375, + "learning_rate": 0.00031124741708099074, + "loss": 0.1861, + "step": 305140 + }, + { + "epoch": 12.64, + "grad_norm": 0.55078125, + "learning_rate": 0.00031123690232838264, + "loss": 0.1593, + "step": 305150 + }, + { + "epoch": 12.64, + "grad_norm": 0.921875, + "learning_rate": 0.00031122638746053317, + "loss": 0.1762, + "step": 305160 + }, + { + "epoch": 12.64, + "grad_norm": 1.421875, + "learning_rate": 0.00031121587247746234, + "loss": 0.2195, + "step": 305170 + }, + { + "epoch": 12.64, + "grad_norm": 0.80859375, + "learning_rate": 0.0003112053573791899, + "loss": 0.1967, + "step": 305180 + }, + { + "epoch": 12.64, + "grad_norm": 1.140625, + "learning_rate": 0.00031119484216573547, + "loss": 0.19, + "step": 305190 + }, + { + "epoch": 12.64, + "grad_norm": 0.625, + "learning_rate": 0.00031118432683711905, + "loss": 0.1805, + "step": 305200 + }, + { + "epoch": 12.64, + "grad_norm": 1.0625, + "learning_rate": 0.00031117381139336025, + "loss": 0.2037, + "step": 305210 + }, + { + "epoch": 12.64, + "grad_norm": 0.96484375, + "learning_rate": 0.00031116329583447897, + "loss": 0.1333, + "step": 305220 + }, + { + "epoch": 12.64, + "grad_norm": 1.21875, + "learning_rate": 0.0003111527801604951, + "loss": 0.1827, + "step": 305230 + }, + { + "epoch": 12.64, + "grad_norm": 0.80078125, + "learning_rate": 0.0003111422643714281, + "loss": 0.1479, + "step": 305240 + }, + { + "epoch": 12.64, + "grad_norm": 0.7890625, + "learning_rate": 0.0003111317484672981, + "loss": 0.2213, + "step": 305250 + }, + { + "epoch": 12.64, + "grad_norm": 0.64453125, + "learning_rate": 0.0003111212324481247, + "loss": 0.2039, + "step": 305260 + }, + { + "epoch": 12.64, + "grad_norm": 2.5625, + "learning_rate": 0.00031111071631392774, + "loss": 0.2216, + "step": 305270 + }, + { + "epoch": 12.64, + "grad_norm": 0.4921875, + "learning_rate": 0.0003111002000647271, + "loss": 0.1895, + "step": 305280 + }, + { + "epoch": 12.65, + "grad_norm": 1.453125, + "learning_rate": 0.0003110896837005423, + "loss": 0.2152, + "step": 305290 + }, + { + "epoch": 12.65, + "grad_norm": 0.51953125, + "learning_rate": 0.0003110791672213934, + "loss": 0.2316, + "step": 305300 + }, + { + "epoch": 12.65, + "grad_norm": 0.80859375, + "learning_rate": 0.00031106865062730015, + "loss": 0.2393, + "step": 305310 + }, + { + "epoch": 12.65, + "grad_norm": 0.91015625, + "learning_rate": 0.0003110581339182822, + "loss": 0.212, + "step": 305320 + }, + { + "epoch": 12.65, + "grad_norm": 0.439453125, + "learning_rate": 0.0003110476170943595, + "loss": 0.2359, + "step": 305330 + }, + { + "epoch": 12.65, + "grad_norm": 0.578125, + "learning_rate": 0.00031103710015555183, + "loss": 0.1943, + "step": 305340 + }, + { + "epoch": 12.65, + "grad_norm": 0.56640625, + "learning_rate": 0.0003110265831018788, + "loss": 0.2096, + "step": 305350 + }, + { + "epoch": 12.65, + "grad_norm": 0.34765625, + "learning_rate": 0.00031101606593336045, + "loss": 0.1658, + "step": 305360 + }, + { + "epoch": 12.65, + "grad_norm": 1.0703125, + "learning_rate": 0.0003110055486500164, + "loss": 0.2168, + "step": 305370 + }, + { + "epoch": 12.65, + "grad_norm": 0.578125, + "learning_rate": 0.0003109950312518665, + "loss": 0.1546, + "step": 305380 + }, + { + "epoch": 12.65, + "grad_norm": 0.4921875, + "learning_rate": 0.00031098451373893056, + "loss": 0.1733, + "step": 305390 + }, + { + "epoch": 12.65, + "grad_norm": 0.56640625, + "learning_rate": 0.0003109739961112283, + "loss": 0.246, + "step": 305400 + }, + { + "epoch": 12.65, + "grad_norm": 0.0, + "learning_rate": 0.00031096347836877963, + "loss": 0.1944, + "step": 305410 + }, + { + "epoch": 12.65, + "grad_norm": 0.8359375, + "learning_rate": 0.0003109529605116042, + "loss": 0.1814, + "step": 305420 + }, + { + "epoch": 12.65, + "grad_norm": 1.0859375, + "learning_rate": 0.0003109424425397219, + "loss": 0.1731, + "step": 305430 + }, + { + "epoch": 12.65, + "grad_norm": 0.6796875, + "learning_rate": 0.0003109319244531526, + "loss": 0.1725, + "step": 305440 + }, + { + "epoch": 12.65, + "grad_norm": 1.1875, + "learning_rate": 0.00031092140625191587, + "loss": 0.2607, + "step": 305450 + }, + { + "epoch": 12.65, + "grad_norm": 0.8046875, + "learning_rate": 0.00031091088793603174, + "loss": 0.1979, + "step": 305460 + }, + { + "epoch": 12.65, + "grad_norm": 1.4453125, + "learning_rate": 0.00031090036950551984, + "loss": 0.219, + "step": 305470 + }, + { + "epoch": 12.65, + "grad_norm": 0.59375, + "learning_rate": 0.0003108898509604, + "loss": 0.1925, + "step": 305480 + }, + { + "epoch": 12.65, + "grad_norm": 0.86328125, + "learning_rate": 0.000310879332300692, + "loss": 0.2084, + "step": 305490 + }, + { + "epoch": 12.65, + "grad_norm": 1.3203125, + "learning_rate": 0.0003108688135264158, + "loss": 0.2155, + "step": 305500 + }, + { + "epoch": 12.65, + "grad_norm": 1.0546875, + "learning_rate": 0.000310858294637591, + "loss": 0.2493, + "step": 305510 + }, + { + "epoch": 12.65, + "grad_norm": 0.416015625, + "learning_rate": 0.0003108477756342375, + "loss": 0.1765, + "step": 305520 + }, + { + "epoch": 12.66, + "grad_norm": 0.671875, + "learning_rate": 0.0003108372565163749, + "loss": 0.1997, + "step": 305530 + }, + { + "epoch": 12.66, + "grad_norm": 0.609375, + "learning_rate": 0.0003108267372840233, + "loss": 0.1782, + "step": 305540 + }, + { + "epoch": 12.66, + "grad_norm": 0.9453125, + "learning_rate": 0.0003108162179372023, + "loss": 0.2406, + "step": 305550 + }, + { + "epoch": 12.66, + "grad_norm": 1.2578125, + "learning_rate": 0.00031080569847593176, + "loss": 0.1749, + "step": 305560 + }, + { + "epoch": 12.66, + "grad_norm": 0.65234375, + "learning_rate": 0.00031079517890023154, + "loss": 0.1999, + "step": 305570 + }, + { + "epoch": 12.66, + "grad_norm": 0.59765625, + "learning_rate": 0.00031078465921012126, + "loss": 0.2087, + "step": 305580 + }, + { + "epoch": 12.66, + "grad_norm": 1.6640625, + "learning_rate": 0.0003107741394056208, + "loss": 0.2356, + "step": 305590 + }, + { + "epoch": 12.66, + "grad_norm": 0.8984375, + "learning_rate": 0.00031076361948675007, + "loss": 0.1581, + "step": 305600 + }, + { + "epoch": 12.66, + "grad_norm": 1.3359375, + "learning_rate": 0.0003107530994535286, + "loss": 0.1991, + "step": 305610 + }, + { + "epoch": 12.66, + "grad_norm": 0.71484375, + "learning_rate": 0.00031074257930597654, + "loss": 0.2124, + "step": 305620 + }, + { + "epoch": 12.66, + "grad_norm": 0.5078125, + "learning_rate": 0.00031073205904411344, + "loss": 0.2227, + "step": 305630 + }, + { + "epoch": 12.66, + "grad_norm": 1.1328125, + "learning_rate": 0.00031072153866795916, + "loss": 0.1984, + "step": 305640 + }, + { + "epoch": 12.66, + "grad_norm": 0.408203125, + "learning_rate": 0.00031071101817753356, + "loss": 0.1793, + "step": 305650 + }, + { + "epoch": 12.66, + "grad_norm": 0.890625, + "learning_rate": 0.0003107004975728563, + "loss": 0.2264, + "step": 305660 + }, + { + "epoch": 12.66, + "grad_norm": 0.79296875, + "learning_rate": 0.00031068997685394724, + "loss": 0.213, + "step": 305670 + }, + { + "epoch": 12.66, + "grad_norm": 0.5703125, + "learning_rate": 0.00031067945602082626, + "loss": 0.1942, + "step": 305680 + }, + { + "epoch": 12.66, + "grad_norm": 0.98828125, + "learning_rate": 0.00031066893507351303, + "loss": 0.2215, + "step": 305690 + }, + { + "epoch": 12.66, + "grad_norm": 0.6953125, + "learning_rate": 0.00031065841401202747, + "loss": 0.2406, + "step": 305700 + }, + { + "epoch": 12.66, + "grad_norm": 0.275390625, + "learning_rate": 0.00031064789283638934, + "loss": 0.1701, + "step": 305710 + }, + { + "epoch": 12.66, + "grad_norm": 0.71875, + "learning_rate": 0.00031063737154661834, + "loss": 0.165, + "step": 305720 + }, + { + "epoch": 12.66, + "grad_norm": 0.78125, + "learning_rate": 0.0003106268501427345, + "loss": 0.2145, + "step": 305730 + }, + { + "epoch": 12.66, + "grad_norm": 3.28125, + "learning_rate": 0.00031061632862475734, + "loss": 0.2171, + "step": 305740 + }, + { + "epoch": 12.66, + "grad_norm": 0.953125, + "learning_rate": 0.00031060580699270686, + "loss": 0.1967, + "step": 305750 + }, + { + "epoch": 12.66, + "grad_norm": 0.251953125, + "learning_rate": 0.00031059528524660275, + "loss": 0.1933, + "step": 305760 + }, + { + "epoch": 12.66, + "grad_norm": 0.34765625, + "learning_rate": 0.0003105847633864649, + "loss": 0.172, + "step": 305770 + }, + { + "epoch": 12.67, + "grad_norm": 0.70703125, + "learning_rate": 0.0003105742414123131, + "loss": 0.195, + "step": 305780 + }, + { + "epoch": 12.67, + "grad_norm": 0.80078125, + "learning_rate": 0.00031056371932416705, + "loss": 0.2445, + "step": 305790 + }, + { + "epoch": 12.67, + "grad_norm": 1.109375, + "learning_rate": 0.00031055319712204664, + "loss": 0.1827, + "step": 305800 + }, + { + "epoch": 12.67, + "grad_norm": 0.5234375, + "learning_rate": 0.0003105426748059717, + "loss": 0.2182, + "step": 305810 + }, + { + "epoch": 12.67, + "grad_norm": 1.0078125, + "learning_rate": 0.0003105321523759619, + "loss": 0.1874, + "step": 305820 + }, + { + "epoch": 12.67, + "grad_norm": 1.3359375, + "learning_rate": 0.0003105216298320372, + "loss": 0.1794, + "step": 305830 + }, + { + "epoch": 12.67, + "grad_norm": 0.34375, + "learning_rate": 0.0003105111071742173, + "loss": 0.1827, + "step": 305840 + }, + { + "epoch": 12.67, + "grad_norm": 0.55859375, + "learning_rate": 0.000310500584402522, + "loss": 0.2051, + "step": 305850 + }, + { + "epoch": 12.67, + "grad_norm": 0.85546875, + "learning_rate": 0.00031049006151697117, + "loss": 0.1448, + "step": 305860 + }, + { + "epoch": 12.67, + "grad_norm": 0.37890625, + "learning_rate": 0.0003104795385175846, + "loss": 0.2061, + "step": 305870 + }, + { + "epoch": 12.67, + "grad_norm": 0.71875, + "learning_rate": 0.000310469015404382, + "loss": 0.1974, + "step": 305880 + }, + { + "epoch": 12.67, + "grad_norm": 0.4296875, + "learning_rate": 0.0003104584921773833, + "loss": 0.2129, + "step": 305890 + }, + { + "epoch": 12.67, + "grad_norm": 0.5390625, + "learning_rate": 0.0003104479688366082, + "loss": 0.1494, + "step": 305900 + }, + { + "epoch": 12.67, + "grad_norm": 0.51171875, + "learning_rate": 0.0003104374453820766, + "loss": 0.2174, + "step": 305910 + }, + { + "epoch": 12.67, + "grad_norm": 0.3515625, + "learning_rate": 0.0003104269218138082, + "loss": 0.2071, + "step": 305920 + }, + { + "epoch": 12.67, + "grad_norm": 2.078125, + "learning_rate": 0.00031041639813182286, + "loss": 0.1856, + "step": 305930 + }, + { + "epoch": 12.67, + "grad_norm": 0.56640625, + "learning_rate": 0.00031040587433614047, + "loss": 0.1877, + "step": 305940 + }, + { + "epoch": 12.67, + "grad_norm": 1.390625, + "learning_rate": 0.0003103953504267807, + "loss": 0.1852, + "step": 305950 + }, + { + "epoch": 12.67, + "grad_norm": 0.39453125, + "learning_rate": 0.0003103848264037634, + "loss": 0.2242, + "step": 305960 + }, + { + "epoch": 12.67, + "grad_norm": 0.50390625, + "learning_rate": 0.00031037430226710835, + "loss": 0.1622, + "step": 305970 + }, + { + "epoch": 12.67, + "grad_norm": 0.96875, + "learning_rate": 0.0003103637780168354, + "loss": 0.2091, + "step": 305980 + }, + { + "epoch": 12.67, + "grad_norm": 1.2109375, + "learning_rate": 0.00031035325365296433, + "loss": 0.2233, + "step": 305990 + }, + { + "epoch": 12.67, + "grad_norm": 0.8671875, + "learning_rate": 0.00031034272917551496, + "loss": 0.1719, + "step": 306000 + }, + { + "epoch": 12.67, + "grad_norm": 1.6484375, + "learning_rate": 0.00031033220458450713, + "loss": 0.2554, + "step": 306010 + }, + { + "epoch": 12.68, + "grad_norm": 0.37890625, + "learning_rate": 0.00031032167987996056, + "loss": 0.1523, + "step": 306020 + }, + { + "epoch": 12.68, + "grad_norm": 1.2890625, + "learning_rate": 0.0003103111550618951, + "loss": 0.2364, + "step": 306030 + }, + { + "epoch": 12.68, + "grad_norm": 0.96875, + "learning_rate": 0.0003103006301303306, + "loss": 0.227, + "step": 306040 + }, + { + "epoch": 12.68, + "grad_norm": 1.0390625, + "learning_rate": 0.0003102901050852868, + "loss": 0.2548, + "step": 306050 + }, + { + "epoch": 12.68, + "grad_norm": 0.515625, + "learning_rate": 0.00031027957992678357, + "loss": 0.1837, + "step": 306060 + }, + { + "epoch": 12.68, + "grad_norm": 0.6796875, + "learning_rate": 0.0003102690546548406, + "loss": 0.1887, + "step": 306070 + }, + { + "epoch": 12.68, + "grad_norm": 0.63671875, + "learning_rate": 0.00031025852926947785, + "loss": 0.1775, + "step": 306080 + }, + { + "epoch": 12.68, + "grad_norm": 0.8046875, + "learning_rate": 0.00031024800377071497, + "loss": 0.235, + "step": 306090 + }, + { + "epoch": 12.68, + "grad_norm": 0.609375, + "learning_rate": 0.000310237478158572, + "loss": 0.2162, + "step": 306100 + }, + { + "epoch": 12.68, + "grad_norm": 0.85546875, + "learning_rate": 0.0003102269524330685, + "loss": 0.1813, + "step": 306110 + }, + { + "epoch": 12.68, + "grad_norm": 0.86328125, + "learning_rate": 0.00031021642659422434, + "loss": 0.1945, + "step": 306120 + }, + { + "epoch": 12.68, + "grad_norm": 0.447265625, + "learning_rate": 0.00031020590064205944, + "loss": 0.1596, + "step": 306130 + }, + { + "epoch": 12.68, + "grad_norm": 1.078125, + "learning_rate": 0.0003101953745765935, + "loss": 0.1506, + "step": 306140 + }, + { + "epoch": 12.68, + "grad_norm": 0.294921875, + "learning_rate": 0.0003101848483978464, + "loss": 0.1329, + "step": 306150 + }, + { + "epoch": 12.68, + "grad_norm": 0.72265625, + "learning_rate": 0.0003101743221058379, + "loss": 0.1661, + "step": 306160 + }, + { + "epoch": 12.68, + "grad_norm": 0.51171875, + "learning_rate": 0.0003101637957005878, + "loss": 0.2087, + "step": 306170 + }, + { + "epoch": 12.68, + "grad_norm": 0.3828125, + "learning_rate": 0.00031015326918211597, + "loss": 0.1868, + "step": 306180 + }, + { + "epoch": 12.68, + "grad_norm": 0.72265625, + "learning_rate": 0.00031014274255044215, + "loss": 0.2204, + "step": 306190 + }, + { + "epoch": 12.68, + "grad_norm": 1.0703125, + "learning_rate": 0.0003101322158055862, + "loss": 0.1883, + "step": 306200 + }, + { + "epoch": 12.68, + "grad_norm": 0.40234375, + "learning_rate": 0.00031012168894756795, + "loss": 0.2033, + "step": 306210 + }, + { + "epoch": 12.68, + "grad_norm": 0.63671875, + "learning_rate": 0.0003101111619764071, + "loss": 0.1979, + "step": 306220 + }, + { + "epoch": 12.68, + "grad_norm": 0.66015625, + "learning_rate": 0.0003101006348921236, + "loss": 0.1919, + "step": 306230 + }, + { + "epoch": 12.68, + "grad_norm": 0.6953125, + "learning_rate": 0.00031009010769473714, + "loss": 0.133, + "step": 306240 + }, + { + "epoch": 12.68, + "grad_norm": 0.984375, + "learning_rate": 0.00031007958038426766, + "loss": 0.1497, + "step": 306250 + }, + { + "epoch": 12.69, + "grad_norm": 0.59765625, + "learning_rate": 0.0003100690529607348, + "loss": 0.1689, + "step": 306260 + }, + { + "epoch": 12.69, + "grad_norm": 2.125, + "learning_rate": 0.0003100585254241586, + "loss": 0.1673, + "step": 306270 + }, + { + "epoch": 12.69, + "grad_norm": 0.703125, + "learning_rate": 0.0003100479977745586, + "loss": 0.2038, + "step": 306280 + }, + { + "epoch": 12.69, + "grad_norm": 0.88671875, + "learning_rate": 0.0003100374700119548, + "loss": 0.1915, + "step": 306290 + }, + { + "epoch": 12.69, + "grad_norm": 0.68359375, + "learning_rate": 0.00031002694213636705, + "loss": 0.1645, + "step": 306300 + }, + { + "epoch": 12.69, + "grad_norm": 0.60546875, + "learning_rate": 0.0003100164141478149, + "loss": 0.2058, + "step": 306310 + }, + { + "epoch": 12.69, + "grad_norm": 0.60546875, + "learning_rate": 0.00031000588604631853, + "loss": 0.1821, + "step": 306320 + }, + { + "epoch": 12.69, + "grad_norm": 1.875, + "learning_rate": 0.0003099953578318975, + "loss": 0.1774, + "step": 306330 + }, + { + "epoch": 12.69, + "grad_norm": 0.51953125, + "learning_rate": 0.0003099848295045716, + "loss": 0.2098, + "step": 306340 + }, + { + "epoch": 12.69, + "grad_norm": 0.86328125, + "learning_rate": 0.00030997430106436086, + "loss": 0.1896, + "step": 306350 + }, + { + "epoch": 12.69, + "grad_norm": 0.9296875, + "learning_rate": 0.0003099637725112849, + "loss": 0.1785, + "step": 306360 + }, + { + "epoch": 12.69, + "grad_norm": 0.349609375, + "learning_rate": 0.00030995324384536357, + "loss": 0.1711, + "step": 306370 + }, + { + "epoch": 12.69, + "grad_norm": 1.9453125, + "learning_rate": 0.00030994271506661677, + "loss": 0.2287, + "step": 306380 + }, + { + "epoch": 12.69, + "grad_norm": 0.326171875, + "learning_rate": 0.00030993218617506415, + "loss": 0.1697, + "step": 306390 + }, + { + "epoch": 12.69, + "grad_norm": 1.6328125, + "learning_rate": 0.0003099216571707257, + "loss": 0.1933, + "step": 306400 + }, + { + "epoch": 12.69, + "grad_norm": 0.66015625, + "learning_rate": 0.00030991112805362115, + "loss": 0.2443, + "step": 306410 + }, + { + "epoch": 12.69, + "grad_norm": 0.419921875, + "learning_rate": 0.0003099005988237703, + "loss": 0.1386, + "step": 306420 + }, + { + "epoch": 12.69, + "grad_norm": 0.890625, + "learning_rate": 0.0003098900694811931, + "loss": 0.2735, + "step": 306430 + }, + { + "epoch": 12.69, + "grad_norm": 0.384765625, + "learning_rate": 0.0003098795400259092, + "loss": 0.2038, + "step": 306440 + }, + { + "epoch": 12.69, + "grad_norm": 0.71875, + "learning_rate": 0.00030986901045793844, + "loss": 0.1625, + "step": 306450 + }, + { + "epoch": 12.69, + "grad_norm": 0.66796875, + "learning_rate": 0.00030985848077730066, + "loss": 0.1629, + "step": 306460 + }, + { + "epoch": 12.69, + "grad_norm": 1.5234375, + "learning_rate": 0.00030984795098401563, + "loss": 0.2047, + "step": 306470 + }, + { + "epoch": 12.69, + "grad_norm": 1.078125, + "learning_rate": 0.00030983742107810336, + "loss": 0.1761, + "step": 306480 + }, + { + "epoch": 12.69, + "grad_norm": 0.8984375, + "learning_rate": 0.00030982689105958346, + "loss": 0.2123, + "step": 306490 + }, + { + "epoch": 12.7, + "grad_norm": 0.50390625, + "learning_rate": 0.0003098163609284758, + "loss": 0.177, + "step": 306500 + }, + { + "epoch": 12.7, + "grad_norm": 0.87109375, + "learning_rate": 0.0003098058306848002, + "loss": 0.1944, + "step": 306510 + }, + { + "epoch": 12.7, + "grad_norm": 0.5625, + "learning_rate": 0.0003097953003285765, + "loss": 0.205, + "step": 306520 + }, + { + "epoch": 12.7, + "grad_norm": 1.9296875, + "learning_rate": 0.0003097847698598245, + "loss": 0.1975, + "step": 306530 + }, + { + "epoch": 12.7, + "grad_norm": 0.5859375, + "learning_rate": 0.00030977423927856405, + "loss": 0.1801, + "step": 306540 + }, + { + "epoch": 12.7, + "grad_norm": 1.9375, + "learning_rate": 0.0003097637085848149, + "loss": 0.1915, + "step": 306550 + }, + { + "epoch": 12.7, + "grad_norm": 0.66796875, + "learning_rate": 0.000309753177778597, + "loss": 0.2045, + "step": 306560 + }, + { + "epoch": 12.7, + "grad_norm": 0.890625, + "learning_rate": 0.00030974264685992994, + "loss": 0.2183, + "step": 306570 + }, + { + "epoch": 12.7, + "grad_norm": 1.125, + "learning_rate": 0.00030973211582883373, + "loss": 0.1954, + "step": 306580 + }, + { + "epoch": 12.7, + "grad_norm": 0.71875, + "learning_rate": 0.00030972158468532815, + "loss": 0.2169, + "step": 306590 + }, + { + "epoch": 12.7, + "grad_norm": 0.98046875, + "learning_rate": 0.00030971105342943286, + "loss": 0.2042, + "step": 306600 + }, + { + "epoch": 12.7, + "grad_norm": 0.76171875, + "learning_rate": 0.00030970052206116794, + "loss": 0.1801, + "step": 306610 + }, + { + "epoch": 12.7, + "grad_norm": 0.62890625, + "learning_rate": 0.0003096899905805531, + "loss": 0.1172, + "step": 306620 + }, + { + "epoch": 12.7, + "grad_norm": 0.7109375, + "learning_rate": 0.00030967945898760805, + "loss": 0.1966, + "step": 306630 + }, + { + "epoch": 12.7, + "grad_norm": 0.388671875, + "learning_rate": 0.0003096689272823529, + "loss": 0.1786, + "step": 306640 + }, + { + "epoch": 12.7, + "grad_norm": 0.87109375, + "learning_rate": 0.00030965839546480704, + "loss": 0.1561, + "step": 306650 + }, + { + "epoch": 12.7, + "grad_norm": 1.15625, + "learning_rate": 0.00030964786353499066, + "loss": 0.164, + "step": 306660 + }, + { + "epoch": 12.7, + "grad_norm": 0.7578125, + "learning_rate": 0.00030963733149292337, + "loss": 0.2192, + "step": 306670 + }, + { + "epoch": 12.7, + "grad_norm": 0.61328125, + "learning_rate": 0.00030962679933862504, + "loss": 0.2368, + "step": 306680 + }, + { + "epoch": 12.7, + "grad_norm": 0.65234375, + "learning_rate": 0.0003096162670721156, + "loss": 0.2212, + "step": 306690 + }, + { + "epoch": 12.7, + "grad_norm": 1.53125, + "learning_rate": 0.0003096057346934148, + "loss": 0.1836, + "step": 306700 + }, + { + "epoch": 12.7, + "grad_norm": 1.1953125, + "learning_rate": 0.00030959520220254233, + "loss": 0.2043, + "step": 306710 + }, + { + "epoch": 12.7, + "grad_norm": 0.63671875, + "learning_rate": 0.0003095846695995182, + "loss": 0.1778, + "step": 306720 + }, + { + "epoch": 12.7, + "grad_norm": 1.0859375, + "learning_rate": 0.00030957413688436215, + "loss": 0.1975, + "step": 306730 + }, + { + "epoch": 12.71, + "grad_norm": 0.408203125, + "learning_rate": 0.000309563604057094, + "loss": 0.2017, + "step": 306740 + }, + { + "epoch": 12.71, + "grad_norm": 0.52734375, + "learning_rate": 0.0003095530711177336, + "loss": 0.2166, + "step": 306750 + }, + { + "epoch": 12.71, + "grad_norm": 1.4453125, + "learning_rate": 0.00030954253806630066, + "loss": 0.1763, + "step": 306760 + }, + { + "epoch": 12.71, + "grad_norm": 0.796875, + "learning_rate": 0.0003095320049028152, + "loss": 0.2352, + "step": 306770 + }, + { + "epoch": 12.71, + "grad_norm": 0.7109375, + "learning_rate": 0.00030952147162729694, + "loss": 0.1944, + "step": 306780 + }, + { + "epoch": 12.71, + "grad_norm": 0.66796875, + "learning_rate": 0.0003095109382397656, + "loss": 0.1873, + "step": 306790 + }, + { + "epoch": 12.71, + "grad_norm": 0.34375, + "learning_rate": 0.0003095004047402412, + "loss": 0.2077, + "step": 306800 + }, + { + "epoch": 12.71, + "grad_norm": 0.62109375, + "learning_rate": 0.00030948987112874344, + "loss": 0.2023, + "step": 306810 + }, + { + "epoch": 12.71, + "grad_norm": 1.8046875, + "learning_rate": 0.0003094793374052921, + "loss": 0.2125, + "step": 306820 + }, + { + "epoch": 12.71, + "grad_norm": 0.89453125, + "learning_rate": 0.0003094688035699072, + "loss": 0.1954, + "step": 306830 + }, + { + "epoch": 12.71, + "grad_norm": 0.443359375, + "learning_rate": 0.00030945826962260823, + "loss": 0.19, + "step": 306840 + }, + { + "epoch": 12.71, + "grad_norm": 0.5625, + "learning_rate": 0.00030944773556341543, + "loss": 0.1664, + "step": 306850 + }, + { + "epoch": 12.71, + "grad_norm": 0.66015625, + "learning_rate": 0.0003094372013923483, + "loss": 0.212, + "step": 306860 + }, + { + "epoch": 12.71, + "grad_norm": 1.3671875, + "learning_rate": 0.0003094266671094268, + "loss": 0.1705, + "step": 306870 + }, + { + "epoch": 12.71, + "grad_norm": 0.96484375, + "learning_rate": 0.0003094161327146707, + "loss": 0.1773, + "step": 306880 + }, + { + "epoch": 12.71, + "grad_norm": 0.84765625, + "learning_rate": 0.00030940559820809993, + "loss": 0.1629, + "step": 306890 + }, + { + "epoch": 12.71, + "grad_norm": 2.1875, + "learning_rate": 0.0003093950635897342, + "loss": 0.2197, + "step": 306900 + }, + { + "epoch": 12.71, + "grad_norm": 0.5625, + "learning_rate": 0.0003093845288595934, + "loss": 0.2119, + "step": 306910 + }, + { + "epoch": 12.71, + "grad_norm": 1.2890625, + "learning_rate": 0.00030937399401769725, + "loss": 0.1594, + "step": 306920 + }, + { + "epoch": 12.71, + "grad_norm": 0.6015625, + "learning_rate": 0.00030936345906406573, + "loss": 0.2087, + "step": 306930 + }, + { + "epoch": 12.71, + "grad_norm": 0.8828125, + "learning_rate": 0.0003093529239987185, + "loss": 0.1996, + "step": 306940 + }, + { + "epoch": 12.71, + "grad_norm": 0.73828125, + "learning_rate": 0.00030934238882167553, + "loss": 0.1614, + "step": 306950 + }, + { + "epoch": 12.71, + "grad_norm": 0.86328125, + "learning_rate": 0.0003093318535329567, + "loss": 0.1807, + "step": 306960 + }, + { + "epoch": 12.71, + "grad_norm": 0.66796875, + "learning_rate": 0.0003093213181325816, + "loss": 0.1798, + "step": 306970 + }, + { + "epoch": 12.72, + "grad_norm": 0.2275390625, + "learning_rate": 0.00030931078262057016, + "loss": 0.156, + "step": 306980 + }, + { + "epoch": 12.72, + "grad_norm": 0.96484375, + "learning_rate": 0.00030930024699694236, + "loss": 0.1972, + "step": 306990 + }, + { + "epoch": 12.72, + "grad_norm": 1.15625, + "learning_rate": 0.00030928971126171774, + "loss": 0.2101, + "step": 307000 + }, + { + "epoch": 12.72, + "grad_norm": 1.3125, + "learning_rate": 0.0003092791754149164, + "loss": 0.1827, + "step": 307010 + }, + { + "epoch": 12.72, + "grad_norm": 0.7734375, + "learning_rate": 0.00030926863945655804, + "loss": 0.2019, + "step": 307020 + }, + { + "epoch": 12.72, + "grad_norm": 0.95703125, + "learning_rate": 0.0003092581033866625, + "loss": 0.1688, + "step": 307030 + }, + { + "epoch": 12.72, + "grad_norm": 1.046875, + "learning_rate": 0.0003092475672052496, + "loss": 0.1779, + "step": 307040 + }, + { + "epoch": 12.72, + "grad_norm": 1.15625, + "learning_rate": 0.00030923703091233917, + "loss": 0.2373, + "step": 307050 + }, + { + "epoch": 12.72, + "grad_norm": 0.58203125, + "learning_rate": 0.00030922649450795104, + "loss": 0.1759, + "step": 307060 + }, + { + "epoch": 12.72, + "grad_norm": 0.73046875, + "learning_rate": 0.000309215957992105, + "loss": 0.2001, + "step": 307070 + }, + { + "epoch": 12.72, + "grad_norm": 1.0234375, + "learning_rate": 0.00030920542136482097, + "loss": 0.2039, + "step": 307080 + }, + { + "epoch": 12.72, + "grad_norm": 0.61328125, + "learning_rate": 0.00030919488462611877, + "loss": 0.1619, + "step": 307090 + }, + { + "epoch": 12.72, + "grad_norm": 0.5234375, + "learning_rate": 0.00030918434777601816, + "loss": 0.188, + "step": 307100 + }, + { + "epoch": 12.72, + "grad_norm": 1.34375, + "learning_rate": 0.000309173810814539, + "loss": 0.1494, + "step": 307110 + }, + { + "epoch": 12.72, + "grad_norm": 0.72265625, + "learning_rate": 0.0003091632737417012, + "loss": 0.2154, + "step": 307120 + }, + { + "epoch": 12.72, + "grad_norm": 1.0390625, + "learning_rate": 0.0003091527365575244, + "loss": 0.1784, + "step": 307130 + }, + { + "epoch": 12.72, + "grad_norm": 1.2421875, + "learning_rate": 0.00030914219926202847, + "loss": 0.2488, + "step": 307140 + }, + { + "epoch": 12.72, + "grad_norm": 0.67578125, + "learning_rate": 0.0003091316618552334, + "loss": 0.1526, + "step": 307150 + }, + { + "epoch": 12.72, + "grad_norm": 1.0390625, + "learning_rate": 0.0003091211243371589, + "loss": 0.2122, + "step": 307160 + }, + { + "epoch": 12.72, + "grad_norm": 0.45703125, + "learning_rate": 0.0003091105867078249, + "loss": 0.2023, + "step": 307170 + }, + { + "epoch": 12.72, + "grad_norm": 0.84765625, + "learning_rate": 0.0003091000489672511, + "loss": 0.214, + "step": 307180 + }, + { + "epoch": 12.72, + "grad_norm": 0.92578125, + "learning_rate": 0.0003090895111154574, + "loss": 0.1998, + "step": 307190 + }, + { + "epoch": 12.72, + "grad_norm": 1.6875, + "learning_rate": 0.0003090789731524636, + "loss": 0.1997, + "step": 307200 + }, + { + "epoch": 12.72, + "grad_norm": 0.65625, + "learning_rate": 0.00030906843507828966, + "loss": 0.1505, + "step": 307210 + }, + { + "epoch": 12.73, + "grad_norm": 0.447265625, + "learning_rate": 0.00030905789689295515, + "loss": 0.1793, + "step": 307220 + }, + { + "epoch": 12.73, + "grad_norm": 1.0546875, + "learning_rate": 0.0003090473585964801, + "loss": 0.1881, + "step": 307230 + }, + { + "epoch": 12.73, + "grad_norm": 1.21875, + "learning_rate": 0.0003090368201888843, + "loss": 0.2464, + "step": 307240 + }, + { + "epoch": 12.73, + "grad_norm": 0.70703125, + "learning_rate": 0.00030902628167018767, + "loss": 0.1847, + "step": 307250 + }, + { + "epoch": 12.73, + "grad_norm": 1.1171875, + "learning_rate": 0.00030901574304040985, + "loss": 0.1978, + "step": 307260 + }, + { + "epoch": 12.73, + "grad_norm": 1.3046875, + "learning_rate": 0.0003090052042995708, + "loss": 0.1743, + "step": 307270 + }, + { + "epoch": 12.73, + "grad_norm": 1.3203125, + "learning_rate": 0.0003089946654476903, + "loss": 0.2059, + "step": 307280 + }, + { + "epoch": 12.73, + "grad_norm": 2.5, + "learning_rate": 0.0003089841264847883, + "loss": 0.1338, + "step": 307290 + }, + { + "epoch": 12.73, + "grad_norm": 0.7109375, + "learning_rate": 0.0003089735874108845, + "loss": 0.1853, + "step": 307300 + }, + { + "epoch": 12.73, + "grad_norm": 0.58203125, + "learning_rate": 0.00030896304822599876, + "loss": 0.1985, + "step": 307310 + }, + { + "epoch": 12.73, + "grad_norm": 0.474609375, + "learning_rate": 0.00030895250893015093, + "loss": 0.1483, + "step": 307320 + }, + { + "epoch": 12.73, + "grad_norm": 0.78515625, + "learning_rate": 0.00030894196952336087, + "loss": 0.1965, + "step": 307330 + }, + { + "epoch": 12.73, + "grad_norm": 0.76171875, + "learning_rate": 0.00030893143000564836, + "loss": 0.1929, + "step": 307340 + }, + { + "epoch": 12.73, + "grad_norm": 0.94921875, + "learning_rate": 0.00030892089037703324, + "loss": 0.2641, + "step": 307350 + }, + { + "epoch": 12.73, + "grad_norm": 1.7265625, + "learning_rate": 0.0003089103506375354, + "loss": 0.2028, + "step": 307360 + }, + { + "epoch": 12.73, + "grad_norm": 0.70703125, + "learning_rate": 0.00030889981078717466, + "loss": 0.1324, + "step": 307370 + }, + { + "epoch": 12.73, + "grad_norm": 0.2216796875, + "learning_rate": 0.0003088892708259708, + "loss": 0.1968, + "step": 307380 + }, + { + "epoch": 12.73, + "grad_norm": 0.6796875, + "learning_rate": 0.00030887873075394374, + "loss": 0.1702, + "step": 307390 + }, + { + "epoch": 12.73, + "grad_norm": 1.46875, + "learning_rate": 0.0003088681905711132, + "loss": 0.196, + "step": 307400 + }, + { + "epoch": 12.73, + "grad_norm": 0.75390625, + "learning_rate": 0.00030885765027749917, + "loss": 0.2003, + "step": 307410 + }, + { + "epoch": 12.73, + "grad_norm": 0.7421875, + "learning_rate": 0.00030884710987312135, + "loss": 0.2483, + "step": 307420 + }, + { + "epoch": 12.73, + "grad_norm": 0.75390625, + "learning_rate": 0.0003088365693579996, + "loss": 0.1603, + "step": 307430 + }, + { + "epoch": 12.73, + "grad_norm": 2.640625, + "learning_rate": 0.0003088260287321538, + "loss": 0.1985, + "step": 307440 + }, + { + "epoch": 12.73, + "grad_norm": 1.0703125, + "learning_rate": 0.00030881548799560377, + "loss": 0.1907, + "step": 307450 + }, + { + "epoch": 12.73, + "grad_norm": 0.8359375, + "learning_rate": 0.00030880494714836937, + "loss": 0.2083, + "step": 307460 + }, + { + "epoch": 12.74, + "grad_norm": 0.64453125, + "learning_rate": 0.0003087944061904704, + "loss": 0.161, + "step": 307470 + }, + { + "epoch": 12.74, + "grad_norm": 0.6171875, + "learning_rate": 0.00030878386512192665, + "loss": 0.2273, + "step": 307480 + }, + { + "epoch": 12.74, + "grad_norm": 0.82421875, + "learning_rate": 0.000308773323942758, + "loss": 0.2758, + "step": 307490 + }, + { + "epoch": 12.74, + "grad_norm": 0.8359375, + "learning_rate": 0.0003087627826529844, + "loss": 0.1783, + "step": 307500 + }, + { + "epoch": 12.74, + "grad_norm": 0.361328125, + "learning_rate": 0.0003087522412526256, + "loss": 0.1659, + "step": 307510 + }, + { + "epoch": 12.74, + "grad_norm": 0.66796875, + "learning_rate": 0.0003087416997417013, + "loss": 0.1886, + "step": 307520 + }, + { + "epoch": 12.74, + "grad_norm": 0.265625, + "learning_rate": 0.00030873115812023156, + "loss": 0.1859, + "step": 307530 + }, + { + "epoch": 12.74, + "grad_norm": 0.53515625, + "learning_rate": 0.00030872061638823615, + "loss": 0.1955, + "step": 307540 + }, + { + "epoch": 12.74, + "grad_norm": 0.96875, + "learning_rate": 0.0003087100745457348, + "loss": 0.1909, + "step": 307550 + }, + { + "epoch": 12.74, + "grad_norm": 0.78125, + "learning_rate": 0.0003086995325927474, + "loss": 0.2116, + "step": 307560 + }, + { + "epoch": 12.74, + "grad_norm": 1.1484375, + "learning_rate": 0.0003086889905292939, + "loss": 0.2075, + "step": 307570 + }, + { + "epoch": 12.74, + "grad_norm": 0.83984375, + "learning_rate": 0.00030867844835539405, + "loss": 0.2054, + "step": 307580 + }, + { + "epoch": 12.74, + "grad_norm": 0.2578125, + "learning_rate": 0.0003086679060710677, + "loss": 0.1878, + "step": 307590 + }, + { + "epoch": 12.74, + "grad_norm": 0.796875, + "learning_rate": 0.00030865736367633465, + "loss": 0.1835, + "step": 307600 + }, + { + "epoch": 12.74, + "grad_norm": 1.2890625, + "learning_rate": 0.0003086468211712148, + "loss": 0.188, + "step": 307610 + }, + { + "epoch": 12.74, + "grad_norm": 1.203125, + "learning_rate": 0.0003086362785557279, + "loss": 0.2044, + "step": 307620 + }, + { + "epoch": 12.74, + "grad_norm": 0.49609375, + "learning_rate": 0.00030862573582989396, + "loss": 0.2275, + "step": 307630 + }, + { + "epoch": 12.74, + "grad_norm": 1.375, + "learning_rate": 0.0003086151929937327, + "loss": 0.2075, + "step": 307640 + }, + { + "epoch": 12.74, + "grad_norm": 0.859375, + "learning_rate": 0.0003086046500472639, + "loss": 0.1818, + "step": 307650 + }, + { + "epoch": 12.74, + "grad_norm": 0.5625, + "learning_rate": 0.0003085941069905076, + "loss": 0.1975, + "step": 307660 + }, + { + "epoch": 12.74, + "grad_norm": 0.6328125, + "learning_rate": 0.0003085835638234834, + "loss": 0.2314, + "step": 307670 + }, + { + "epoch": 12.74, + "grad_norm": 0.828125, + "learning_rate": 0.0003085730205462113, + "loss": 0.1786, + "step": 307680 + }, + { + "epoch": 12.74, + "grad_norm": 0.671875, + "learning_rate": 0.0003085624771587111, + "loss": 0.205, + "step": 307690 + }, + { + "epoch": 12.74, + "grad_norm": 1.515625, + "learning_rate": 0.00030855193366100263, + "loss": 0.1909, + "step": 307700 + }, + { + "epoch": 12.75, + "grad_norm": 0.6171875, + "learning_rate": 0.00030854139005310577, + "loss": 0.1856, + "step": 307710 + }, + { + "epoch": 12.75, + "grad_norm": 1.5546875, + "learning_rate": 0.00030853084633504035, + "loss": 0.1695, + "step": 307720 + }, + { + "epoch": 12.75, + "grad_norm": 0.76171875, + "learning_rate": 0.0003085203025068262, + "loss": 0.1721, + "step": 307730 + }, + { + "epoch": 12.75, + "grad_norm": 0.392578125, + "learning_rate": 0.00030850975856848314, + "loss": 0.2022, + "step": 307740 + }, + { + "epoch": 12.75, + "grad_norm": 0.0, + "learning_rate": 0.00030849921452003097, + "loss": 0.1886, + "step": 307750 + }, + { + "epoch": 12.75, + "grad_norm": 0.55859375, + "learning_rate": 0.0003084886703614896, + "loss": 0.203, + "step": 307760 + }, + { + "epoch": 12.75, + "grad_norm": 0.734375, + "learning_rate": 0.000308478126092879, + "loss": 0.1844, + "step": 307770 + }, + { + "epoch": 12.75, + "grad_norm": 0.4765625, + "learning_rate": 0.0003084675817142188, + "loss": 0.2172, + "step": 307780 + }, + { + "epoch": 12.75, + "grad_norm": 0.8515625, + "learning_rate": 0.0003084570372255289, + "loss": 0.1773, + "step": 307790 + }, + { + "epoch": 12.75, + "grad_norm": 0.5, + "learning_rate": 0.00030844649262682923, + "loss": 0.2071, + "step": 307800 + }, + { + "epoch": 12.75, + "grad_norm": 0.59765625, + "learning_rate": 0.0003084359479181395, + "loss": 0.2194, + "step": 307810 + }, + { + "epoch": 12.75, + "grad_norm": 0.9921875, + "learning_rate": 0.00030842540309947977, + "loss": 0.2396, + "step": 307820 + }, + { + "epoch": 12.75, + "grad_norm": 0.98046875, + "learning_rate": 0.00030841485817086956, + "loss": 0.2252, + "step": 307830 + }, + { + "epoch": 12.75, + "grad_norm": 1.0078125, + "learning_rate": 0.00030840431313232896, + "loss": 0.1799, + "step": 307840 + }, + { + "epoch": 12.75, + "grad_norm": 1.40625, + "learning_rate": 0.00030839376798387773, + "loss": 0.1991, + "step": 307850 + }, + { + "epoch": 12.75, + "grad_norm": 1.6015625, + "learning_rate": 0.00030838322272553584, + "loss": 0.1905, + "step": 307860 + }, + { + "epoch": 12.75, + "grad_norm": 0.8671875, + "learning_rate": 0.00030837267735732296, + "loss": 0.18, + "step": 307870 + }, + { + "epoch": 12.75, + "grad_norm": 1.15625, + "learning_rate": 0.000308362131879259, + "loss": 0.1749, + "step": 307880 + }, + { + "epoch": 12.75, + "grad_norm": 0.71484375, + "learning_rate": 0.00030835158629136377, + "loss": 0.2201, + "step": 307890 + }, + { + "epoch": 12.75, + "grad_norm": 1.5703125, + "learning_rate": 0.00030834104059365724, + "loss": 0.1652, + "step": 307900 + }, + { + "epoch": 12.75, + "grad_norm": 1.71875, + "learning_rate": 0.0003083304947861591, + "loss": 0.2183, + "step": 307910 + }, + { + "epoch": 12.75, + "grad_norm": 1.171875, + "learning_rate": 0.0003083199488688893, + "loss": 0.2185, + "step": 307920 + }, + { + "epoch": 12.75, + "grad_norm": 1.0703125, + "learning_rate": 0.00030830940284186766, + "loss": 0.1673, + "step": 307930 + }, + { + "epoch": 12.75, + "grad_norm": 1.1328125, + "learning_rate": 0.000308298856705114, + "loss": 0.2472, + "step": 307940 + }, + { + "epoch": 12.76, + "grad_norm": 0.6796875, + "learning_rate": 0.00030828831045864825, + "loss": 0.1571, + "step": 307950 + }, + { + "epoch": 12.76, + "grad_norm": 0.87890625, + "learning_rate": 0.00030827776410249013, + "loss": 0.1981, + "step": 307960 + }, + { + "epoch": 12.76, + "grad_norm": 1.671875, + "learning_rate": 0.00030826721763665954, + "loss": 0.2069, + "step": 307970 + }, + { + "epoch": 12.76, + "grad_norm": 1.0234375, + "learning_rate": 0.0003082566710611764, + "loss": 0.2104, + "step": 307980 + }, + { + "epoch": 12.76, + "grad_norm": 0.2060546875, + "learning_rate": 0.0003082461243760604, + "loss": 0.1867, + "step": 307990 + }, + { + "epoch": 12.76, + "grad_norm": 1.375, + "learning_rate": 0.0003082355775813316, + "loss": 0.1632, + "step": 308000 + }, + { + "epoch": 12.76, + "grad_norm": 0.765625, + "learning_rate": 0.0003082250306770096, + "loss": 0.1571, + "step": 308010 + }, + { + "epoch": 12.76, + "grad_norm": 1.859375, + "learning_rate": 0.0003082144836631145, + "loss": 0.243, + "step": 308020 + }, + { + "epoch": 12.76, + "grad_norm": 0.79296875, + "learning_rate": 0.000308203936539666, + "loss": 0.1657, + "step": 308030 + }, + { + "epoch": 12.76, + "grad_norm": 0.765625, + "learning_rate": 0.00030819338930668396, + "loss": 0.1726, + "step": 308040 + }, + { + "epoch": 12.76, + "grad_norm": 0.1953125, + "learning_rate": 0.0003081828419641882, + "loss": 0.1543, + "step": 308050 + }, + { + "epoch": 12.76, + "grad_norm": 0.08544921875, + "learning_rate": 0.00030817229451219876, + "loss": 0.1415, + "step": 308060 + }, + { + "epoch": 12.76, + "grad_norm": 1.046875, + "learning_rate": 0.0003081617469507352, + "loss": 0.1484, + "step": 308070 + }, + { + "epoch": 12.76, + "grad_norm": 0.765625, + "learning_rate": 0.0003081511992798175, + "loss": 0.1876, + "step": 308080 + }, + { + "epoch": 12.76, + "grad_norm": 0.58984375, + "learning_rate": 0.00030814065149946564, + "loss": 0.1945, + "step": 308090 + }, + { + "epoch": 12.76, + "grad_norm": 1.3828125, + "learning_rate": 0.00030813010360969926, + "loss": 0.1896, + "step": 308100 + }, + { + "epoch": 12.76, + "grad_norm": 0.5625, + "learning_rate": 0.0003081195556105384, + "loss": 0.2026, + "step": 308110 + }, + { + "epoch": 12.76, + "grad_norm": 0.75390625, + "learning_rate": 0.00030810900750200264, + "loss": 0.1545, + "step": 308120 + }, + { + "epoch": 12.76, + "grad_norm": 0.419921875, + "learning_rate": 0.0003080984592841122, + "loss": 0.1951, + "step": 308130 + }, + { + "epoch": 12.76, + "grad_norm": 0.6640625, + "learning_rate": 0.0003080879109568867, + "loss": 0.2381, + "step": 308140 + }, + { + "epoch": 12.76, + "grad_norm": 0.62109375, + "learning_rate": 0.0003080773625203459, + "loss": 0.2388, + "step": 308150 + }, + { + "epoch": 12.76, + "grad_norm": 0.9765625, + "learning_rate": 0.0003080668139745099, + "loss": 0.1623, + "step": 308160 + }, + { + "epoch": 12.76, + "grad_norm": 0.9140625, + "learning_rate": 0.00030805626531939836, + "loss": 0.1782, + "step": 308170 + }, + { + "epoch": 12.76, + "grad_norm": 0.60546875, + "learning_rate": 0.00030804571655503114, + "loss": 0.1446, + "step": 308180 + }, + { + "epoch": 12.77, + "grad_norm": 0.66015625, + "learning_rate": 0.00030803516768142833, + "loss": 0.1886, + "step": 308190 + }, + { + "epoch": 12.77, + "grad_norm": 0.5703125, + "learning_rate": 0.00030802461869860944, + "loss": 0.2144, + "step": 308200 + }, + { + "epoch": 12.77, + "grad_norm": 0.55078125, + "learning_rate": 0.0003080140696065946, + "loss": 0.1878, + "step": 308210 + }, + { + "epoch": 12.77, + "grad_norm": 0.5546875, + "learning_rate": 0.00030800352040540345, + "loss": 0.1961, + "step": 308220 + }, + { + "epoch": 12.77, + "grad_norm": 0.671875, + "learning_rate": 0.0003079929710950559, + "loss": 0.1875, + "step": 308230 + }, + { + "epoch": 12.77, + "grad_norm": 0.58203125, + "learning_rate": 0.000307982421675572, + "loss": 0.2243, + "step": 308240 + }, + { + "epoch": 12.77, + "grad_norm": 0.5546875, + "learning_rate": 0.0003079718721469713, + "loss": 0.2178, + "step": 308250 + }, + { + "epoch": 12.77, + "grad_norm": 0.625, + "learning_rate": 0.00030796132250927383, + "loss": 0.2021, + "step": 308260 + }, + { + "epoch": 12.77, + "grad_norm": 0.482421875, + "learning_rate": 0.0003079507727624995, + "loss": 0.1769, + "step": 308270 + }, + { + "epoch": 12.77, + "grad_norm": 0.99609375, + "learning_rate": 0.00030794022290666793, + "loss": 0.1724, + "step": 308280 + }, + { + "epoch": 12.77, + "grad_norm": 0.80078125, + "learning_rate": 0.00030792967294179926, + "loss": 0.1941, + "step": 308290 + }, + { + "epoch": 12.77, + "grad_norm": 1.0, + "learning_rate": 0.0003079191228679132, + "loss": 0.1933, + "step": 308300 + }, + { + "epoch": 12.77, + "grad_norm": 0.98828125, + "learning_rate": 0.0003079085726850294, + "loss": 0.1689, + "step": 308310 + }, + { + "epoch": 12.77, + "grad_norm": 1.15625, + "learning_rate": 0.0003078980223931682, + "loss": 0.1983, + "step": 308320 + }, + { + "epoch": 12.77, + "grad_norm": 0.734375, + "learning_rate": 0.000307887471992349, + "loss": 0.2176, + "step": 308330 + }, + { + "epoch": 12.77, + "grad_norm": 0.6953125, + "learning_rate": 0.00030787692148259184, + "loss": 0.1689, + "step": 308340 + }, + { + "epoch": 12.77, + "grad_norm": 0.78515625, + "learning_rate": 0.0003078663708639166, + "loss": 0.2142, + "step": 308350 + }, + { + "epoch": 12.77, + "grad_norm": 0.84765625, + "learning_rate": 0.00030785582013634306, + "loss": 0.185, + "step": 308360 + }, + { + "epoch": 12.77, + "grad_norm": 0.96484375, + "learning_rate": 0.00030784526929989115, + "loss": 0.2107, + "step": 308370 + }, + { + "epoch": 12.77, + "grad_norm": 0.7265625, + "learning_rate": 0.0003078347183545807, + "loss": 0.1841, + "step": 308380 + }, + { + "epoch": 12.77, + "grad_norm": 0.921875, + "learning_rate": 0.0003078241673004315, + "loss": 0.1734, + "step": 308390 + }, + { + "epoch": 12.77, + "grad_norm": 0.7109375, + "learning_rate": 0.0003078136161374636, + "loss": 0.1732, + "step": 308400 + }, + { + "epoch": 12.77, + "grad_norm": 0.68359375, + "learning_rate": 0.0003078030648656965, + "loss": 0.2025, + "step": 308410 + }, + { + "epoch": 12.77, + "grad_norm": 0.5625, + "learning_rate": 0.00030779251348515043, + "loss": 0.1848, + "step": 308420 + }, + { + "epoch": 12.78, + "grad_norm": 0.98828125, + "learning_rate": 0.0003077819619958451, + "loss": 0.1537, + "step": 308430 + }, + { + "epoch": 12.78, + "grad_norm": 1.4921875, + "learning_rate": 0.0003077714103978002, + "loss": 0.2169, + "step": 308440 + }, + { + "epoch": 12.78, + "grad_norm": 0.90625, + "learning_rate": 0.00030776085869103594, + "loss": 0.1818, + "step": 308450 + }, + { + "epoch": 12.78, + "grad_norm": 0.50390625, + "learning_rate": 0.00030775030687557186, + "loss": 0.1741, + "step": 308460 + }, + { + "epoch": 12.78, + "grad_norm": 0.384765625, + "learning_rate": 0.000307739754951428, + "loss": 0.1755, + "step": 308470 + }, + { + "epoch": 12.78, + "grad_norm": 4.125, + "learning_rate": 0.00030772920291862414, + "loss": 0.2242, + "step": 308480 + }, + { + "epoch": 12.78, + "grad_norm": 1.359375, + "learning_rate": 0.00030771865077718015, + "loss": 0.2017, + "step": 308490 + }, + { + "epoch": 12.78, + "grad_norm": 0.6875, + "learning_rate": 0.00030770809852711584, + "loss": 0.2629, + "step": 308500 + }, + { + "epoch": 12.78, + "grad_norm": 0.8203125, + "learning_rate": 0.00030769754616845124, + "loss": 0.203, + "step": 308510 + }, + { + "epoch": 12.78, + "grad_norm": 1.015625, + "learning_rate": 0.00030768699370120595, + "loss": 0.183, + "step": 308520 + }, + { + "epoch": 12.78, + "grad_norm": 1.1875, + "learning_rate": 0.00030767644112540004, + "loss": 0.213, + "step": 308530 + }, + { + "epoch": 12.78, + "grad_norm": 1.6484375, + "learning_rate": 0.0003076658884410533, + "loss": 0.2147, + "step": 308540 + }, + { + "epoch": 12.78, + "grad_norm": 0.298828125, + "learning_rate": 0.0003076553356481856, + "loss": 0.182, + "step": 308550 + }, + { + "epoch": 12.78, + "grad_norm": 0.65234375, + "learning_rate": 0.0003076447827468168, + "loss": 0.2135, + "step": 308560 + }, + { + "epoch": 12.78, + "grad_norm": 1.8671875, + "learning_rate": 0.00030763422973696664, + "loss": 0.2315, + "step": 308570 + }, + { + "epoch": 12.78, + "grad_norm": 0.78125, + "learning_rate": 0.0003076236766186551, + "loss": 0.1839, + "step": 308580 + }, + { + "epoch": 12.78, + "grad_norm": 0.9296875, + "learning_rate": 0.0003076131233919021, + "loss": 0.2236, + "step": 308590 + }, + { + "epoch": 12.78, + "grad_norm": 0.4453125, + "learning_rate": 0.00030760257005672744, + "loss": 0.2029, + "step": 308600 + }, + { + "epoch": 12.78, + "grad_norm": 0.51953125, + "learning_rate": 0.00030759201661315094, + "loss": 0.2062, + "step": 308610 + }, + { + "epoch": 12.78, + "grad_norm": 0.96875, + "learning_rate": 0.00030758146306119244, + "loss": 0.2163, + "step": 308620 + }, + { + "epoch": 12.78, + "grad_norm": 2.9375, + "learning_rate": 0.00030757090940087184, + "loss": 0.1714, + "step": 308630 + }, + { + "epoch": 12.78, + "grad_norm": 0.98828125, + "learning_rate": 0.0003075603556322091, + "loss": 0.1936, + "step": 308640 + }, + { + "epoch": 12.78, + "grad_norm": 1.796875, + "learning_rate": 0.0003075498017552239, + "loss": 0.1631, + "step": 308650 + }, + { + "epoch": 12.78, + "grad_norm": 0.96484375, + "learning_rate": 0.0003075392477699362, + "loss": 0.204, + "step": 308660 + }, + { + "epoch": 12.79, + "grad_norm": 0.51171875, + "learning_rate": 0.0003075286936763658, + "loss": 0.1925, + "step": 308670 + }, + { + "epoch": 12.79, + "grad_norm": 0.40625, + "learning_rate": 0.0003075181394745327, + "loss": 0.1944, + "step": 308680 + }, + { + "epoch": 12.79, + "grad_norm": 0.62890625, + "learning_rate": 0.00030750758516445665, + "loss": 0.1965, + "step": 308690 + }, + { + "epoch": 12.79, + "grad_norm": 0.8671875, + "learning_rate": 0.0003074970307461575, + "loss": 0.1538, + "step": 308700 + }, + { + "epoch": 12.79, + "grad_norm": 0.4375, + "learning_rate": 0.00030748647621965517, + "loss": 0.2246, + "step": 308710 + }, + { + "epoch": 12.79, + "grad_norm": 0.94921875, + "learning_rate": 0.00030747592158496956, + "loss": 0.2053, + "step": 308720 + }, + { + "epoch": 12.79, + "grad_norm": 1.3046875, + "learning_rate": 0.0003074653668421204, + "loss": 0.2183, + "step": 308730 + }, + { + "epoch": 12.79, + "grad_norm": 0.890625, + "learning_rate": 0.00030745481199112763, + "loss": 0.172, + "step": 308740 + }, + { + "epoch": 12.79, + "grad_norm": 1.53125, + "learning_rate": 0.0003074442570320111, + "loss": 0.2412, + "step": 308750 + }, + { + "epoch": 12.79, + "grad_norm": 1.0703125, + "learning_rate": 0.0003074337019647907, + "loss": 0.1861, + "step": 308760 + }, + { + "epoch": 12.79, + "grad_norm": 1.2265625, + "learning_rate": 0.00030742314678948636, + "loss": 0.1833, + "step": 308770 + }, + { + "epoch": 12.79, + "grad_norm": 0.33984375, + "learning_rate": 0.0003074125915061177, + "loss": 0.1815, + "step": 308780 + }, + { + "epoch": 12.79, + "grad_norm": 0.375, + "learning_rate": 0.00030740203611470484, + "loss": 0.1808, + "step": 308790 + }, + { + "epoch": 12.79, + "grad_norm": 0.68359375, + "learning_rate": 0.0003073914806152675, + "loss": 0.1778, + "step": 308800 + }, + { + "epoch": 12.79, + "grad_norm": 0.90625, + "learning_rate": 0.00030738092500782565, + "loss": 0.2025, + "step": 308810 + }, + { + "epoch": 12.79, + "grad_norm": 0.482421875, + "learning_rate": 0.000307370369292399, + "loss": 0.2662, + "step": 308820 + }, + { + "epoch": 12.79, + "grad_norm": 1.1484375, + "learning_rate": 0.0003073598134690076, + "loss": 0.1742, + "step": 308830 + }, + { + "epoch": 12.79, + "grad_norm": 0.4453125, + "learning_rate": 0.0003073492575376712, + "loss": 0.1849, + "step": 308840 + }, + { + "epoch": 12.79, + "grad_norm": 0.6328125, + "learning_rate": 0.00030733870149840975, + "loss": 0.1956, + "step": 308850 + }, + { + "epoch": 12.79, + "grad_norm": 2.0625, + "learning_rate": 0.000307328145351243, + "loss": 0.1926, + "step": 308860 + }, + { + "epoch": 12.79, + "grad_norm": 0.82421875, + "learning_rate": 0.00030731758909619087, + "loss": 0.202, + "step": 308870 + }, + { + "epoch": 12.79, + "grad_norm": 0.026611328125, + "learning_rate": 0.00030730703273327317, + "loss": 0.1707, + "step": 308880 + }, + { + "epoch": 12.79, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003072964762625099, + "loss": 0.1697, + "step": 308890 + }, + { + "epoch": 12.79, + "grad_norm": 0.8984375, + "learning_rate": 0.00030728591968392084, + "loss": 0.208, + "step": 308900 + }, + { + "epoch": 12.8, + "grad_norm": 0.67578125, + "learning_rate": 0.00030727536299752584, + "loss": 0.1946, + "step": 308910 + }, + { + "epoch": 12.8, + "grad_norm": 0.58203125, + "learning_rate": 0.0003072648062033447, + "loss": 0.1706, + "step": 308920 + }, + { + "epoch": 12.8, + "grad_norm": 0.51171875, + "learning_rate": 0.00030725424930139764, + "loss": 0.1799, + "step": 308930 + }, + { + "epoch": 12.8, + "grad_norm": 0.6640625, + "learning_rate": 0.00030724369229170406, + "loss": 0.1759, + "step": 308940 + }, + { + "epoch": 12.8, + "grad_norm": 0.94921875, + "learning_rate": 0.0003072331351742841, + "loss": 0.2075, + "step": 308950 + }, + { + "epoch": 12.8, + "grad_norm": 0.86328125, + "learning_rate": 0.00030722257794915745, + "loss": 0.197, + "step": 308960 + }, + { + "epoch": 12.8, + "grad_norm": 0.474609375, + "learning_rate": 0.00030721202061634426, + "loss": 0.2091, + "step": 308970 + }, + { + "epoch": 12.8, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0003072014631758641, + "loss": 0.2467, + "step": 308980 + }, + { + "epoch": 12.8, + "grad_norm": 1.0546875, + "learning_rate": 0.000307190905627737, + "loss": 0.2144, + "step": 308990 + }, + { + "epoch": 12.8, + "grad_norm": 0.5625, + "learning_rate": 0.0003071803479719828, + "loss": 0.2159, + "step": 309000 + }, + { + "epoch": 12.8, + "grad_norm": 0.4921875, + "learning_rate": 0.00030716979020862134, + "loss": 0.1815, + "step": 309010 + }, + { + "epoch": 12.8, + "grad_norm": 0.484375, + "learning_rate": 0.0003071592323376726, + "loss": 0.1802, + "step": 309020 + }, + { + "epoch": 12.8, + "grad_norm": 0.5703125, + "learning_rate": 0.0003071486743591563, + "loss": 0.1703, + "step": 309030 + }, + { + "epoch": 12.8, + "grad_norm": 1.0078125, + "learning_rate": 0.00030713811627309227, + "loss": 0.2542, + "step": 309040 + }, + { + "epoch": 12.8, + "grad_norm": 1.234375, + "learning_rate": 0.0003071275580795006, + "loss": 0.2305, + "step": 309050 + }, + { + "epoch": 12.8, + "grad_norm": 0.216796875, + "learning_rate": 0.00030711699977840104, + "loss": 0.1777, + "step": 309060 + }, + { + "epoch": 12.8, + "grad_norm": 0.68359375, + "learning_rate": 0.00030710644136981333, + "loss": 0.1723, + "step": 309070 + }, + { + "epoch": 12.8, + "grad_norm": 1.15625, + "learning_rate": 0.00030709588285375756, + "loss": 0.2335, + "step": 309080 + }, + { + "epoch": 12.8, + "grad_norm": 1.15625, + "learning_rate": 0.0003070853242302535, + "loss": 0.1867, + "step": 309090 + }, + { + "epoch": 12.8, + "grad_norm": 0.96484375, + "learning_rate": 0.000307074765499321, + "loss": 0.111, + "step": 309100 + }, + { + "epoch": 12.8, + "grad_norm": 0.4453125, + "learning_rate": 0.00030706420666097995, + "loss": 0.2129, + "step": 309110 + }, + { + "epoch": 12.8, + "grad_norm": 0.462890625, + "learning_rate": 0.00030705364771525025, + "loss": 0.2141, + "step": 309120 + }, + { + "epoch": 12.8, + "grad_norm": 0.9296875, + "learning_rate": 0.00030704308866215174, + "loss": 0.1673, + "step": 309130 + }, + { + "epoch": 12.8, + "grad_norm": 1.09375, + "learning_rate": 0.00030703252950170425, + "loss": 0.1861, + "step": 309140 + }, + { + "epoch": 12.8, + "grad_norm": 0.73046875, + "learning_rate": 0.0003070219702339278, + "loss": 0.1815, + "step": 309150 + }, + { + "epoch": 12.81, + "grad_norm": 0.6953125, + "learning_rate": 0.0003070114108588421, + "loss": 0.2379, + "step": 309160 + }, + { + "epoch": 12.81, + "grad_norm": 0.60546875, + "learning_rate": 0.00030700085137646703, + "loss": 0.1936, + "step": 309170 + }, + { + "epoch": 12.81, + "grad_norm": 0.70703125, + "learning_rate": 0.00030699029178682266, + "loss": 0.1939, + "step": 309180 + }, + { + "epoch": 12.81, + "grad_norm": 0.671875, + "learning_rate": 0.0003069797320899286, + "loss": 0.1593, + "step": 309190 + }, + { + "epoch": 12.81, + "grad_norm": 1.7421875, + "learning_rate": 0.0003069691722858048, + "loss": 0.1833, + "step": 309200 + }, + { + "epoch": 12.81, + "grad_norm": 0.68359375, + "learning_rate": 0.0003069586123744713, + "loss": 0.2337, + "step": 309210 + }, + { + "epoch": 12.81, + "grad_norm": 0.5078125, + "learning_rate": 0.0003069480523559477, + "loss": 0.1459, + "step": 309220 + }, + { + "epoch": 12.81, + "grad_norm": 0.498046875, + "learning_rate": 0.00030693749223025413, + "loss": 0.1853, + "step": 309230 + }, + { + "epoch": 12.81, + "grad_norm": 0.62109375, + "learning_rate": 0.0003069269319974103, + "loss": 0.2345, + "step": 309240 + }, + { + "epoch": 12.81, + "grad_norm": 0.9375, + "learning_rate": 0.0003069163716574361, + "loss": 0.1825, + "step": 309250 + }, + { + "epoch": 12.81, + "grad_norm": 1.1015625, + "learning_rate": 0.00030690581121035155, + "loss": 0.2129, + "step": 309260 + }, + { + "epoch": 12.81, + "grad_norm": 0.515625, + "learning_rate": 0.00030689525065617627, + "loss": 0.1394, + "step": 309270 + }, + { + "epoch": 12.81, + "grad_norm": 0.6484375, + "learning_rate": 0.00030688468999493033, + "loss": 0.1494, + "step": 309280 + }, + { + "epoch": 12.81, + "grad_norm": 0.58203125, + "learning_rate": 0.0003068741292266335, + "loss": 0.2088, + "step": 309290 + }, + { + "epoch": 12.81, + "grad_norm": 1.203125, + "learning_rate": 0.0003068635683513058, + "loss": 0.1882, + "step": 309300 + }, + { + "epoch": 12.81, + "grad_norm": 0.39453125, + "learning_rate": 0.000306853007368967, + "loss": 0.1796, + "step": 309310 + }, + { + "epoch": 12.81, + "grad_norm": 0.7890625, + "learning_rate": 0.0003068424462796369, + "loss": 0.2178, + "step": 309320 + }, + { + "epoch": 12.81, + "grad_norm": 0.71875, + "learning_rate": 0.0003068318850833355, + "loss": 0.2151, + "step": 309330 + }, + { + "epoch": 12.81, + "grad_norm": 0.85546875, + "learning_rate": 0.00030682132378008264, + "loss": 0.1869, + "step": 309340 + }, + { + "epoch": 12.81, + "grad_norm": 0.95703125, + "learning_rate": 0.0003068107623698981, + "loss": 0.2209, + "step": 309350 + }, + { + "epoch": 12.81, + "grad_norm": 1.015625, + "learning_rate": 0.0003068002008528019, + "loss": 0.1934, + "step": 309360 + }, + { + "epoch": 12.81, + "grad_norm": 1.0078125, + "learning_rate": 0.0003067896392288139, + "loss": 0.208, + "step": 309370 + }, + { + "epoch": 12.81, + "grad_norm": 0.6640625, + "learning_rate": 0.0003067790774979539, + "loss": 0.2382, + "step": 309380 + }, + { + "epoch": 12.81, + "grad_norm": 0.55078125, + "learning_rate": 0.0003067685156602418, + "loss": 0.1907, + "step": 309390 + }, + { + "epoch": 12.82, + "grad_norm": 0.66796875, + "learning_rate": 0.00030675795371569746, + "loss": 0.1887, + "step": 309400 + }, + { + "epoch": 12.82, + "grad_norm": 0.6796875, + "learning_rate": 0.0003067473916643408, + "loss": 0.1831, + "step": 309410 + }, + { + "epoch": 12.82, + "grad_norm": 0.6328125, + "learning_rate": 0.00030673682950619176, + "loss": 0.1802, + "step": 309420 + }, + { + "epoch": 12.82, + "grad_norm": 0.65234375, + "learning_rate": 0.00030672626724126997, + "loss": 0.2598, + "step": 309430 + }, + { + "epoch": 12.82, + "grad_norm": 0.90625, + "learning_rate": 0.00030671570486959563, + "loss": 0.176, + "step": 309440 + }, + { + "epoch": 12.82, + "grad_norm": 1.3359375, + "learning_rate": 0.0003067051423911883, + "loss": 0.1411, + "step": 309450 + }, + { + "epoch": 12.82, + "grad_norm": 0.396484375, + "learning_rate": 0.0003066945798060681, + "loss": 0.2393, + "step": 309460 + }, + { + "epoch": 12.82, + "grad_norm": 0.57421875, + "learning_rate": 0.0003066840171142549, + "loss": 0.1935, + "step": 309470 + }, + { + "epoch": 12.82, + "grad_norm": 0.83984375, + "learning_rate": 0.00030667345431576836, + "loss": 0.1538, + "step": 309480 + }, + { + "epoch": 12.82, + "grad_norm": 0.94140625, + "learning_rate": 0.0003066628914106286, + "loss": 0.1684, + "step": 309490 + }, + { + "epoch": 12.82, + "grad_norm": 0.6484375, + "learning_rate": 0.0003066523283988554, + "loss": 0.2176, + "step": 309500 + }, + { + "epoch": 12.82, + "grad_norm": 0.875, + "learning_rate": 0.0003066417652804685, + "loss": 0.1799, + "step": 309510 + }, + { + "epoch": 12.82, + "grad_norm": 1.15625, + "learning_rate": 0.00030663120205548804, + "loss": 0.223, + "step": 309520 + }, + { + "epoch": 12.82, + "grad_norm": 1.0625, + "learning_rate": 0.00030662063872393373, + "loss": 0.1595, + "step": 309530 + }, + { + "epoch": 12.82, + "grad_norm": 1.1484375, + "learning_rate": 0.0003066100752858255, + "loss": 0.2208, + "step": 309540 + }, + { + "epoch": 12.82, + "grad_norm": 0.80078125, + "learning_rate": 0.0003065995117411833, + "loss": 0.2059, + "step": 309550 + }, + { + "epoch": 12.82, + "grad_norm": 0.53125, + "learning_rate": 0.0003065889480900268, + "loss": 0.2277, + "step": 309560 + }, + { + "epoch": 12.82, + "grad_norm": 0.546875, + "learning_rate": 0.00030657838433237604, + "loss": 0.1805, + "step": 309570 + }, + { + "epoch": 12.82, + "grad_norm": 1.1953125, + "learning_rate": 0.000306567820468251, + "loss": 0.1944, + "step": 309580 + }, + { + "epoch": 12.82, + "grad_norm": 0.455078125, + "learning_rate": 0.00030655725649767125, + "loss": 0.211, + "step": 309590 + }, + { + "epoch": 12.82, + "grad_norm": 0.83984375, + "learning_rate": 0.000306546692420657, + "loss": 0.1587, + "step": 309600 + }, + { + "epoch": 12.82, + "grad_norm": 0.9609375, + "learning_rate": 0.00030653612823722785, + "loss": 0.1799, + "step": 309610 + }, + { + "epoch": 12.82, + "grad_norm": 0.7890625, + "learning_rate": 0.0003065255639474039, + "loss": 0.196, + "step": 309620 + }, + { + "epoch": 12.82, + "grad_norm": 1.40625, + "learning_rate": 0.0003065149995512049, + "loss": 0.1875, + "step": 309630 + }, + { + "epoch": 12.83, + "grad_norm": 1.1796875, + "learning_rate": 0.00030650443504865084, + "loss": 0.2291, + "step": 309640 + }, + { + "epoch": 12.83, + "grad_norm": 0.3515625, + "learning_rate": 0.00030649387043976147, + "loss": 0.1841, + "step": 309650 + }, + { + "epoch": 12.83, + "grad_norm": 0.353515625, + "learning_rate": 0.0003064833057245568, + "loss": 0.1931, + "step": 309660 + }, + { + "epoch": 12.83, + "grad_norm": 0.7734375, + "learning_rate": 0.00030647274090305653, + "loss": 0.1994, + "step": 309670 + }, + { + "epoch": 12.83, + "grad_norm": 1.421875, + "learning_rate": 0.00030646217597528076, + "loss": 0.2011, + "step": 309680 + }, + { + "epoch": 12.83, + "grad_norm": 0.7578125, + "learning_rate": 0.00030645161094124926, + "loss": 0.2145, + "step": 309690 + }, + { + "epoch": 12.83, + "grad_norm": 1.5234375, + "learning_rate": 0.0003064410458009819, + "loss": 0.2563, + "step": 309700 + }, + { + "epoch": 12.83, + "grad_norm": 0.55859375, + "learning_rate": 0.00030643048055449865, + "loss": 0.1721, + "step": 309710 + }, + { + "epoch": 12.83, + "grad_norm": 0.9609375, + "learning_rate": 0.0003064199152018192, + "loss": 0.1403, + "step": 309720 + }, + { + "epoch": 12.83, + "grad_norm": 1.0546875, + "learning_rate": 0.00030640934974296365, + "loss": 0.1938, + "step": 309730 + }, + { + "epoch": 12.83, + "grad_norm": 1.1171875, + "learning_rate": 0.00030639878417795184, + "loss": 0.1497, + "step": 309740 + }, + { + "epoch": 12.83, + "grad_norm": 0.474609375, + "learning_rate": 0.00030638821850680347, + "loss": 0.1841, + "step": 309750 + }, + { + "epoch": 12.83, + "grad_norm": 2.28125, + "learning_rate": 0.0003063776527295387, + "loss": 0.1699, + "step": 309760 + }, + { + "epoch": 12.83, + "grad_norm": 0.7265625, + "learning_rate": 0.0003063670868461772, + "loss": 0.1658, + "step": 309770 + }, + { + "epoch": 12.83, + "grad_norm": 0.84765625, + "learning_rate": 0.0003063565208567389, + "loss": 0.1999, + "step": 309780 + }, + { + "epoch": 12.83, + "grad_norm": 0.66015625, + "learning_rate": 0.00030634595476124383, + "loss": 0.176, + "step": 309790 + }, + { + "epoch": 12.83, + "grad_norm": 1.625, + "learning_rate": 0.00030633538855971167, + "loss": 0.2126, + "step": 309800 + }, + { + "epoch": 12.83, + "grad_norm": 1.0703125, + "learning_rate": 0.0003063248222521624, + "loss": 0.2265, + "step": 309810 + }, + { + "epoch": 12.83, + "grad_norm": 1.3203125, + "learning_rate": 0.000306314255838616, + "loss": 0.1524, + "step": 309820 + }, + { + "epoch": 12.83, + "grad_norm": 0.80859375, + "learning_rate": 0.0003063036893190921, + "loss": 0.1777, + "step": 309830 + }, + { + "epoch": 12.83, + "grad_norm": 1.078125, + "learning_rate": 0.0003062931226936108, + "loss": 0.1762, + "step": 309840 + }, + { + "epoch": 12.83, + "grad_norm": 0.65625, + "learning_rate": 0.0003062825559621919, + "loss": 0.19, + "step": 309850 + }, + { + "epoch": 12.83, + "grad_norm": 0.7578125, + "learning_rate": 0.0003062719891248553, + "loss": 0.1798, + "step": 309860 + }, + { + "epoch": 12.83, + "grad_norm": 0.9609375, + "learning_rate": 0.000306261422181621, + "loss": 0.2228, + "step": 309870 + }, + { + "epoch": 12.84, + "grad_norm": 0.9375, + "learning_rate": 0.0003062508551325087, + "loss": 0.233, + "step": 309880 + }, + { + "epoch": 12.84, + "grad_norm": 0.87109375, + "learning_rate": 0.00030624028797753835, + "loss": 0.1867, + "step": 309890 + }, + { + "epoch": 12.84, + "grad_norm": 0.6640625, + "learning_rate": 0.0003062297207167299, + "loss": 0.1855, + "step": 309900 + }, + { + "epoch": 12.84, + "grad_norm": 0.1435546875, + "learning_rate": 0.00030621915335010306, + "loss": 0.2006, + "step": 309910 + }, + { + "epoch": 12.84, + "grad_norm": 1.25, + "learning_rate": 0.000306208585877678, + "loss": 0.1927, + "step": 309920 + }, + { + "epoch": 12.84, + "grad_norm": 0.640625, + "learning_rate": 0.0003061980182994744, + "loss": 0.2074, + "step": 309930 + }, + { + "epoch": 12.84, + "grad_norm": 0.34765625, + "learning_rate": 0.00030618745061551216, + "loss": 0.1208, + "step": 309940 + }, + { + "epoch": 12.84, + "grad_norm": 0.734375, + "learning_rate": 0.0003061768828258113, + "loss": 0.1789, + "step": 309950 + }, + { + "epoch": 12.84, + "grad_norm": 0.375, + "learning_rate": 0.00030616631493039147, + "loss": 0.1693, + "step": 309960 + }, + { + "epoch": 12.84, + "grad_norm": 1.8828125, + "learning_rate": 0.00030615574692927284, + "loss": 0.1821, + "step": 309970 + }, + { + "epoch": 12.84, + "grad_norm": 0.6953125, + "learning_rate": 0.00030614517882247513, + "loss": 0.1603, + "step": 309980 + }, + { + "epoch": 12.84, + "grad_norm": 0.96484375, + "learning_rate": 0.0003061346106100182, + "loss": 0.162, + "step": 309990 + }, + { + "epoch": 12.84, + "grad_norm": 1.40625, + "learning_rate": 0.00030612404229192204, + "loss": 0.2232, + "step": 310000 + }, + { + "epoch": 12.84, + "grad_norm": 1.0625, + "learning_rate": 0.0003061134738682065, + "loss": 0.1751, + "step": 310010 + }, + { + "epoch": 12.84, + "grad_norm": 0.95703125, + "learning_rate": 0.0003061029053388914, + "loss": 0.1845, + "step": 310020 + }, + { + "epoch": 12.84, + "grad_norm": 0.85546875, + "learning_rate": 0.00030609233670399677, + "loss": 0.1724, + "step": 310030 + }, + { + "epoch": 12.84, + "grad_norm": 1.4296875, + "learning_rate": 0.0003060817679635423, + "loss": 0.1984, + "step": 310040 + }, + { + "epoch": 12.84, + "grad_norm": 0.81640625, + "learning_rate": 0.0003060711991175481, + "loss": 0.2183, + "step": 310050 + }, + { + "epoch": 12.84, + "grad_norm": 0.7265625, + "learning_rate": 0.0003060606301660339, + "loss": 0.1943, + "step": 310060 + }, + { + "epoch": 12.84, + "grad_norm": 1.1796875, + "learning_rate": 0.0003060500611090197, + "loss": 0.1532, + "step": 310070 + }, + { + "epoch": 12.84, + "grad_norm": 0.94921875, + "learning_rate": 0.0003060394919465254, + "loss": 0.2076, + "step": 310080 + }, + { + "epoch": 12.84, + "grad_norm": 0.8125, + "learning_rate": 0.00030602892267857066, + "loss": 0.1183, + "step": 310090 + }, + { + "epoch": 12.84, + "grad_norm": 0.4609375, + "learning_rate": 0.0003060183533051756, + "loss": 0.1743, + "step": 310100 + }, + { + "epoch": 12.84, + "grad_norm": 0.77734375, + "learning_rate": 0.0003060077838263601, + "loss": 0.2055, + "step": 310110 + }, + { + "epoch": 12.85, + "grad_norm": 1.4296875, + "learning_rate": 0.000305997214242144, + "loss": 0.1914, + "step": 310120 + }, + { + "epoch": 12.85, + "grad_norm": 1.0859375, + "learning_rate": 0.0003059866445525472, + "loss": 0.1696, + "step": 310130 + }, + { + "epoch": 12.85, + "grad_norm": 0.71875, + "learning_rate": 0.0003059760747575895, + "loss": 0.1903, + "step": 310140 + }, + { + "epoch": 12.85, + "grad_norm": 1.2421875, + "learning_rate": 0.0003059655048572909, + "loss": 0.2181, + "step": 310150 + }, + { + "epoch": 12.85, + "grad_norm": 0.8203125, + "learning_rate": 0.0003059549348516714, + "loss": 0.1967, + "step": 310160 + }, + { + "epoch": 12.85, + "grad_norm": 1.3515625, + "learning_rate": 0.00030594436474075057, + "loss": 0.173, + "step": 310170 + }, + { + "epoch": 12.85, + "grad_norm": 1.0234375, + "learning_rate": 0.0003059337945245485, + "loss": 0.2025, + "step": 310180 + }, + { + "epoch": 12.85, + "grad_norm": 0.58984375, + "learning_rate": 0.0003059232242030852, + "loss": 0.2199, + "step": 310190 + }, + { + "epoch": 12.85, + "grad_norm": 0.486328125, + "learning_rate": 0.0003059126537763803, + "loss": 0.2013, + "step": 310200 + }, + { + "epoch": 12.85, + "grad_norm": 1.2109375, + "learning_rate": 0.0003059020832444539, + "loss": 0.1521, + "step": 310210 + }, + { + "epoch": 12.85, + "grad_norm": 0.9140625, + "learning_rate": 0.00030589151260732576, + "loss": 0.1849, + "step": 310220 + }, + { + "epoch": 12.85, + "grad_norm": 0.81640625, + "learning_rate": 0.00030588094186501585, + "loss": 0.2206, + "step": 310230 + }, + { + "epoch": 12.85, + "grad_norm": 0.75390625, + "learning_rate": 0.00030587037101754413, + "loss": 0.1812, + "step": 310240 + }, + { + "epoch": 12.85, + "grad_norm": 1.34375, + "learning_rate": 0.0003058598000649303, + "loss": 0.2006, + "step": 310250 + }, + { + "epoch": 12.85, + "grad_norm": 1.09375, + "learning_rate": 0.00030584922900719436, + "loss": 0.1716, + "step": 310260 + }, + { + "epoch": 12.85, + "grad_norm": 1.015625, + "learning_rate": 0.0003058386578443562, + "loss": 0.1726, + "step": 310270 + }, + { + "epoch": 12.85, + "grad_norm": 0.84765625, + "learning_rate": 0.0003058280865764358, + "loss": 0.1985, + "step": 310280 + }, + { + "epoch": 12.85, + "grad_norm": 0.58984375, + "learning_rate": 0.00030581751520345293, + "loss": 0.1237, + "step": 310290 + }, + { + "epoch": 12.85, + "grad_norm": 0.95703125, + "learning_rate": 0.0003058069437254275, + "loss": 0.2026, + "step": 310300 + }, + { + "epoch": 12.85, + "grad_norm": 1.6484375, + "learning_rate": 0.0003057963721423794, + "loss": 0.1225, + "step": 310310 + }, + { + "epoch": 12.85, + "grad_norm": 1.578125, + "learning_rate": 0.00030578580045432857, + "loss": 0.2499, + "step": 310320 + }, + { + "epoch": 12.85, + "grad_norm": 0.373046875, + "learning_rate": 0.00030577522866129497, + "loss": 0.2103, + "step": 310330 + }, + { + "epoch": 12.85, + "grad_norm": 1.1484375, + "learning_rate": 0.00030576465676329835, + "loss": 0.1622, + "step": 310340 + }, + { + "epoch": 12.85, + "grad_norm": 0.9765625, + "learning_rate": 0.00030575408476035866, + "loss": 0.1891, + "step": 310350 + }, + { + "epoch": 12.86, + "grad_norm": 0.46484375, + "learning_rate": 0.0003057435126524958, + "loss": 0.1635, + "step": 310360 + }, + { + "epoch": 12.86, + "grad_norm": 1.296875, + "learning_rate": 0.0003057329404397297, + "loss": 0.2005, + "step": 310370 + }, + { + "epoch": 12.86, + "grad_norm": 0.98046875, + "learning_rate": 0.0003057223681220802, + "loss": 0.185, + "step": 310380 + }, + { + "epoch": 12.86, + "grad_norm": 1.265625, + "learning_rate": 0.00030571179569956723, + "loss": 0.163, + "step": 310390 + }, + { + "epoch": 12.86, + "grad_norm": 0.66015625, + "learning_rate": 0.00030570122317221066, + "loss": 0.1835, + "step": 310400 + }, + { + "epoch": 12.86, + "grad_norm": 1.4609375, + "learning_rate": 0.0003056906505400305, + "loss": 0.2005, + "step": 310410 + }, + { + "epoch": 12.86, + "grad_norm": 0.6015625, + "learning_rate": 0.0003056800778030464, + "loss": 0.198, + "step": 310420 + }, + { + "epoch": 12.86, + "grad_norm": 0.61328125, + "learning_rate": 0.00030566950496127847, + "loss": 0.1809, + "step": 310430 + }, + { + "epoch": 12.86, + "grad_norm": 1.375, + "learning_rate": 0.0003056589320147465, + "loss": 0.1661, + "step": 310440 + }, + { + "epoch": 12.86, + "grad_norm": 0.7734375, + "learning_rate": 0.0003056483589634706, + "loss": 0.21, + "step": 310450 + }, + { + "epoch": 12.86, + "grad_norm": 0.6015625, + "learning_rate": 0.00030563778580747035, + "loss": 0.1949, + "step": 310460 + }, + { + "epoch": 12.86, + "grad_norm": 1.171875, + "learning_rate": 0.00030562721254676577, + "loss": 0.2138, + "step": 310470 + }, + { + "epoch": 12.86, + "grad_norm": 0.85546875, + "learning_rate": 0.0003056166391813768, + "loss": 0.2053, + "step": 310480 + }, + { + "epoch": 12.86, + "grad_norm": 1.421875, + "learning_rate": 0.0003056060657113234, + "loss": 0.14, + "step": 310490 + }, + { + "epoch": 12.86, + "grad_norm": 1.5546875, + "learning_rate": 0.00030559549213662543, + "loss": 0.1789, + "step": 310500 + }, + { + "epoch": 12.86, + "grad_norm": 1.1015625, + "learning_rate": 0.00030558491845730264, + "loss": 0.1942, + "step": 310510 + }, + { + "epoch": 12.86, + "grad_norm": 1.0078125, + "learning_rate": 0.00030557434467337504, + "loss": 0.2152, + "step": 310520 + }, + { + "epoch": 12.86, + "grad_norm": 0.70703125, + "learning_rate": 0.00030556377078486255, + "loss": 0.1685, + "step": 310530 + }, + { + "epoch": 12.86, + "grad_norm": 0.6953125, + "learning_rate": 0.00030555319679178504, + "loss": 0.173, + "step": 310540 + }, + { + "epoch": 12.86, + "grad_norm": 0.859375, + "learning_rate": 0.0003055426226941624, + "loss": 0.2631, + "step": 310550 + }, + { + "epoch": 12.86, + "grad_norm": 0.474609375, + "learning_rate": 0.00030553204849201456, + "loss": 0.1702, + "step": 310560 + }, + { + "epoch": 12.86, + "grad_norm": 1.046875, + "learning_rate": 0.0003055214741853614, + "loss": 0.1916, + "step": 310570 + }, + { + "epoch": 12.86, + "grad_norm": 1.09375, + "learning_rate": 0.0003055108997742228, + "loss": 0.2226, + "step": 310580 + }, + { + "epoch": 12.86, + "grad_norm": 2.671875, + "learning_rate": 0.0003055003252586187, + "loss": 0.1892, + "step": 310590 + }, + { + "epoch": 12.87, + "grad_norm": 0.671875, + "learning_rate": 0.000305489750638569, + "loss": 0.1794, + "step": 310600 + }, + { + "epoch": 12.87, + "grad_norm": 0.70703125, + "learning_rate": 0.0003054791759140935, + "loss": 0.2322, + "step": 310610 + }, + { + "epoch": 12.87, + "grad_norm": 0.302734375, + "learning_rate": 0.0003054686010852123, + "loss": 0.1641, + "step": 310620 + }, + { + "epoch": 12.87, + "grad_norm": 0.1767578125, + "learning_rate": 0.0003054580261519451, + "loss": 0.218, + "step": 310630 + }, + { + "epoch": 12.87, + "grad_norm": 0.83984375, + "learning_rate": 0.0003054474511143119, + "loss": 0.2126, + "step": 310640 + }, + { + "epoch": 12.87, + "grad_norm": 0.216796875, + "learning_rate": 0.00030543687597233267, + "loss": 0.2037, + "step": 310650 + }, + { + "epoch": 12.87, + "grad_norm": 0.7734375, + "learning_rate": 0.00030542630072602706, + "loss": 0.2059, + "step": 310660 + }, + { + "epoch": 12.87, + "grad_norm": 0.32421875, + "learning_rate": 0.0003054157253754153, + "loss": 0.2751, + "step": 310670 + }, + { + "epoch": 12.87, + "grad_norm": 1.078125, + "learning_rate": 0.000305405149920517, + "loss": 0.1945, + "step": 310680 + }, + { + "epoch": 12.87, + "grad_norm": 1.15625, + "learning_rate": 0.00030539457436135227, + "loss": 0.1623, + "step": 310690 + }, + { + "epoch": 12.87, + "grad_norm": 1.1796875, + "learning_rate": 0.0003053839986979409, + "loss": 0.1899, + "step": 310700 + }, + { + "epoch": 12.87, + "grad_norm": 0.5234375, + "learning_rate": 0.00030537342293030287, + "loss": 0.1741, + "step": 310710 + }, + { + "epoch": 12.87, + "grad_norm": 0.80859375, + "learning_rate": 0.000305362847058458, + "loss": 0.2391, + "step": 310720 + }, + { + "epoch": 12.87, + "grad_norm": 0.470703125, + "learning_rate": 0.00030535227108242625, + "loss": 0.1931, + "step": 310730 + }, + { + "epoch": 12.87, + "grad_norm": 0.1884765625, + "learning_rate": 0.0003053416950022274, + "loss": 0.2376, + "step": 310740 + }, + { + "epoch": 12.87, + "grad_norm": 0.734375, + "learning_rate": 0.00030533111881788163, + "loss": 0.1985, + "step": 310750 + }, + { + "epoch": 12.87, + "grad_norm": 0.546875, + "learning_rate": 0.00030532054252940853, + "loss": 0.1604, + "step": 310760 + }, + { + "epoch": 12.87, + "grad_norm": 0.375, + "learning_rate": 0.00030530996613682823, + "loss": 0.1708, + "step": 310770 + }, + { + "epoch": 12.87, + "grad_norm": 0.67578125, + "learning_rate": 0.0003052993896401605, + "loss": 0.1854, + "step": 310780 + }, + { + "epoch": 12.87, + "grad_norm": 2.109375, + "learning_rate": 0.00030528881303942536, + "loss": 0.1734, + "step": 310790 + }, + { + "epoch": 12.87, + "grad_norm": 0.89453125, + "learning_rate": 0.00030527823633464253, + "loss": 0.1656, + "step": 310800 + }, + { + "epoch": 12.87, + "grad_norm": 1.53125, + "learning_rate": 0.00030526765952583213, + "loss": 0.2148, + "step": 310810 + }, + { + "epoch": 12.87, + "grad_norm": 0.42578125, + "learning_rate": 0.0003052570826130139, + "loss": 0.2251, + "step": 310820 + }, + { + "epoch": 12.87, + "grad_norm": 0.7109375, + "learning_rate": 0.0003052465055962078, + "loss": 0.192, + "step": 310830 + }, + { + "epoch": 12.87, + "grad_norm": 0.74609375, + "learning_rate": 0.0003052359284754338, + "loss": 0.1723, + "step": 310840 + }, + { + "epoch": 12.88, + "grad_norm": 1.2734375, + "learning_rate": 0.00030522535125071174, + "loss": 0.1749, + "step": 310850 + }, + { + "epoch": 12.88, + "grad_norm": 0.73828125, + "learning_rate": 0.0003052147739220615, + "loss": 0.2187, + "step": 310860 + }, + { + "epoch": 12.88, + "grad_norm": 2.4375, + "learning_rate": 0.000305204196489503, + "loss": 0.1789, + "step": 310870 + }, + { + "epoch": 12.88, + "grad_norm": 0.76953125, + "learning_rate": 0.00030519361895305617, + "loss": 0.2476, + "step": 310880 + }, + { + "epoch": 12.88, + "grad_norm": 0.73828125, + "learning_rate": 0.000305183041312741, + "loss": 0.2101, + "step": 310890 + }, + { + "epoch": 12.88, + "grad_norm": 1.265625, + "learning_rate": 0.00030517246356857717, + "loss": 0.2208, + "step": 310900 + }, + { + "epoch": 12.88, + "grad_norm": 1.40625, + "learning_rate": 0.00030516188572058485, + "loss": 0.1828, + "step": 310910 + }, + { + "epoch": 12.88, + "grad_norm": 0.8671875, + "learning_rate": 0.00030515130776878374, + "loss": 0.2097, + "step": 310920 + }, + { + "epoch": 12.88, + "grad_norm": 0.8125, + "learning_rate": 0.00030514072971319386, + "loss": 0.2073, + "step": 310930 + }, + { + "epoch": 12.88, + "grad_norm": 0.59765625, + "learning_rate": 0.00030513015155383505, + "loss": 0.2041, + "step": 310940 + }, + { + "epoch": 12.88, + "grad_norm": 0.62890625, + "learning_rate": 0.0003051195732907272, + "loss": 0.1796, + "step": 310950 + }, + { + "epoch": 12.88, + "grad_norm": 0.625, + "learning_rate": 0.0003051089949238904, + "loss": 0.1667, + "step": 310960 + }, + { + "epoch": 12.88, + "grad_norm": 0.84765625, + "learning_rate": 0.0003050984164533444, + "loss": 0.1715, + "step": 310970 + }, + { + "epoch": 12.88, + "grad_norm": 0.67578125, + "learning_rate": 0.000305087837879109, + "loss": 0.1836, + "step": 310980 + }, + { + "epoch": 12.88, + "grad_norm": 0.48828125, + "learning_rate": 0.0003050772592012044, + "loss": 0.1983, + "step": 310990 + }, + { + "epoch": 12.88, + "grad_norm": 1.0703125, + "learning_rate": 0.00030506668041965025, + "loss": 0.2647, + "step": 311000 + }, + { + "epoch": 12.88, + "grad_norm": 1.0859375, + "learning_rate": 0.0003050561015344665, + "loss": 0.1894, + "step": 311010 + }, + { + "epoch": 12.88, + "grad_norm": 1.03125, + "learning_rate": 0.0003050455225456732, + "loss": 0.209, + "step": 311020 + }, + { + "epoch": 12.88, + "grad_norm": 0.7734375, + "learning_rate": 0.00030503494345329014, + "loss": 0.2134, + "step": 311030 + }, + { + "epoch": 12.88, + "grad_norm": 0.88671875, + "learning_rate": 0.00030502436425733733, + "loss": 0.1895, + "step": 311040 + }, + { + "epoch": 12.88, + "grad_norm": 0.93359375, + "learning_rate": 0.00030501378495783455, + "loss": 0.2107, + "step": 311050 + }, + { + "epoch": 12.88, + "grad_norm": 0.7421875, + "learning_rate": 0.00030500320555480173, + "loss": 0.1705, + "step": 311060 + }, + { + "epoch": 12.88, + "grad_norm": 1.0, + "learning_rate": 0.0003049926260482589, + "loss": 0.1914, + "step": 311070 + }, + { + "epoch": 12.88, + "grad_norm": 1.0703125, + "learning_rate": 0.0003049820464382258, + "loss": 0.192, + "step": 311080 + }, + { + "epoch": 12.89, + "grad_norm": 1.015625, + "learning_rate": 0.0003049714667247225, + "loss": 0.253, + "step": 311090 + }, + { + "epoch": 12.89, + "grad_norm": 0.60546875, + "learning_rate": 0.0003049608869077688, + "loss": 0.2231, + "step": 311100 + }, + { + "epoch": 12.89, + "grad_norm": 0.6484375, + "learning_rate": 0.0003049503069873846, + "loss": 0.2285, + "step": 311110 + }, + { + "epoch": 12.89, + "grad_norm": 0.97265625, + "learning_rate": 0.00030493972696358997, + "loss": 0.1891, + "step": 311120 + }, + { + "epoch": 12.89, + "grad_norm": 0.58203125, + "learning_rate": 0.0003049291468364046, + "loss": 0.2129, + "step": 311130 + }, + { + "epoch": 12.89, + "grad_norm": 0.58984375, + "learning_rate": 0.00030491856660584855, + "loss": 0.1784, + "step": 311140 + }, + { + "epoch": 12.89, + "grad_norm": 0.69140625, + "learning_rate": 0.0003049079862719417, + "loss": 0.1919, + "step": 311150 + }, + { + "epoch": 12.89, + "grad_norm": 1.140625, + "learning_rate": 0.00030489740583470394, + "loss": 0.2006, + "step": 311160 + }, + { + "epoch": 12.89, + "grad_norm": 0.80859375, + "learning_rate": 0.00030488682529415515, + "loss": 0.2164, + "step": 311170 + }, + { + "epoch": 12.89, + "grad_norm": 0.52734375, + "learning_rate": 0.0003048762446503154, + "loss": 0.2296, + "step": 311180 + }, + { + "epoch": 12.89, + "grad_norm": 0.482421875, + "learning_rate": 0.00030486566390320436, + "loss": 0.1607, + "step": 311190 + }, + { + "epoch": 12.89, + "grad_norm": 0.96875, + "learning_rate": 0.00030485508305284214, + "loss": 0.2241, + "step": 311200 + }, + { + "epoch": 12.89, + "grad_norm": 0.5234375, + "learning_rate": 0.00030484450209924854, + "loss": 0.1784, + "step": 311210 + }, + { + "epoch": 12.89, + "grad_norm": 0.59375, + "learning_rate": 0.0003048339210424435, + "loss": 0.2045, + "step": 311220 + }, + { + "epoch": 12.89, + "grad_norm": 0.859375, + "learning_rate": 0.000304823339882447, + "loss": 0.1894, + "step": 311230 + }, + { + "epoch": 12.89, + "grad_norm": 0.255859375, + "learning_rate": 0.00030481275861927884, + "loss": 0.2085, + "step": 311240 + }, + { + "epoch": 12.89, + "grad_norm": 0.6640625, + "learning_rate": 0.000304802177252959, + "loss": 0.1632, + "step": 311250 + }, + { + "epoch": 12.89, + "grad_norm": 2.078125, + "learning_rate": 0.0003047915957835074, + "loss": 0.1707, + "step": 311260 + }, + { + "epoch": 12.89, + "grad_norm": 1.390625, + "learning_rate": 0.0003047810142109439, + "loss": 0.1894, + "step": 311270 + }, + { + "epoch": 12.89, + "grad_norm": 0.8515625, + "learning_rate": 0.00030477043253528845, + "loss": 0.2237, + "step": 311280 + }, + { + "epoch": 12.89, + "grad_norm": 0.87890625, + "learning_rate": 0.000304759850756561, + "loss": 0.1874, + "step": 311290 + }, + { + "epoch": 12.89, + "grad_norm": 1.3828125, + "learning_rate": 0.00030474926887478137, + "loss": 0.246, + "step": 311300 + }, + { + "epoch": 12.89, + "grad_norm": 2.359375, + "learning_rate": 0.0003047386868899696, + "loss": 0.2243, + "step": 311310 + }, + { + "epoch": 12.89, + "grad_norm": 0.578125, + "learning_rate": 0.00030472810480214546, + "loss": 0.1988, + "step": 311320 + }, + { + "epoch": 12.9, + "grad_norm": 1.1328125, + "learning_rate": 0.00030471752261132897, + "loss": 0.1781, + "step": 311330 + }, + { + "epoch": 12.9, + "grad_norm": 0.6875, + "learning_rate": 0.00030470694031754005, + "loss": 0.1851, + "step": 311340 + }, + { + "epoch": 12.9, + "grad_norm": 0.546875, + "learning_rate": 0.0003046963579207985, + "loss": 0.2187, + "step": 311350 + }, + { + "epoch": 12.9, + "grad_norm": 0.6015625, + "learning_rate": 0.00030468577542112435, + "loss": 0.2316, + "step": 311360 + }, + { + "epoch": 12.9, + "grad_norm": 0.73828125, + "learning_rate": 0.00030467519281853747, + "loss": 0.214, + "step": 311370 + }, + { + "epoch": 12.9, + "grad_norm": 0.69921875, + "learning_rate": 0.0003046646101130578, + "loss": 0.201, + "step": 311380 + }, + { + "epoch": 12.9, + "grad_norm": 1.53125, + "learning_rate": 0.0003046540273047053, + "loss": 0.1802, + "step": 311390 + }, + { + "epoch": 12.9, + "grad_norm": 1.578125, + "learning_rate": 0.00030464344439349965, + "loss": 0.2461, + "step": 311400 + }, + { + "epoch": 12.9, + "grad_norm": 1.2421875, + "learning_rate": 0.0003046328613794611, + "loss": 0.2515, + "step": 311410 + }, + { + "epoch": 12.9, + "grad_norm": 0.75390625, + "learning_rate": 0.00030462227826260924, + "loss": 0.2108, + "step": 311420 + }, + { + "epoch": 12.9, + "grad_norm": 0.79296875, + "learning_rate": 0.00030461169504296425, + "loss": 0.1738, + "step": 311430 + }, + { + "epoch": 12.9, + "grad_norm": 0.7890625, + "learning_rate": 0.000304601111720546, + "loss": 0.1952, + "step": 311440 + }, + { + "epoch": 12.9, + "grad_norm": 0.9140625, + "learning_rate": 0.0003045905282953743, + "loss": 0.1822, + "step": 311450 + }, + { + "epoch": 12.9, + "grad_norm": 1.2109375, + "learning_rate": 0.0003045799447674691, + "loss": 0.1787, + "step": 311460 + }, + { + "epoch": 12.9, + "grad_norm": 0.90625, + "learning_rate": 0.00030456936113685044, + "loss": 0.2526, + "step": 311470 + }, + { + "epoch": 12.9, + "grad_norm": 0.53515625, + "learning_rate": 0.000304558777403538, + "loss": 0.2013, + "step": 311480 + }, + { + "epoch": 12.9, + "grad_norm": 1.640625, + "learning_rate": 0.0003045481935675519, + "loss": 0.1721, + "step": 311490 + }, + { + "epoch": 12.9, + "grad_norm": 2.09375, + "learning_rate": 0.00030453760962891193, + "loss": 0.2464, + "step": 311500 + }, + { + "epoch": 12.9, + "grad_norm": 0.53125, + "learning_rate": 0.0003045270255876381, + "loss": 0.2425, + "step": 311510 + }, + { + "epoch": 12.9, + "grad_norm": 0.79296875, + "learning_rate": 0.0003045164414437503, + "loss": 0.1963, + "step": 311520 + }, + { + "epoch": 12.9, + "grad_norm": 0.431640625, + "learning_rate": 0.00030450585719726846, + "loss": 0.2227, + "step": 311530 + }, + { + "epoch": 12.9, + "grad_norm": 1.4375, + "learning_rate": 0.0003044952728482125, + "loss": 0.1555, + "step": 311540 + }, + { + "epoch": 12.9, + "grad_norm": 0.65625, + "learning_rate": 0.0003044846883966023, + "loss": 0.1969, + "step": 311550 + }, + { + "epoch": 12.9, + "grad_norm": 0.921875, + "learning_rate": 0.00030447410384245785, + "loss": 0.2058, + "step": 311560 + }, + { + "epoch": 12.91, + "grad_norm": 0.671875, + "learning_rate": 0.00030446351918579894, + "loss": 0.2019, + "step": 311570 + }, + { + "epoch": 12.91, + "grad_norm": 0.8515625, + "learning_rate": 0.0003044529344266456, + "loss": 0.1912, + "step": 311580 + }, + { + "epoch": 12.91, + "grad_norm": 0.8359375, + "learning_rate": 0.0003044423495650177, + "loss": 0.2369, + "step": 311590 + }, + { + "epoch": 12.91, + "grad_norm": 1.078125, + "learning_rate": 0.00030443176460093527, + "loss": 0.1815, + "step": 311600 + }, + { + "epoch": 12.91, + "grad_norm": 0.8671875, + "learning_rate": 0.000304421179534418, + "loss": 0.2175, + "step": 311610 + }, + { + "epoch": 12.91, + "grad_norm": 1.25, + "learning_rate": 0.000304410594365486, + "loss": 0.2013, + "step": 311620 + }, + { + "epoch": 12.91, + "grad_norm": 0.341796875, + "learning_rate": 0.0003044000090941591, + "loss": 0.1644, + "step": 311630 + }, + { + "epoch": 12.91, + "grad_norm": 0.69140625, + "learning_rate": 0.0003043894237204573, + "loss": 0.2346, + "step": 311640 + }, + { + "epoch": 12.91, + "grad_norm": 1.5234375, + "learning_rate": 0.00030437883824440055, + "loss": 0.2201, + "step": 311650 + }, + { + "epoch": 12.91, + "grad_norm": 0.90234375, + "learning_rate": 0.00030436825266600857, + "loss": 0.1734, + "step": 311660 + }, + { + "epoch": 12.91, + "grad_norm": 1.1875, + "learning_rate": 0.00030435766698530146, + "loss": 0.2146, + "step": 311670 + }, + { + "epoch": 12.91, + "grad_norm": 0.65625, + "learning_rate": 0.0003043470812022991, + "loss": 0.1999, + "step": 311680 + }, + { + "epoch": 12.91, + "grad_norm": 0.85546875, + "learning_rate": 0.00030433649531702144, + "loss": 0.2429, + "step": 311690 + }, + { + "epoch": 12.91, + "grad_norm": 2.625, + "learning_rate": 0.0003043259093294883, + "loss": 0.2007, + "step": 311700 + }, + { + "epoch": 12.91, + "grad_norm": 1.28125, + "learning_rate": 0.0003043153232397197, + "loss": 0.206, + "step": 311710 + }, + { + "epoch": 12.91, + "grad_norm": 0.5078125, + "learning_rate": 0.00030430473704773554, + "loss": 0.1805, + "step": 311720 + }, + { + "epoch": 12.91, + "grad_norm": 1.1953125, + "learning_rate": 0.00030429415075355573, + "loss": 0.1858, + "step": 311730 + }, + { + "epoch": 12.91, + "grad_norm": 0.734375, + "learning_rate": 0.00030428356435720016, + "loss": 0.1808, + "step": 311740 + }, + { + "epoch": 12.91, + "grad_norm": 0.84375, + "learning_rate": 0.0003042729778586888, + "loss": 0.2048, + "step": 311750 + }, + { + "epoch": 12.91, + "grad_norm": 0.78125, + "learning_rate": 0.00030426239125804156, + "loss": 0.2371, + "step": 311760 + }, + { + "epoch": 12.91, + "grad_norm": 0.90234375, + "learning_rate": 0.00030425180455527837, + "loss": 0.185, + "step": 311770 + }, + { + "epoch": 12.91, + "grad_norm": 0.40234375, + "learning_rate": 0.00030424121775041906, + "loss": 0.2157, + "step": 311780 + }, + { + "epoch": 12.91, + "grad_norm": 0.53515625, + "learning_rate": 0.0003042306308434837, + "loss": 0.2134, + "step": 311790 + }, + { + "epoch": 12.91, + "grad_norm": 0.62890625, + "learning_rate": 0.00030422004383449215, + "loss": 0.1551, + "step": 311800 + }, + { + "epoch": 12.92, + "grad_norm": 0.51171875, + "learning_rate": 0.00030420945672346433, + "loss": 0.1834, + "step": 311810 + }, + { + "epoch": 12.92, + "grad_norm": 0.435546875, + "learning_rate": 0.0003041988695104202, + "loss": 0.2033, + "step": 311820 + }, + { + "epoch": 12.92, + "grad_norm": 0.87890625, + "learning_rate": 0.0003041882821953796, + "loss": 0.1638, + "step": 311830 + }, + { + "epoch": 12.92, + "grad_norm": 0.9765625, + "learning_rate": 0.0003041776947783625, + "loss": 0.2245, + "step": 311840 + }, + { + "epoch": 12.92, + "grad_norm": 0.72265625, + "learning_rate": 0.0003041671072593889, + "loss": 0.2391, + "step": 311850 + }, + { + "epoch": 12.92, + "grad_norm": 1.34375, + "learning_rate": 0.00030415651963847855, + "loss": 0.144, + "step": 311860 + }, + { + "epoch": 12.92, + "grad_norm": 0.0, + "learning_rate": 0.00030414593191565155, + "loss": 0.1868, + "step": 311870 + }, + { + "epoch": 12.92, + "grad_norm": 1.1953125, + "learning_rate": 0.0003041353440909277, + "loss": 0.2105, + "step": 311880 + }, + { + "epoch": 12.92, + "grad_norm": 0.52734375, + "learning_rate": 0.0003041247561643271, + "loss": 0.1679, + "step": 311890 + }, + { + "epoch": 12.92, + "grad_norm": 1.1484375, + "learning_rate": 0.00030411416813586945, + "loss": 0.2217, + "step": 311900 + }, + { + "epoch": 12.92, + "grad_norm": 1.0234375, + "learning_rate": 0.0003041035800055748, + "loss": 0.2138, + "step": 311910 + }, + { + "epoch": 12.92, + "grad_norm": 0.4609375, + "learning_rate": 0.000304092991773463, + "loss": 0.1744, + "step": 311920 + }, + { + "epoch": 12.92, + "grad_norm": 0.419921875, + "learning_rate": 0.0003040824034395541, + "loss": 0.1526, + "step": 311930 + }, + { + "epoch": 12.92, + "grad_norm": 1.2109375, + "learning_rate": 0.00030407181500386794, + "loss": 0.1611, + "step": 311940 + }, + { + "epoch": 12.92, + "grad_norm": 1.2421875, + "learning_rate": 0.0003040612264664245, + "loss": 0.2273, + "step": 311950 + }, + { + "epoch": 12.92, + "grad_norm": 0.76953125, + "learning_rate": 0.0003040506378272436, + "loss": 0.1617, + "step": 311960 + }, + { + "epoch": 12.92, + "grad_norm": 1.21875, + "learning_rate": 0.0003040400490863453, + "loss": 0.164, + "step": 311970 + }, + { + "epoch": 12.92, + "grad_norm": 1.515625, + "learning_rate": 0.00030402946024374946, + "loss": 0.1783, + "step": 311980 + }, + { + "epoch": 12.92, + "grad_norm": 0.53125, + "learning_rate": 0.0003040188712994759, + "loss": 0.1888, + "step": 311990 + }, + { + "epoch": 12.92, + "grad_norm": 0.62890625, + "learning_rate": 0.0003040082822535448, + "loss": 0.216, + "step": 312000 + }, + { + "epoch": 12.92, + "grad_norm": 0.69140625, + "learning_rate": 0.0003039976931059759, + "loss": 0.1737, + "step": 312010 + }, + { + "epoch": 12.92, + "grad_norm": 0.369140625, + "learning_rate": 0.00030398710385678914, + "loss": 0.1222, + "step": 312020 + }, + { + "epoch": 12.92, + "grad_norm": 1.078125, + "learning_rate": 0.0003039765145060045, + "loss": 0.2165, + "step": 312030 + }, + { + "epoch": 12.92, + "grad_norm": 0.369140625, + "learning_rate": 0.0003039659250536418, + "loss": 0.2193, + "step": 312040 + }, + { + "epoch": 12.93, + "grad_norm": 0.390625, + "learning_rate": 0.0003039553354997212, + "loss": 0.1693, + "step": 312050 + }, + { + "epoch": 12.93, + "grad_norm": 0.78125, + "learning_rate": 0.00030394474584426244, + "loss": 0.2096, + "step": 312060 + }, + { + "epoch": 12.93, + "grad_norm": 0.5703125, + "learning_rate": 0.0003039341560872855, + "loss": 0.1685, + "step": 312070 + }, + { + "epoch": 12.93, + "grad_norm": 0.77734375, + "learning_rate": 0.0003039235662288103, + "loss": 0.1558, + "step": 312080 + }, + { + "epoch": 12.93, + "grad_norm": 0.498046875, + "learning_rate": 0.00030391297626885676, + "loss": 0.2156, + "step": 312090 + }, + { + "epoch": 12.93, + "grad_norm": 0.318359375, + "learning_rate": 0.0003039023862074448, + "loss": 0.2002, + "step": 312100 + }, + { + "epoch": 12.93, + "grad_norm": 0.9765625, + "learning_rate": 0.00030389179604459437, + "loss": 0.205, + "step": 312110 + }, + { + "epoch": 12.93, + "grad_norm": 0.7890625, + "learning_rate": 0.0003038812057803255, + "loss": 0.186, + "step": 312120 + }, + { + "epoch": 12.93, + "grad_norm": 0.71875, + "learning_rate": 0.0003038706154146579, + "loss": 0.1863, + "step": 312130 + }, + { + "epoch": 12.93, + "grad_norm": 0.83203125, + "learning_rate": 0.00030386002494761167, + "loss": 0.1776, + "step": 312140 + }, + { + "epoch": 12.93, + "grad_norm": 1.203125, + "learning_rate": 0.0003038494343792066, + "loss": 0.173, + "step": 312150 + }, + { + "epoch": 12.93, + "grad_norm": 0.36328125, + "learning_rate": 0.0003038388437094629, + "loss": 0.1741, + "step": 312160 + }, + { + "epoch": 12.93, + "grad_norm": 1.0625, + "learning_rate": 0.0003038282529384002, + "loss": 0.2046, + "step": 312170 + }, + { + "epoch": 12.93, + "grad_norm": 0.953125, + "learning_rate": 0.00030381766206603845, + "loss": 0.1554, + "step": 312180 + }, + { + "epoch": 12.93, + "grad_norm": 0.59375, + "learning_rate": 0.0003038070710923978, + "loss": 0.1712, + "step": 312190 + }, + { + "epoch": 12.93, + "grad_norm": 0.75390625, + "learning_rate": 0.00030379648001749803, + "loss": 0.1631, + "step": 312200 + }, + { + "epoch": 12.93, + "grad_norm": 1.046875, + "learning_rate": 0.00030378588884135907, + "loss": 0.1957, + "step": 312210 + }, + { + "epoch": 12.93, + "grad_norm": 1.0703125, + "learning_rate": 0.0003037752975640009, + "loss": 0.1271, + "step": 312220 + }, + { + "epoch": 12.93, + "grad_norm": 0.80078125, + "learning_rate": 0.0003037647061854434, + "loss": 0.1578, + "step": 312230 + }, + { + "epoch": 12.93, + "grad_norm": 0.00038909912109375, + "learning_rate": 0.0003037541147057065, + "loss": 0.1925, + "step": 312240 + }, + { + "epoch": 12.93, + "grad_norm": 0.59765625, + "learning_rate": 0.00030374352312481027, + "loss": 0.2081, + "step": 312250 + }, + { + "epoch": 12.93, + "grad_norm": 0.671875, + "learning_rate": 0.00030373293144277444, + "loss": 0.19, + "step": 312260 + }, + { + "epoch": 12.93, + "grad_norm": 0.61328125, + "learning_rate": 0.00030372233965961904, + "loss": 0.1754, + "step": 312270 + }, + { + "epoch": 12.93, + "grad_norm": 1.7109375, + "learning_rate": 0.00030371174777536406, + "loss": 0.174, + "step": 312280 + }, + { + "epoch": 12.94, + "grad_norm": 1.015625, + "learning_rate": 0.0003037011557900293, + "loss": 0.1494, + "step": 312290 + }, + { + "epoch": 12.94, + "grad_norm": 1.3828125, + "learning_rate": 0.0003036905637036348, + "loss": 0.2108, + "step": 312300 + }, + { + "epoch": 12.94, + "grad_norm": 2.34375, + "learning_rate": 0.00030367997151620045, + "loss": 0.1916, + "step": 312310 + }, + { + "epoch": 12.94, + "grad_norm": 1.2109375, + "learning_rate": 0.00030366937922774617, + "loss": 0.1865, + "step": 312320 + }, + { + "epoch": 12.94, + "grad_norm": 0.69921875, + "learning_rate": 0.000303658786838292, + "loss": 0.22, + "step": 312330 + }, + { + "epoch": 12.94, + "grad_norm": 0.78515625, + "learning_rate": 0.00030364819434785764, + "loss": 0.1907, + "step": 312340 + }, + { + "epoch": 12.94, + "grad_norm": 0.765625, + "learning_rate": 0.00030363760175646334, + "loss": 0.2043, + "step": 312350 + }, + { + "epoch": 12.94, + "grad_norm": 0.30859375, + "learning_rate": 0.00030362700906412873, + "loss": 0.2055, + "step": 312360 + }, + { + "epoch": 12.94, + "grad_norm": 0.8984375, + "learning_rate": 0.00030361641627087393, + "loss": 0.1751, + "step": 312370 + }, + { + "epoch": 12.94, + "grad_norm": 0.490234375, + "learning_rate": 0.0003036058233767189, + "loss": 0.2017, + "step": 312380 + }, + { + "epoch": 12.94, + "grad_norm": 1.2421875, + "learning_rate": 0.0003035952303816833, + "loss": 0.1992, + "step": 312390 + }, + { + "epoch": 12.94, + "grad_norm": 0.490234375, + "learning_rate": 0.00030358463728578743, + "loss": 0.16, + "step": 312400 + }, + { + "epoch": 12.94, + "grad_norm": 0.625, + "learning_rate": 0.00030357404408905107, + "loss": 0.1528, + "step": 312410 + }, + { + "epoch": 12.94, + "grad_norm": 0.486328125, + "learning_rate": 0.00030356345079149405, + "loss": 0.2242, + "step": 312420 + }, + { + "epoch": 12.94, + "grad_norm": 0.99609375, + "learning_rate": 0.0003035528573931365, + "loss": 0.1682, + "step": 312430 + }, + { + "epoch": 12.94, + "grad_norm": 1.7265625, + "learning_rate": 0.00030354226389399814, + "loss": 0.2065, + "step": 312440 + }, + { + "epoch": 12.94, + "grad_norm": 1.0625, + "learning_rate": 0.00030353167029409904, + "loss": 0.1732, + "step": 312450 + }, + { + "epoch": 12.94, + "grad_norm": 1.59375, + "learning_rate": 0.00030352107659345923, + "loss": 0.2129, + "step": 312460 + }, + { + "epoch": 12.94, + "grad_norm": 0.6015625, + "learning_rate": 0.0003035104827920984, + "loss": 0.1893, + "step": 312470 + }, + { + "epoch": 12.94, + "grad_norm": 0.9296875, + "learning_rate": 0.0003034998888900367, + "loss": 0.1761, + "step": 312480 + }, + { + "epoch": 12.94, + "grad_norm": 0.78125, + "learning_rate": 0.00030348929488729405, + "loss": 0.191, + "step": 312490 + }, + { + "epoch": 12.94, + "grad_norm": 1.421875, + "learning_rate": 0.00030347870078389016, + "loss": 0.1637, + "step": 312500 + }, + { + "epoch": 12.94, + "grad_norm": 1.140625, + "learning_rate": 0.00030346810657984525, + "loss": 0.2206, + "step": 312510 + }, + { + "epoch": 12.94, + "grad_norm": 0.8203125, + "learning_rate": 0.00030345751227517907, + "loss": 0.1878, + "step": 312520 + }, + { + "epoch": 12.94, + "grad_norm": 0.423828125, + "learning_rate": 0.0003034469178699116, + "loss": 0.1814, + "step": 312530 + }, + { + "epoch": 12.95, + "grad_norm": 1.0859375, + "learning_rate": 0.000303436323364063, + "loss": 0.164, + "step": 312540 + }, + { + "epoch": 12.95, + "grad_norm": 0.302734375, + "learning_rate": 0.0003034257287576527, + "loss": 0.2323, + "step": 312550 + }, + { + "epoch": 12.95, + "grad_norm": 0.38671875, + "learning_rate": 0.0003034151340507012, + "loss": 0.1283, + "step": 312560 + }, + { + "epoch": 12.95, + "grad_norm": 0.5703125, + "learning_rate": 0.00030340453924322815, + "loss": 0.1768, + "step": 312570 + }, + { + "epoch": 12.95, + "grad_norm": 0.82421875, + "learning_rate": 0.0003033939443352534, + "loss": 0.1961, + "step": 312580 + }, + { + "epoch": 12.95, + "grad_norm": 0.65234375, + "learning_rate": 0.0003033833493267971, + "loss": 0.1828, + "step": 312590 + }, + { + "epoch": 12.95, + "grad_norm": 0.8203125, + "learning_rate": 0.0003033727542178791, + "loss": 0.168, + "step": 312600 + }, + { + "epoch": 12.95, + "grad_norm": 1.4609375, + "learning_rate": 0.0003033621590085193, + "loss": 0.2253, + "step": 312610 + }, + { + "epoch": 12.95, + "grad_norm": 1.03125, + "learning_rate": 0.0003033515636987378, + "loss": 0.2673, + "step": 312620 + }, + { + "epoch": 12.95, + "grad_norm": 1.6796875, + "learning_rate": 0.00030334096828855424, + "loss": 0.1907, + "step": 312630 + }, + { + "epoch": 12.95, + "grad_norm": 0.41796875, + "learning_rate": 0.00030333037277798885, + "loss": 0.191, + "step": 312640 + }, + { + "epoch": 12.95, + "grad_norm": 1.6484375, + "learning_rate": 0.0003033197771670615, + "loss": 0.1647, + "step": 312650 + }, + { + "epoch": 12.95, + "grad_norm": 0.7265625, + "learning_rate": 0.00030330918145579194, + "loss": 0.1593, + "step": 312660 + }, + { + "epoch": 12.95, + "grad_norm": 0.515625, + "learning_rate": 0.00030329858564420037, + "loss": 0.1637, + "step": 312670 + }, + { + "epoch": 12.95, + "grad_norm": 0.482421875, + "learning_rate": 0.0003032879897323066, + "loss": 0.2331, + "step": 312680 + }, + { + "epoch": 12.95, + "grad_norm": 0.8515625, + "learning_rate": 0.0003032773937201305, + "loss": 0.208, + "step": 312690 + }, + { + "epoch": 12.95, + "grad_norm": 0.36328125, + "learning_rate": 0.00030326679760769225, + "loss": 0.2139, + "step": 312700 + }, + { + "epoch": 12.95, + "grad_norm": 1.0078125, + "learning_rate": 0.00030325620139501146, + "loss": 0.1762, + "step": 312710 + }, + { + "epoch": 12.95, + "grad_norm": 1.421875, + "learning_rate": 0.0003032456050821084, + "loss": 0.1768, + "step": 312720 + }, + { + "epoch": 12.95, + "grad_norm": 0.318359375, + "learning_rate": 0.0003032350086690029, + "loss": 0.2076, + "step": 312730 + }, + { + "epoch": 12.95, + "grad_norm": 0.6015625, + "learning_rate": 0.00030322441215571466, + "loss": 0.1983, + "step": 312740 + }, + { + "epoch": 12.95, + "grad_norm": 0.734375, + "learning_rate": 0.000303213815542264, + "loss": 0.1778, + "step": 312750 + }, + { + "epoch": 12.95, + "grad_norm": 0.78125, + "learning_rate": 0.0003032032188286706, + "loss": 0.2172, + "step": 312760 + }, + { + "epoch": 12.95, + "grad_norm": 0.1875, + "learning_rate": 0.0003031926220149545, + "loss": 0.2284, + "step": 312770 + }, + { + "epoch": 12.96, + "grad_norm": 0.6796875, + "learning_rate": 0.0003031820251011357, + "loss": 0.1713, + "step": 312780 + }, + { + "epoch": 12.96, + "grad_norm": 0.63671875, + "learning_rate": 0.000303171428087234, + "loss": 0.1718, + "step": 312790 + }, + { + "epoch": 12.96, + "grad_norm": 0.83203125, + "learning_rate": 0.00030316083097326943, + "loss": 0.2251, + "step": 312800 + }, + { + "epoch": 12.96, + "grad_norm": 0.388671875, + "learning_rate": 0.00030315023375926186, + "loss": 0.1914, + "step": 312810 + }, + { + "epoch": 12.96, + "grad_norm": 1.3125, + "learning_rate": 0.00030313963644523133, + "loss": 0.2288, + "step": 312820 + }, + { + "epoch": 12.96, + "grad_norm": 1.234375, + "learning_rate": 0.00030312903903119783, + "loss": 0.1471, + "step": 312830 + }, + { + "epoch": 12.96, + "grad_norm": 0.859375, + "learning_rate": 0.00030311844151718117, + "loss": 0.2272, + "step": 312840 + }, + { + "epoch": 12.96, + "grad_norm": 0.8984375, + "learning_rate": 0.0003031078439032013, + "loss": 0.1606, + "step": 312850 + }, + { + "epoch": 12.96, + "grad_norm": 1.4375, + "learning_rate": 0.0003030972461892782, + "loss": 0.2044, + "step": 312860 + }, + { + "epoch": 12.96, + "grad_norm": 0.70703125, + "learning_rate": 0.0003030866483754318, + "loss": 0.176, + "step": 312870 + }, + { + "epoch": 12.96, + "grad_norm": 0.87890625, + "learning_rate": 0.00030307605046168214, + "loss": 0.2151, + "step": 312880 + }, + { + "epoch": 12.96, + "grad_norm": 0.53515625, + "learning_rate": 0.000303065452448049, + "loss": 0.2068, + "step": 312890 + }, + { + "epoch": 12.96, + "grad_norm": 0.388671875, + "learning_rate": 0.00030305485433455246, + "loss": 0.1483, + "step": 312900 + }, + { + "epoch": 12.96, + "grad_norm": 1.609375, + "learning_rate": 0.00030304425612121244, + "loss": 0.1848, + "step": 312910 + }, + { + "epoch": 12.96, + "grad_norm": 0.43359375, + "learning_rate": 0.00030303365780804883, + "loss": 0.1705, + "step": 312920 + }, + { + "epoch": 12.96, + "grad_norm": 0.3671875, + "learning_rate": 0.0003030230593950815, + "loss": 0.2099, + "step": 312930 + }, + { + "epoch": 12.96, + "grad_norm": 0.7890625, + "learning_rate": 0.0003030124608823306, + "loss": 0.2164, + "step": 312940 + }, + { + "epoch": 12.96, + "grad_norm": 1.71875, + "learning_rate": 0.00030300186226981594, + "loss": 0.2178, + "step": 312950 + }, + { + "epoch": 12.96, + "grad_norm": 1.1015625, + "learning_rate": 0.0003029912635575576, + "loss": 0.2025, + "step": 312960 + }, + { + "epoch": 12.96, + "grad_norm": 0.72265625, + "learning_rate": 0.0003029806647455753, + "loss": 0.1606, + "step": 312970 + }, + { + "epoch": 12.96, + "grad_norm": 0.671875, + "learning_rate": 0.00030297006583388913, + "loss": 0.2154, + "step": 312980 + }, + { + "epoch": 12.96, + "grad_norm": 0.0, + "learning_rate": 0.0003029594668225191, + "loss": 0.1726, + "step": 312990 + }, + { + "epoch": 12.96, + "grad_norm": 1.5078125, + "learning_rate": 0.000302948867711485, + "loss": 0.1751, + "step": 313000 + }, + { + "epoch": 12.96, + "grad_norm": 2.421875, + "learning_rate": 0.00030293826850080687, + "loss": 0.2008, + "step": 313010 + }, + { + "epoch": 12.97, + "grad_norm": 0.73046875, + "learning_rate": 0.00030292766919050464, + "loss": 0.1766, + "step": 313020 + }, + { + "epoch": 12.97, + "grad_norm": 0.74609375, + "learning_rate": 0.00030291706978059817, + "loss": 0.155, + "step": 313030 + }, + { + "epoch": 12.97, + "grad_norm": 0.6015625, + "learning_rate": 0.0003029064702711076, + "loss": 0.1466, + "step": 313040 + }, + { + "epoch": 12.97, + "grad_norm": 0.86328125, + "learning_rate": 0.0003028958706620528, + "loss": 0.2417, + "step": 313050 + }, + { + "epoch": 12.97, + "grad_norm": 0.88671875, + "learning_rate": 0.0003028852709534536, + "loss": 0.192, + "step": 313060 + }, + { + "epoch": 12.97, + "grad_norm": 0.92578125, + "learning_rate": 0.0003028746711453301, + "loss": 0.1212, + "step": 313070 + }, + { + "epoch": 12.97, + "grad_norm": 0.875, + "learning_rate": 0.00030286407123770206, + "loss": 0.2015, + "step": 313080 + }, + { + "epoch": 12.97, + "grad_norm": 1.1015625, + "learning_rate": 0.0003028534712305896, + "loss": 0.2933, + "step": 313090 + }, + { + "epoch": 12.97, + "grad_norm": 0.5078125, + "learning_rate": 0.0003028428711240126, + "loss": 0.185, + "step": 313100 + }, + { + "epoch": 12.97, + "grad_norm": 2.21875, + "learning_rate": 0.00030283227091799106, + "loss": 0.2188, + "step": 313110 + }, + { + "epoch": 12.97, + "grad_norm": 0.640625, + "learning_rate": 0.00030282167061254487, + "loss": 0.2219, + "step": 313120 + }, + { + "epoch": 12.97, + "grad_norm": 1.2578125, + "learning_rate": 0.00030281107020769404, + "loss": 0.2082, + "step": 313130 + }, + { + "epoch": 12.97, + "grad_norm": 0.640625, + "learning_rate": 0.0003028004697034584, + "loss": 0.2045, + "step": 313140 + }, + { + "epoch": 12.97, + "grad_norm": 0.447265625, + "learning_rate": 0.0003027898690998581, + "loss": 0.2024, + "step": 313150 + }, + { + "epoch": 12.97, + "grad_norm": 0.46484375, + "learning_rate": 0.0003027792683969129, + "loss": 0.1822, + "step": 313160 + }, + { + "epoch": 12.97, + "grad_norm": 0.30078125, + "learning_rate": 0.0003027686675946428, + "loss": 0.2144, + "step": 313170 + }, + { + "epoch": 12.97, + "grad_norm": 1.203125, + "learning_rate": 0.0003027580666930677, + "loss": 0.2144, + "step": 313180 + }, + { + "epoch": 12.97, + "grad_norm": 1.2265625, + "learning_rate": 0.0003027474656922077, + "loss": 0.1289, + "step": 313190 + }, + { + "epoch": 12.97, + "grad_norm": 0.859375, + "learning_rate": 0.0003027368645920827, + "loss": 0.1539, + "step": 313200 + }, + { + "epoch": 12.97, + "grad_norm": 0.34765625, + "learning_rate": 0.0003027262633927126, + "loss": 0.1835, + "step": 313210 + }, + { + "epoch": 12.97, + "grad_norm": 0.27734375, + "learning_rate": 0.0003027156620941173, + "loss": 0.2082, + "step": 313220 + }, + { + "epoch": 12.97, + "grad_norm": 0.53125, + "learning_rate": 0.0003027050606963168, + "loss": 0.1818, + "step": 313230 + }, + { + "epoch": 12.97, + "grad_norm": 0.984375, + "learning_rate": 0.0003026944591993312, + "loss": 0.1712, + "step": 313240 + }, + { + "epoch": 12.97, + "grad_norm": 0.68359375, + "learning_rate": 0.00030268385760318025, + "loss": 0.2009, + "step": 313250 + }, + { + "epoch": 12.98, + "grad_norm": 1.25, + "learning_rate": 0.0003026732559078839, + "loss": 0.2078, + "step": 313260 + }, + { + "epoch": 12.98, + "grad_norm": 1.03125, + "learning_rate": 0.0003026626541134622, + "loss": 0.1748, + "step": 313270 + }, + { + "epoch": 12.98, + "grad_norm": 1.2734375, + "learning_rate": 0.0003026520522199352, + "loss": 0.2129, + "step": 313280 + }, + { + "epoch": 12.98, + "grad_norm": 0.71875, + "learning_rate": 0.0003026414502273226, + "loss": 0.1781, + "step": 313290 + }, + { + "epoch": 12.98, + "grad_norm": 1.203125, + "learning_rate": 0.00030263084813564453, + "loss": 0.177, + "step": 313300 + }, + { + "epoch": 12.98, + "grad_norm": 0.2109375, + "learning_rate": 0.0003026202459449208, + "loss": 0.1661, + "step": 313310 + }, + { + "epoch": 12.98, + "grad_norm": 0.8359375, + "learning_rate": 0.00030260964365517155, + "loss": 0.2079, + "step": 313320 + }, + { + "epoch": 12.98, + "grad_norm": 0.0, + "learning_rate": 0.0003025990412664166, + "loss": 0.1924, + "step": 313330 + }, + { + "epoch": 12.98, + "grad_norm": 1.0859375, + "learning_rate": 0.00030258843877867595, + "loss": 0.1093, + "step": 313340 + }, + { + "epoch": 12.98, + "grad_norm": 0.59375, + "learning_rate": 0.00030257783619196945, + "loss": 0.1538, + "step": 313350 + }, + { + "epoch": 12.98, + "grad_norm": 1.234375, + "learning_rate": 0.0003025672335063172, + "loss": 0.1959, + "step": 313360 + }, + { + "epoch": 12.98, + "grad_norm": 1.265625, + "learning_rate": 0.0003025566307217391, + "loss": 0.1959, + "step": 313370 + }, + { + "epoch": 12.98, + "grad_norm": 0.78515625, + "learning_rate": 0.00030254602783825514, + "loss": 0.2191, + "step": 313380 + }, + { + "epoch": 12.98, + "grad_norm": 0.7890625, + "learning_rate": 0.00030253542485588516, + "loss": 0.2154, + "step": 313390 + }, + { + "epoch": 12.98, + "grad_norm": 1.4296875, + "learning_rate": 0.00030252482177464923, + "loss": 0.21, + "step": 313400 + }, + { + "epoch": 12.98, + "grad_norm": 1.21875, + "learning_rate": 0.00030251421859456724, + "loss": 0.2101, + "step": 313410 + }, + { + "epoch": 12.98, + "grad_norm": 0.6015625, + "learning_rate": 0.00030250361531565913, + "loss": 0.2238, + "step": 313420 + }, + { + "epoch": 12.98, + "grad_norm": 0.5703125, + "learning_rate": 0.00030249301193794486, + "loss": 0.2593, + "step": 313430 + }, + { + "epoch": 12.98, + "grad_norm": 0.45703125, + "learning_rate": 0.0003024824084614445, + "loss": 0.2205, + "step": 313440 + }, + { + "epoch": 12.98, + "grad_norm": 0.9140625, + "learning_rate": 0.0003024718048861779, + "loss": 0.1705, + "step": 313450 + }, + { + "epoch": 12.98, + "grad_norm": 1.2734375, + "learning_rate": 0.000302461201212165, + "loss": 0.1936, + "step": 313460 + }, + { + "epoch": 12.98, + "grad_norm": 1.5234375, + "learning_rate": 0.0003024505974394258, + "loss": 0.1594, + "step": 313470 + }, + { + "epoch": 12.98, + "grad_norm": 0.640625, + "learning_rate": 0.0003024399935679802, + "loss": 0.1917, + "step": 313480 + }, + { + "epoch": 12.98, + "grad_norm": 0.67578125, + "learning_rate": 0.0003024293895978483, + "loss": 0.2127, + "step": 313490 + }, + { + "epoch": 12.99, + "grad_norm": 0.6953125, + "learning_rate": 0.00030241878552904984, + "loss": 0.2074, + "step": 313500 + }, + { + "epoch": 12.99, + "grad_norm": 0.41796875, + "learning_rate": 0.00030240818136160493, + "loss": 0.2056, + "step": 313510 + }, + { + "epoch": 12.99, + "grad_norm": 1.046875, + "learning_rate": 0.00030239757709553345, + "loss": 0.1507, + "step": 313520 + }, + { + "epoch": 12.99, + "grad_norm": 0.6015625, + "learning_rate": 0.0003023869727308555, + "loss": 0.2213, + "step": 313530 + }, + { + "epoch": 12.99, + "grad_norm": 0.96484375, + "learning_rate": 0.00030237636826759077, + "loss": 0.1859, + "step": 313540 + }, + { + "epoch": 12.99, + "grad_norm": 0.64453125, + "learning_rate": 0.0003023657637057594, + "loss": 0.1743, + "step": 313550 + }, + { + "epoch": 12.99, + "grad_norm": 0.43359375, + "learning_rate": 0.00030235515904538133, + "loss": 0.1541, + "step": 313560 + }, + { + "epoch": 12.99, + "grad_norm": 0.68359375, + "learning_rate": 0.00030234455428647654, + "loss": 0.2508, + "step": 313570 + }, + { + "epoch": 12.99, + "grad_norm": 0.76953125, + "learning_rate": 0.000302333949429065, + "loss": 0.2188, + "step": 313580 + }, + { + "epoch": 12.99, + "grad_norm": 0.5078125, + "learning_rate": 0.0003023233444731666, + "loss": 0.1864, + "step": 313590 + }, + { + "epoch": 12.99, + "grad_norm": 0.55859375, + "learning_rate": 0.0003023127394188012, + "loss": 0.2019, + "step": 313600 + }, + { + "epoch": 12.99, + "grad_norm": 0.83984375, + "learning_rate": 0.000302302134265989, + "loss": 0.1612, + "step": 313610 + }, + { + "epoch": 12.99, + "grad_norm": 1.71875, + "learning_rate": 0.00030229152901474975, + "loss": 0.1945, + "step": 313620 + }, + { + "epoch": 12.99, + "grad_norm": 1.046875, + "learning_rate": 0.0003022809236651036, + "loss": 0.195, + "step": 313630 + }, + { + "epoch": 12.99, + "grad_norm": 0.87109375, + "learning_rate": 0.0003022703182170703, + "loss": 0.1676, + "step": 313640 + }, + { + "epoch": 12.99, + "grad_norm": 0.416015625, + "learning_rate": 0.00030225971267066985, + "loss": 0.1933, + "step": 313650 + }, + { + "epoch": 12.99, + "grad_norm": 1.4296875, + "learning_rate": 0.0003022491070259224, + "loss": 0.2228, + "step": 313660 + }, + { + "epoch": 12.99, + "grad_norm": 0.7890625, + "learning_rate": 0.00030223850128284767, + "loss": 0.2003, + "step": 313670 + }, + { + "epoch": 12.99, + "grad_norm": 1.1640625, + "learning_rate": 0.0003022278954414658, + "loss": 0.185, + "step": 313680 + }, + { + "epoch": 12.99, + "grad_norm": 0.93359375, + "learning_rate": 0.0003022172895017967, + "loss": 0.1918, + "step": 313690 + }, + { + "epoch": 12.99, + "grad_norm": 0.515625, + "learning_rate": 0.0003022066834638602, + "loss": 0.1785, + "step": 313700 + }, + { + "epoch": 12.99, + "grad_norm": 1.0859375, + "learning_rate": 0.0003021960773276764, + "loss": 0.193, + "step": 313710 + }, + { + "epoch": 12.99, + "grad_norm": 0.486328125, + "learning_rate": 0.0003021854710932652, + "loss": 0.1948, + "step": 313720 + }, + { + "epoch": 12.99, + "grad_norm": 0.38671875, + "learning_rate": 0.0003021748647606466, + "loss": 0.1823, + "step": 313730 + }, + { + "epoch": 13.0, + "grad_norm": 0.71484375, + "learning_rate": 0.0003021642583298406, + "loss": 0.2216, + "step": 313740 + }, + { + "epoch": 13.0, + "grad_norm": 1.125, + "learning_rate": 0.00030215365180086696, + "loss": 0.1605, + "step": 313750 + }, + { + "epoch": 13.0, + "grad_norm": 0.84765625, + "learning_rate": 0.00030214304517374587, + "loss": 0.189, + "step": 313760 + }, + { + "epoch": 13.0, + "grad_norm": 0.482421875, + "learning_rate": 0.00030213243844849717, + "loss": 0.1976, + "step": 313770 + }, + { + "epoch": 13.0, + "grad_norm": 0.62109375, + "learning_rate": 0.00030212183162514085, + "loss": 0.2097, + "step": 313780 + }, + { + "epoch": 13.0, + "grad_norm": 0.76171875, + "learning_rate": 0.0003021112247036969, + "loss": 0.1814, + "step": 313790 + }, + { + "epoch": 13.0, + "grad_norm": 2.375, + "learning_rate": 0.00030210061768418524, + "loss": 0.179, + "step": 313800 + }, + { + "epoch": 13.0, + "grad_norm": 0.388671875, + "learning_rate": 0.00030209001056662584, + "loss": 0.157, + "step": 313810 + }, + { + "epoch": 13.0, + "grad_norm": 0.62890625, + "learning_rate": 0.0003020794033510387, + "loss": 0.2022, + "step": 313820 + }, + { + "epoch": 13.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.0003020687960374437, + "loss": 0.1604, + "step": 313830 + }, + { + "epoch": 13.0, + "grad_norm": 0.384765625, + "learning_rate": 0.0003020581886258608, + "loss": 0.1918, + "step": 313840 + }, + { + "epoch": 13.0, + "grad_norm": 0.32421875, + "learning_rate": 0.00030204758111631013, + "loss": 0.1788, + "step": 313850 + }, + { + "epoch": 13.0, + "grad_norm": 0.5859375, + "learning_rate": 0.0003020369735088114, + "loss": 0.2173, + "step": 313860 + }, + { + "epoch": 13.0, + "grad_norm": 0.7265625, + "learning_rate": 0.0003020263658033848, + "loss": 0.1736, + "step": 313870 + }, + { + "epoch": 13.0, + "grad_norm": 0.5546875, + "learning_rate": 0.0003020157580000502, + "loss": 0.2014, + "step": 313880 + }, + { + "epoch": 13.0, + "grad_norm": 1.484375, + "learning_rate": 0.00030200515009882757, + "loss": 0.1585, + "step": 313890 + }, + { + "epoch": 13.0, + "grad_norm": 2.09375, + "learning_rate": 0.0003019945420997369, + "loss": 0.1597, + "step": 313900 + }, + { + "epoch": 13.0, + "grad_norm": 1.8359375, + "learning_rate": 0.000301983934002798, + "loss": 0.2206, + "step": 313910 + }, + { + "epoch": 13.0, + "grad_norm": 0.7109375, + "learning_rate": 0.0003019733258080309, + "loss": 0.191, + "step": 313920 + }, + { + "epoch": 13.0, + "grad_norm": 0.53125, + "learning_rate": 0.0003019627175154558, + "loss": 0.227, + "step": 313930 + }, + { + "epoch": 13.0, + "grad_norm": 0.9921875, + "learning_rate": 0.00030195210912509234, + "loss": 0.1937, + "step": 313940 + }, + { + "epoch": 13.0, + "grad_norm": 1.6171875, + "learning_rate": 0.0003019415006369607, + "loss": 0.1694, + "step": 313950 + }, + { + "epoch": 13.0, + "grad_norm": 0.671875, + "learning_rate": 0.00030193089205108074, + "loss": 0.1447, + "step": 313960 + }, + { + "epoch": 13.0, + "grad_norm": 1.2265625, + "learning_rate": 0.00030192028336747245, + "loss": 0.192, + "step": 313970 + }, + { + "epoch": 13.01, + "grad_norm": 0.64453125, + "learning_rate": 0.0003019096745861558, + "loss": 0.1844, + "step": 313980 + }, + { + "epoch": 13.01, + "grad_norm": 0.486328125, + "learning_rate": 0.00030189906570715077, + "loss": 0.1852, + "step": 313990 + }, + { + "epoch": 13.01, + "grad_norm": 0.80859375, + "learning_rate": 0.0003018884567304772, + "loss": 0.2092, + "step": 314000 + }, + { + "epoch": 13.01, + "grad_norm": 1.4765625, + "learning_rate": 0.00030187784765615527, + "loss": 0.1528, + "step": 314010 + }, + { + "epoch": 13.01, + "grad_norm": 0.7265625, + "learning_rate": 0.00030186723848420476, + "loss": 0.1987, + "step": 314020 + }, + { + "epoch": 13.01, + "grad_norm": 0.71484375, + "learning_rate": 0.0003018566292146458, + "loss": 0.2079, + "step": 314030 + }, + { + "epoch": 13.01, + "grad_norm": 1.5390625, + "learning_rate": 0.00030184601984749814, + "loss": 0.1711, + "step": 314040 + }, + { + "epoch": 13.01, + "grad_norm": 1.3203125, + "learning_rate": 0.00030183541038278193, + "loss": 0.1582, + "step": 314050 + }, + { + "epoch": 13.01, + "grad_norm": 0.8203125, + "learning_rate": 0.0003018248008205172, + "loss": 0.239, + "step": 314060 + }, + { + "epoch": 13.01, + "grad_norm": 0.921875, + "learning_rate": 0.00030181419116072356, + "loss": 0.182, + "step": 314070 + }, + { + "epoch": 13.01, + "grad_norm": 1.09375, + "learning_rate": 0.00030180358140342135, + "loss": 0.2219, + "step": 314080 + }, + { + "epoch": 13.01, + "grad_norm": 0.53515625, + "learning_rate": 0.0003017929715486304, + "loss": 0.1796, + "step": 314090 + }, + { + "epoch": 13.01, + "grad_norm": 0.41796875, + "learning_rate": 0.0003017823615963706, + "loss": 0.2195, + "step": 314100 + }, + { + "epoch": 13.01, + "grad_norm": 1.46875, + "learning_rate": 0.00030177175154666203, + "loss": 0.1719, + "step": 314110 + }, + { + "epoch": 13.01, + "grad_norm": 1.3046875, + "learning_rate": 0.0003017611413995246, + "loss": 0.1746, + "step": 314120 + }, + { + "epoch": 13.01, + "grad_norm": 0.51171875, + "learning_rate": 0.0003017505311549782, + "loss": 0.2076, + "step": 314130 + }, + { + "epoch": 13.01, + "grad_norm": 2.15625, + "learning_rate": 0.0003017399208130431, + "loss": 0.2231, + "step": 314140 + }, + { + "epoch": 13.01, + "grad_norm": 0.8515625, + "learning_rate": 0.00030172931037373885, + "loss": 0.1566, + "step": 314150 + }, + { + "epoch": 13.01, + "grad_norm": 0.71484375, + "learning_rate": 0.00030171869983708577, + "loss": 0.1815, + "step": 314160 + }, + { + "epoch": 13.01, + "grad_norm": 1.1953125, + "learning_rate": 0.00030170808920310365, + "loss": 0.211, + "step": 314170 + }, + { + "epoch": 13.01, + "grad_norm": 0.85546875, + "learning_rate": 0.00030169747847181243, + "loss": 0.1798, + "step": 314180 + }, + { + "epoch": 13.01, + "grad_norm": 0.5703125, + "learning_rate": 0.00030168686764323223, + "loss": 0.2239, + "step": 314190 + }, + { + "epoch": 13.01, + "grad_norm": 0.578125, + "learning_rate": 0.0003016762567173829, + "loss": 0.1593, + "step": 314200 + }, + { + "epoch": 13.01, + "grad_norm": 0.55859375, + "learning_rate": 0.00030166564569428434, + "loss": 0.2338, + "step": 314210 + }, + { + "epoch": 13.01, + "grad_norm": 0.0, + "learning_rate": 0.0003016550345739567, + "loss": 0.154, + "step": 314220 + }, + { + "epoch": 13.02, + "grad_norm": 2.71875, + "learning_rate": 0.0003016444233564198, + "loss": 0.2066, + "step": 314230 + }, + { + "epoch": 13.02, + "grad_norm": 0.98046875, + "learning_rate": 0.00030163381204169375, + "loss": 0.2236, + "step": 314240 + }, + { + "epoch": 13.02, + "grad_norm": 0.58203125, + "learning_rate": 0.00030162320062979847, + "loss": 0.1943, + "step": 314250 + }, + { + "epoch": 13.02, + "grad_norm": 0.71875, + "learning_rate": 0.0003016125891207538, + "loss": 0.1841, + "step": 314260 + }, + { + "epoch": 13.02, + "grad_norm": 0.41796875, + "learning_rate": 0.00030160197751457983, + "loss": 0.1702, + "step": 314270 + }, + { + "epoch": 13.02, + "grad_norm": 0.75, + "learning_rate": 0.00030159136581129654, + "loss": 0.2195, + "step": 314280 + }, + { + "epoch": 13.02, + "grad_norm": 1.0703125, + "learning_rate": 0.00030158075401092387, + "loss": 0.1488, + "step": 314290 + }, + { + "epoch": 13.02, + "grad_norm": 0.92578125, + "learning_rate": 0.0003015701421134818, + "loss": 0.1944, + "step": 314300 + }, + { + "epoch": 13.02, + "grad_norm": 2.109375, + "learning_rate": 0.0003015595301189902, + "loss": 0.2359, + "step": 314310 + }, + { + "epoch": 13.02, + "grad_norm": 1.359375, + "learning_rate": 0.0003015489180274693, + "loss": 0.2051, + "step": 314320 + }, + { + "epoch": 13.02, + "grad_norm": 0.796875, + "learning_rate": 0.00030153830583893877, + "loss": 0.1898, + "step": 314330 + }, + { + "epoch": 13.02, + "grad_norm": 1.0859375, + "learning_rate": 0.00030152769355341874, + "loss": 0.236, + "step": 314340 + }, + { + "epoch": 13.02, + "grad_norm": 0.341796875, + "learning_rate": 0.00030151708117092925, + "loss": 0.1771, + "step": 314350 + }, + { + "epoch": 13.02, + "grad_norm": 0.7109375, + "learning_rate": 0.0003015064686914901, + "loss": 0.2311, + "step": 314360 + }, + { + "epoch": 13.02, + "grad_norm": 1.671875, + "learning_rate": 0.0003014958561151213, + "loss": 0.1979, + "step": 314370 + }, + { + "epoch": 13.02, + "grad_norm": 0.9609375, + "learning_rate": 0.0003014852434418429, + "loss": 0.1944, + "step": 314380 + }, + { + "epoch": 13.02, + "grad_norm": 0.52734375, + "learning_rate": 0.0003014746306716748, + "loss": 0.1781, + "step": 314390 + }, + { + "epoch": 13.02, + "grad_norm": 0.953125, + "learning_rate": 0.00030146401780463704, + "loss": 0.1576, + "step": 314400 + }, + { + "epoch": 13.02, + "grad_norm": 0.66015625, + "learning_rate": 0.00030145340484074955, + "loss": 0.2311, + "step": 314410 + }, + { + "epoch": 13.02, + "grad_norm": 0.78125, + "learning_rate": 0.0003014427917800323, + "loss": 0.1533, + "step": 314420 + }, + { + "epoch": 13.02, + "grad_norm": 0.93359375, + "learning_rate": 0.0003014321786225053, + "loss": 0.2126, + "step": 314430 + }, + { + "epoch": 13.02, + "grad_norm": 0.90234375, + "learning_rate": 0.0003014215653681885, + "loss": 0.184, + "step": 314440 + }, + { + "epoch": 13.02, + "grad_norm": 1.0859375, + "learning_rate": 0.00030141095201710176, + "loss": 0.1928, + "step": 314450 + }, + { + "epoch": 13.02, + "grad_norm": 1.03125, + "learning_rate": 0.00030140033856926524, + "loss": 0.1646, + "step": 314460 + }, + { + "epoch": 13.03, + "grad_norm": 1.1171875, + "learning_rate": 0.00030138972502469886, + "loss": 0.1698, + "step": 314470 + }, + { + "epoch": 13.03, + "grad_norm": 0.8125, + "learning_rate": 0.00030137911138342256, + "loss": 0.2125, + "step": 314480 + }, + { + "epoch": 13.03, + "grad_norm": 1.546875, + "learning_rate": 0.00030136849764545625, + "loss": 0.1638, + "step": 314490 + }, + { + "epoch": 13.03, + "grad_norm": 0.77734375, + "learning_rate": 0.0003013578838108201, + "loss": 0.1738, + "step": 314500 + }, + { + "epoch": 13.03, + "grad_norm": 0.77734375, + "learning_rate": 0.00030134726987953385, + "loss": 0.2542, + "step": 314510 + }, + { + "epoch": 13.03, + "grad_norm": 0.62890625, + "learning_rate": 0.0003013366558516176, + "loss": 0.1622, + "step": 314520 + }, + { + "epoch": 13.03, + "grad_norm": 0.609375, + "learning_rate": 0.0003013260417270913, + "loss": 0.2265, + "step": 314530 + }, + { + "epoch": 13.03, + "grad_norm": 0.3125, + "learning_rate": 0.00030131542750597504, + "loss": 0.1535, + "step": 314540 + }, + { + "epoch": 13.03, + "grad_norm": 2.6875, + "learning_rate": 0.0003013048131882886, + "loss": 0.1834, + "step": 314550 + }, + { + "epoch": 13.03, + "grad_norm": 0.6328125, + "learning_rate": 0.00030129419877405203, + "loss": 0.2562, + "step": 314560 + }, + { + "epoch": 13.03, + "grad_norm": 4.4375, + "learning_rate": 0.00030128358426328533, + "loss": 0.2048, + "step": 314570 + }, + { + "epoch": 13.03, + "grad_norm": 0.67578125, + "learning_rate": 0.0003012729696560084, + "loss": 0.194, + "step": 314580 + }, + { + "epoch": 13.03, + "grad_norm": 0.5234375, + "learning_rate": 0.00030126235495224143, + "loss": 0.2021, + "step": 314590 + }, + { + "epoch": 13.03, + "grad_norm": 0.55859375, + "learning_rate": 0.00030125174015200413, + "loss": 0.207, + "step": 314600 + }, + { + "epoch": 13.03, + "grad_norm": 0.734375, + "learning_rate": 0.0003012411252553166, + "loss": 0.1746, + "step": 314610 + }, + { + "epoch": 13.03, + "grad_norm": 0.392578125, + "learning_rate": 0.00030123051026219886, + "loss": 0.2084, + "step": 314620 + }, + { + "epoch": 13.03, + "grad_norm": 1.1484375, + "learning_rate": 0.0003012198951726707, + "loss": 0.1905, + "step": 314630 + }, + { + "epoch": 13.03, + "grad_norm": 0.69140625, + "learning_rate": 0.0003012092799867524, + "loss": 0.1855, + "step": 314640 + }, + { + "epoch": 13.03, + "grad_norm": 0.58203125, + "learning_rate": 0.00030119866470446364, + "loss": 0.1797, + "step": 314650 + }, + { + "epoch": 13.03, + "grad_norm": 0.546875, + "learning_rate": 0.0003011880493258246, + "loss": 0.225, + "step": 314660 + }, + { + "epoch": 13.03, + "grad_norm": 0.78515625, + "learning_rate": 0.00030117743385085516, + "loss": 0.2261, + "step": 314670 + }, + { + "epoch": 13.03, + "grad_norm": 0.2578125, + "learning_rate": 0.0003011668182795753, + "loss": 0.1883, + "step": 314680 + }, + { + "epoch": 13.03, + "grad_norm": 0.8515625, + "learning_rate": 0.00030115620261200504, + "loss": 0.214, + "step": 314690 + }, + { + "epoch": 13.03, + "grad_norm": 1.515625, + "learning_rate": 0.0003011455868481643, + "loss": 0.191, + "step": 314700 + }, + { + "epoch": 13.04, + "grad_norm": 0.6328125, + "learning_rate": 0.0003011349709880731, + "loss": 0.1568, + "step": 314710 + }, + { + "epoch": 13.04, + "grad_norm": 0.58203125, + "learning_rate": 0.00030112435503175145, + "loss": 0.1724, + "step": 314720 + }, + { + "epoch": 13.04, + "grad_norm": 0.77734375, + "learning_rate": 0.0003011137389792192, + "loss": 0.1924, + "step": 314730 + }, + { + "epoch": 13.04, + "grad_norm": 0.609375, + "learning_rate": 0.00030110312283049647, + "loss": 0.2146, + "step": 314740 + }, + { + "epoch": 13.04, + "grad_norm": 0.8515625, + "learning_rate": 0.00030109250658560315, + "loss": 0.1768, + "step": 314750 + }, + { + "epoch": 13.04, + "grad_norm": 0.66015625, + "learning_rate": 0.0003010818902445593, + "loss": 0.2189, + "step": 314760 + }, + { + "epoch": 13.04, + "grad_norm": 0.451171875, + "learning_rate": 0.0003010712738073848, + "loss": 0.1282, + "step": 314770 + }, + { + "epoch": 13.04, + "grad_norm": 1.078125, + "learning_rate": 0.0003010606572740997, + "loss": 0.2319, + "step": 314780 + }, + { + "epoch": 13.04, + "grad_norm": 0.455078125, + "learning_rate": 0.0003010500406447239, + "loss": 0.1538, + "step": 314790 + }, + { + "epoch": 13.04, + "grad_norm": 0.70703125, + "learning_rate": 0.0003010394239192775, + "loss": 0.1856, + "step": 314800 + }, + { + "epoch": 13.04, + "grad_norm": 0.546875, + "learning_rate": 0.00030102880709778043, + "loss": 0.1612, + "step": 314810 + }, + { + "epoch": 13.04, + "grad_norm": 0.484375, + "learning_rate": 0.00030101819018025263, + "loss": 0.1926, + "step": 314820 + }, + { + "epoch": 13.04, + "grad_norm": 0.828125, + "learning_rate": 0.00030100757316671407, + "loss": 0.1624, + "step": 314830 + }, + { + "epoch": 13.04, + "grad_norm": 1.171875, + "learning_rate": 0.00030099695605718485, + "loss": 0.2178, + "step": 314840 + }, + { + "epoch": 13.04, + "grad_norm": 1.078125, + "learning_rate": 0.0003009863388516848, + "loss": 0.2448, + "step": 314850 + }, + { + "epoch": 13.04, + "grad_norm": 0.4453125, + "learning_rate": 0.00030097572155023393, + "loss": 0.1865, + "step": 314860 + }, + { + "epoch": 13.04, + "grad_norm": 0.53515625, + "learning_rate": 0.00030096510415285235, + "loss": 0.1629, + "step": 314870 + }, + { + "epoch": 13.04, + "grad_norm": 1.109375, + "learning_rate": 0.00030095448665955985, + "loss": 0.2338, + "step": 314880 + }, + { + "epoch": 13.04, + "grad_norm": 0.3984375, + "learning_rate": 0.0003009438690703766, + "loss": 0.1669, + "step": 314890 + }, + { + "epoch": 13.04, + "grad_norm": 1.5234375, + "learning_rate": 0.00030093325138532247, + "loss": 0.1855, + "step": 314900 + }, + { + "epoch": 13.04, + "grad_norm": 0.8671875, + "learning_rate": 0.00030092263360441743, + "loss": 0.2553, + "step": 314910 + }, + { + "epoch": 13.04, + "grad_norm": 1.0234375, + "learning_rate": 0.00030091201572768155, + "loss": 0.2238, + "step": 314920 + }, + { + "epoch": 13.04, + "grad_norm": 0.0, + "learning_rate": 0.0003009013977551347, + "loss": 0.1854, + "step": 314930 + }, + { + "epoch": 13.04, + "grad_norm": 0.5234375, + "learning_rate": 0.0003008907796867969, + "loss": 0.2061, + "step": 314940 + }, + { + "epoch": 13.05, + "grad_norm": 0.375, + "learning_rate": 0.00030088016152268813, + "loss": 0.2395, + "step": 314950 + }, + { + "epoch": 13.05, + "grad_norm": 0.98828125, + "learning_rate": 0.0003008695432628285, + "loss": 0.1951, + "step": 314960 + }, + { + "epoch": 13.05, + "grad_norm": 1.3359375, + "learning_rate": 0.00030085892490723786, + "loss": 0.1874, + "step": 314970 + }, + { + "epoch": 13.05, + "grad_norm": 0.9453125, + "learning_rate": 0.00030084830645593614, + "loss": 0.2161, + "step": 314980 + }, + { + "epoch": 13.05, + "grad_norm": 1.3046875, + "learning_rate": 0.00030083768790894344, + "loss": 0.1974, + "step": 314990 + }, + { + "epoch": 13.05, + "grad_norm": 0.6640625, + "learning_rate": 0.0003008270692662797, + "loss": 0.1699, + "step": 315000 + }, + { + "epoch": 13.05, + "grad_norm": 2.171875, + "learning_rate": 0.0003008164505279649, + "loss": 0.204, + "step": 315010 + }, + { + "epoch": 13.05, + "grad_norm": 0.9140625, + "learning_rate": 0.00030080583169401906, + "loss": 0.2125, + "step": 315020 + }, + { + "epoch": 13.05, + "grad_norm": 1.2421875, + "learning_rate": 0.0003007952127644621, + "loss": 0.1677, + "step": 315030 + }, + { + "epoch": 13.05, + "grad_norm": 0.73828125, + "learning_rate": 0.000300784593739314, + "loss": 0.2057, + "step": 315040 + }, + { + "epoch": 13.05, + "grad_norm": 0.59765625, + "learning_rate": 0.00030077397461859487, + "loss": 0.1821, + "step": 315050 + }, + { + "epoch": 13.05, + "grad_norm": 0.56640625, + "learning_rate": 0.00030076335540232455, + "loss": 0.2085, + "step": 315060 + }, + { + "epoch": 13.05, + "grad_norm": 1.171875, + "learning_rate": 0.000300752736090523, + "loss": 0.2151, + "step": 315070 + }, + { + "epoch": 13.05, + "grad_norm": 0.89453125, + "learning_rate": 0.0003007421166832104, + "loss": 0.2373, + "step": 315080 + }, + { + "epoch": 13.05, + "grad_norm": 1.078125, + "learning_rate": 0.0003007314971804065, + "loss": 0.192, + "step": 315090 + }, + { + "epoch": 13.05, + "grad_norm": 0.87109375, + "learning_rate": 0.0003007208775821315, + "loss": 0.174, + "step": 315100 + }, + { + "epoch": 13.05, + "grad_norm": 0.62109375, + "learning_rate": 0.0003007102578884053, + "loss": 0.171, + "step": 315110 + }, + { + "epoch": 13.05, + "grad_norm": 1.390625, + "learning_rate": 0.0003006996380992478, + "loss": 0.1559, + "step": 315120 + }, + { + "epoch": 13.05, + "grad_norm": 0.80078125, + "learning_rate": 0.0003006890182146791, + "loss": 0.2027, + "step": 315130 + }, + { + "epoch": 13.05, + "grad_norm": 0.66796875, + "learning_rate": 0.0003006783982347191, + "loss": 0.1263, + "step": 315140 + }, + { + "epoch": 13.05, + "grad_norm": 0.890625, + "learning_rate": 0.00030066777815938785, + "loss": 0.2271, + "step": 315150 + }, + { + "epoch": 13.05, + "grad_norm": 1.1015625, + "learning_rate": 0.00030065715798870536, + "loss": 0.172, + "step": 315160 + }, + { + "epoch": 13.05, + "grad_norm": 0.486328125, + "learning_rate": 0.00030064653772269147, + "loss": 0.173, + "step": 315170 + }, + { + "epoch": 13.05, + "grad_norm": 2.140625, + "learning_rate": 0.00030063591736136634, + "loss": 0.1851, + "step": 315180 + }, + { + "epoch": 13.06, + "grad_norm": 1.59375, + "learning_rate": 0.00030062529690474987, + "loss": 0.1807, + "step": 315190 + }, + { + "epoch": 13.06, + "grad_norm": 0.96484375, + "learning_rate": 0.00030061467635286206, + "loss": 0.208, + "step": 315200 + }, + { + "epoch": 13.06, + "grad_norm": 0.640625, + "learning_rate": 0.0003006040557057229, + "loss": 0.1628, + "step": 315210 + }, + { + "epoch": 13.06, + "grad_norm": 0.90625, + "learning_rate": 0.00030059343496335227, + "loss": 0.1325, + "step": 315220 + }, + { + "epoch": 13.06, + "grad_norm": 0.9609375, + "learning_rate": 0.0003005828141257704, + "loss": 0.2347, + "step": 315230 + }, + { + "epoch": 13.06, + "grad_norm": 0.62890625, + "learning_rate": 0.00030057219319299717, + "loss": 0.1874, + "step": 315240 + }, + { + "epoch": 13.06, + "grad_norm": 1.15625, + "learning_rate": 0.00030056157216505234, + "loss": 0.2152, + "step": 315250 + }, + { + "epoch": 13.06, + "grad_norm": 0.8828125, + "learning_rate": 0.0003005509510419563, + "loss": 0.1812, + "step": 315260 + }, + { + "epoch": 13.06, + "grad_norm": 0.373046875, + "learning_rate": 0.0003005403298237287, + "loss": 0.2644, + "step": 315270 + }, + { + "epoch": 13.06, + "grad_norm": 1.1015625, + "learning_rate": 0.00030052970851038964, + "loss": 0.1823, + "step": 315280 + }, + { + "epoch": 13.06, + "grad_norm": 0.455078125, + "learning_rate": 0.00030051908710195923, + "loss": 0.1813, + "step": 315290 + }, + { + "epoch": 13.06, + "grad_norm": 1.3125, + "learning_rate": 0.00030050846559845725, + "loss": 0.1474, + "step": 315300 + }, + { + "epoch": 13.06, + "grad_norm": 1.421875, + "learning_rate": 0.00030049784399990387, + "loss": 0.1799, + "step": 315310 + }, + { + "epoch": 13.06, + "grad_norm": 0.8046875, + "learning_rate": 0.00030048722230631897, + "loss": 0.2109, + "step": 315320 + }, + { + "epoch": 13.06, + "grad_norm": 0.984375, + "learning_rate": 0.0003004766005177225, + "loss": 0.1557, + "step": 315330 + }, + { + "epoch": 13.06, + "grad_norm": 0.84765625, + "learning_rate": 0.00030046597863413464, + "loss": 0.2143, + "step": 315340 + }, + { + "epoch": 13.06, + "grad_norm": 1.0234375, + "learning_rate": 0.0003004553566555752, + "loss": 0.1633, + "step": 315350 + }, + { + "epoch": 13.06, + "grad_norm": 0.27734375, + "learning_rate": 0.0003004447345820642, + "loss": 0.1733, + "step": 315360 + }, + { + "epoch": 13.06, + "grad_norm": 0.49609375, + "learning_rate": 0.00030043411241362174, + "loss": 0.2255, + "step": 315370 + }, + { + "epoch": 13.06, + "grad_norm": 0.5390625, + "learning_rate": 0.0003004234901502676, + "loss": 0.1564, + "step": 315380 + }, + { + "epoch": 13.06, + "grad_norm": 2.203125, + "learning_rate": 0.00030041286779202206, + "loss": 0.1855, + "step": 315390 + }, + { + "epoch": 13.06, + "grad_norm": 1.09375, + "learning_rate": 0.0003004022453389048, + "loss": 0.1424, + "step": 315400 + }, + { + "epoch": 13.06, + "grad_norm": 0.6953125, + "learning_rate": 0.000300391622790936, + "loss": 0.172, + "step": 315410 + }, + { + "epoch": 13.06, + "grad_norm": 1.1875, + "learning_rate": 0.0003003810001481357, + "loss": 0.1917, + "step": 315420 + }, + { + "epoch": 13.07, + "grad_norm": 0.53125, + "learning_rate": 0.0003003703774105237, + "loss": 0.2006, + "step": 315430 + }, + { + "epoch": 13.07, + "grad_norm": 1.1171875, + "learning_rate": 0.0003003597545781201, + "loss": 0.1974, + "step": 315440 + }, + { + "epoch": 13.07, + "grad_norm": 0.6796875, + "learning_rate": 0.0003003491316509449, + "loss": 0.2181, + "step": 315450 + }, + { + "epoch": 13.07, + "grad_norm": 1.3828125, + "learning_rate": 0.000300338508629018, + "loss": 0.1991, + "step": 315460 + }, + { + "epoch": 13.07, + "grad_norm": 0.765625, + "learning_rate": 0.0003003278855123596, + "loss": 0.2314, + "step": 315470 + }, + { + "epoch": 13.07, + "grad_norm": 0.5078125, + "learning_rate": 0.00030031726230098945, + "loss": 0.2074, + "step": 315480 + }, + { + "epoch": 13.07, + "grad_norm": 1.1953125, + "learning_rate": 0.0003003066389949277, + "loss": 0.1899, + "step": 315490 + }, + { + "epoch": 13.07, + "grad_norm": 0.65625, + "learning_rate": 0.00030029601559419426, + "loss": 0.1703, + "step": 315500 + }, + { + "epoch": 13.07, + "grad_norm": 0.5234375, + "learning_rate": 0.00030028539209880917, + "loss": 0.1645, + "step": 315510 + }, + { + "epoch": 13.07, + "grad_norm": 1.1171875, + "learning_rate": 0.00030027476850879236, + "loss": 0.1773, + "step": 315520 + }, + { + "epoch": 13.07, + "grad_norm": 1.0390625, + "learning_rate": 0.0003002641448241639, + "loss": 0.199, + "step": 315530 + }, + { + "epoch": 13.07, + "grad_norm": 0.609375, + "learning_rate": 0.00030025352104494374, + "loss": 0.1648, + "step": 315540 + }, + { + "epoch": 13.07, + "grad_norm": 1.1875, + "learning_rate": 0.00030024289717115187, + "loss": 0.1272, + "step": 315550 + }, + { + "epoch": 13.07, + "grad_norm": 1.1796875, + "learning_rate": 0.00030023227320280835, + "loss": 0.2102, + "step": 315560 + }, + { + "epoch": 13.07, + "grad_norm": 0.98828125, + "learning_rate": 0.00030022164913993303, + "loss": 0.1957, + "step": 315570 + }, + { + "epoch": 13.07, + "grad_norm": 0.92578125, + "learning_rate": 0.000300211024982546, + "loss": 0.2106, + "step": 315580 + }, + { + "epoch": 13.07, + "grad_norm": 0.55078125, + "learning_rate": 0.0003002004007306673, + "loss": 0.127, + "step": 315590 + }, + { + "epoch": 13.07, + "grad_norm": 0.494140625, + "learning_rate": 0.0003001897763843169, + "loss": 0.171, + "step": 315600 + }, + { + "epoch": 13.07, + "grad_norm": 0.0, + "learning_rate": 0.0003001791519435147, + "loss": 0.1798, + "step": 315610 + }, + { + "epoch": 13.07, + "grad_norm": 1.078125, + "learning_rate": 0.0003001685274082807, + "loss": 0.1998, + "step": 315620 + }, + { + "epoch": 13.07, + "grad_norm": 1.875, + "learning_rate": 0.000300157902778635, + "loss": 0.1628, + "step": 315630 + }, + { + "epoch": 13.07, + "grad_norm": 0.65234375, + "learning_rate": 0.0003001472780545976, + "loss": 0.2251, + "step": 315640 + }, + { + "epoch": 13.07, + "grad_norm": 1.3671875, + "learning_rate": 0.00030013665323618834, + "loss": 0.2359, + "step": 315650 + }, + { + "epoch": 13.07, + "grad_norm": 0.62109375, + "learning_rate": 0.0003001260283234274, + "loss": 0.2103, + "step": 315660 + }, + { + "epoch": 13.08, + "grad_norm": 1.03125, + "learning_rate": 0.00030011540331633464, + "loss": 0.1971, + "step": 315670 + }, + { + "epoch": 13.08, + "grad_norm": 0.63671875, + "learning_rate": 0.0003001047782149301, + "loss": 0.2021, + "step": 315680 + }, + { + "epoch": 13.08, + "grad_norm": 0.28125, + "learning_rate": 0.00030009415301923383, + "loss": 0.2079, + "step": 315690 + }, + { + "epoch": 13.08, + "grad_norm": 1.2890625, + "learning_rate": 0.00030008352772926565, + "loss": 0.212, + "step": 315700 + }, + { + "epoch": 13.08, + "grad_norm": 0.412109375, + "learning_rate": 0.0003000729023450458, + "loss": 0.2177, + "step": 315710 + }, + { + "epoch": 13.08, + "grad_norm": 1.25, + "learning_rate": 0.0003000622768665941, + "loss": 0.1766, + "step": 315720 + }, + { + "epoch": 13.08, + "grad_norm": 0.66015625, + "learning_rate": 0.00030005165129393064, + "loss": 0.171, + "step": 315730 + }, + { + "epoch": 13.08, + "grad_norm": 1.0859375, + "learning_rate": 0.00030004102562707535, + "loss": 0.2241, + "step": 315740 + }, + { + "epoch": 13.08, + "grad_norm": 0.58203125, + "learning_rate": 0.00030003039986604827, + "loss": 0.1429, + "step": 315750 + }, + { + "epoch": 13.08, + "grad_norm": 0.7734375, + "learning_rate": 0.0003000197740108694, + "loss": 0.2302, + "step": 315760 + }, + { + "epoch": 13.08, + "grad_norm": 0.8359375, + "learning_rate": 0.00030000914806155873, + "loss": 0.2292, + "step": 315770 + }, + { + "epoch": 13.08, + "grad_norm": 2.40625, + "learning_rate": 0.0002999985220181361, + "loss": 0.1446, + "step": 315780 + }, + { + "epoch": 13.08, + "grad_norm": 0.322265625, + "learning_rate": 0.0002999878958806218, + "loss": 0.2371, + "step": 315790 + }, + { + "epoch": 13.08, + "grad_norm": 0.5546875, + "learning_rate": 0.0002999772696490356, + "loss": 0.2085, + "step": 315800 + }, + { + "epoch": 13.08, + "grad_norm": 0.91796875, + "learning_rate": 0.0002999666433233976, + "loss": 0.2282, + "step": 315810 + }, + { + "epoch": 13.08, + "grad_norm": 0.59765625, + "learning_rate": 0.0002999560169037278, + "loss": 0.16, + "step": 315820 + }, + { + "epoch": 13.08, + "grad_norm": 1.9296875, + "learning_rate": 0.00029994539039004617, + "loss": 0.1687, + "step": 315830 + }, + { + "epoch": 13.08, + "grad_norm": 0.6640625, + "learning_rate": 0.00029993476378237265, + "loss": 0.1896, + "step": 315840 + }, + { + "epoch": 13.08, + "grad_norm": 0.32421875, + "learning_rate": 0.0002999241370807274, + "loss": 0.1789, + "step": 315850 + }, + { + "epoch": 13.08, + "grad_norm": 0.400390625, + "learning_rate": 0.00029991351028513014, + "loss": 0.2136, + "step": 315860 + }, + { + "epoch": 13.08, + "grad_norm": 1.0625, + "learning_rate": 0.00029990288339560125, + "loss": 0.1572, + "step": 315870 + }, + { + "epoch": 13.08, + "grad_norm": 1.046875, + "learning_rate": 0.0002998922564121604, + "loss": 0.1801, + "step": 315880 + }, + { + "epoch": 13.08, + "grad_norm": 0.8828125, + "learning_rate": 0.0002998816293348277, + "loss": 0.2267, + "step": 315890 + }, + { + "epoch": 13.08, + "grad_norm": 1.5, + "learning_rate": 0.0002998710021636233, + "loss": 0.1734, + "step": 315900 + }, + { + "epoch": 13.08, + "grad_norm": 1.421875, + "learning_rate": 0.0002998603748985669, + "loss": 0.1961, + "step": 315910 + }, + { + "epoch": 13.09, + "grad_norm": 1.46875, + "learning_rate": 0.0002998497475396787, + "loss": 0.1787, + "step": 315920 + }, + { + "epoch": 13.09, + "grad_norm": 0.93359375, + "learning_rate": 0.00029983912008697865, + "loss": 0.1756, + "step": 315930 + }, + { + "epoch": 13.09, + "grad_norm": 0.921875, + "learning_rate": 0.0002998284925404868, + "loss": 0.2492, + "step": 315940 + }, + { + "epoch": 13.09, + "grad_norm": 0.4140625, + "learning_rate": 0.00029981786490022313, + "loss": 0.1823, + "step": 315950 + }, + { + "epoch": 13.09, + "grad_norm": 0.77734375, + "learning_rate": 0.00029980723716620754, + "loss": 0.1791, + "step": 315960 + }, + { + "epoch": 13.09, + "grad_norm": 0.625, + "learning_rate": 0.00029979660933846013, + "loss": 0.1798, + "step": 315970 + }, + { + "epoch": 13.09, + "grad_norm": 0.54296875, + "learning_rate": 0.0002997859814170009, + "loss": 0.2035, + "step": 315980 + }, + { + "epoch": 13.09, + "grad_norm": 1.0703125, + "learning_rate": 0.0002997753534018498, + "loss": 0.2369, + "step": 315990 + }, + { + "epoch": 13.09, + "grad_norm": 0.412109375, + "learning_rate": 0.00029976472529302697, + "loss": 0.1985, + "step": 316000 + }, + { + "epoch": 13.09, + "grad_norm": 1.3828125, + "learning_rate": 0.0002997540970905522, + "loss": 0.1979, + "step": 316010 + }, + { + "epoch": 13.09, + "grad_norm": 0.5390625, + "learning_rate": 0.0002997434687944456, + "loss": 0.2382, + "step": 316020 + }, + { + "epoch": 13.09, + "grad_norm": 0.9140625, + "learning_rate": 0.0002997328404047272, + "loss": 0.1701, + "step": 316030 + }, + { + "epoch": 13.09, + "grad_norm": 0.71875, + "learning_rate": 0.0002997222119214169, + "loss": 0.2186, + "step": 316040 + }, + { + "epoch": 13.09, + "grad_norm": 1.3359375, + "learning_rate": 0.0002997115833445348, + "loss": 0.1494, + "step": 316050 + }, + { + "epoch": 13.09, + "grad_norm": 0.9765625, + "learning_rate": 0.00029970095467410086, + "loss": 0.188, + "step": 316060 + }, + { + "epoch": 13.09, + "grad_norm": 0.263671875, + "learning_rate": 0.0002996903259101351, + "loss": 0.2218, + "step": 316070 + }, + { + "epoch": 13.09, + "grad_norm": 0.86328125, + "learning_rate": 0.0002996796970526576, + "loss": 0.1911, + "step": 316080 + }, + { + "epoch": 13.09, + "grad_norm": 1.9921875, + "learning_rate": 0.00029966906810168815, + "loss": 0.1917, + "step": 316090 + }, + { + "epoch": 13.09, + "grad_norm": 0.73828125, + "learning_rate": 0.0002996584390572469, + "loss": 0.2278, + "step": 316100 + }, + { + "epoch": 13.09, + "grad_norm": 0.478515625, + "learning_rate": 0.0002996478099193539, + "loss": 0.1911, + "step": 316110 + }, + { + "epoch": 13.09, + "grad_norm": 1.3203125, + "learning_rate": 0.00029963718068802905, + "loss": 0.2162, + "step": 316120 + }, + { + "epoch": 13.09, + "grad_norm": 1.21875, + "learning_rate": 0.0002996265513632923, + "loss": 0.1941, + "step": 316130 + }, + { + "epoch": 13.09, + "grad_norm": 0.91796875, + "learning_rate": 0.00029961592194516386, + "loss": 0.1869, + "step": 316140 + }, + { + "epoch": 13.09, + "grad_norm": 1.21875, + "learning_rate": 0.00029960529243366354, + "loss": 0.2003, + "step": 316150 + }, + { + "epoch": 13.1, + "grad_norm": 0.7109375, + "learning_rate": 0.0002995946628288114, + "loss": 0.1927, + "step": 316160 + }, + { + "epoch": 13.1, + "grad_norm": 1.0390625, + "learning_rate": 0.00029958403313062756, + "loss": 0.2301, + "step": 316170 + }, + { + "epoch": 13.1, + "grad_norm": 1.046875, + "learning_rate": 0.00029957340333913187, + "loss": 0.2057, + "step": 316180 + }, + { + "epoch": 13.1, + "grad_norm": 0.51953125, + "learning_rate": 0.00029956277345434437, + "loss": 0.1867, + "step": 316190 + }, + { + "epoch": 13.1, + "grad_norm": 0.5859375, + "learning_rate": 0.00029955214347628513, + "loss": 0.1959, + "step": 316200 + }, + { + "epoch": 13.1, + "grad_norm": 0.302734375, + "learning_rate": 0.00029954151340497404, + "loss": 0.1848, + "step": 316210 + }, + { + "epoch": 13.1, + "grad_norm": 0.6875, + "learning_rate": 0.00029953088324043115, + "loss": 0.1486, + "step": 316220 + }, + { + "epoch": 13.1, + "grad_norm": 1.0625, + "learning_rate": 0.00029952025298267653, + "loss": 0.1981, + "step": 316230 + }, + { + "epoch": 13.1, + "grad_norm": 0.8359375, + "learning_rate": 0.0002995096226317302, + "loss": 0.2034, + "step": 316240 + }, + { + "epoch": 13.1, + "grad_norm": 1.3046875, + "learning_rate": 0.00029949899218761206, + "loss": 0.2206, + "step": 316250 + }, + { + "epoch": 13.1, + "grad_norm": 0.890625, + "learning_rate": 0.00029948836165034215, + "loss": 0.181, + "step": 316260 + }, + { + "epoch": 13.1, + "grad_norm": 0.96875, + "learning_rate": 0.00029947773101994047, + "loss": 0.235, + "step": 316270 + }, + { + "epoch": 13.1, + "grad_norm": 0.76953125, + "learning_rate": 0.00029946710029642707, + "loss": 0.2315, + "step": 316280 + }, + { + "epoch": 13.1, + "grad_norm": 0.9375, + "learning_rate": 0.00029945646947982196, + "loss": 0.1987, + "step": 316290 + }, + { + "epoch": 13.1, + "grad_norm": 0.66015625, + "learning_rate": 0.0002994458385701451, + "loss": 0.1723, + "step": 316300 + }, + { + "epoch": 13.1, + "grad_norm": 1.5859375, + "learning_rate": 0.0002994352075674165, + "loss": 0.1915, + "step": 316310 + }, + { + "epoch": 13.1, + "grad_norm": 0.271484375, + "learning_rate": 0.0002994245764716562, + "loss": 0.1722, + "step": 316320 + }, + { + "epoch": 13.1, + "grad_norm": 0.88671875, + "learning_rate": 0.00029941394528288414, + "loss": 0.282, + "step": 316330 + }, + { + "epoch": 13.1, + "grad_norm": 0.734375, + "learning_rate": 0.00029940331400112036, + "loss": 0.2111, + "step": 316340 + }, + { + "epoch": 13.1, + "grad_norm": 1.1015625, + "learning_rate": 0.00029939268262638494, + "loss": 0.2115, + "step": 316350 + }, + { + "epoch": 13.1, + "grad_norm": 0.90234375, + "learning_rate": 0.00029938205115869783, + "loss": 0.154, + "step": 316360 + }, + { + "epoch": 13.1, + "grad_norm": 1.296875, + "learning_rate": 0.00029937141959807893, + "loss": 0.2431, + "step": 316370 + }, + { + "epoch": 13.1, + "grad_norm": 0.6640625, + "learning_rate": 0.0002993607879445484, + "loss": 0.1837, + "step": 316380 + }, + { + "epoch": 13.1, + "grad_norm": 1.5546875, + "learning_rate": 0.0002993501561981262, + "loss": 0.2018, + "step": 316390 + }, + { + "epoch": 13.11, + "grad_norm": 0.83984375, + "learning_rate": 0.00029933952435883237, + "loss": 0.186, + "step": 316400 + }, + { + "epoch": 13.11, + "grad_norm": 1.1875, + "learning_rate": 0.00029932889242668693, + "loss": 0.161, + "step": 316410 + }, + { + "epoch": 13.11, + "grad_norm": 0.92578125, + "learning_rate": 0.0002993182604017097, + "loss": 0.1755, + "step": 316420 + }, + { + "epoch": 13.11, + "grad_norm": 0.96875, + "learning_rate": 0.00029930762828392094, + "loss": 0.1835, + "step": 316430 + }, + { + "epoch": 13.11, + "grad_norm": 2.75, + "learning_rate": 0.00029929699607334056, + "loss": 0.2194, + "step": 316440 + }, + { + "epoch": 13.11, + "grad_norm": 0.6015625, + "learning_rate": 0.0002992863637699885, + "loss": 0.1829, + "step": 316450 + }, + { + "epoch": 13.11, + "grad_norm": 3.625, + "learning_rate": 0.0002992757313738848, + "loss": 0.1504, + "step": 316460 + }, + { + "epoch": 13.11, + "grad_norm": 0.546875, + "learning_rate": 0.0002992650988850495, + "loss": 0.1549, + "step": 316470 + }, + { + "epoch": 13.11, + "grad_norm": 0.8671875, + "learning_rate": 0.00029925446630350264, + "loss": 0.223, + "step": 316480 + }, + { + "epoch": 13.11, + "grad_norm": 2.96875, + "learning_rate": 0.00029924383362926423, + "loss": 0.1861, + "step": 316490 + }, + { + "epoch": 13.11, + "grad_norm": 1.6796875, + "learning_rate": 0.0002992332008623542, + "loss": 0.1198, + "step": 316500 + }, + { + "epoch": 13.11, + "grad_norm": 1.59375, + "learning_rate": 0.00029922256800279256, + "loss": 0.1707, + "step": 316510 + }, + { + "epoch": 13.11, + "grad_norm": 1.1875, + "learning_rate": 0.00029921193505059947, + "loss": 0.2185, + "step": 316520 + }, + { + "epoch": 13.11, + "grad_norm": 0.9921875, + "learning_rate": 0.00029920130200579475, + "loss": 0.2146, + "step": 316530 + }, + { + "epoch": 13.11, + "grad_norm": 2.75, + "learning_rate": 0.0002991906688683985, + "loss": 0.1828, + "step": 316540 + }, + { + "epoch": 13.11, + "grad_norm": 0.78515625, + "learning_rate": 0.0002991800356384307, + "loss": 0.2651, + "step": 316550 + }, + { + "epoch": 13.11, + "grad_norm": 0.765625, + "learning_rate": 0.00029916940231591144, + "loss": 0.168, + "step": 316560 + }, + { + "epoch": 13.11, + "grad_norm": 2.515625, + "learning_rate": 0.0002991587689008607, + "loss": 0.1973, + "step": 316570 + }, + { + "epoch": 13.11, + "grad_norm": 1.8046875, + "learning_rate": 0.00029914813539329836, + "loss": 0.2226, + "step": 316580 + }, + { + "epoch": 13.11, + "grad_norm": 0.6640625, + "learning_rate": 0.0002991375017932446, + "loss": 0.1891, + "step": 316590 + }, + { + "epoch": 13.11, + "grad_norm": 0.98828125, + "learning_rate": 0.00029912686810071944, + "loss": 0.2377, + "step": 316600 + }, + { + "epoch": 13.11, + "grad_norm": 0.9609375, + "learning_rate": 0.00029911623431574263, + "loss": 0.2167, + "step": 316610 + }, + { + "epoch": 13.11, + "grad_norm": 0.578125, + "learning_rate": 0.00029910560043833455, + "loss": 0.1609, + "step": 316620 + }, + { + "epoch": 13.11, + "grad_norm": 0.45703125, + "learning_rate": 0.000299094966468515, + "loss": 0.2033, + "step": 316630 + }, + { + "epoch": 13.12, + "grad_norm": 1.046875, + "learning_rate": 0.00029908433240630395, + "loss": 0.2391, + "step": 316640 + }, + { + "epoch": 13.12, + "grad_norm": 0.423828125, + "learning_rate": 0.0002990736982517216, + "loss": 0.1245, + "step": 316650 + }, + { + "epoch": 13.12, + "grad_norm": 0.640625, + "learning_rate": 0.0002990630640047877, + "loss": 0.18, + "step": 316660 + }, + { + "epoch": 13.12, + "grad_norm": 1.3203125, + "learning_rate": 0.00029905242966552246, + "loss": 0.151, + "step": 316670 + }, + { + "epoch": 13.12, + "grad_norm": 2.171875, + "learning_rate": 0.00029904179523394597, + "loss": 0.2098, + "step": 316680 + }, + { + "epoch": 13.12, + "grad_norm": 1.09375, + "learning_rate": 0.00029903116071007795, + "loss": 0.1457, + "step": 316690 + }, + { + "epoch": 13.12, + "grad_norm": 0.546875, + "learning_rate": 0.0002990205260939387, + "loss": 0.1566, + "step": 316700 + }, + { + "epoch": 13.12, + "grad_norm": 1.5, + "learning_rate": 0.0002990098913855481, + "loss": 0.1984, + "step": 316710 + }, + { + "epoch": 13.12, + "grad_norm": 0.51953125, + "learning_rate": 0.0002989992565849261, + "loss": 0.2025, + "step": 316720 + }, + { + "epoch": 13.12, + "grad_norm": 0.4765625, + "learning_rate": 0.00029898862169209286, + "loss": 0.1832, + "step": 316730 + }, + { + "epoch": 13.12, + "grad_norm": 0.81640625, + "learning_rate": 0.00029897798670706823, + "loss": 0.2304, + "step": 316740 + }, + { + "epoch": 13.12, + "grad_norm": 1.6640625, + "learning_rate": 0.00029896735162987244, + "loss": 0.2158, + "step": 316750 + }, + { + "epoch": 13.12, + "grad_norm": 0.435546875, + "learning_rate": 0.00029895671646052536, + "loss": 0.2055, + "step": 316760 + }, + { + "epoch": 13.12, + "grad_norm": 0.46875, + "learning_rate": 0.0002989460811990469, + "loss": 0.1983, + "step": 316770 + }, + { + "epoch": 13.12, + "grad_norm": 0.76171875, + "learning_rate": 0.00029893544584545735, + "loss": 0.143, + "step": 316780 + }, + { + "epoch": 13.12, + "grad_norm": 0.8828125, + "learning_rate": 0.00029892481039977653, + "loss": 0.1293, + "step": 316790 + }, + { + "epoch": 13.12, + "grad_norm": 0.69921875, + "learning_rate": 0.0002989141748620245, + "loss": 0.1638, + "step": 316800 + }, + { + "epoch": 13.12, + "grad_norm": 0.8125, + "learning_rate": 0.0002989035392322213, + "loss": 0.2351, + "step": 316810 + }, + { + "epoch": 13.12, + "grad_norm": 0.84375, + "learning_rate": 0.00029889290351038677, + "loss": 0.1744, + "step": 316820 + }, + { + "epoch": 13.12, + "grad_norm": 0.890625, + "learning_rate": 0.00029888226769654125, + "loss": 0.2471, + "step": 316830 + }, + { + "epoch": 13.12, + "grad_norm": 0.322265625, + "learning_rate": 0.0002988716317907046, + "loss": 0.2029, + "step": 316840 + }, + { + "epoch": 13.12, + "grad_norm": 0.77734375, + "learning_rate": 0.0002988609957928967, + "loss": 0.1932, + "step": 316850 + }, + { + "epoch": 13.12, + "grad_norm": 0.703125, + "learning_rate": 0.00029885035970313775, + "loss": 0.1877, + "step": 316860 + }, + { + "epoch": 13.12, + "grad_norm": 0.90234375, + "learning_rate": 0.00029883972352144763, + "loss": 0.1917, + "step": 316870 + }, + { + "epoch": 13.13, + "grad_norm": 1.109375, + "learning_rate": 0.00029882908724784645, + "loss": 0.1867, + "step": 316880 + }, + { + "epoch": 13.13, + "grad_norm": 0.94921875, + "learning_rate": 0.00029881845088235425, + "loss": 0.2227, + "step": 316890 + }, + { + "epoch": 13.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00029880781442499093, + "loss": 0.1868, + "step": 316900 + }, + { + "epoch": 13.13, + "grad_norm": 0.6875, + "learning_rate": 0.00029879717787577667, + "loss": 0.1518, + "step": 316910 + }, + { + "epoch": 13.13, + "grad_norm": 0.7734375, + "learning_rate": 0.00029878654123473134, + "loss": 0.1735, + "step": 316920 + }, + { + "epoch": 13.13, + "grad_norm": 1.3515625, + "learning_rate": 0.0002987759045018749, + "loss": 0.201, + "step": 316930 + }, + { + "epoch": 13.13, + "grad_norm": 0.3984375, + "learning_rate": 0.00029876526767722765, + "loss": 0.1607, + "step": 316940 + }, + { + "epoch": 13.13, + "grad_norm": 0.51171875, + "learning_rate": 0.00029875463076080934, + "loss": 0.2037, + "step": 316950 + }, + { + "epoch": 13.13, + "grad_norm": 0.6796875, + "learning_rate": 0.00029874399375264004, + "loss": 0.1972, + "step": 316960 + }, + { + "epoch": 13.13, + "grad_norm": 0.55078125, + "learning_rate": 0.0002987333566527399, + "loss": 0.1806, + "step": 316970 + }, + { + "epoch": 13.13, + "grad_norm": 1.21875, + "learning_rate": 0.00029872271946112877, + "loss": 0.1682, + "step": 316980 + }, + { + "epoch": 13.13, + "grad_norm": 1.78125, + "learning_rate": 0.00029871208217782685, + "loss": 0.1469, + "step": 316990 + }, + { + "epoch": 13.13, + "grad_norm": 0.52734375, + "learning_rate": 0.000298701444802854, + "loss": 0.2112, + "step": 317000 + }, + { + "epoch": 13.13, + "grad_norm": 0.578125, + "learning_rate": 0.0002986908073362302, + "loss": 0.1916, + "step": 317010 + }, + { + "epoch": 13.13, + "grad_norm": 0.81640625, + "learning_rate": 0.0002986801697779757, + "loss": 0.1638, + "step": 317020 + }, + { + "epoch": 13.13, + "grad_norm": 0.99609375, + "learning_rate": 0.00029866953212811034, + "loss": 0.1922, + "step": 317030 + }, + { + "epoch": 13.13, + "grad_norm": 0.66015625, + "learning_rate": 0.0002986588943866541, + "loss": 0.2221, + "step": 317040 + }, + { + "epoch": 13.13, + "grad_norm": 0.54296875, + "learning_rate": 0.00029864825655362713, + "loss": 0.2048, + "step": 317050 + }, + { + "epoch": 13.13, + "grad_norm": 0.9296875, + "learning_rate": 0.0002986376186290494, + "loss": 0.1618, + "step": 317060 + }, + { + "epoch": 13.13, + "grad_norm": 0.0, + "learning_rate": 0.0002986269806129409, + "loss": 0.2191, + "step": 317070 + }, + { + "epoch": 13.13, + "grad_norm": 0.4921875, + "learning_rate": 0.0002986163425053217, + "loss": 0.214, + "step": 317080 + }, + { + "epoch": 13.13, + "grad_norm": 1.2890625, + "learning_rate": 0.0002986057043062117, + "loss": 0.2257, + "step": 317090 + }, + { + "epoch": 13.13, + "grad_norm": 1.2265625, + "learning_rate": 0.00029859506601563114, + "loss": 0.1568, + "step": 317100 + }, + { + "epoch": 13.13, + "grad_norm": 1.1953125, + "learning_rate": 0.0002985844276335999, + "loss": 0.2023, + "step": 317110 + }, + { + "epoch": 13.14, + "grad_norm": 0.76171875, + "learning_rate": 0.00029857378916013796, + "loss": 0.2136, + "step": 317120 + }, + { + "epoch": 13.14, + "grad_norm": 1.3671875, + "learning_rate": 0.00029856315059526543, + "loss": 0.1854, + "step": 317130 + }, + { + "epoch": 13.14, + "grad_norm": 0.6484375, + "learning_rate": 0.00029855251193900225, + "loss": 0.2022, + "step": 317140 + }, + { + "epoch": 13.14, + "grad_norm": 0.220703125, + "learning_rate": 0.0002985418731913685, + "loss": 0.2217, + "step": 317150 + }, + { + "epoch": 13.14, + "grad_norm": 1.1484375, + "learning_rate": 0.0002985312343523842, + "loss": 0.2126, + "step": 317160 + }, + { + "epoch": 13.14, + "grad_norm": 1.1953125, + "learning_rate": 0.0002985205954220693, + "loss": 0.191, + "step": 317170 + }, + { + "epoch": 13.14, + "grad_norm": 1.1015625, + "learning_rate": 0.00029850995640044405, + "loss": 0.1898, + "step": 317180 + }, + { + "epoch": 13.14, + "grad_norm": 0.9140625, + "learning_rate": 0.00029849931728752815, + "loss": 0.2336, + "step": 317190 + }, + { + "epoch": 13.14, + "grad_norm": 0.72265625, + "learning_rate": 0.0002984886780833418, + "loss": 0.2441, + "step": 317200 + }, + { + "epoch": 13.14, + "grad_norm": 0.625, + "learning_rate": 0.00029847803878790504, + "loss": 0.2226, + "step": 317210 + }, + { + "epoch": 13.14, + "grad_norm": 0.435546875, + "learning_rate": 0.00029846739940123774, + "loss": 0.195, + "step": 317220 + }, + { + "epoch": 13.14, + "grad_norm": 0.77734375, + "learning_rate": 0.0002984567599233602, + "loss": 0.182, + "step": 317230 + }, + { + "epoch": 13.14, + "grad_norm": 0.8046875, + "learning_rate": 0.00029844612035429213, + "loss": 0.2109, + "step": 317240 + }, + { + "epoch": 13.14, + "grad_norm": 0.86328125, + "learning_rate": 0.0002984354806940537, + "loss": 0.2376, + "step": 317250 + }, + { + "epoch": 13.14, + "grad_norm": 1.3671875, + "learning_rate": 0.00029842484094266505, + "loss": 0.2069, + "step": 317260 + }, + { + "epoch": 13.14, + "grad_norm": 0.57421875, + "learning_rate": 0.00029841420110014597, + "loss": 0.1714, + "step": 317270 + }, + { + "epoch": 13.14, + "grad_norm": 0.87109375, + "learning_rate": 0.00029840356116651656, + "loss": 0.1971, + "step": 317280 + }, + { + "epoch": 13.14, + "grad_norm": 1.0390625, + "learning_rate": 0.0002983929211417969, + "loss": 0.24, + "step": 317290 + }, + { + "epoch": 13.14, + "grad_norm": 0.703125, + "learning_rate": 0.000298382281026007, + "loss": 0.2128, + "step": 317300 + }, + { + "epoch": 13.14, + "grad_norm": 0.71484375, + "learning_rate": 0.00029837164081916696, + "loss": 0.1846, + "step": 317310 + }, + { + "epoch": 13.14, + "grad_norm": 0.875, + "learning_rate": 0.00029836100052129663, + "loss": 0.1986, + "step": 317320 + }, + { + "epoch": 13.14, + "grad_norm": 0.6640625, + "learning_rate": 0.00029835036013241615, + "loss": 0.1735, + "step": 317330 + }, + { + "epoch": 13.14, + "grad_norm": 0.92578125, + "learning_rate": 0.00029833971965254546, + "loss": 0.2117, + "step": 317340 + }, + { + "epoch": 13.14, + "grad_norm": 0.55859375, + "learning_rate": 0.0002983290790817047, + "loss": 0.2275, + "step": 317350 + }, + { + "epoch": 13.15, + "grad_norm": 1.75, + "learning_rate": 0.0002983184384199138, + "loss": 0.1753, + "step": 317360 + }, + { + "epoch": 13.15, + "grad_norm": 1.84375, + "learning_rate": 0.0002983077976671928, + "loss": 0.2208, + "step": 317370 + }, + { + "epoch": 13.15, + "grad_norm": 1.1328125, + "learning_rate": 0.0002982971568235617, + "loss": 0.1969, + "step": 317380 + }, + { + "epoch": 13.15, + "grad_norm": 1.2578125, + "learning_rate": 0.0002982865158890407, + "loss": 0.2121, + "step": 317390 + }, + { + "epoch": 13.15, + "grad_norm": 0.8046875, + "learning_rate": 0.0002982758748636496, + "loss": 0.189, + "step": 317400 + }, + { + "epoch": 13.15, + "grad_norm": 0.55078125, + "learning_rate": 0.00029826523374740856, + "loss": 0.1826, + "step": 317410 + }, + { + "epoch": 13.15, + "grad_norm": 0.57421875, + "learning_rate": 0.00029825459254033756, + "loss": 0.2055, + "step": 317420 + }, + { + "epoch": 13.15, + "grad_norm": 0.7578125, + "learning_rate": 0.0002982439512424566, + "loss": 0.1945, + "step": 317430 + }, + { + "epoch": 13.15, + "grad_norm": 0.85546875, + "learning_rate": 0.0002982333098537857, + "loss": 0.1693, + "step": 317440 + }, + { + "epoch": 13.15, + "grad_norm": 0.7265625, + "learning_rate": 0.00029822266837434497, + "loss": 0.1962, + "step": 317450 + }, + { + "epoch": 13.15, + "grad_norm": 0.50390625, + "learning_rate": 0.0002982120268041544, + "loss": 0.1907, + "step": 317460 + }, + { + "epoch": 13.15, + "grad_norm": 0.5546875, + "learning_rate": 0.000298201385143234, + "loss": 0.1531, + "step": 317470 + }, + { + "epoch": 13.15, + "grad_norm": 0.44921875, + "learning_rate": 0.00029819074339160374, + "loss": 0.1735, + "step": 317480 + }, + { + "epoch": 13.15, + "grad_norm": 1.4453125, + "learning_rate": 0.0002981801015492838, + "loss": 0.2092, + "step": 317490 + }, + { + "epoch": 13.15, + "grad_norm": 2.109375, + "learning_rate": 0.000298169459616294, + "loss": 0.206, + "step": 317500 + }, + { + "epoch": 13.15, + "grad_norm": 1.1328125, + "learning_rate": 0.0002981588175926546, + "loss": 0.188, + "step": 317510 + }, + { + "epoch": 13.15, + "grad_norm": 0.9765625, + "learning_rate": 0.00029814817547838533, + "loss": 0.1632, + "step": 317520 + }, + { + "epoch": 13.15, + "grad_norm": 0.765625, + "learning_rate": 0.0002981375332735066, + "loss": 0.2444, + "step": 317530 + }, + { + "epoch": 13.15, + "grad_norm": 0.96484375, + "learning_rate": 0.0002981268909780381, + "loss": 0.138, + "step": 317540 + }, + { + "epoch": 13.15, + "grad_norm": 0.53125, + "learning_rate": 0.00029811624859200003, + "loss": 0.1961, + "step": 317550 + }, + { + "epoch": 13.15, + "grad_norm": 0.40625, + "learning_rate": 0.00029810560611541237, + "loss": 0.1396, + "step": 317560 + }, + { + "epoch": 13.15, + "grad_norm": 0.4765625, + "learning_rate": 0.00029809496354829513, + "loss": 0.1779, + "step": 317570 + }, + { + "epoch": 13.15, + "grad_norm": 0.671875, + "learning_rate": 0.00029808432089066837, + "loss": 0.2027, + "step": 317580 + }, + { + "epoch": 13.15, + "grad_norm": 0.466796875, + "learning_rate": 0.00029807367814255214, + "loss": 0.1892, + "step": 317590 + }, + { + "epoch": 13.15, + "grad_norm": 0.515625, + "learning_rate": 0.00029806303530396645, + "loss": 0.2314, + "step": 317600 + }, + { + "epoch": 13.16, + "grad_norm": 1.203125, + "learning_rate": 0.00029805239237493125, + "loss": 0.1923, + "step": 317610 + }, + { + "epoch": 13.16, + "grad_norm": 0.89453125, + "learning_rate": 0.0002980417493554667, + "loss": 0.2031, + "step": 317620 + }, + { + "epoch": 13.16, + "grad_norm": 1.1640625, + "learning_rate": 0.00029803110624559276, + "loss": 0.1466, + "step": 317630 + }, + { + "epoch": 13.16, + "grad_norm": 0.494140625, + "learning_rate": 0.0002980204630453294, + "loss": 0.1783, + "step": 317640 + }, + { + "epoch": 13.16, + "grad_norm": 0.8515625, + "learning_rate": 0.00029800981975469675, + "loss": 0.1691, + "step": 317650 + }, + { + "epoch": 13.16, + "grad_norm": 0.203125, + "learning_rate": 0.00029799917637371486, + "loss": 0.2086, + "step": 317660 + }, + { + "epoch": 13.16, + "grad_norm": 1.2109375, + "learning_rate": 0.0002979885329024037, + "loss": 0.1692, + "step": 317670 + }, + { + "epoch": 13.16, + "grad_norm": 0.65234375, + "learning_rate": 0.00029797788934078325, + "loss": 0.2008, + "step": 317680 + }, + { + "epoch": 13.16, + "grad_norm": 1.046875, + "learning_rate": 0.00029796724568887354, + "loss": 0.1847, + "step": 317690 + }, + { + "epoch": 13.16, + "grad_norm": 0.50390625, + "learning_rate": 0.0002979566019466948, + "loss": 0.1781, + "step": 317700 + }, + { + "epoch": 13.16, + "grad_norm": 0.6171875, + "learning_rate": 0.0002979459581142668, + "loss": 0.207, + "step": 317710 + }, + { + "epoch": 13.16, + "grad_norm": 0.5390625, + "learning_rate": 0.0002979353141916097, + "loss": 0.2134, + "step": 317720 + }, + { + "epoch": 13.16, + "grad_norm": 1.1171875, + "learning_rate": 0.0002979246701787436, + "loss": 0.2249, + "step": 317730 + }, + { + "epoch": 13.16, + "grad_norm": 0.95703125, + "learning_rate": 0.00029791402607568836, + "loss": 0.2328, + "step": 317740 + }, + { + "epoch": 13.16, + "grad_norm": 1.265625, + "learning_rate": 0.00029790338188246415, + "loss": 0.2111, + "step": 317750 + }, + { + "epoch": 13.16, + "grad_norm": 0.765625, + "learning_rate": 0.0002978927375990909, + "loss": 0.2072, + "step": 317760 + }, + { + "epoch": 13.16, + "grad_norm": 0.51953125, + "learning_rate": 0.00029788209322558867, + "loss": 0.23, + "step": 317770 + }, + { + "epoch": 13.16, + "grad_norm": 0.66796875, + "learning_rate": 0.0002978714487619775, + "loss": 0.1644, + "step": 317780 + }, + { + "epoch": 13.16, + "grad_norm": 0.8671875, + "learning_rate": 0.00029786080420827755, + "loss": 0.2127, + "step": 317790 + }, + { + "epoch": 13.16, + "grad_norm": 0.435546875, + "learning_rate": 0.0002978501595645087, + "loss": 0.1821, + "step": 317800 + }, + { + "epoch": 13.16, + "grad_norm": 1.2265625, + "learning_rate": 0.00029783951483069093, + "loss": 0.187, + "step": 317810 + }, + { + "epoch": 13.16, + "grad_norm": 0.796875, + "learning_rate": 0.00029782887000684443, + "loss": 0.196, + "step": 317820 + }, + { + "epoch": 13.16, + "grad_norm": 0.66796875, + "learning_rate": 0.0002978182250929891, + "loss": 0.1489, + "step": 317830 + }, + { + "epoch": 13.16, + "grad_norm": 0.443359375, + "learning_rate": 0.00029780758008914515, + "loss": 0.1737, + "step": 317840 + }, + { + "epoch": 13.17, + "grad_norm": 0.99609375, + "learning_rate": 0.00029779693499533237, + "loss": 0.1965, + "step": 317850 + }, + { + "epoch": 13.17, + "grad_norm": 0.87109375, + "learning_rate": 0.00029778628981157103, + "loss": 0.1509, + "step": 317860 + }, + { + "epoch": 13.17, + "grad_norm": 1.21875, + "learning_rate": 0.000297775644537881, + "loss": 0.2007, + "step": 317870 + }, + { + "epoch": 13.17, + "grad_norm": 0.859375, + "learning_rate": 0.0002977649991742823, + "loss": 0.2334, + "step": 317880 + }, + { + "epoch": 13.17, + "grad_norm": 0.69921875, + "learning_rate": 0.0002977543537207951, + "loss": 0.1652, + "step": 317890 + }, + { + "epoch": 13.17, + "grad_norm": 0.58203125, + "learning_rate": 0.0002977437081774394, + "loss": 0.1799, + "step": 317900 + }, + { + "epoch": 13.17, + "grad_norm": 1.3984375, + "learning_rate": 0.00029773306254423515, + "loss": 0.26, + "step": 317910 + }, + { + "epoch": 13.17, + "grad_norm": 0.54296875, + "learning_rate": 0.0002977224168212024, + "loss": 0.1514, + "step": 317920 + }, + { + "epoch": 13.17, + "grad_norm": 1.4140625, + "learning_rate": 0.0002977117710083613, + "loss": 0.1898, + "step": 317930 + }, + { + "epoch": 13.17, + "grad_norm": 0.8515625, + "learning_rate": 0.0002977011251057317, + "loss": 0.2128, + "step": 317940 + }, + { + "epoch": 13.17, + "grad_norm": 0.52734375, + "learning_rate": 0.0002976904791133338, + "loss": 0.2425, + "step": 317950 + }, + { + "epoch": 13.17, + "grad_norm": 0.796875, + "learning_rate": 0.0002976798330311876, + "loss": 0.2189, + "step": 317960 + }, + { + "epoch": 13.17, + "grad_norm": 1.125, + "learning_rate": 0.00029766918685931304, + "loss": 0.1898, + "step": 317970 + }, + { + "epoch": 13.17, + "grad_norm": 1.40625, + "learning_rate": 0.0002976585405977302, + "loss": 0.216, + "step": 317980 + }, + { + "epoch": 13.17, + "grad_norm": 0.5703125, + "learning_rate": 0.0002976478942464592, + "loss": 0.2019, + "step": 317990 + }, + { + "epoch": 13.17, + "grad_norm": 0.875, + "learning_rate": 0.00029763724780551993, + "loss": 0.161, + "step": 318000 + }, + { + "epoch": 13.17, + "grad_norm": 1.28125, + "learning_rate": 0.0002976266012749326, + "loss": 0.1847, + "step": 318010 + }, + { + "epoch": 13.17, + "grad_norm": 0.53125, + "learning_rate": 0.0002976159546547171, + "loss": 0.2203, + "step": 318020 + }, + { + "epoch": 13.17, + "grad_norm": 1.203125, + "learning_rate": 0.0002976053079448935, + "loss": 0.1909, + "step": 318030 + }, + { + "epoch": 13.17, + "grad_norm": 1.2109375, + "learning_rate": 0.0002975946611454819, + "loss": 0.1633, + "step": 318040 + }, + { + "epoch": 13.17, + "grad_norm": 1.953125, + "learning_rate": 0.00029758401425650227, + "loss": 0.2182, + "step": 318050 + }, + { + "epoch": 13.17, + "grad_norm": 0.91015625, + "learning_rate": 0.0002975733672779746, + "loss": 0.2348, + "step": 318060 + }, + { + "epoch": 13.17, + "grad_norm": 1.109375, + "learning_rate": 0.000297562720209919, + "loss": 0.1745, + "step": 318070 + }, + { + "epoch": 13.17, + "grad_norm": 0.78515625, + "learning_rate": 0.00029755207305235553, + "loss": 0.1863, + "step": 318080 + }, + { + "epoch": 13.18, + "grad_norm": 1.28125, + "learning_rate": 0.0002975414258053043, + "loss": 0.1484, + "step": 318090 + }, + { + "epoch": 13.18, + "grad_norm": 1.0859375, + "learning_rate": 0.0002975307784687851, + "loss": 0.2022, + "step": 318100 + }, + { + "epoch": 13.18, + "grad_norm": 1.0625, + "learning_rate": 0.00029752013104281816, + "loss": 0.1471, + "step": 318110 + }, + { + "epoch": 13.18, + "grad_norm": 0.9140625, + "learning_rate": 0.00029750948352742345, + "loss": 0.2184, + "step": 318120 + }, + { + "epoch": 13.18, + "grad_norm": 0.87890625, + "learning_rate": 0.000297498835922621, + "loss": 0.2163, + "step": 318130 + }, + { + "epoch": 13.18, + "grad_norm": 0.79296875, + "learning_rate": 0.00029748818822843094, + "loss": 0.2198, + "step": 318140 + }, + { + "epoch": 13.18, + "grad_norm": 0.53125, + "learning_rate": 0.0002974775404448732, + "loss": 0.2132, + "step": 318150 + }, + { + "epoch": 13.18, + "grad_norm": 0.5546875, + "learning_rate": 0.0002974668925719678, + "loss": 0.2318, + "step": 318160 + }, + { + "epoch": 13.18, + "grad_norm": 1.09375, + "learning_rate": 0.0002974562446097349, + "loss": 0.2227, + "step": 318170 + }, + { + "epoch": 13.18, + "grad_norm": 1.4375, + "learning_rate": 0.00029744559655819447, + "loss": 0.1851, + "step": 318180 + }, + { + "epoch": 13.18, + "grad_norm": 0.154296875, + "learning_rate": 0.0002974349484173665, + "loss": 0.1569, + "step": 318190 + }, + { + "epoch": 13.18, + "grad_norm": 0.0, + "learning_rate": 0.00029742430018727116, + "loss": 0.1803, + "step": 318200 + }, + { + "epoch": 13.18, + "grad_norm": 0.82421875, + "learning_rate": 0.00029741365186792835, + "loss": 0.1972, + "step": 318210 + }, + { + "epoch": 13.18, + "grad_norm": 1.0625, + "learning_rate": 0.00029740300345935825, + "loss": 0.1947, + "step": 318220 + }, + { + "epoch": 13.18, + "grad_norm": 0.76171875, + "learning_rate": 0.0002973923549615807, + "loss": 0.2016, + "step": 318230 + }, + { + "epoch": 13.18, + "grad_norm": 0.8984375, + "learning_rate": 0.00029738170637461585, + "loss": 0.1808, + "step": 318240 + }, + { + "epoch": 13.18, + "grad_norm": 0.609375, + "learning_rate": 0.0002973710576984839, + "loss": 0.1981, + "step": 318250 + }, + { + "epoch": 13.18, + "grad_norm": 0.8671875, + "learning_rate": 0.0002973604089332046, + "loss": 0.1711, + "step": 318260 + }, + { + "epoch": 13.18, + "grad_norm": 0.49609375, + "learning_rate": 0.00029734976007879816, + "loss": 0.1481, + "step": 318270 + }, + { + "epoch": 13.18, + "grad_norm": 1.203125, + "learning_rate": 0.0002973391111352846, + "loss": 0.2179, + "step": 318280 + }, + { + "epoch": 13.18, + "grad_norm": 0.609375, + "learning_rate": 0.0002973284621026839, + "loss": 0.1902, + "step": 318290 + }, + { + "epoch": 13.18, + "grad_norm": 0.53515625, + "learning_rate": 0.0002973178129810162, + "loss": 0.2335, + "step": 318300 + }, + { + "epoch": 13.18, + "grad_norm": 0.28515625, + "learning_rate": 0.00029730716377030143, + "loss": 0.1651, + "step": 318310 + }, + { + "epoch": 13.18, + "grad_norm": 0.63671875, + "learning_rate": 0.0002972965144705597, + "loss": 0.2158, + "step": 318320 + }, + { + "epoch": 13.19, + "grad_norm": 0.77734375, + "learning_rate": 0.0002972858650818111, + "loss": 0.211, + "step": 318330 + }, + { + "epoch": 13.19, + "grad_norm": 1.296875, + "learning_rate": 0.0002972752156040755, + "loss": 0.2154, + "step": 318340 + }, + { + "epoch": 13.19, + "grad_norm": 0.3359375, + "learning_rate": 0.00029726456603737316, + "loss": 0.2255, + "step": 318350 + }, + { + "epoch": 13.19, + "grad_norm": 0.8828125, + "learning_rate": 0.000297253916381724, + "loss": 0.2137, + "step": 318360 + }, + { + "epoch": 13.19, + "grad_norm": 1.046875, + "learning_rate": 0.00029724326663714795, + "loss": 0.2228, + "step": 318370 + }, + { + "epoch": 13.19, + "grad_norm": 1.0234375, + "learning_rate": 0.0002972326168036653, + "loss": 0.1793, + "step": 318380 + }, + { + "epoch": 13.19, + "grad_norm": 0.6015625, + "learning_rate": 0.0002972219668812959, + "loss": 0.2013, + "step": 318390 + }, + { + "epoch": 13.19, + "grad_norm": 0.8203125, + "learning_rate": 0.00029721131687005995, + "loss": 0.2126, + "step": 318400 + }, + { + "epoch": 13.19, + "grad_norm": 1.4765625, + "learning_rate": 0.0002972006667699773, + "loss": 0.1938, + "step": 318410 + }, + { + "epoch": 13.19, + "grad_norm": 0.67578125, + "learning_rate": 0.0002971900165810681, + "loss": 0.1961, + "step": 318420 + }, + { + "epoch": 13.19, + "grad_norm": 0.55078125, + "learning_rate": 0.00029717936630335246, + "loss": 0.1701, + "step": 318430 + }, + { + "epoch": 13.19, + "grad_norm": 0.75390625, + "learning_rate": 0.00029716871593685033, + "loss": 0.1501, + "step": 318440 + }, + { + "epoch": 13.19, + "grad_norm": 0.8828125, + "learning_rate": 0.00029715806548158164, + "loss": 0.2182, + "step": 318450 + }, + { + "epoch": 13.19, + "grad_norm": 0.16796875, + "learning_rate": 0.0002971474149375668, + "loss": 0.2045, + "step": 318460 + }, + { + "epoch": 13.19, + "grad_norm": 1.1796875, + "learning_rate": 0.0002971367643048254, + "loss": 0.2248, + "step": 318470 + }, + { + "epoch": 13.19, + "grad_norm": 0.37109375, + "learning_rate": 0.00029712611358337777, + "loss": 0.1525, + "step": 318480 + }, + { + "epoch": 13.19, + "grad_norm": 0.390625, + "learning_rate": 0.0002971154627732439, + "loss": 0.2185, + "step": 318490 + }, + { + "epoch": 13.19, + "grad_norm": 0.58203125, + "learning_rate": 0.0002971048118744438, + "loss": 0.2021, + "step": 318500 + }, + { + "epoch": 13.19, + "grad_norm": 1.25, + "learning_rate": 0.0002970941608869976, + "loss": 0.2092, + "step": 318510 + }, + { + "epoch": 13.19, + "grad_norm": 1.1484375, + "learning_rate": 0.00029708350981092526, + "loss": 0.1796, + "step": 318520 + }, + { + "epoch": 13.19, + "grad_norm": 0.98828125, + "learning_rate": 0.00029707285864624667, + "loss": 0.1892, + "step": 318530 + }, + { + "epoch": 13.19, + "grad_norm": 1.15625, + "learning_rate": 0.00029706220739298225, + "loss": 0.2324, + "step": 318540 + }, + { + "epoch": 13.19, + "grad_norm": 1.09375, + "learning_rate": 0.0002970515560511517, + "loss": 0.2338, + "step": 318550 + }, + { + "epoch": 13.19, + "grad_norm": 0.84765625, + "learning_rate": 0.00029704090462077525, + "loss": 0.1502, + "step": 318560 + }, + { + "epoch": 13.2, + "grad_norm": 0.8828125, + "learning_rate": 0.000297030253101873, + "loss": 0.1856, + "step": 318570 + }, + { + "epoch": 13.2, + "grad_norm": 0.54296875, + "learning_rate": 0.00029701960149446473, + "loss": 0.1802, + "step": 318580 + }, + { + "epoch": 13.2, + "grad_norm": 0.82421875, + "learning_rate": 0.0002970089497985707, + "loss": 0.2178, + "step": 318590 + }, + { + "epoch": 13.2, + "grad_norm": 0.36328125, + "learning_rate": 0.00029699829801421094, + "loss": 0.1726, + "step": 318600 + }, + { + "epoch": 13.2, + "grad_norm": 0.30859375, + "learning_rate": 0.0002969876461414054, + "loss": 0.1944, + "step": 318610 + }, + { + "epoch": 13.2, + "grad_norm": 1.3125, + "learning_rate": 0.0002969769941801743, + "loss": 0.244, + "step": 318620 + }, + { + "epoch": 13.2, + "grad_norm": 0.89453125, + "learning_rate": 0.00029696634213053746, + "loss": 0.1793, + "step": 318630 + }, + { + "epoch": 13.2, + "grad_norm": 0.765625, + "learning_rate": 0.00029695568999251506, + "loss": 0.1141, + "step": 318640 + }, + { + "epoch": 13.2, + "grad_norm": 1.0859375, + "learning_rate": 0.00029694503776612715, + "loss": 0.1999, + "step": 318650 + }, + { + "epoch": 13.2, + "grad_norm": 0.78515625, + "learning_rate": 0.0002969343854513937, + "loss": 0.184, + "step": 318660 + }, + { + "epoch": 13.2, + "grad_norm": 0.84765625, + "learning_rate": 0.00029692373304833486, + "loss": 0.1681, + "step": 318670 + }, + { + "epoch": 13.2, + "grad_norm": 0.87109375, + "learning_rate": 0.0002969130805569706, + "loss": 0.2004, + "step": 318680 + }, + { + "epoch": 13.2, + "grad_norm": 0.76171875, + "learning_rate": 0.00029690242797732094, + "loss": 0.1951, + "step": 318690 + }, + { + "epoch": 13.2, + "grad_norm": 1.609375, + "learning_rate": 0.0002968917753094061, + "loss": 0.1639, + "step": 318700 + }, + { + "epoch": 13.2, + "grad_norm": 0.78515625, + "learning_rate": 0.0002968811225532459, + "loss": 0.1718, + "step": 318710 + }, + { + "epoch": 13.2, + "grad_norm": 1.34375, + "learning_rate": 0.0002968704697088605, + "loss": 0.1643, + "step": 318720 + }, + { + "epoch": 13.2, + "grad_norm": 0.396484375, + "learning_rate": 0.00029685981677626995, + "loss": 0.1697, + "step": 318730 + }, + { + "epoch": 13.2, + "grad_norm": 0.6484375, + "learning_rate": 0.00029684916375549426, + "loss": 0.2243, + "step": 318740 + }, + { + "epoch": 13.2, + "grad_norm": 0.98046875, + "learning_rate": 0.00029683851064655353, + "loss": 0.1745, + "step": 318750 + }, + { + "epoch": 13.2, + "grad_norm": 0.74609375, + "learning_rate": 0.00029682785744946776, + "loss": 0.2411, + "step": 318760 + }, + { + "epoch": 13.2, + "grad_norm": 1.109375, + "learning_rate": 0.00029681720416425706, + "loss": 0.2046, + "step": 318770 + }, + { + "epoch": 13.2, + "grad_norm": 0.73828125, + "learning_rate": 0.00029680655079094144, + "loss": 0.2096, + "step": 318780 + }, + { + "epoch": 13.2, + "grad_norm": 0.91796875, + "learning_rate": 0.00029679589732954095, + "loss": 0.1979, + "step": 318790 + }, + { + "epoch": 13.2, + "grad_norm": 0.6015625, + "learning_rate": 0.00029678524378007554, + "loss": 0.2058, + "step": 318800 + }, + { + "epoch": 13.21, + "grad_norm": 0.8828125, + "learning_rate": 0.0002967745901425655, + "loss": 0.1746, + "step": 318810 + }, + { + "epoch": 13.21, + "grad_norm": 0.59765625, + "learning_rate": 0.0002967639364170306, + "loss": 0.2481, + "step": 318820 + }, + { + "epoch": 13.21, + "grad_norm": 0.765625, + "learning_rate": 0.0002967532826034911, + "loss": 0.1628, + "step": 318830 + }, + { + "epoch": 13.21, + "grad_norm": 0.90625, + "learning_rate": 0.00029674262870196696, + "loss": 0.1978, + "step": 318840 + }, + { + "epoch": 13.21, + "grad_norm": 1.2265625, + "learning_rate": 0.0002967319747124782, + "loss": 0.1695, + "step": 318850 + }, + { + "epoch": 13.21, + "grad_norm": 1.2265625, + "learning_rate": 0.000296721320635045, + "loss": 0.1804, + "step": 318860 + }, + { + "epoch": 13.21, + "grad_norm": 0.546875, + "learning_rate": 0.0002967106664696872, + "loss": 0.2202, + "step": 318870 + }, + { + "epoch": 13.21, + "grad_norm": 0.498046875, + "learning_rate": 0.00029670001221642503, + "loss": 0.2223, + "step": 318880 + }, + { + "epoch": 13.21, + "grad_norm": 1.09375, + "learning_rate": 0.00029668935787527845, + "loss": 0.2211, + "step": 318890 + }, + { + "epoch": 13.21, + "grad_norm": 0.5078125, + "learning_rate": 0.0002966787034462676, + "loss": 0.2253, + "step": 318900 + }, + { + "epoch": 13.21, + "grad_norm": 0.60546875, + "learning_rate": 0.0002966680489294125, + "loss": 0.2175, + "step": 318910 + }, + { + "epoch": 13.21, + "grad_norm": 0.92578125, + "learning_rate": 0.00029665739432473305, + "loss": 0.2053, + "step": 318920 + }, + { + "epoch": 13.21, + "grad_norm": 0.83203125, + "learning_rate": 0.00029664673963224945, + "loss": 0.1413, + "step": 318930 + }, + { + "epoch": 13.21, + "grad_norm": 1.4921875, + "learning_rate": 0.0002966360848519818, + "loss": 0.2196, + "step": 318940 + }, + { + "epoch": 13.21, + "grad_norm": 0.9140625, + "learning_rate": 0.00029662542998395004, + "loss": 0.2142, + "step": 318950 + }, + { + "epoch": 13.21, + "grad_norm": 2.015625, + "learning_rate": 0.00029661477502817424, + "loss": 0.1935, + "step": 318960 + }, + { + "epoch": 13.21, + "grad_norm": 0.7109375, + "learning_rate": 0.00029660411998467444, + "loss": 0.2102, + "step": 318970 + }, + { + "epoch": 13.21, + "grad_norm": 0.7265625, + "learning_rate": 0.0002965934648534707, + "loss": 0.1674, + "step": 318980 + }, + { + "epoch": 13.21, + "grad_norm": 0.7109375, + "learning_rate": 0.0002965828096345832, + "loss": 0.2038, + "step": 318990 + }, + { + "epoch": 13.21, + "grad_norm": 0.85546875, + "learning_rate": 0.00029657215432803185, + "loss": 0.1836, + "step": 319000 + }, + { + "epoch": 13.21, + "grad_norm": 1.40625, + "learning_rate": 0.00029656149893383666, + "loss": 0.191, + "step": 319010 + }, + { + "epoch": 13.21, + "grad_norm": 1.3671875, + "learning_rate": 0.00029655084345201784, + "loss": 0.2213, + "step": 319020 + }, + { + "epoch": 13.21, + "grad_norm": 0.59765625, + "learning_rate": 0.00029654018788259536, + "loss": 0.1817, + "step": 319030 + }, + { + "epoch": 13.21, + "grad_norm": 0.494140625, + "learning_rate": 0.0002965295322255892, + "loss": 0.1842, + "step": 319040 + }, + { + "epoch": 13.22, + "grad_norm": 1.4140625, + "learning_rate": 0.00029651887648101955, + "loss": 0.1568, + "step": 319050 + }, + { + "epoch": 13.22, + "grad_norm": 0.91796875, + "learning_rate": 0.0002965082206489064, + "loss": 0.1692, + "step": 319060 + }, + { + "epoch": 13.22, + "grad_norm": 0.44140625, + "learning_rate": 0.00029649756472926975, + "loss": 0.2393, + "step": 319070 + }, + { + "epoch": 13.22, + "grad_norm": 1.09375, + "learning_rate": 0.00029648690872212976, + "loss": 0.1855, + "step": 319080 + }, + { + "epoch": 13.22, + "grad_norm": 1.59375, + "learning_rate": 0.0002964762526275063, + "loss": 0.1794, + "step": 319090 + }, + { + "epoch": 13.22, + "grad_norm": 0.5859375, + "learning_rate": 0.0002964655964454197, + "loss": 0.1461, + "step": 319100 + }, + { + "epoch": 13.22, + "grad_norm": 0.78125, + "learning_rate": 0.00029645494017588986, + "loss": 0.2277, + "step": 319110 + }, + { + "epoch": 13.22, + "grad_norm": 0.5546875, + "learning_rate": 0.0002964442838189368, + "loss": 0.2027, + "step": 319120 + }, + { + "epoch": 13.22, + "grad_norm": 1.1484375, + "learning_rate": 0.0002964336273745806, + "loss": 0.1573, + "step": 319130 + }, + { + "epoch": 13.22, + "grad_norm": 0.828125, + "learning_rate": 0.0002964229708428413, + "loss": 0.2244, + "step": 319140 + }, + { + "epoch": 13.22, + "grad_norm": 0.5234375, + "learning_rate": 0.00029641231422373906, + "loss": 0.1825, + "step": 319150 + }, + { + "epoch": 13.22, + "grad_norm": 0.62109375, + "learning_rate": 0.00029640165751729377, + "loss": 0.189, + "step": 319160 + }, + { + "epoch": 13.22, + "grad_norm": 1.3125, + "learning_rate": 0.0002963910007235256, + "loss": 0.2031, + "step": 319170 + }, + { + "epoch": 13.22, + "grad_norm": 0.380859375, + "learning_rate": 0.0002963803438424546, + "loss": 0.1659, + "step": 319180 + }, + { + "epoch": 13.22, + "grad_norm": 0.62109375, + "learning_rate": 0.00029636968687410083, + "loss": 0.2077, + "step": 319190 + }, + { + "epoch": 13.22, + "grad_norm": 1.328125, + "learning_rate": 0.00029635902981848427, + "loss": 0.14, + "step": 319200 + }, + { + "epoch": 13.22, + "grad_norm": 0.333984375, + "learning_rate": 0.00029634837267562505, + "loss": 0.1581, + "step": 319210 + }, + { + "epoch": 13.22, + "grad_norm": 1.4921875, + "learning_rate": 0.0002963377154455431, + "loss": 0.198, + "step": 319220 + }, + { + "epoch": 13.22, + "grad_norm": 0.6875, + "learning_rate": 0.00029632705812825865, + "loss": 0.2303, + "step": 319230 + }, + { + "epoch": 13.22, + "grad_norm": 1.03125, + "learning_rate": 0.00029631640072379174, + "loss": 0.171, + "step": 319240 + }, + { + "epoch": 13.22, + "grad_norm": 1.4375, + "learning_rate": 0.0002963057432321623, + "loss": 0.183, + "step": 319250 + }, + { + "epoch": 13.22, + "grad_norm": 1.1015625, + "learning_rate": 0.0002962950856533905, + "loss": 0.207, + "step": 319260 + }, + { + "epoch": 13.22, + "grad_norm": 0.5859375, + "learning_rate": 0.0002962844279874963, + "loss": 0.2236, + "step": 319270 + }, + { + "epoch": 13.22, + "grad_norm": 0.9296875, + "learning_rate": 0.0002962737702344998, + "loss": 0.168, + "step": 319280 + }, + { + "epoch": 13.22, + "grad_norm": 1.96875, + "learning_rate": 0.00029626311239442103, + "loss": 0.2354, + "step": 319290 + }, + { + "epoch": 13.23, + "grad_norm": 1.6796875, + "learning_rate": 0.00029625245446728016, + "loss": 0.1467, + "step": 319300 + }, + { + "epoch": 13.23, + "grad_norm": 1.078125, + "learning_rate": 0.00029624179645309714, + "loss": 0.1504, + "step": 319310 + }, + { + "epoch": 13.23, + "grad_norm": 0.423828125, + "learning_rate": 0.00029623113835189205, + "loss": 0.1664, + "step": 319320 + }, + { + "epoch": 13.23, + "grad_norm": 1.4453125, + "learning_rate": 0.0002962204801636849, + "loss": 0.2493, + "step": 319330 + }, + { + "epoch": 13.23, + "grad_norm": 1.2578125, + "learning_rate": 0.00029620982188849587, + "loss": 0.2061, + "step": 319340 + }, + { + "epoch": 13.23, + "grad_norm": 0.419921875, + "learning_rate": 0.00029619916352634495, + "loss": 0.1259, + "step": 319350 + }, + { + "epoch": 13.23, + "grad_norm": 1.140625, + "learning_rate": 0.0002961885050772521, + "loss": 0.1831, + "step": 319360 + }, + { + "epoch": 13.23, + "grad_norm": 0.81640625, + "learning_rate": 0.00029617784654123756, + "loss": 0.1954, + "step": 319370 + }, + { + "epoch": 13.23, + "grad_norm": 0.44921875, + "learning_rate": 0.00029616718791832127, + "loss": 0.1659, + "step": 319380 + }, + { + "epoch": 13.23, + "grad_norm": 0.421875, + "learning_rate": 0.00029615652920852325, + "loss": 0.1518, + "step": 319390 + }, + { + "epoch": 13.23, + "grad_norm": 0.58984375, + "learning_rate": 0.0002961458704118637, + "loss": 0.1858, + "step": 319400 + }, + { + "epoch": 13.23, + "grad_norm": 0.0, + "learning_rate": 0.00029613521152836264, + "loss": 0.1845, + "step": 319410 + }, + { + "epoch": 13.23, + "grad_norm": 0.8515625, + "learning_rate": 0.00029612455255804005, + "loss": 0.1854, + "step": 319420 + }, + { + "epoch": 13.23, + "grad_norm": 0.859375, + "learning_rate": 0.00029611389350091613, + "loss": 0.173, + "step": 319430 + }, + { + "epoch": 13.23, + "grad_norm": 0.7265625, + "learning_rate": 0.00029610323435701067, + "loss": 0.223, + "step": 319440 + }, + { + "epoch": 13.23, + "grad_norm": 0.63671875, + "learning_rate": 0.000296092575126344, + "loss": 0.146, + "step": 319450 + }, + { + "epoch": 13.23, + "grad_norm": 1.21875, + "learning_rate": 0.0002960819158089361, + "loss": 0.1573, + "step": 319460 + }, + { + "epoch": 13.23, + "grad_norm": 0.5, + "learning_rate": 0.0002960712564048069, + "loss": 0.1838, + "step": 319470 + }, + { + "epoch": 13.23, + "grad_norm": 0.53515625, + "learning_rate": 0.0002960605969139767, + "loss": 0.188, + "step": 319480 + }, + { + "epoch": 13.23, + "grad_norm": 0.81640625, + "learning_rate": 0.00029604993733646533, + "loss": 0.2234, + "step": 319490 + }, + { + "epoch": 13.23, + "grad_norm": 1.09375, + "learning_rate": 0.000296039277672293, + "loss": 0.1746, + "step": 319500 + }, + { + "epoch": 13.23, + "grad_norm": 1.8203125, + "learning_rate": 0.0002960286179214797, + "loss": 0.2059, + "step": 319510 + }, + { + "epoch": 13.23, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002960179580840455, + "loss": 0.1792, + "step": 319520 + }, + { + "epoch": 13.23, + "grad_norm": 0.6640625, + "learning_rate": 0.00029600729816001056, + "loss": 0.2205, + "step": 319530 + }, + { + "epoch": 13.24, + "grad_norm": 0.0, + "learning_rate": 0.00029599663814939474, + "loss": 0.1576, + "step": 319540 + }, + { + "epoch": 13.24, + "grad_norm": 0.921875, + "learning_rate": 0.0002959859780522183, + "loss": 0.1585, + "step": 319550 + }, + { + "epoch": 13.24, + "grad_norm": 0.87109375, + "learning_rate": 0.00029597531786850124, + "loss": 0.1983, + "step": 319560 + }, + { + "epoch": 13.24, + "grad_norm": 0.482421875, + "learning_rate": 0.0002959646575982635, + "loss": 0.1883, + "step": 319570 + }, + { + "epoch": 13.24, + "grad_norm": 1.2421875, + "learning_rate": 0.00029595399724152526, + "loss": 0.1615, + "step": 319580 + }, + { + "epoch": 13.24, + "grad_norm": 0.89453125, + "learning_rate": 0.00029594333679830665, + "loss": 0.1109, + "step": 319590 + }, + { + "epoch": 13.24, + "grad_norm": 0.8125, + "learning_rate": 0.00029593267626862747, + "loss": 0.1972, + "step": 319600 + }, + { + "epoch": 13.24, + "grad_norm": 0.455078125, + "learning_rate": 0.0002959220156525081, + "loss": 0.2223, + "step": 319610 + }, + { + "epoch": 13.24, + "grad_norm": 1.15625, + "learning_rate": 0.0002959113549499684, + "loss": 0.1847, + "step": 319620 + }, + { + "epoch": 13.24, + "grad_norm": 0.55859375, + "learning_rate": 0.0002959006941610285, + "loss": 0.2027, + "step": 319630 + }, + { + "epoch": 13.24, + "grad_norm": 1.1484375, + "learning_rate": 0.00029589003328570843, + "loss": 0.193, + "step": 319640 + }, + { + "epoch": 13.24, + "grad_norm": 0.66796875, + "learning_rate": 0.0002958793723240283, + "loss": 0.2008, + "step": 319650 + }, + { + "epoch": 13.24, + "grad_norm": 0.76953125, + "learning_rate": 0.0002958687112760081, + "loss": 0.226, + "step": 319660 + }, + { + "epoch": 13.24, + "grad_norm": 0.56640625, + "learning_rate": 0.000295858050141668, + "loss": 0.1976, + "step": 319670 + }, + { + "epoch": 13.24, + "grad_norm": 0.57421875, + "learning_rate": 0.000295847388921028, + "loss": 0.1348, + "step": 319680 + }, + { + "epoch": 13.24, + "grad_norm": 0.671875, + "learning_rate": 0.00029583672761410813, + "loss": 0.1995, + "step": 319690 + }, + { + "epoch": 13.24, + "grad_norm": 0.060546875, + "learning_rate": 0.0002958260662209285, + "loss": 0.1315, + "step": 319700 + }, + { + "epoch": 13.24, + "grad_norm": 0.490234375, + "learning_rate": 0.00029581540474150913, + "loss": 0.1759, + "step": 319710 + }, + { + "epoch": 13.24, + "grad_norm": 0.7734375, + "learning_rate": 0.00029580474317587017, + "loss": 0.2241, + "step": 319720 + }, + { + "epoch": 13.24, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002957940815240316, + "loss": 0.1568, + "step": 319730 + }, + { + "epoch": 13.24, + "grad_norm": 0.8515625, + "learning_rate": 0.00029578341978601355, + "loss": 0.1842, + "step": 319740 + }, + { + "epoch": 13.24, + "grad_norm": 0.84765625, + "learning_rate": 0.000295772757961836, + "loss": 0.1917, + "step": 319750 + }, + { + "epoch": 13.24, + "grad_norm": 0.6328125, + "learning_rate": 0.00029576209605151904, + "loss": 0.1549, + "step": 319760 + }, + { + "epoch": 13.24, + "grad_norm": 1.2890625, + "learning_rate": 0.00029575143405508286, + "loss": 0.2758, + "step": 319770 + }, + { + "epoch": 13.25, + "grad_norm": 0.5703125, + "learning_rate": 0.00029574077197254737, + "loss": 0.1959, + "step": 319780 + }, + { + "epoch": 13.25, + "grad_norm": 2.234375, + "learning_rate": 0.00029573010980393267, + "loss": 0.1969, + "step": 319790 + }, + { + "epoch": 13.25, + "grad_norm": 0.5625, + "learning_rate": 0.0002957194475492589, + "loss": 0.1329, + "step": 319800 + }, + { + "epoch": 13.25, + "grad_norm": 0.87109375, + "learning_rate": 0.00029570878520854597, + "loss": 0.2427, + "step": 319810 + }, + { + "epoch": 13.25, + "grad_norm": 0.84765625, + "learning_rate": 0.00029569812278181407, + "loss": 0.2043, + "step": 319820 + }, + { + "epoch": 13.25, + "grad_norm": 0.0, + "learning_rate": 0.00029568746026908325, + "loss": 0.1874, + "step": 319830 + }, + { + "epoch": 13.25, + "grad_norm": 0.765625, + "learning_rate": 0.0002956767976703736, + "loss": 0.1761, + "step": 319840 + }, + { + "epoch": 13.25, + "grad_norm": 1.3671875, + "learning_rate": 0.00029566613498570516, + "loss": 0.234, + "step": 319850 + }, + { + "epoch": 13.25, + "grad_norm": 0.7109375, + "learning_rate": 0.00029565547221509794, + "loss": 0.1769, + "step": 319860 + }, + { + "epoch": 13.25, + "grad_norm": 1.40625, + "learning_rate": 0.00029564480935857204, + "loss": 0.2056, + "step": 319870 + }, + { + "epoch": 13.25, + "grad_norm": 0.97265625, + "learning_rate": 0.00029563414641614757, + "loss": 0.1897, + "step": 319880 + }, + { + "epoch": 13.25, + "grad_norm": 0.40234375, + "learning_rate": 0.0002956234833878445, + "loss": 0.1786, + "step": 319890 + }, + { + "epoch": 13.25, + "grad_norm": 0.70703125, + "learning_rate": 0.000295612820273683, + "loss": 0.234, + "step": 319900 + }, + { + "epoch": 13.25, + "grad_norm": 0.46875, + "learning_rate": 0.0002956021570736831, + "loss": 0.1645, + "step": 319910 + }, + { + "epoch": 13.25, + "grad_norm": 1.28125, + "learning_rate": 0.00029559149378786487, + "loss": 0.2482, + "step": 319920 + }, + { + "epoch": 13.25, + "grad_norm": 0.6484375, + "learning_rate": 0.00029558083041624845, + "loss": 0.2215, + "step": 319930 + }, + { + "epoch": 13.25, + "grad_norm": 0.64453125, + "learning_rate": 0.00029557016695885374, + "loss": 0.202, + "step": 319940 + }, + { + "epoch": 13.25, + "grad_norm": 1.09375, + "learning_rate": 0.00029555950341570087, + "loss": 0.1996, + "step": 319950 + }, + { + "epoch": 13.25, + "grad_norm": 1.125, + "learning_rate": 0.00029554883978681, + "loss": 0.1793, + "step": 319960 + }, + { + "epoch": 13.25, + "grad_norm": 0.0, + "learning_rate": 0.00029553817607220106, + "loss": 0.1766, + "step": 319970 + }, + { + "epoch": 13.25, + "grad_norm": 0.6796875, + "learning_rate": 0.0002955275122718943, + "loss": 0.2029, + "step": 319980 + }, + { + "epoch": 13.25, + "grad_norm": 1.0, + "learning_rate": 0.0002955168483859096, + "loss": 0.1829, + "step": 319990 + }, + { + "epoch": 13.25, + "grad_norm": 1.046875, + "learning_rate": 0.00029550618441426713, + "loss": 0.2073, + "step": 320000 + }, + { + "epoch": 13.25, + "grad_norm": 1.09375, + "learning_rate": 0.00029549552035698694, + "loss": 0.1759, + "step": 320010 + }, + { + "epoch": 13.26, + "grad_norm": 0.65234375, + "learning_rate": 0.0002954848562140891, + "loss": 0.2168, + "step": 320020 + }, + { + "epoch": 13.26, + "grad_norm": 0.9609375, + "learning_rate": 0.00029547419198559365, + "loss": 0.1665, + "step": 320030 + }, + { + "epoch": 13.26, + "grad_norm": 1.921875, + "learning_rate": 0.0002954635276715207, + "loss": 0.2167, + "step": 320040 + }, + { + "epoch": 13.26, + "grad_norm": 0.92578125, + "learning_rate": 0.00029545286327189025, + "loss": 0.2334, + "step": 320050 + }, + { + "epoch": 13.26, + "grad_norm": 0.58203125, + "learning_rate": 0.0002954421987867225, + "loss": 0.2019, + "step": 320060 + }, + { + "epoch": 13.26, + "grad_norm": 1.8203125, + "learning_rate": 0.00029543153421603737, + "loss": 0.1731, + "step": 320070 + }, + { + "epoch": 13.26, + "grad_norm": 0.58203125, + "learning_rate": 0.000295420869559855, + "loss": 0.1734, + "step": 320080 + }, + { + "epoch": 13.26, + "grad_norm": 1.0078125, + "learning_rate": 0.0002954102048181955, + "loss": 0.1939, + "step": 320090 + }, + { + "epoch": 13.26, + "grad_norm": 1.5859375, + "learning_rate": 0.00029539953999107893, + "loss": 0.2015, + "step": 320100 + }, + { + "epoch": 13.26, + "grad_norm": 0.46875, + "learning_rate": 0.00029538887507852527, + "loss": 0.175, + "step": 320110 + }, + { + "epoch": 13.26, + "grad_norm": 0.83984375, + "learning_rate": 0.0002953782100805547, + "loss": 0.1772, + "step": 320120 + }, + { + "epoch": 13.26, + "grad_norm": 0.51171875, + "learning_rate": 0.0002953675449971871, + "loss": 0.1932, + "step": 320130 + }, + { + "epoch": 13.26, + "grad_norm": 1.5625, + "learning_rate": 0.00029535687982844283, + "loss": 0.2073, + "step": 320140 + }, + { + "epoch": 13.26, + "grad_norm": 0.91796875, + "learning_rate": 0.00029534621457434173, + "loss": 0.1872, + "step": 320150 + }, + { + "epoch": 13.26, + "grad_norm": 1.203125, + "learning_rate": 0.00029533554923490395, + "loss": 0.1635, + "step": 320160 + }, + { + "epoch": 13.26, + "grad_norm": 1.078125, + "learning_rate": 0.00029532488381014965, + "loss": 0.1751, + "step": 320170 + }, + { + "epoch": 13.26, + "grad_norm": 0.7578125, + "learning_rate": 0.0002953142183000987, + "loss": 0.1883, + "step": 320180 + }, + { + "epoch": 13.26, + "grad_norm": 1.765625, + "learning_rate": 0.0002953035527047714, + "loss": 0.1477, + "step": 320190 + }, + { + "epoch": 13.26, + "grad_norm": 0.60546875, + "learning_rate": 0.0002952928870241876, + "loss": 0.1689, + "step": 320200 + }, + { + "epoch": 13.26, + "grad_norm": 0.11962890625, + "learning_rate": 0.00029528222125836754, + "loss": 0.1764, + "step": 320210 + }, + { + "epoch": 13.26, + "grad_norm": 0.8515625, + "learning_rate": 0.00029527155540733124, + "loss": 0.205, + "step": 320220 + }, + { + "epoch": 13.26, + "grad_norm": 1.1328125, + "learning_rate": 0.00029526088947109876, + "loss": 0.1701, + "step": 320230 + }, + { + "epoch": 13.26, + "grad_norm": 0.66015625, + "learning_rate": 0.0002952502234496901, + "loss": 0.2078, + "step": 320240 + }, + { + "epoch": 13.26, + "grad_norm": 1.140625, + "learning_rate": 0.0002952395573431255, + "loss": 0.1986, + "step": 320250 + }, + { + "epoch": 13.27, + "grad_norm": 0.4609375, + "learning_rate": 0.0002952288911514248, + "loss": 0.1665, + "step": 320260 + }, + { + "epoch": 13.27, + "grad_norm": 1.1171875, + "learning_rate": 0.0002952182248746084, + "loss": 0.2173, + "step": 320270 + }, + { + "epoch": 13.27, + "grad_norm": 1.7734375, + "learning_rate": 0.0002952075585126961, + "loss": 0.167, + "step": 320280 + }, + { + "epoch": 13.27, + "grad_norm": 0.427734375, + "learning_rate": 0.000295196892065708, + "loss": 0.1221, + "step": 320290 + }, + { + "epoch": 13.27, + "grad_norm": 1.125, + "learning_rate": 0.0002951862255336643, + "loss": 0.16, + "step": 320300 + }, + { + "epoch": 13.27, + "grad_norm": 0.8515625, + "learning_rate": 0.000295175558916585, + "loss": 0.2178, + "step": 320310 + }, + { + "epoch": 13.27, + "grad_norm": 1.2421875, + "learning_rate": 0.0002951648922144901, + "loss": 0.1608, + "step": 320320 + }, + { + "epoch": 13.27, + "grad_norm": 1.0234375, + "learning_rate": 0.0002951542254273999, + "loss": 0.2307, + "step": 320330 + }, + { + "epoch": 13.27, + "grad_norm": 1.125, + "learning_rate": 0.00029514355855533414, + "loss": 0.2013, + "step": 320340 + }, + { + "epoch": 13.27, + "grad_norm": 0.62109375, + "learning_rate": 0.00029513289159831317, + "loss": 0.1841, + "step": 320350 + }, + { + "epoch": 13.27, + "grad_norm": 0.5234375, + "learning_rate": 0.000295122224556357, + "loss": 0.2005, + "step": 320360 + }, + { + "epoch": 13.27, + "grad_norm": 0.51171875, + "learning_rate": 0.00029511155742948563, + "loss": 0.2227, + "step": 320370 + }, + { + "epoch": 13.27, + "grad_norm": 1.9921875, + "learning_rate": 0.0002951008902177192, + "loss": 0.1995, + "step": 320380 + }, + { + "epoch": 13.27, + "grad_norm": 1.7578125, + "learning_rate": 0.0002950902229210778, + "loss": 0.1858, + "step": 320390 + }, + { + "epoch": 13.27, + "grad_norm": 1.0078125, + "learning_rate": 0.0002950795555395814, + "loss": 0.1982, + "step": 320400 + }, + { + "epoch": 13.27, + "grad_norm": 1.4296875, + "learning_rate": 0.00029506888807325014, + "loss": 0.1983, + "step": 320410 + }, + { + "epoch": 13.27, + "grad_norm": 1.4765625, + "learning_rate": 0.0002950582205221041, + "loss": 0.1966, + "step": 320420 + }, + { + "epoch": 13.27, + "grad_norm": 0.357421875, + "learning_rate": 0.0002950475528861635, + "loss": 0.1949, + "step": 320430 + }, + { + "epoch": 13.27, + "grad_norm": 1.0625, + "learning_rate": 0.00029503688516544814, + "loss": 0.1771, + "step": 320440 + }, + { + "epoch": 13.27, + "grad_norm": 0.890625, + "learning_rate": 0.00029502621735997826, + "loss": 0.1777, + "step": 320450 + }, + { + "epoch": 13.27, + "grad_norm": 0.3984375, + "learning_rate": 0.00029501554946977386, + "loss": 0.1502, + "step": 320460 + }, + { + "epoch": 13.27, + "grad_norm": 0.6171875, + "learning_rate": 0.0002950048814948551, + "loss": 0.1508, + "step": 320470 + }, + { + "epoch": 13.27, + "grad_norm": 0.66796875, + "learning_rate": 0.000294994213435242, + "loss": 0.201, + "step": 320480 + }, + { + "epoch": 13.27, + "grad_norm": 0.515625, + "learning_rate": 0.0002949835452909546, + "loss": 0.1931, + "step": 320490 + }, + { + "epoch": 13.28, + "grad_norm": 0.7109375, + "learning_rate": 0.0002949728770620131, + "loss": 0.1754, + "step": 320500 + }, + { + "epoch": 13.28, + "grad_norm": 0.91015625, + "learning_rate": 0.0002949622087484375, + "loss": 0.2308, + "step": 320510 + }, + { + "epoch": 13.28, + "grad_norm": 0.470703125, + "learning_rate": 0.00029495154035024784, + "loss": 0.1547, + "step": 320520 + }, + { + "epoch": 13.28, + "grad_norm": 1.5703125, + "learning_rate": 0.00029494087186746423, + "loss": 0.1979, + "step": 320530 + }, + { + "epoch": 13.28, + "grad_norm": 0.93359375, + "learning_rate": 0.0002949302033001068, + "loss": 0.1697, + "step": 320540 + }, + { + "epoch": 13.28, + "grad_norm": 3.28125, + "learning_rate": 0.00029491953464819555, + "loss": 0.2097, + "step": 320550 + }, + { + "epoch": 13.28, + "grad_norm": 1.1640625, + "learning_rate": 0.0002949088659117506, + "loss": 0.1601, + "step": 320560 + }, + { + "epoch": 13.28, + "grad_norm": 1.2578125, + "learning_rate": 0.00029489819709079196, + "loss": 0.1984, + "step": 320570 + }, + { + "epoch": 13.28, + "grad_norm": 1.2734375, + "learning_rate": 0.00029488752818533984, + "loss": 0.1998, + "step": 320580 + }, + { + "epoch": 13.28, + "grad_norm": 0.4453125, + "learning_rate": 0.00029487685919541416, + "loss": 0.1502, + "step": 320590 + }, + { + "epoch": 13.28, + "grad_norm": 0.54296875, + "learning_rate": 0.00029486619012103515, + "loss": 0.1831, + "step": 320600 + }, + { + "epoch": 13.28, + "grad_norm": 1.171875, + "learning_rate": 0.0002948555209622228, + "loss": 0.2143, + "step": 320610 + }, + { + "epoch": 13.28, + "grad_norm": 1.953125, + "learning_rate": 0.00029484485171899714, + "loss": 0.1635, + "step": 320620 + }, + { + "epoch": 13.28, + "grad_norm": 0.4453125, + "learning_rate": 0.0002948341823913784, + "loss": 0.2243, + "step": 320630 + }, + { + "epoch": 13.28, + "grad_norm": 0.96484375, + "learning_rate": 0.00029482351297938655, + "loss": 0.1805, + "step": 320640 + }, + { + "epoch": 13.28, + "grad_norm": 0.5390625, + "learning_rate": 0.00029481284348304164, + "loss": 0.1541, + "step": 320650 + }, + { + "epoch": 13.28, + "grad_norm": 1.046875, + "learning_rate": 0.0002948021739023638, + "loss": 0.1826, + "step": 320660 + }, + { + "epoch": 13.28, + "grad_norm": 0.39453125, + "learning_rate": 0.00029479150423737324, + "loss": 0.1971, + "step": 320670 + }, + { + "epoch": 13.28, + "grad_norm": 0.41015625, + "learning_rate": 0.0002947808344880897, + "loss": 0.215, + "step": 320680 + }, + { + "epoch": 13.28, + "grad_norm": 0.59375, + "learning_rate": 0.0002947701646545336, + "loss": 0.1875, + "step": 320690 + }, + { + "epoch": 13.28, + "grad_norm": 0.6953125, + "learning_rate": 0.0002947594947367248, + "loss": 0.1652, + "step": 320700 + }, + { + "epoch": 13.28, + "grad_norm": 0.0, + "learning_rate": 0.0002947488247346836, + "loss": 0.2134, + "step": 320710 + }, + { + "epoch": 13.28, + "grad_norm": 0.859375, + "learning_rate": 0.00029473815464842987, + "loss": 0.1979, + "step": 320720 + }, + { + "epoch": 13.28, + "grad_norm": 0.83203125, + "learning_rate": 0.0002947274844779837, + "loss": 0.1965, + "step": 320730 + }, + { + "epoch": 13.29, + "grad_norm": 0.5390625, + "learning_rate": 0.00029471681422336526, + "loss": 0.2085, + "step": 320740 + }, + { + "epoch": 13.29, + "grad_norm": 0.82421875, + "learning_rate": 0.00029470614388459466, + "loss": 0.1741, + "step": 320750 + }, + { + "epoch": 13.29, + "grad_norm": 0.90234375, + "learning_rate": 0.00029469547346169193, + "loss": 0.1966, + "step": 320760 + }, + { + "epoch": 13.29, + "grad_norm": 1.0078125, + "learning_rate": 0.0002946848029546771, + "loss": 0.2679, + "step": 320770 + }, + { + "epoch": 13.29, + "grad_norm": 0.46484375, + "learning_rate": 0.00029467413236357033, + "loss": 0.1849, + "step": 320780 + }, + { + "epoch": 13.29, + "grad_norm": 1.0, + "learning_rate": 0.00029466346168839164, + "loss": 0.2098, + "step": 320790 + }, + { + "epoch": 13.29, + "grad_norm": 0.5859375, + "learning_rate": 0.00029465279092916115, + "loss": 0.2144, + "step": 320800 + }, + { + "epoch": 13.29, + "grad_norm": 0.7578125, + "learning_rate": 0.0002946421200858989, + "loss": 0.196, + "step": 320810 + }, + { + "epoch": 13.29, + "grad_norm": 1.8359375, + "learning_rate": 0.00029463144915862507, + "loss": 0.1952, + "step": 320820 + }, + { + "epoch": 13.29, + "grad_norm": 1.1953125, + "learning_rate": 0.0002946207781473596, + "loss": 0.2189, + "step": 320830 + }, + { + "epoch": 13.29, + "grad_norm": 1.5625, + "learning_rate": 0.0002946101070521227, + "loss": 0.2217, + "step": 320840 + }, + { + "epoch": 13.29, + "grad_norm": 0.734375, + "learning_rate": 0.00029459943587293436, + "loss": 0.1564, + "step": 320850 + }, + { + "epoch": 13.29, + "grad_norm": 0.984375, + "learning_rate": 0.00029458876460981473, + "loss": 0.1913, + "step": 320860 + }, + { + "epoch": 13.29, + "grad_norm": 0.82421875, + "learning_rate": 0.0002945780932627838, + "loss": 0.2351, + "step": 320870 + }, + { + "epoch": 13.29, + "grad_norm": 1.4921875, + "learning_rate": 0.0002945674218318618, + "loss": 0.2315, + "step": 320880 + }, + { + "epoch": 13.29, + "grad_norm": 0.9375, + "learning_rate": 0.00029455675031706864, + "loss": 0.1937, + "step": 320890 + }, + { + "epoch": 13.29, + "grad_norm": 0.359375, + "learning_rate": 0.00029454607871842454, + "loss": 0.1995, + "step": 320900 + }, + { + "epoch": 13.29, + "grad_norm": 1.1875, + "learning_rate": 0.00029453540703594945, + "loss": 0.1634, + "step": 320910 + }, + { + "epoch": 13.29, + "grad_norm": 1.109375, + "learning_rate": 0.0002945247352696637, + "loss": 0.1702, + "step": 320920 + }, + { + "epoch": 13.29, + "grad_norm": 0.93359375, + "learning_rate": 0.000294514063419587, + "loss": 0.1214, + "step": 320930 + }, + { + "epoch": 13.29, + "grad_norm": 3.265625, + "learning_rate": 0.0002945033914857398, + "loss": 0.1631, + "step": 320940 + }, + { + "epoch": 13.29, + "grad_norm": 0.296875, + "learning_rate": 0.000294492719468142, + "loss": 0.1938, + "step": 320950 + }, + { + "epoch": 13.29, + "grad_norm": 0.6171875, + "learning_rate": 0.0002944820473668136, + "loss": 0.2198, + "step": 320960 + }, + { + "epoch": 13.29, + "grad_norm": 0.65625, + "learning_rate": 0.0002944713751817749, + "loss": 0.2179, + "step": 320970 + }, + { + "epoch": 13.29, + "grad_norm": 0.7421875, + "learning_rate": 0.0002944607029130458, + "loss": 0.1341, + "step": 320980 + }, + { + "epoch": 13.3, + "grad_norm": 1.296875, + "learning_rate": 0.0002944500305606465, + "loss": 0.2132, + "step": 320990 + }, + { + "epoch": 13.3, + "grad_norm": 1.8125, + "learning_rate": 0.0002944393581245971, + "loss": 0.1687, + "step": 321000 + }, + { + "epoch": 13.3, + "grad_norm": 1.0234375, + "learning_rate": 0.0002944286856049175, + "loss": 0.2023, + "step": 321010 + }, + { + "epoch": 13.3, + "grad_norm": 0.53125, + "learning_rate": 0.000294418013001628, + "loss": 0.1596, + "step": 321020 + }, + { + "epoch": 13.3, + "grad_norm": 1.25, + "learning_rate": 0.0002944073403147486, + "loss": 0.1533, + "step": 321030 + }, + { + "epoch": 13.3, + "grad_norm": 0.69140625, + "learning_rate": 0.0002943966675442993, + "loss": 0.2509, + "step": 321040 + }, + { + "epoch": 13.3, + "grad_norm": 0.3046875, + "learning_rate": 0.0002943859946903004, + "loss": 0.2031, + "step": 321050 + }, + { + "epoch": 13.3, + "grad_norm": 1.1015625, + "learning_rate": 0.0002943753217527717, + "loss": 0.2345, + "step": 321060 + }, + { + "epoch": 13.3, + "grad_norm": 0.93359375, + "learning_rate": 0.0002943646487317335, + "loss": 0.1841, + "step": 321070 + }, + { + "epoch": 13.3, + "grad_norm": 0.9140625, + "learning_rate": 0.0002943539756272059, + "loss": 0.1601, + "step": 321080 + }, + { + "epoch": 13.3, + "grad_norm": 1.8984375, + "learning_rate": 0.0002943433024392088, + "loss": 0.2071, + "step": 321090 + }, + { + "epoch": 13.3, + "grad_norm": 0.6953125, + "learning_rate": 0.00029433262916776245, + "loss": 0.118, + "step": 321100 + }, + { + "epoch": 13.3, + "grad_norm": 0.80078125, + "learning_rate": 0.0002943219558128869, + "loss": 0.162, + "step": 321110 + }, + { + "epoch": 13.3, + "grad_norm": 1.0390625, + "learning_rate": 0.00029431128237460206, + "loss": 0.1573, + "step": 321120 + }, + { + "epoch": 13.3, + "grad_norm": 0.50390625, + "learning_rate": 0.00029430060885292836, + "loss": 0.1813, + "step": 321130 + }, + { + "epoch": 13.3, + "grad_norm": 0.84765625, + "learning_rate": 0.00029428993524788563, + "loss": 0.1723, + "step": 321140 + }, + { + "epoch": 13.3, + "grad_norm": 0.63671875, + "learning_rate": 0.00029427926155949405, + "loss": 0.1372, + "step": 321150 + }, + { + "epoch": 13.3, + "grad_norm": 0.86328125, + "learning_rate": 0.0002942685877877737, + "loss": 0.2303, + "step": 321160 + }, + { + "epoch": 13.3, + "grad_norm": 1.3203125, + "learning_rate": 0.0002942579139327445, + "loss": 0.2424, + "step": 321170 + }, + { + "epoch": 13.3, + "grad_norm": 0.62109375, + "learning_rate": 0.0002942472399944268, + "loss": 0.1514, + "step": 321180 + }, + { + "epoch": 13.3, + "grad_norm": 1.0078125, + "learning_rate": 0.0002942365659728406, + "loss": 0.2127, + "step": 321190 + }, + { + "epoch": 13.3, + "grad_norm": 0.5703125, + "learning_rate": 0.00029422589186800584, + "loss": 0.1572, + "step": 321200 + }, + { + "epoch": 13.3, + "grad_norm": 0.84765625, + "learning_rate": 0.0002942152176799429, + "loss": 0.166, + "step": 321210 + }, + { + "epoch": 13.3, + "grad_norm": 0.333984375, + "learning_rate": 0.0002942045434086715, + "loss": 0.146, + "step": 321220 + }, + { + "epoch": 13.31, + "grad_norm": 0.66015625, + "learning_rate": 0.0002941938690542121, + "loss": 0.1462, + "step": 321230 + }, + { + "epoch": 13.31, + "grad_norm": 0.71875, + "learning_rate": 0.00029418319461658457, + "loss": 0.1594, + "step": 321240 + }, + { + "epoch": 13.31, + "grad_norm": 0.8359375, + "learning_rate": 0.0002941725200958089, + "loss": 0.1825, + "step": 321250 + }, + { + "epoch": 13.31, + "grad_norm": 0.90625, + "learning_rate": 0.0002941618454919055, + "loss": 0.1951, + "step": 321260 + }, + { + "epoch": 13.31, + "grad_norm": 0.85546875, + "learning_rate": 0.00029415117080489425, + "loss": 0.2325, + "step": 321270 + }, + { + "epoch": 13.31, + "grad_norm": 0.61328125, + "learning_rate": 0.0002941404960347951, + "loss": 0.2283, + "step": 321280 + }, + { + "epoch": 13.31, + "grad_norm": 0.80078125, + "learning_rate": 0.00029412982118162846, + "loss": 0.1871, + "step": 321290 + }, + { + "epoch": 13.31, + "grad_norm": 0.953125, + "learning_rate": 0.00029411914624541424, + "loss": 0.2116, + "step": 321300 + }, + { + "epoch": 13.31, + "grad_norm": 0.7734375, + "learning_rate": 0.0002941084712261725, + "loss": 0.1933, + "step": 321310 + }, + { + "epoch": 13.31, + "grad_norm": 1.03125, + "learning_rate": 0.00029409779612392346, + "loss": 0.1654, + "step": 321320 + }, + { + "epoch": 13.31, + "grad_norm": 2.078125, + "learning_rate": 0.000294087120938687, + "loss": 0.2027, + "step": 321330 + }, + { + "epoch": 13.31, + "grad_norm": 1.4296875, + "learning_rate": 0.0002940764456704835, + "loss": 0.1428, + "step": 321340 + }, + { + "epoch": 13.31, + "grad_norm": 1.015625, + "learning_rate": 0.0002940657703193328, + "loss": 0.2128, + "step": 321350 + }, + { + "epoch": 13.31, + "grad_norm": 1.1796875, + "learning_rate": 0.0002940550948852551, + "loss": 0.2419, + "step": 321360 + }, + { + "epoch": 13.31, + "grad_norm": 0.68359375, + "learning_rate": 0.0002940444193682704, + "loss": 0.2257, + "step": 321370 + }, + { + "epoch": 13.31, + "grad_norm": 0.484375, + "learning_rate": 0.00029403374376839894, + "loss": 0.162, + "step": 321380 + }, + { + "epoch": 13.31, + "grad_norm": 0.6640625, + "learning_rate": 0.0002940230680856607, + "loss": 0.1662, + "step": 321390 + }, + { + "epoch": 13.31, + "grad_norm": 0.5546875, + "learning_rate": 0.0002940123923200758, + "loss": 0.1941, + "step": 321400 + }, + { + "epoch": 13.31, + "grad_norm": 0.458984375, + "learning_rate": 0.0002940017164716643, + "loss": 0.2027, + "step": 321410 + }, + { + "epoch": 13.31, + "grad_norm": 0.52734375, + "learning_rate": 0.0002939910405404464, + "loss": 0.2199, + "step": 321420 + }, + { + "epoch": 13.31, + "grad_norm": 0.93359375, + "learning_rate": 0.0002939803645264421, + "loss": 0.1748, + "step": 321430 + }, + { + "epoch": 13.31, + "grad_norm": 1.3828125, + "learning_rate": 0.00029396968842967144, + "loss": 0.2032, + "step": 321440 + }, + { + "epoch": 13.31, + "grad_norm": 0.50390625, + "learning_rate": 0.00029395901225015466, + "loss": 0.2287, + "step": 321450 + }, + { + "epoch": 13.31, + "grad_norm": 0.6015625, + "learning_rate": 0.00029394833598791163, + "loss": 0.2009, + "step": 321460 + }, + { + "epoch": 13.32, + "grad_norm": 1.1171875, + "learning_rate": 0.00029393765964296273, + "loss": 0.2002, + "step": 321470 + }, + { + "epoch": 13.32, + "grad_norm": 0.77734375, + "learning_rate": 0.00029392698321532786, + "loss": 0.1458, + "step": 321480 + }, + { + "epoch": 13.32, + "grad_norm": 0.765625, + "learning_rate": 0.000293916306705027, + "loss": 0.1852, + "step": 321490 + }, + { + "epoch": 13.32, + "grad_norm": 0.498046875, + "learning_rate": 0.0002939056301120805, + "loss": 0.1869, + "step": 321500 + }, + { + "epoch": 13.32, + "grad_norm": 0.494140625, + "learning_rate": 0.0002938949534365084, + "loss": 0.1375, + "step": 321510 + }, + { + "epoch": 13.32, + "grad_norm": 0.9375, + "learning_rate": 0.00029388427667833075, + "loss": 0.2319, + "step": 321520 + }, + { + "epoch": 13.32, + "grad_norm": 0.62890625, + "learning_rate": 0.0002938735998375676, + "loss": 0.1904, + "step": 321530 + }, + { + "epoch": 13.32, + "grad_norm": 0.6171875, + "learning_rate": 0.000293862922914239, + "loss": 0.2285, + "step": 321540 + }, + { + "epoch": 13.32, + "grad_norm": 0.94140625, + "learning_rate": 0.00029385224590836517, + "loss": 0.1859, + "step": 321550 + }, + { + "epoch": 13.32, + "grad_norm": 1.234375, + "learning_rate": 0.00029384156881996615, + "loss": 0.1936, + "step": 321560 + }, + { + "epoch": 13.32, + "grad_norm": 0.92578125, + "learning_rate": 0.000293830891649062, + "loss": 0.1914, + "step": 321570 + }, + { + "epoch": 13.32, + "grad_norm": 1.1796875, + "learning_rate": 0.0002938202143956729, + "loss": 0.1864, + "step": 321580 + }, + { + "epoch": 13.32, + "grad_norm": 1.546875, + "learning_rate": 0.00029380953705981884, + "loss": 0.1808, + "step": 321590 + }, + { + "epoch": 13.32, + "grad_norm": 0.96484375, + "learning_rate": 0.00029379885964152, + "loss": 0.2014, + "step": 321600 + }, + { + "epoch": 13.32, + "grad_norm": 0.640625, + "learning_rate": 0.00029378818214079644, + "loss": 0.1582, + "step": 321610 + }, + { + "epoch": 13.32, + "grad_norm": 0.8984375, + "learning_rate": 0.0002937775045576682, + "loss": 0.1796, + "step": 321620 + }, + { + "epoch": 13.32, + "grad_norm": 0.53515625, + "learning_rate": 0.0002937668268921555, + "loss": 0.176, + "step": 321630 + }, + { + "epoch": 13.32, + "grad_norm": 1.765625, + "learning_rate": 0.00029375614914427836, + "loss": 0.2038, + "step": 321640 + }, + { + "epoch": 13.32, + "grad_norm": 0.462890625, + "learning_rate": 0.0002937454713140567, + "loss": 0.175, + "step": 321650 + }, + { + "epoch": 13.32, + "grad_norm": 0.96484375, + "learning_rate": 0.00029373479340151097, + "loss": 0.1781, + "step": 321660 + }, + { + "epoch": 13.32, + "grad_norm": 1.0234375, + "learning_rate": 0.000293724115406661, + "loss": 0.1769, + "step": 321670 + }, + { + "epoch": 13.32, + "grad_norm": 0.63671875, + "learning_rate": 0.000293713437329527, + "loss": 0.2058, + "step": 321680 + }, + { + "epoch": 13.32, + "grad_norm": 0.396484375, + "learning_rate": 0.00029370275917012907, + "loss": 0.1831, + "step": 321690 + }, + { + "epoch": 13.32, + "grad_norm": 0.6171875, + "learning_rate": 0.0002936920809284872, + "loss": 0.1945, + "step": 321700 + }, + { + "epoch": 13.33, + "grad_norm": 0.5390625, + "learning_rate": 0.0002936814026046216, + "loss": 0.1635, + "step": 321710 + }, + { + "epoch": 13.33, + "grad_norm": 0.7265625, + "learning_rate": 0.00029367072419855225, + "loss": 0.2106, + "step": 321720 + }, + { + "epoch": 13.33, + "grad_norm": 1.078125, + "learning_rate": 0.0002936600457102993, + "loss": 0.2364, + "step": 321730 + }, + { + "epoch": 13.33, + "grad_norm": 3.796875, + "learning_rate": 0.000293649367139883, + "loss": 0.2149, + "step": 321740 + }, + { + "epoch": 13.33, + "grad_norm": 1.1015625, + "learning_rate": 0.0002936386884873232, + "loss": 0.1762, + "step": 321750 + }, + { + "epoch": 13.33, + "grad_norm": 0.474609375, + "learning_rate": 0.0002936280097526401, + "loss": 0.1645, + "step": 321760 + }, + { + "epoch": 13.33, + "grad_norm": 0.427734375, + "learning_rate": 0.00029361733093585384, + "loss": 0.2157, + "step": 321770 + }, + { + "epoch": 13.33, + "grad_norm": 0.9921875, + "learning_rate": 0.00029360665203698443, + "loss": 0.2056, + "step": 321780 + }, + { + "epoch": 13.33, + "grad_norm": 0.95703125, + "learning_rate": 0.00029359597305605203, + "loss": 0.2188, + "step": 321790 + }, + { + "epoch": 13.33, + "grad_norm": 0.70703125, + "learning_rate": 0.00029358529399307664, + "loss": 0.2163, + "step": 321800 + }, + { + "epoch": 13.33, + "grad_norm": 0.7734375, + "learning_rate": 0.0002935746148480785, + "loss": 0.1585, + "step": 321810 + }, + { + "epoch": 13.33, + "grad_norm": 0.67578125, + "learning_rate": 0.00029356393562107773, + "loss": 0.2072, + "step": 321820 + }, + { + "epoch": 13.33, + "grad_norm": 0.63671875, + "learning_rate": 0.0002935532563120942, + "loss": 0.2451, + "step": 321830 + }, + { + "epoch": 13.33, + "grad_norm": 0.337890625, + "learning_rate": 0.0002935425769211482, + "loss": 0.2178, + "step": 321840 + }, + { + "epoch": 13.33, + "grad_norm": 0.8046875, + "learning_rate": 0.00029353189744825984, + "loss": 0.2017, + "step": 321850 + }, + { + "epoch": 13.33, + "grad_norm": 1.03125, + "learning_rate": 0.00029352121789344904, + "loss": 0.1675, + "step": 321860 + }, + { + "epoch": 13.33, + "grad_norm": 2.9375, + "learning_rate": 0.0002935105382567361, + "loss": 0.2047, + "step": 321870 + }, + { + "epoch": 13.33, + "grad_norm": 1.1953125, + "learning_rate": 0.00029349985853814093, + "loss": 0.1516, + "step": 321880 + }, + { + "epoch": 13.33, + "grad_norm": 0.828125, + "learning_rate": 0.00029348917873768375, + "loss": 0.2494, + "step": 321890 + }, + { + "epoch": 13.33, + "grad_norm": 0.65625, + "learning_rate": 0.0002934784988553847, + "loss": 0.1607, + "step": 321900 + }, + { + "epoch": 13.33, + "grad_norm": 0.90234375, + "learning_rate": 0.0002934678188912637, + "loss": 0.2774, + "step": 321910 + }, + { + "epoch": 13.33, + "grad_norm": 0.65625, + "learning_rate": 0.0002934571388453411, + "loss": 0.1943, + "step": 321920 + }, + { + "epoch": 13.33, + "grad_norm": 3.265625, + "learning_rate": 0.0002934464587176367, + "loss": 0.1781, + "step": 321930 + }, + { + "epoch": 13.33, + "grad_norm": 1.171875, + "learning_rate": 0.0002934357785081709, + "loss": 0.1621, + "step": 321940 + }, + { + "epoch": 13.34, + "grad_norm": 0.5390625, + "learning_rate": 0.0002934250982169636, + "loss": 0.1946, + "step": 321950 + }, + { + "epoch": 13.34, + "grad_norm": 0.6796875, + "learning_rate": 0.0002934144178440349, + "loss": 0.154, + "step": 321960 + }, + { + "epoch": 13.34, + "grad_norm": 0.55078125, + "learning_rate": 0.000293403737389405, + "loss": 0.1932, + "step": 321970 + }, + { + "epoch": 13.34, + "grad_norm": 0.8671875, + "learning_rate": 0.000293393056853094, + "loss": 0.2651, + "step": 321980 + }, + { + "epoch": 13.34, + "grad_norm": 1.5625, + "learning_rate": 0.00029338237623512185, + "loss": 0.2233, + "step": 321990 + }, + { + "epoch": 13.34, + "grad_norm": 0.447265625, + "learning_rate": 0.00029337169553550886, + "loss": 0.1768, + "step": 322000 + }, + { + "epoch": 13.34, + "grad_norm": 1.0546875, + "learning_rate": 0.0002933610147542749, + "loss": 0.1781, + "step": 322010 + }, + { + "epoch": 13.34, + "grad_norm": 0.63671875, + "learning_rate": 0.0002933503338914403, + "loss": 0.1959, + "step": 322020 + }, + { + "epoch": 13.34, + "grad_norm": 0.65234375, + "learning_rate": 0.00029333965294702505, + "loss": 0.1975, + "step": 322030 + }, + { + "epoch": 13.34, + "grad_norm": 1.3515625, + "learning_rate": 0.00029332897192104916, + "loss": 0.1822, + "step": 322040 + }, + { + "epoch": 13.34, + "grad_norm": 1.3359375, + "learning_rate": 0.0002933182908135329, + "loss": 0.1934, + "step": 322050 + }, + { + "epoch": 13.34, + "grad_norm": 1.078125, + "learning_rate": 0.00029330760962449626, + "loss": 0.1814, + "step": 322060 + }, + { + "epoch": 13.34, + "grad_norm": 0.63671875, + "learning_rate": 0.0002932969283539594, + "loss": 0.1544, + "step": 322070 + }, + { + "epoch": 13.34, + "grad_norm": 1.90625, + "learning_rate": 0.00029328624700194247, + "loss": 0.1704, + "step": 322080 + }, + { + "epoch": 13.34, + "grad_norm": 1.4921875, + "learning_rate": 0.00029327556556846534, + "loss": 0.1873, + "step": 322090 + }, + { + "epoch": 13.34, + "grad_norm": 1.5390625, + "learning_rate": 0.00029326488405354836, + "loss": 0.2295, + "step": 322100 + }, + { + "epoch": 13.34, + "grad_norm": 0.61328125, + "learning_rate": 0.0002932542024572116, + "loss": 0.2466, + "step": 322110 + }, + { + "epoch": 13.34, + "grad_norm": 1.1015625, + "learning_rate": 0.000293243520779475, + "loss": 0.1662, + "step": 322120 + }, + { + "epoch": 13.34, + "grad_norm": 0.58203125, + "learning_rate": 0.0002932328390203588, + "loss": 0.1608, + "step": 322130 + }, + { + "epoch": 13.34, + "grad_norm": 0.77734375, + "learning_rate": 0.000293222157179883, + "loss": 0.1712, + "step": 322140 + }, + { + "epoch": 13.34, + "grad_norm": 1.25, + "learning_rate": 0.0002932114752580679, + "loss": 0.1517, + "step": 322150 + }, + { + "epoch": 13.34, + "grad_norm": 0.70703125, + "learning_rate": 0.0002932007932549334, + "loss": 0.1854, + "step": 322160 + }, + { + "epoch": 13.34, + "grad_norm": 0.8125, + "learning_rate": 0.00029319011117049965, + "loss": 0.1776, + "step": 322170 + }, + { + "epoch": 13.34, + "grad_norm": 0.7734375, + "learning_rate": 0.0002931794290047868, + "loss": 0.1948, + "step": 322180 + }, + { + "epoch": 13.35, + "grad_norm": 0.5546875, + "learning_rate": 0.0002931687467578149, + "loss": 0.1711, + "step": 322190 + }, + { + "epoch": 13.35, + "grad_norm": 0.6640625, + "learning_rate": 0.00029315806442960414, + "loss": 0.1725, + "step": 322200 + }, + { + "epoch": 13.35, + "grad_norm": 0.8359375, + "learning_rate": 0.0002931473820201745, + "loss": 0.2159, + "step": 322210 + }, + { + "epoch": 13.35, + "grad_norm": 0.90234375, + "learning_rate": 0.00029313669952954615, + "loss": 0.1555, + "step": 322220 + }, + { + "epoch": 13.35, + "grad_norm": 1.1015625, + "learning_rate": 0.00029312601695773925, + "loss": 0.2006, + "step": 322230 + }, + { + "epoch": 13.35, + "grad_norm": 0.59765625, + "learning_rate": 0.0002931153343047738, + "loss": 0.1722, + "step": 322240 + }, + { + "epoch": 13.35, + "grad_norm": 1.1015625, + "learning_rate": 0.00029310465157066995, + "loss": 0.1661, + "step": 322250 + }, + { + "epoch": 13.35, + "grad_norm": 1.3828125, + "learning_rate": 0.0002930939687554478, + "loss": 0.2206, + "step": 322260 + }, + { + "epoch": 13.35, + "grad_norm": 1.078125, + "learning_rate": 0.0002930832858591274, + "loss": 0.2016, + "step": 322270 + }, + { + "epoch": 13.35, + "grad_norm": 1.1484375, + "learning_rate": 0.00029307260288172904, + "loss": 0.2007, + "step": 322280 + }, + { + "epoch": 13.35, + "grad_norm": 0.63671875, + "learning_rate": 0.00029306191982327257, + "loss": 0.1857, + "step": 322290 + }, + { + "epoch": 13.35, + "grad_norm": 0.68359375, + "learning_rate": 0.0002930512366837783, + "loss": 0.1881, + "step": 322300 + }, + { + "epoch": 13.35, + "grad_norm": 0.79296875, + "learning_rate": 0.00029304055346326627, + "loss": 0.1759, + "step": 322310 + }, + { + "epoch": 13.35, + "grad_norm": 0.2265625, + "learning_rate": 0.0002930298701617565, + "loss": 0.1974, + "step": 322320 + }, + { + "epoch": 13.35, + "grad_norm": 1.0234375, + "learning_rate": 0.00029301918677926913, + "loss": 0.207, + "step": 322330 + }, + { + "epoch": 13.35, + "grad_norm": 0.6640625, + "learning_rate": 0.00029300850331582433, + "loss": 0.1676, + "step": 322340 + }, + { + "epoch": 13.35, + "grad_norm": 1.4609375, + "learning_rate": 0.00029299781977144216, + "loss": 0.198, + "step": 322350 + }, + { + "epoch": 13.35, + "grad_norm": 0.3125, + "learning_rate": 0.0002929871361461428, + "loss": 0.1872, + "step": 322360 + }, + { + "epoch": 13.35, + "grad_norm": 0.443359375, + "learning_rate": 0.00029297645243994626, + "loss": 0.1805, + "step": 322370 + }, + { + "epoch": 13.35, + "grad_norm": 1.6875, + "learning_rate": 0.00029296576865287265, + "loss": 0.2223, + "step": 322380 + }, + { + "epoch": 13.35, + "grad_norm": 0.58984375, + "learning_rate": 0.00029295508478494216, + "loss": 0.2115, + "step": 322390 + }, + { + "epoch": 13.35, + "grad_norm": 0.40234375, + "learning_rate": 0.0002929444008361748, + "loss": 0.1433, + "step": 322400 + }, + { + "epoch": 13.35, + "grad_norm": 0.76171875, + "learning_rate": 0.0002929337168065907, + "loss": 0.1792, + "step": 322410 + }, + { + "epoch": 13.35, + "grad_norm": 0.7578125, + "learning_rate": 0.00029292303269621, + "loss": 0.1865, + "step": 322420 + }, + { + "epoch": 13.36, + "grad_norm": 0.59375, + "learning_rate": 0.00029291234850505277, + "loss": 0.1705, + "step": 322430 + }, + { + "epoch": 13.36, + "grad_norm": 0.640625, + "learning_rate": 0.0002929016642331392, + "loss": 0.1807, + "step": 322440 + }, + { + "epoch": 13.36, + "grad_norm": 0.890625, + "learning_rate": 0.0002928909798804892, + "loss": 0.1706, + "step": 322450 + }, + { + "epoch": 13.36, + "grad_norm": 1.96875, + "learning_rate": 0.0002928802954471231, + "loss": 0.1941, + "step": 322460 + }, + { + "epoch": 13.36, + "grad_norm": 0.76953125, + "learning_rate": 0.000292869610933061, + "loss": 0.1995, + "step": 322470 + }, + { + "epoch": 13.36, + "grad_norm": 3.375, + "learning_rate": 0.00029285892633832273, + "loss": 0.1929, + "step": 322480 + }, + { + "epoch": 13.36, + "grad_norm": 0.8671875, + "learning_rate": 0.0002928482416629287, + "loss": 0.1796, + "step": 322490 + }, + { + "epoch": 13.36, + "grad_norm": 0.63671875, + "learning_rate": 0.0002928375569068989, + "loss": 0.1964, + "step": 322500 + }, + { + "epoch": 13.36, + "grad_norm": 0.76953125, + "learning_rate": 0.00029282687207025346, + "loss": 0.1221, + "step": 322510 + }, + { + "epoch": 13.36, + "grad_norm": 0.466796875, + "learning_rate": 0.00029281618715301243, + "loss": 0.1423, + "step": 322520 + }, + { + "epoch": 13.36, + "grad_norm": 0.5234375, + "learning_rate": 0.000292805502155196, + "loss": 0.1825, + "step": 322530 + }, + { + "epoch": 13.36, + "grad_norm": 1.1015625, + "learning_rate": 0.0002927948170768242, + "loss": 0.2285, + "step": 322540 + }, + { + "epoch": 13.36, + "grad_norm": 1.1015625, + "learning_rate": 0.0002927841319179172, + "loss": 0.2175, + "step": 322550 + }, + { + "epoch": 13.36, + "grad_norm": 0.6640625, + "learning_rate": 0.00029277344667849505, + "loss": 0.1533, + "step": 322560 + }, + { + "epoch": 13.36, + "grad_norm": 0.375, + "learning_rate": 0.0002927627613585779, + "loss": 0.1638, + "step": 322570 + }, + { + "epoch": 13.36, + "grad_norm": 0.515625, + "learning_rate": 0.0002927520759581859, + "loss": 0.128, + "step": 322580 + }, + { + "epoch": 13.36, + "grad_norm": 1.1171875, + "learning_rate": 0.00029274139047733906, + "loss": 0.1734, + "step": 322590 + }, + { + "epoch": 13.36, + "grad_norm": 1.515625, + "learning_rate": 0.0002927307049160576, + "loss": 0.2236, + "step": 322600 + }, + { + "epoch": 13.36, + "grad_norm": 1.046875, + "learning_rate": 0.00029272001927436146, + "loss": 0.2437, + "step": 322610 + }, + { + "epoch": 13.36, + "grad_norm": 0.8828125, + "learning_rate": 0.000292709333552271, + "loss": 0.1879, + "step": 322620 + }, + { + "epoch": 13.36, + "grad_norm": 0.5859375, + "learning_rate": 0.0002926986477498061, + "loss": 0.1675, + "step": 322630 + }, + { + "epoch": 13.36, + "grad_norm": 1.3984375, + "learning_rate": 0.00029268796186698686, + "loss": 0.174, + "step": 322640 + }, + { + "epoch": 13.36, + "grad_norm": 0.53515625, + "learning_rate": 0.00029267727590383366, + "loss": 0.2265, + "step": 322650 + }, + { + "epoch": 13.36, + "grad_norm": 0.83984375, + "learning_rate": 0.00029266658986036635, + "loss": 0.2433, + "step": 322660 + }, + { + "epoch": 13.36, + "grad_norm": 0.71484375, + "learning_rate": 0.0002926559037366051, + "loss": 0.1828, + "step": 322670 + }, + { + "epoch": 13.37, + "grad_norm": 0.5546875, + "learning_rate": 0.00029264521753257015, + "loss": 0.1941, + "step": 322680 + }, + { + "epoch": 13.37, + "grad_norm": 0.671875, + "learning_rate": 0.00029263453124828137, + "loss": 0.1936, + "step": 322690 + }, + { + "epoch": 13.37, + "grad_norm": 0.8359375, + "learning_rate": 0.00029262384488375914, + "loss": 0.208, + "step": 322700 + }, + { + "epoch": 13.37, + "grad_norm": 0.5625, + "learning_rate": 0.0002926131584390234, + "loss": 0.1961, + "step": 322710 + }, + { + "epoch": 13.37, + "grad_norm": 0.90234375, + "learning_rate": 0.0002926024719140942, + "loss": 0.1856, + "step": 322720 + }, + { + "epoch": 13.37, + "grad_norm": 0.82421875, + "learning_rate": 0.0002925917853089919, + "loss": 0.224, + "step": 322730 + }, + { + "epoch": 13.37, + "grad_norm": 0.9921875, + "learning_rate": 0.00029258109862373633, + "loss": 0.2466, + "step": 322740 + }, + { + "epoch": 13.37, + "grad_norm": 0.6171875, + "learning_rate": 0.00029257041185834775, + "loss": 0.179, + "step": 322750 + }, + { + "epoch": 13.37, + "grad_norm": 0.21484375, + "learning_rate": 0.0002925597250128463, + "loss": 0.1808, + "step": 322760 + }, + { + "epoch": 13.37, + "grad_norm": 0.95703125, + "learning_rate": 0.000292549038087252, + "loss": 0.1533, + "step": 322770 + }, + { + "epoch": 13.37, + "grad_norm": 0.81640625, + "learning_rate": 0.0002925383510815851, + "loss": 0.1332, + "step": 322780 + }, + { + "epoch": 13.37, + "grad_norm": 0.80078125, + "learning_rate": 0.0002925276639958656, + "loss": 0.2189, + "step": 322790 + }, + { + "epoch": 13.37, + "grad_norm": 0.9140625, + "learning_rate": 0.0002925169768301135, + "loss": 0.2113, + "step": 322800 + }, + { + "epoch": 13.37, + "grad_norm": 0.99609375, + "learning_rate": 0.00029250628958434917, + "loss": 0.175, + "step": 322810 + }, + { + "epoch": 13.37, + "grad_norm": 0.419921875, + "learning_rate": 0.0002924956022585925, + "loss": 0.222, + "step": 322820 + }, + { + "epoch": 13.37, + "grad_norm": 0.8828125, + "learning_rate": 0.00029248491485286374, + "loss": 0.1809, + "step": 322830 + }, + { + "epoch": 13.37, + "grad_norm": 0.72265625, + "learning_rate": 0.000292474227367183, + "loss": 0.1476, + "step": 322840 + }, + { + "epoch": 13.37, + "grad_norm": 0.64453125, + "learning_rate": 0.00029246353980157027, + "loss": 0.1734, + "step": 322850 + }, + { + "epoch": 13.37, + "grad_norm": 1.0078125, + "learning_rate": 0.00029245285215604577, + "loss": 0.1975, + "step": 322860 + }, + { + "epoch": 13.37, + "grad_norm": 0.453125, + "learning_rate": 0.00029244216443062966, + "loss": 0.222, + "step": 322870 + }, + { + "epoch": 13.37, + "grad_norm": 1.6796875, + "learning_rate": 0.00029243147662534185, + "loss": 0.1968, + "step": 322880 + }, + { + "epoch": 13.37, + "grad_norm": 0.515625, + "learning_rate": 0.00029242078874020265, + "loss": 0.1983, + "step": 322890 + }, + { + "epoch": 13.37, + "grad_norm": 1.125, + "learning_rate": 0.0002924101007752321, + "loss": 0.1534, + "step": 322900 + }, + { + "epoch": 13.37, + "grad_norm": 1.3984375, + "learning_rate": 0.00029239941273045034, + "loss": 0.2515, + "step": 322910 + }, + { + "epoch": 13.38, + "grad_norm": 1.1640625, + "learning_rate": 0.0002923887246058775, + "loss": 0.1926, + "step": 322920 + }, + { + "epoch": 13.38, + "grad_norm": 0.58984375, + "learning_rate": 0.0002923780364015335, + "loss": 0.2238, + "step": 322930 + }, + { + "epoch": 13.38, + "grad_norm": 0.8359375, + "learning_rate": 0.0002923673481174388, + "loss": 0.1689, + "step": 322940 + }, + { + "epoch": 13.38, + "grad_norm": 1.75, + "learning_rate": 0.0002923566597536133, + "loss": 0.1986, + "step": 322950 + }, + { + "epoch": 13.38, + "grad_norm": 0.439453125, + "learning_rate": 0.000292345971310077, + "loss": 0.1871, + "step": 322960 + }, + { + "epoch": 13.38, + "grad_norm": 0.71875, + "learning_rate": 0.00029233528278685026, + "loss": 0.193, + "step": 322970 + }, + { + "epoch": 13.38, + "grad_norm": 0.9140625, + "learning_rate": 0.00029232459418395307, + "loss": 0.2248, + "step": 322980 + }, + { + "epoch": 13.38, + "grad_norm": 2.0, + "learning_rate": 0.00029231390550140556, + "loss": 0.2654, + "step": 322990 + }, + { + "epoch": 13.38, + "grad_norm": 0.875, + "learning_rate": 0.00029230321673922796, + "loss": 0.2158, + "step": 323000 + }, + { + "epoch": 13.38, + "grad_norm": 0.7578125, + "learning_rate": 0.00029229252789744005, + "loss": 0.1407, + "step": 323010 + }, + { + "epoch": 13.38, + "grad_norm": 0.8671875, + "learning_rate": 0.00029228183897606237, + "loss": 0.1934, + "step": 323020 + }, + { + "epoch": 13.38, + "grad_norm": 1.359375, + "learning_rate": 0.0002922711499751147, + "loss": 0.149, + "step": 323030 + }, + { + "epoch": 13.38, + "grad_norm": 0.80859375, + "learning_rate": 0.0002922604608946173, + "loss": 0.1469, + "step": 323040 + }, + { + "epoch": 13.38, + "grad_norm": 0.8359375, + "learning_rate": 0.0002922497717345904, + "loss": 0.1702, + "step": 323050 + }, + { + "epoch": 13.38, + "grad_norm": 1.0078125, + "learning_rate": 0.00029223908249505394, + "loss": 0.1803, + "step": 323060 + }, + { + "epoch": 13.38, + "grad_norm": 0.9296875, + "learning_rate": 0.000292228393176028, + "loss": 0.2022, + "step": 323070 + }, + { + "epoch": 13.38, + "grad_norm": 0.75, + "learning_rate": 0.00029221770377753294, + "loss": 0.1797, + "step": 323080 + }, + { + "epoch": 13.38, + "grad_norm": 1.015625, + "learning_rate": 0.00029220701429958853, + "loss": 0.2093, + "step": 323090 + }, + { + "epoch": 13.38, + "grad_norm": 0.53125, + "learning_rate": 0.00029219632474221526, + "loss": 0.2109, + "step": 323100 + }, + { + "epoch": 13.38, + "grad_norm": 1.0859375, + "learning_rate": 0.00029218563510543296, + "loss": 0.1908, + "step": 323110 + }, + { + "epoch": 13.38, + "grad_norm": 1.203125, + "learning_rate": 0.0002921749453892618, + "loss": 0.222, + "step": 323120 + }, + { + "epoch": 13.38, + "grad_norm": 0.8515625, + "learning_rate": 0.00029216425559372205, + "loss": 0.1852, + "step": 323130 + }, + { + "epoch": 13.38, + "grad_norm": 0.7890625, + "learning_rate": 0.0002921535657188337, + "loss": 0.1953, + "step": 323140 + }, + { + "epoch": 13.38, + "grad_norm": 0.5234375, + "learning_rate": 0.0002921428757646169, + "loss": 0.2271, + "step": 323150 + }, + { + "epoch": 13.39, + "grad_norm": 1.0859375, + "learning_rate": 0.00029213218573109176, + "loss": 0.179, + "step": 323160 + }, + { + "epoch": 13.39, + "grad_norm": 0.98828125, + "learning_rate": 0.0002921214956182783, + "loss": 0.1756, + "step": 323170 + }, + { + "epoch": 13.39, + "grad_norm": 0.52734375, + "learning_rate": 0.0002921108054261968, + "loss": 0.198, + "step": 323180 + }, + { + "epoch": 13.39, + "grad_norm": 0.7421875, + "learning_rate": 0.00029210011515486736, + "loss": 0.1484, + "step": 323190 + }, + { + "epoch": 13.39, + "grad_norm": 0.578125, + "learning_rate": 0.00029208942480431, + "loss": 0.2082, + "step": 323200 + }, + { + "epoch": 13.39, + "grad_norm": 0.625, + "learning_rate": 0.000292078734374545, + "loss": 0.2011, + "step": 323210 + }, + { + "epoch": 13.39, + "grad_norm": 0.765625, + "learning_rate": 0.00029206804386559224, + "loss": 0.1892, + "step": 323220 + }, + { + "epoch": 13.39, + "grad_norm": 0.5234375, + "learning_rate": 0.00029205735327747197, + "loss": 0.182, + "step": 323230 + }, + { + "epoch": 13.39, + "grad_norm": 0.8671875, + "learning_rate": 0.0002920466626102043, + "loss": 0.1757, + "step": 323240 + }, + { + "epoch": 13.39, + "grad_norm": 0.427734375, + "learning_rate": 0.0002920359718638094, + "loss": 0.188, + "step": 323250 + }, + { + "epoch": 13.39, + "grad_norm": 1.578125, + "learning_rate": 0.0002920252810383073, + "loss": 0.2021, + "step": 323260 + }, + { + "epoch": 13.39, + "grad_norm": 0.33984375, + "learning_rate": 0.00029201459013371815, + "loss": 0.1707, + "step": 323270 + }, + { + "epoch": 13.39, + "grad_norm": 1.3359375, + "learning_rate": 0.00029200389915006207, + "loss": 0.1886, + "step": 323280 + }, + { + "epoch": 13.39, + "grad_norm": 0.921875, + "learning_rate": 0.0002919932080873593, + "loss": 0.2396, + "step": 323290 + }, + { + "epoch": 13.39, + "grad_norm": 1.0390625, + "learning_rate": 0.0002919825169456298, + "loss": 0.1989, + "step": 323300 + }, + { + "epoch": 13.39, + "grad_norm": 0.9609375, + "learning_rate": 0.00029197182572489365, + "loss": 0.1496, + "step": 323310 + }, + { + "epoch": 13.39, + "grad_norm": 1.3515625, + "learning_rate": 0.00029196113442517106, + "loss": 0.2309, + "step": 323320 + }, + { + "epoch": 13.39, + "grad_norm": 0.8671875, + "learning_rate": 0.00029195044304648223, + "loss": 0.1659, + "step": 323330 + }, + { + "epoch": 13.39, + "grad_norm": 0.5, + "learning_rate": 0.0002919397515888472, + "loss": 0.123, + "step": 323340 + }, + { + "epoch": 13.39, + "grad_norm": 0.77734375, + "learning_rate": 0.000291929060052286, + "loss": 0.2013, + "step": 323350 + }, + { + "epoch": 13.39, + "grad_norm": 1.1171875, + "learning_rate": 0.00029191836843681893, + "loss": 0.1807, + "step": 323360 + }, + { + "epoch": 13.39, + "grad_norm": 0.0, + "learning_rate": 0.000291907676742466, + "loss": 0.2059, + "step": 323370 + }, + { + "epoch": 13.39, + "grad_norm": 0.71875, + "learning_rate": 0.0002918969849692473, + "loss": 0.2091, + "step": 323380 + }, + { + "epoch": 13.39, + "grad_norm": 0.91015625, + "learning_rate": 0.000291886293117183, + "loss": 0.186, + "step": 323390 + }, + { + "epoch": 13.4, + "grad_norm": 0.7734375, + "learning_rate": 0.00029187560118629327, + "loss": 0.1562, + "step": 323400 + }, + { + "epoch": 13.4, + "grad_norm": 1.46875, + "learning_rate": 0.0002918649091765981, + "loss": 0.2191, + "step": 323410 + }, + { + "epoch": 13.4, + "grad_norm": 1.0234375, + "learning_rate": 0.0002918542170881178, + "loss": 0.1815, + "step": 323420 + }, + { + "epoch": 13.4, + "grad_norm": 0.93359375, + "learning_rate": 0.0002918435249208723, + "loss": 0.2476, + "step": 323430 + }, + { + "epoch": 13.4, + "grad_norm": 0.6953125, + "learning_rate": 0.00029183283267488185, + "loss": 0.129, + "step": 323440 + }, + { + "epoch": 13.4, + "grad_norm": 0.91796875, + "learning_rate": 0.0002918221403501665, + "loss": 0.1416, + "step": 323450 + }, + { + "epoch": 13.4, + "grad_norm": 0.373046875, + "learning_rate": 0.00029181144794674643, + "loss": 0.1838, + "step": 323460 + }, + { + "epoch": 13.4, + "grad_norm": 1.890625, + "learning_rate": 0.0002918007554646417, + "loss": 0.1681, + "step": 323470 + }, + { + "epoch": 13.4, + "grad_norm": 0.462890625, + "learning_rate": 0.00029179006290387243, + "loss": 0.223, + "step": 323480 + }, + { + "epoch": 13.4, + "grad_norm": 0.58984375, + "learning_rate": 0.00029177937026445884, + "loss": 0.1906, + "step": 323490 + }, + { + "epoch": 13.4, + "grad_norm": 0.63671875, + "learning_rate": 0.000291768677546421, + "loss": 0.2036, + "step": 323500 + }, + { + "epoch": 13.4, + "grad_norm": 1.8359375, + "learning_rate": 0.00029175798474977897, + "loss": 0.2067, + "step": 323510 + }, + { + "epoch": 13.4, + "grad_norm": 0.044189453125, + "learning_rate": 0.0002917472918745529, + "loss": 0.131, + "step": 323520 + }, + { + "epoch": 13.4, + "grad_norm": 0.77734375, + "learning_rate": 0.000291736598920763, + "loss": 0.1681, + "step": 323530 + }, + { + "epoch": 13.4, + "grad_norm": 1.3359375, + "learning_rate": 0.00029172590588842927, + "loss": 0.1998, + "step": 323540 + }, + { + "epoch": 13.4, + "grad_norm": 0.859375, + "learning_rate": 0.00029171521277757194, + "loss": 0.2067, + "step": 323550 + }, + { + "epoch": 13.4, + "grad_norm": 0.81640625, + "learning_rate": 0.00029170451958821105, + "loss": 0.2143, + "step": 323560 + }, + { + "epoch": 13.4, + "grad_norm": 0.6875, + "learning_rate": 0.0002916938263203668, + "loss": 0.1839, + "step": 323570 + }, + { + "epoch": 13.4, + "grad_norm": 0.54296875, + "learning_rate": 0.00029168313297405916, + "loss": 0.1625, + "step": 323580 + }, + { + "epoch": 13.4, + "grad_norm": 1.1328125, + "learning_rate": 0.0002916724395493085, + "loss": 0.2253, + "step": 323590 + }, + { + "epoch": 13.4, + "grad_norm": 1.5859375, + "learning_rate": 0.0002916617460461348, + "loss": 0.2656, + "step": 323600 + }, + { + "epoch": 13.4, + "grad_norm": 0.45703125, + "learning_rate": 0.00029165105246455806, + "loss": 0.1551, + "step": 323610 + }, + { + "epoch": 13.4, + "grad_norm": 0.60546875, + "learning_rate": 0.00029164035880459874, + "loss": 0.2107, + "step": 323620 + }, + { + "epoch": 13.4, + "grad_norm": 0.54296875, + "learning_rate": 0.0002916296650662766, + "loss": 0.2158, + "step": 323630 + }, + { + "epoch": 13.41, + "grad_norm": 1.0625, + "learning_rate": 0.00029161897124961193, + "loss": 0.1851, + "step": 323640 + }, + { + "epoch": 13.41, + "grad_norm": 0.98828125, + "learning_rate": 0.0002916082773546249, + "loss": 0.205, + "step": 323650 + }, + { + "epoch": 13.41, + "grad_norm": 1.515625, + "learning_rate": 0.0002915975833813356, + "loss": 0.1332, + "step": 323660 + }, + { + "epoch": 13.41, + "grad_norm": 0.734375, + "learning_rate": 0.0002915868893297641, + "loss": 0.2075, + "step": 323670 + }, + { + "epoch": 13.41, + "grad_norm": 0.7734375, + "learning_rate": 0.00029157619519993064, + "loss": 0.2037, + "step": 323680 + }, + { + "epoch": 13.41, + "grad_norm": 1.1875, + "learning_rate": 0.0002915655009918552, + "loss": 0.1813, + "step": 323690 + }, + { + "epoch": 13.41, + "grad_norm": 0.75390625, + "learning_rate": 0.000291554806705558, + "loss": 0.1974, + "step": 323700 + }, + { + "epoch": 13.41, + "grad_norm": 1.578125, + "learning_rate": 0.0002915441123410591, + "loss": 0.2338, + "step": 323710 + }, + { + "epoch": 13.41, + "grad_norm": 1.1015625, + "learning_rate": 0.00029153341789837873, + "loss": 0.169, + "step": 323720 + }, + { + "epoch": 13.41, + "grad_norm": 0.76171875, + "learning_rate": 0.00029152272337753686, + "loss": 0.19, + "step": 323730 + }, + { + "epoch": 13.41, + "grad_norm": 1.28125, + "learning_rate": 0.00029151202877855377, + "loss": 0.1841, + "step": 323740 + }, + { + "epoch": 13.41, + "grad_norm": 1.0546875, + "learning_rate": 0.0002915013341014496, + "loss": 0.1835, + "step": 323750 + }, + { + "epoch": 13.41, + "grad_norm": 1.6875, + "learning_rate": 0.00029149063934624434, + "loss": 0.246, + "step": 323760 + }, + { + "epoch": 13.41, + "grad_norm": 1.15625, + "learning_rate": 0.0002914799445129581, + "loss": 0.1709, + "step": 323770 + }, + { + "epoch": 13.41, + "grad_norm": 0.439453125, + "learning_rate": 0.0002914692496016112, + "loss": 0.1727, + "step": 323780 + }, + { + "epoch": 13.41, + "grad_norm": 1.3515625, + "learning_rate": 0.0002914585546122235, + "loss": 0.1607, + "step": 323790 + }, + { + "epoch": 13.41, + "grad_norm": 0.70703125, + "learning_rate": 0.00029144785954481544, + "loss": 0.2078, + "step": 323800 + }, + { + "epoch": 13.41, + "grad_norm": 1.3203125, + "learning_rate": 0.0002914371643994069, + "loss": 0.1951, + "step": 323810 + }, + { + "epoch": 13.41, + "grad_norm": 0.275390625, + "learning_rate": 0.00029142646917601814, + "loss": 0.1435, + "step": 323820 + }, + { + "epoch": 13.41, + "grad_norm": 1.203125, + "learning_rate": 0.0002914157738746692, + "loss": 0.18, + "step": 323830 + }, + { + "epoch": 13.41, + "grad_norm": 0.640625, + "learning_rate": 0.0002914050784953803, + "loss": 0.1608, + "step": 323840 + }, + { + "epoch": 13.41, + "grad_norm": 1.1484375, + "learning_rate": 0.0002913943830381714, + "loss": 0.1796, + "step": 323850 + }, + { + "epoch": 13.41, + "grad_norm": 0.275390625, + "learning_rate": 0.0002913836875030629, + "loss": 0.152, + "step": 323860 + }, + { + "epoch": 13.41, + "grad_norm": 1.1171875, + "learning_rate": 0.0002913729918900746, + "loss": 0.2188, + "step": 323870 + }, + { + "epoch": 13.42, + "grad_norm": 1.0546875, + "learning_rate": 0.0002913622961992269, + "loss": 0.1906, + "step": 323880 + }, + { + "epoch": 13.42, + "grad_norm": 0.59765625, + "learning_rate": 0.0002913516004305398, + "loss": 0.1425, + "step": 323890 + }, + { + "epoch": 13.42, + "grad_norm": 1.4765625, + "learning_rate": 0.00029134090458403345, + "loss": 0.1441, + "step": 323900 + }, + { + "epoch": 13.42, + "grad_norm": 0.6796875, + "learning_rate": 0.000291330208659728, + "loss": 0.1989, + "step": 323910 + }, + { + "epoch": 13.42, + "grad_norm": 1.0703125, + "learning_rate": 0.00029131951265764356, + "loss": 0.159, + "step": 323920 + }, + { + "epoch": 13.42, + "grad_norm": 0.41796875, + "learning_rate": 0.0002913088165778002, + "loss": 0.1931, + "step": 323930 + }, + { + "epoch": 13.42, + "grad_norm": 0.98046875, + "learning_rate": 0.0002912981204202182, + "loss": 0.1701, + "step": 323940 + }, + { + "epoch": 13.42, + "grad_norm": 1.3046875, + "learning_rate": 0.0002912874241849175, + "loss": 0.1831, + "step": 323950 + }, + { + "epoch": 13.42, + "grad_norm": 1.28125, + "learning_rate": 0.0002912767278719184, + "loss": 0.2337, + "step": 323960 + }, + { + "epoch": 13.42, + "grad_norm": 0.8515625, + "learning_rate": 0.000291266031481241, + "loss": 0.1919, + "step": 323970 + }, + { + "epoch": 13.42, + "grad_norm": 1.1015625, + "learning_rate": 0.00029125533501290525, + "loss": 0.2332, + "step": 323980 + }, + { + "epoch": 13.42, + "grad_norm": 1.15625, + "learning_rate": 0.0002912446384669316, + "loss": 0.2148, + "step": 323990 + }, + { + "epoch": 13.42, + "grad_norm": 1.4140625, + "learning_rate": 0.0002912339418433397, + "loss": 0.1278, + "step": 324000 + }, + { + "epoch": 13.42, + "grad_norm": 0.8203125, + "learning_rate": 0.00029122324514215025, + "loss": 0.1998, + "step": 324010 + }, + { + "epoch": 13.42, + "grad_norm": 0.89453125, + "learning_rate": 0.00029121254836338297, + "loss": 0.1904, + "step": 324020 + }, + { + "epoch": 13.42, + "grad_norm": 0.2119140625, + "learning_rate": 0.00029120185150705816, + "loss": 0.2075, + "step": 324030 + }, + { + "epoch": 13.42, + "grad_norm": 1.25, + "learning_rate": 0.00029119115457319593, + "loss": 0.1959, + "step": 324040 + }, + { + "epoch": 13.42, + "grad_norm": 0.546875, + "learning_rate": 0.0002911804575618164, + "loss": 0.1638, + "step": 324050 + }, + { + "epoch": 13.42, + "grad_norm": 0.81640625, + "learning_rate": 0.0002911697604729397, + "loss": 0.1497, + "step": 324060 + }, + { + "epoch": 13.42, + "grad_norm": 0.73046875, + "learning_rate": 0.0002911590633065859, + "loss": 0.1863, + "step": 324070 + }, + { + "epoch": 13.42, + "grad_norm": 1.4375, + "learning_rate": 0.0002911483660627752, + "loss": 0.1897, + "step": 324080 + }, + { + "epoch": 13.42, + "grad_norm": 0.94921875, + "learning_rate": 0.0002911376687415278, + "loss": 0.1624, + "step": 324090 + }, + { + "epoch": 13.42, + "grad_norm": 1.546875, + "learning_rate": 0.0002911269713428636, + "loss": 0.1968, + "step": 324100 + }, + { + "epoch": 13.42, + "grad_norm": 1.65625, + "learning_rate": 0.000291116273866803, + "loss": 0.2088, + "step": 324110 + }, + { + "epoch": 13.43, + "grad_norm": 1.4140625, + "learning_rate": 0.00029110557631336606, + "loss": 0.1961, + "step": 324120 + }, + { + "epoch": 13.43, + "grad_norm": 0.69921875, + "learning_rate": 0.00029109487868257276, + "loss": 0.2472, + "step": 324130 + }, + { + "epoch": 13.43, + "grad_norm": 0.6171875, + "learning_rate": 0.0002910841809744433, + "loss": 0.237, + "step": 324140 + }, + { + "epoch": 13.43, + "grad_norm": 1.2109375, + "learning_rate": 0.000291073483188998, + "loss": 0.2046, + "step": 324150 + }, + { + "epoch": 13.43, + "grad_norm": 1.1015625, + "learning_rate": 0.0002910627853262567, + "loss": 0.2001, + "step": 324160 + }, + { + "epoch": 13.43, + "grad_norm": 0.6171875, + "learning_rate": 0.00029105208738623977, + "loss": 0.2083, + "step": 324170 + }, + { + "epoch": 13.43, + "grad_norm": 0.359375, + "learning_rate": 0.00029104138936896716, + "loss": 0.2029, + "step": 324180 + }, + { + "epoch": 13.43, + "grad_norm": 1.109375, + "learning_rate": 0.0002910306912744591, + "loss": 0.2051, + "step": 324190 + }, + { + "epoch": 13.43, + "grad_norm": 0.57421875, + "learning_rate": 0.00029101999310273584, + "loss": 0.2095, + "step": 324200 + }, + { + "epoch": 13.43, + "grad_norm": 0.478515625, + "learning_rate": 0.00029100929485381726, + "loss": 0.2053, + "step": 324210 + }, + { + "epoch": 13.43, + "grad_norm": 1.0703125, + "learning_rate": 0.00029099859652772364, + "loss": 0.1576, + "step": 324220 + }, + { + "epoch": 13.43, + "grad_norm": 0.51953125, + "learning_rate": 0.0002909878981244751, + "loss": 0.1525, + "step": 324230 + }, + { + "epoch": 13.43, + "grad_norm": 2.0625, + "learning_rate": 0.0002909771996440917, + "loss": 0.167, + "step": 324240 + }, + { + "epoch": 13.43, + "grad_norm": 1.109375, + "learning_rate": 0.00029096650108659375, + "loss": 0.2088, + "step": 324250 + }, + { + "epoch": 13.43, + "grad_norm": 0.71484375, + "learning_rate": 0.00029095580245200114, + "loss": 0.2406, + "step": 324260 + }, + { + "epoch": 13.43, + "grad_norm": 1.2578125, + "learning_rate": 0.0002909451037403342, + "loss": 0.2354, + "step": 324270 + }, + { + "epoch": 13.43, + "grad_norm": 0.83203125, + "learning_rate": 0.00029093440495161306, + "loss": 0.1938, + "step": 324280 + }, + { + "epoch": 13.43, + "grad_norm": 0.4375, + "learning_rate": 0.00029092370608585765, + "loss": 0.1656, + "step": 324290 + }, + { + "epoch": 13.43, + "grad_norm": 0.734375, + "learning_rate": 0.0002909130071430883, + "loss": 0.1669, + "step": 324300 + }, + { + "epoch": 13.43, + "grad_norm": 0.9140625, + "learning_rate": 0.0002909023081233252, + "loss": 0.195, + "step": 324310 + }, + { + "epoch": 13.43, + "grad_norm": 1.2109375, + "learning_rate": 0.00029089160902658816, + "loss": 0.2098, + "step": 324320 + }, + { + "epoch": 13.43, + "grad_norm": 0.44921875, + "learning_rate": 0.0002908809098528977, + "loss": 0.1856, + "step": 324330 + }, + { + "epoch": 13.43, + "grad_norm": 0.5390625, + "learning_rate": 0.0002908702106022738, + "loss": 0.1783, + "step": 324340 + }, + { + "epoch": 13.43, + "grad_norm": 0.73828125, + "learning_rate": 0.00029085951127473644, + "loss": 0.1916, + "step": 324350 + }, + { + "epoch": 13.43, + "grad_norm": 1.4765625, + "learning_rate": 0.000290848811870306, + "loss": 0.2316, + "step": 324360 + }, + { + "epoch": 13.44, + "grad_norm": 1.578125, + "learning_rate": 0.0002908381123890024, + "loss": 0.1976, + "step": 324370 + }, + { + "epoch": 13.44, + "grad_norm": 0.63671875, + "learning_rate": 0.000290827412830846, + "loss": 0.2169, + "step": 324380 + }, + { + "epoch": 13.44, + "grad_norm": 0.87890625, + "learning_rate": 0.0002908167131958568, + "loss": 0.1614, + "step": 324390 + }, + { + "epoch": 13.44, + "grad_norm": 1.0546875, + "learning_rate": 0.0002908060134840548, + "loss": 0.2345, + "step": 324400 + }, + { + "epoch": 13.44, + "grad_norm": 0.51171875, + "learning_rate": 0.00029079531369546055, + "loss": 0.2101, + "step": 324410 + }, + { + "epoch": 13.44, + "grad_norm": 0.671875, + "learning_rate": 0.0002907846138300937, + "loss": 0.1627, + "step": 324420 + }, + { + "epoch": 13.44, + "grad_norm": 0.6015625, + "learning_rate": 0.0002907739138879747, + "loss": 0.1499, + "step": 324430 + }, + { + "epoch": 13.44, + "grad_norm": 0.388671875, + "learning_rate": 0.0002907632138691236, + "loss": 0.1853, + "step": 324440 + }, + { + "epoch": 13.44, + "grad_norm": 0.85546875, + "learning_rate": 0.00029075251377356056, + "loss": 0.2348, + "step": 324450 + }, + { + "epoch": 13.44, + "grad_norm": 1.5546875, + "learning_rate": 0.00029074181360130564, + "loss": 0.2084, + "step": 324460 + }, + { + "epoch": 13.44, + "grad_norm": 0.3203125, + "learning_rate": 0.00029073111335237904, + "loss": 0.163, + "step": 324470 + }, + { + "epoch": 13.44, + "grad_norm": 0.7421875, + "learning_rate": 0.00029072041302680087, + "loss": 0.1904, + "step": 324480 + }, + { + "epoch": 13.44, + "grad_norm": 1.8046875, + "learning_rate": 0.00029070971262459133, + "loss": 0.2127, + "step": 324490 + }, + { + "epoch": 13.44, + "grad_norm": 0.640625, + "learning_rate": 0.00029069901214577045, + "loss": 0.2224, + "step": 324500 + }, + { + "epoch": 13.44, + "grad_norm": 0.75390625, + "learning_rate": 0.0002906883115903585, + "loss": 0.1155, + "step": 324510 + }, + { + "epoch": 13.44, + "grad_norm": 1.2578125, + "learning_rate": 0.00029067761095837545, + "loss": 0.1825, + "step": 324520 + }, + { + "epoch": 13.44, + "grad_norm": 0.51171875, + "learning_rate": 0.00029066691024984156, + "loss": 0.1862, + "step": 324530 + }, + { + "epoch": 13.44, + "grad_norm": 0.412109375, + "learning_rate": 0.00029065620946477693, + "loss": 0.192, + "step": 324540 + }, + { + "epoch": 13.44, + "grad_norm": 0.6640625, + "learning_rate": 0.00029064550860320173, + "loss": 0.1903, + "step": 324550 + }, + { + "epoch": 13.44, + "grad_norm": 0.5546875, + "learning_rate": 0.00029063480766513606, + "loss": 0.1785, + "step": 324560 + }, + { + "epoch": 13.44, + "grad_norm": 0.498046875, + "learning_rate": 0.00029062410665060015, + "loss": 0.1743, + "step": 324570 + }, + { + "epoch": 13.44, + "grad_norm": 0.6953125, + "learning_rate": 0.0002906134055596139, + "loss": 0.1551, + "step": 324580 + }, + { + "epoch": 13.44, + "grad_norm": 0.42578125, + "learning_rate": 0.0002906027043921977, + "loss": 0.1791, + "step": 324590 + }, + { + "epoch": 13.44, + "grad_norm": 1.1640625, + "learning_rate": 0.00029059200314837164, + "loss": 0.2047, + "step": 324600 + }, + { + "epoch": 13.45, + "grad_norm": 0.74609375, + "learning_rate": 0.00029058130182815566, + "loss": 0.1969, + "step": 324610 + }, + { + "epoch": 13.45, + "grad_norm": 0.201171875, + "learning_rate": 0.00029057060043157016, + "loss": 0.1661, + "step": 324620 + }, + { + "epoch": 13.45, + "grad_norm": 0.66796875, + "learning_rate": 0.00029055989895863515, + "loss": 0.2293, + "step": 324630 + }, + { + "epoch": 13.45, + "grad_norm": 0.7890625, + "learning_rate": 0.0002905491974093708, + "loss": 0.1782, + "step": 324640 + }, + { + "epoch": 13.45, + "grad_norm": 1.2734375, + "learning_rate": 0.00029053849578379733, + "loss": 0.1539, + "step": 324650 + }, + { + "epoch": 13.45, + "grad_norm": 0.796875, + "learning_rate": 0.00029052779408193465, + "loss": 0.1805, + "step": 324660 + }, + { + "epoch": 13.45, + "grad_norm": 0.9140625, + "learning_rate": 0.00029051709230380307, + "loss": 0.1355, + "step": 324670 + }, + { + "epoch": 13.45, + "grad_norm": 0.7421875, + "learning_rate": 0.00029050639044942274, + "loss": 0.2205, + "step": 324680 + }, + { + "epoch": 13.45, + "grad_norm": 0.98046875, + "learning_rate": 0.0002904956885188137, + "loss": 0.2177, + "step": 324690 + }, + { + "epoch": 13.45, + "grad_norm": 1.0859375, + "learning_rate": 0.00029048498651199615, + "loss": 0.2219, + "step": 324700 + }, + { + "epoch": 13.45, + "grad_norm": 1.8515625, + "learning_rate": 0.00029047428442899027, + "loss": 0.1739, + "step": 324710 + }, + { + "epoch": 13.45, + "grad_norm": 1.078125, + "learning_rate": 0.00029046358226981616, + "loss": 0.1554, + "step": 324720 + }, + { + "epoch": 13.45, + "grad_norm": 0.7421875, + "learning_rate": 0.00029045288003449397, + "loss": 0.1974, + "step": 324730 + }, + { + "epoch": 13.45, + "grad_norm": 0.640625, + "learning_rate": 0.00029044217772304377, + "loss": 0.1978, + "step": 324740 + }, + { + "epoch": 13.45, + "grad_norm": 0.76171875, + "learning_rate": 0.00029043147533548583, + "loss": 0.2207, + "step": 324750 + }, + { + "epoch": 13.45, + "grad_norm": 0.5859375, + "learning_rate": 0.0002904207728718401, + "loss": 0.2269, + "step": 324760 + }, + { + "epoch": 13.45, + "grad_norm": 1.1875, + "learning_rate": 0.0002904100703321269, + "loss": 0.2119, + "step": 324770 + }, + { + "epoch": 13.45, + "grad_norm": 1.2109375, + "learning_rate": 0.00029039936771636636, + "loss": 0.1615, + "step": 324780 + }, + { + "epoch": 13.45, + "grad_norm": 0.765625, + "learning_rate": 0.0002903886650245786, + "loss": 0.1879, + "step": 324790 + }, + { + "epoch": 13.45, + "grad_norm": 0.39453125, + "learning_rate": 0.00029037796225678367, + "loss": 0.1514, + "step": 324800 + }, + { + "epoch": 13.45, + "grad_norm": 0.87109375, + "learning_rate": 0.0002903672594130018, + "loss": 0.1667, + "step": 324810 + }, + { + "epoch": 13.45, + "grad_norm": 1.2109375, + "learning_rate": 0.00029035655649325303, + "loss": 0.241, + "step": 324820 + }, + { + "epoch": 13.45, + "grad_norm": 1.0625, + "learning_rate": 0.00029034585349755764, + "loss": 0.2157, + "step": 324830 + }, + { + "epoch": 13.45, + "grad_norm": 0.5, + "learning_rate": 0.0002903351504259357, + "loss": 0.2096, + "step": 324840 + }, + { + "epoch": 13.46, + "grad_norm": 0.5703125, + "learning_rate": 0.00029032444727840735, + "loss": 0.2549, + "step": 324850 + }, + { + "epoch": 13.46, + "grad_norm": 1.015625, + "learning_rate": 0.00029031374405499285, + "loss": 0.2113, + "step": 324860 + }, + { + "epoch": 13.46, + "grad_norm": 0.81640625, + "learning_rate": 0.0002903030407557121, + "loss": 0.1873, + "step": 324870 + }, + { + "epoch": 13.46, + "grad_norm": 1.140625, + "learning_rate": 0.00029029233738058543, + "loss": 0.1666, + "step": 324880 + }, + { + "epoch": 13.46, + "grad_norm": 1.375, + "learning_rate": 0.000290281633929633, + "loss": 0.206, + "step": 324890 + }, + { + "epoch": 13.46, + "grad_norm": 0.62890625, + "learning_rate": 0.0002902709304028748, + "loss": 0.1884, + "step": 324900 + }, + { + "epoch": 13.46, + "grad_norm": 0.6328125, + "learning_rate": 0.00029026022680033104, + "loss": 0.218, + "step": 324910 + }, + { + "epoch": 13.46, + "grad_norm": 0.486328125, + "learning_rate": 0.0002902495231220219, + "loss": 0.2207, + "step": 324920 + }, + { + "epoch": 13.46, + "grad_norm": 0.51953125, + "learning_rate": 0.0002902388193679675, + "loss": 0.1772, + "step": 324930 + }, + { + "epoch": 13.46, + "grad_norm": 0.75, + "learning_rate": 0.0002902281155381881, + "loss": 0.2148, + "step": 324940 + }, + { + "epoch": 13.46, + "grad_norm": 0.251953125, + "learning_rate": 0.0002902174116327036, + "loss": 0.2003, + "step": 324950 + }, + { + "epoch": 13.46, + "grad_norm": 0.478515625, + "learning_rate": 0.00029020670765153426, + "loss": 0.2189, + "step": 324960 + }, + { + "epoch": 13.46, + "grad_norm": 1.53125, + "learning_rate": 0.00029019600359470033, + "loss": 0.2046, + "step": 324970 + }, + { + "epoch": 13.46, + "grad_norm": 0.8125, + "learning_rate": 0.0002901852994622219, + "loss": 0.2252, + "step": 324980 + }, + { + "epoch": 13.46, + "grad_norm": 0.37890625, + "learning_rate": 0.0002901745952541189, + "loss": 0.1714, + "step": 324990 + }, + { + "epoch": 13.46, + "grad_norm": 0.79296875, + "learning_rate": 0.0002901638909704118, + "loss": 0.1655, + "step": 325000 + }, + { + "epoch": 13.46, + "grad_norm": 0.62109375, + "learning_rate": 0.00029015318661112057, + "loss": 0.1771, + "step": 325010 + }, + { + "epoch": 13.46, + "grad_norm": 0.59375, + "learning_rate": 0.0002901424821762654, + "loss": 0.2299, + "step": 325020 + }, + { + "epoch": 13.46, + "grad_norm": 0.828125, + "learning_rate": 0.0002901317776658663, + "loss": 0.1832, + "step": 325030 + }, + { + "epoch": 13.46, + "grad_norm": 0.69140625, + "learning_rate": 0.00029012107307994364, + "loss": 0.1699, + "step": 325040 + }, + { + "epoch": 13.46, + "grad_norm": 0.5, + "learning_rate": 0.00029011036841851737, + "loss": 0.1714, + "step": 325050 + }, + { + "epoch": 13.46, + "grad_norm": 0.90625, + "learning_rate": 0.00029009966368160776, + "loss": 0.1801, + "step": 325060 + }, + { + "epoch": 13.46, + "grad_norm": 1.3671875, + "learning_rate": 0.0002900889588692349, + "loss": 0.1644, + "step": 325070 + }, + { + "epoch": 13.46, + "grad_norm": 0.546875, + "learning_rate": 0.00029007825398141894, + "loss": 0.2074, + "step": 325080 + }, + { + "epoch": 13.47, + "grad_norm": 0.76953125, + "learning_rate": 0.00029006754901818007, + "loss": 0.2366, + "step": 325090 + }, + { + "epoch": 13.47, + "grad_norm": 1.2109375, + "learning_rate": 0.00029005684397953834, + "loss": 0.1654, + "step": 325100 + }, + { + "epoch": 13.47, + "grad_norm": 1.0859375, + "learning_rate": 0.00029004613886551405, + "loss": 0.2228, + "step": 325110 + }, + { + "epoch": 13.47, + "grad_norm": 0.6796875, + "learning_rate": 0.00029003543367612725, + "loss": 0.1772, + "step": 325120 + }, + { + "epoch": 13.47, + "grad_norm": 1.2578125, + "learning_rate": 0.000290024728411398, + "loss": 0.235, + "step": 325130 + }, + { + "epoch": 13.47, + "grad_norm": 0.60546875, + "learning_rate": 0.0002900140230713466, + "loss": 0.2116, + "step": 325140 + }, + { + "epoch": 13.47, + "grad_norm": 0.796875, + "learning_rate": 0.0002900033176559931, + "loss": 0.1835, + "step": 325150 + }, + { + "epoch": 13.47, + "grad_norm": 1.25, + "learning_rate": 0.00028999261216535767, + "loss": 0.2324, + "step": 325160 + }, + { + "epoch": 13.47, + "grad_norm": 0.37109375, + "learning_rate": 0.00028998190659946046, + "loss": 0.2243, + "step": 325170 + }, + { + "epoch": 13.47, + "grad_norm": 0.353515625, + "learning_rate": 0.0002899712009583216, + "loss": 0.1953, + "step": 325180 + }, + { + "epoch": 13.47, + "grad_norm": 1.1328125, + "learning_rate": 0.00028996049524196134, + "loss": 0.1958, + "step": 325190 + }, + { + "epoch": 13.47, + "grad_norm": 1.390625, + "learning_rate": 0.0002899497894503997, + "loss": 0.203, + "step": 325200 + }, + { + "epoch": 13.47, + "grad_norm": 1.0625, + "learning_rate": 0.0002899390835836568, + "loss": 0.2327, + "step": 325210 + }, + { + "epoch": 13.47, + "grad_norm": 1.0625, + "learning_rate": 0.00028992837764175294, + "loss": 0.1796, + "step": 325220 + }, + { + "epoch": 13.47, + "grad_norm": 0.625, + "learning_rate": 0.00028991767162470816, + "loss": 0.1724, + "step": 325230 + }, + { + "epoch": 13.47, + "grad_norm": 0.3359375, + "learning_rate": 0.00028990696553254263, + "loss": 0.1677, + "step": 325240 + }, + { + "epoch": 13.47, + "grad_norm": 0.97265625, + "learning_rate": 0.00028989625936527653, + "loss": 0.2168, + "step": 325250 + }, + { + "epoch": 13.47, + "grad_norm": 0.302734375, + "learning_rate": 0.0002898855531229299, + "loss": 0.1857, + "step": 325260 + }, + { + "epoch": 13.47, + "grad_norm": 1.09375, + "learning_rate": 0.0002898748468055231, + "loss": 0.2292, + "step": 325270 + }, + { + "epoch": 13.47, + "grad_norm": 1.0703125, + "learning_rate": 0.000289864140413076, + "loss": 0.2544, + "step": 325280 + }, + { + "epoch": 13.47, + "grad_norm": 0.875, + "learning_rate": 0.00028985343394560896, + "loss": 0.1986, + "step": 325290 + }, + { + "epoch": 13.47, + "grad_norm": 0.69921875, + "learning_rate": 0.0002898427274031421, + "loss": 0.1763, + "step": 325300 + }, + { + "epoch": 13.47, + "grad_norm": 0.63671875, + "learning_rate": 0.00028983202078569544, + "loss": 0.1784, + "step": 325310 + }, + { + "epoch": 13.47, + "grad_norm": 0.890625, + "learning_rate": 0.00028982131409328925, + "loss": 0.2534, + "step": 325320 + }, + { + "epoch": 13.48, + "grad_norm": 0.40234375, + "learning_rate": 0.0002898106073259437, + "loss": 0.1623, + "step": 325330 + }, + { + "epoch": 13.48, + "grad_norm": 0.62890625, + "learning_rate": 0.0002897999004836788, + "loss": 0.2089, + "step": 325340 + }, + { + "epoch": 13.48, + "grad_norm": 0.80859375, + "learning_rate": 0.0002897891935665149, + "loss": 0.1603, + "step": 325350 + }, + { + "epoch": 13.48, + "grad_norm": 0.64453125, + "learning_rate": 0.000289778486574472, + "loss": 0.1866, + "step": 325360 + }, + { + "epoch": 13.48, + "grad_norm": 1.8203125, + "learning_rate": 0.0002897677795075702, + "loss": 0.1909, + "step": 325370 + }, + { + "epoch": 13.48, + "grad_norm": 0.328125, + "learning_rate": 0.0002897570723658298, + "loss": 0.1556, + "step": 325380 + }, + { + "epoch": 13.48, + "grad_norm": 0.70703125, + "learning_rate": 0.0002897463651492708, + "loss": 0.1695, + "step": 325390 + }, + { + "epoch": 13.48, + "grad_norm": 1.0859375, + "learning_rate": 0.00028973565785791355, + "loss": 0.1732, + "step": 325400 + }, + { + "epoch": 13.48, + "grad_norm": 0.65625, + "learning_rate": 0.00028972495049177805, + "loss": 0.2246, + "step": 325410 + }, + { + "epoch": 13.48, + "grad_norm": 0.72265625, + "learning_rate": 0.00028971424305088443, + "loss": 0.188, + "step": 325420 + }, + { + "epoch": 13.48, + "grad_norm": 1.40625, + "learning_rate": 0.000289703535535253, + "loss": 0.1582, + "step": 325430 + }, + { + "epoch": 13.48, + "grad_norm": 0.291015625, + "learning_rate": 0.00028969282794490375, + "loss": 0.1994, + "step": 325440 + }, + { + "epoch": 13.48, + "grad_norm": 0.58203125, + "learning_rate": 0.00028968212027985686, + "loss": 0.2164, + "step": 325450 + }, + { + "epoch": 13.48, + "grad_norm": 0.490234375, + "learning_rate": 0.0002896714125401325, + "loss": 0.1709, + "step": 325460 + }, + { + "epoch": 13.48, + "grad_norm": 0.68359375, + "learning_rate": 0.0002896607047257508, + "loss": 0.2195, + "step": 325470 + }, + { + "epoch": 13.48, + "grad_norm": 0.29296875, + "learning_rate": 0.00028964999683673203, + "loss": 0.1454, + "step": 325480 + }, + { + "epoch": 13.48, + "grad_norm": 0.423828125, + "learning_rate": 0.0002896392888730962, + "loss": 0.2018, + "step": 325490 + }, + { + "epoch": 13.48, + "grad_norm": 0.80078125, + "learning_rate": 0.0002896285808348635, + "loss": 0.1827, + "step": 325500 + }, + { + "epoch": 13.48, + "grad_norm": 0.9296875, + "learning_rate": 0.00028961787272205406, + "loss": 0.1308, + "step": 325510 + }, + { + "epoch": 13.48, + "grad_norm": 0.52734375, + "learning_rate": 0.0002896071645346881, + "loss": 0.1918, + "step": 325520 + }, + { + "epoch": 13.48, + "grad_norm": 1.671875, + "learning_rate": 0.00028959645627278576, + "loss": 0.2114, + "step": 325530 + }, + { + "epoch": 13.48, + "grad_norm": 0.5546875, + "learning_rate": 0.00028958574793636716, + "loss": 0.1221, + "step": 325540 + }, + { + "epoch": 13.48, + "grad_norm": 0.40234375, + "learning_rate": 0.0002895750395254524, + "loss": 0.1713, + "step": 325550 + }, + { + "epoch": 13.48, + "grad_norm": 0.73828125, + "learning_rate": 0.00028956433104006176, + "loss": 0.1695, + "step": 325560 + }, + { + "epoch": 13.49, + "grad_norm": 1.7734375, + "learning_rate": 0.0002895536224802153, + "loss": 0.1957, + "step": 325570 + }, + { + "epoch": 13.49, + "grad_norm": 0.921875, + "learning_rate": 0.0002895429138459331, + "loss": 0.2041, + "step": 325580 + }, + { + "epoch": 13.49, + "grad_norm": 0.396484375, + "learning_rate": 0.00028953220513723555, + "loss": 0.173, + "step": 325590 + }, + { + "epoch": 13.49, + "grad_norm": 1.578125, + "learning_rate": 0.00028952149635414257, + "loss": 0.1663, + "step": 325600 + }, + { + "epoch": 13.49, + "grad_norm": 0.8984375, + "learning_rate": 0.0002895107874966744, + "loss": 0.1616, + "step": 325610 + }, + { + "epoch": 13.49, + "grad_norm": 0.5859375, + "learning_rate": 0.0002895000785648513, + "loss": 0.2449, + "step": 325620 + }, + { + "epoch": 13.49, + "grad_norm": 0.60546875, + "learning_rate": 0.00028948936955869314, + "loss": 0.2067, + "step": 325630 + }, + { + "epoch": 13.49, + "grad_norm": 0.5859375, + "learning_rate": 0.0002894786604782204, + "loss": 0.2112, + "step": 325640 + }, + { + "epoch": 13.49, + "grad_norm": 1.078125, + "learning_rate": 0.000289467951323453, + "loss": 0.2261, + "step": 325650 + }, + { + "epoch": 13.49, + "grad_norm": 1.2265625, + "learning_rate": 0.0002894572420944112, + "loss": 0.1767, + "step": 325660 + }, + { + "epoch": 13.49, + "grad_norm": 0.9296875, + "learning_rate": 0.0002894465327911152, + "loss": 0.1971, + "step": 325670 + }, + { + "epoch": 13.49, + "grad_norm": 1.9375, + "learning_rate": 0.000289435823413585, + "loss": 0.1988, + "step": 325680 + }, + { + "epoch": 13.49, + "grad_norm": 1.03125, + "learning_rate": 0.0002894251139618409, + "loss": 0.2109, + "step": 325690 + }, + { + "epoch": 13.49, + "grad_norm": 0.87890625, + "learning_rate": 0.000289414404435903, + "loss": 0.2029, + "step": 325700 + }, + { + "epoch": 13.49, + "grad_norm": 2.1875, + "learning_rate": 0.0002894036948357913, + "loss": 0.1813, + "step": 325710 + }, + { + "epoch": 13.49, + "grad_norm": 0.3671875, + "learning_rate": 0.0002893929851615263, + "loss": 0.17, + "step": 325720 + }, + { + "epoch": 13.49, + "grad_norm": 0.0, + "learning_rate": 0.0002893822754131278, + "loss": 0.2312, + "step": 325730 + }, + { + "epoch": 13.49, + "grad_norm": 0.439453125, + "learning_rate": 0.0002893715655906162, + "loss": 0.2347, + "step": 325740 + }, + { + "epoch": 13.49, + "grad_norm": 0.515625, + "learning_rate": 0.00028936085569401153, + "loss": 0.2227, + "step": 325750 + }, + { + "epoch": 13.49, + "grad_norm": 0.9140625, + "learning_rate": 0.00028935014572333397, + "loss": 0.2245, + "step": 325760 + }, + { + "epoch": 13.49, + "grad_norm": 0.9765625, + "learning_rate": 0.00028933943567860377, + "loss": 0.2114, + "step": 325770 + }, + { + "epoch": 13.49, + "grad_norm": 1.5234375, + "learning_rate": 0.000289328725559841, + "loss": 0.1961, + "step": 325780 + }, + { + "epoch": 13.49, + "grad_norm": 0.94921875, + "learning_rate": 0.00028931801536706564, + "loss": 0.2206, + "step": 325790 + }, + { + "epoch": 13.49, + "grad_norm": 0.8125, + "learning_rate": 0.0002893073051002982, + "loss": 0.2067, + "step": 325800 + }, + { + "epoch": 13.5, + "grad_norm": 1.90625, + "learning_rate": 0.00028929659475955853, + "loss": 0.1967, + "step": 325810 + }, + { + "epoch": 13.5, + "grad_norm": 0.93359375, + "learning_rate": 0.00028928588434486694, + "loss": 0.176, + "step": 325820 + }, + { + "epoch": 13.5, + "grad_norm": 0.5, + "learning_rate": 0.0002892751738562437, + "loss": 0.2352, + "step": 325830 + }, + { + "epoch": 13.5, + "grad_norm": 0.765625, + "learning_rate": 0.0002892644632937086, + "loss": 0.2101, + "step": 325840 + }, + { + "epoch": 13.5, + "grad_norm": 0.6796875, + "learning_rate": 0.0002892537526572822, + "loss": 0.1426, + "step": 325850 + }, + { + "epoch": 13.5, + "grad_norm": 0.9375, + "learning_rate": 0.00028924304194698436, + "loss": 0.2082, + "step": 325860 + }, + { + "epoch": 13.5, + "grad_norm": 1.109375, + "learning_rate": 0.00028923233116283543, + "loss": 0.1927, + "step": 325870 + }, + { + "epoch": 13.5, + "grad_norm": 0.84375, + "learning_rate": 0.0002892216203048555, + "loss": 0.1652, + "step": 325880 + }, + { + "epoch": 13.5, + "grad_norm": 1.2890625, + "learning_rate": 0.0002892109093730647, + "loss": 0.1843, + "step": 325890 + }, + { + "epoch": 13.5, + "grad_norm": 0.80859375, + "learning_rate": 0.0002892001983674831, + "loss": 0.2505, + "step": 325900 + }, + { + "epoch": 13.5, + "grad_norm": 1.0859375, + "learning_rate": 0.0002891894872881311, + "loss": 0.1682, + "step": 325910 + }, + { + "epoch": 13.5, + "grad_norm": 2.21875, + "learning_rate": 0.00028917877613502866, + "loss": 0.21, + "step": 325920 + }, + { + "epoch": 13.5, + "grad_norm": 0.55859375, + "learning_rate": 0.00028916806490819597, + "loss": 0.1871, + "step": 325930 + }, + { + "epoch": 13.5, + "grad_norm": 0.4609375, + "learning_rate": 0.0002891573536076532, + "loss": 0.1536, + "step": 325940 + }, + { + "epoch": 13.5, + "grad_norm": 2.5, + "learning_rate": 0.0002891466422334206, + "loss": 0.2092, + "step": 325950 + }, + { + "epoch": 13.5, + "grad_norm": 1.3203125, + "learning_rate": 0.0002891359307855182, + "loss": 0.155, + "step": 325960 + }, + { + "epoch": 13.5, + "grad_norm": 1.2890625, + "learning_rate": 0.00028912521926396624, + "loss": 0.2347, + "step": 325970 + }, + { + "epoch": 13.5, + "grad_norm": 0.83203125, + "learning_rate": 0.00028911450766878476, + "loss": 0.1969, + "step": 325980 + }, + { + "epoch": 13.5, + "grad_norm": 0.6015625, + "learning_rate": 0.0002891037959999941, + "loss": 0.2557, + "step": 325990 + }, + { + "epoch": 13.5, + "grad_norm": 0.6328125, + "learning_rate": 0.0002890930842576142, + "loss": 0.2243, + "step": 326000 + }, + { + "epoch": 13.5, + "grad_norm": 0.8125, + "learning_rate": 0.0002890823724416655, + "loss": 0.2134, + "step": 326010 + }, + { + "epoch": 13.5, + "grad_norm": 0.53125, + "learning_rate": 0.0002890716605521678, + "loss": 0.2062, + "step": 326020 + }, + { + "epoch": 13.5, + "grad_norm": 1.171875, + "learning_rate": 0.00028906094858914154, + "loss": 0.1731, + "step": 326030 + }, + { + "epoch": 13.5, + "grad_norm": 0.9375, + "learning_rate": 0.0002890502365526069, + "loss": 0.182, + "step": 326040 + }, + { + "epoch": 13.5, + "grad_norm": 0.55078125, + "learning_rate": 0.0002890395244425838, + "loss": 0.1662, + "step": 326050 + }, + { + "epoch": 13.51, + "grad_norm": 0.76171875, + "learning_rate": 0.00028902881225909254, + "loss": 0.1938, + "step": 326060 + }, + { + "epoch": 13.51, + "grad_norm": 1.1015625, + "learning_rate": 0.00028901810000215336, + "loss": 0.1461, + "step": 326070 + }, + { + "epoch": 13.51, + "grad_norm": 0.0, + "learning_rate": 0.0002890073876717862, + "loss": 0.2341, + "step": 326080 + }, + { + "epoch": 13.51, + "grad_norm": 0.84765625, + "learning_rate": 0.0002889966752680115, + "loss": 0.1888, + "step": 326090 + }, + { + "epoch": 13.51, + "grad_norm": 1.6328125, + "learning_rate": 0.0002889859627908492, + "loss": 0.2018, + "step": 326100 + }, + { + "epoch": 13.51, + "grad_norm": 2.15625, + "learning_rate": 0.0002889752502403195, + "loss": 0.1915, + "step": 326110 + }, + { + "epoch": 13.51, + "grad_norm": 1.0078125, + "learning_rate": 0.0002889645376164426, + "loss": 0.1896, + "step": 326120 + }, + { + "epoch": 13.51, + "grad_norm": 0.88671875, + "learning_rate": 0.0002889538249192386, + "loss": 0.1981, + "step": 326130 + }, + { + "epoch": 13.51, + "grad_norm": 0.6171875, + "learning_rate": 0.00028894311214872775, + "loss": 0.2015, + "step": 326140 + }, + { + "epoch": 13.51, + "grad_norm": 1.0625, + "learning_rate": 0.0002889323993049302, + "loss": 0.2191, + "step": 326150 + }, + { + "epoch": 13.51, + "grad_norm": 0.75, + "learning_rate": 0.000288921686387866, + "loss": 0.1763, + "step": 326160 + }, + { + "epoch": 13.51, + "grad_norm": 1.171875, + "learning_rate": 0.0002889109733975555, + "loss": 0.1651, + "step": 326170 + }, + { + "epoch": 13.51, + "grad_norm": 0.8671875, + "learning_rate": 0.00028890026033401875, + "loss": 0.2056, + "step": 326180 + }, + { + "epoch": 13.51, + "grad_norm": 2.78125, + "learning_rate": 0.0002888895471972758, + "loss": 0.1597, + "step": 326190 + }, + { + "epoch": 13.51, + "grad_norm": 1.390625, + "learning_rate": 0.00028887883398734707, + "loss": 0.1878, + "step": 326200 + }, + { + "epoch": 13.51, + "grad_norm": 1.328125, + "learning_rate": 0.00028886812070425243, + "loss": 0.1615, + "step": 326210 + }, + { + "epoch": 13.51, + "grad_norm": 0.6171875, + "learning_rate": 0.00028885740734801226, + "loss": 0.2041, + "step": 326220 + }, + { + "epoch": 13.51, + "grad_norm": 0.94140625, + "learning_rate": 0.0002888466939186466, + "loss": 0.1564, + "step": 326230 + }, + { + "epoch": 13.51, + "grad_norm": 0.0, + "learning_rate": 0.00028883598041617563, + "loss": 0.2204, + "step": 326240 + }, + { + "epoch": 13.51, + "grad_norm": 1.0234375, + "learning_rate": 0.00028882526684061963, + "loss": 0.2649, + "step": 326250 + }, + { + "epoch": 13.51, + "grad_norm": 0.77734375, + "learning_rate": 0.00028881455319199864, + "loss": 0.1852, + "step": 326260 + }, + { + "epoch": 13.51, + "grad_norm": 1.296875, + "learning_rate": 0.0002888038394703328, + "loss": 0.146, + "step": 326270 + }, + { + "epoch": 13.51, + "grad_norm": 2.046875, + "learning_rate": 0.0002887931256756424, + "loss": 0.1792, + "step": 326280 + }, + { + "epoch": 13.51, + "grad_norm": 1.1796875, + "learning_rate": 0.0002887824118079475, + "loss": 0.1763, + "step": 326290 + }, + { + "epoch": 13.52, + "grad_norm": 1.0859375, + "learning_rate": 0.00028877169786726826, + "loss": 0.1787, + "step": 326300 + }, + { + "epoch": 13.52, + "grad_norm": 1.2578125, + "learning_rate": 0.00028876098385362486, + "loss": 0.2119, + "step": 326310 + }, + { + "epoch": 13.52, + "grad_norm": 0.29296875, + "learning_rate": 0.0002887502697670375, + "loss": 0.1859, + "step": 326320 + }, + { + "epoch": 13.52, + "grad_norm": 0.96875, + "learning_rate": 0.0002887395556075264, + "loss": 0.1839, + "step": 326330 + }, + { + "epoch": 13.52, + "grad_norm": 1.21875, + "learning_rate": 0.00028872884137511154, + "loss": 0.225, + "step": 326340 + }, + { + "epoch": 13.52, + "grad_norm": 0.392578125, + "learning_rate": 0.0002887181270698132, + "loss": 0.2679, + "step": 326350 + }, + { + "epoch": 13.52, + "grad_norm": 0.515625, + "learning_rate": 0.0002887074126916515, + "loss": 0.229, + "step": 326360 + }, + { + "epoch": 13.52, + "grad_norm": 0.7578125, + "learning_rate": 0.00028869669824064663, + "loss": 0.2105, + "step": 326370 + }, + { + "epoch": 13.52, + "grad_norm": 0.8359375, + "learning_rate": 0.0002886859837168189, + "loss": 0.1896, + "step": 326380 + }, + { + "epoch": 13.52, + "grad_norm": 0.33984375, + "learning_rate": 0.00028867526912018817, + "loss": 0.2082, + "step": 326390 + }, + { + "epoch": 13.52, + "grad_norm": 0.7578125, + "learning_rate": 0.0002886645544507748, + "loss": 0.2035, + "step": 326400 + }, + { + "epoch": 13.52, + "grad_norm": 0.71484375, + "learning_rate": 0.0002886538397085989, + "loss": 0.1928, + "step": 326410 + }, + { + "epoch": 13.52, + "grad_norm": 0.8984375, + "learning_rate": 0.0002886431248936807, + "loss": 0.1772, + "step": 326420 + }, + { + "epoch": 13.52, + "grad_norm": 0.8984375, + "learning_rate": 0.0002886324100060402, + "loss": 0.1638, + "step": 326430 + }, + { + "epoch": 13.52, + "grad_norm": 0.66015625, + "learning_rate": 0.0002886216950456978, + "loss": 0.1748, + "step": 326440 + }, + { + "epoch": 13.52, + "grad_norm": 0.953125, + "learning_rate": 0.00028861098001267346, + "loss": 0.2213, + "step": 326450 + }, + { + "epoch": 13.52, + "grad_norm": 0.78515625, + "learning_rate": 0.0002886002649069875, + "loss": 0.1642, + "step": 326460 + }, + { + "epoch": 13.52, + "grad_norm": 0.3203125, + "learning_rate": 0.00028858954972865996, + "loss": 0.1668, + "step": 326470 + }, + { + "epoch": 13.52, + "grad_norm": 0.421875, + "learning_rate": 0.00028857883447771105, + "loss": 0.1902, + "step": 326480 + }, + { + "epoch": 13.52, + "grad_norm": 0.353515625, + "learning_rate": 0.00028856811915416094, + "loss": 0.1914, + "step": 326490 + }, + { + "epoch": 13.52, + "grad_norm": 0.7265625, + "learning_rate": 0.0002885574037580299, + "loss": 0.175, + "step": 326500 + }, + { + "epoch": 13.52, + "grad_norm": 0.75390625, + "learning_rate": 0.00028854668828933784, + "loss": 0.1585, + "step": 326510 + }, + { + "epoch": 13.52, + "grad_norm": 0.72265625, + "learning_rate": 0.00028853597274810513, + "loss": 0.2161, + "step": 326520 + }, + { + "epoch": 13.52, + "grad_norm": 0.78515625, + "learning_rate": 0.0002885252571343518, + "loss": 0.1455, + "step": 326530 + }, + { + "epoch": 13.53, + "grad_norm": 1.640625, + "learning_rate": 0.00028851454144809824, + "loss": 0.1656, + "step": 326540 + }, + { + "epoch": 13.53, + "grad_norm": 0.81640625, + "learning_rate": 0.0002885038256893644, + "loss": 0.1238, + "step": 326550 + }, + { + "epoch": 13.53, + "grad_norm": 0.3359375, + "learning_rate": 0.0002884931098581706, + "loss": 0.1473, + "step": 326560 + }, + { + "epoch": 13.53, + "grad_norm": 2.296875, + "learning_rate": 0.0002884823939545368, + "loss": 0.1834, + "step": 326570 + }, + { + "epoch": 13.53, + "grad_norm": 0.37890625, + "learning_rate": 0.0002884716779784834, + "loss": 0.1954, + "step": 326580 + }, + { + "epoch": 13.53, + "grad_norm": 0.3359375, + "learning_rate": 0.00028846096193003035, + "loss": 0.1955, + "step": 326590 + }, + { + "epoch": 13.53, + "grad_norm": 1.40625, + "learning_rate": 0.00028845024580919797, + "loss": 0.2328, + "step": 326600 + }, + { + "epoch": 13.53, + "grad_norm": 1.0625, + "learning_rate": 0.0002884395296160064, + "loss": 0.2013, + "step": 326610 + }, + { + "epoch": 13.53, + "grad_norm": 1.8828125, + "learning_rate": 0.0002884288133504757, + "loss": 0.1829, + "step": 326620 + }, + { + "epoch": 13.53, + "grad_norm": 1.3828125, + "learning_rate": 0.0002884180970126262, + "loss": 0.1895, + "step": 326630 + }, + { + "epoch": 13.53, + "grad_norm": 1.03125, + "learning_rate": 0.000288407380602478, + "loss": 0.1607, + "step": 326640 + }, + { + "epoch": 13.53, + "grad_norm": 0.9921875, + "learning_rate": 0.0002883966641200513, + "loss": 0.1709, + "step": 326650 + }, + { + "epoch": 13.53, + "grad_norm": 0.84375, + "learning_rate": 0.0002883859475653662, + "loss": 0.173, + "step": 326660 + }, + { + "epoch": 13.53, + "grad_norm": 0.388671875, + "learning_rate": 0.0002883752309384428, + "loss": 0.2218, + "step": 326670 + }, + { + "epoch": 13.53, + "grad_norm": 0.9921875, + "learning_rate": 0.0002883645142393014, + "loss": 0.2182, + "step": 326680 + }, + { + "epoch": 13.53, + "grad_norm": 1.34375, + "learning_rate": 0.00028835379746796214, + "loss": 0.1534, + "step": 326690 + }, + { + "epoch": 13.53, + "grad_norm": 1.0390625, + "learning_rate": 0.00028834308062444515, + "loss": 0.2338, + "step": 326700 + }, + { + "epoch": 13.53, + "grad_norm": 1.9375, + "learning_rate": 0.0002883323637087707, + "loss": 0.185, + "step": 326710 + }, + { + "epoch": 13.53, + "grad_norm": 1.4765625, + "learning_rate": 0.0002883216467209588, + "loss": 0.1763, + "step": 326720 + }, + { + "epoch": 13.53, + "grad_norm": 0.65234375, + "learning_rate": 0.00028831092966102975, + "loss": 0.1979, + "step": 326730 + }, + { + "epoch": 13.53, + "grad_norm": 0.61328125, + "learning_rate": 0.0002883002125290037, + "loss": 0.1638, + "step": 326740 + }, + { + "epoch": 13.53, + "grad_norm": 0.515625, + "learning_rate": 0.00028828949532490073, + "loss": 0.1487, + "step": 326750 + }, + { + "epoch": 13.53, + "grad_norm": 0.65625, + "learning_rate": 0.00028827877804874104, + "loss": 0.1905, + "step": 326760 + }, + { + "epoch": 13.53, + "grad_norm": 0.4140625, + "learning_rate": 0.00028826806070054485, + "loss": 0.2381, + "step": 326770 + }, + { + "epoch": 13.54, + "grad_norm": 0.734375, + "learning_rate": 0.00028825734328033226, + "loss": 0.1991, + "step": 326780 + }, + { + "epoch": 13.54, + "grad_norm": 0.9140625, + "learning_rate": 0.0002882466257881236, + "loss": 0.2062, + "step": 326790 + }, + { + "epoch": 13.54, + "grad_norm": 0.88671875, + "learning_rate": 0.00028823590822393884, + "loss": 0.1969, + "step": 326800 + }, + { + "epoch": 13.54, + "grad_norm": 1.1015625, + "learning_rate": 0.00028822519058779823, + "loss": 0.2202, + "step": 326810 + }, + { + "epoch": 13.54, + "grad_norm": 0.390625, + "learning_rate": 0.00028821447287972194, + "loss": 0.1559, + "step": 326820 + }, + { + "epoch": 13.54, + "grad_norm": 0.5625, + "learning_rate": 0.00028820375509973014, + "loss": 0.225, + "step": 326830 + }, + { + "epoch": 13.54, + "grad_norm": 1.3359375, + "learning_rate": 0.000288193037247843, + "loss": 0.2114, + "step": 326840 + }, + { + "epoch": 13.54, + "grad_norm": 1.0, + "learning_rate": 0.0002881823193240807, + "loss": 0.1953, + "step": 326850 + }, + { + "epoch": 13.54, + "grad_norm": 0.90234375, + "learning_rate": 0.00028817160132846336, + "loss": 0.1685, + "step": 326860 + }, + { + "epoch": 13.54, + "grad_norm": 0.8984375, + "learning_rate": 0.0002881608832610113, + "loss": 0.1788, + "step": 326870 + }, + { + "epoch": 13.54, + "grad_norm": 0.99609375, + "learning_rate": 0.00028815016512174444, + "loss": 0.2071, + "step": 326880 + }, + { + "epoch": 13.54, + "grad_norm": 1.1328125, + "learning_rate": 0.0002881394469106832, + "loss": 0.2177, + "step": 326890 + }, + { + "epoch": 13.54, + "grad_norm": 1.15625, + "learning_rate": 0.0002881287286278476, + "loss": 0.1639, + "step": 326900 + }, + { + "epoch": 13.54, + "grad_norm": 0.421875, + "learning_rate": 0.00028811801027325774, + "loss": 0.2213, + "step": 326910 + }, + { + "epoch": 13.54, + "grad_norm": 1.390625, + "learning_rate": 0.00028810729184693406, + "loss": 0.1933, + "step": 326920 + }, + { + "epoch": 13.54, + "grad_norm": 1.0, + "learning_rate": 0.0002880965733488965, + "loss": 0.1818, + "step": 326930 + }, + { + "epoch": 13.54, + "grad_norm": 0.734375, + "learning_rate": 0.00028808585477916525, + "loss": 0.1965, + "step": 326940 + }, + { + "epoch": 13.54, + "grad_norm": 1.3125, + "learning_rate": 0.00028807513613776073, + "loss": 0.1684, + "step": 326950 + }, + { + "epoch": 13.54, + "grad_norm": 1.1015625, + "learning_rate": 0.00028806441742470275, + "loss": 0.1879, + "step": 326960 + }, + { + "epoch": 13.54, + "grad_norm": 1.28125, + "learning_rate": 0.00028805369864001165, + "loss": 0.1792, + "step": 326970 + }, + { + "epoch": 13.54, + "grad_norm": 0.52734375, + "learning_rate": 0.00028804297978370764, + "loss": 0.1856, + "step": 326980 + }, + { + "epoch": 13.54, + "grad_norm": 0.62890625, + "learning_rate": 0.0002880322608558108, + "loss": 0.1873, + "step": 326990 + }, + { + "epoch": 13.54, + "grad_norm": 0.55859375, + "learning_rate": 0.00028802154185634144, + "loss": 0.175, + "step": 327000 + }, + { + "epoch": 13.54, + "grad_norm": 1.3828125, + "learning_rate": 0.0002880108227853196, + "loss": 0.2037, + "step": 327010 + }, + { + "epoch": 13.55, + "grad_norm": 1.6953125, + "learning_rate": 0.00028800010364276544, + "loss": 0.1651, + "step": 327020 + }, + { + "epoch": 13.55, + "grad_norm": 0.609375, + "learning_rate": 0.0002879893844286993, + "loss": 0.2173, + "step": 327030 + }, + { + "epoch": 13.55, + "grad_norm": 0.73046875, + "learning_rate": 0.00028797866514314107, + "loss": 0.171, + "step": 327040 + }, + { + "epoch": 13.55, + "grad_norm": 1.2890625, + "learning_rate": 0.0002879679457861113, + "loss": 0.2057, + "step": 327050 + }, + { + "epoch": 13.55, + "grad_norm": 1.015625, + "learning_rate": 0.0002879572263576299, + "loss": 0.1811, + "step": 327060 + }, + { + "epoch": 13.55, + "grad_norm": 1.1328125, + "learning_rate": 0.000287946506857717, + "loss": 0.1866, + "step": 327070 + }, + { + "epoch": 13.55, + "grad_norm": 0.69921875, + "learning_rate": 0.000287935787286393, + "loss": 0.1945, + "step": 327080 + }, + { + "epoch": 13.55, + "grad_norm": 0.84375, + "learning_rate": 0.00028792506764367784, + "loss": 0.1811, + "step": 327090 + }, + { + "epoch": 13.55, + "grad_norm": 0.5546875, + "learning_rate": 0.00028791434792959183, + "loss": 0.205, + "step": 327100 + }, + { + "epoch": 13.55, + "grad_norm": 0.7734375, + "learning_rate": 0.0002879036281441552, + "loss": 0.2042, + "step": 327110 + }, + { + "epoch": 13.55, + "grad_norm": 0.8828125, + "learning_rate": 0.00028789290828738783, + "loss": 0.1988, + "step": 327120 + }, + { + "epoch": 13.55, + "grad_norm": 0.76171875, + "learning_rate": 0.00028788218835931024, + "loss": 0.1651, + "step": 327130 + }, + { + "epoch": 13.55, + "grad_norm": 0.8359375, + "learning_rate": 0.0002878714683599425, + "loss": 0.2224, + "step": 327140 + }, + { + "epoch": 13.55, + "grad_norm": 0.419921875, + "learning_rate": 0.0002878607482893046, + "loss": 0.1895, + "step": 327150 + }, + { + "epoch": 13.55, + "grad_norm": 0.73828125, + "learning_rate": 0.000287850028147417, + "loss": 0.2041, + "step": 327160 + }, + { + "epoch": 13.55, + "grad_norm": 1.25, + "learning_rate": 0.00028783930793429966, + "loss": 0.244, + "step": 327170 + }, + { + "epoch": 13.55, + "grad_norm": 0.76171875, + "learning_rate": 0.0002878285876499729, + "loss": 0.2052, + "step": 327180 + }, + { + "epoch": 13.55, + "grad_norm": 0.95703125, + "learning_rate": 0.0002878178672944568, + "loss": 0.2002, + "step": 327190 + }, + { + "epoch": 13.55, + "grad_norm": 0.49609375, + "learning_rate": 0.0002878071468677714, + "loss": 0.1672, + "step": 327200 + }, + { + "epoch": 13.55, + "grad_norm": 0.75, + "learning_rate": 0.00028779642636993723, + "loss": 0.1774, + "step": 327210 + }, + { + "epoch": 13.55, + "grad_norm": 0.59375, + "learning_rate": 0.0002877857058009743, + "loss": 0.1632, + "step": 327220 + }, + { + "epoch": 13.55, + "grad_norm": 0.6484375, + "learning_rate": 0.00028777498516090254, + "loss": 0.2203, + "step": 327230 + }, + { + "epoch": 13.55, + "grad_norm": 0.890625, + "learning_rate": 0.0002877642644497425, + "loss": 0.1686, + "step": 327240 + }, + { + "epoch": 13.55, + "grad_norm": 0.78125, + "learning_rate": 0.0002877535436675142, + "loss": 0.1685, + "step": 327250 + }, + { + "epoch": 13.56, + "grad_norm": 1.1640625, + "learning_rate": 0.0002877428228142377, + "loss": 0.2106, + "step": 327260 + }, + { + "epoch": 13.56, + "grad_norm": 1.25, + "learning_rate": 0.00028773210188993336, + "loss": 0.2183, + "step": 327270 + }, + { + "epoch": 13.56, + "grad_norm": 1.0859375, + "learning_rate": 0.0002877213808946212, + "loss": 0.2159, + "step": 327280 + }, + { + "epoch": 13.56, + "grad_norm": 0.703125, + "learning_rate": 0.0002877106598283216, + "loss": 0.2038, + "step": 327290 + }, + { + "epoch": 13.56, + "grad_norm": 1.0078125, + "learning_rate": 0.00028769993869105456, + "loss": 0.1571, + "step": 327300 + }, + { + "epoch": 13.56, + "grad_norm": 0.55078125, + "learning_rate": 0.0002876892174828402, + "loss": 0.2003, + "step": 327310 + }, + { + "epoch": 13.56, + "grad_norm": 0.6640625, + "learning_rate": 0.000287678496203699, + "loss": 0.2014, + "step": 327320 + }, + { + "epoch": 13.56, + "grad_norm": 2.515625, + "learning_rate": 0.0002876677748536508, + "loss": 0.1835, + "step": 327330 + }, + { + "epoch": 13.56, + "grad_norm": 0.67578125, + "learning_rate": 0.0002876570534327159, + "loss": 0.2075, + "step": 327340 + }, + { + "epoch": 13.56, + "grad_norm": 0.5078125, + "learning_rate": 0.0002876463319409146, + "loss": 0.1862, + "step": 327350 + }, + { + "epoch": 13.56, + "grad_norm": 1.4375, + "learning_rate": 0.0002876356103782668, + "loss": 0.2112, + "step": 327360 + }, + { + "epoch": 13.56, + "grad_norm": 1.9140625, + "learning_rate": 0.00028762488874479303, + "loss": 0.1848, + "step": 327370 + }, + { + "epoch": 13.56, + "grad_norm": 0.69921875, + "learning_rate": 0.0002876141670405132, + "loss": 0.2104, + "step": 327380 + }, + { + "epoch": 13.56, + "grad_norm": 0.66796875, + "learning_rate": 0.0002876034452654475, + "loss": 0.1997, + "step": 327390 + }, + { + "epoch": 13.56, + "grad_norm": 1.3203125, + "learning_rate": 0.00028759272341961635, + "loss": 0.2122, + "step": 327400 + }, + { + "epoch": 13.56, + "grad_norm": 0.447265625, + "learning_rate": 0.0002875820015030396, + "loss": 0.1857, + "step": 327410 + }, + { + "epoch": 13.56, + "grad_norm": 0.86328125, + "learning_rate": 0.0002875712795157376, + "loss": 0.1964, + "step": 327420 + }, + { + "epoch": 13.56, + "grad_norm": 0.66796875, + "learning_rate": 0.00028756055745773066, + "loss": 0.1657, + "step": 327430 + }, + { + "epoch": 13.56, + "grad_norm": 1.6015625, + "learning_rate": 0.0002875498353290386, + "loss": 0.1836, + "step": 327440 + }, + { + "epoch": 13.56, + "grad_norm": 0.2177734375, + "learning_rate": 0.00028753911312968196, + "loss": 0.1665, + "step": 327450 + }, + { + "epoch": 13.56, + "grad_norm": 0.625, + "learning_rate": 0.00028752839085968067, + "loss": 0.2006, + "step": 327460 + }, + { + "epoch": 13.56, + "grad_norm": 1.2421875, + "learning_rate": 0.00028751766851905505, + "loss": 0.1954, + "step": 327470 + }, + { + "epoch": 13.56, + "grad_norm": 0.54296875, + "learning_rate": 0.00028750694610782526, + "loss": 0.1894, + "step": 327480 + }, + { + "epoch": 13.56, + "grad_norm": 0.54296875, + "learning_rate": 0.00028749622362601136, + "loss": 0.1949, + "step": 327490 + }, + { + "epoch": 13.57, + "grad_norm": 0.64453125, + "learning_rate": 0.0002874855010736336, + "loss": 0.203, + "step": 327500 + }, + { + "epoch": 13.57, + "grad_norm": 1.1015625, + "learning_rate": 0.0002874747784507123, + "loss": 0.1819, + "step": 327510 + }, + { + "epoch": 13.57, + "grad_norm": 1.078125, + "learning_rate": 0.00028746405575726737, + "loss": 0.2202, + "step": 327520 + }, + { + "epoch": 13.57, + "grad_norm": 0.48828125, + "learning_rate": 0.0002874533329933193, + "loss": 0.1982, + "step": 327530 + }, + { + "epoch": 13.57, + "grad_norm": 0.78515625, + "learning_rate": 0.00028744261015888794, + "loss": 0.1741, + "step": 327540 + }, + { + "epoch": 13.57, + "grad_norm": 0.50390625, + "learning_rate": 0.00028743188725399374, + "loss": 0.1494, + "step": 327550 + }, + { + "epoch": 13.57, + "grad_norm": 1.3203125, + "learning_rate": 0.0002874211642786567, + "loss": 0.2109, + "step": 327560 + }, + { + "epoch": 13.57, + "grad_norm": 0.96484375, + "learning_rate": 0.0002874104412328971, + "loss": 0.2066, + "step": 327570 + }, + { + "epoch": 13.57, + "grad_norm": 1.1796875, + "learning_rate": 0.0002873997181167351, + "loss": 0.208, + "step": 327580 + }, + { + "epoch": 13.57, + "grad_norm": 0.9296875, + "learning_rate": 0.0002873889949301908, + "loss": 0.1733, + "step": 327590 + }, + { + "epoch": 13.57, + "grad_norm": 0.458984375, + "learning_rate": 0.0002873782716732846, + "loss": 0.1977, + "step": 327600 + }, + { + "epoch": 13.57, + "grad_norm": 0.890625, + "learning_rate": 0.0002873675483460364, + "loss": 0.2412, + "step": 327610 + }, + { + "epoch": 13.57, + "grad_norm": 1.5, + "learning_rate": 0.00028735682494846653, + "loss": 0.1688, + "step": 327620 + }, + { + "epoch": 13.57, + "grad_norm": 0.439453125, + "learning_rate": 0.0002873461014805952, + "loss": 0.147, + "step": 327630 + }, + { + "epoch": 13.57, + "grad_norm": 0.1640625, + "learning_rate": 0.00028733537794244253, + "loss": 0.1973, + "step": 327640 + }, + { + "epoch": 13.57, + "grad_norm": 0.478515625, + "learning_rate": 0.0002873246543340287, + "loss": 0.2372, + "step": 327650 + }, + { + "epoch": 13.57, + "grad_norm": 0.88671875, + "learning_rate": 0.00028731393065537387, + "loss": 0.1928, + "step": 327660 + }, + { + "epoch": 13.57, + "grad_norm": 0.62109375, + "learning_rate": 0.00028730320690649827, + "loss": 0.1712, + "step": 327670 + }, + { + "epoch": 13.57, + "grad_norm": 0.765625, + "learning_rate": 0.00028729248308742203, + "loss": 0.2045, + "step": 327680 + }, + { + "epoch": 13.57, + "grad_norm": 0.5078125, + "learning_rate": 0.00028728175919816545, + "loss": 0.175, + "step": 327690 + }, + { + "epoch": 13.57, + "grad_norm": 1.1484375, + "learning_rate": 0.0002872710352387485, + "loss": 0.1961, + "step": 327700 + }, + { + "epoch": 13.57, + "grad_norm": 0.8046875, + "learning_rate": 0.0002872603112091916, + "loss": 0.2175, + "step": 327710 + }, + { + "epoch": 13.57, + "grad_norm": 0.765625, + "learning_rate": 0.0002872495871095148, + "loss": 0.2105, + "step": 327720 + }, + { + "epoch": 13.57, + "grad_norm": 0.318359375, + "learning_rate": 0.00028723886293973827, + "loss": 0.1534, + "step": 327730 + }, + { + "epoch": 13.57, + "grad_norm": 0.4609375, + "learning_rate": 0.00028722813869988225, + "loss": 0.1928, + "step": 327740 + }, + { + "epoch": 13.58, + "grad_norm": 1.265625, + "learning_rate": 0.0002872174143899668, + "loss": 0.1853, + "step": 327750 + }, + { + "epoch": 13.58, + "grad_norm": 0.65625, + "learning_rate": 0.00028720669001001226, + "loss": 0.1648, + "step": 327760 + }, + { + "epoch": 13.58, + "grad_norm": 0.478515625, + "learning_rate": 0.0002871959655600388, + "loss": 0.1666, + "step": 327770 + }, + { + "epoch": 13.58, + "grad_norm": 0.515625, + "learning_rate": 0.0002871852410400665, + "loss": 0.1965, + "step": 327780 + }, + { + "epoch": 13.58, + "grad_norm": 0.52734375, + "learning_rate": 0.0002871745164501156, + "loss": 0.128, + "step": 327790 + }, + { + "epoch": 13.58, + "grad_norm": 0.66796875, + "learning_rate": 0.00028716379179020626, + "loss": 0.2282, + "step": 327800 + }, + { + "epoch": 13.58, + "grad_norm": 0.890625, + "learning_rate": 0.0002871530670603587, + "loss": 0.2082, + "step": 327810 + }, + { + "epoch": 13.58, + "grad_norm": 0.8203125, + "learning_rate": 0.0002871423422605931, + "loss": 0.1955, + "step": 327820 + }, + { + "epoch": 13.58, + "grad_norm": 0.384765625, + "learning_rate": 0.0002871316173909296, + "loss": 0.2184, + "step": 327830 + }, + { + "epoch": 13.58, + "grad_norm": 0.83203125, + "learning_rate": 0.0002871208924513883, + "loss": 0.2123, + "step": 327840 + }, + { + "epoch": 13.58, + "grad_norm": 1.734375, + "learning_rate": 0.0002871101674419897, + "loss": 0.2099, + "step": 327850 + }, + { + "epoch": 13.58, + "grad_norm": 0.5234375, + "learning_rate": 0.0002870994423627536, + "loss": 0.2284, + "step": 327860 + }, + { + "epoch": 13.58, + "grad_norm": 0.95703125, + "learning_rate": 0.00028708871721370044, + "loss": 0.2145, + "step": 327870 + }, + { + "epoch": 13.58, + "grad_norm": 0.82421875, + "learning_rate": 0.00028707799199485025, + "loss": 0.2066, + "step": 327880 + }, + { + "epoch": 13.58, + "grad_norm": 0.7421875, + "learning_rate": 0.0002870672667062234, + "loss": 0.162, + "step": 327890 + }, + { + "epoch": 13.58, + "grad_norm": 0.37890625, + "learning_rate": 0.00028705654134783987, + "loss": 0.1612, + "step": 327900 + }, + { + "epoch": 13.58, + "grad_norm": 0.98046875, + "learning_rate": 0.0002870458159197199, + "loss": 0.1269, + "step": 327910 + }, + { + "epoch": 13.58, + "grad_norm": 0.95703125, + "learning_rate": 0.00028703509042188373, + "loss": 0.1402, + "step": 327920 + }, + { + "epoch": 13.58, + "grad_norm": 1.2578125, + "learning_rate": 0.0002870243648543516, + "loss": 0.2234, + "step": 327930 + }, + { + "epoch": 13.58, + "grad_norm": 0.98046875, + "learning_rate": 0.00028701363921714354, + "loss": 0.1991, + "step": 327940 + }, + { + "epoch": 13.58, + "grad_norm": 1.2109375, + "learning_rate": 0.0002870029135102798, + "loss": 0.1673, + "step": 327950 + }, + { + "epoch": 13.58, + "grad_norm": 0.99609375, + "learning_rate": 0.0002869921877337806, + "loss": 0.1563, + "step": 327960 + }, + { + "epoch": 13.58, + "grad_norm": 0.7109375, + "learning_rate": 0.0002869814618876661, + "loss": 0.1874, + "step": 327970 + }, + { + "epoch": 13.58, + "grad_norm": 0.9140625, + "learning_rate": 0.0002869707359719565, + "loss": 0.2022, + "step": 327980 + }, + { + "epoch": 13.59, + "grad_norm": 1.234375, + "learning_rate": 0.0002869600099866719, + "loss": 0.1817, + "step": 327990 + }, + { + "epoch": 13.59, + "grad_norm": 0.85546875, + "learning_rate": 0.00028694928393183263, + "loss": 0.1817, + "step": 328000 + }, + { + "epoch": 13.59, + "grad_norm": 1.2890625, + "learning_rate": 0.0002869385578074587, + "loss": 0.1564, + "step": 328010 + }, + { + "epoch": 13.59, + "grad_norm": 0.474609375, + "learning_rate": 0.0002869278316135705, + "loss": 0.2231, + "step": 328020 + }, + { + "epoch": 13.59, + "grad_norm": 0.55078125, + "learning_rate": 0.0002869171053501881, + "loss": 0.1567, + "step": 328030 + }, + { + "epoch": 13.59, + "grad_norm": 0.6484375, + "learning_rate": 0.00028690637901733165, + "loss": 0.2258, + "step": 328040 + }, + { + "epoch": 13.59, + "grad_norm": 1.0390625, + "learning_rate": 0.0002868956526150214, + "loss": 0.1774, + "step": 328050 + }, + { + "epoch": 13.59, + "grad_norm": 0.9765625, + "learning_rate": 0.00028688492614327747, + "loss": 0.1951, + "step": 328060 + }, + { + "epoch": 13.59, + "grad_norm": 2.265625, + "learning_rate": 0.00028687419960212016, + "loss": 0.2161, + "step": 328070 + }, + { + "epoch": 13.59, + "grad_norm": 0.73046875, + "learning_rate": 0.0002868634729915695, + "loss": 0.1553, + "step": 328080 + }, + { + "epoch": 13.59, + "grad_norm": 0.443359375, + "learning_rate": 0.0002868527463116458, + "loss": 0.1751, + "step": 328090 + }, + { + "epoch": 13.59, + "grad_norm": 0.9921875, + "learning_rate": 0.0002868420195623693, + "loss": 0.1635, + "step": 328100 + }, + { + "epoch": 13.59, + "grad_norm": 1.875, + "learning_rate": 0.00028683129274376004, + "loss": 0.2233, + "step": 328110 + }, + { + "epoch": 13.59, + "grad_norm": 0.828125, + "learning_rate": 0.00028682056585583823, + "loss": 0.1801, + "step": 328120 + }, + { + "epoch": 13.59, + "grad_norm": 1.3125, + "learning_rate": 0.00028680983889862414, + "loss": 0.1752, + "step": 328130 + }, + { + "epoch": 13.59, + "grad_norm": 1.3046875, + "learning_rate": 0.0002867991118721378, + "loss": 0.1789, + "step": 328140 + }, + { + "epoch": 13.59, + "grad_norm": 0.9609375, + "learning_rate": 0.0002867883847763996, + "loss": 0.2224, + "step": 328150 + }, + { + "epoch": 13.59, + "grad_norm": 0.81640625, + "learning_rate": 0.00028677765761142966, + "loss": 0.1954, + "step": 328160 + }, + { + "epoch": 13.59, + "grad_norm": 1.3125, + "learning_rate": 0.00028676693037724805, + "loss": 0.1768, + "step": 328170 + }, + { + "epoch": 13.59, + "grad_norm": 0.859375, + "learning_rate": 0.0002867562030738752, + "loss": 0.2432, + "step": 328180 + }, + { + "epoch": 13.59, + "grad_norm": 0.65625, + "learning_rate": 0.00028674547570133095, + "loss": 0.1942, + "step": 328190 + }, + { + "epoch": 13.59, + "grad_norm": 1.0078125, + "learning_rate": 0.0002867347482596358, + "loss": 0.192, + "step": 328200 + }, + { + "epoch": 13.59, + "grad_norm": 0.326171875, + "learning_rate": 0.0002867240207488098, + "loss": 0.1385, + "step": 328210 + }, + { + "epoch": 13.59, + "grad_norm": 0.80078125, + "learning_rate": 0.00028671329316887315, + "loss": 0.1826, + "step": 328220 + }, + { + "epoch": 13.6, + "grad_norm": 0.91015625, + "learning_rate": 0.0002867025655198461, + "loss": 0.2129, + "step": 328230 + }, + { + "epoch": 13.6, + "grad_norm": 1.0625, + "learning_rate": 0.00028669183780174865, + "loss": 0.1554, + "step": 328240 + }, + { + "epoch": 13.6, + "grad_norm": 1.3125, + "learning_rate": 0.0002866811100146012, + "loss": 0.1662, + "step": 328250 + }, + { + "epoch": 13.6, + "grad_norm": 0.0, + "learning_rate": 0.00028667038215842393, + "loss": 0.2016, + "step": 328260 + }, + { + "epoch": 13.6, + "grad_norm": 0.98046875, + "learning_rate": 0.0002866596542332369, + "loss": 0.1831, + "step": 328270 + }, + { + "epoch": 13.6, + "grad_norm": 0.68359375, + "learning_rate": 0.00028664892623906036, + "loss": 0.2645, + "step": 328280 + }, + { + "epoch": 13.6, + "grad_norm": 0.6328125, + "learning_rate": 0.0002866381981759145, + "loss": 0.1484, + "step": 328290 + }, + { + "epoch": 13.6, + "grad_norm": 0.4609375, + "learning_rate": 0.0002866274700438195, + "loss": 0.1811, + "step": 328300 + }, + { + "epoch": 13.6, + "grad_norm": 1.0703125, + "learning_rate": 0.00028661674184279564, + "loss": 0.1499, + "step": 328310 + }, + { + "epoch": 13.6, + "grad_norm": 1.6171875, + "learning_rate": 0.0002866060135728629, + "loss": 0.217, + "step": 328320 + }, + { + "epoch": 13.6, + "grad_norm": 1.6953125, + "learning_rate": 0.0002865952852340417, + "loss": 0.1809, + "step": 328330 + }, + { + "epoch": 13.6, + "grad_norm": 0.54296875, + "learning_rate": 0.0002865845568263521, + "loss": 0.1993, + "step": 328340 + }, + { + "epoch": 13.6, + "grad_norm": 0.90625, + "learning_rate": 0.00028657382834981423, + "loss": 0.2235, + "step": 328350 + }, + { + "epoch": 13.6, + "grad_norm": 1.5625, + "learning_rate": 0.00028656309980444844, + "loss": 0.2004, + "step": 328360 + }, + { + "epoch": 13.6, + "grad_norm": 0.36328125, + "learning_rate": 0.0002865523711902748, + "loss": 0.2071, + "step": 328370 + }, + { + "epoch": 13.6, + "grad_norm": 0.734375, + "learning_rate": 0.0002865416425073136, + "loss": 0.193, + "step": 328380 + }, + { + "epoch": 13.6, + "grad_norm": 0.54296875, + "learning_rate": 0.00028653091375558493, + "loss": 0.1984, + "step": 328390 + }, + { + "epoch": 13.6, + "grad_norm": 1.0234375, + "learning_rate": 0.00028652018493510907, + "loss": 0.1779, + "step": 328400 + }, + { + "epoch": 13.6, + "grad_norm": 1.5546875, + "learning_rate": 0.0002865094560459061, + "loss": 0.1794, + "step": 328410 + }, + { + "epoch": 13.6, + "grad_norm": 0.61328125, + "learning_rate": 0.00028649872708799636, + "loss": 0.2203, + "step": 328420 + }, + { + "epoch": 13.6, + "grad_norm": 0.62109375, + "learning_rate": 0.00028648799806139984, + "loss": 0.1285, + "step": 328430 + }, + { + "epoch": 13.6, + "grad_norm": 1.4921875, + "learning_rate": 0.00028647726896613695, + "loss": 0.1923, + "step": 328440 + }, + { + "epoch": 13.6, + "grad_norm": 1.421875, + "learning_rate": 0.00028646653980222774, + "loss": 0.1383, + "step": 328450 + }, + { + "epoch": 13.6, + "grad_norm": 0.3046875, + "learning_rate": 0.00028645581056969244, + "loss": 0.1896, + "step": 328460 + }, + { + "epoch": 13.61, + "grad_norm": 1.7734375, + "learning_rate": 0.00028644508126855127, + "loss": 0.1801, + "step": 328470 + }, + { + "epoch": 13.61, + "grad_norm": 0.439453125, + "learning_rate": 0.0002864343518988244, + "loss": 0.1905, + "step": 328480 + }, + { + "epoch": 13.61, + "grad_norm": 1.2421875, + "learning_rate": 0.000286423622460532, + "loss": 0.1482, + "step": 328490 + }, + { + "epoch": 13.61, + "grad_norm": 1.453125, + "learning_rate": 0.00028641289295369427, + "loss": 0.2376, + "step": 328500 + }, + { + "epoch": 13.61, + "grad_norm": 1.421875, + "learning_rate": 0.00028640216337833137, + "loss": 0.2317, + "step": 328510 + }, + { + "epoch": 13.61, + "grad_norm": 0.07958984375, + "learning_rate": 0.0002863914337344636, + "loss": 0.1838, + "step": 328520 + }, + { + "epoch": 13.61, + "grad_norm": 0.384765625, + "learning_rate": 0.00028638070402211104, + "loss": 0.1849, + "step": 328530 + }, + { + "epoch": 13.61, + "grad_norm": 0.55859375, + "learning_rate": 0.0002863699742412939, + "loss": 0.2058, + "step": 328540 + }, + { + "epoch": 13.61, + "grad_norm": 1.1171875, + "learning_rate": 0.00028635924439203246, + "loss": 0.2251, + "step": 328550 + }, + { + "epoch": 13.61, + "grad_norm": 0.41015625, + "learning_rate": 0.00028634851447434677, + "loss": 0.1814, + "step": 328560 + }, + { + "epoch": 13.61, + "grad_norm": 0.625, + "learning_rate": 0.00028633778448825713, + "loss": 0.1886, + "step": 328570 + }, + { + "epoch": 13.61, + "grad_norm": 1.203125, + "learning_rate": 0.00028632705443378377, + "loss": 0.1449, + "step": 328580 + }, + { + "epoch": 13.61, + "grad_norm": 0.796875, + "learning_rate": 0.00028631632431094675, + "loss": 0.228, + "step": 328590 + }, + { + "epoch": 13.61, + "grad_norm": 0.69921875, + "learning_rate": 0.00028630559411976634, + "loss": 0.2235, + "step": 328600 + }, + { + "epoch": 13.61, + "grad_norm": 0.7265625, + "learning_rate": 0.0002862948638602627, + "loss": 0.1582, + "step": 328610 + }, + { + "epoch": 13.61, + "grad_norm": 0.6328125, + "learning_rate": 0.0002862841335324561, + "loss": 0.1668, + "step": 328620 + }, + { + "epoch": 13.61, + "grad_norm": 0.62109375, + "learning_rate": 0.00028627340313636666, + "loss": 0.211, + "step": 328630 + }, + { + "epoch": 13.61, + "grad_norm": 0.412109375, + "learning_rate": 0.00028626267267201457, + "loss": 0.1837, + "step": 328640 + }, + { + "epoch": 13.61, + "grad_norm": 0.7109375, + "learning_rate": 0.0002862519421394201, + "loss": 0.2071, + "step": 328650 + }, + { + "epoch": 13.61, + "grad_norm": 0.490234375, + "learning_rate": 0.0002862412115386033, + "loss": 0.1761, + "step": 328660 + }, + { + "epoch": 13.61, + "grad_norm": 0.46484375, + "learning_rate": 0.0002862304808695845, + "loss": 0.1884, + "step": 328670 + }, + { + "epoch": 13.61, + "grad_norm": 0.2314453125, + "learning_rate": 0.0002862197501323839, + "loss": 0.2172, + "step": 328680 + }, + { + "epoch": 13.61, + "grad_norm": 0.48828125, + "learning_rate": 0.00028620901932702153, + "loss": 0.1778, + "step": 328690 + }, + { + "epoch": 13.61, + "grad_norm": 1.171875, + "learning_rate": 0.00028619828845351773, + "loss": 0.2076, + "step": 328700 + }, + { + "epoch": 13.62, + "grad_norm": 0.478515625, + "learning_rate": 0.00028618755751189275, + "loss": 0.2131, + "step": 328710 + }, + { + "epoch": 13.62, + "grad_norm": 0.59765625, + "learning_rate": 0.00028617682650216664, + "loss": 0.1866, + "step": 328720 + }, + { + "epoch": 13.62, + "grad_norm": 1.7265625, + "learning_rate": 0.00028616609542435963, + "loss": 0.1891, + "step": 328730 + }, + { + "epoch": 13.62, + "grad_norm": 0.83984375, + "learning_rate": 0.000286155364278492, + "loss": 0.182, + "step": 328740 + }, + { + "epoch": 13.62, + "grad_norm": 0.6484375, + "learning_rate": 0.0002861446330645837, + "loss": 0.1959, + "step": 328750 + }, + { + "epoch": 13.62, + "grad_norm": 0.416015625, + "learning_rate": 0.0002861339017826553, + "loss": 0.1985, + "step": 328760 + }, + { + "epoch": 13.62, + "grad_norm": 0.93359375, + "learning_rate": 0.00028612317043272674, + "loss": 0.2007, + "step": 328770 + }, + { + "epoch": 13.62, + "grad_norm": 0.7734375, + "learning_rate": 0.0002861124390148182, + "loss": 0.1758, + "step": 328780 + }, + { + "epoch": 13.62, + "grad_norm": 0.6015625, + "learning_rate": 0.0002861017075289501, + "loss": 0.2436, + "step": 328790 + }, + { + "epoch": 13.62, + "grad_norm": 0.451171875, + "learning_rate": 0.00028609097597514236, + "loss": 0.1645, + "step": 328800 + }, + { + "epoch": 13.62, + "grad_norm": 0.3828125, + "learning_rate": 0.00028608024435341535, + "loss": 0.2245, + "step": 328810 + }, + { + "epoch": 13.62, + "grad_norm": 1.0625, + "learning_rate": 0.0002860695126637892, + "loss": 0.1747, + "step": 328820 + }, + { + "epoch": 13.62, + "grad_norm": 1.4140625, + "learning_rate": 0.0002860587809062841, + "loss": 0.1644, + "step": 328830 + }, + { + "epoch": 13.62, + "grad_norm": 0.79296875, + "learning_rate": 0.00028604804908092033, + "loss": 0.2147, + "step": 328840 + }, + { + "epoch": 13.62, + "grad_norm": 0.30078125, + "learning_rate": 0.000286037317187718, + "loss": 0.1917, + "step": 328850 + }, + { + "epoch": 13.62, + "grad_norm": 0.7421875, + "learning_rate": 0.0002860265852266973, + "loss": 0.1887, + "step": 328860 + }, + { + "epoch": 13.62, + "grad_norm": 0.8515625, + "learning_rate": 0.0002860158531978785, + "loss": 0.1887, + "step": 328870 + }, + { + "epoch": 13.62, + "grad_norm": 4.125, + "learning_rate": 0.00028600512110128166, + "loss": 0.2058, + "step": 328880 + }, + { + "epoch": 13.62, + "grad_norm": 3.453125, + "learning_rate": 0.0002859943889369271, + "loss": 0.2041, + "step": 328890 + }, + { + "epoch": 13.62, + "grad_norm": 0.75, + "learning_rate": 0.00028598365670483506, + "loss": 0.2016, + "step": 328900 + }, + { + "epoch": 13.62, + "grad_norm": 1.0078125, + "learning_rate": 0.0002859729244050256, + "loss": 0.1901, + "step": 328910 + }, + { + "epoch": 13.62, + "grad_norm": 1.0390625, + "learning_rate": 0.0002859621920375191, + "loss": 0.2111, + "step": 328920 + }, + { + "epoch": 13.62, + "grad_norm": 0.58203125, + "learning_rate": 0.0002859514596023355, + "loss": 0.2238, + "step": 328930 + }, + { + "epoch": 13.62, + "grad_norm": 0.484375, + "learning_rate": 0.0002859407270994952, + "loss": 0.1819, + "step": 328940 + }, + { + "epoch": 13.63, + "grad_norm": 1.2890625, + "learning_rate": 0.00028592999452901836, + "loss": 0.2149, + "step": 328950 + }, + { + "epoch": 13.63, + "grad_norm": 0.77734375, + "learning_rate": 0.00028591926189092505, + "loss": 0.1958, + "step": 328960 + }, + { + "epoch": 13.63, + "grad_norm": 0.796875, + "learning_rate": 0.00028590852918523564, + "loss": 0.2145, + "step": 328970 + }, + { + "epoch": 13.63, + "grad_norm": 1.453125, + "learning_rate": 0.00028589779641197024, + "loss": 0.177, + "step": 328980 + }, + { + "epoch": 13.63, + "grad_norm": 0.41796875, + "learning_rate": 0.00028588706357114907, + "loss": 0.1469, + "step": 328990 + }, + { + "epoch": 13.63, + "grad_norm": 0.734375, + "learning_rate": 0.00028587633066279235, + "loss": 0.1916, + "step": 329000 + }, + { + "epoch": 13.63, + "grad_norm": 1.0234375, + "learning_rate": 0.0002858655976869202, + "loss": 0.1739, + "step": 329010 + }, + { + "epoch": 13.63, + "grad_norm": 0.287109375, + "learning_rate": 0.0002858548646435529, + "loss": 0.1639, + "step": 329020 + }, + { + "epoch": 13.63, + "grad_norm": 0.6875, + "learning_rate": 0.0002858441315327106, + "loss": 0.2185, + "step": 329030 + }, + { + "epoch": 13.63, + "grad_norm": 0.63671875, + "learning_rate": 0.00028583339835441346, + "loss": 0.2007, + "step": 329040 + }, + { + "epoch": 13.63, + "grad_norm": 0.267578125, + "learning_rate": 0.0002858226651086818, + "loss": 0.1567, + "step": 329050 + }, + { + "epoch": 13.63, + "grad_norm": 0.921875, + "learning_rate": 0.00028581193179553577, + "loss": 0.2107, + "step": 329060 + }, + { + "epoch": 13.63, + "grad_norm": 0.625, + "learning_rate": 0.0002858011984149955, + "loss": 0.1913, + "step": 329070 + }, + { + "epoch": 13.63, + "grad_norm": 0.52734375, + "learning_rate": 0.00028579046496708134, + "loss": 0.1745, + "step": 329080 + }, + { + "epoch": 13.63, + "grad_norm": 1.0078125, + "learning_rate": 0.0002857797314518133, + "loss": 0.2337, + "step": 329090 + }, + { + "epoch": 13.63, + "grad_norm": 0.58203125, + "learning_rate": 0.0002857689978692117, + "loss": 0.2061, + "step": 329100 + }, + { + "epoch": 13.63, + "grad_norm": 0.60546875, + "learning_rate": 0.00028575826421929667, + "loss": 0.1797, + "step": 329110 + }, + { + "epoch": 13.63, + "grad_norm": 0.76953125, + "learning_rate": 0.0002857475305020885, + "loss": 0.1937, + "step": 329120 + }, + { + "epoch": 13.63, + "grad_norm": 0.8515625, + "learning_rate": 0.00028573679671760734, + "loss": 0.1825, + "step": 329130 + }, + { + "epoch": 13.63, + "grad_norm": 0.6796875, + "learning_rate": 0.00028572606286587335, + "loss": 0.194, + "step": 329140 + }, + { + "epoch": 13.63, + "grad_norm": 1.0234375, + "learning_rate": 0.00028571532894690674, + "loss": 0.2111, + "step": 329150 + }, + { + "epoch": 13.63, + "grad_norm": 1.171875, + "learning_rate": 0.00028570459496072783, + "loss": 0.2346, + "step": 329160 + }, + { + "epoch": 13.63, + "grad_norm": 0.69921875, + "learning_rate": 0.0002856938609073566, + "loss": 0.1626, + "step": 329170 + }, + { + "epoch": 13.63, + "grad_norm": 0.76953125, + "learning_rate": 0.0002856831267868135, + "loss": 0.1642, + "step": 329180 + }, + { + "epoch": 13.64, + "grad_norm": 0.625, + "learning_rate": 0.0002856723925991185, + "loss": 0.1894, + "step": 329190 + }, + { + "epoch": 13.64, + "grad_norm": 0.54296875, + "learning_rate": 0.000285661658344292, + "loss": 0.191, + "step": 329200 + }, + { + "epoch": 13.64, + "grad_norm": 0.703125, + "learning_rate": 0.0002856509240223541, + "loss": 0.2458, + "step": 329210 + }, + { + "epoch": 13.64, + "grad_norm": 0.65234375, + "learning_rate": 0.000285640189633325, + "loss": 0.1733, + "step": 329220 + }, + { + "epoch": 13.64, + "grad_norm": 0.59765625, + "learning_rate": 0.0002856294551772249, + "loss": 0.1763, + "step": 329230 + }, + { + "epoch": 13.64, + "grad_norm": 0.6484375, + "learning_rate": 0.00028561872065407403, + "loss": 0.2301, + "step": 329240 + }, + { + "epoch": 13.64, + "grad_norm": 1.640625, + "learning_rate": 0.00028560798606389254, + "loss": 0.2252, + "step": 329250 + }, + { + "epoch": 13.64, + "grad_norm": 0.55859375, + "learning_rate": 0.0002855972514067007, + "loss": 0.1878, + "step": 329260 + }, + { + "epoch": 13.64, + "grad_norm": 0.5625, + "learning_rate": 0.0002855865166825186, + "loss": 0.1224, + "step": 329270 + }, + { + "epoch": 13.64, + "grad_norm": 1.328125, + "learning_rate": 0.0002855757818913666, + "loss": 0.2342, + "step": 329280 + }, + { + "epoch": 13.64, + "grad_norm": 1.5078125, + "learning_rate": 0.00028556504703326486, + "loss": 0.2494, + "step": 329290 + }, + { + "epoch": 13.64, + "grad_norm": 0.8046875, + "learning_rate": 0.00028555431210823345, + "loss": 0.1694, + "step": 329300 + }, + { + "epoch": 13.64, + "grad_norm": 0.765625, + "learning_rate": 0.0002855435771162927, + "loss": 0.1572, + "step": 329310 + }, + { + "epoch": 13.64, + "grad_norm": 1.046875, + "learning_rate": 0.0002855328420574628, + "loss": 0.2063, + "step": 329320 + }, + { + "epoch": 13.64, + "grad_norm": 1.1953125, + "learning_rate": 0.00028552210693176387, + "loss": 0.1988, + "step": 329330 + }, + { + "epoch": 13.64, + "grad_norm": 0.5, + "learning_rate": 0.0002855113717392162, + "loss": 0.1588, + "step": 329340 + }, + { + "epoch": 13.64, + "grad_norm": 1.1640625, + "learning_rate": 0.00028550063647983997, + "loss": 0.1997, + "step": 329350 + }, + { + "epoch": 13.64, + "grad_norm": 1.484375, + "learning_rate": 0.0002854899011536554, + "loss": 0.238, + "step": 329360 + }, + { + "epoch": 13.64, + "grad_norm": 1.125, + "learning_rate": 0.0002854791657606827, + "loss": 0.1556, + "step": 329370 + }, + { + "epoch": 13.64, + "grad_norm": 0.90234375, + "learning_rate": 0.00028546843030094194, + "loss": 0.2083, + "step": 329380 + }, + { + "epoch": 13.64, + "grad_norm": 1.3359375, + "learning_rate": 0.00028545769477445344, + "loss": 0.1665, + "step": 329390 + }, + { + "epoch": 13.64, + "grad_norm": 0.27734375, + "learning_rate": 0.00028544695918123744, + "loss": 0.2103, + "step": 329400 + }, + { + "epoch": 13.64, + "grad_norm": 0.296875, + "learning_rate": 0.00028543622352131406, + "loss": 0.191, + "step": 329410 + }, + { + "epoch": 13.64, + "grad_norm": 1.171875, + "learning_rate": 0.0002854254877947035, + "loss": 0.2141, + "step": 329420 + }, + { + "epoch": 13.64, + "grad_norm": 1.46875, + "learning_rate": 0.00028541475200142603, + "loss": 0.2316, + "step": 329430 + }, + { + "epoch": 13.65, + "grad_norm": 0.50390625, + "learning_rate": 0.0002854040161415019, + "loss": 0.1857, + "step": 329440 + }, + { + "epoch": 13.65, + "grad_norm": 0.291015625, + "learning_rate": 0.00028539328021495107, + "loss": 0.2035, + "step": 329450 + }, + { + "epoch": 13.65, + "grad_norm": 0.3984375, + "learning_rate": 0.000285382544221794, + "loss": 0.204, + "step": 329460 + }, + { + "epoch": 13.65, + "grad_norm": 0.51953125, + "learning_rate": 0.0002853718081620509, + "loss": 0.184, + "step": 329470 + }, + { + "epoch": 13.65, + "grad_norm": 0.8125, + "learning_rate": 0.0002853610720357417, + "loss": 0.2131, + "step": 329480 + }, + { + "epoch": 13.65, + "grad_norm": 0.5703125, + "learning_rate": 0.00028535033584288694, + "loss": 0.1716, + "step": 329490 + }, + { + "epoch": 13.65, + "grad_norm": 0.244140625, + "learning_rate": 0.00028533959958350656, + "loss": 0.2005, + "step": 329500 + }, + { + "epoch": 13.65, + "grad_norm": 1.296875, + "learning_rate": 0.0002853288632576209, + "loss": 0.1945, + "step": 329510 + }, + { + "epoch": 13.65, + "grad_norm": 0.80078125, + "learning_rate": 0.0002853181268652501, + "loss": 0.1672, + "step": 329520 + }, + { + "epoch": 13.65, + "grad_norm": 1.6328125, + "learning_rate": 0.0002853073904064144, + "loss": 0.1985, + "step": 329530 + }, + { + "epoch": 13.65, + "grad_norm": 0.6328125, + "learning_rate": 0.0002852966538811341, + "loss": 0.2028, + "step": 329540 + }, + { + "epoch": 13.65, + "grad_norm": 1.0, + "learning_rate": 0.00028528591728942923, + "loss": 0.1855, + "step": 329550 + }, + { + "epoch": 13.65, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00028527518063132005, + "loss": 0.1863, + "step": 329560 + }, + { + "epoch": 13.65, + "grad_norm": 2.078125, + "learning_rate": 0.00028526444390682696, + "loss": 0.1681, + "step": 329570 + }, + { + "epoch": 13.65, + "grad_norm": 1.5546875, + "learning_rate": 0.0002852537071159698, + "loss": 0.2391, + "step": 329580 + }, + { + "epoch": 13.65, + "grad_norm": 0.51171875, + "learning_rate": 0.000285242970258769, + "loss": 0.1697, + "step": 329590 + }, + { + "epoch": 13.65, + "grad_norm": 1.3828125, + "learning_rate": 0.0002852322333352448, + "loss": 0.1928, + "step": 329600 + }, + { + "epoch": 13.65, + "grad_norm": 1.453125, + "learning_rate": 0.0002852214963454173, + "loss": 0.1994, + "step": 329610 + }, + { + "epoch": 13.65, + "grad_norm": 0.275390625, + "learning_rate": 0.00028521075928930676, + "loss": 0.2143, + "step": 329620 + }, + { + "epoch": 13.65, + "grad_norm": 0.79296875, + "learning_rate": 0.0002852000221669334, + "loss": 0.1903, + "step": 329630 + }, + { + "epoch": 13.65, + "grad_norm": 1.0234375, + "learning_rate": 0.0002851892849783173, + "loss": 0.2363, + "step": 329640 + }, + { + "epoch": 13.65, + "grad_norm": 0.75390625, + "learning_rate": 0.0002851785477234789, + "loss": 0.1819, + "step": 329650 + }, + { + "epoch": 13.65, + "grad_norm": 0.953125, + "learning_rate": 0.00028516781040243815, + "loss": 0.1658, + "step": 329660 + }, + { + "epoch": 13.65, + "grad_norm": 0.7421875, + "learning_rate": 0.0002851570730152155, + "loss": 0.2139, + "step": 329670 + }, + { + "epoch": 13.66, + "grad_norm": 0.5390625, + "learning_rate": 0.00028514633556183094, + "loss": 0.1818, + "step": 329680 + }, + { + "epoch": 13.66, + "grad_norm": 0.47265625, + "learning_rate": 0.0002851355980423048, + "loss": 0.1779, + "step": 329690 + }, + { + "epoch": 13.66, + "grad_norm": 0.84765625, + "learning_rate": 0.0002851248604566573, + "loss": 0.1733, + "step": 329700 + }, + { + "epoch": 13.66, + "grad_norm": 0.92578125, + "learning_rate": 0.00028511412280490854, + "loss": 0.1554, + "step": 329710 + }, + { + "epoch": 13.66, + "grad_norm": 1.03125, + "learning_rate": 0.00028510338508707885, + "loss": 0.214, + "step": 329720 + }, + { + "epoch": 13.66, + "grad_norm": 0.95703125, + "learning_rate": 0.0002850926473031884, + "loss": 0.1771, + "step": 329730 + }, + { + "epoch": 13.66, + "grad_norm": 0.73046875, + "learning_rate": 0.00028508190945325724, + "loss": 0.2097, + "step": 329740 + }, + { + "epoch": 13.66, + "grad_norm": 0.87890625, + "learning_rate": 0.00028507117153730587, + "loss": 0.1615, + "step": 329750 + }, + { + "epoch": 13.66, + "grad_norm": 0.91015625, + "learning_rate": 0.0002850604335553543, + "loss": 0.2057, + "step": 329760 + }, + { + "epoch": 13.66, + "grad_norm": 0.51171875, + "learning_rate": 0.0002850496955074227, + "loss": 0.1904, + "step": 329770 + }, + { + "epoch": 13.66, + "grad_norm": 0.3984375, + "learning_rate": 0.00028503895739353147, + "loss": 0.2018, + "step": 329780 + }, + { + "epoch": 13.66, + "grad_norm": 1.625, + "learning_rate": 0.00028502821921370063, + "loss": 0.1732, + "step": 329790 + }, + { + "epoch": 13.66, + "grad_norm": 1.046875, + "learning_rate": 0.00028501748096795046, + "loss": 0.2683, + "step": 329800 + }, + { + "epoch": 13.66, + "grad_norm": 0.890625, + "learning_rate": 0.00028500674265630123, + "loss": 0.1993, + "step": 329810 + }, + { + "epoch": 13.66, + "grad_norm": 0.8671875, + "learning_rate": 0.00028499600427877306, + "loss": 0.2277, + "step": 329820 + }, + { + "epoch": 13.66, + "grad_norm": 0.7578125, + "learning_rate": 0.0002849852658353862, + "loss": 0.2039, + "step": 329830 + }, + { + "epoch": 13.66, + "grad_norm": 1.0078125, + "learning_rate": 0.0002849745273261608, + "loss": 0.2395, + "step": 329840 + }, + { + "epoch": 13.66, + "grad_norm": 1.1015625, + "learning_rate": 0.00028496378875111716, + "loss": 0.2261, + "step": 329850 + }, + { + "epoch": 13.66, + "grad_norm": 1.1640625, + "learning_rate": 0.0002849530501102754, + "loss": 0.2166, + "step": 329860 + }, + { + "epoch": 13.66, + "grad_norm": 1.84375, + "learning_rate": 0.0002849423114036558, + "loss": 0.2087, + "step": 329870 + }, + { + "epoch": 13.66, + "grad_norm": 0.9453125, + "learning_rate": 0.0002849315726312786, + "loss": 0.2053, + "step": 329880 + }, + { + "epoch": 13.66, + "grad_norm": 0.56640625, + "learning_rate": 0.000284920833793164, + "loss": 0.1483, + "step": 329890 + }, + { + "epoch": 13.66, + "grad_norm": 0.9375, + "learning_rate": 0.00028491009488933194, + "loss": 0.1665, + "step": 329900 + }, + { + "epoch": 13.66, + "grad_norm": 0.6953125, + "learning_rate": 0.00028489935591980304, + "loss": 0.1827, + "step": 329910 + }, + { + "epoch": 13.67, + "grad_norm": 0.6171875, + "learning_rate": 0.0002848886168845973, + "loss": 0.1832, + "step": 329920 + }, + { + "epoch": 13.67, + "grad_norm": 0.2451171875, + "learning_rate": 0.00028487787778373486, + "loss": 0.1897, + "step": 329930 + }, + { + "epoch": 13.67, + "grad_norm": 0.578125, + "learning_rate": 0.00028486713861723613, + "loss": 0.1465, + "step": 329940 + }, + { + "epoch": 13.67, + "grad_norm": 0.470703125, + "learning_rate": 0.0002848563993851211, + "loss": 0.2001, + "step": 329950 + }, + { + "epoch": 13.67, + "grad_norm": 1.171875, + "learning_rate": 0.0002848456600874101, + "loss": 0.2014, + "step": 329960 + }, + { + "epoch": 13.67, + "grad_norm": 0.609375, + "learning_rate": 0.0002848349207241235, + "loss": 0.1278, + "step": 329970 + }, + { + "epoch": 13.67, + "grad_norm": 0.53125, + "learning_rate": 0.0002848241812952811, + "loss": 0.1649, + "step": 329980 + }, + { + "epoch": 13.67, + "grad_norm": 0.3671875, + "learning_rate": 0.0002848134418009035, + "loss": 0.1744, + "step": 329990 + }, + { + "epoch": 13.67, + "grad_norm": 1.0390625, + "learning_rate": 0.0002848027022410108, + "loss": 0.2156, + "step": 330000 + }, + { + "epoch": 13.67, + "grad_norm": 0.53515625, + "learning_rate": 0.00028479196261562307, + "loss": 0.1932, + "step": 330010 + }, + { + "epoch": 13.67, + "grad_norm": 1.1796875, + "learning_rate": 0.00028478122292476064, + "loss": 0.1796, + "step": 330020 + }, + { + "epoch": 13.67, + "grad_norm": 0.60546875, + "learning_rate": 0.0002847704831684437, + "loss": 0.1978, + "step": 330030 + }, + { + "epoch": 13.67, + "grad_norm": 0.8125, + "learning_rate": 0.0002847597433466925, + "loss": 0.1801, + "step": 330040 + }, + { + "epoch": 13.67, + "grad_norm": 0.5546875, + "learning_rate": 0.00028474900345952725, + "loss": 0.1889, + "step": 330050 + }, + { + "epoch": 13.67, + "grad_norm": 0.404296875, + "learning_rate": 0.000284738263506968, + "loss": 0.1896, + "step": 330060 + }, + { + "epoch": 13.67, + "grad_norm": 1.046875, + "learning_rate": 0.00028472752348903524, + "loss": 0.1967, + "step": 330070 + }, + { + "epoch": 13.67, + "grad_norm": 0.65234375, + "learning_rate": 0.000284716783405749, + "loss": 0.1678, + "step": 330080 + }, + { + "epoch": 13.67, + "grad_norm": 0.984375, + "learning_rate": 0.00028470604325712945, + "loss": 0.164, + "step": 330090 + }, + { + "epoch": 13.67, + "grad_norm": 1.3828125, + "learning_rate": 0.0002846953030431969, + "loss": 0.1901, + "step": 330100 + }, + { + "epoch": 13.67, + "grad_norm": 0.396484375, + "learning_rate": 0.00028468456276397155, + "loss": 0.163, + "step": 330110 + }, + { + "epoch": 13.67, + "grad_norm": 0.71875, + "learning_rate": 0.0002846738224194736, + "loss": 0.2295, + "step": 330120 + }, + { + "epoch": 13.67, + "grad_norm": 1.5625, + "learning_rate": 0.0002846630820097232, + "loss": 0.1647, + "step": 330130 + }, + { + "epoch": 13.67, + "grad_norm": 0.6875, + "learning_rate": 0.00028465234153474074, + "loss": 0.1852, + "step": 330140 + }, + { + "epoch": 13.67, + "grad_norm": 0.796875, + "learning_rate": 0.00028464160099454626, + "loss": 0.2189, + "step": 330150 + }, + { + "epoch": 13.68, + "grad_norm": 0.474609375, + "learning_rate": 0.00028463086038916005, + "loss": 0.2, + "step": 330160 + }, + { + "epoch": 13.68, + "grad_norm": 1.15625, + "learning_rate": 0.00028462011971860224, + "loss": 0.1991, + "step": 330170 + }, + { + "epoch": 13.68, + "grad_norm": 1.046875, + "learning_rate": 0.0002846093789828932, + "loss": 0.1675, + "step": 330180 + }, + { + "epoch": 13.68, + "grad_norm": 0.2431640625, + "learning_rate": 0.0002845986381820529, + "loss": 0.2105, + "step": 330190 + }, + { + "epoch": 13.68, + "grad_norm": 0.5234375, + "learning_rate": 0.0002845878973161018, + "loss": 0.1878, + "step": 330200 + }, + { + "epoch": 13.68, + "grad_norm": 0.58203125, + "learning_rate": 0.00028457715638506, + "loss": 0.2255, + "step": 330210 + }, + { + "epoch": 13.68, + "grad_norm": 0.474609375, + "learning_rate": 0.00028456641538894767, + "loss": 0.1641, + "step": 330220 + }, + { + "epoch": 13.68, + "grad_norm": 0.369140625, + "learning_rate": 0.00028455567432778515, + "loss": 0.2058, + "step": 330230 + }, + { + "epoch": 13.68, + "grad_norm": 0.76953125, + "learning_rate": 0.00028454493320159257, + "loss": 0.2005, + "step": 330240 + }, + { + "epoch": 13.68, + "grad_norm": 0.9609375, + "learning_rate": 0.00028453419201039014, + "loss": 0.2073, + "step": 330250 + }, + { + "epoch": 13.68, + "grad_norm": 0.74609375, + "learning_rate": 0.00028452345075419814, + "loss": 0.1771, + "step": 330260 + }, + { + "epoch": 13.68, + "grad_norm": 0.69921875, + "learning_rate": 0.0002845127094330366, + "loss": 0.1849, + "step": 330270 + }, + { + "epoch": 13.68, + "grad_norm": 0.26953125, + "learning_rate": 0.00028450196804692603, + "loss": 0.2216, + "step": 330280 + }, + { + "epoch": 13.68, + "grad_norm": 0.490234375, + "learning_rate": 0.00028449122659588637, + "loss": 0.1444, + "step": 330290 + }, + { + "epoch": 13.68, + "grad_norm": 0.890625, + "learning_rate": 0.000284480485079938, + "loss": 0.1729, + "step": 330300 + }, + { + "epoch": 13.68, + "grad_norm": 0.671875, + "learning_rate": 0.0002844697434991011, + "loss": 0.199, + "step": 330310 + }, + { + "epoch": 13.68, + "grad_norm": 0.74609375, + "learning_rate": 0.00028445900185339584, + "loss": 0.1591, + "step": 330320 + }, + { + "epoch": 13.68, + "grad_norm": 0.50390625, + "learning_rate": 0.0002844482601428424, + "loss": 0.1735, + "step": 330330 + }, + { + "epoch": 13.68, + "grad_norm": 0.431640625, + "learning_rate": 0.00028443751836746117, + "loss": 0.222, + "step": 330340 + }, + { + "epoch": 13.68, + "grad_norm": 0.734375, + "learning_rate": 0.0002844267765272721, + "loss": 0.2005, + "step": 330350 + }, + { + "epoch": 13.68, + "grad_norm": 0.66015625, + "learning_rate": 0.0002844160346222957, + "loss": 0.1636, + "step": 330360 + }, + { + "epoch": 13.68, + "grad_norm": 0.69140625, + "learning_rate": 0.000284405292652552, + "loss": 0.1997, + "step": 330370 + }, + { + "epoch": 13.68, + "grad_norm": 1.359375, + "learning_rate": 0.00028439455061806117, + "loss": 0.2423, + "step": 330380 + }, + { + "epoch": 13.68, + "grad_norm": 0.66796875, + "learning_rate": 0.00028438380851884365, + "loss": 0.2143, + "step": 330390 + }, + { + "epoch": 13.69, + "grad_norm": 0.78515625, + "learning_rate": 0.0002843730663549194, + "loss": 0.1616, + "step": 330400 + }, + { + "epoch": 13.69, + "grad_norm": 1.6328125, + "learning_rate": 0.0002843623241263088, + "loss": 0.176, + "step": 330410 + }, + { + "epoch": 13.69, + "grad_norm": 0.921875, + "learning_rate": 0.000284351581833032, + "loss": 0.1669, + "step": 330420 + }, + { + "epoch": 13.69, + "grad_norm": 1.078125, + "learning_rate": 0.0002843408394751092, + "loss": 0.1572, + "step": 330430 + }, + { + "epoch": 13.69, + "grad_norm": 0.92578125, + "learning_rate": 0.00028433009705256077, + "loss": 0.1968, + "step": 330440 + }, + { + "epoch": 13.69, + "grad_norm": 0.42578125, + "learning_rate": 0.0002843193545654067, + "loss": 0.1658, + "step": 330450 + }, + { + "epoch": 13.69, + "grad_norm": 0.59375, + "learning_rate": 0.00028430861201366733, + "loss": 0.1553, + "step": 330460 + }, + { + "epoch": 13.69, + "grad_norm": 1.28125, + "learning_rate": 0.0002842978693973629, + "loss": 0.1719, + "step": 330470 + }, + { + "epoch": 13.69, + "grad_norm": 0.625, + "learning_rate": 0.0002842871267165135, + "loss": 0.1916, + "step": 330480 + }, + { + "epoch": 13.69, + "grad_norm": 0.69921875, + "learning_rate": 0.00028427638397113947, + "loss": 0.1803, + "step": 330490 + }, + { + "epoch": 13.69, + "grad_norm": 0.859375, + "learning_rate": 0.00028426564116126104, + "loss": 0.2085, + "step": 330500 + }, + { + "epoch": 13.69, + "grad_norm": 0.0, + "learning_rate": 0.00028425489828689827, + "loss": 0.113, + "step": 330510 + }, + { + "epoch": 13.69, + "grad_norm": 0.5703125, + "learning_rate": 0.00028424415534807155, + "loss": 0.1706, + "step": 330520 + }, + { + "epoch": 13.69, + "grad_norm": 2.203125, + "learning_rate": 0.00028423341234480105, + "loss": 0.1584, + "step": 330530 + }, + { + "epoch": 13.69, + "grad_norm": 0.39453125, + "learning_rate": 0.00028422266927710687, + "loss": 0.1912, + "step": 330540 + }, + { + "epoch": 13.69, + "grad_norm": 0.78515625, + "learning_rate": 0.00028421192614500946, + "loss": 0.1671, + "step": 330550 + }, + { + "epoch": 13.69, + "grad_norm": 1.0, + "learning_rate": 0.0002842011829485288, + "loss": 0.1943, + "step": 330560 + }, + { + "epoch": 13.69, + "grad_norm": 0.66796875, + "learning_rate": 0.0002841904396876852, + "loss": 0.1607, + "step": 330570 + }, + { + "epoch": 13.69, + "grad_norm": 0.765625, + "learning_rate": 0.00028417969636249893, + "loss": 0.1688, + "step": 330580 + }, + { + "epoch": 13.69, + "grad_norm": 0.7890625, + "learning_rate": 0.00028416895297299013, + "loss": 0.148, + "step": 330590 + }, + { + "epoch": 13.69, + "grad_norm": 0.0, + "learning_rate": 0.0002841582095191791, + "loss": 0.1996, + "step": 330600 + }, + { + "epoch": 13.69, + "grad_norm": 0.71484375, + "learning_rate": 0.000284147466001086, + "loss": 0.2072, + "step": 330610 + }, + { + "epoch": 13.69, + "grad_norm": 0.72265625, + "learning_rate": 0.000284136722418731, + "loss": 0.1954, + "step": 330620 + }, + { + "epoch": 13.69, + "grad_norm": 0.359375, + "learning_rate": 0.00028412597877213434, + "loss": 0.182, + "step": 330630 + }, + { + "epoch": 13.7, + "grad_norm": 0.875, + "learning_rate": 0.00028411523506131637, + "loss": 0.1912, + "step": 330640 + }, + { + "epoch": 13.7, + "grad_norm": 0.62109375, + "learning_rate": 0.0002841044912862972, + "loss": 0.2243, + "step": 330650 + }, + { + "epoch": 13.7, + "grad_norm": 2.28125, + "learning_rate": 0.000284093747447097, + "loss": 0.1918, + "step": 330660 + }, + { + "epoch": 13.7, + "grad_norm": 0.71484375, + "learning_rate": 0.00028408300354373613, + "loss": 0.1957, + "step": 330670 + }, + { + "epoch": 13.7, + "grad_norm": 0.82421875, + "learning_rate": 0.0002840722595762347, + "loss": 0.1688, + "step": 330680 + }, + { + "epoch": 13.7, + "grad_norm": 0.55859375, + "learning_rate": 0.00028406151554461297, + "loss": 0.2154, + "step": 330690 + }, + { + "epoch": 13.7, + "grad_norm": 0.46484375, + "learning_rate": 0.00028405077144889114, + "loss": 0.1603, + "step": 330700 + }, + { + "epoch": 13.7, + "grad_norm": 0.48828125, + "learning_rate": 0.00028404002728908933, + "loss": 0.2395, + "step": 330710 + }, + { + "epoch": 13.7, + "grad_norm": 1.234375, + "learning_rate": 0.000284029283065228, + "loss": 0.2626, + "step": 330720 + }, + { + "epoch": 13.7, + "grad_norm": 0.72265625, + "learning_rate": 0.0002840185387773272, + "loss": 0.185, + "step": 330730 + }, + { + "epoch": 13.7, + "grad_norm": 1.171875, + "learning_rate": 0.00028400779442540715, + "loss": 0.1995, + "step": 330740 + }, + { + "epoch": 13.7, + "grad_norm": 1.1171875, + "learning_rate": 0.0002839970500094881, + "loss": 0.2235, + "step": 330750 + }, + { + "epoch": 13.7, + "grad_norm": 0.8359375, + "learning_rate": 0.00028398630552959036, + "loss": 0.1999, + "step": 330760 + }, + { + "epoch": 13.7, + "grad_norm": 1.109375, + "learning_rate": 0.00028397556098573394, + "loss": 0.1993, + "step": 330770 + }, + { + "epoch": 13.7, + "grad_norm": 1.1640625, + "learning_rate": 0.00028396481637793926, + "loss": 0.174, + "step": 330780 + }, + { + "epoch": 13.7, + "grad_norm": 1.0234375, + "learning_rate": 0.00028395407170622646, + "loss": 0.2033, + "step": 330790 + }, + { + "epoch": 13.7, + "grad_norm": 1.515625, + "learning_rate": 0.00028394332697061577, + "loss": 0.2817, + "step": 330800 + }, + { + "epoch": 13.7, + "grad_norm": 0.484375, + "learning_rate": 0.00028393258217112736, + "loss": 0.1633, + "step": 330810 + }, + { + "epoch": 13.7, + "grad_norm": 0.609375, + "learning_rate": 0.00028392183730778156, + "loss": 0.2008, + "step": 330820 + }, + { + "epoch": 13.7, + "grad_norm": 0.8984375, + "learning_rate": 0.0002839110923805984, + "loss": 0.1948, + "step": 330830 + }, + { + "epoch": 13.7, + "grad_norm": 1.0234375, + "learning_rate": 0.00028390034738959833, + "loss": 0.1861, + "step": 330840 + }, + { + "epoch": 13.7, + "grad_norm": 0.83984375, + "learning_rate": 0.0002838896023348015, + "loss": 0.1564, + "step": 330850 + }, + { + "epoch": 13.7, + "grad_norm": 0.95703125, + "learning_rate": 0.000283878857216228, + "loss": 0.1562, + "step": 330860 + }, + { + "epoch": 13.7, + "grad_norm": 0.7734375, + "learning_rate": 0.0002838681120338982, + "loss": 0.1748, + "step": 330870 + }, + { + "epoch": 13.71, + "grad_norm": 1.546875, + "learning_rate": 0.00028385736678783225, + "loss": 0.1667, + "step": 330880 + }, + { + "epoch": 13.71, + "grad_norm": 0.2451171875, + "learning_rate": 0.0002838466214780505, + "loss": 0.1666, + "step": 330890 + }, + { + "epoch": 13.71, + "grad_norm": 1.9296875, + "learning_rate": 0.00028383587610457297, + "loss": 0.2086, + "step": 330900 + }, + { + "epoch": 13.71, + "grad_norm": 1.0234375, + "learning_rate": 0.00028382513066741995, + "loss": 0.2186, + "step": 330910 + }, + { + "epoch": 13.71, + "grad_norm": 0.4140625, + "learning_rate": 0.0002838143851666117, + "loss": 0.2233, + "step": 330920 + }, + { + "epoch": 13.71, + "grad_norm": 1.046875, + "learning_rate": 0.0002838036396021685, + "loss": 0.21, + "step": 330930 + }, + { + "epoch": 13.71, + "grad_norm": 2.515625, + "learning_rate": 0.0002837928939741104, + "loss": 0.1768, + "step": 330940 + }, + { + "epoch": 13.71, + "grad_norm": 1.2109375, + "learning_rate": 0.00028378214828245777, + "loss": 0.1613, + "step": 330950 + }, + { + "epoch": 13.71, + "grad_norm": 0.71484375, + "learning_rate": 0.0002837714025272307, + "loss": 0.1786, + "step": 330960 + }, + { + "epoch": 13.71, + "grad_norm": 0.68359375, + "learning_rate": 0.0002837606567084496, + "loss": 0.2454, + "step": 330970 + }, + { + "epoch": 13.71, + "grad_norm": 0.5703125, + "learning_rate": 0.0002837499108261346, + "loss": 0.187, + "step": 330980 + }, + { + "epoch": 13.71, + "grad_norm": 0.291015625, + "learning_rate": 0.0002837391648803059, + "loss": 0.2351, + "step": 330990 + }, + { + "epoch": 13.71, + "grad_norm": 0.91796875, + "learning_rate": 0.00028372841887098365, + "loss": 0.1759, + "step": 331000 + }, + { + "epoch": 13.71, + "grad_norm": 0.40625, + "learning_rate": 0.00028371767279818826, + "loss": 0.2079, + "step": 331010 + }, + { + "epoch": 13.71, + "grad_norm": 0.671875, + "learning_rate": 0.0002837069266619399, + "loss": 0.2322, + "step": 331020 + }, + { + "epoch": 13.71, + "grad_norm": 1.078125, + "learning_rate": 0.0002836961804622586, + "loss": 0.174, + "step": 331030 + }, + { + "epoch": 13.71, + "grad_norm": 0.82421875, + "learning_rate": 0.0002836854341991648, + "loss": 0.1484, + "step": 331040 + }, + { + "epoch": 13.71, + "grad_norm": 0.609375, + "learning_rate": 0.00028367468787267856, + "loss": 0.1418, + "step": 331050 + }, + { + "epoch": 13.71, + "grad_norm": 1.4921875, + "learning_rate": 0.00028366394148282033, + "loss": 0.2147, + "step": 331060 + }, + { + "epoch": 13.71, + "grad_norm": 0.8671875, + "learning_rate": 0.0002836531950296101, + "loss": 0.2298, + "step": 331070 + }, + { + "epoch": 13.71, + "grad_norm": 1.1171875, + "learning_rate": 0.00028364244851306823, + "loss": 0.195, + "step": 331080 + }, + { + "epoch": 13.71, + "grad_norm": 0.60546875, + "learning_rate": 0.00028363170193321496, + "loss": 0.1955, + "step": 331090 + }, + { + "epoch": 13.71, + "grad_norm": 0.94140625, + "learning_rate": 0.00028362095529007035, + "loss": 0.1986, + "step": 331100 + }, + { + "epoch": 13.71, + "grad_norm": 0.6171875, + "learning_rate": 0.0002836102085836548, + "loss": 0.2022, + "step": 331110 + }, + { + "epoch": 13.71, + "grad_norm": 0.408203125, + "learning_rate": 0.0002835994618139884, + "loss": 0.2095, + "step": 331120 + }, + { + "epoch": 13.72, + "grad_norm": 1.25, + "learning_rate": 0.0002835887149810915, + "loss": 0.2331, + "step": 331130 + }, + { + "epoch": 13.72, + "grad_norm": 1.0, + "learning_rate": 0.0002835779680849844, + "loss": 0.2185, + "step": 331140 + }, + { + "epoch": 13.72, + "grad_norm": 0.58984375, + "learning_rate": 0.000283567221125687, + "loss": 0.1677, + "step": 331150 + }, + { + "epoch": 13.72, + "grad_norm": 0.828125, + "learning_rate": 0.0002835564741032197, + "loss": 0.1804, + "step": 331160 + }, + { + "epoch": 13.72, + "grad_norm": 0.453125, + "learning_rate": 0.00028354572701760287, + "loss": 0.1451, + "step": 331170 + }, + { + "epoch": 13.72, + "grad_norm": 0.65234375, + "learning_rate": 0.00028353497986885646, + "loss": 0.1853, + "step": 331180 + }, + { + "epoch": 13.72, + "grad_norm": 0.83984375, + "learning_rate": 0.000283524232657001, + "loss": 0.1568, + "step": 331190 + }, + { + "epoch": 13.72, + "grad_norm": 0.91015625, + "learning_rate": 0.0002835134853820565, + "loss": 0.2124, + "step": 331200 + }, + { + "epoch": 13.72, + "grad_norm": 0.384765625, + "learning_rate": 0.00028350273804404313, + "loss": 0.1956, + "step": 331210 + }, + { + "epoch": 13.72, + "grad_norm": 0.95703125, + "learning_rate": 0.00028349199064298136, + "loss": 0.2237, + "step": 331220 + }, + { + "epoch": 13.72, + "grad_norm": 0.77734375, + "learning_rate": 0.00028348124317889123, + "loss": 0.1797, + "step": 331230 + }, + { + "epoch": 13.72, + "grad_norm": 1.0078125, + "learning_rate": 0.000283470495651793, + "loss": 0.2131, + "step": 331240 + }, + { + "epoch": 13.72, + "grad_norm": 1.2578125, + "learning_rate": 0.00028345974806170694, + "loss": 0.2136, + "step": 331250 + }, + { + "epoch": 13.72, + "grad_norm": 0.69140625, + "learning_rate": 0.00028344900040865316, + "loss": 0.1796, + "step": 331260 + }, + { + "epoch": 13.72, + "grad_norm": 1.3359375, + "learning_rate": 0.00028343825269265213, + "loss": 0.1438, + "step": 331270 + }, + { + "epoch": 13.72, + "grad_norm": 0.98046875, + "learning_rate": 0.00028342750491372384, + "loss": 0.1921, + "step": 331280 + }, + { + "epoch": 13.72, + "grad_norm": 0.58203125, + "learning_rate": 0.0002834167570718885, + "loss": 0.215, + "step": 331290 + }, + { + "epoch": 13.72, + "grad_norm": 1.078125, + "learning_rate": 0.00028340600916716664, + "loss": 0.1869, + "step": 331300 + }, + { + "epoch": 13.72, + "grad_norm": 0.9296875, + "learning_rate": 0.0002833952611995781, + "loss": 0.2291, + "step": 331310 + }, + { + "epoch": 13.72, + "grad_norm": 0.9921875, + "learning_rate": 0.0002833845131691434, + "loss": 0.2065, + "step": 331320 + }, + { + "epoch": 13.72, + "grad_norm": 0.44140625, + "learning_rate": 0.0002833737650758826, + "loss": 0.2122, + "step": 331330 + }, + { + "epoch": 13.72, + "grad_norm": 0.828125, + "learning_rate": 0.00028336301691981593, + "loss": 0.1388, + "step": 331340 + }, + { + "epoch": 13.72, + "grad_norm": 0.25, + "learning_rate": 0.0002833522687009637, + "loss": 0.2216, + "step": 331350 + }, + { + "epoch": 13.72, + "grad_norm": 0.51171875, + "learning_rate": 0.0002833415204193461, + "loss": 0.1727, + "step": 331360 + }, + { + "epoch": 13.73, + "grad_norm": 0.59375, + "learning_rate": 0.00028333077207498336, + "loss": 0.2252, + "step": 331370 + }, + { + "epoch": 13.73, + "grad_norm": 1.1171875, + "learning_rate": 0.00028332002366789576, + "loss": 0.1931, + "step": 331380 + }, + { + "epoch": 13.73, + "grad_norm": 0.34375, + "learning_rate": 0.0002833092751981034, + "loss": 0.1631, + "step": 331390 + }, + { + "epoch": 13.73, + "grad_norm": 1.078125, + "learning_rate": 0.0002832985266656266, + "loss": 0.2055, + "step": 331400 + }, + { + "epoch": 13.73, + "grad_norm": 0.9921875, + "learning_rate": 0.00028328777807048566, + "loss": 0.1418, + "step": 331410 + }, + { + "epoch": 13.73, + "grad_norm": 0.5234375, + "learning_rate": 0.00028327702941270054, + "loss": 0.1553, + "step": 331420 + }, + { + "epoch": 13.73, + "grad_norm": 0.337890625, + "learning_rate": 0.00028326628069229185, + "loss": 0.182, + "step": 331430 + }, + { + "epoch": 13.73, + "grad_norm": 1.1484375, + "learning_rate": 0.0002832555319092794, + "loss": 0.2056, + "step": 331440 + }, + { + "epoch": 13.73, + "grad_norm": 0.921875, + "learning_rate": 0.00028324478306368376, + "loss": 0.1812, + "step": 331450 + }, + { + "epoch": 13.73, + "grad_norm": 0.671875, + "learning_rate": 0.0002832340341555251, + "loss": 0.1973, + "step": 331460 + }, + { + "epoch": 13.73, + "grad_norm": 0.48046875, + "learning_rate": 0.0002832232851848234, + "loss": 0.2286, + "step": 331470 + }, + { + "epoch": 13.73, + "grad_norm": 1.9375, + "learning_rate": 0.00028321253615159916, + "loss": 0.2068, + "step": 331480 + }, + { + "epoch": 13.73, + "grad_norm": 0.6171875, + "learning_rate": 0.0002832017870558726, + "loss": 0.2019, + "step": 331490 + }, + { + "epoch": 13.73, + "grad_norm": 0.765625, + "learning_rate": 0.00028319103789766365, + "loss": 0.2079, + "step": 331500 + }, + { + "epoch": 13.73, + "grad_norm": 0.9609375, + "learning_rate": 0.0002831802886769929, + "loss": 0.1575, + "step": 331510 + }, + { + "epoch": 13.73, + "grad_norm": 1.3046875, + "learning_rate": 0.00028316953939388036, + "loss": 0.2023, + "step": 331520 + }, + { + "epoch": 13.73, + "grad_norm": 0.5, + "learning_rate": 0.0002831587900483464, + "loss": 0.2036, + "step": 331530 + }, + { + "epoch": 13.73, + "grad_norm": 1.28125, + "learning_rate": 0.00028314804064041117, + "loss": 0.155, + "step": 331540 + }, + { + "epoch": 13.73, + "grad_norm": 1.203125, + "learning_rate": 0.0002831372911700948, + "loss": 0.2238, + "step": 331550 + }, + { + "epoch": 13.73, + "grad_norm": 0.6640625, + "learning_rate": 0.0002831265416374177, + "loss": 0.1876, + "step": 331560 + }, + { + "epoch": 13.73, + "grad_norm": 0.5859375, + "learning_rate": 0.0002831157920424001, + "loss": 0.2486, + "step": 331570 + }, + { + "epoch": 13.73, + "grad_norm": 1.09375, + "learning_rate": 0.000283105042385062, + "loss": 0.1616, + "step": 331580 + }, + { + "epoch": 13.73, + "grad_norm": 1.0078125, + "learning_rate": 0.0002830942926654239, + "loss": 0.19, + "step": 331590 + }, + { + "epoch": 13.73, + "grad_norm": 0.2275390625, + "learning_rate": 0.0002830835428835059, + "loss": 0.1916, + "step": 331600 + }, + { + "epoch": 13.74, + "grad_norm": 0.54296875, + "learning_rate": 0.0002830727930393282, + "loss": 0.1746, + "step": 331610 + }, + { + "epoch": 13.74, + "grad_norm": 0.46484375, + "learning_rate": 0.0002830620431329112, + "loss": 0.1766, + "step": 331620 + }, + { + "epoch": 13.74, + "grad_norm": 0.74609375, + "learning_rate": 0.0002830512931642748, + "loss": 0.1617, + "step": 331630 + }, + { + "epoch": 13.74, + "grad_norm": 0.53515625, + "learning_rate": 0.00028304054313343956, + "loss": 0.2093, + "step": 331640 + }, + { + "epoch": 13.74, + "grad_norm": 0.466796875, + "learning_rate": 0.00028302979304042566, + "loss": 0.1811, + "step": 331650 + }, + { + "epoch": 13.74, + "grad_norm": 0.82421875, + "learning_rate": 0.0002830190428852531, + "loss": 0.1977, + "step": 331660 + }, + { + "epoch": 13.74, + "grad_norm": 0.65234375, + "learning_rate": 0.00028300829266794236, + "loss": 0.187, + "step": 331670 + }, + { + "epoch": 13.74, + "grad_norm": 0.6640625, + "learning_rate": 0.00028299754238851355, + "loss": 0.2061, + "step": 331680 + }, + { + "epoch": 13.74, + "grad_norm": 1.6953125, + "learning_rate": 0.0002829867920469869, + "loss": 0.2025, + "step": 331690 + }, + { + "epoch": 13.74, + "grad_norm": 1.359375, + "learning_rate": 0.00028297604164338275, + "loss": 0.1758, + "step": 331700 + }, + { + "epoch": 13.74, + "grad_norm": 0.71484375, + "learning_rate": 0.00028296529117772113, + "loss": 0.1655, + "step": 331710 + }, + { + "epoch": 13.74, + "grad_norm": 0.78515625, + "learning_rate": 0.0002829545406500225, + "loss": 0.1742, + "step": 331720 + }, + { + "epoch": 13.74, + "grad_norm": 0.9375, + "learning_rate": 0.00028294379006030696, + "loss": 0.194, + "step": 331730 + }, + { + "epoch": 13.74, + "grad_norm": 0.74609375, + "learning_rate": 0.00028293303940859474, + "loss": 0.2162, + "step": 331740 + }, + { + "epoch": 13.74, + "grad_norm": 0.5703125, + "learning_rate": 0.00028292228869490616, + "loss": 0.1913, + "step": 331750 + }, + { + "epoch": 13.74, + "grad_norm": 1.0234375, + "learning_rate": 0.00028291153791926133, + "loss": 0.1991, + "step": 331760 + }, + { + "epoch": 13.74, + "grad_norm": 0.52734375, + "learning_rate": 0.0002829007870816805, + "loss": 0.2326, + "step": 331770 + }, + { + "epoch": 13.74, + "grad_norm": 0.72265625, + "learning_rate": 0.00028289003618218404, + "loss": 0.2122, + "step": 331780 + }, + { + "epoch": 13.74, + "grad_norm": 0.65234375, + "learning_rate": 0.00028287928522079196, + "loss": 0.1732, + "step": 331790 + }, + { + "epoch": 13.74, + "grad_norm": 0.4921875, + "learning_rate": 0.00028286853419752473, + "loss": 0.2087, + "step": 331800 + }, + { + "epoch": 13.74, + "grad_norm": 1.171875, + "learning_rate": 0.0002828577831124024, + "loss": 0.1722, + "step": 331810 + }, + { + "epoch": 13.74, + "grad_norm": 0.4140625, + "learning_rate": 0.0002828470319654453, + "loss": 0.1715, + "step": 331820 + }, + { + "epoch": 13.74, + "grad_norm": 0.447265625, + "learning_rate": 0.0002828362807566737, + "loss": 0.1577, + "step": 331830 + }, + { + "epoch": 13.74, + "grad_norm": 0.88671875, + "learning_rate": 0.00028282552948610763, + "loss": 0.2305, + "step": 331840 + }, + { + "epoch": 13.75, + "grad_norm": 0.91015625, + "learning_rate": 0.00028281477815376754, + "loss": 0.1873, + "step": 331850 + }, + { + "epoch": 13.75, + "grad_norm": 1.03125, + "learning_rate": 0.0002828040267596736, + "loss": 0.1827, + "step": 331860 + }, + { + "epoch": 13.75, + "grad_norm": 1.5546875, + "learning_rate": 0.0002827932753038459, + "loss": 0.1521, + "step": 331870 + }, + { + "epoch": 13.75, + "grad_norm": 0.80078125, + "learning_rate": 0.00028278252378630494, + "loss": 0.1417, + "step": 331880 + }, + { + "epoch": 13.75, + "grad_norm": 0.59375, + "learning_rate": 0.0002827717722070707, + "loss": 0.1638, + "step": 331890 + }, + { + "epoch": 13.75, + "grad_norm": 0.50390625, + "learning_rate": 0.0002827610205661636, + "loss": 0.2218, + "step": 331900 + }, + { + "epoch": 13.75, + "grad_norm": 0.6640625, + "learning_rate": 0.0002827502688636038, + "loss": 0.1884, + "step": 331910 + }, + { + "epoch": 13.75, + "grad_norm": 0.8203125, + "learning_rate": 0.0002827395170994115, + "loss": 0.1474, + "step": 331920 + }, + { + "epoch": 13.75, + "grad_norm": 1.578125, + "learning_rate": 0.00028272876527360696, + "loss": 0.1561, + "step": 331930 + }, + { + "epoch": 13.75, + "grad_norm": 0.328125, + "learning_rate": 0.00028271801338621043, + "loss": 0.2062, + "step": 331940 + }, + { + "epoch": 13.75, + "grad_norm": 0.96875, + "learning_rate": 0.0002827072614372421, + "loss": 0.1788, + "step": 331950 + }, + { + "epoch": 13.75, + "grad_norm": 1.3359375, + "learning_rate": 0.0002826965094267223, + "loss": 0.1774, + "step": 331960 + }, + { + "epoch": 13.75, + "grad_norm": 0.5390625, + "learning_rate": 0.0002826857573546712, + "loss": 0.2596, + "step": 331970 + }, + { + "epoch": 13.75, + "grad_norm": 1.5, + "learning_rate": 0.000282675005221109, + "loss": 0.2117, + "step": 331980 + }, + { + "epoch": 13.75, + "grad_norm": 1.3203125, + "learning_rate": 0.000282664253026056, + "loss": 0.1939, + "step": 331990 + }, + { + "epoch": 13.75, + "grad_norm": 0.76953125, + "learning_rate": 0.00028265350076953235, + "loss": 0.1945, + "step": 332000 + }, + { + "epoch": 13.75, + "grad_norm": 0.498046875, + "learning_rate": 0.0002826427484515584, + "loss": 0.1425, + "step": 332010 + }, + { + "epoch": 13.75, + "grad_norm": 1.15625, + "learning_rate": 0.0002826319960721542, + "loss": 0.148, + "step": 332020 + }, + { + "epoch": 13.75, + "grad_norm": 1.046875, + "learning_rate": 0.00028262124363134024, + "loss": 0.1825, + "step": 332030 + }, + { + "epoch": 13.75, + "grad_norm": 0.74609375, + "learning_rate": 0.00028261049112913663, + "loss": 0.1763, + "step": 332040 + }, + { + "epoch": 13.75, + "grad_norm": 1.140625, + "learning_rate": 0.0002825997385655635, + "loss": 0.1661, + "step": 332050 + }, + { + "epoch": 13.75, + "grad_norm": 1.2421875, + "learning_rate": 0.0002825889859406412, + "loss": 0.1924, + "step": 332060 + }, + { + "epoch": 13.75, + "grad_norm": 0.69140625, + "learning_rate": 0.00028257823325439005, + "loss": 0.1716, + "step": 332070 + }, + { + "epoch": 13.75, + "grad_norm": 0.58203125, + "learning_rate": 0.0002825674805068301, + "loss": 0.1507, + "step": 332080 + }, + { + "epoch": 13.76, + "grad_norm": 0.71484375, + "learning_rate": 0.0002825567276979816, + "loss": 0.2051, + "step": 332090 + }, + { + "epoch": 13.76, + "grad_norm": 0.59375, + "learning_rate": 0.0002825459748278649, + "loss": 0.1982, + "step": 332100 + }, + { + "epoch": 13.76, + "grad_norm": 0.73828125, + "learning_rate": 0.00028253522189650024, + "loss": 0.191, + "step": 332110 + }, + { + "epoch": 13.76, + "grad_norm": 0.83203125, + "learning_rate": 0.0002825244689039078, + "loss": 0.2004, + "step": 332120 + }, + { + "epoch": 13.76, + "grad_norm": 1.625, + "learning_rate": 0.00028251371585010784, + "loss": 0.2021, + "step": 332130 + }, + { + "epoch": 13.76, + "grad_norm": 1.1640625, + "learning_rate": 0.0002825029627351205, + "loss": 0.2079, + "step": 332140 + }, + { + "epoch": 13.76, + "grad_norm": 0.63671875, + "learning_rate": 0.0002824922095589662, + "loss": 0.1874, + "step": 332150 + }, + { + "epoch": 13.76, + "grad_norm": 0.7578125, + "learning_rate": 0.000282481456321665, + "loss": 0.2252, + "step": 332160 + }, + { + "epoch": 13.76, + "grad_norm": 0.6640625, + "learning_rate": 0.00028247070302323717, + "loss": 0.1756, + "step": 332170 + }, + { + "epoch": 13.76, + "grad_norm": 2.078125, + "learning_rate": 0.000282459949663703, + "loss": 0.1665, + "step": 332180 + }, + { + "epoch": 13.76, + "grad_norm": 0.8984375, + "learning_rate": 0.00028244919624308276, + "loss": 0.1993, + "step": 332190 + }, + { + "epoch": 13.76, + "grad_norm": 1.1875, + "learning_rate": 0.00028243844276139665, + "loss": 0.1738, + "step": 332200 + }, + { + "epoch": 13.76, + "grad_norm": 0.7734375, + "learning_rate": 0.00028242768921866486, + "loss": 0.2177, + "step": 332210 + }, + { + "epoch": 13.76, + "grad_norm": 0.384765625, + "learning_rate": 0.00028241693561490767, + "loss": 0.1661, + "step": 332220 + }, + { + "epoch": 13.76, + "grad_norm": 0.6328125, + "learning_rate": 0.0002824061819501453, + "loss": 0.2195, + "step": 332230 + }, + { + "epoch": 13.76, + "grad_norm": 0.96875, + "learning_rate": 0.000282395428224398, + "loss": 0.1592, + "step": 332240 + }, + { + "epoch": 13.76, + "grad_norm": 0.333984375, + "learning_rate": 0.000282384674437686, + "loss": 0.1755, + "step": 332250 + }, + { + "epoch": 13.76, + "grad_norm": 1.21875, + "learning_rate": 0.0002823739205900295, + "loss": 0.1851, + "step": 332260 + }, + { + "epoch": 13.76, + "grad_norm": 0.4921875, + "learning_rate": 0.0002823631666814488, + "loss": 0.181, + "step": 332270 + }, + { + "epoch": 13.76, + "grad_norm": 0.51171875, + "learning_rate": 0.00028235241271196414, + "loss": 0.1616, + "step": 332280 + }, + { + "epoch": 13.76, + "grad_norm": 0.67578125, + "learning_rate": 0.00028234165868159573, + "loss": 0.2407, + "step": 332290 + }, + { + "epoch": 13.76, + "grad_norm": 0.0615234375, + "learning_rate": 0.00028233090459036384, + "loss": 0.228, + "step": 332300 + }, + { + "epoch": 13.76, + "grad_norm": 0.828125, + "learning_rate": 0.0002823201504382886, + "loss": 0.1684, + "step": 332310 + }, + { + "epoch": 13.76, + "grad_norm": 0.65625, + "learning_rate": 0.0002823093962253904, + "loss": 0.2608, + "step": 332320 + }, + { + "epoch": 13.77, + "grad_norm": 1.1640625, + "learning_rate": 0.00028229864195168934, + "loss": 0.1907, + "step": 332330 + }, + { + "epoch": 13.77, + "grad_norm": 1.140625, + "learning_rate": 0.00028228788761720575, + "loss": 0.209, + "step": 332340 + }, + { + "epoch": 13.77, + "grad_norm": 0.96875, + "learning_rate": 0.0002822771332219599, + "loss": 0.1772, + "step": 332350 + }, + { + "epoch": 13.77, + "grad_norm": 0.1494140625, + "learning_rate": 0.00028226637876597186, + "loss": 0.1457, + "step": 332360 + }, + { + "epoch": 13.77, + "grad_norm": 0.5703125, + "learning_rate": 0.0002822556242492621, + "loss": 0.1449, + "step": 332370 + }, + { + "epoch": 13.77, + "grad_norm": 3.703125, + "learning_rate": 0.00028224486967185066, + "loss": 0.218, + "step": 332380 + }, + { + "epoch": 13.77, + "grad_norm": 1.1328125, + "learning_rate": 0.0002822341150337578, + "loss": 0.2152, + "step": 332390 + }, + { + "epoch": 13.77, + "grad_norm": 1.2421875, + "learning_rate": 0.00028222336033500403, + "loss": 0.2494, + "step": 332400 + }, + { + "epoch": 13.77, + "grad_norm": 1.390625, + "learning_rate": 0.0002822126055756092, + "loss": 0.1839, + "step": 332410 + }, + { + "epoch": 13.77, + "grad_norm": 1.53125, + "learning_rate": 0.00028220185075559373, + "loss": 0.2214, + "step": 332420 + }, + { + "epoch": 13.77, + "grad_norm": 0.66015625, + "learning_rate": 0.0002821910958749779, + "loss": 0.16, + "step": 332430 + }, + { + "epoch": 13.77, + "grad_norm": 0.2080078125, + "learning_rate": 0.00028218034093378184, + "loss": 0.1817, + "step": 332440 + }, + { + "epoch": 13.77, + "grad_norm": 0.65625, + "learning_rate": 0.00028216958593202596, + "loss": 0.1953, + "step": 332450 + }, + { + "epoch": 13.77, + "grad_norm": 1.046875, + "learning_rate": 0.0002821588308697303, + "loss": 0.1502, + "step": 332460 + }, + { + "epoch": 13.77, + "grad_norm": 0.98046875, + "learning_rate": 0.0002821480757469153, + "loss": 0.2367, + "step": 332470 + }, + { + "epoch": 13.77, + "grad_norm": 0.5625, + "learning_rate": 0.000282137320563601, + "loss": 0.1937, + "step": 332480 + }, + { + "epoch": 13.77, + "grad_norm": 0.71875, + "learning_rate": 0.00028212656531980773, + "loss": 0.1693, + "step": 332490 + }, + { + "epoch": 13.77, + "grad_norm": 0.72265625, + "learning_rate": 0.0002821158100155558, + "loss": 0.1907, + "step": 332500 + }, + { + "epoch": 13.77, + "grad_norm": 0.625, + "learning_rate": 0.0002821050546508653, + "loss": 0.1763, + "step": 332510 + }, + { + "epoch": 13.77, + "grad_norm": 0.8515625, + "learning_rate": 0.00028209429922575657, + "loss": 0.1456, + "step": 332520 + }, + { + "epoch": 13.77, + "grad_norm": 0.68359375, + "learning_rate": 0.0002820835437402499, + "loss": 0.1192, + "step": 332530 + }, + { + "epoch": 13.77, + "grad_norm": 0.6953125, + "learning_rate": 0.0002820727881943654, + "loss": 0.228, + "step": 332540 + }, + { + "epoch": 13.77, + "grad_norm": 0.640625, + "learning_rate": 0.00028206203258812347, + "loss": 0.145, + "step": 332550 + }, + { + "epoch": 13.77, + "grad_norm": 1.6640625, + "learning_rate": 0.00028205127692154414, + "loss": 0.1988, + "step": 332560 + }, + { + "epoch": 13.78, + "grad_norm": 1.7578125, + "learning_rate": 0.00028204052119464783, + "loss": 0.1458, + "step": 332570 + }, + { + "epoch": 13.78, + "grad_norm": 1.203125, + "learning_rate": 0.00028202976540745475, + "loss": 0.1811, + "step": 332580 + }, + { + "epoch": 13.78, + "grad_norm": 0.72265625, + "learning_rate": 0.00028201900955998507, + "loss": 0.2179, + "step": 332590 + }, + { + "epoch": 13.78, + "grad_norm": 0.671875, + "learning_rate": 0.00028200825365225907, + "loss": 0.1374, + "step": 332600 + }, + { + "epoch": 13.78, + "grad_norm": 1.046875, + "learning_rate": 0.000281997497684297, + "loss": 0.2122, + "step": 332610 + }, + { + "epoch": 13.78, + "grad_norm": 1.4765625, + "learning_rate": 0.0002819867416561191, + "loss": 0.2525, + "step": 332620 + }, + { + "epoch": 13.78, + "grad_norm": 1.078125, + "learning_rate": 0.0002819759855677456, + "loss": 0.211, + "step": 332630 + }, + { + "epoch": 13.78, + "grad_norm": 0.33203125, + "learning_rate": 0.0002819652294191967, + "loss": 0.1915, + "step": 332640 + }, + { + "epoch": 13.78, + "grad_norm": 0.8359375, + "learning_rate": 0.0002819544732104927, + "loss": 0.2312, + "step": 332650 + }, + { + "epoch": 13.78, + "grad_norm": 1.375, + "learning_rate": 0.00028194371694165393, + "loss": 0.1984, + "step": 332660 + }, + { + "epoch": 13.78, + "grad_norm": 1.34375, + "learning_rate": 0.0002819329606127005, + "loss": 0.1605, + "step": 332670 + }, + { + "epoch": 13.78, + "grad_norm": 0.86328125, + "learning_rate": 0.0002819222042236526, + "loss": 0.2171, + "step": 332680 + }, + { + "epoch": 13.78, + "grad_norm": 0.86328125, + "learning_rate": 0.0002819114477745307, + "loss": 0.1839, + "step": 332690 + }, + { + "epoch": 13.78, + "grad_norm": 0.72265625, + "learning_rate": 0.0002819006912653547, + "loss": 0.2132, + "step": 332700 + }, + { + "epoch": 13.78, + "grad_norm": 1.109375, + "learning_rate": 0.00028188993469614526, + "loss": 0.1632, + "step": 332710 + }, + { + "epoch": 13.78, + "grad_norm": 0.921875, + "learning_rate": 0.0002818791780669223, + "loss": 0.1965, + "step": 332720 + }, + { + "epoch": 13.78, + "grad_norm": 1.453125, + "learning_rate": 0.0002818684213777061, + "loss": 0.1866, + "step": 332730 + }, + { + "epoch": 13.78, + "grad_norm": 1.0859375, + "learning_rate": 0.00028185766462851715, + "loss": 0.16, + "step": 332740 + }, + { + "epoch": 13.78, + "grad_norm": 0.94140625, + "learning_rate": 0.0002818469078193754, + "loss": 0.2092, + "step": 332750 + }, + { + "epoch": 13.78, + "grad_norm": 0.86328125, + "learning_rate": 0.00028183615095030124, + "loss": 0.143, + "step": 332760 + }, + { + "epoch": 13.78, + "grad_norm": 0.6171875, + "learning_rate": 0.0002818253940213149, + "loss": 0.2017, + "step": 332770 + }, + { + "epoch": 13.78, + "grad_norm": 1.765625, + "learning_rate": 0.0002818146370324365, + "loss": 0.2156, + "step": 332780 + }, + { + "epoch": 13.78, + "grad_norm": 0.75390625, + "learning_rate": 0.0002818038799836865, + "loss": 0.186, + "step": 332790 + }, + { + "epoch": 13.78, + "grad_norm": 0.796875, + "learning_rate": 0.00028179312287508496, + "loss": 0.2169, + "step": 332800 + }, + { + "epoch": 13.78, + "grad_norm": 0.9375, + "learning_rate": 0.0002817823657066522, + "loss": 0.1658, + "step": 332810 + }, + { + "epoch": 13.79, + "grad_norm": 0.5234375, + "learning_rate": 0.00028177160847840855, + "loss": 0.212, + "step": 332820 + }, + { + "epoch": 13.79, + "grad_norm": 0.6640625, + "learning_rate": 0.0002817608511903741, + "loss": 0.1941, + "step": 332830 + }, + { + "epoch": 13.79, + "grad_norm": 0.44921875, + "learning_rate": 0.0002817500938425692, + "loss": 0.2344, + "step": 332840 + }, + { + "epoch": 13.79, + "grad_norm": 0.98828125, + "learning_rate": 0.00028173933643501405, + "loss": 0.1628, + "step": 332850 + }, + { + "epoch": 13.79, + "grad_norm": 0.7578125, + "learning_rate": 0.00028172857896772876, + "loss": 0.1721, + "step": 332860 + }, + { + "epoch": 13.79, + "grad_norm": 0.81640625, + "learning_rate": 0.0002817178214407338, + "loss": 0.1583, + "step": 332870 + }, + { + "epoch": 13.79, + "grad_norm": 0.50390625, + "learning_rate": 0.0002817070638540494, + "loss": 0.2057, + "step": 332880 + }, + { + "epoch": 13.79, + "grad_norm": 1.265625, + "learning_rate": 0.0002816963062076956, + "loss": 0.2105, + "step": 332890 + }, + { + "epoch": 13.79, + "grad_norm": 0.765625, + "learning_rate": 0.0002816855485016929, + "loss": 0.1968, + "step": 332900 + }, + { + "epoch": 13.79, + "grad_norm": 1.109375, + "learning_rate": 0.00028167479073606136, + "loss": 0.2022, + "step": 332910 + }, + { + "epoch": 13.79, + "grad_norm": 0.9453125, + "learning_rate": 0.00028166403291082125, + "loss": 0.1366, + "step": 332920 + }, + { + "epoch": 13.79, + "grad_norm": 1.3203125, + "learning_rate": 0.0002816532750259929, + "loss": 0.1936, + "step": 332930 + }, + { + "epoch": 13.79, + "grad_norm": 0.333984375, + "learning_rate": 0.0002816425170815965, + "loss": 0.1931, + "step": 332940 + }, + { + "epoch": 13.79, + "grad_norm": 1.078125, + "learning_rate": 0.0002816317590776523, + "loss": 0.1738, + "step": 332950 + }, + { + "epoch": 13.79, + "grad_norm": 3.015625, + "learning_rate": 0.0002816210010141805, + "loss": 0.1714, + "step": 332960 + }, + { + "epoch": 13.79, + "grad_norm": 0.5546875, + "learning_rate": 0.0002816102428912014, + "loss": 0.169, + "step": 332970 + }, + { + "epoch": 13.79, + "grad_norm": 0.58203125, + "learning_rate": 0.00028159948470873533, + "loss": 0.2097, + "step": 332980 + }, + { + "epoch": 13.79, + "grad_norm": 0.70703125, + "learning_rate": 0.00028158872646680234, + "loss": 0.2023, + "step": 332990 + }, + { + "epoch": 13.79, + "grad_norm": 1.65625, + "learning_rate": 0.0002815779681654228, + "loss": 0.1554, + "step": 333000 + }, + { + "epoch": 13.79, + "grad_norm": 1.9921875, + "learning_rate": 0.000281567209804617, + "loss": 0.2251, + "step": 333010 + }, + { + "epoch": 13.79, + "grad_norm": 2.078125, + "learning_rate": 0.000281556451384405, + "loss": 0.2438, + "step": 333020 + }, + { + "epoch": 13.79, + "grad_norm": 0.41015625, + "learning_rate": 0.00028154569290480733, + "loss": 0.2303, + "step": 333030 + }, + { + "epoch": 13.79, + "grad_norm": 0.45703125, + "learning_rate": 0.0002815349343658439, + "loss": 0.1908, + "step": 333040 + }, + { + "epoch": 13.79, + "grad_norm": 0.8359375, + "learning_rate": 0.0002815241757675352, + "loss": 0.2085, + "step": 333050 + }, + { + "epoch": 13.8, + "grad_norm": 0.5546875, + "learning_rate": 0.00028151341710990147, + "loss": 0.1751, + "step": 333060 + }, + { + "epoch": 13.8, + "grad_norm": 0.62109375, + "learning_rate": 0.00028150265839296275, + "loss": 0.2034, + "step": 333070 + }, + { + "epoch": 13.8, + "grad_norm": 1.1796875, + "learning_rate": 0.00028149189961673953, + "loss": 0.1598, + "step": 333080 + }, + { + "epoch": 13.8, + "grad_norm": 1.28125, + "learning_rate": 0.00028148114078125204, + "loss": 0.1744, + "step": 333090 + }, + { + "epoch": 13.8, + "grad_norm": 0.875, + "learning_rate": 0.00028147038188652026, + "loss": 0.1958, + "step": 333100 + }, + { + "epoch": 13.8, + "grad_norm": 0.373046875, + "learning_rate": 0.0002814596229325647, + "loss": 0.1687, + "step": 333110 + }, + { + "epoch": 13.8, + "grad_norm": 0.47265625, + "learning_rate": 0.00028144886391940553, + "loss": 0.2256, + "step": 333120 + }, + { + "epoch": 13.8, + "grad_norm": 0.45703125, + "learning_rate": 0.000281438104847063, + "loss": 0.1391, + "step": 333130 + }, + { + "epoch": 13.8, + "grad_norm": 0.828125, + "learning_rate": 0.0002814273457155574, + "loss": 0.1786, + "step": 333140 + }, + { + "epoch": 13.8, + "grad_norm": 1.0078125, + "learning_rate": 0.00028141658652490876, + "loss": 0.2083, + "step": 333150 + }, + { + "epoch": 13.8, + "grad_norm": 1.21875, + "learning_rate": 0.00028140582727513764, + "loss": 0.237, + "step": 333160 + }, + { + "epoch": 13.8, + "grad_norm": 0.88671875, + "learning_rate": 0.00028139506796626417, + "loss": 0.1913, + "step": 333170 + }, + { + "epoch": 13.8, + "grad_norm": 1.2890625, + "learning_rate": 0.00028138430859830845, + "loss": 0.1909, + "step": 333180 + }, + { + "epoch": 13.8, + "grad_norm": 0.91015625, + "learning_rate": 0.000281373549171291, + "loss": 0.17, + "step": 333190 + }, + { + "epoch": 13.8, + "grad_norm": 0.69140625, + "learning_rate": 0.00028136278968523176, + "loss": 0.14, + "step": 333200 + }, + { + "epoch": 13.8, + "grad_norm": 1.171875, + "learning_rate": 0.0002813520301401512, + "loss": 0.1646, + "step": 333210 + }, + { + "epoch": 13.8, + "grad_norm": 0.73046875, + "learning_rate": 0.00028134127053606957, + "loss": 0.2261, + "step": 333220 + }, + { + "epoch": 13.8, + "grad_norm": 0.275390625, + "learning_rate": 0.0002813305108730069, + "loss": 0.1681, + "step": 333230 + }, + { + "epoch": 13.8, + "grad_norm": 0.75390625, + "learning_rate": 0.0002813197511509838, + "loss": 0.1649, + "step": 333240 + }, + { + "epoch": 13.8, + "grad_norm": 1.0390625, + "learning_rate": 0.00028130899137002017, + "loss": 0.2141, + "step": 333250 + }, + { + "epoch": 13.8, + "grad_norm": 1.0703125, + "learning_rate": 0.0002812982315301364, + "loss": 0.1451, + "step": 333260 + }, + { + "epoch": 13.8, + "grad_norm": 0.7578125, + "learning_rate": 0.0002812874716313528, + "loss": 0.1563, + "step": 333270 + }, + { + "epoch": 13.8, + "grad_norm": 0.388671875, + "learning_rate": 0.0002812767116736895, + "loss": 0.1776, + "step": 333280 + }, + { + "epoch": 13.8, + "grad_norm": 0.8671875, + "learning_rate": 0.00028126595165716686, + "loss": 0.1722, + "step": 333290 + }, + { + "epoch": 13.81, + "grad_norm": 0.205078125, + "learning_rate": 0.00028125519158180506, + "loss": 0.1459, + "step": 333300 + }, + { + "epoch": 13.81, + "grad_norm": 0.291015625, + "learning_rate": 0.0002812444314476243, + "loss": 0.1963, + "step": 333310 + }, + { + "epoch": 13.81, + "grad_norm": 0.64453125, + "learning_rate": 0.000281233671254645, + "loss": 0.2165, + "step": 333320 + }, + { + "epoch": 13.81, + "grad_norm": 0.57421875, + "learning_rate": 0.0002812229110028872, + "loss": 0.187, + "step": 333330 + }, + { + "epoch": 13.81, + "grad_norm": 1.265625, + "learning_rate": 0.0002812121506923713, + "loss": 0.197, + "step": 333340 + }, + { + "epoch": 13.81, + "grad_norm": 0.66015625, + "learning_rate": 0.0002812013903231176, + "loss": 0.1735, + "step": 333350 + }, + { + "epoch": 13.81, + "grad_norm": 1.21875, + "learning_rate": 0.00028119062989514614, + "loss": 0.1806, + "step": 333360 + }, + { + "epoch": 13.81, + "grad_norm": 0.50390625, + "learning_rate": 0.0002811798694084773, + "loss": 0.1589, + "step": 333370 + }, + { + "epoch": 13.81, + "grad_norm": 0.95703125, + "learning_rate": 0.00028116910886313135, + "loss": 0.2121, + "step": 333380 + }, + { + "epoch": 13.81, + "grad_norm": 0.546875, + "learning_rate": 0.0002811583482591284, + "loss": 0.1936, + "step": 333390 + }, + { + "epoch": 13.81, + "grad_norm": 1.0390625, + "learning_rate": 0.0002811475875964889, + "loss": 0.2135, + "step": 333400 + }, + { + "epoch": 13.81, + "grad_norm": 0.85546875, + "learning_rate": 0.00028113682687523297, + "loss": 0.215, + "step": 333410 + }, + { + "epoch": 13.81, + "grad_norm": 0.9140625, + "learning_rate": 0.0002811260660953809, + "loss": 0.1738, + "step": 333420 + }, + { + "epoch": 13.81, + "grad_norm": 1.0390625, + "learning_rate": 0.00028111530525695293, + "loss": 0.2365, + "step": 333430 + }, + { + "epoch": 13.81, + "grad_norm": 1.1171875, + "learning_rate": 0.00028110454435996933, + "loss": 0.1682, + "step": 333440 + }, + { + "epoch": 13.81, + "grad_norm": 0.9765625, + "learning_rate": 0.0002810937834044503, + "loss": 0.2604, + "step": 333450 + }, + { + "epoch": 13.81, + "grad_norm": 0.41015625, + "learning_rate": 0.0002810830223904162, + "loss": 0.1339, + "step": 333460 + }, + { + "epoch": 13.81, + "grad_norm": 1.1796875, + "learning_rate": 0.0002810722613178871, + "loss": 0.2269, + "step": 333470 + }, + { + "epoch": 13.81, + "grad_norm": 0.8203125, + "learning_rate": 0.0002810615001868835, + "loss": 0.1378, + "step": 333480 + }, + { + "epoch": 13.81, + "grad_norm": 1.09375, + "learning_rate": 0.0002810507389974254, + "loss": 0.2039, + "step": 333490 + }, + { + "epoch": 13.81, + "grad_norm": 0.435546875, + "learning_rate": 0.00028103997774953316, + "loss": 0.1769, + "step": 333500 + }, + { + "epoch": 13.81, + "grad_norm": 0.52734375, + "learning_rate": 0.00028102921644322713, + "loss": 0.1809, + "step": 333510 + }, + { + "epoch": 13.81, + "grad_norm": 0.83984375, + "learning_rate": 0.0002810184550785274, + "loss": 0.2415, + "step": 333520 + }, + { + "epoch": 13.81, + "grad_norm": 0.6796875, + "learning_rate": 0.00028100769365545426, + "loss": 0.1919, + "step": 333530 + }, + { + "epoch": 13.82, + "grad_norm": 1.140625, + "learning_rate": 0.00028099693217402806, + "loss": 0.1864, + "step": 333540 + }, + { + "epoch": 13.82, + "grad_norm": 0.5703125, + "learning_rate": 0.0002809861706342689, + "loss": 0.1997, + "step": 333550 + }, + { + "epoch": 13.82, + "grad_norm": 0.7734375, + "learning_rate": 0.0002809754090361972, + "loss": 0.2487, + "step": 333560 + }, + { + "epoch": 13.82, + "grad_norm": 0.8359375, + "learning_rate": 0.0002809646473798331, + "loss": 0.1902, + "step": 333570 + }, + { + "epoch": 13.82, + "grad_norm": 0.73046875, + "learning_rate": 0.0002809538856651968, + "loss": 0.17, + "step": 333580 + }, + { + "epoch": 13.82, + "grad_norm": 0.9375, + "learning_rate": 0.00028094312389230876, + "loss": 0.1885, + "step": 333590 + }, + { + "epoch": 13.82, + "grad_norm": 0.36328125, + "learning_rate": 0.000280932362061189, + "loss": 0.1888, + "step": 333600 + }, + { + "epoch": 13.82, + "grad_norm": 2.5, + "learning_rate": 0.0002809216001718579, + "loss": 0.1574, + "step": 333610 + }, + { + "epoch": 13.82, + "grad_norm": 0.6640625, + "learning_rate": 0.00028091083822433564, + "loss": 0.2086, + "step": 333620 + }, + { + "epoch": 13.82, + "grad_norm": 0.408203125, + "learning_rate": 0.0002809000762186426, + "loss": 0.1582, + "step": 333630 + }, + { + "epoch": 13.82, + "grad_norm": 0.6875, + "learning_rate": 0.00028088931415479895, + "loss": 0.189, + "step": 333640 + }, + { + "epoch": 13.82, + "grad_norm": 0.69140625, + "learning_rate": 0.0002808785520328249, + "loss": 0.1602, + "step": 333650 + }, + { + "epoch": 13.82, + "grad_norm": 0.8515625, + "learning_rate": 0.0002808677898527408, + "loss": 0.1664, + "step": 333660 + }, + { + "epoch": 13.82, + "grad_norm": 1.125, + "learning_rate": 0.00028085702761456685, + "loss": 0.2136, + "step": 333670 + }, + { + "epoch": 13.82, + "grad_norm": 1.9609375, + "learning_rate": 0.0002808462653183233, + "loss": 0.2203, + "step": 333680 + }, + { + "epoch": 13.82, + "grad_norm": 0.5859375, + "learning_rate": 0.0002808355029640304, + "loss": 0.2344, + "step": 333690 + }, + { + "epoch": 13.82, + "grad_norm": 1.09375, + "learning_rate": 0.0002808247405517084, + "loss": 0.2269, + "step": 333700 + }, + { + "epoch": 13.82, + "grad_norm": 1.0546875, + "learning_rate": 0.00028081397808137754, + "loss": 0.1923, + "step": 333710 + }, + { + "epoch": 13.82, + "grad_norm": 2.53125, + "learning_rate": 0.00028080321555305814, + "loss": 0.1705, + "step": 333720 + }, + { + "epoch": 13.82, + "grad_norm": 1.890625, + "learning_rate": 0.0002807924529667704, + "loss": 0.1614, + "step": 333730 + }, + { + "epoch": 13.82, + "grad_norm": 0.69921875, + "learning_rate": 0.00028078169032253456, + "loss": 0.1224, + "step": 333740 + }, + { + "epoch": 13.82, + "grad_norm": 0.7734375, + "learning_rate": 0.000280770927620371, + "loss": 0.2209, + "step": 333750 + }, + { + "epoch": 13.82, + "grad_norm": 1.5, + "learning_rate": 0.0002807601648602998, + "loss": 0.2059, + "step": 333760 + }, + { + "epoch": 13.82, + "grad_norm": 1.265625, + "learning_rate": 0.0002807494020423413, + "loss": 0.2238, + "step": 333770 + }, + { + "epoch": 13.83, + "grad_norm": 0.388671875, + "learning_rate": 0.00028073863916651577, + "loss": 0.2046, + "step": 333780 + }, + { + "epoch": 13.83, + "grad_norm": 1.5, + "learning_rate": 0.00028072787623284343, + "loss": 0.1838, + "step": 333790 + }, + { + "epoch": 13.83, + "grad_norm": 0.81640625, + "learning_rate": 0.00028071711324134457, + "loss": 0.1615, + "step": 333800 + }, + { + "epoch": 13.83, + "grad_norm": 1.109375, + "learning_rate": 0.0002807063501920394, + "loss": 0.1916, + "step": 333810 + }, + { + "epoch": 13.83, + "grad_norm": 0.93359375, + "learning_rate": 0.0002806955870849481, + "loss": 0.1797, + "step": 333820 + }, + { + "epoch": 13.83, + "grad_norm": 1.0390625, + "learning_rate": 0.00028068482392009113, + "loss": 0.1532, + "step": 333830 + }, + { + "epoch": 13.83, + "grad_norm": 0.515625, + "learning_rate": 0.00028067406069748866, + "loss": 0.1906, + "step": 333840 + }, + { + "epoch": 13.83, + "grad_norm": 0.50390625, + "learning_rate": 0.00028066329741716084, + "loss": 0.2231, + "step": 333850 + }, + { + "epoch": 13.83, + "grad_norm": 0.890625, + "learning_rate": 0.00028065253407912803, + "loss": 0.2044, + "step": 333860 + }, + { + "epoch": 13.83, + "grad_norm": 1.1484375, + "learning_rate": 0.00028064177068341047, + "loss": 0.2301, + "step": 333870 + }, + { + "epoch": 13.83, + "grad_norm": 0.357421875, + "learning_rate": 0.00028063100723002836, + "loss": 0.1822, + "step": 333880 + }, + { + "epoch": 13.83, + "grad_norm": 0.90234375, + "learning_rate": 0.00028062024371900215, + "loss": 0.2216, + "step": 333890 + }, + { + "epoch": 13.83, + "grad_norm": 0.90625, + "learning_rate": 0.0002806094801503518, + "loss": 0.1656, + "step": 333900 + }, + { + "epoch": 13.83, + "grad_norm": 1.03125, + "learning_rate": 0.0002805987165240977, + "loss": 0.2084, + "step": 333910 + }, + { + "epoch": 13.83, + "grad_norm": 0.470703125, + "learning_rate": 0.0002805879528402603, + "loss": 0.1979, + "step": 333920 + }, + { + "epoch": 13.83, + "grad_norm": 1.1875, + "learning_rate": 0.0002805771890988595, + "loss": 0.2293, + "step": 333930 + }, + { + "epoch": 13.83, + "grad_norm": 0.73046875, + "learning_rate": 0.0002805664252999158, + "loss": 0.1674, + "step": 333940 + }, + { + "epoch": 13.83, + "grad_norm": 1.2734375, + "learning_rate": 0.00028055566144344937, + "loss": 0.2189, + "step": 333950 + }, + { + "epoch": 13.83, + "grad_norm": 0.90625, + "learning_rate": 0.00028054489752948045, + "loss": 0.219, + "step": 333960 + }, + { + "epoch": 13.83, + "grad_norm": 0.74609375, + "learning_rate": 0.0002805341335580294, + "loss": 0.2639, + "step": 333970 + }, + { + "epoch": 13.83, + "grad_norm": 0.65234375, + "learning_rate": 0.00028052336952911637, + "loss": 0.1915, + "step": 333980 + }, + { + "epoch": 13.83, + "grad_norm": 0.625, + "learning_rate": 0.0002805126054427617, + "loss": 0.2027, + "step": 333990 + }, + { + "epoch": 13.83, + "grad_norm": 0.0, + "learning_rate": 0.0002805018412989856, + "loss": 0.2027, + "step": 334000 + }, + { + "epoch": 13.83, + "grad_norm": 1.03125, + "learning_rate": 0.0002804910770978082, + "loss": 0.1766, + "step": 334010 + }, + { + "epoch": 13.84, + "grad_norm": 1.2421875, + "learning_rate": 0.00028048031283925, + "loss": 0.2301, + "step": 334020 + }, + { + "epoch": 13.84, + "grad_norm": 0.314453125, + "learning_rate": 0.0002804695485233312, + "loss": 0.2018, + "step": 334030 + }, + { + "epoch": 13.84, + "grad_norm": 1.34375, + "learning_rate": 0.0002804587841500719, + "loss": 0.2038, + "step": 334040 + }, + { + "epoch": 13.84, + "grad_norm": 0.66796875, + "learning_rate": 0.00028044801971949246, + "loss": 0.1369, + "step": 334050 + }, + { + "epoch": 13.84, + "grad_norm": 0.94140625, + "learning_rate": 0.00028043725523161316, + "loss": 0.1427, + "step": 334060 + }, + { + "epoch": 13.84, + "grad_norm": 0.61328125, + "learning_rate": 0.0002804264906864542, + "loss": 0.1976, + "step": 334070 + }, + { + "epoch": 13.84, + "grad_norm": 1.0390625, + "learning_rate": 0.000280415726084036, + "loss": 0.1741, + "step": 334080 + }, + { + "epoch": 13.84, + "grad_norm": 1.0859375, + "learning_rate": 0.00028040496142437847, + "loss": 0.1824, + "step": 334090 + }, + { + "epoch": 13.84, + "grad_norm": 1.0390625, + "learning_rate": 0.0002803941967075023, + "loss": 0.1983, + "step": 334100 + }, + { + "epoch": 13.84, + "grad_norm": 0.88671875, + "learning_rate": 0.0002803834319334274, + "loss": 0.1886, + "step": 334110 + }, + { + "epoch": 13.84, + "grad_norm": 1.21875, + "learning_rate": 0.0002803726671021742, + "loss": 0.1308, + "step": 334120 + }, + { + "epoch": 13.84, + "grad_norm": 1.1171875, + "learning_rate": 0.000280361902213763, + "loss": 0.1999, + "step": 334130 + }, + { + "epoch": 13.84, + "grad_norm": 0.6484375, + "learning_rate": 0.0002803511372682138, + "loss": 0.1699, + "step": 334140 + }, + { + "epoch": 13.84, + "grad_norm": 0.984375, + "learning_rate": 0.0002803403722655471, + "loss": 0.1941, + "step": 334150 + }, + { + "epoch": 13.84, + "grad_norm": 0.55859375, + "learning_rate": 0.0002803296072057832, + "loss": 0.2347, + "step": 334160 + }, + { + "epoch": 13.84, + "grad_norm": 1.0546875, + "learning_rate": 0.00028031884208894214, + "loss": 0.1779, + "step": 334170 + }, + { + "epoch": 13.84, + "grad_norm": 0.92578125, + "learning_rate": 0.0002803080769150444, + "loss": 0.1857, + "step": 334180 + }, + { + "epoch": 13.84, + "grad_norm": 1.046875, + "learning_rate": 0.00028029731168411004, + "loss": 0.2249, + "step": 334190 + }, + { + "epoch": 13.84, + "grad_norm": 0.44921875, + "learning_rate": 0.0002802865463961594, + "loss": 0.1334, + "step": 334200 + }, + { + "epoch": 13.84, + "grad_norm": 0.6796875, + "learning_rate": 0.0002802757810512129, + "loss": 0.199, + "step": 334210 + }, + { + "epoch": 13.84, + "grad_norm": 1.390625, + "learning_rate": 0.00028026501564929043, + "loss": 0.1974, + "step": 334220 + }, + { + "epoch": 13.84, + "grad_norm": 0.361328125, + "learning_rate": 0.0002802542501904126, + "loss": 0.2199, + "step": 334230 + }, + { + "epoch": 13.84, + "grad_norm": 0.42578125, + "learning_rate": 0.0002802434846745996, + "loss": 0.1932, + "step": 334240 + }, + { + "epoch": 13.84, + "grad_norm": 0.61328125, + "learning_rate": 0.00028023271910187147, + "loss": 0.2093, + "step": 334250 + }, + { + "epoch": 13.85, + "grad_norm": 0.69140625, + "learning_rate": 0.0002802219534722488, + "loss": 0.1833, + "step": 334260 + }, + { + "epoch": 13.85, + "grad_norm": 0.474609375, + "learning_rate": 0.0002802111877857515, + "loss": 0.1564, + "step": 334270 + }, + { + "epoch": 13.85, + "grad_norm": 0.6640625, + "learning_rate": 0.0002802004220424001, + "loss": 0.1622, + "step": 334280 + }, + { + "epoch": 13.85, + "grad_norm": 0.337890625, + "learning_rate": 0.00028018965624221484, + "loss": 0.2478, + "step": 334290 + }, + { + "epoch": 13.85, + "grad_norm": 0.51953125, + "learning_rate": 0.0002801788903852157, + "loss": 0.1221, + "step": 334300 + }, + { + "epoch": 13.85, + "grad_norm": 0.72265625, + "learning_rate": 0.0002801681244714233, + "loss": 0.2178, + "step": 334310 + }, + { + "epoch": 13.85, + "grad_norm": 0.515625, + "learning_rate": 0.00028015735850085775, + "loss": 0.2122, + "step": 334320 + }, + { + "epoch": 13.85, + "grad_norm": 0.72265625, + "learning_rate": 0.00028014659247353925, + "loss": 0.2189, + "step": 334330 + }, + { + "epoch": 13.85, + "grad_norm": 0.5078125, + "learning_rate": 0.0002801358263894882, + "loss": 0.2247, + "step": 334340 + }, + { + "epoch": 13.85, + "grad_norm": 1.578125, + "learning_rate": 0.0002801250602487247, + "loss": 0.2162, + "step": 334350 + }, + { + "epoch": 13.85, + "grad_norm": 0.94140625, + "learning_rate": 0.0002801142940512691, + "loss": 0.2008, + "step": 334360 + }, + { + "epoch": 13.85, + "grad_norm": 0.609375, + "learning_rate": 0.00028010352779714174, + "loss": 0.2223, + "step": 334370 + }, + { + "epoch": 13.85, + "grad_norm": 0.7109375, + "learning_rate": 0.00028009276148636264, + "loss": 0.1997, + "step": 334380 + }, + { + "epoch": 13.85, + "grad_norm": 0.80078125, + "learning_rate": 0.0002800819951189523, + "loss": 0.1692, + "step": 334390 + }, + { + "epoch": 13.85, + "grad_norm": 0.71875, + "learning_rate": 0.0002800712286949309, + "loss": 0.1939, + "step": 334400 + }, + { + "epoch": 13.85, + "grad_norm": 1.3125, + "learning_rate": 0.00028006046221431863, + "loss": 0.1987, + "step": 334410 + }, + { + "epoch": 13.85, + "grad_norm": 0.86328125, + "learning_rate": 0.00028004969567713595, + "loss": 0.219, + "step": 334420 + }, + { + "epoch": 13.85, + "grad_norm": 0.66015625, + "learning_rate": 0.0002800389290834029, + "loss": 0.1427, + "step": 334430 + }, + { + "epoch": 13.85, + "grad_norm": 1.171875, + "learning_rate": 0.0002800281624331398, + "loss": 0.2, + "step": 334440 + }, + { + "epoch": 13.85, + "grad_norm": 1.515625, + "learning_rate": 0.000280017395726367, + "loss": 0.2268, + "step": 334450 + }, + { + "epoch": 13.85, + "grad_norm": 0.8671875, + "learning_rate": 0.00028000662896310465, + "loss": 0.141, + "step": 334460 + }, + { + "epoch": 13.85, + "grad_norm": 0.6484375, + "learning_rate": 0.0002799958621433731, + "loss": 0.1832, + "step": 334470 + }, + { + "epoch": 13.85, + "grad_norm": 1.5390625, + "learning_rate": 0.0002799850952671926, + "loss": 0.1784, + "step": 334480 + }, + { + "epoch": 13.85, + "grad_norm": 1.375, + "learning_rate": 0.0002799743283345833, + "loss": 0.2396, + "step": 334490 + }, + { + "epoch": 13.85, + "grad_norm": 0.7734375, + "learning_rate": 0.00027996356134556567, + "loss": 0.207, + "step": 334500 + }, + { + "epoch": 13.86, + "grad_norm": 1.34375, + "learning_rate": 0.00027995279430015975, + "loss": 0.1409, + "step": 334510 + }, + { + "epoch": 13.86, + "grad_norm": 0.5625, + "learning_rate": 0.00027994202719838594, + "loss": 0.2278, + "step": 334520 + }, + { + "epoch": 13.86, + "grad_norm": 0.408203125, + "learning_rate": 0.0002799312600402645, + "loss": 0.2461, + "step": 334530 + }, + { + "epoch": 13.86, + "grad_norm": 1.140625, + "learning_rate": 0.00027992049282581555, + "loss": 0.2147, + "step": 334540 + }, + { + "epoch": 13.86, + "grad_norm": 1.3828125, + "learning_rate": 0.0002799097255550596, + "loss": 0.2047, + "step": 334550 + }, + { + "epoch": 13.86, + "grad_norm": 0.2236328125, + "learning_rate": 0.00027989895822801673, + "loss": 0.2319, + "step": 334560 + }, + { + "epoch": 13.86, + "grad_norm": 0.43359375, + "learning_rate": 0.0002798881908447072, + "loss": 0.1704, + "step": 334570 + }, + { + "epoch": 13.86, + "grad_norm": 2.59375, + "learning_rate": 0.0002798774234051514, + "loss": 0.2479, + "step": 334580 + }, + { + "epoch": 13.86, + "grad_norm": 0.8671875, + "learning_rate": 0.00027986665590936955, + "loss": 0.1688, + "step": 334590 + }, + { + "epoch": 13.86, + "grad_norm": 1.265625, + "learning_rate": 0.0002798558883573818, + "loss": 0.1811, + "step": 334600 + }, + { + "epoch": 13.86, + "grad_norm": 0.9375, + "learning_rate": 0.0002798451207492085, + "loss": 0.2567, + "step": 334610 + }, + { + "epoch": 13.86, + "grad_norm": 0.9921875, + "learning_rate": 0.0002798343530848699, + "loss": 0.1717, + "step": 334620 + }, + { + "epoch": 13.86, + "grad_norm": 0.875, + "learning_rate": 0.0002798235853643863, + "loss": 0.1782, + "step": 334630 + }, + { + "epoch": 13.86, + "grad_norm": 0.82421875, + "learning_rate": 0.00027981281758777786, + "loss": 0.1695, + "step": 334640 + }, + { + "epoch": 13.86, + "grad_norm": 1.2265625, + "learning_rate": 0.000279802049755065, + "loss": 0.1715, + "step": 334650 + }, + { + "epoch": 13.86, + "grad_norm": 0.2080078125, + "learning_rate": 0.0002797912818662679, + "loss": 0.1856, + "step": 334660 + }, + { + "epoch": 13.86, + "grad_norm": 0.6640625, + "learning_rate": 0.0002797805139214068, + "loss": 0.2055, + "step": 334670 + }, + { + "epoch": 13.86, + "grad_norm": 1.015625, + "learning_rate": 0.00027976974592050193, + "loss": 0.2358, + "step": 334680 + }, + { + "epoch": 13.86, + "grad_norm": 0.8203125, + "learning_rate": 0.0002797589778635737, + "loss": 0.1696, + "step": 334690 + }, + { + "epoch": 13.86, + "grad_norm": 1.5078125, + "learning_rate": 0.00027974820975064217, + "loss": 0.16, + "step": 334700 + }, + { + "epoch": 13.86, + "grad_norm": 0.43359375, + "learning_rate": 0.00027973744158172784, + "loss": 0.213, + "step": 334710 + }, + { + "epoch": 13.86, + "grad_norm": 0.66015625, + "learning_rate": 0.00027972667335685085, + "loss": 0.2309, + "step": 334720 + }, + { + "epoch": 13.86, + "grad_norm": 1.125, + "learning_rate": 0.00027971590507603147, + "loss": 0.1953, + "step": 334730 + }, + { + "epoch": 13.86, + "grad_norm": 0.73046875, + "learning_rate": 0.0002797051367392899, + "loss": 0.2455, + "step": 334740 + }, + { + "epoch": 13.87, + "grad_norm": 0.67578125, + "learning_rate": 0.0002796943683466465, + "loss": 0.1844, + "step": 334750 + }, + { + "epoch": 13.87, + "grad_norm": 0.85546875, + "learning_rate": 0.00027968359989812155, + "loss": 0.2074, + "step": 334760 + }, + { + "epoch": 13.87, + "grad_norm": 0.75390625, + "learning_rate": 0.00027967283139373523, + "loss": 0.1668, + "step": 334770 + }, + { + "epoch": 13.87, + "grad_norm": 1.5859375, + "learning_rate": 0.00027966206283350784, + "loss": 0.1588, + "step": 334780 + }, + { + "epoch": 13.87, + "grad_norm": 0.55078125, + "learning_rate": 0.0002796512942174597, + "loss": 0.1884, + "step": 334790 + }, + { + "epoch": 13.87, + "grad_norm": 1.1328125, + "learning_rate": 0.000279640525545611, + "loss": 0.2084, + "step": 334800 + }, + { + "epoch": 13.87, + "grad_norm": 2.140625, + "learning_rate": 0.00027962975681798197, + "loss": 0.2014, + "step": 334810 + }, + { + "epoch": 13.87, + "grad_norm": 0.9453125, + "learning_rate": 0.00027961898803459303, + "loss": 0.194, + "step": 334820 + }, + { + "epoch": 13.87, + "grad_norm": 0.84765625, + "learning_rate": 0.00027960821919546433, + "loss": 0.2021, + "step": 334830 + }, + { + "epoch": 13.87, + "grad_norm": 0.9375, + "learning_rate": 0.0002795974503006161, + "loss": 0.1659, + "step": 334840 + }, + { + "epoch": 13.87, + "grad_norm": 0.83203125, + "learning_rate": 0.0002795866813500687, + "loss": 0.2603, + "step": 334850 + }, + { + "epoch": 13.87, + "grad_norm": 1.3671875, + "learning_rate": 0.00027957591234384235, + "loss": 0.1721, + "step": 334860 + }, + { + "epoch": 13.87, + "grad_norm": 0.65625, + "learning_rate": 0.0002795651432819574, + "loss": 0.2228, + "step": 334870 + }, + { + "epoch": 13.87, + "grad_norm": 1.3125, + "learning_rate": 0.000279554374164434, + "loss": 0.1724, + "step": 334880 + }, + { + "epoch": 13.87, + "grad_norm": 1.375, + "learning_rate": 0.0002795436049912924, + "loss": 0.1398, + "step": 334890 + }, + { + "epoch": 13.87, + "grad_norm": 1.890625, + "learning_rate": 0.000279532835762553, + "loss": 0.2173, + "step": 334900 + }, + { + "epoch": 13.87, + "grad_norm": 0.515625, + "learning_rate": 0.00027952206647823593, + "loss": 0.1784, + "step": 334910 + }, + { + "epoch": 13.87, + "grad_norm": 0.93359375, + "learning_rate": 0.0002795112971383615, + "loss": 0.2326, + "step": 334920 + }, + { + "epoch": 13.87, + "grad_norm": 0.81640625, + "learning_rate": 0.0002795005277429501, + "loss": 0.1498, + "step": 334930 + }, + { + "epoch": 13.87, + "grad_norm": 0.890625, + "learning_rate": 0.00027948975829202185, + "loss": 0.2273, + "step": 334940 + }, + { + "epoch": 13.87, + "grad_norm": 0.67578125, + "learning_rate": 0.00027947898878559707, + "loss": 0.1944, + "step": 334950 + }, + { + "epoch": 13.87, + "grad_norm": 1.0390625, + "learning_rate": 0.00027946821922369595, + "loss": 0.1615, + "step": 334960 + }, + { + "epoch": 13.87, + "grad_norm": 0.94140625, + "learning_rate": 0.0002794574496063389, + "loss": 0.1454, + "step": 334970 + }, + { + "epoch": 13.87, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002794466799335461, + "loss": 0.2127, + "step": 334980 + }, + { + "epoch": 13.88, + "grad_norm": 0.07373046875, + "learning_rate": 0.0002794359102053378, + "loss": 0.184, + "step": 334990 + }, + { + "epoch": 13.88, + "grad_norm": 1.171875, + "learning_rate": 0.0002794251404217343, + "loss": 0.2175, + "step": 335000 + }, + { + "epoch": 13.88, + "grad_norm": 0.46484375, + "learning_rate": 0.0002794143705827558, + "loss": 0.1478, + "step": 335010 + }, + { + "epoch": 13.88, + "grad_norm": 0.87890625, + "learning_rate": 0.00027940360068842273, + "loss": 0.1652, + "step": 335020 + }, + { + "epoch": 13.88, + "grad_norm": 0.46484375, + "learning_rate": 0.0002793928307387553, + "loss": 0.1957, + "step": 335030 + }, + { + "epoch": 13.88, + "grad_norm": 0.53515625, + "learning_rate": 0.0002793820607337736, + "loss": 0.1783, + "step": 335040 + }, + { + "epoch": 13.88, + "grad_norm": 1.359375, + "learning_rate": 0.00027937129067349807, + "loss": 0.1879, + "step": 335050 + }, + { + "epoch": 13.88, + "grad_norm": 0.35546875, + "learning_rate": 0.000279360520557949, + "loss": 0.173, + "step": 335060 + }, + { + "epoch": 13.88, + "grad_norm": 0.412109375, + "learning_rate": 0.00027934975038714654, + "loss": 0.2028, + "step": 335070 + }, + { + "epoch": 13.88, + "grad_norm": 0.86328125, + "learning_rate": 0.0002793389801611111, + "loss": 0.168, + "step": 335080 + }, + { + "epoch": 13.88, + "grad_norm": 1.65625, + "learning_rate": 0.0002793282098798628, + "loss": 0.1739, + "step": 335090 + }, + { + "epoch": 13.88, + "grad_norm": 0.69140625, + "learning_rate": 0.00027931743954342196, + "loss": 0.1597, + "step": 335100 + }, + { + "epoch": 13.88, + "grad_norm": 1.5625, + "learning_rate": 0.000279306669151809, + "loss": 0.2034, + "step": 335110 + }, + { + "epoch": 13.88, + "grad_norm": 0.69921875, + "learning_rate": 0.0002792958987050439, + "loss": 0.239, + "step": 335120 + }, + { + "epoch": 13.88, + "grad_norm": 0.5546875, + "learning_rate": 0.00027928512820314715, + "loss": 0.2077, + "step": 335130 + }, + { + "epoch": 13.88, + "grad_norm": 1.2578125, + "learning_rate": 0.000279274357646139, + "loss": 0.1501, + "step": 335140 + }, + { + "epoch": 13.88, + "grad_norm": 0.8203125, + "learning_rate": 0.0002792635870340395, + "loss": 0.1604, + "step": 335150 + }, + { + "epoch": 13.88, + "grad_norm": 1.3828125, + "learning_rate": 0.0002792528163668693, + "loss": 0.201, + "step": 335160 + }, + { + "epoch": 13.88, + "grad_norm": 0.55078125, + "learning_rate": 0.0002792420456446483, + "loss": 0.1915, + "step": 335170 + }, + { + "epoch": 13.88, + "grad_norm": 0.83984375, + "learning_rate": 0.000279231274867397, + "loss": 0.2356, + "step": 335180 + }, + { + "epoch": 13.88, + "grad_norm": 0.9140625, + "learning_rate": 0.00027922050403513557, + "loss": 0.1683, + "step": 335190 + }, + { + "epoch": 13.88, + "grad_norm": 0.58984375, + "learning_rate": 0.00027920973314788435, + "loss": 0.1903, + "step": 335200 + }, + { + "epoch": 13.88, + "grad_norm": 0.83984375, + "learning_rate": 0.0002791989622056635, + "loss": 0.1133, + "step": 335210 + }, + { + "epoch": 13.88, + "grad_norm": 0.875, + "learning_rate": 0.00027918819120849336, + "loss": 0.1947, + "step": 335220 + }, + { + "epoch": 13.89, + "grad_norm": 8.344650268554688e-05, + "learning_rate": 0.00027917742015639424, + "loss": 0.1846, + "step": 335230 + }, + { + "epoch": 13.89, + "grad_norm": 1.140625, + "learning_rate": 0.00027916664904938636, + "loss": 0.1866, + "step": 335240 + }, + { + "epoch": 13.89, + "grad_norm": 0.89453125, + "learning_rate": 0.00027915587788749003, + "loss": 0.1888, + "step": 335250 + }, + { + "epoch": 13.89, + "grad_norm": 0.466796875, + "learning_rate": 0.0002791451066707254, + "loss": 0.1856, + "step": 335260 + }, + { + "epoch": 13.89, + "grad_norm": 0.609375, + "learning_rate": 0.0002791343353991129, + "loss": 0.1732, + "step": 335270 + }, + { + "epoch": 13.89, + "grad_norm": 1.1328125, + "learning_rate": 0.0002791235640726727, + "loss": 0.1922, + "step": 335280 + }, + { + "epoch": 13.89, + "grad_norm": 0.474609375, + "learning_rate": 0.00027911279269142507, + "loss": 0.1768, + "step": 335290 + }, + { + "epoch": 13.89, + "grad_norm": 1.2890625, + "learning_rate": 0.00027910202125539034, + "loss": 0.1937, + "step": 335300 + }, + { + "epoch": 13.89, + "grad_norm": 1.265625, + "learning_rate": 0.00027909124976458876, + "loss": 0.2186, + "step": 335310 + }, + { + "epoch": 13.89, + "grad_norm": 0.890625, + "learning_rate": 0.0002790804782190406, + "loss": 0.2211, + "step": 335320 + }, + { + "epoch": 13.89, + "grad_norm": 0.56640625, + "learning_rate": 0.00027906970661876606, + "loss": 0.2189, + "step": 335330 + }, + { + "epoch": 13.89, + "grad_norm": 1.015625, + "learning_rate": 0.00027905893496378553, + "loss": 0.1687, + "step": 335340 + }, + { + "epoch": 13.89, + "grad_norm": 0.58203125, + "learning_rate": 0.0002790481632541192, + "loss": 0.1309, + "step": 335350 + }, + { + "epoch": 13.89, + "grad_norm": 0.9609375, + "learning_rate": 0.0002790373914897874, + "loss": 0.204, + "step": 335360 + }, + { + "epoch": 13.89, + "grad_norm": 0.373046875, + "learning_rate": 0.00027902661967081025, + "loss": 0.1627, + "step": 335370 + }, + { + "epoch": 13.89, + "grad_norm": 1.2890625, + "learning_rate": 0.0002790158477972082, + "loss": 0.1815, + "step": 335380 + }, + { + "epoch": 13.89, + "grad_norm": 1.015625, + "learning_rate": 0.00027900507586900154, + "loss": 0.1792, + "step": 335390 + }, + { + "epoch": 13.89, + "grad_norm": 0.8828125, + "learning_rate": 0.0002789943038862103, + "loss": 0.1877, + "step": 335400 + }, + { + "epoch": 13.89, + "grad_norm": 1.1015625, + "learning_rate": 0.0002789835318488551, + "loss": 0.206, + "step": 335410 + }, + { + "epoch": 13.89, + "grad_norm": 0.91796875, + "learning_rate": 0.0002789727597569559, + "loss": 0.2027, + "step": 335420 + }, + { + "epoch": 13.89, + "grad_norm": 0.72265625, + "learning_rate": 0.0002789619876105331, + "loss": 0.1569, + "step": 335430 + }, + { + "epoch": 13.89, + "grad_norm": 1.1015625, + "learning_rate": 0.00027895121540960703, + "loss": 0.225, + "step": 335440 + }, + { + "epoch": 13.89, + "grad_norm": 0.91015625, + "learning_rate": 0.0002789404431541979, + "loss": 0.2077, + "step": 335450 + }, + { + "epoch": 13.89, + "grad_norm": 1.703125, + "learning_rate": 0.00027892967084432586, + "loss": 0.2337, + "step": 335460 + }, + { + "epoch": 13.9, + "grad_norm": 1.0, + "learning_rate": 0.0002789188984800114, + "loss": 0.2185, + "step": 335470 + }, + { + "epoch": 13.9, + "grad_norm": 0.515625, + "learning_rate": 0.0002789081260612746, + "loss": 0.1966, + "step": 335480 + }, + { + "epoch": 13.9, + "grad_norm": 1.4453125, + "learning_rate": 0.000278897353588136, + "loss": 0.2225, + "step": 335490 + }, + { + "epoch": 13.9, + "grad_norm": 1.2265625, + "learning_rate": 0.0002788865810606156, + "loss": 0.1834, + "step": 335500 + }, + { + "epoch": 13.9, + "grad_norm": 0.7578125, + "learning_rate": 0.00027887580847873376, + "loss": 0.2095, + "step": 335510 + }, + { + "epoch": 13.9, + "grad_norm": 0.82421875, + "learning_rate": 0.0002788650358425108, + "loss": 0.1423, + "step": 335520 + }, + { + "epoch": 13.9, + "grad_norm": 0.50390625, + "learning_rate": 0.00027885426315196694, + "loss": 0.2145, + "step": 335530 + }, + { + "epoch": 13.9, + "grad_norm": 0.65625, + "learning_rate": 0.0002788434904071225, + "loss": 0.2068, + "step": 335540 + }, + { + "epoch": 13.9, + "grad_norm": 1.1875, + "learning_rate": 0.00027883271760799767, + "loss": 0.2389, + "step": 335550 + }, + { + "epoch": 13.9, + "grad_norm": 0.5, + "learning_rate": 0.0002788219447546128, + "loss": 0.2167, + "step": 335560 + }, + { + "epoch": 13.9, + "grad_norm": 0.66796875, + "learning_rate": 0.00027881117184698826, + "loss": 0.185, + "step": 335570 + }, + { + "epoch": 13.9, + "grad_norm": 1.234375, + "learning_rate": 0.00027880039888514405, + "loss": 0.1787, + "step": 335580 + }, + { + "epoch": 13.9, + "grad_norm": 0.26953125, + "learning_rate": 0.0002787896258691006, + "loss": 0.1852, + "step": 335590 + }, + { + "epoch": 13.9, + "grad_norm": 0.48828125, + "learning_rate": 0.0002787788527988783, + "loss": 0.1944, + "step": 335600 + }, + { + "epoch": 13.9, + "grad_norm": 0.419921875, + "learning_rate": 0.0002787680796744972, + "loss": 0.2103, + "step": 335610 + }, + { + "epoch": 13.9, + "grad_norm": 0.75390625, + "learning_rate": 0.00027875730649597777, + "loss": 0.2357, + "step": 335620 + }, + { + "epoch": 13.9, + "grad_norm": 1.0546875, + "learning_rate": 0.0002787465332633401, + "loss": 0.1892, + "step": 335630 + }, + { + "epoch": 13.9, + "grad_norm": 0.9453125, + "learning_rate": 0.0002787357599766046, + "loss": 0.185, + "step": 335640 + }, + { + "epoch": 13.9, + "grad_norm": 0.61328125, + "learning_rate": 0.00027872498663579156, + "loss": 0.1926, + "step": 335650 + }, + { + "epoch": 13.9, + "grad_norm": 0.90234375, + "learning_rate": 0.0002787142132409212, + "loss": 0.2685, + "step": 335660 + }, + { + "epoch": 13.9, + "grad_norm": 0.94140625, + "learning_rate": 0.00027870343979201367, + "loss": 0.1931, + "step": 335670 + }, + { + "epoch": 13.9, + "grad_norm": 0.283203125, + "learning_rate": 0.0002786926662890895, + "loss": 0.1571, + "step": 335680 + }, + { + "epoch": 13.9, + "grad_norm": 0.85546875, + "learning_rate": 0.00027868189273216865, + "loss": 0.1609, + "step": 335690 + }, + { + "epoch": 13.9, + "grad_norm": 0.8671875, + "learning_rate": 0.00027867111912127174, + "loss": 0.1857, + "step": 335700 + }, + { + "epoch": 13.91, + "grad_norm": 0.8125, + "learning_rate": 0.0002786603454564188, + "loss": 0.1802, + "step": 335710 + }, + { + "epoch": 13.91, + "grad_norm": 1.0625, + "learning_rate": 0.00027864957173763023, + "loss": 0.2024, + "step": 335720 + }, + { + "epoch": 13.91, + "grad_norm": 1.046875, + "learning_rate": 0.0002786387979649263, + "loss": 0.1607, + "step": 335730 + }, + { + "epoch": 13.91, + "grad_norm": 1.7265625, + "learning_rate": 0.0002786280241383271, + "loss": 0.2019, + "step": 335740 + }, + { + "epoch": 13.91, + "grad_norm": 1.40625, + "learning_rate": 0.00027861725025785317, + "loss": 0.2024, + "step": 335750 + }, + { + "epoch": 13.91, + "grad_norm": 0.8046875, + "learning_rate": 0.0002786064763235247, + "loss": 0.1725, + "step": 335760 + }, + { + "epoch": 13.91, + "grad_norm": 1.1796875, + "learning_rate": 0.0002785957023353618, + "loss": 0.196, + "step": 335770 + }, + { + "epoch": 13.91, + "grad_norm": 0.74609375, + "learning_rate": 0.000278584928293385, + "loss": 0.183, + "step": 335780 + }, + { + "epoch": 13.91, + "grad_norm": 1.046875, + "learning_rate": 0.0002785741541976144, + "loss": 0.1706, + "step": 335790 + }, + { + "epoch": 13.91, + "grad_norm": 0.78515625, + "learning_rate": 0.0002785633800480703, + "loss": 0.1893, + "step": 335800 + }, + { + "epoch": 13.91, + "grad_norm": 0.7890625, + "learning_rate": 0.00027855260584477306, + "loss": 0.2629, + "step": 335810 + }, + { + "epoch": 13.91, + "grad_norm": 0.40625, + "learning_rate": 0.0002785418315877428, + "loss": 0.2056, + "step": 335820 + }, + { + "epoch": 13.91, + "grad_norm": 1.3671875, + "learning_rate": 0.00027853105727700003, + "loss": 0.1968, + "step": 335830 + }, + { + "epoch": 13.91, + "grad_norm": 1.265625, + "learning_rate": 0.00027852028291256483, + "loss": 0.1845, + "step": 335840 + }, + { + "epoch": 13.91, + "grad_norm": 1.9140625, + "learning_rate": 0.00027850950849445745, + "loss": 0.1504, + "step": 335850 + }, + { + "epoch": 13.91, + "grad_norm": 0.703125, + "learning_rate": 0.0002784987340226984, + "loss": 0.1898, + "step": 335860 + }, + { + "epoch": 13.91, + "grad_norm": 0.78515625, + "learning_rate": 0.00027848795949730773, + "loss": 0.2588, + "step": 335870 + }, + { + "epoch": 13.91, + "grad_norm": 0.4453125, + "learning_rate": 0.0002784771849183058, + "loss": 0.1692, + "step": 335880 + }, + { + "epoch": 13.91, + "grad_norm": 0.478515625, + "learning_rate": 0.0002784664102857129, + "loss": 0.2029, + "step": 335890 + }, + { + "epoch": 13.91, + "grad_norm": 0.5859375, + "learning_rate": 0.0002784556355995492, + "loss": 0.2044, + "step": 335900 + }, + { + "epoch": 13.91, + "grad_norm": 0.765625, + "learning_rate": 0.0002784448608598352, + "loss": 0.2016, + "step": 335910 + }, + { + "epoch": 13.91, + "grad_norm": 0.52734375, + "learning_rate": 0.00027843408606659107, + "loss": 0.1528, + "step": 335920 + }, + { + "epoch": 13.91, + "grad_norm": 0.90234375, + "learning_rate": 0.0002784233112198369, + "loss": 0.1537, + "step": 335930 + }, + { + "epoch": 13.91, + "grad_norm": 0.984375, + "learning_rate": 0.00027841253631959325, + "loss": 0.1634, + "step": 335940 + }, + { + "epoch": 13.92, + "grad_norm": 1.015625, + "learning_rate": 0.00027840176136588023, + "loss": 0.2507, + "step": 335950 + }, + { + "epoch": 13.92, + "grad_norm": 0.484375, + "learning_rate": 0.0002783909863587181, + "loss": 0.1694, + "step": 335960 + }, + { + "epoch": 13.92, + "grad_norm": 0.609375, + "learning_rate": 0.0002783802112981273, + "loss": 0.1871, + "step": 335970 + }, + { + "epoch": 13.92, + "grad_norm": 1.2734375, + "learning_rate": 0.000278369436184128, + "loss": 0.1664, + "step": 335980 + }, + { + "epoch": 13.92, + "grad_norm": 0.84375, + "learning_rate": 0.00027835866101674046, + "loss": 0.1941, + "step": 335990 + }, + { + "epoch": 13.92, + "grad_norm": 0.59765625, + "learning_rate": 0.00027834788579598503, + "loss": 0.209, + "step": 336000 + }, + { + "epoch": 13.92, + "grad_norm": 0.671875, + "learning_rate": 0.0002783371105218818, + "loss": 0.2089, + "step": 336010 + }, + { + "epoch": 13.92, + "grad_norm": 1.40625, + "learning_rate": 0.0002783263351944513, + "loss": 0.2127, + "step": 336020 + }, + { + "epoch": 13.92, + "grad_norm": 0.41796875, + "learning_rate": 0.0002783155598137137, + "loss": 0.1802, + "step": 336030 + }, + { + "epoch": 13.92, + "grad_norm": 0.70703125, + "learning_rate": 0.00027830478437968924, + "loss": 0.2116, + "step": 336040 + }, + { + "epoch": 13.92, + "grad_norm": 0.859375, + "learning_rate": 0.0002782940088923983, + "loss": 0.2125, + "step": 336050 + }, + { + "epoch": 13.92, + "grad_norm": 0.90234375, + "learning_rate": 0.00027828323335186094, + "loss": 0.1965, + "step": 336060 + }, + { + "epoch": 13.92, + "grad_norm": 1.796875, + "learning_rate": 0.00027827245775809774, + "loss": 0.2051, + "step": 336070 + }, + { + "epoch": 13.92, + "grad_norm": 0.87109375, + "learning_rate": 0.00027826168211112884, + "loss": 0.2278, + "step": 336080 + }, + { + "epoch": 13.92, + "grad_norm": 0.78125, + "learning_rate": 0.00027825090641097434, + "loss": 0.181, + "step": 336090 + }, + { + "epoch": 13.92, + "grad_norm": 1.0703125, + "learning_rate": 0.00027824013065765485, + "loss": 0.1891, + "step": 336100 + }, + { + "epoch": 13.92, + "grad_norm": 0.84375, + "learning_rate": 0.0002782293548511904, + "loss": 0.1859, + "step": 336110 + }, + { + "epoch": 13.92, + "grad_norm": 1.1484375, + "learning_rate": 0.00027821857899160135, + "loss": 0.2149, + "step": 336120 + }, + { + "epoch": 13.92, + "grad_norm": 0.84375, + "learning_rate": 0.00027820780307890805, + "loss": 0.2021, + "step": 336130 + }, + { + "epoch": 13.92, + "grad_norm": 0.7265625, + "learning_rate": 0.0002781970271131305, + "loss": 0.2053, + "step": 336140 + }, + { + "epoch": 13.92, + "grad_norm": 0.6953125, + "learning_rate": 0.00027818625109428944, + "loss": 0.21, + "step": 336150 + }, + { + "epoch": 13.92, + "grad_norm": 0.455078125, + "learning_rate": 0.0002781754750224048, + "loss": 0.1808, + "step": 336160 + }, + { + "epoch": 13.92, + "grad_norm": 0.1796875, + "learning_rate": 0.00027816469889749697, + "loss": 0.1883, + "step": 336170 + }, + { + "epoch": 13.92, + "grad_norm": 0.345703125, + "learning_rate": 0.0002781539227195863, + "loss": 0.1894, + "step": 336180 + }, + { + "epoch": 13.92, + "grad_norm": 1.109375, + "learning_rate": 0.0002781431464886929, + "loss": 0.2191, + "step": 336190 + }, + { + "epoch": 13.93, + "grad_norm": 1.359375, + "learning_rate": 0.0002781323702048371, + "loss": 0.2267, + "step": 336200 + }, + { + "epoch": 13.93, + "grad_norm": 0.81640625, + "learning_rate": 0.0002781215938680393, + "loss": 0.2392, + "step": 336210 + }, + { + "epoch": 13.93, + "grad_norm": 1.0234375, + "learning_rate": 0.00027811081747831955, + "loss": 0.2099, + "step": 336220 + }, + { + "epoch": 13.93, + "grad_norm": 0.45703125, + "learning_rate": 0.0002781000410356985, + "loss": 0.1822, + "step": 336230 + }, + { + "epoch": 13.93, + "grad_norm": 1.8984375, + "learning_rate": 0.00027808926454019604, + "loss": 0.1976, + "step": 336240 + }, + { + "epoch": 13.93, + "grad_norm": 0.78125, + "learning_rate": 0.00027807848799183267, + "loss": 0.2233, + "step": 336250 + }, + { + "epoch": 13.93, + "grad_norm": 1.046875, + "learning_rate": 0.00027806771139062865, + "loss": 0.1781, + "step": 336260 + }, + { + "epoch": 13.93, + "grad_norm": 1.734375, + "learning_rate": 0.0002780569347366042, + "loss": 0.1761, + "step": 336270 + }, + { + "epoch": 13.93, + "grad_norm": 0.54296875, + "learning_rate": 0.00027804615802977964, + "loss": 0.2115, + "step": 336280 + }, + { + "epoch": 13.93, + "grad_norm": 0.8671875, + "learning_rate": 0.0002780353812701752, + "loss": 0.2051, + "step": 336290 + }, + { + "epoch": 13.93, + "grad_norm": 0.3515625, + "learning_rate": 0.0002780246044578112, + "loss": 0.1654, + "step": 336300 + }, + { + "epoch": 13.93, + "grad_norm": 0.6171875, + "learning_rate": 0.000278013827592708, + "loss": 0.2394, + "step": 336310 + }, + { + "epoch": 13.93, + "grad_norm": 1.2265625, + "learning_rate": 0.0002780030506748857, + "loss": 0.1806, + "step": 336320 + }, + { + "epoch": 13.93, + "grad_norm": 0.890625, + "learning_rate": 0.0002779922737043647, + "loss": 0.19, + "step": 336330 + }, + { + "epoch": 13.93, + "grad_norm": 1.3359375, + "learning_rate": 0.00027798149668116533, + "loss": 0.1982, + "step": 336340 + }, + { + "epoch": 13.93, + "grad_norm": 1.671875, + "learning_rate": 0.0002779707196053078, + "loss": 0.1905, + "step": 336350 + }, + { + "epoch": 13.93, + "grad_norm": 0.765625, + "learning_rate": 0.00027795994247681235, + "loss": 0.2071, + "step": 336360 + }, + { + "epoch": 13.93, + "grad_norm": 0.6953125, + "learning_rate": 0.0002779491652956993, + "loss": 0.1641, + "step": 336370 + }, + { + "epoch": 13.93, + "grad_norm": 3.25, + "learning_rate": 0.00027793838806198894, + "loss": 0.2168, + "step": 336380 + }, + { + "epoch": 13.93, + "grad_norm": 0.8515625, + "learning_rate": 0.00027792761077570163, + "loss": 0.1635, + "step": 336390 + }, + { + "epoch": 13.93, + "grad_norm": 0.3515625, + "learning_rate": 0.00027791683343685746, + "loss": 0.1869, + "step": 336400 + }, + { + "epoch": 13.93, + "grad_norm": 1.3203125, + "learning_rate": 0.00027790605604547687, + "loss": 0.2101, + "step": 336410 + }, + { + "epoch": 13.93, + "grad_norm": 0.427734375, + "learning_rate": 0.00027789527860158013, + "loss": 0.1926, + "step": 336420 + }, + { + "epoch": 13.93, + "grad_norm": 0.84375, + "learning_rate": 0.0002778845011051875, + "loss": 0.2079, + "step": 336430 + }, + { + "epoch": 13.94, + "grad_norm": 0.67578125, + "learning_rate": 0.0002778737235563192, + "loss": 0.1997, + "step": 336440 + }, + { + "epoch": 13.94, + "grad_norm": 0.44921875, + "learning_rate": 0.00027786294595499557, + "loss": 0.1653, + "step": 336450 + }, + { + "epoch": 13.94, + "grad_norm": 1.3125, + "learning_rate": 0.0002778521683012368, + "loss": 0.1826, + "step": 336460 + }, + { + "epoch": 13.94, + "grad_norm": 0.73046875, + "learning_rate": 0.0002778413905950634, + "loss": 0.1965, + "step": 336470 + }, + { + "epoch": 13.94, + "grad_norm": 0.85546875, + "learning_rate": 0.0002778306128364955, + "loss": 0.2137, + "step": 336480 + }, + { + "epoch": 13.94, + "grad_norm": 1.421875, + "learning_rate": 0.0002778198350255533, + "loss": 0.1966, + "step": 336490 + }, + { + "epoch": 13.94, + "grad_norm": 0.734375, + "learning_rate": 0.00027780905716225725, + "loss": 0.2126, + "step": 336500 + }, + { + "epoch": 13.94, + "grad_norm": 0.41796875, + "learning_rate": 0.00027779827924662756, + "loss": 0.1542, + "step": 336510 + }, + { + "epoch": 13.94, + "grad_norm": 0.25390625, + "learning_rate": 0.00027778750127868446, + "loss": 0.1396, + "step": 336520 + }, + { + "epoch": 13.94, + "grad_norm": 0.625, + "learning_rate": 0.0002777767232584483, + "loss": 0.2213, + "step": 336530 + }, + { + "epoch": 13.94, + "grad_norm": 0.86328125, + "learning_rate": 0.00027776594518593936, + "loss": 0.1936, + "step": 336540 + }, + { + "epoch": 13.94, + "grad_norm": 0.486328125, + "learning_rate": 0.0002777551670611779, + "loss": 0.166, + "step": 336550 + }, + { + "epoch": 13.94, + "grad_norm": 1.2265625, + "learning_rate": 0.0002777443888841842, + "loss": 0.2085, + "step": 336560 + }, + { + "epoch": 13.94, + "grad_norm": 0.9375, + "learning_rate": 0.0002777336106549785, + "loss": 0.2038, + "step": 336570 + }, + { + "epoch": 13.94, + "grad_norm": 1.0546875, + "learning_rate": 0.00027772283237358124, + "loss": 0.1982, + "step": 336580 + }, + { + "epoch": 13.94, + "grad_norm": 0.921875, + "learning_rate": 0.00027771205404001264, + "loss": 0.2169, + "step": 336590 + }, + { + "epoch": 13.94, + "grad_norm": 1.09375, + "learning_rate": 0.0002777012756542929, + "loss": 0.2075, + "step": 336600 + }, + { + "epoch": 13.94, + "grad_norm": 0.78125, + "learning_rate": 0.0002776904972164423, + "loss": 0.1753, + "step": 336610 + }, + { + "epoch": 13.94, + "grad_norm": 1.640625, + "learning_rate": 0.0002776797187264812, + "loss": 0.1823, + "step": 336620 + }, + { + "epoch": 13.94, + "grad_norm": 0.82421875, + "learning_rate": 0.0002776689401844299, + "loss": 0.2324, + "step": 336630 + }, + { + "epoch": 13.94, + "grad_norm": 0.5546875, + "learning_rate": 0.00027765816159030853, + "loss": 0.193, + "step": 336640 + }, + { + "epoch": 13.94, + "grad_norm": 0.87109375, + "learning_rate": 0.0002776473829441376, + "loss": 0.2156, + "step": 336650 + }, + { + "epoch": 13.94, + "grad_norm": 1.1640625, + "learning_rate": 0.00027763660424593724, + "loss": 0.1877, + "step": 336660 + }, + { + "epoch": 13.94, + "grad_norm": 0.92578125, + "learning_rate": 0.0002776258254957278, + "loss": 0.1856, + "step": 336670 + }, + { + "epoch": 13.95, + "grad_norm": 0.7890625, + "learning_rate": 0.0002776150466935295, + "loss": 0.1511, + "step": 336680 + }, + { + "epoch": 13.95, + "grad_norm": 0.41796875, + "learning_rate": 0.00027760426783936265, + "loss": 0.2024, + "step": 336690 + }, + { + "epoch": 13.95, + "grad_norm": 0.5625, + "learning_rate": 0.0002775934889332476, + "loss": 0.2284, + "step": 336700 + }, + { + "epoch": 13.95, + "grad_norm": 1.1015625, + "learning_rate": 0.0002775827099752045, + "loss": 0.2096, + "step": 336710 + }, + { + "epoch": 13.95, + "grad_norm": 0.4375, + "learning_rate": 0.00027757193096525384, + "loss": 0.1889, + "step": 336720 + }, + { + "epoch": 13.95, + "grad_norm": 0.84765625, + "learning_rate": 0.00027756115190341574, + "loss": 0.2125, + "step": 336730 + }, + { + "epoch": 13.95, + "grad_norm": 0.53125, + "learning_rate": 0.0002775503727897105, + "loss": 0.1884, + "step": 336740 + }, + { + "epoch": 13.95, + "grad_norm": 1.5078125, + "learning_rate": 0.0002775395936241585, + "loss": 0.1681, + "step": 336750 + }, + { + "epoch": 13.95, + "grad_norm": 1.890625, + "learning_rate": 0.0002775288144067799, + "loss": 0.218, + "step": 336760 + }, + { + "epoch": 13.95, + "grad_norm": 1.078125, + "learning_rate": 0.00027751803513759507, + "loss": 0.1741, + "step": 336770 + }, + { + "epoch": 13.95, + "grad_norm": 0.98828125, + "learning_rate": 0.00027750725581662415, + "loss": 0.1577, + "step": 336780 + }, + { + "epoch": 13.95, + "grad_norm": 0.205078125, + "learning_rate": 0.0002774964764438877, + "loss": 0.1672, + "step": 336790 + }, + { + "epoch": 13.95, + "grad_norm": 1.3671875, + "learning_rate": 0.00027748569701940585, + "loss": 0.1409, + "step": 336800 + }, + { + "epoch": 13.95, + "grad_norm": 0.52734375, + "learning_rate": 0.0002774749175431988, + "loss": 0.1988, + "step": 336810 + }, + { + "epoch": 13.95, + "grad_norm": 1.6328125, + "learning_rate": 0.0002774641380152869, + "loss": 0.2121, + "step": 336820 + }, + { + "epoch": 13.95, + "grad_norm": 0.80078125, + "learning_rate": 0.0002774533584356905, + "loss": 0.1829, + "step": 336830 + }, + { + "epoch": 13.95, + "grad_norm": 0.80859375, + "learning_rate": 0.0002774425788044299, + "loss": 0.235, + "step": 336840 + }, + { + "epoch": 13.95, + "grad_norm": 0.640625, + "learning_rate": 0.0002774317991215253, + "loss": 0.1972, + "step": 336850 + }, + { + "epoch": 13.95, + "grad_norm": 0.671875, + "learning_rate": 0.00027742101938699697, + "loss": 0.2285, + "step": 336860 + }, + { + "epoch": 13.95, + "grad_norm": 0.4375, + "learning_rate": 0.0002774102396008653, + "loss": 0.1932, + "step": 336870 + }, + { + "epoch": 13.95, + "grad_norm": 0.78125, + "learning_rate": 0.00027739945976315057, + "loss": 0.1927, + "step": 336880 + }, + { + "epoch": 13.95, + "grad_norm": 0.8828125, + "learning_rate": 0.0002773886798738729, + "loss": 0.2432, + "step": 336890 + }, + { + "epoch": 13.95, + "grad_norm": 1.0078125, + "learning_rate": 0.00027737789993305276, + "loss": 0.1834, + "step": 336900 + }, + { + "epoch": 13.95, + "grad_norm": 0.423828125, + "learning_rate": 0.0002773671199407103, + "loss": 0.1678, + "step": 336910 + }, + { + "epoch": 13.96, + "grad_norm": 0.2294921875, + "learning_rate": 0.0002773563398968659, + "loss": 0.2207, + "step": 336920 + }, + { + "epoch": 13.96, + "grad_norm": 0.8515625, + "learning_rate": 0.00027734555980153984, + "loss": 0.2198, + "step": 336930 + }, + { + "epoch": 13.96, + "grad_norm": 0.77734375, + "learning_rate": 0.0002773347796547524, + "loss": 0.1961, + "step": 336940 + }, + { + "epoch": 13.96, + "grad_norm": 1.6875, + "learning_rate": 0.0002773239994565239, + "loss": 0.1961, + "step": 336950 + }, + { + "epoch": 13.96, + "grad_norm": 0.68359375, + "learning_rate": 0.0002773132192068745, + "loss": 0.1859, + "step": 336960 + }, + { + "epoch": 13.96, + "grad_norm": 0.419921875, + "learning_rate": 0.0002773024389058246, + "loss": 0.1797, + "step": 336970 + }, + { + "epoch": 13.96, + "grad_norm": 0.65234375, + "learning_rate": 0.0002772916585533944, + "loss": 0.1793, + "step": 336980 + }, + { + "epoch": 13.96, + "grad_norm": 1.1328125, + "learning_rate": 0.00027728087814960433, + "loss": 0.2199, + "step": 336990 + }, + { + "epoch": 13.96, + "grad_norm": 1.09375, + "learning_rate": 0.0002772700976944746, + "loss": 0.2057, + "step": 337000 + }, + { + "epoch": 13.96, + "grad_norm": 1.8203125, + "learning_rate": 0.0002772593171880255, + "loss": 0.1605, + "step": 337010 + }, + { + "epoch": 13.96, + "grad_norm": 0.9296875, + "learning_rate": 0.0002772485366302773, + "loss": 0.2282, + "step": 337020 + }, + { + "epoch": 13.96, + "grad_norm": 0.000537872314453125, + "learning_rate": 0.0002772377560212502, + "loss": 0.2092, + "step": 337030 + }, + { + "epoch": 13.96, + "grad_norm": 1.4921875, + "learning_rate": 0.0002772269753609647, + "loss": 0.1846, + "step": 337040 + }, + { + "epoch": 13.96, + "grad_norm": 0.8515625, + "learning_rate": 0.0002772161946494409, + "loss": 0.157, + "step": 337050 + }, + { + "epoch": 13.96, + "grad_norm": 0.5, + "learning_rate": 0.00027720541388669915, + "loss": 0.1444, + "step": 337060 + }, + { + "epoch": 13.96, + "grad_norm": 1.09375, + "learning_rate": 0.0002771946330727598, + "loss": 0.2027, + "step": 337070 + }, + { + "epoch": 13.96, + "grad_norm": 0.7421875, + "learning_rate": 0.00027718385220764307, + "loss": 0.2208, + "step": 337080 + }, + { + "epoch": 13.96, + "grad_norm": 0.49609375, + "learning_rate": 0.0002771730712913693, + "loss": 0.2015, + "step": 337090 + }, + { + "epoch": 13.96, + "grad_norm": 0.84765625, + "learning_rate": 0.0002771622903239587, + "loss": 0.2059, + "step": 337100 + }, + { + "epoch": 13.96, + "grad_norm": 0.609375, + "learning_rate": 0.0002771515093054316, + "loss": 0.2361, + "step": 337110 + }, + { + "epoch": 13.96, + "grad_norm": 1.0, + "learning_rate": 0.00027714072823580835, + "loss": 0.1924, + "step": 337120 + }, + { + "epoch": 13.96, + "grad_norm": 0.466796875, + "learning_rate": 0.0002771299471151091, + "loss": 0.2434, + "step": 337130 + }, + { + "epoch": 13.96, + "grad_norm": 0.9375, + "learning_rate": 0.0002771191659433543, + "loss": 0.1566, + "step": 337140 + }, + { + "epoch": 13.96, + "grad_norm": 0.6015625, + "learning_rate": 0.00027710838472056403, + "loss": 0.1704, + "step": 337150 + }, + { + "epoch": 13.97, + "grad_norm": 0.609375, + "learning_rate": 0.00027709760344675884, + "loss": 0.1567, + "step": 337160 + }, + { + "epoch": 13.97, + "grad_norm": 0.51953125, + "learning_rate": 0.00027708682212195884, + "loss": 0.1934, + "step": 337170 + }, + { + "epoch": 13.97, + "grad_norm": 2.125, + "learning_rate": 0.0002770760407461844, + "loss": 0.1769, + "step": 337180 + }, + { + "epoch": 13.97, + "grad_norm": 0.90625, + "learning_rate": 0.0002770652593194557, + "loss": 0.207, + "step": 337190 + }, + { + "epoch": 13.97, + "grad_norm": 0.921875, + "learning_rate": 0.00027705447784179315, + "loss": 0.1898, + "step": 337200 + }, + { + "epoch": 13.97, + "grad_norm": 0.75, + "learning_rate": 0.000277043696313217, + "loss": 0.2088, + "step": 337210 + }, + { + "epoch": 13.97, + "grad_norm": 0.55859375, + "learning_rate": 0.00027703291473374754, + "loss": 0.2057, + "step": 337220 + }, + { + "epoch": 13.97, + "grad_norm": 0.7421875, + "learning_rate": 0.00027702213310340496, + "loss": 0.1761, + "step": 337230 + }, + { + "epoch": 13.97, + "grad_norm": 1.296875, + "learning_rate": 0.0002770113514222097, + "loss": 0.1747, + "step": 337240 + }, + { + "epoch": 13.97, + "grad_norm": 1.0390625, + "learning_rate": 0.00027700056969018214, + "loss": 0.176, + "step": 337250 + }, + { + "epoch": 13.97, + "grad_norm": 0.8046875, + "learning_rate": 0.00027698978790734224, + "loss": 0.1754, + "step": 337260 + }, + { + "epoch": 13.97, + "grad_norm": 1.1015625, + "learning_rate": 0.0002769790060737105, + "loss": 0.2251, + "step": 337270 + }, + { + "epoch": 13.97, + "grad_norm": 0.251953125, + "learning_rate": 0.0002769682241893073, + "loss": 0.1784, + "step": 337280 + }, + { + "epoch": 13.97, + "grad_norm": 0.62890625, + "learning_rate": 0.00027695744225415266, + "loss": 0.2286, + "step": 337290 + }, + { + "epoch": 13.97, + "grad_norm": 1.8515625, + "learning_rate": 0.0002769466602682671, + "loss": 0.1862, + "step": 337300 + }, + { + "epoch": 13.97, + "grad_norm": 1.3515625, + "learning_rate": 0.00027693587823167084, + "loss": 0.1476, + "step": 337310 + }, + { + "epoch": 13.97, + "grad_norm": 2.234375, + "learning_rate": 0.0002769250961443841, + "loss": 0.1453, + "step": 337320 + }, + { + "epoch": 13.97, + "grad_norm": 0.9453125, + "learning_rate": 0.00027691431400642737, + "loss": 0.1973, + "step": 337330 + }, + { + "epoch": 13.97, + "grad_norm": 1.109375, + "learning_rate": 0.00027690353181782063, + "loss": 0.2253, + "step": 337340 + }, + { + "epoch": 13.97, + "grad_norm": 0.765625, + "learning_rate": 0.0002768927495785845, + "loss": 0.1638, + "step": 337350 + }, + { + "epoch": 13.97, + "grad_norm": 1.5234375, + "learning_rate": 0.0002768819672887391, + "loss": 0.1762, + "step": 337360 + }, + { + "epoch": 13.97, + "grad_norm": 1.0625, + "learning_rate": 0.00027687118494830465, + "loss": 0.1506, + "step": 337370 + }, + { + "epoch": 13.97, + "grad_norm": 0.7109375, + "learning_rate": 0.00027686040255730165, + "loss": 0.1807, + "step": 337380 + }, + { + "epoch": 13.97, + "grad_norm": 0.68359375, + "learning_rate": 0.0002768496201157502, + "loss": 0.2119, + "step": 337390 + }, + { + "epoch": 13.98, + "grad_norm": 0.421875, + "learning_rate": 0.0002768388376236707, + "loss": 0.122, + "step": 337400 + }, + { + "epoch": 13.98, + "grad_norm": 0.96484375, + "learning_rate": 0.00027682805508108337, + "loss": 0.1792, + "step": 337410 + }, + { + "epoch": 13.98, + "grad_norm": 0.69921875, + "learning_rate": 0.00027681727248800845, + "loss": 0.2024, + "step": 337420 + }, + { + "epoch": 13.98, + "grad_norm": 0.91796875, + "learning_rate": 0.00027680648984446647, + "loss": 0.1648, + "step": 337430 + }, + { + "epoch": 13.98, + "grad_norm": 1.7421875, + "learning_rate": 0.0002767957071504776, + "loss": 0.1908, + "step": 337440 + }, + { + "epoch": 13.98, + "grad_norm": 0.69140625, + "learning_rate": 0.000276784924406062, + "loss": 0.201, + "step": 337450 + }, + { + "epoch": 13.98, + "grad_norm": 0.890625, + "learning_rate": 0.00027677414161124007, + "loss": 0.1553, + "step": 337460 + }, + { + "epoch": 13.98, + "grad_norm": 1.0, + "learning_rate": 0.0002767633587660321, + "loss": 0.2418, + "step": 337470 + }, + { + "epoch": 13.98, + "grad_norm": 1.0390625, + "learning_rate": 0.00027675257587045837, + "loss": 0.2054, + "step": 337480 + }, + { + "epoch": 13.98, + "grad_norm": 0.515625, + "learning_rate": 0.0002767417929245393, + "loss": 0.1771, + "step": 337490 + }, + { + "epoch": 13.98, + "grad_norm": 0.45703125, + "learning_rate": 0.00027673100992829484, + "loss": 0.1768, + "step": 337500 + }, + { + "epoch": 13.98, + "grad_norm": 0.54296875, + "learning_rate": 0.0002767202268817457, + "loss": 0.1955, + "step": 337510 + }, + { + "epoch": 13.98, + "grad_norm": 0.734375, + "learning_rate": 0.00027670944378491196, + "loss": 0.205, + "step": 337520 + }, + { + "epoch": 13.98, + "grad_norm": 1.1484375, + "learning_rate": 0.00027669866063781385, + "loss": 0.2319, + "step": 337530 + }, + { + "epoch": 13.98, + "grad_norm": 0.80078125, + "learning_rate": 0.0002766878774404718, + "loss": 0.1556, + "step": 337540 + }, + { + "epoch": 13.98, + "grad_norm": 1.25, + "learning_rate": 0.000276677094192906, + "loss": 0.2033, + "step": 337550 + }, + { + "epoch": 13.98, + "grad_norm": 0.93359375, + "learning_rate": 0.00027666631089513685, + "loss": 0.2144, + "step": 337560 + }, + { + "epoch": 13.98, + "grad_norm": 0.6640625, + "learning_rate": 0.00027665552754718454, + "loss": 0.1524, + "step": 337570 + }, + { + "epoch": 13.98, + "grad_norm": 1.0078125, + "learning_rate": 0.0002766447441490694, + "loss": 0.1966, + "step": 337580 + }, + { + "epoch": 13.98, + "grad_norm": 0.85546875, + "learning_rate": 0.00027663396070081174, + "loss": 0.2177, + "step": 337590 + }, + { + "epoch": 13.98, + "grad_norm": 1.8515625, + "learning_rate": 0.0002766231772024319, + "loss": 0.1648, + "step": 337600 + }, + { + "epoch": 13.98, + "grad_norm": 0.921875, + "learning_rate": 0.00027661239365394996, + "loss": 0.1634, + "step": 337610 + }, + { + "epoch": 13.98, + "grad_norm": 0.466796875, + "learning_rate": 0.00027660161005538654, + "loss": 0.2039, + "step": 337620 + }, + { + "epoch": 13.98, + "grad_norm": 0.63671875, + "learning_rate": 0.00027659082640676166, + "loss": 0.1734, + "step": 337630 + }, + { + "epoch": 13.99, + "grad_norm": 0.66015625, + "learning_rate": 0.0002765800427080957, + "loss": 0.1634, + "step": 337640 + }, + { + "epoch": 13.99, + "grad_norm": 0.427734375, + "learning_rate": 0.0002765692589594091, + "loss": 0.1882, + "step": 337650 + }, + { + "epoch": 13.99, + "grad_norm": 1.3359375, + "learning_rate": 0.00027655847516072185, + "loss": 0.2371, + "step": 337660 + }, + { + "epoch": 13.99, + "grad_norm": 0.2890625, + "learning_rate": 0.0002765476913120546, + "loss": 0.1947, + "step": 337670 + }, + { + "epoch": 13.99, + "grad_norm": 1.03125, + "learning_rate": 0.00027653690741342735, + "loss": 0.2107, + "step": 337680 + }, + { + "epoch": 13.99, + "grad_norm": 0.5703125, + "learning_rate": 0.0002765261234648605, + "loss": 0.2295, + "step": 337690 + }, + { + "epoch": 13.99, + "grad_norm": 0.390625, + "learning_rate": 0.0002765153394663744, + "loss": 0.1924, + "step": 337700 + }, + { + "epoch": 13.99, + "grad_norm": 0.734375, + "learning_rate": 0.0002765045554179893, + "loss": 0.1955, + "step": 337710 + }, + { + "epoch": 13.99, + "grad_norm": 1.921875, + "learning_rate": 0.0002764937713197254, + "loss": 0.1677, + "step": 337720 + }, + { + "epoch": 13.99, + "grad_norm": 1.1484375, + "learning_rate": 0.0002764829871716032, + "loss": 0.2191, + "step": 337730 + }, + { + "epoch": 13.99, + "grad_norm": 0.45703125, + "learning_rate": 0.00027647220297364277, + "loss": 0.1235, + "step": 337740 + }, + { + "epoch": 13.99, + "grad_norm": 0.400390625, + "learning_rate": 0.0002764614187258646, + "loss": 0.1831, + "step": 337750 + }, + { + "epoch": 13.99, + "grad_norm": 0.703125, + "learning_rate": 0.00027645063442828885, + "loss": 0.2099, + "step": 337760 + }, + { + "epoch": 13.99, + "grad_norm": 1.0, + "learning_rate": 0.0002764398500809359, + "loss": 0.2274, + "step": 337770 + }, + { + "epoch": 13.99, + "grad_norm": 0.68359375, + "learning_rate": 0.00027642906568382596, + "loss": 0.1228, + "step": 337780 + }, + { + "epoch": 13.99, + "grad_norm": 3.375, + "learning_rate": 0.0002764182812369794, + "loss": 0.2173, + "step": 337790 + }, + { + "epoch": 13.99, + "grad_norm": 0.703125, + "learning_rate": 0.0002764074967404165, + "loss": 0.2297, + "step": 337800 + }, + { + "epoch": 13.99, + "grad_norm": 0.60546875, + "learning_rate": 0.00027639671219415753, + "loss": 0.1908, + "step": 337810 + }, + { + "epoch": 13.99, + "grad_norm": 0.59765625, + "learning_rate": 0.0002763859275982227, + "loss": 0.1713, + "step": 337820 + }, + { + "epoch": 13.99, + "grad_norm": 0.43359375, + "learning_rate": 0.0002763751429526326, + "loss": 0.1864, + "step": 337830 + }, + { + "epoch": 13.99, + "grad_norm": 0.443359375, + "learning_rate": 0.00027636435825740716, + "loss": 0.1953, + "step": 337840 + }, + { + "epoch": 13.99, + "grad_norm": 0.255859375, + "learning_rate": 0.00027635357351256693, + "loss": 0.2159, + "step": 337850 + }, + { + "epoch": 13.99, + "grad_norm": 0.8828125, + "learning_rate": 0.00027634278871813215, + "loss": 0.2163, + "step": 337860 + }, + { + "epoch": 13.99, + "grad_norm": 0.64453125, + "learning_rate": 0.00027633200387412304, + "loss": 0.1726, + "step": 337870 + }, + { + "epoch": 13.99, + "grad_norm": 1.0390625, + "learning_rate": 0.0002763212189805599, + "loss": 0.2106, + "step": 337880 + }, + { + "epoch": 14.0, + "grad_norm": 1.0703125, + "learning_rate": 0.0002763104340374631, + "loss": 0.2384, + "step": 337890 + }, + { + "epoch": 14.0, + "grad_norm": 1.09375, + "learning_rate": 0.00027629964904485294, + "loss": 0.148, + "step": 337900 + }, + { + "epoch": 14.0, + "grad_norm": 0.81640625, + "learning_rate": 0.0002762888640027497, + "loss": 0.1822, + "step": 337910 + }, + { + "epoch": 14.0, + "grad_norm": 0.703125, + "learning_rate": 0.0002762780789111736, + "loss": 0.174, + "step": 337920 + }, + { + "epoch": 14.0, + "grad_norm": 1.3125, + "learning_rate": 0.000276267293770145, + "loss": 0.1571, + "step": 337930 + }, + { + "epoch": 14.0, + "grad_norm": 0.6640625, + "learning_rate": 0.00027625650857968423, + "loss": 0.2093, + "step": 337940 + }, + { + "epoch": 14.0, + "grad_norm": 1.8671875, + "learning_rate": 0.00027624572333981144, + "loss": 0.2018, + "step": 337950 + }, + { + "epoch": 14.0, + "grad_norm": 0.5078125, + "learning_rate": 0.0002762349380505471, + "loss": 0.2032, + "step": 337960 + }, + { + "epoch": 14.0, + "grad_norm": 1.5078125, + "learning_rate": 0.0002762241527119114, + "loss": 0.1962, + "step": 337970 + }, + { + "epoch": 14.0, + "grad_norm": 0.83984375, + "learning_rate": 0.0002762133673239248, + "loss": 0.2022, + "step": 337980 + }, + { + "epoch": 14.0, + "grad_norm": 0.7265625, + "learning_rate": 0.0002762025818866074, + "loss": 0.1558, + "step": 337990 + }, + { + "epoch": 14.0, + "grad_norm": 0.361328125, + "learning_rate": 0.00027619179639997956, + "loss": 0.2158, + "step": 338000 + }, + { + "epoch": 14.0, + "grad_norm": 1.203125, + "learning_rate": 0.00027618101086406157, + "loss": 0.2472, + "step": 338010 + }, + { + "epoch": 14.0, + "grad_norm": 0.99609375, + "learning_rate": 0.00027617022527887374, + "loss": 0.1734, + "step": 338020 + }, + { + "epoch": 14.0, + "grad_norm": 0.734375, + "learning_rate": 0.00027615943964443647, + "loss": 0.2107, + "step": 338030 + }, + { + "epoch": 14.0, + "grad_norm": 1.5703125, + "learning_rate": 0.0002761486539607698, + "loss": 0.2201, + "step": 338040 + }, + { + "epoch": 14.0, + "grad_norm": 0.828125, + "learning_rate": 0.00027613786822789434, + "loss": 0.199, + "step": 338050 + }, + { + "epoch": 14.0, + "grad_norm": 0.71484375, + "learning_rate": 0.00027612708244583015, + "loss": 0.18, + "step": 338060 + }, + { + "epoch": 14.0, + "grad_norm": 0.48046875, + "learning_rate": 0.00027611629661459765, + "loss": 0.1764, + "step": 338070 + }, + { + "epoch": 14.0, + "grad_norm": 0.78125, + "learning_rate": 0.0002761055107342171, + "loss": 0.1742, + "step": 338080 + }, + { + "epoch": 14.0, + "grad_norm": 1.359375, + "learning_rate": 0.00027609472480470877, + "loss": 0.1887, + "step": 338090 + }, + { + "epoch": 14.0, + "grad_norm": 1.796875, + "learning_rate": 0.000276083938826093, + "loss": 0.1823, + "step": 338100 + }, + { + "epoch": 14.0, + "grad_norm": 0.55078125, + "learning_rate": 0.0002760731527983901, + "loss": 0.138, + "step": 338110 + }, + { + "epoch": 14.0, + "grad_norm": 0.66015625, + "learning_rate": 0.0002760623667216203, + "loss": 0.245, + "step": 338120 + }, + { + "epoch": 14.01, + "grad_norm": 0.8671875, + "learning_rate": 0.0002760515805958039, + "loss": 0.2204, + "step": 338130 + }, + { + "epoch": 14.01, + "grad_norm": 1.2421875, + "learning_rate": 0.0002760407944209613, + "loss": 0.2241, + "step": 338140 + }, + { + "epoch": 14.01, + "grad_norm": 0.984375, + "learning_rate": 0.0002760300081971128, + "loss": 0.1764, + "step": 338150 + }, + { + "epoch": 14.01, + "grad_norm": 1.2578125, + "learning_rate": 0.00027601922192427845, + "loss": 0.2027, + "step": 338160 + }, + { + "epoch": 14.01, + "grad_norm": 0.63671875, + "learning_rate": 0.0002760084356024789, + "loss": 0.2205, + "step": 338170 + }, + { + "epoch": 14.01, + "grad_norm": 0.8671875, + "learning_rate": 0.00027599764923173423, + "loss": 0.1193, + "step": 338180 + }, + { + "epoch": 14.01, + "grad_norm": 1.21875, + "learning_rate": 0.00027598686281206483, + "loss": 0.2127, + "step": 338190 + }, + { + "epoch": 14.01, + "grad_norm": 1.03125, + "learning_rate": 0.00027597607634349087, + "loss": 0.204, + "step": 338200 + }, + { + "epoch": 14.01, + "grad_norm": 0.3046875, + "learning_rate": 0.0002759652898260328, + "loss": 0.1697, + "step": 338210 + }, + { + "epoch": 14.01, + "grad_norm": 1.421875, + "learning_rate": 0.00027595450325971084, + "loss": 0.209, + "step": 338220 + }, + { + "epoch": 14.01, + "grad_norm": 1.3515625, + "learning_rate": 0.0002759437166445453, + "loss": 0.188, + "step": 338230 + }, + { + "epoch": 14.01, + "grad_norm": 1.0390625, + "learning_rate": 0.00027593292998055654, + "loss": 0.1951, + "step": 338240 + }, + { + "epoch": 14.01, + "grad_norm": 1.078125, + "learning_rate": 0.0002759221432677648, + "loss": 0.155, + "step": 338250 + }, + { + "epoch": 14.01, + "grad_norm": 0.84375, + "learning_rate": 0.00027591135650619034, + "loss": 0.1623, + "step": 338260 + }, + { + "epoch": 14.01, + "grad_norm": 1.796875, + "learning_rate": 0.0002759005696958535, + "loss": 0.1983, + "step": 338270 + }, + { + "epoch": 14.01, + "grad_norm": 0.625, + "learning_rate": 0.00027588978283677465, + "loss": 0.1695, + "step": 338280 + }, + { + "epoch": 14.01, + "grad_norm": 0.71875, + "learning_rate": 0.00027587899592897396, + "loss": 0.2101, + "step": 338290 + }, + { + "epoch": 14.01, + "grad_norm": 0.671875, + "learning_rate": 0.0002758682089724718, + "loss": 0.2016, + "step": 338300 + }, + { + "epoch": 14.01, + "grad_norm": 0.84375, + "learning_rate": 0.0002758574219672885, + "loss": 0.1905, + "step": 338310 + }, + { + "epoch": 14.01, + "grad_norm": 0.546875, + "learning_rate": 0.0002758466349134443, + "loss": 0.1796, + "step": 338320 + }, + { + "epoch": 14.01, + "grad_norm": 0.54296875, + "learning_rate": 0.00027583584781095956, + "loss": 0.2013, + "step": 338330 + }, + { + "epoch": 14.01, + "grad_norm": 0.69140625, + "learning_rate": 0.00027582506065985447, + "loss": 0.1621, + "step": 338340 + }, + { + "epoch": 14.01, + "grad_norm": 0.68359375, + "learning_rate": 0.0002758142734601495, + "loss": 0.2072, + "step": 338350 + }, + { + "epoch": 14.01, + "grad_norm": 1.0234375, + "learning_rate": 0.0002758034862118648, + "loss": 0.1937, + "step": 338360 + }, + { + "epoch": 14.02, + "grad_norm": 1.8984375, + "learning_rate": 0.00027579269891502066, + "loss": 0.2174, + "step": 338370 + }, + { + "epoch": 14.02, + "grad_norm": 0.9765625, + "learning_rate": 0.00027578191156963754, + "loss": 0.1403, + "step": 338380 + }, + { + "epoch": 14.02, + "grad_norm": 0.84765625, + "learning_rate": 0.00027577112417573557, + "loss": 0.2443, + "step": 338390 + }, + { + "epoch": 14.02, + "grad_norm": 0.7890625, + "learning_rate": 0.00027576033673333523, + "loss": 0.2222, + "step": 338400 + }, + { + "epoch": 14.02, + "grad_norm": 0.921875, + "learning_rate": 0.00027574954924245663, + "loss": 0.1507, + "step": 338410 + }, + { + "epoch": 14.02, + "grad_norm": 0.65625, + "learning_rate": 0.0002757387617031202, + "loss": 0.1514, + "step": 338420 + }, + { + "epoch": 14.02, + "grad_norm": 1.5234375, + "learning_rate": 0.0002757279741153463, + "loss": 0.1811, + "step": 338430 + }, + { + "epoch": 14.02, + "grad_norm": 0.69140625, + "learning_rate": 0.00027571718647915496, + "loss": 0.1888, + "step": 338440 + }, + { + "epoch": 14.02, + "grad_norm": 1.1875, + "learning_rate": 0.0002757063987945667, + "loss": 0.1929, + "step": 338450 + }, + { + "epoch": 14.02, + "grad_norm": 0.2255859375, + "learning_rate": 0.00027569561106160177, + "loss": 0.198, + "step": 338460 + }, + { + "epoch": 14.02, + "grad_norm": 1.1015625, + "learning_rate": 0.00027568482328028054, + "loss": 0.2233, + "step": 338470 + }, + { + "epoch": 14.02, + "grad_norm": 0.37109375, + "learning_rate": 0.00027567403545062323, + "loss": 0.2077, + "step": 338480 + }, + { + "epoch": 14.02, + "grad_norm": 1.4453125, + "learning_rate": 0.0002756632475726501, + "loss": 0.1717, + "step": 338490 + }, + { + "epoch": 14.02, + "grad_norm": 0.87890625, + "learning_rate": 0.00027565245964638154, + "loss": 0.2371, + "step": 338500 + }, + { + "epoch": 14.02, + "grad_norm": 0.345703125, + "learning_rate": 0.0002756416716718379, + "loss": 0.1757, + "step": 338510 + }, + { + "epoch": 14.02, + "grad_norm": 0.51953125, + "learning_rate": 0.0002756308836490392, + "loss": 0.1779, + "step": 338520 + }, + { + "epoch": 14.02, + "grad_norm": 0.6171875, + "learning_rate": 0.00027562009557800614, + "loss": 0.1654, + "step": 338530 + }, + { + "epoch": 14.02, + "grad_norm": 0.64453125, + "learning_rate": 0.0002756093074587587, + "loss": 0.1851, + "step": 338540 + }, + { + "epoch": 14.02, + "grad_norm": 0.52734375, + "learning_rate": 0.0002755985192913174, + "loss": 0.184, + "step": 338550 + }, + { + "epoch": 14.02, + "grad_norm": 1.6328125, + "learning_rate": 0.00027558773107570246, + "loss": 0.1991, + "step": 338560 + }, + { + "epoch": 14.02, + "grad_norm": 0.2734375, + "learning_rate": 0.0002755769428119341, + "loss": 0.1681, + "step": 338570 + }, + { + "epoch": 14.02, + "grad_norm": 1.171875, + "learning_rate": 0.00027556615450003276, + "loss": 0.1786, + "step": 338580 + }, + { + "epoch": 14.02, + "grad_norm": 1.015625, + "learning_rate": 0.0002755553661400186, + "loss": 0.1815, + "step": 338590 + }, + { + "epoch": 14.02, + "grad_norm": 0.0, + "learning_rate": 0.000275544577731912, + "loss": 0.1966, + "step": 338600 + }, + { + "epoch": 14.03, + "grad_norm": 0.6171875, + "learning_rate": 0.0002755337892757334, + "loss": 0.1489, + "step": 338610 + }, + { + "epoch": 14.03, + "grad_norm": 0.337890625, + "learning_rate": 0.00027552300077150283, + "loss": 0.1626, + "step": 338620 + }, + { + "epoch": 14.03, + "grad_norm": 0.86328125, + "learning_rate": 0.00027551221221924077, + "loss": 0.1802, + "step": 338630 + }, + { + "epoch": 14.03, + "grad_norm": 1.1953125, + "learning_rate": 0.00027550142361896753, + "loss": 0.2369, + "step": 338640 + }, + { + "epoch": 14.03, + "grad_norm": 0.734375, + "learning_rate": 0.0002754906349707033, + "loss": 0.2182, + "step": 338650 + }, + { + "epoch": 14.03, + "grad_norm": 0.5234375, + "learning_rate": 0.0002754798462744685, + "loss": 0.1963, + "step": 338660 + }, + { + "epoch": 14.03, + "grad_norm": 0.546875, + "learning_rate": 0.0002754690575302833, + "loss": 0.2334, + "step": 338670 + }, + { + "epoch": 14.03, + "grad_norm": 1.0625, + "learning_rate": 0.00027545826873816813, + "loss": 0.2179, + "step": 338680 + }, + { + "epoch": 14.03, + "grad_norm": 0.5703125, + "learning_rate": 0.00027544747989814326, + "loss": 0.1593, + "step": 338690 + }, + { + "epoch": 14.03, + "grad_norm": 2.609375, + "learning_rate": 0.000275436691010229, + "loss": 0.1993, + "step": 338700 + }, + { + "epoch": 14.03, + "grad_norm": 0.69140625, + "learning_rate": 0.00027542590207444556, + "loss": 0.1899, + "step": 338710 + }, + { + "epoch": 14.03, + "grad_norm": 0.64453125, + "learning_rate": 0.0002754151130908134, + "loss": 0.2237, + "step": 338720 + }, + { + "epoch": 14.03, + "grad_norm": 0.57421875, + "learning_rate": 0.0002754043240593527, + "loss": 0.1852, + "step": 338730 + }, + { + "epoch": 14.03, + "grad_norm": 1.75, + "learning_rate": 0.0002753935349800838, + "loss": 0.1676, + "step": 338740 + }, + { + "epoch": 14.03, + "grad_norm": 0.96484375, + "learning_rate": 0.00027538274585302703, + "loss": 0.1931, + "step": 338750 + }, + { + "epoch": 14.03, + "grad_norm": 0.57421875, + "learning_rate": 0.00027537195667820264, + "loss": 0.182, + "step": 338760 + }, + { + "epoch": 14.03, + "grad_norm": 0.396484375, + "learning_rate": 0.0002753611674556311, + "loss": 0.1374, + "step": 338770 + }, + { + "epoch": 14.03, + "grad_norm": 0.56640625, + "learning_rate": 0.0002753503781853324, + "loss": 0.1565, + "step": 338780 + }, + { + "epoch": 14.03, + "grad_norm": 0.87890625, + "learning_rate": 0.00027533958886732707, + "loss": 0.166, + "step": 338790 + }, + { + "epoch": 14.03, + "grad_norm": 1.0546875, + "learning_rate": 0.0002753287995016355, + "loss": 0.2016, + "step": 338800 + }, + { + "epoch": 14.03, + "grad_norm": 0.91796875, + "learning_rate": 0.0002753180100882777, + "loss": 0.2132, + "step": 338810 + }, + { + "epoch": 14.03, + "grad_norm": 1.5703125, + "learning_rate": 0.0002753072206272742, + "loss": 0.1946, + "step": 338820 + }, + { + "epoch": 14.03, + "grad_norm": 0.478515625, + "learning_rate": 0.0002752964311186452, + "loss": 0.1803, + "step": 338830 + }, + { + "epoch": 14.03, + "grad_norm": 0.99609375, + "learning_rate": 0.0002752856415624111, + "loss": 0.1916, + "step": 338840 + }, + { + "epoch": 14.04, + "grad_norm": 0.69140625, + "learning_rate": 0.0002752748519585922, + "loss": 0.1977, + "step": 338850 + }, + { + "epoch": 14.04, + "grad_norm": 0.53515625, + "learning_rate": 0.0002752640623072087, + "loss": 0.1871, + "step": 338860 + }, + { + "epoch": 14.04, + "grad_norm": 0.90234375, + "learning_rate": 0.00027525327260828094, + "loss": 0.1424, + "step": 338870 + }, + { + "epoch": 14.04, + "grad_norm": 0.796875, + "learning_rate": 0.00027524248286182933, + "loss": 0.1544, + "step": 338880 + }, + { + "epoch": 14.04, + "grad_norm": 1.1015625, + "learning_rate": 0.000275231693067874, + "loss": 0.1856, + "step": 338890 + }, + { + "epoch": 14.04, + "grad_norm": 3.84375, + "learning_rate": 0.0002752209032264354, + "loss": 0.1698, + "step": 338900 + }, + { + "epoch": 14.04, + "grad_norm": 1.265625, + "learning_rate": 0.00027521011333753376, + "loss": 0.1995, + "step": 338910 + }, + { + "epoch": 14.04, + "grad_norm": 0.546875, + "learning_rate": 0.0002751993234011894, + "loss": 0.1595, + "step": 338920 + }, + { + "epoch": 14.04, + "grad_norm": 0.8046875, + "learning_rate": 0.00027518853341742275, + "loss": 0.175, + "step": 338930 + }, + { + "epoch": 14.04, + "grad_norm": 1.265625, + "learning_rate": 0.00027517774338625385, + "loss": 0.1677, + "step": 338940 + }, + { + "epoch": 14.04, + "grad_norm": 0.74609375, + "learning_rate": 0.0002751669533077032, + "loss": 0.1853, + "step": 338950 + }, + { + "epoch": 14.04, + "grad_norm": 1.609375, + "learning_rate": 0.0002751561631817911, + "loss": 0.1895, + "step": 338960 + }, + { + "epoch": 14.04, + "grad_norm": 1.0, + "learning_rate": 0.00027514537300853776, + "loss": 0.2078, + "step": 338970 + }, + { + "epoch": 14.04, + "grad_norm": 0.578125, + "learning_rate": 0.0002751345827879636, + "loss": 0.1972, + "step": 338980 + }, + { + "epoch": 14.04, + "grad_norm": 1.0234375, + "learning_rate": 0.00027512379252008884, + "loss": 0.1751, + "step": 338990 + }, + { + "epoch": 14.04, + "grad_norm": 0.5625, + "learning_rate": 0.0002751130022049338, + "loss": 0.1871, + "step": 339000 + }, + { + "epoch": 14.04, + "grad_norm": 0.80859375, + "learning_rate": 0.0002751022118425189, + "loss": 0.2084, + "step": 339010 + }, + { + "epoch": 14.04, + "grad_norm": 1.0859375, + "learning_rate": 0.0002750914214328642, + "loss": 0.1916, + "step": 339020 + }, + { + "epoch": 14.04, + "grad_norm": 0.322265625, + "learning_rate": 0.00027508063097599024, + "loss": 0.1748, + "step": 339030 + }, + { + "epoch": 14.04, + "grad_norm": 0.87109375, + "learning_rate": 0.00027506984047191723, + "loss": 0.176, + "step": 339040 + }, + { + "epoch": 14.04, + "grad_norm": 0.921875, + "learning_rate": 0.00027505904992066544, + "loss": 0.2315, + "step": 339050 + }, + { + "epoch": 14.04, + "grad_norm": 0.69140625, + "learning_rate": 0.0002750482593222553, + "loss": 0.233, + "step": 339060 + }, + { + "epoch": 14.04, + "grad_norm": 2.328125, + "learning_rate": 0.000275037468676707, + "loss": 0.2092, + "step": 339070 + }, + { + "epoch": 14.04, + "grad_norm": 0.8828125, + "learning_rate": 0.0002750266779840409, + "loss": 0.2204, + "step": 339080 + }, + { + "epoch": 14.05, + "grad_norm": 1.03125, + "learning_rate": 0.0002750158872442773, + "loss": 0.1627, + "step": 339090 + }, + { + "epoch": 14.05, + "grad_norm": 0.640625, + "learning_rate": 0.0002750050964574365, + "loss": 0.1899, + "step": 339100 + }, + { + "epoch": 14.05, + "grad_norm": 1.015625, + "learning_rate": 0.00027499430562353877, + "loss": 0.1952, + "step": 339110 + }, + { + "epoch": 14.05, + "grad_norm": 1.875, + "learning_rate": 0.0002749835147426045, + "loss": 0.2058, + "step": 339120 + }, + { + "epoch": 14.05, + "grad_norm": 0.6484375, + "learning_rate": 0.00027497272381465395, + "loss": 0.2165, + "step": 339130 + }, + { + "epoch": 14.05, + "grad_norm": 0.5859375, + "learning_rate": 0.00027496193283970744, + "loss": 0.1873, + "step": 339140 + }, + { + "epoch": 14.05, + "grad_norm": 0.9765625, + "learning_rate": 0.0002749511418177852, + "loss": 0.1772, + "step": 339150 + }, + { + "epoch": 14.05, + "grad_norm": 2.234375, + "learning_rate": 0.00027494035074890767, + "loss": 0.1901, + "step": 339160 + }, + { + "epoch": 14.05, + "grad_norm": 1.3359375, + "learning_rate": 0.0002749295596330951, + "loss": 0.2357, + "step": 339170 + }, + { + "epoch": 14.05, + "grad_norm": 0.486328125, + "learning_rate": 0.00027491876847036777, + "loss": 0.1777, + "step": 339180 + }, + { + "epoch": 14.05, + "grad_norm": 0.63671875, + "learning_rate": 0.00027490797726074603, + "loss": 0.1829, + "step": 339190 + }, + { + "epoch": 14.05, + "grad_norm": 0.63671875, + "learning_rate": 0.0002748971860042501, + "loss": 0.1927, + "step": 339200 + }, + { + "epoch": 14.05, + "grad_norm": 0.71875, + "learning_rate": 0.0002748863947009004, + "loss": 0.1438, + "step": 339210 + }, + { + "epoch": 14.05, + "grad_norm": 1.078125, + "learning_rate": 0.0002748756033507172, + "loss": 0.1857, + "step": 339220 + }, + { + "epoch": 14.05, + "grad_norm": 1.828125, + "learning_rate": 0.00027486481195372083, + "loss": 0.2111, + "step": 339230 + }, + { + "epoch": 14.05, + "grad_norm": 1.3515625, + "learning_rate": 0.0002748540205099315, + "loss": 0.2076, + "step": 339240 + }, + { + "epoch": 14.05, + "grad_norm": 1.140625, + "learning_rate": 0.00027484322901936964, + "loss": 0.2364, + "step": 339250 + }, + { + "epoch": 14.05, + "grad_norm": 1.1171875, + "learning_rate": 0.00027483243748205544, + "loss": 0.1747, + "step": 339260 + }, + { + "epoch": 14.05, + "grad_norm": 0.84375, + "learning_rate": 0.00027482164589800936, + "loss": 0.2133, + "step": 339270 + }, + { + "epoch": 14.05, + "grad_norm": 0.76171875, + "learning_rate": 0.00027481085426725156, + "loss": 0.1782, + "step": 339280 + }, + { + "epoch": 14.05, + "grad_norm": 0.60546875, + "learning_rate": 0.0002748000625898024, + "loss": 0.1806, + "step": 339290 + }, + { + "epoch": 14.05, + "grad_norm": 1.375, + "learning_rate": 0.00027478927086568226, + "loss": 0.1576, + "step": 339300 + }, + { + "epoch": 14.05, + "grad_norm": 0.65234375, + "learning_rate": 0.00027477847909491136, + "loss": 0.1988, + "step": 339310 + }, + { + "epoch": 14.05, + "grad_norm": 1.015625, + "learning_rate": 0.00027476768727751006, + "loss": 0.1431, + "step": 339320 + }, + { + "epoch": 14.06, + "grad_norm": 0.380859375, + "learning_rate": 0.0002747568954134986, + "loss": 0.1535, + "step": 339330 + }, + { + "epoch": 14.06, + "grad_norm": 0.953125, + "learning_rate": 0.00027474610350289736, + "loss": 0.2048, + "step": 339340 + }, + { + "epoch": 14.06, + "grad_norm": 0.78125, + "learning_rate": 0.0002747353115457267, + "loss": 0.1932, + "step": 339350 + }, + { + "epoch": 14.06, + "grad_norm": 1.2265625, + "learning_rate": 0.0002747245195420067, + "loss": 0.1782, + "step": 339360 + }, + { + "epoch": 14.06, + "grad_norm": 1.046875, + "learning_rate": 0.0002747137274917579, + "loss": 0.2191, + "step": 339370 + }, + { + "epoch": 14.06, + "grad_norm": 0.8671875, + "learning_rate": 0.00027470293539500065, + "loss": 0.2054, + "step": 339380 + }, + { + "epoch": 14.06, + "grad_norm": 1.84375, + "learning_rate": 0.000274692143251755, + "loss": 0.1952, + "step": 339390 + }, + { + "epoch": 14.06, + "grad_norm": 0.71484375, + "learning_rate": 0.0002746813510620414, + "loss": 0.184, + "step": 339400 + }, + { + "epoch": 14.06, + "grad_norm": 0.76171875, + "learning_rate": 0.00027467055882588023, + "loss": 0.1612, + "step": 339410 + }, + { + "epoch": 14.06, + "grad_norm": 1.1953125, + "learning_rate": 0.0002746597665432917, + "loss": 0.2584, + "step": 339420 + }, + { + "epoch": 14.06, + "grad_norm": 1.0078125, + "learning_rate": 0.0002746489742142962, + "loss": 0.2021, + "step": 339430 + }, + { + "epoch": 14.06, + "grad_norm": 1.171875, + "learning_rate": 0.00027463818183891394, + "loss": 0.2335, + "step": 339440 + }, + { + "epoch": 14.06, + "grad_norm": 1.0078125, + "learning_rate": 0.00027462738941716523, + "loss": 0.2046, + "step": 339450 + }, + { + "epoch": 14.06, + "grad_norm": 0.7578125, + "learning_rate": 0.00027461659694907057, + "loss": 0.2219, + "step": 339460 + }, + { + "epoch": 14.06, + "grad_norm": 0.546875, + "learning_rate": 0.00027460580443465, + "loss": 0.1795, + "step": 339470 + }, + { + "epoch": 14.06, + "grad_norm": 1.09375, + "learning_rate": 0.000274595011873924, + "loss": 0.2075, + "step": 339480 + }, + { + "epoch": 14.06, + "grad_norm": 1.703125, + "learning_rate": 0.0002745842192669129, + "loss": 0.19, + "step": 339490 + }, + { + "epoch": 14.06, + "grad_norm": 0.7578125, + "learning_rate": 0.0002745734266136369, + "loss": 0.2096, + "step": 339500 + }, + { + "epoch": 14.06, + "grad_norm": 0.6328125, + "learning_rate": 0.0002745626339141164, + "loss": 0.2325, + "step": 339510 + }, + { + "epoch": 14.06, + "grad_norm": 0.94140625, + "learning_rate": 0.00027455184116837166, + "loss": 0.1835, + "step": 339520 + }, + { + "epoch": 14.06, + "grad_norm": 0.4375, + "learning_rate": 0.000274541048376423, + "loss": 0.2201, + "step": 339530 + }, + { + "epoch": 14.06, + "grad_norm": 1.171875, + "learning_rate": 0.0002745302555382907, + "loss": 0.1545, + "step": 339540 + }, + { + "epoch": 14.06, + "grad_norm": 0.8515625, + "learning_rate": 0.00027451946265399517, + "loss": 0.2112, + "step": 339550 + }, + { + "epoch": 14.06, + "grad_norm": 0.97265625, + "learning_rate": 0.00027450866972355664, + "loss": 0.1743, + "step": 339560 + }, + { + "epoch": 14.06, + "grad_norm": 1.578125, + "learning_rate": 0.00027449787674699544, + "loss": 0.1986, + "step": 339570 + }, + { + "epoch": 14.07, + "grad_norm": 0.71875, + "learning_rate": 0.0002744870837243319, + "loss": 0.1912, + "step": 339580 + }, + { + "epoch": 14.07, + "grad_norm": 0.7890625, + "learning_rate": 0.0002744762906555863, + "loss": 0.1998, + "step": 339590 + }, + { + "epoch": 14.07, + "grad_norm": 1.6015625, + "learning_rate": 0.00027446549754077897, + "loss": 0.1798, + "step": 339600 + }, + { + "epoch": 14.07, + "grad_norm": 0.6015625, + "learning_rate": 0.0002744547043799302, + "loss": 0.2096, + "step": 339610 + }, + { + "epoch": 14.07, + "grad_norm": 0.80859375, + "learning_rate": 0.00027444391117306026, + "loss": 0.2189, + "step": 339620 + }, + { + "epoch": 14.07, + "grad_norm": 0.98046875, + "learning_rate": 0.00027443311792018965, + "loss": 0.1925, + "step": 339630 + }, + { + "epoch": 14.07, + "grad_norm": 0.53125, + "learning_rate": 0.00027442232462133846, + "loss": 0.1626, + "step": 339640 + }, + { + "epoch": 14.07, + "grad_norm": 0.70703125, + "learning_rate": 0.0002744115312765271, + "loss": 0.2061, + "step": 339650 + }, + { + "epoch": 14.07, + "grad_norm": 0.7109375, + "learning_rate": 0.0002744007378857759, + "loss": 0.1296, + "step": 339660 + }, + { + "epoch": 14.07, + "grad_norm": 0.84375, + "learning_rate": 0.0002743899444491051, + "loss": 0.1912, + "step": 339670 + }, + { + "epoch": 14.07, + "grad_norm": 0.484375, + "learning_rate": 0.00027437915096653517, + "loss": 0.2125, + "step": 339680 + }, + { + "epoch": 14.07, + "grad_norm": 0.421875, + "learning_rate": 0.00027436835743808617, + "loss": 0.1693, + "step": 339690 + }, + { + "epoch": 14.07, + "grad_norm": 0.357421875, + "learning_rate": 0.0002743575638637786, + "loss": 0.2455, + "step": 339700 + }, + { + "epoch": 14.07, + "grad_norm": 0.921875, + "learning_rate": 0.0002743467702436328, + "loss": 0.2118, + "step": 339710 + }, + { + "epoch": 14.07, + "grad_norm": 0.60546875, + "learning_rate": 0.00027433597657766895, + "loss": 0.1655, + "step": 339720 + }, + { + "epoch": 14.07, + "grad_norm": 0.9296875, + "learning_rate": 0.00027432518286590743, + "loss": 0.2724, + "step": 339730 + }, + { + "epoch": 14.07, + "grad_norm": 0.5390625, + "learning_rate": 0.00027431438910836847, + "loss": 0.183, + "step": 339740 + }, + { + "epoch": 14.07, + "grad_norm": 1.984375, + "learning_rate": 0.00027430359530507254, + "loss": 0.2383, + "step": 339750 + }, + { + "epoch": 14.07, + "grad_norm": 1.125, + "learning_rate": 0.00027429280145603986, + "loss": 0.1621, + "step": 339760 + }, + { + "epoch": 14.07, + "grad_norm": 0.90234375, + "learning_rate": 0.0002742820075612907, + "loss": 0.177, + "step": 339770 + }, + { + "epoch": 14.07, + "grad_norm": 0.94140625, + "learning_rate": 0.00027427121362084544, + "loss": 0.1875, + "step": 339780 + }, + { + "epoch": 14.07, + "grad_norm": 0.9296875, + "learning_rate": 0.0002742604196347245, + "loss": 0.1938, + "step": 339790 + }, + { + "epoch": 14.07, + "grad_norm": 2.78125, + "learning_rate": 0.00027424962560294793, + "loss": 0.2381, + "step": 339800 + }, + { + "epoch": 14.07, + "grad_norm": 0.67578125, + "learning_rate": 0.0002742388315255362, + "loss": 0.2505, + "step": 339810 + }, + { + "epoch": 14.08, + "grad_norm": 0.6640625, + "learning_rate": 0.00027422803740250964, + "loss": 0.1985, + "step": 339820 + }, + { + "epoch": 14.08, + "grad_norm": 0.6875, + "learning_rate": 0.0002742172432338885, + "loss": 0.175, + "step": 339830 + }, + { + "epoch": 14.08, + "grad_norm": 0.578125, + "learning_rate": 0.00027420644901969316, + "loss": 0.1814, + "step": 339840 + }, + { + "epoch": 14.08, + "grad_norm": 1.0703125, + "learning_rate": 0.0002741956547599438, + "loss": 0.2088, + "step": 339850 + }, + { + "epoch": 14.08, + "grad_norm": 1.5390625, + "learning_rate": 0.00027418486045466094, + "loss": 0.1748, + "step": 339860 + }, + { + "epoch": 14.08, + "grad_norm": 0.71875, + "learning_rate": 0.0002741740661038648, + "loss": 0.1882, + "step": 339870 + }, + { + "epoch": 14.08, + "grad_norm": 0.68359375, + "learning_rate": 0.00027416327170757567, + "loss": 0.1891, + "step": 339880 + }, + { + "epoch": 14.08, + "grad_norm": 1.0390625, + "learning_rate": 0.0002741524772658138, + "loss": 0.2301, + "step": 339890 + }, + { + "epoch": 14.08, + "grad_norm": 0.95703125, + "learning_rate": 0.0002741416827785996, + "loss": 0.23, + "step": 339900 + }, + { + "epoch": 14.08, + "grad_norm": 0.79296875, + "learning_rate": 0.00027413088824595333, + "loss": 0.219, + "step": 339910 + }, + { + "epoch": 14.08, + "grad_norm": 0.76171875, + "learning_rate": 0.0002741200936678954, + "loss": 0.1839, + "step": 339920 + }, + { + "epoch": 14.08, + "grad_norm": 0.49609375, + "learning_rate": 0.000274109299044446, + "loss": 0.1778, + "step": 339930 + }, + { + "epoch": 14.08, + "grad_norm": 1.2734375, + "learning_rate": 0.00027409850437562554, + "loss": 0.1652, + "step": 339940 + }, + { + "epoch": 14.08, + "grad_norm": 1.453125, + "learning_rate": 0.00027408770966145434, + "loss": 0.1853, + "step": 339950 + }, + { + "epoch": 14.08, + "grad_norm": 1.1640625, + "learning_rate": 0.0002740769149019526, + "loss": 0.1352, + "step": 339960 + }, + { + "epoch": 14.08, + "grad_norm": 0.828125, + "learning_rate": 0.0002740661200971408, + "loss": 0.2107, + "step": 339970 + }, + { + "epoch": 14.08, + "grad_norm": 0.4921875, + "learning_rate": 0.00027405532524703904, + "loss": 0.1852, + "step": 339980 + }, + { + "epoch": 14.08, + "grad_norm": 0.271484375, + "learning_rate": 0.0002740445303516678, + "loss": 0.2135, + "step": 339990 + }, + { + "epoch": 14.08, + "grad_norm": 0.6015625, + "learning_rate": 0.00027403373541104735, + "loss": 0.1892, + "step": 340000 + }, + { + "epoch": 14.08, + "grad_norm": 0.80078125, + "learning_rate": 0.000274022940425198, + "loss": 0.2013, + "step": 340010 + }, + { + "epoch": 14.08, + "grad_norm": 0.875, + "learning_rate": 0.0002740121453941401, + "loss": 0.1996, + "step": 340020 + }, + { + "epoch": 14.08, + "grad_norm": 0.76953125, + "learning_rate": 0.00027400135031789395, + "loss": 0.1896, + "step": 340030 + }, + { + "epoch": 14.08, + "grad_norm": 0.66796875, + "learning_rate": 0.00027399055519647977, + "loss": 0.2277, + "step": 340040 + }, + { + "epoch": 14.08, + "grad_norm": 0.5625, + "learning_rate": 0.00027397976002991806, + "loss": 0.1151, + "step": 340050 + }, + { + "epoch": 14.09, + "grad_norm": 1.5390625, + "learning_rate": 0.0002739689648182289, + "loss": 0.1918, + "step": 340060 + }, + { + "epoch": 14.09, + "grad_norm": 0.9453125, + "learning_rate": 0.0002739581695614328, + "loss": 0.2349, + "step": 340070 + }, + { + "epoch": 14.09, + "grad_norm": 0.85546875, + "learning_rate": 0.0002739473742595501, + "loss": 0.17, + "step": 340080 + }, + { + "epoch": 14.09, + "grad_norm": 0.78515625, + "learning_rate": 0.00027393657891260087, + "loss": 0.1666, + "step": 340090 + }, + { + "epoch": 14.09, + "grad_norm": 1.265625, + "learning_rate": 0.0002739257835206057, + "loss": 0.183, + "step": 340100 + }, + { + "epoch": 14.09, + "grad_norm": 0.8359375, + "learning_rate": 0.00027391498808358483, + "loss": 0.196, + "step": 340110 + }, + { + "epoch": 14.09, + "grad_norm": 1.265625, + "learning_rate": 0.00027390419260155834, + "loss": 0.2125, + "step": 340120 + }, + { + "epoch": 14.09, + "grad_norm": 0.77734375, + "learning_rate": 0.0002738933970745469, + "loss": 0.1985, + "step": 340130 + }, + { + "epoch": 14.09, + "grad_norm": 0.54296875, + "learning_rate": 0.00027388260150257066, + "loss": 0.195, + "step": 340140 + }, + { + "epoch": 14.09, + "grad_norm": 1.046875, + "learning_rate": 0.0002738718058856499, + "loss": 0.1828, + "step": 340150 + }, + { + "epoch": 14.09, + "grad_norm": 0.6484375, + "learning_rate": 0.00027386101022380506, + "loss": 0.2106, + "step": 340160 + }, + { + "epoch": 14.09, + "grad_norm": 0.5859375, + "learning_rate": 0.0002738502145170562, + "loss": 0.1967, + "step": 340170 + }, + { + "epoch": 14.09, + "grad_norm": 0.490234375, + "learning_rate": 0.0002738394187654239, + "loss": 0.2011, + "step": 340180 + }, + { + "epoch": 14.09, + "grad_norm": 0.41015625, + "learning_rate": 0.0002738286229689285, + "loss": 0.1941, + "step": 340190 + }, + { + "epoch": 14.09, + "grad_norm": 0.55859375, + "learning_rate": 0.0002738178271275901, + "loss": 0.214, + "step": 340200 + }, + { + "epoch": 14.09, + "grad_norm": 1.1015625, + "learning_rate": 0.00027380703124142914, + "loss": 0.1816, + "step": 340210 + }, + { + "epoch": 14.09, + "grad_norm": 1.09375, + "learning_rate": 0.00027379623531046586, + "loss": 0.2087, + "step": 340220 + }, + { + "epoch": 14.09, + "grad_norm": 0.0, + "learning_rate": 0.00027378543933472064, + "loss": 0.2105, + "step": 340230 + }, + { + "epoch": 14.09, + "grad_norm": 0.62890625, + "learning_rate": 0.0002737746433142139, + "loss": 0.2465, + "step": 340240 + }, + { + "epoch": 14.09, + "grad_norm": 0.73828125, + "learning_rate": 0.00027376384724896576, + "loss": 0.1611, + "step": 340250 + }, + { + "epoch": 14.09, + "grad_norm": 0.9765625, + "learning_rate": 0.0002737530511389967, + "loss": 0.2, + "step": 340260 + }, + { + "epoch": 14.09, + "grad_norm": 1.546875, + "learning_rate": 0.00027374225498432694, + "loss": 0.2045, + "step": 340270 + }, + { + "epoch": 14.09, + "grad_norm": 0.40625, + "learning_rate": 0.00027373145878497674, + "loss": 0.1374, + "step": 340280 + }, + { + "epoch": 14.09, + "grad_norm": 0.44921875, + "learning_rate": 0.00027372066254096656, + "loss": 0.1799, + "step": 340290 + }, + { + "epoch": 14.1, + "grad_norm": 0.404296875, + "learning_rate": 0.00027370986625231664, + "loss": 0.2508, + "step": 340300 + }, + { + "epoch": 14.1, + "grad_norm": 1.6484375, + "learning_rate": 0.0002736990699190473, + "loss": 0.1679, + "step": 340310 + }, + { + "epoch": 14.1, + "grad_norm": 1.46875, + "learning_rate": 0.0002736882735411789, + "loss": 0.1764, + "step": 340320 + }, + { + "epoch": 14.1, + "grad_norm": 0.0, + "learning_rate": 0.0002736774771187317, + "loss": 0.1501, + "step": 340330 + }, + { + "epoch": 14.1, + "grad_norm": 0.8203125, + "learning_rate": 0.0002736666806517261, + "loss": 0.1957, + "step": 340340 + }, + { + "epoch": 14.1, + "grad_norm": 1.3359375, + "learning_rate": 0.0002736558841401824, + "loss": 0.1803, + "step": 340350 + }, + { + "epoch": 14.1, + "grad_norm": 1.0859375, + "learning_rate": 0.0002736450875841207, + "loss": 0.1544, + "step": 340360 + }, + { + "epoch": 14.1, + "grad_norm": 0.51171875, + "learning_rate": 0.0002736342909835616, + "loss": 0.2114, + "step": 340370 + }, + { + "epoch": 14.1, + "grad_norm": 0.67578125, + "learning_rate": 0.0002736234943385253, + "loss": 0.1281, + "step": 340380 + }, + { + "epoch": 14.1, + "grad_norm": 1.0625, + "learning_rate": 0.0002736126976490322, + "loss": 0.2141, + "step": 340390 + }, + { + "epoch": 14.1, + "grad_norm": 0.84765625, + "learning_rate": 0.0002736019009151025, + "loss": 0.1897, + "step": 340400 + }, + { + "epoch": 14.1, + "grad_norm": 1.484375, + "learning_rate": 0.0002735911041367565, + "loss": 0.2035, + "step": 340410 + }, + { + "epoch": 14.1, + "grad_norm": 0.55078125, + "learning_rate": 0.0002735803073140147, + "loss": 0.252, + "step": 340420 + }, + { + "epoch": 14.1, + "grad_norm": 0.8828125, + "learning_rate": 0.00027356951044689735, + "loss": 0.1719, + "step": 340430 + }, + { + "epoch": 14.1, + "grad_norm": 1.6171875, + "learning_rate": 0.0002735587135354246, + "loss": 0.1928, + "step": 340440 + }, + { + "epoch": 14.1, + "grad_norm": 0.48828125, + "learning_rate": 0.00027354791657961696, + "loss": 0.1948, + "step": 340450 + }, + { + "epoch": 14.1, + "grad_norm": 1.125, + "learning_rate": 0.00027353711957949466, + "loss": 0.1881, + "step": 340460 + }, + { + "epoch": 14.1, + "grad_norm": 1.03125, + "learning_rate": 0.0002735263225350781, + "loss": 0.1801, + "step": 340470 + }, + { + "epoch": 14.1, + "grad_norm": 0.66015625, + "learning_rate": 0.0002735155254463875, + "loss": 0.2, + "step": 340480 + }, + { + "epoch": 14.1, + "grad_norm": 0.625, + "learning_rate": 0.0002735047283134431, + "loss": 0.1409, + "step": 340490 + }, + { + "epoch": 14.1, + "grad_norm": 0.8046875, + "learning_rate": 0.00027349393113626556, + "loss": 0.1801, + "step": 340500 + }, + { + "epoch": 14.1, + "grad_norm": 0.98828125, + "learning_rate": 0.00027348313391487483, + "loss": 0.2364, + "step": 340510 + }, + { + "epoch": 14.1, + "grad_norm": 1.2265625, + "learning_rate": 0.0002734723366492914, + "loss": 0.1413, + "step": 340520 + }, + { + "epoch": 14.1, + "grad_norm": 0.50390625, + "learning_rate": 0.00027346153933953567, + "loss": 0.1827, + "step": 340530 + }, + { + "epoch": 14.11, + "grad_norm": 1.0625, + "learning_rate": 0.00027345074198562773, + "loss": 0.2007, + "step": 340540 + }, + { + "epoch": 14.11, + "grad_norm": 1.140625, + "learning_rate": 0.0002734399445875881, + "loss": 0.1565, + "step": 340550 + }, + { + "epoch": 14.11, + "grad_norm": 0.353515625, + "learning_rate": 0.00027342914714543706, + "loss": 0.2006, + "step": 340560 + }, + { + "epoch": 14.11, + "grad_norm": 0.32421875, + "learning_rate": 0.00027341834965919476, + "loss": 0.1484, + "step": 340570 + }, + { + "epoch": 14.11, + "grad_norm": 0.875, + "learning_rate": 0.00027340755212888176, + "loss": 0.1217, + "step": 340580 + }, + { + "epoch": 14.11, + "grad_norm": 0.85546875, + "learning_rate": 0.00027339675455451823, + "loss": 0.2358, + "step": 340590 + }, + { + "epoch": 14.11, + "grad_norm": 0.73828125, + "learning_rate": 0.00027338595693612454, + "loss": 0.1029, + "step": 340600 + }, + { + "epoch": 14.11, + "grad_norm": 0.6640625, + "learning_rate": 0.000273375159273721, + "loss": 0.1633, + "step": 340610 + }, + { + "epoch": 14.11, + "grad_norm": 1.984375, + "learning_rate": 0.00027336436156732795, + "loss": 0.2201, + "step": 340620 + }, + { + "epoch": 14.11, + "grad_norm": 0.93359375, + "learning_rate": 0.0002733535638169657, + "loss": 0.1519, + "step": 340630 + }, + { + "epoch": 14.11, + "grad_norm": 0.58203125, + "learning_rate": 0.00027334276602265464, + "loss": 0.1962, + "step": 340640 + }, + { + "epoch": 14.11, + "grad_norm": 0.66796875, + "learning_rate": 0.0002733319681844148, + "loss": 0.1408, + "step": 340650 + }, + { + "epoch": 14.11, + "grad_norm": 1.2578125, + "learning_rate": 0.0002733211703022669, + "loss": 0.2069, + "step": 340660 + }, + { + "epoch": 14.11, + "grad_norm": 0.7421875, + "learning_rate": 0.000273310372376231, + "loss": 0.2017, + "step": 340670 + }, + { + "epoch": 14.11, + "grad_norm": 0.546875, + "learning_rate": 0.00027329957440632753, + "loss": 0.2108, + "step": 340680 + }, + { + "epoch": 14.11, + "grad_norm": 3.5, + "learning_rate": 0.0002732887763925768, + "loss": 0.1883, + "step": 340690 + }, + { + "epoch": 14.11, + "grad_norm": 1.1875, + "learning_rate": 0.00027327797833499905, + "loss": 0.2433, + "step": 340700 + }, + { + "epoch": 14.11, + "grad_norm": 0.84765625, + "learning_rate": 0.0002732671802336147, + "loss": 0.2218, + "step": 340710 + }, + { + "epoch": 14.11, + "grad_norm": 0.93359375, + "learning_rate": 0.000273256382088444, + "loss": 0.1643, + "step": 340720 + }, + { + "epoch": 14.11, + "grad_norm": 0.82421875, + "learning_rate": 0.0002732455838995073, + "loss": 0.189, + "step": 340730 + }, + { + "epoch": 14.11, + "grad_norm": 0.59375, + "learning_rate": 0.00027323478566682497, + "loss": 0.1938, + "step": 340740 + }, + { + "epoch": 14.11, + "grad_norm": 0.640625, + "learning_rate": 0.0002732239873904172, + "loss": 0.204, + "step": 340750 + }, + { + "epoch": 14.11, + "grad_norm": 0.91796875, + "learning_rate": 0.00027321318907030446, + "loss": 0.1891, + "step": 340760 + }, + { + "epoch": 14.11, + "grad_norm": 0.55078125, + "learning_rate": 0.00027320239070650703, + "loss": 0.1761, + "step": 340770 + }, + { + "epoch": 14.12, + "grad_norm": 1.625, + "learning_rate": 0.00027319159229904514, + "loss": 0.201, + "step": 340780 + }, + { + "epoch": 14.12, + "grad_norm": 0.7890625, + "learning_rate": 0.0002731807938479392, + "loss": 0.2016, + "step": 340790 + }, + { + "epoch": 14.12, + "grad_norm": 1.484375, + "learning_rate": 0.00027316999535320944, + "loss": 0.2201, + "step": 340800 + }, + { + "epoch": 14.12, + "grad_norm": 0.2119140625, + "learning_rate": 0.00027315919681487633, + "loss": 0.223, + "step": 340810 + }, + { + "epoch": 14.12, + "grad_norm": 0.9765625, + "learning_rate": 0.0002731483982329602, + "loss": 0.1403, + "step": 340820 + }, + { + "epoch": 14.12, + "grad_norm": 0.5234375, + "learning_rate": 0.00027313759960748114, + "loss": 0.1619, + "step": 340830 + }, + { + "epoch": 14.12, + "grad_norm": 1.34375, + "learning_rate": 0.00027312680093845965, + "loss": 0.1728, + "step": 340840 + }, + { + "epoch": 14.12, + "grad_norm": 0.828125, + "learning_rate": 0.000273116002225916, + "loss": 0.2047, + "step": 340850 + }, + { + "epoch": 14.12, + "grad_norm": 0.71875, + "learning_rate": 0.0002731052034698706, + "loss": 0.2425, + "step": 340860 + }, + { + "epoch": 14.12, + "grad_norm": 1.7734375, + "learning_rate": 0.0002730944046703437, + "loss": 0.2013, + "step": 340870 + }, + { + "epoch": 14.12, + "grad_norm": 0.353515625, + "learning_rate": 0.00027308360582735554, + "loss": 0.2381, + "step": 340880 + }, + { + "epoch": 14.12, + "grad_norm": 1.0546875, + "learning_rate": 0.0002730728069409266, + "loss": 0.1767, + "step": 340890 + }, + { + "epoch": 14.12, + "grad_norm": 0.73828125, + "learning_rate": 0.0002730620080110771, + "loss": 0.1861, + "step": 340900 + }, + { + "epoch": 14.12, + "grad_norm": 0.640625, + "learning_rate": 0.00027305120903782737, + "loss": 0.1728, + "step": 340910 + }, + { + "epoch": 14.12, + "grad_norm": 0.23046875, + "learning_rate": 0.00027304041002119777, + "loss": 0.1688, + "step": 340920 + }, + { + "epoch": 14.12, + "grad_norm": 1.59375, + "learning_rate": 0.0002730296109612086, + "loss": 0.192, + "step": 340930 + }, + { + "epoch": 14.12, + "grad_norm": 0.7265625, + "learning_rate": 0.0002730188118578802, + "loss": 0.2117, + "step": 340940 + }, + { + "epoch": 14.12, + "grad_norm": 0.55078125, + "learning_rate": 0.00027300801271123293, + "loss": 0.1422, + "step": 340950 + }, + { + "epoch": 14.12, + "grad_norm": 0.9140625, + "learning_rate": 0.00027299721352128705, + "loss": 0.2236, + "step": 340960 + }, + { + "epoch": 14.12, + "grad_norm": 0.4453125, + "learning_rate": 0.0002729864142880628, + "loss": 0.2084, + "step": 340970 + }, + { + "epoch": 14.12, + "grad_norm": 0.91015625, + "learning_rate": 0.00027297561501158073, + "loss": 0.1663, + "step": 340980 + }, + { + "epoch": 14.12, + "grad_norm": 1.8046875, + "learning_rate": 0.00027296481569186095, + "loss": 0.1822, + "step": 340990 + }, + { + "epoch": 14.12, + "grad_norm": 1.0546875, + "learning_rate": 0.00027295401632892384, + "loss": 0.1783, + "step": 341000 + }, + { + "epoch": 14.12, + "grad_norm": 2.15625, + "learning_rate": 0.00027294321692278984, + "loss": 0.2289, + "step": 341010 + }, + { + "epoch": 14.13, + "grad_norm": 1.171875, + "learning_rate": 0.00027293241747347916, + "loss": 0.2091, + "step": 341020 + }, + { + "epoch": 14.13, + "grad_norm": 1.15625, + "learning_rate": 0.0002729216179810121, + "loss": 0.1394, + "step": 341030 + }, + { + "epoch": 14.13, + "grad_norm": 1.109375, + "learning_rate": 0.0002729108184454091, + "loss": 0.1853, + "step": 341040 + }, + { + "epoch": 14.13, + "grad_norm": 1.3671875, + "learning_rate": 0.00027290001886669035, + "loss": 0.2034, + "step": 341050 + }, + { + "epoch": 14.13, + "grad_norm": 0.5234375, + "learning_rate": 0.0002728892192448763, + "loss": 0.1797, + "step": 341060 + }, + { + "epoch": 14.13, + "grad_norm": 1.0625, + "learning_rate": 0.0002728784195799872, + "loss": 0.2285, + "step": 341070 + }, + { + "epoch": 14.13, + "grad_norm": 1.2265625, + "learning_rate": 0.0002728676198720433, + "loss": 0.1679, + "step": 341080 + }, + { + "epoch": 14.13, + "grad_norm": 0.80078125, + "learning_rate": 0.00027285682012106506, + "loss": 0.1817, + "step": 341090 + }, + { + "epoch": 14.13, + "grad_norm": 0.77734375, + "learning_rate": 0.00027284602032707274, + "loss": 0.1609, + "step": 341100 + }, + { + "epoch": 14.13, + "grad_norm": 0.87109375, + "learning_rate": 0.0002728352204900867, + "loss": 0.2239, + "step": 341110 + }, + { + "epoch": 14.13, + "grad_norm": 2.25, + "learning_rate": 0.00027282442061012726, + "loss": 0.1733, + "step": 341120 + }, + { + "epoch": 14.13, + "grad_norm": 1.40625, + "learning_rate": 0.00027281362068721474, + "loss": 0.1463, + "step": 341130 + }, + { + "epoch": 14.13, + "grad_norm": 0.69140625, + "learning_rate": 0.0002728028207213694, + "loss": 0.2024, + "step": 341140 + }, + { + "epoch": 14.13, + "grad_norm": 0.80859375, + "learning_rate": 0.0002727920207126116, + "loss": 0.2116, + "step": 341150 + }, + { + "epoch": 14.13, + "grad_norm": 0.4375, + "learning_rate": 0.0002727812206609617, + "loss": 0.1948, + "step": 341160 + }, + { + "epoch": 14.13, + "grad_norm": 0.6171875, + "learning_rate": 0.00027277042056644005, + "loss": 0.2098, + "step": 341170 + }, + { + "epoch": 14.13, + "grad_norm": 0.890625, + "learning_rate": 0.0002727596204290668, + "loss": 0.1531, + "step": 341180 + }, + { + "epoch": 14.13, + "grad_norm": 0.5234375, + "learning_rate": 0.00027274882024886255, + "loss": 0.1424, + "step": 341190 + }, + { + "epoch": 14.13, + "grad_norm": 1.4765625, + "learning_rate": 0.0002727380200258474, + "loss": 0.1852, + "step": 341200 + }, + { + "epoch": 14.13, + "grad_norm": 0.439453125, + "learning_rate": 0.00027272721976004177, + "loss": 0.2167, + "step": 341210 + }, + { + "epoch": 14.13, + "grad_norm": 0.357421875, + "learning_rate": 0.0002727164194514659, + "loss": 0.1982, + "step": 341220 + }, + { + "epoch": 14.13, + "grad_norm": 0.72265625, + "learning_rate": 0.0002727056191001403, + "loss": 0.1704, + "step": 341230 + }, + { + "epoch": 14.13, + "grad_norm": 1.484375, + "learning_rate": 0.00027269481870608516, + "loss": 0.1793, + "step": 341240 + }, + { + "epoch": 14.13, + "grad_norm": 1.2265625, + "learning_rate": 0.0002726840182693207, + "loss": 0.1723, + "step": 341250 + }, + { + "epoch": 14.13, + "grad_norm": 0.98046875, + "learning_rate": 0.0002726732177898675, + "loss": 0.1987, + "step": 341260 + }, + { + "epoch": 14.14, + "grad_norm": 0.5625, + "learning_rate": 0.00027266241726774565, + "loss": 0.1755, + "step": 341270 + }, + { + "epoch": 14.14, + "grad_norm": 1.078125, + "learning_rate": 0.0002726516167029757, + "loss": 0.2191, + "step": 341280 + }, + { + "epoch": 14.14, + "grad_norm": 1.3359375, + "learning_rate": 0.00027264081609557777, + "loss": 0.1653, + "step": 341290 + }, + { + "epoch": 14.14, + "grad_norm": 0.75390625, + "learning_rate": 0.0002726300154455723, + "loss": 0.1924, + "step": 341300 + }, + { + "epoch": 14.14, + "grad_norm": 0.380859375, + "learning_rate": 0.00027261921475297955, + "loss": 0.169, + "step": 341310 + }, + { + "epoch": 14.14, + "grad_norm": 1.609375, + "learning_rate": 0.0002726084140178199, + "loss": 0.2195, + "step": 341320 + }, + { + "epoch": 14.14, + "grad_norm": 0.75390625, + "learning_rate": 0.00027259761324011366, + "loss": 0.206, + "step": 341330 + }, + { + "epoch": 14.14, + "grad_norm": 1.234375, + "learning_rate": 0.00027258681241988116, + "loss": 0.1725, + "step": 341340 + }, + { + "epoch": 14.14, + "grad_norm": 0.97265625, + "learning_rate": 0.0002725760115571426, + "loss": 0.203, + "step": 341350 + }, + { + "epoch": 14.14, + "grad_norm": 0.97265625, + "learning_rate": 0.00027256521065191865, + "loss": 0.2319, + "step": 341360 + }, + { + "epoch": 14.14, + "grad_norm": 1.8671875, + "learning_rate": 0.00027255440970422925, + "loss": 0.1798, + "step": 341370 + }, + { + "epoch": 14.14, + "grad_norm": 0.86328125, + "learning_rate": 0.0002725436087140949, + "loss": 0.1919, + "step": 341380 + }, + { + "epoch": 14.14, + "grad_norm": 1.9453125, + "learning_rate": 0.000272532807681536, + "loss": 0.1943, + "step": 341390 + }, + { + "epoch": 14.14, + "grad_norm": 0.8515625, + "learning_rate": 0.00027252200660657274, + "loss": 0.1942, + "step": 341400 + }, + { + "epoch": 14.14, + "grad_norm": 1.90625, + "learning_rate": 0.00027251120548922543, + "loss": 0.1726, + "step": 341410 + }, + { + "epoch": 14.14, + "grad_norm": 1.2265625, + "learning_rate": 0.0002725004043295145, + "loss": 0.1982, + "step": 341420 + }, + { + "epoch": 14.14, + "grad_norm": 0.466796875, + "learning_rate": 0.00027248960312746025, + "loss": 0.1972, + "step": 341430 + }, + { + "epoch": 14.14, + "grad_norm": 0.69921875, + "learning_rate": 0.00027247880188308304, + "loss": 0.1744, + "step": 341440 + }, + { + "epoch": 14.14, + "grad_norm": 0.56640625, + "learning_rate": 0.0002724680005964031, + "loss": 0.179, + "step": 341450 + }, + { + "epoch": 14.14, + "grad_norm": 0.99609375, + "learning_rate": 0.00027245719926744086, + "loss": 0.2022, + "step": 341460 + }, + { + "epoch": 14.14, + "grad_norm": 1.125, + "learning_rate": 0.00027244639789621654, + "loss": 0.1469, + "step": 341470 + }, + { + "epoch": 14.14, + "grad_norm": 0.75, + "learning_rate": 0.0002724355964827505, + "loss": 0.1331, + "step": 341480 + }, + { + "epoch": 14.14, + "grad_norm": 0.76171875, + "learning_rate": 0.0002724247950270632, + "loss": 0.1727, + "step": 341490 + }, + { + "epoch": 14.14, + "grad_norm": 1.0390625, + "learning_rate": 0.0002724139935291747, + "loss": 0.1496, + "step": 341500 + }, + { + "epoch": 14.15, + "grad_norm": 0.41015625, + "learning_rate": 0.0002724031919891056, + "loss": 0.1841, + "step": 341510 + }, + { + "epoch": 14.15, + "grad_norm": 1.5078125, + "learning_rate": 0.00027239239040687614, + "loss": 0.2174, + "step": 341520 + }, + { + "epoch": 14.15, + "grad_norm": 0.72265625, + "learning_rate": 0.00027238158878250656, + "loss": 0.1977, + "step": 341530 + }, + { + "epoch": 14.15, + "grad_norm": 2.734375, + "learning_rate": 0.00027237078711601724, + "loss": 0.1835, + "step": 341540 + }, + { + "epoch": 14.15, + "grad_norm": 0.478515625, + "learning_rate": 0.0002723599854074286, + "loss": 0.2673, + "step": 341550 + }, + { + "epoch": 14.15, + "grad_norm": 3.4375, + "learning_rate": 0.0002723491836567607, + "loss": 0.2491, + "step": 341560 + }, + { + "epoch": 14.15, + "grad_norm": 0.55078125, + "learning_rate": 0.0002723383818640342, + "loss": 0.1744, + "step": 341570 + }, + { + "epoch": 14.15, + "grad_norm": 0.8515625, + "learning_rate": 0.00027232758002926924, + "loss": 0.1743, + "step": 341580 + }, + { + "epoch": 14.15, + "grad_norm": 1.0703125, + "learning_rate": 0.00027231677815248617, + "loss": 0.1614, + "step": 341590 + }, + { + "epoch": 14.15, + "grad_norm": 0.72265625, + "learning_rate": 0.0002723059762337054, + "loss": 0.1828, + "step": 341600 + }, + { + "epoch": 14.15, + "grad_norm": 0.8515625, + "learning_rate": 0.000272295174272947, + "loss": 0.1652, + "step": 341610 + }, + { + "epoch": 14.15, + "grad_norm": 0.82421875, + "learning_rate": 0.00027228437227023176, + "loss": 0.2075, + "step": 341620 + }, + { + "epoch": 14.15, + "grad_norm": 0.439453125, + "learning_rate": 0.0002722735702255796, + "loss": 0.1591, + "step": 341630 + }, + { + "epoch": 14.15, + "grad_norm": 0.62890625, + "learning_rate": 0.0002722627681390109, + "loss": 0.1682, + "step": 341640 + }, + { + "epoch": 14.15, + "grad_norm": 0.443359375, + "learning_rate": 0.00027225196601054623, + "loss": 0.2027, + "step": 341650 + }, + { + "epoch": 14.15, + "grad_norm": 2.28125, + "learning_rate": 0.0002722411638402057, + "loss": 0.1793, + "step": 341660 + }, + { + "epoch": 14.15, + "grad_norm": 0.77734375, + "learning_rate": 0.0002722303616280097, + "loss": 0.1994, + "step": 341670 + }, + { + "epoch": 14.15, + "grad_norm": 0.796875, + "learning_rate": 0.0002722195593739786, + "loss": 0.174, + "step": 341680 + }, + { + "epoch": 14.15, + "grad_norm": 0.69921875, + "learning_rate": 0.00027220875707813257, + "loss": 0.1842, + "step": 341690 + }, + { + "epoch": 14.15, + "grad_norm": 0.96484375, + "learning_rate": 0.0002721979547404922, + "loss": 0.1675, + "step": 341700 + }, + { + "epoch": 14.15, + "grad_norm": 0.640625, + "learning_rate": 0.0002721871523610776, + "loss": 0.1854, + "step": 341710 + }, + { + "epoch": 14.15, + "grad_norm": 0.91796875, + "learning_rate": 0.00027217634993990914, + "loss": 0.1992, + "step": 341720 + }, + { + "epoch": 14.15, + "grad_norm": 0.55859375, + "learning_rate": 0.0002721655474770073, + "loss": 0.164, + "step": 341730 + }, + { + "epoch": 14.15, + "grad_norm": 0.51171875, + "learning_rate": 0.0002721547449723922, + "loss": 0.1953, + "step": 341740 + }, + { + "epoch": 14.16, + "grad_norm": 0.56640625, + "learning_rate": 0.00027214394242608425, + "loss": 0.1794, + "step": 341750 + }, + { + "epoch": 14.16, + "grad_norm": 0.470703125, + "learning_rate": 0.0002721331398381039, + "loss": 0.2124, + "step": 341760 + }, + { + "epoch": 14.16, + "grad_norm": 1.2421875, + "learning_rate": 0.00027212233720847124, + "loss": 0.1716, + "step": 341770 + }, + { + "epoch": 14.16, + "grad_norm": 0.9140625, + "learning_rate": 0.0002721115345372068, + "loss": 0.2069, + "step": 341780 + }, + { + "epoch": 14.16, + "grad_norm": 1.265625, + "learning_rate": 0.00027210073182433084, + "loss": 0.176, + "step": 341790 + }, + { + "epoch": 14.16, + "grad_norm": 0.87890625, + "learning_rate": 0.0002720899290698637, + "loss": 0.2429, + "step": 341800 + }, + { + "epoch": 14.16, + "grad_norm": 1.484375, + "learning_rate": 0.0002720791262738257, + "loss": 0.1986, + "step": 341810 + }, + { + "epoch": 14.16, + "grad_norm": 0.5390625, + "learning_rate": 0.0002720683234362371, + "loss": 0.2215, + "step": 341820 + }, + { + "epoch": 14.16, + "grad_norm": 0.7421875, + "learning_rate": 0.0002720575205571184, + "loss": 0.1858, + "step": 341830 + }, + { + "epoch": 14.16, + "grad_norm": 0.80859375, + "learning_rate": 0.00027204671763648975, + "loss": 0.1641, + "step": 341840 + }, + { + "epoch": 14.16, + "grad_norm": 1.4609375, + "learning_rate": 0.00027203591467437157, + "loss": 0.1943, + "step": 341850 + }, + { + "epoch": 14.16, + "grad_norm": 0.89453125, + "learning_rate": 0.0002720251116707842, + "loss": 0.1515, + "step": 341860 + }, + { + "epoch": 14.16, + "grad_norm": 1.015625, + "learning_rate": 0.000272014308625748, + "loss": 0.2315, + "step": 341870 + }, + { + "epoch": 14.16, + "grad_norm": 0.96875, + "learning_rate": 0.0002720035055392831, + "loss": 0.1702, + "step": 341880 + }, + { + "epoch": 14.16, + "grad_norm": 0.90234375, + "learning_rate": 0.0002719927024114101, + "loss": 0.2224, + "step": 341890 + }, + { + "epoch": 14.16, + "grad_norm": 0.0, + "learning_rate": 0.0002719818992421492, + "loss": 0.2331, + "step": 341900 + }, + { + "epoch": 14.16, + "grad_norm": 0.99609375, + "learning_rate": 0.0002719710960315207, + "loss": 0.1507, + "step": 341910 + }, + { + "epoch": 14.16, + "grad_norm": 0.224609375, + "learning_rate": 0.000271960292779545, + "loss": 0.1691, + "step": 341920 + }, + { + "epoch": 14.16, + "grad_norm": 0.4140625, + "learning_rate": 0.00027194948948624235, + "loss": 0.2017, + "step": 341930 + }, + { + "epoch": 14.16, + "grad_norm": 0.5859375, + "learning_rate": 0.0002719386861516332, + "loss": 0.2062, + "step": 341940 + }, + { + "epoch": 14.16, + "grad_norm": 0.734375, + "learning_rate": 0.0002719278827757378, + "loss": 0.1814, + "step": 341950 + }, + { + "epoch": 14.16, + "grad_norm": 0.58984375, + "learning_rate": 0.0002719170793585764, + "loss": 0.1372, + "step": 341960 + }, + { + "epoch": 14.16, + "grad_norm": 0.90234375, + "learning_rate": 0.00027190627590016955, + "loss": 0.2301, + "step": 341970 + }, + { + "epoch": 14.16, + "grad_norm": 0.65625, + "learning_rate": 0.00027189547240053733, + "loss": 0.1439, + "step": 341980 + }, + { + "epoch": 14.17, + "grad_norm": 0.76171875, + "learning_rate": 0.00027188466885970033, + "loss": 0.1561, + "step": 341990 + }, + { + "epoch": 14.17, + "grad_norm": 0.365234375, + "learning_rate": 0.0002718738652776787, + "loss": 0.215, + "step": 342000 + }, + { + "epoch": 14.17, + "grad_norm": 0.62890625, + "learning_rate": 0.0002718630616544927, + "loss": 0.1787, + "step": 342010 + }, + { + "epoch": 14.17, + "grad_norm": 0.91015625, + "learning_rate": 0.0002718522579901629, + "loss": 0.1451, + "step": 342020 + }, + { + "epoch": 14.17, + "grad_norm": 1.734375, + "learning_rate": 0.0002718414542847095, + "loss": 0.1874, + "step": 342030 + }, + { + "epoch": 14.17, + "grad_norm": 0.5859375, + "learning_rate": 0.00027183065053815284, + "loss": 0.2098, + "step": 342040 + }, + { + "epoch": 14.17, + "grad_norm": 0.9296875, + "learning_rate": 0.0002718198467505133, + "loss": 0.1778, + "step": 342050 + }, + { + "epoch": 14.17, + "grad_norm": 2.015625, + "learning_rate": 0.0002718090429218111, + "loss": 0.2545, + "step": 342060 + }, + { + "epoch": 14.17, + "grad_norm": 0.44140625, + "learning_rate": 0.0002717982390520666, + "loss": 0.2474, + "step": 342070 + }, + { + "epoch": 14.17, + "grad_norm": 0.875, + "learning_rate": 0.0002717874351413003, + "loss": 0.2038, + "step": 342080 + }, + { + "epoch": 14.17, + "grad_norm": 0.953125, + "learning_rate": 0.0002717766311895322, + "loss": 0.1908, + "step": 342090 + }, + { + "epoch": 14.17, + "grad_norm": 1.03125, + "learning_rate": 0.000271765827196783, + "loss": 0.1855, + "step": 342100 + }, + { + "epoch": 14.17, + "grad_norm": 1.875, + "learning_rate": 0.0002717550231630728, + "loss": 0.1922, + "step": 342110 + }, + { + "epoch": 14.17, + "grad_norm": 2.03125, + "learning_rate": 0.000271744219088422, + "loss": 0.1707, + "step": 342120 + }, + { + "epoch": 14.17, + "grad_norm": 0.9609375, + "learning_rate": 0.00027173341497285097, + "loss": 0.1775, + "step": 342130 + }, + { + "epoch": 14.17, + "grad_norm": 0.291015625, + "learning_rate": 0.00027172261081637996, + "loss": 0.165, + "step": 342140 + }, + { + "epoch": 14.17, + "grad_norm": 0.609375, + "learning_rate": 0.00027171180661902933, + "loss": 0.1979, + "step": 342150 + }, + { + "epoch": 14.17, + "grad_norm": 2.90625, + "learning_rate": 0.00027170100238081945, + "loss": 0.2218, + "step": 342160 + }, + { + "epoch": 14.17, + "grad_norm": 1.28125, + "learning_rate": 0.00027169019810177055, + "loss": 0.196, + "step": 342170 + }, + { + "epoch": 14.17, + "grad_norm": 1.0546875, + "learning_rate": 0.0002716793937819032, + "loss": 0.18, + "step": 342180 + }, + { + "epoch": 14.17, + "grad_norm": 0.6484375, + "learning_rate": 0.0002716685894212374, + "loss": 0.1674, + "step": 342190 + }, + { + "epoch": 14.17, + "grad_norm": 0.8125, + "learning_rate": 0.00027165778501979376, + "loss": 0.1631, + "step": 342200 + }, + { + "epoch": 14.17, + "grad_norm": 1.15625, + "learning_rate": 0.0002716469805775925, + "loss": 0.222, + "step": 342210 + }, + { + "epoch": 14.17, + "grad_norm": 0.83203125, + "learning_rate": 0.0002716361760946539, + "loss": 0.2117, + "step": 342220 + }, + { + "epoch": 14.18, + "grad_norm": 1.109375, + "learning_rate": 0.0002716253715709984, + "loss": 0.1936, + "step": 342230 + }, + { + "epoch": 14.18, + "grad_norm": 0.99609375, + "learning_rate": 0.00027161456700664627, + "loss": 0.1831, + "step": 342240 + }, + { + "epoch": 14.18, + "grad_norm": 0.66015625, + "learning_rate": 0.0002716037624016179, + "loss": 0.1501, + "step": 342250 + }, + { + "epoch": 14.18, + "grad_norm": 1.03125, + "learning_rate": 0.00027159295775593356, + "loss": 0.13, + "step": 342260 + }, + { + "epoch": 14.18, + "grad_norm": 0.51171875, + "learning_rate": 0.0002715821530696136, + "loss": 0.1517, + "step": 342270 + }, + { + "epoch": 14.18, + "grad_norm": 1.390625, + "learning_rate": 0.00027157134834267834, + "loss": 0.155, + "step": 342280 + }, + { + "epoch": 14.18, + "grad_norm": 0.6328125, + "learning_rate": 0.0002715605435751481, + "loss": 0.1862, + "step": 342290 + }, + { + "epoch": 14.18, + "grad_norm": 0.89453125, + "learning_rate": 0.00027154973876704336, + "loss": 0.2031, + "step": 342300 + }, + { + "epoch": 14.18, + "grad_norm": 1.109375, + "learning_rate": 0.00027153893391838424, + "loss": 0.1917, + "step": 342310 + }, + { + "epoch": 14.18, + "grad_norm": 1.1875, + "learning_rate": 0.00027152812902919124, + "loss": 0.2032, + "step": 342320 + }, + { + "epoch": 14.18, + "grad_norm": 0.4921875, + "learning_rate": 0.00027151732409948454, + "loss": 0.187, + "step": 342330 + }, + { + "epoch": 14.18, + "grad_norm": 0.7421875, + "learning_rate": 0.00027150651912928467, + "loss": 0.1543, + "step": 342340 + }, + { + "epoch": 14.18, + "grad_norm": 0.6171875, + "learning_rate": 0.00027149571411861175, + "loss": 0.1616, + "step": 342350 + }, + { + "epoch": 14.18, + "grad_norm": 0.78515625, + "learning_rate": 0.0002714849090674862, + "loss": 0.1602, + "step": 342360 + }, + { + "epoch": 14.18, + "grad_norm": 0.6796875, + "learning_rate": 0.00027147410397592846, + "loss": 0.2157, + "step": 342370 + }, + { + "epoch": 14.18, + "grad_norm": 0.5234375, + "learning_rate": 0.00027146329884395875, + "loss": 0.1789, + "step": 342380 + }, + { + "epoch": 14.18, + "grad_norm": 0.63671875, + "learning_rate": 0.00027145249367159746, + "loss": 0.1606, + "step": 342390 + }, + { + "epoch": 14.18, + "grad_norm": 0.279296875, + "learning_rate": 0.0002714416884588649, + "loss": 0.1699, + "step": 342400 + }, + { + "epoch": 14.18, + "grad_norm": 0.322265625, + "learning_rate": 0.0002714308832057813, + "loss": 0.1349, + "step": 342410 + }, + { + "epoch": 14.18, + "grad_norm": 1.875, + "learning_rate": 0.0002714200779123672, + "loss": 0.2051, + "step": 342420 + }, + { + "epoch": 14.18, + "grad_norm": 0.6875, + "learning_rate": 0.0002714092725786427, + "loss": 0.1572, + "step": 342430 + }, + { + "epoch": 14.18, + "grad_norm": 1.375, + "learning_rate": 0.00027139846720462834, + "loss": 0.1524, + "step": 342440 + }, + { + "epoch": 14.18, + "grad_norm": 0.9765625, + "learning_rate": 0.00027138766179034436, + "loss": 0.1996, + "step": 342450 + }, + { + "epoch": 14.18, + "grad_norm": 1.5703125, + "learning_rate": 0.00027137685633581117, + "loss": 0.209, + "step": 342460 + }, + { + "epoch": 14.19, + "grad_norm": 0.671875, + "learning_rate": 0.00027136605084104897, + "loss": 0.1949, + "step": 342470 + }, + { + "epoch": 14.19, + "grad_norm": 0.43359375, + "learning_rate": 0.00027135524530607816, + "loss": 0.1849, + "step": 342480 + }, + { + "epoch": 14.19, + "grad_norm": 0.58203125, + "learning_rate": 0.0002713444397309192, + "loss": 0.1854, + "step": 342490 + }, + { + "epoch": 14.19, + "grad_norm": 2.15625, + "learning_rate": 0.0002713336341155922, + "loss": 0.2258, + "step": 342500 + }, + { + "epoch": 14.19, + "grad_norm": 0.189453125, + "learning_rate": 0.0002713228284601176, + "loss": 0.1963, + "step": 342510 + }, + { + "epoch": 14.19, + "grad_norm": 0.68359375, + "learning_rate": 0.0002713120227645158, + "loss": 0.1784, + "step": 342520 + }, + { + "epoch": 14.19, + "grad_norm": 0.9765625, + "learning_rate": 0.00027130121702880703, + "loss": 0.2704, + "step": 342530 + }, + { + "epoch": 14.19, + "grad_norm": 0.478515625, + "learning_rate": 0.0002712904112530117, + "loss": 0.1523, + "step": 342540 + }, + { + "epoch": 14.19, + "grad_norm": 0.51171875, + "learning_rate": 0.00027127960543715015, + "loss": 0.1638, + "step": 342550 + }, + { + "epoch": 14.19, + "grad_norm": 0.984375, + "learning_rate": 0.0002712687995812426, + "loss": 0.1751, + "step": 342560 + }, + { + "epoch": 14.19, + "grad_norm": 1.015625, + "learning_rate": 0.0002712579936853095, + "loss": 0.1889, + "step": 342570 + }, + { + "epoch": 14.19, + "grad_norm": 0.435546875, + "learning_rate": 0.00027124718774937113, + "loss": 0.1404, + "step": 342580 + }, + { + "epoch": 14.19, + "grad_norm": 0.8828125, + "learning_rate": 0.0002712363817734479, + "loss": 0.1722, + "step": 342590 + }, + { + "epoch": 14.19, + "grad_norm": 1.5546875, + "learning_rate": 0.00027122557575756006, + "loss": 0.2064, + "step": 342600 + }, + { + "epoch": 14.19, + "grad_norm": 1.9375, + "learning_rate": 0.00027121476970172795, + "loss": 0.2102, + "step": 342610 + }, + { + "epoch": 14.19, + "grad_norm": 0.349609375, + "learning_rate": 0.00027120396360597206, + "loss": 0.2122, + "step": 342620 + }, + { + "epoch": 14.19, + "grad_norm": 0.94921875, + "learning_rate": 0.0002711931574703125, + "loss": 0.2043, + "step": 342630 + }, + { + "epoch": 14.19, + "grad_norm": 1.2890625, + "learning_rate": 0.00027118235129476966, + "loss": 0.2124, + "step": 342640 + }, + { + "epoch": 14.19, + "grad_norm": 0.205078125, + "learning_rate": 0.000271171545079364, + "loss": 0.1821, + "step": 342650 + }, + { + "epoch": 14.19, + "grad_norm": 1.3203125, + "learning_rate": 0.00027116073882411566, + "loss": 0.2003, + "step": 342660 + }, + { + "epoch": 14.19, + "grad_norm": 1.8359375, + "learning_rate": 0.0002711499325290453, + "loss": 0.1977, + "step": 342670 + }, + { + "epoch": 14.19, + "grad_norm": 0.99609375, + "learning_rate": 0.0002711391261941729, + "loss": 0.1478, + "step": 342680 + }, + { + "epoch": 14.19, + "grad_norm": 0.578125, + "learning_rate": 0.00027112831981951894, + "loss": 0.2508, + "step": 342690 + }, + { + "epoch": 14.19, + "grad_norm": 1.3125, + "learning_rate": 0.0002711175134051039, + "loss": 0.1597, + "step": 342700 + }, + { + "epoch": 14.2, + "grad_norm": 0.5625, + "learning_rate": 0.00027110670695094783, + "loss": 0.2148, + "step": 342710 + }, + { + "epoch": 14.2, + "grad_norm": 0.70703125, + "learning_rate": 0.00027109590045707127, + "loss": 0.1533, + "step": 342720 + }, + { + "epoch": 14.2, + "grad_norm": 0.8515625, + "learning_rate": 0.0002710850939234945, + "loss": 0.2421, + "step": 342730 + }, + { + "epoch": 14.2, + "grad_norm": 0.5625, + "learning_rate": 0.00027107428735023784, + "loss": 0.1763, + "step": 342740 + }, + { + "epoch": 14.2, + "grad_norm": 1.5234375, + "learning_rate": 0.00027106348073732174, + "loss": 0.2087, + "step": 342750 + }, + { + "epoch": 14.2, + "grad_norm": 0.50390625, + "learning_rate": 0.00027105267408476634, + "loss": 0.2198, + "step": 342760 + }, + { + "epoch": 14.2, + "grad_norm": 1.9140625, + "learning_rate": 0.0002710418673925922, + "loss": 0.2045, + "step": 342770 + }, + { + "epoch": 14.2, + "grad_norm": 0.46484375, + "learning_rate": 0.00027103106066081944, + "loss": 0.1384, + "step": 342780 + }, + { + "epoch": 14.2, + "grad_norm": 0.8984375, + "learning_rate": 0.00027102025388946844, + "loss": 0.2002, + "step": 342790 + }, + { + "epoch": 14.2, + "grad_norm": 1.0625, + "learning_rate": 0.0002710094470785597, + "loss": 0.1957, + "step": 342800 + }, + { + "epoch": 14.2, + "grad_norm": 1.3828125, + "learning_rate": 0.0002709986402281134, + "loss": 0.1816, + "step": 342810 + }, + { + "epoch": 14.2, + "grad_norm": 1.5546875, + "learning_rate": 0.00027098783333815, + "loss": 0.1774, + "step": 342820 + }, + { + "epoch": 14.2, + "grad_norm": 0.79296875, + "learning_rate": 0.00027097702640868973, + "loss": 0.1982, + "step": 342830 + }, + { + "epoch": 14.2, + "grad_norm": 0.64453125, + "learning_rate": 0.0002709662194397529, + "loss": 0.2015, + "step": 342840 + }, + { + "epoch": 14.2, + "grad_norm": 0.302734375, + "learning_rate": 0.00027095541243136, + "loss": 0.1965, + "step": 342850 + }, + { + "epoch": 14.2, + "grad_norm": 0.2431640625, + "learning_rate": 0.00027094460538353124, + "loss": 0.175, + "step": 342860 + }, + { + "epoch": 14.2, + "grad_norm": 0.7421875, + "learning_rate": 0.0002709337982962869, + "loss": 0.1657, + "step": 342870 + }, + { + "epoch": 14.2, + "grad_norm": 0.341796875, + "learning_rate": 0.00027092299116964754, + "loss": 0.2183, + "step": 342880 + }, + { + "epoch": 14.2, + "grad_norm": 1.0078125, + "learning_rate": 0.0002709121840036333, + "loss": 0.186, + "step": 342890 + }, + { + "epoch": 14.2, + "grad_norm": 0.83984375, + "learning_rate": 0.0002709013767982646, + "loss": 0.1519, + "step": 342900 + }, + { + "epoch": 14.2, + "grad_norm": 0.828125, + "learning_rate": 0.0002708905695535618, + "loss": 0.2182, + "step": 342910 + }, + { + "epoch": 14.2, + "grad_norm": 0.890625, + "learning_rate": 0.00027087976226954514, + "loss": 0.1366, + "step": 342920 + }, + { + "epoch": 14.2, + "grad_norm": 0.78125, + "learning_rate": 0.0002708689549462351, + "loss": 0.1968, + "step": 342930 + }, + { + "epoch": 14.2, + "grad_norm": 0.75, + "learning_rate": 0.00027085814758365186, + "loss": 0.1741, + "step": 342940 + }, + { + "epoch": 14.2, + "grad_norm": 1.25, + "learning_rate": 0.0002708473401818159, + "loss": 0.2028, + "step": 342950 + }, + { + "epoch": 14.21, + "grad_norm": 0.55859375, + "learning_rate": 0.0002708365327407475, + "loss": 0.2053, + "step": 342960 + }, + { + "epoch": 14.21, + "grad_norm": 0.63671875, + "learning_rate": 0.00027082572526046695, + "loss": 0.2157, + "step": 342970 + }, + { + "epoch": 14.21, + "grad_norm": 1.0859375, + "learning_rate": 0.00027081491774099466, + "loss": 0.1865, + "step": 342980 + }, + { + "epoch": 14.21, + "grad_norm": 0.80859375, + "learning_rate": 0.00027080411018235096, + "loss": 0.1805, + "step": 342990 + }, + { + "epoch": 14.21, + "grad_norm": 0.89453125, + "learning_rate": 0.00027079330258455607, + "loss": 0.2106, + "step": 343000 + }, + { + "epoch": 14.21, + "grad_norm": 1.1640625, + "learning_rate": 0.0002707824949476306, + "loss": 0.196, + "step": 343010 + }, + { + "epoch": 14.21, + "grad_norm": 0.640625, + "learning_rate": 0.0002707716872715946, + "loss": 0.2095, + "step": 343020 + }, + { + "epoch": 14.21, + "grad_norm": 0.7109375, + "learning_rate": 0.00027076087955646855, + "loss": 0.1891, + "step": 343030 + }, + { + "epoch": 14.21, + "grad_norm": 1.265625, + "learning_rate": 0.0002707500718022728, + "loss": 0.1942, + "step": 343040 + }, + { + "epoch": 14.21, + "grad_norm": 0.66796875, + "learning_rate": 0.0002707392640090277, + "loss": 0.1818, + "step": 343050 + }, + { + "epoch": 14.21, + "grad_norm": 0.6796875, + "learning_rate": 0.0002707284561767534, + "loss": 0.228, + "step": 343060 + }, + { + "epoch": 14.21, + "grad_norm": 1.1484375, + "learning_rate": 0.0002707176483054705, + "loss": 0.1884, + "step": 343070 + }, + { + "epoch": 14.21, + "grad_norm": 0.98828125, + "learning_rate": 0.00027070684039519913, + "loss": 0.1899, + "step": 343080 + }, + { + "epoch": 14.21, + "grad_norm": 1.015625, + "learning_rate": 0.00027069603244595983, + "loss": 0.2237, + "step": 343090 + }, + { + "epoch": 14.21, + "grad_norm": 0.95703125, + "learning_rate": 0.00027068522445777274, + "loss": 0.2029, + "step": 343100 + }, + { + "epoch": 14.21, + "grad_norm": 1.3046875, + "learning_rate": 0.00027067441643065834, + "loss": 0.1553, + "step": 343110 + }, + { + "epoch": 14.21, + "grad_norm": 1.5703125, + "learning_rate": 0.0002706636083646369, + "loss": 0.2493, + "step": 343120 + }, + { + "epoch": 14.21, + "grad_norm": 0.33984375, + "learning_rate": 0.0002706528002597288, + "loss": 0.2032, + "step": 343130 + }, + { + "epoch": 14.21, + "grad_norm": 1.171875, + "learning_rate": 0.0002706419921159543, + "loss": 0.223, + "step": 343140 + }, + { + "epoch": 14.21, + "grad_norm": 0.8046875, + "learning_rate": 0.0002706311839333339, + "loss": 0.1784, + "step": 343150 + }, + { + "epoch": 14.21, + "grad_norm": 0.93359375, + "learning_rate": 0.00027062037571188777, + "loss": 0.1943, + "step": 343160 + }, + { + "epoch": 14.21, + "grad_norm": 0.6171875, + "learning_rate": 0.00027060956745163635, + "loss": 0.2133, + "step": 343170 + }, + { + "epoch": 14.21, + "grad_norm": 0.87890625, + "learning_rate": 0.0002705987591526, + "loss": 0.1993, + "step": 343180 + }, + { + "epoch": 14.21, + "grad_norm": 0.3984375, + "learning_rate": 0.0002705879508147989, + "loss": 0.2101, + "step": 343190 + }, + { + "epoch": 14.22, + "grad_norm": 0.482421875, + "learning_rate": 0.00027057714243825356, + "loss": 0.138, + "step": 343200 + }, + { + "epoch": 14.22, + "grad_norm": 0.69921875, + "learning_rate": 0.00027056633402298426, + "loss": 0.1758, + "step": 343210 + }, + { + "epoch": 14.22, + "grad_norm": 0.0004367828369140625, + "learning_rate": 0.0002705555255690113, + "loss": 0.1656, + "step": 343220 + }, + { + "epoch": 14.22, + "grad_norm": 0.88671875, + "learning_rate": 0.00027054471707635517, + "loss": 0.1949, + "step": 343230 + }, + { + "epoch": 14.22, + "grad_norm": 0.640625, + "learning_rate": 0.0002705339085450359, + "loss": 0.1734, + "step": 343240 + }, + { + "epoch": 14.22, + "grad_norm": 0.6953125, + "learning_rate": 0.00027052309997507417, + "loss": 0.1816, + "step": 343250 + }, + { + "epoch": 14.22, + "grad_norm": 0.640625, + "learning_rate": 0.00027051229136649017, + "loss": 0.1544, + "step": 343260 + }, + { + "epoch": 14.22, + "grad_norm": 0.9453125, + "learning_rate": 0.00027050148271930423, + "loss": 0.224, + "step": 343270 + }, + { + "epoch": 14.22, + "grad_norm": 1.53125, + "learning_rate": 0.0002704906740335368, + "loss": 0.1553, + "step": 343280 + }, + { + "epoch": 14.22, + "grad_norm": 0.96484375, + "learning_rate": 0.00027047986530920803, + "loss": 0.1769, + "step": 343290 + }, + { + "epoch": 14.22, + "grad_norm": 1.3828125, + "learning_rate": 0.00027046905654633843, + "loss": 0.2356, + "step": 343300 + }, + { + "epoch": 14.22, + "grad_norm": 0.28125, + "learning_rate": 0.00027045824774494826, + "loss": 0.1714, + "step": 343310 + }, + { + "epoch": 14.22, + "grad_norm": 0.6328125, + "learning_rate": 0.0002704474389050578, + "loss": 0.2367, + "step": 343320 + }, + { + "epoch": 14.22, + "grad_norm": 0.76953125, + "learning_rate": 0.00027043663002668754, + "loss": 0.2052, + "step": 343330 + }, + { + "epoch": 14.22, + "grad_norm": 0.72265625, + "learning_rate": 0.00027042582110985776, + "loss": 0.1719, + "step": 343340 + }, + { + "epoch": 14.22, + "grad_norm": 0.37890625, + "learning_rate": 0.00027041501215458873, + "loss": 0.1562, + "step": 343350 + }, + { + "epoch": 14.22, + "grad_norm": 0.470703125, + "learning_rate": 0.000270404203160901, + "loss": 0.192, + "step": 343360 + }, + { + "epoch": 14.22, + "grad_norm": 0.9609375, + "learning_rate": 0.0002703933941288146, + "loss": 0.1998, + "step": 343370 + }, + { + "epoch": 14.22, + "grad_norm": 0.69921875, + "learning_rate": 0.00027038258505835013, + "loss": 0.2152, + "step": 343380 + }, + { + "epoch": 14.22, + "grad_norm": 0.80078125, + "learning_rate": 0.00027037177594952777, + "loss": 0.1974, + "step": 343390 + }, + { + "epoch": 14.22, + "grad_norm": 0.37109375, + "learning_rate": 0.00027036096680236796, + "loss": 0.2033, + "step": 343400 + }, + { + "epoch": 14.22, + "grad_norm": 0.6328125, + "learning_rate": 0.00027035015761689104, + "loss": 0.1692, + "step": 343410 + }, + { + "epoch": 14.22, + "grad_norm": 0.423828125, + "learning_rate": 0.0002703393483931172, + "loss": 0.1955, + "step": 343420 + }, + { + "epoch": 14.22, + "grad_norm": 0.68359375, + "learning_rate": 0.000270328539131067, + "loss": 0.2026, + "step": 343430 + }, + { + "epoch": 14.23, + "grad_norm": 1.015625, + "learning_rate": 0.0002703177298307608, + "loss": 0.2011, + "step": 343440 + }, + { + "epoch": 14.23, + "grad_norm": 0.90625, + "learning_rate": 0.00027030692049221863, + "loss": 0.1631, + "step": 343450 + }, + { + "epoch": 14.23, + "grad_norm": 0.68359375, + "learning_rate": 0.000270296111115461, + "loss": 0.1841, + "step": 343460 + }, + { + "epoch": 14.23, + "grad_norm": 0.6953125, + "learning_rate": 0.0002702853017005085, + "loss": 0.177, + "step": 343470 + }, + { + "epoch": 14.23, + "grad_norm": 0.92578125, + "learning_rate": 0.000270274492247381, + "loss": 0.1824, + "step": 343480 + }, + { + "epoch": 14.23, + "grad_norm": 1.5390625, + "learning_rate": 0.0002702636827560993, + "loss": 0.1505, + "step": 343490 + }, + { + "epoch": 14.23, + "grad_norm": 0.6328125, + "learning_rate": 0.00027025287322668347, + "loss": 0.1906, + "step": 343500 + }, + { + "epoch": 14.23, + "grad_norm": 0.89453125, + "learning_rate": 0.00027024206365915387, + "loss": 0.1723, + "step": 343510 + }, + { + "epoch": 14.23, + "grad_norm": 0.494140625, + "learning_rate": 0.000270231254053531, + "loss": 0.1688, + "step": 343520 + }, + { + "epoch": 14.23, + "grad_norm": 0.310546875, + "learning_rate": 0.00027022044440983494, + "loss": 0.1941, + "step": 343530 + }, + { + "epoch": 14.23, + "grad_norm": 0.8828125, + "learning_rate": 0.0002702096347280864, + "loss": 0.178, + "step": 343540 + }, + { + "epoch": 14.23, + "grad_norm": 0.0, + "learning_rate": 0.00027019882500830536, + "loss": 0.1958, + "step": 343550 + }, + { + "epoch": 14.23, + "grad_norm": 0.82421875, + "learning_rate": 0.00027018801525051226, + "loss": 0.1943, + "step": 343560 + }, + { + "epoch": 14.23, + "grad_norm": 2.4375, + "learning_rate": 0.00027017720545472764, + "loss": 0.1671, + "step": 343570 + }, + { + "epoch": 14.23, + "grad_norm": 1.515625, + "learning_rate": 0.0002701663956209716, + "loss": 0.2152, + "step": 343580 + }, + { + "epoch": 14.23, + "grad_norm": 1.65625, + "learning_rate": 0.00027015558574926463, + "loss": 0.1681, + "step": 343590 + }, + { + "epoch": 14.23, + "grad_norm": 1.1015625, + "learning_rate": 0.000270144775839627, + "loss": 0.2198, + "step": 343600 + }, + { + "epoch": 14.23, + "grad_norm": 0.439453125, + "learning_rate": 0.0002701339658920791, + "loss": 0.164, + "step": 343610 + }, + { + "epoch": 14.23, + "grad_norm": 0.51953125, + "learning_rate": 0.00027012315590664126, + "loss": 0.2582, + "step": 343620 + }, + { + "epoch": 14.23, + "grad_norm": 2.28125, + "learning_rate": 0.0002701123458833338, + "loss": 0.1954, + "step": 343630 + }, + { + "epoch": 14.23, + "grad_norm": 0.8828125, + "learning_rate": 0.000270101535822177, + "loss": 0.1729, + "step": 343640 + }, + { + "epoch": 14.23, + "grad_norm": 1.8984375, + "learning_rate": 0.00027009072572319135, + "loss": 0.22, + "step": 343650 + }, + { + "epoch": 14.23, + "grad_norm": 0.84375, + "learning_rate": 0.00027007991558639717, + "loss": 0.2002, + "step": 343660 + }, + { + "epoch": 14.23, + "grad_norm": 0.7578125, + "learning_rate": 0.0002700691054118147, + "loss": 0.1684, + "step": 343670 + }, + { + "epoch": 14.24, + "grad_norm": 2.359375, + "learning_rate": 0.0002700582951994644, + "loss": 0.1368, + "step": 343680 + }, + { + "epoch": 14.24, + "grad_norm": 0.9765625, + "learning_rate": 0.00027004748494936646, + "loss": 0.225, + "step": 343690 + }, + { + "epoch": 14.24, + "grad_norm": 2.15625, + "learning_rate": 0.00027003667466154134, + "loss": 0.2085, + "step": 343700 + }, + { + "epoch": 14.24, + "grad_norm": 0.77734375, + "learning_rate": 0.0002700258643360094, + "loss": 0.2265, + "step": 343710 + }, + { + "epoch": 14.24, + "grad_norm": 1.1484375, + "learning_rate": 0.00027001505397279094, + "loss": 0.2484, + "step": 343720 + }, + { + "epoch": 14.24, + "grad_norm": 2.015625, + "learning_rate": 0.00027000424357190636, + "loss": 0.1893, + "step": 343730 + }, + { + "epoch": 14.24, + "grad_norm": 0.8203125, + "learning_rate": 0.00026999343313337585, + "loss": 0.1753, + "step": 343740 + }, + { + "epoch": 14.24, + "grad_norm": 0.70703125, + "learning_rate": 0.00026998262265721983, + "loss": 0.1819, + "step": 343750 + }, + { + "epoch": 14.24, + "grad_norm": 1.421875, + "learning_rate": 0.0002699718121434588, + "loss": 0.1479, + "step": 343760 + }, + { + "epoch": 14.24, + "grad_norm": 1.1875, + "learning_rate": 0.0002699610015921129, + "loss": 0.221, + "step": 343770 + }, + { + "epoch": 14.24, + "grad_norm": 0.59765625, + "learning_rate": 0.0002699501910032026, + "loss": 0.1973, + "step": 343780 + }, + { + "epoch": 14.24, + "grad_norm": 1.296875, + "learning_rate": 0.00026993938037674813, + "loss": 0.2125, + "step": 343790 + }, + { + "epoch": 14.24, + "grad_norm": 0.8828125, + "learning_rate": 0.00026992856971276997, + "loss": 0.1711, + "step": 343800 + }, + { + "epoch": 14.24, + "grad_norm": 0.44140625, + "learning_rate": 0.0002699177590112884, + "loss": 0.2289, + "step": 343810 + }, + { + "epoch": 14.24, + "grad_norm": 1.296875, + "learning_rate": 0.0002699069482723237, + "loss": 0.1856, + "step": 343820 + }, + { + "epoch": 14.24, + "grad_norm": 0.5390625, + "learning_rate": 0.0002698961374958963, + "loss": 0.1865, + "step": 343830 + }, + { + "epoch": 14.24, + "grad_norm": 0.515625, + "learning_rate": 0.00026988532668202646, + "loss": 0.212, + "step": 343840 + }, + { + "epoch": 14.24, + "grad_norm": 0.0002841949462890625, + "learning_rate": 0.00026987451583073465, + "loss": 0.1631, + "step": 343850 + }, + { + "epoch": 14.24, + "grad_norm": 0.890625, + "learning_rate": 0.00026986370494204117, + "loss": 0.1507, + "step": 343860 + }, + { + "epoch": 14.24, + "grad_norm": 0.64453125, + "learning_rate": 0.00026985289401596624, + "loss": 0.2025, + "step": 343870 + }, + { + "epoch": 14.24, + "grad_norm": 1.9921875, + "learning_rate": 0.0002698420830525304, + "loss": 0.1652, + "step": 343880 + }, + { + "epoch": 14.24, + "grad_norm": 0.37109375, + "learning_rate": 0.00026983127205175384, + "loss": 0.1967, + "step": 343890 + }, + { + "epoch": 14.24, + "grad_norm": 1.0390625, + "learning_rate": 0.00026982046101365703, + "loss": 0.2124, + "step": 343900 + }, + { + "epoch": 14.24, + "grad_norm": 0.98828125, + "learning_rate": 0.0002698096499382602, + "loss": 0.1882, + "step": 343910 + }, + { + "epoch": 14.25, + "grad_norm": 1.234375, + "learning_rate": 0.0002697988388255838, + "loss": 0.1575, + "step": 343920 + }, + { + "epoch": 14.25, + "grad_norm": 1.875, + "learning_rate": 0.00026978802767564804, + "loss": 0.2208, + "step": 343930 + }, + { + "epoch": 14.25, + "grad_norm": 0.984375, + "learning_rate": 0.00026977721648847343, + "loss": 0.146, + "step": 343940 + }, + { + "epoch": 14.25, + "grad_norm": 0.47265625, + "learning_rate": 0.0002697664052640802, + "loss": 0.2087, + "step": 343950 + }, + { + "epoch": 14.25, + "grad_norm": 0.82421875, + "learning_rate": 0.00026975559400248874, + "loss": 0.2085, + "step": 343960 + }, + { + "epoch": 14.25, + "grad_norm": 0.90234375, + "learning_rate": 0.00026974478270371937, + "loss": 0.1811, + "step": 343970 + }, + { + "epoch": 14.25, + "grad_norm": 1.234375, + "learning_rate": 0.0002697339713677925, + "loss": 0.1782, + "step": 343980 + }, + { + "epoch": 14.25, + "grad_norm": 0.8984375, + "learning_rate": 0.00026972315999472833, + "loss": 0.2039, + "step": 343990 + }, + { + "epoch": 14.25, + "grad_norm": 0.59375, + "learning_rate": 0.0002697123485845474, + "loss": 0.1803, + "step": 344000 + }, + { + "epoch": 14.25, + "grad_norm": 0.95703125, + "learning_rate": 0.0002697015371372699, + "loss": 0.1694, + "step": 344010 + }, + { + "epoch": 14.25, + "grad_norm": 0.90234375, + "learning_rate": 0.0002696907256529163, + "loss": 0.1824, + "step": 344020 + }, + { + "epoch": 14.25, + "grad_norm": 0.8515625, + "learning_rate": 0.0002696799141315068, + "loss": 0.1885, + "step": 344030 + }, + { + "epoch": 14.25, + "grad_norm": 0.859375, + "learning_rate": 0.00026966910257306186, + "loss": 0.2566, + "step": 344040 + }, + { + "epoch": 14.25, + "grad_norm": 0.828125, + "learning_rate": 0.0002696582909776018, + "loss": 0.1791, + "step": 344050 + }, + { + "epoch": 14.25, + "grad_norm": 3.046875, + "learning_rate": 0.000269647479345147, + "loss": 0.1713, + "step": 344060 + }, + { + "epoch": 14.25, + "grad_norm": 0.99609375, + "learning_rate": 0.0002696366676757177, + "loss": 0.205, + "step": 344070 + }, + { + "epoch": 14.25, + "grad_norm": 0.78125, + "learning_rate": 0.0002696258559693343, + "loss": 0.2095, + "step": 344080 + }, + { + "epoch": 14.25, + "grad_norm": 0.484375, + "learning_rate": 0.0002696150442260172, + "loss": 0.1805, + "step": 344090 + }, + { + "epoch": 14.25, + "grad_norm": 1.671875, + "learning_rate": 0.0002696042324457867, + "loss": 0.2184, + "step": 344100 + }, + { + "epoch": 14.25, + "grad_norm": 0.52734375, + "learning_rate": 0.00026959342062866313, + "loss": 0.2107, + "step": 344110 + }, + { + "epoch": 14.25, + "grad_norm": 0.83984375, + "learning_rate": 0.0002695826087746669, + "loss": 0.1338, + "step": 344120 + }, + { + "epoch": 14.25, + "grad_norm": 0.73828125, + "learning_rate": 0.0002695717968838183, + "loss": 0.199, + "step": 344130 + }, + { + "epoch": 14.25, + "grad_norm": 0.490234375, + "learning_rate": 0.0002695609849561377, + "loss": 0.2029, + "step": 344140 + }, + { + "epoch": 14.25, + "grad_norm": 0.69921875, + "learning_rate": 0.0002695501729916454, + "loss": 0.1871, + "step": 344150 + }, + { + "epoch": 14.26, + "grad_norm": 0.3828125, + "learning_rate": 0.0002695393609903618, + "loss": 0.1858, + "step": 344160 + }, + { + "epoch": 14.26, + "grad_norm": 0.84375, + "learning_rate": 0.0002695285489523073, + "loss": 0.1769, + "step": 344170 + }, + { + "epoch": 14.26, + "grad_norm": 0.18359375, + "learning_rate": 0.0002695177368775021, + "loss": 0.2156, + "step": 344180 + }, + { + "epoch": 14.26, + "grad_norm": 1.5390625, + "learning_rate": 0.0002695069247659667, + "loss": 0.2173, + "step": 344190 + }, + { + "epoch": 14.26, + "grad_norm": 0.73828125, + "learning_rate": 0.0002694961126177213, + "loss": 0.1885, + "step": 344200 + }, + { + "epoch": 14.26, + "grad_norm": 0.9453125, + "learning_rate": 0.0002694853004327863, + "loss": 0.2053, + "step": 344210 + }, + { + "epoch": 14.26, + "grad_norm": 0.50390625, + "learning_rate": 0.00026947448821118216, + "loss": 0.1992, + "step": 344220 + }, + { + "epoch": 14.26, + "grad_norm": 1.0078125, + "learning_rate": 0.00026946367595292905, + "loss": 0.1747, + "step": 344230 + }, + { + "epoch": 14.26, + "grad_norm": 0.435546875, + "learning_rate": 0.0002694528636580474, + "loss": 0.1564, + "step": 344240 + }, + { + "epoch": 14.26, + "grad_norm": 0.7109375, + "learning_rate": 0.00026944205132655764, + "loss": 0.2018, + "step": 344250 + }, + { + "epoch": 14.26, + "grad_norm": 0.296875, + "learning_rate": 0.00026943123895848, + "loss": 0.1783, + "step": 344260 + }, + { + "epoch": 14.26, + "grad_norm": 1.1171875, + "learning_rate": 0.0002694204265538349, + "loss": 0.165, + "step": 344270 + }, + { + "epoch": 14.26, + "grad_norm": 0.98828125, + "learning_rate": 0.0002694096141126426, + "loss": 0.2029, + "step": 344280 + }, + { + "epoch": 14.26, + "grad_norm": 1.390625, + "learning_rate": 0.00026939880163492355, + "loss": 0.1816, + "step": 344290 + }, + { + "epoch": 14.26, + "grad_norm": 0.77734375, + "learning_rate": 0.000269387989120698, + "loss": 0.2089, + "step": 344300 + }, + { + "epoch": 14.26, + "grad_norm": 0.53515625, + "learning_rate": 0.00026937717656998634, + "loss": 0.1344, + "step": 344310 + }, + { + "epoch": 14.26, + "grad_norm": 0.0, + "learning_rate": 0.000269366363982809, + "loss": 0.1505, + "step": 344320 + }, + { + "epoch": 14.26, + "grad_norm": 0.66796875, + "learning_rate": 0.0002693555513591862, + "loss": 0.1845, + "step": 344330 + }, + { + "epoch": 14.26, + "grad_norm": 0.9765625, + "learning_rate": 0.00026934473869913835, + "loss": 0.1823, + "step": 344340 + }, + { + "epoch": 14.26, + "grad_norm": 2.34375, + "learning_rate": 0.00026933392600268584, + "loss": 0.2475, + "step": 344350 + }, + { + "epoch": 14.26, + "grad_norm": 0.7109375, + "learning_rate": 0.00026932311326984893, + "loss": 0.2632, + "step": 344360 + }, + { + "epoch": 14.26, + "grad_norm": 0.625, + "learning_rate": 0.000269312300500648, + "loss": 0.2212, + "step": 344370 + }, + { + "epoch": 14.26, + "grad_norm": 1.59375, + "learning_rate": 0.00026930148769510343, + "loss": 0.1874, + "step": 344380 + }, + { + "epoch": 14.26, + "grad_norm": 2.40625, + "learning_rate": 0.00026929067485323546, + "loss": 0.2023, + "step": 344390 + }, + { + "epoch": 14.27, + "grad_norm": 1.2109375, + "learning_rate": 0.0002692798619750646, + "loss": 0.1691, + "step": 344400 + }, + { + "epoch": 14.27, + "grad_norm": 0.80859375, + "learning_rate": 0.00026926904906061114, + "loss": 0.1666, + "step": 344410 + }, + { + "epoch": 14.27, + "grad_norm": 0.65625, + "learning_rate": 0.00026925823610989534, + "loss": 0.2346, + "step": 344420 + }, + { + "epoch": 14.27, + "grad_norm": 1.4375, + "learning_rate": 0.0002692474231229377, + "loss": 0.2312, + "step": 344430 + }, + { + "epoch": 14.27, + "grad_norm": 0.408203125, + "learning_rate": 0.00026923661009975843, + "loss": 0.2302, + "step": 344440 + }, + { + "epoch": 14.27, + "grad_norm": 0.5390625, + "learning_rate": 0.000269225797040378, + "loss": 0.193, + "step": 344450 + }, + { + "epoch": 14.27, + "grad_norm": 0.87109375, + "learning_rate": 0.0002692149839448166, + "loss": 0.1462, + "step": 344460 + }, + { + "epoch": 14.27, + "grad_norm": 0.703125, + "learning_rate": 0.0002692041708130947, + "loss": 0.1675, + "step": 344470 + }, + { + "epoch": 14.27, + "grad_norm": 0.443359375, + "learning_rate": 0.00026919335764523275, + "loss": 0.1512, + "step": 344480 + }, + { + "epoch": 14.27, + "grad_norm": 0.55078125, + "learning_rate": 0.0002691825444412508, + "loss": 0.1535, + "step": 344490 + }, + { + "epoch": 14.27, + "grad_norm": 1.125, + "learning_rate": 0.00026917173120116945, + "loss": 0.1906, + "step": 344500 + }, + { + "epoch": 14.27, + "grad_norm": 0.859375, + "learning_rate": 0.000269160917925009, + "loss": 0.1846, + "step": 344510 + }, + { + "epoch": 14.27, + "grad_norm": 0.80859375, + "learning_rate": 0.00026915010461278963, + "loss": 0.1814, + "step": 344520 + }, + { + "epoch": 14.27, + "grad_norm": 0.7890625, + "learning_rate": 0.000269139291264532, + "loss": 0.1619, + "step": 344530 + }, + { + "epoch": 14.27, + "grad_norm": 0.734375, + "learning_rate": 0.0002691284778802563, + "loss": 0.1982, + "step": 344540 + }, + { + "epoch": 14.27, + "grad_norm": 0.70703125, + "learning_rate": 0.0002691176644599828, + "loss": 0.1947, + "step": 344550 + }, + { + "epoch": 14.27, + "grad_norm": 1.0, + "learning_rate": 0.0002691068510037319, + "loss": 0.1551, + "step": 344560 + }, + { + "epoch": 14.27, + "grad_norm": 0.91796875, + "learning_rate": 0.000269096037511524, + "loss": 0.224, + "step": 344570 + }, + { + "epoch": 14.27, + "grad_norm": 0.73046875, + "learning_rate": 0.00026908522398337944, + "loss": 0.1668, + "step": 344580 + }, + { + "epoch": 14.27, + "grad_norm": 1.0625, + "learning_rate": 0.0002690744104193185, + "loss": 0.2027, + "step": 344590 + }, + { + "epoch": 14.27, + "grad_norm": 0.703125, + "learning_rate": 0.00026906359681936156, + "loss": 0.1992, + "step": 344600 + }, + { + "epoch": 14.27, + "grad_norm": 0.7421875, + "learning_rate": 0.00026905278318352903, + "loss": 0.2117, + "step": 344610 + }, + { + "epoch": 14.27, + "grad_norm": 0.50390625, + "learning_rate": 0.0002690419695118413, + "loss": 0.2048, + "step": 344620 + }, + { + "epoch": 14.27, + "grad_norm": 1.15625, + "learning_rate": 0.0002690311558043185, + "loss": 0.1926, + "step": 344630 + }, + { + "epoch": 14.27, + "grad_norm": 1.65625, + "learning_rate": 0.00026902034206098123, + "loss": 0.1836, + "step": 344640 + }, + { + "epoch": 14.28, + "grad_norm": 0.88671875, + "learning_rate": 0.00026900952828184965, + "loss": 0.2104, + "step": 344650 + }, + { + "epoch": 14.28, + "grad_norm": 1.625, + "learning_rate": 0.0002689987144669442, + "loss": 0.1949, + "step": 344660 + }, + { + "epoch": 14.28, + "grad_norm": 1.0859375, + "learning_rate": 0.0002689879006162853, + "loss": 0.1854, + "step": 344670 + }, + { + "epoch": 14.28, + "grad_norm": 0.3671875, + "learning_rate": 0.0002689770867298931, + "loss": 0.1892, + "step": 344680 + }, + { + "epoch": 14.28, + "grad_norm": 0.7578125, + "learning_rate": 0.00026896627280778817, + "loss": 0.2096, + "step": 344690 + }, + { + "epoch": 14.28, + "grad_norm": 1.1328125, + "learning_rate": 0.0002689554588499908, + "loss": 0.2545, + "step": 344700 + }, + { + "epoch": 14.28, + "grad_norm": 2.1875, + "learning_rate": 0.0002689446448565212, + "loss": 0.201, + "step": 344710 + }, + { + "epoch": 14.28, + "grad_norm": 0.73046875, + "learning_rate": 0.0002689338308273999, + "loss": 0.1894, + "step": 344720 + }, + { + "epoch": 14.28, + "grad_norm": 0.546875, + "learning_rate": 0.00026892301676264706, + "loss": 0.2247, + "step": 344730 + }, + { + "epoch": 14.28, + "grad_norm": 0.484375, + "learning_rate": 0.0002689122026622832, + "loss": 0.2013, + "step": 344740 + }, + { + "epoch": 14.28, + "grad_norm": 0.6484375, + "learning_rate": 0.00026890138852632867, + "loss": 0.231, + "step": 344750 + }, + { + "epoch": 14.28, + "grad_norm": 0.74609375, + "learning_rate": 0.00026889057435480366, + "loss": 0.2257, + "step": 344760 + }, + { + "epoch": 14.28, + "grad_norm": 0.482421875, + "learning_rate": 0.00026887976014772875, + "loss": 0.2425, + "step": 344770 + }, + { + "epoch": 14.28, + "grad_norm": 0.9140625, + "learning_rate": 0.00026886894590512415, + "loss": 0.1753, + "step": 344780 + }, + { + "epoch": 14.28, + "grad_norm": 1.3125, + "learning_rate": 0.00026885813162701015, + "loss": 0.1693, + "step": 344790 + }, + { + "epoch": 14.28, + "grad_norm": 1.5234375, + "learning_rate": 0.0002688473173134073, + "loss": 0.2082, + "step": 344800 + }, + { + "epoch": 14.28, + "grad_norm": 0.63671875, + "learning_rate": 0.00026883650296433576, + "loss": 0.2178, + "step": 344810 + }, + { + "epoch": 14.28, + "grad_norm": 1.1015625, + "learning_rate": 0.00026882568857981593, + "loss": 0.2269, + "step": 344820 + }, + { + "epoch": 14.28, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002688148741598683, + "loss": 0.1945, + "step": 344830 + }, + { + "epoch": 14.28, + "grad_norm": 0.375, + "learning_rate": 0.00026880405970451296, + "loss": 0.146, + "step": 344840 + }, + { + "epoch": 14.28, + "grad_norm": 0.41015625, + "learning_rate": 0.00026879324521377054, + "loss": 0.1869, + "step": 344850 + }, + { + "epoch": 14.28, + "grad_norm": 0.3671875, + "learning_rate": 0.00026878243068766117, + "loss": 0.1823, + "step": 344860 + }, + { + "epoch": 14.28, + "grad_norm": 0.2890625, + "learning_rate": 0.00026877161612620534, + "loss": 0.1659, + "step": 344870 + }, + { + "epoch": 14.28, + "grad_norm": 1.0625, + "learning_rate": 0.00026876080152942334, + "loss": 0.1399, + "step": 344880 + }, + { + "epoch": 14.29, + "grad_norm": 0.6796875, + "learning_rate": 0.00026874998689733554, + "loss": 0.1918, + "step": 344890 + }, + { + "epoch": 14.29, + "grad_norm": 0.984375, + "learning_rate": 0.0002687391722299623, + "loss": 0.2162, + "step": 344900 + }, + { + "epoch": 14.29, + "grad_norm": 0.95703125, + "learning_rate": 0.000268728357527324, + "loss": 0.1599, + "step": 344910 + }, + { + "epoch": 14.29, + "grad_norm": 0.9609375, + "learning_rate": 0.0002687175427894408, + "loss": 0.1698, + "step": 344920 + }, + { + "epoch": 14.29, + "grad_norm": 1.1171875, + "learning_rate": 0.0002687067280163334, + "loss": 0.2247, + "step": 344930 + }, + { + "epoch": 14.29, + "grad_norm": 1.1796875, + "learning_rate": 0.00026869591320802184, + "loss": 0.157, + "step": 344940 + }, + { + "epoch": 14.29, + "grad_norm": 0.8671875, + "learning_rate": 0.0002686850983645266, + "loss": 0.194, + "step": 344950 + }, + { + "epoch": 14.29, + "grad_norm": 1.484375, + "learning_rate": 0.0002686742834858681, + "loss": 0.1967, + "step": 344960 + }, + { + "epoch": 14.29, + "grad_norm": 1.109375, + "learning_rate": 0.00026866346857206655, + "loss": 0.2034, + "step": 344970 + }, + { + "epoch": 14.29, + "grad_norm": 0.71875, + "learning_rate": 0.00026865265362314235, + "loss": 0.1876, + "step": 344980 + }, + { + "epoch": 14.29, + "grad_norm": 1.328125, + "learning_rate": 0.0002686418386391159, + "loss": 0.1797, + "step": 344990 + }, + { + "epoch": 14.29, + "grad_norm": 0.96875, + "learning_rate": 0.00026863102362000745, + "loss": 0.2006, + "step": 345000 + }, + { + "epoch": 14.29, + "grad_norm": 1.234375, + "learning_rate": 0.0002686202085658376, + "loss": 0.1716, + "step": 345010 + }, + { + "epoch": 14.29, + "grad_norm": 0.40234375, + "learning_rate": 0.0002686093934766264, + "loss": 0.1592, + "step": 345020 + }, + { + "epoch": 14.29, + "grad_norm": 0.8671875, + "learning_rate": 0.00026859857835239434, + "loss": 0.1923, + "step": 345030 + }, + { + "epoch": 14.29, + "grad_norm": 1.7890625, + "learning_rate": 0.0002685877631931618, + "loss": 0.1761, + "step": 345040 + }, + { + "epoch": 14.29, + "grad_norm": 0.75, + "learning_rate": 0.00026857694799894906, + "loss": 0.2104, + "step": 345050 + }, + { + "epoch": 14.29, + "grad_norm": 0.6953125, + "learning_rate": 0.0002685661327697765, + "loss": 0.1465, + "step": 345060 + }, + { + "epoch": 14.29, + "grad_norm": 0.91796875, + "learning_rate": 0.00026855531750566447, + "loss": 0.1541, + "step": 345070 + }, + { + "epoch": 14.29, + "grad_norm": 1.421875, + "learning_rate": 0.0002685445022066334, + "loss": 0.1998, + "step": 345080 + }, + { + "epoch": 14.29, + "grad_norm": 0.78125, + "learning_rate": 0.00026853368687270357, + "loss": 0.1977, + "step": 345090 + }, + { + "epoch": 14.29, + "grad_norm": 0.6953125, + "learning_rate": 0.0002685228715038953, + "loss": 0.2069, + "step": 345100 + }, + { + "epoch": 14.29, + "grad_norm": 0.91015625, + "learning_rate": 0.000268512056100229, + "loss": 0.1746, + "step": 345110 + }, + { + "epoch": 14.29, + "grad_norm": 0.59375, + "learning_rate": 0.0002685012406617251, + "loss": 0.1975, + "step": 345120 + }, + { + "epoch": 14.3, + "grad_norm": 1.4921875, + "learning_rate": 0.00026849042518840377, + "loss": 0.2094, + "step": 345130 + }, + { + "epoch": 14.3, + "grad_norm": 1.2734375, + "learning_rate": 0.00026847960968028544, + "loss": 0.2266, + "step": 345140 + }, + { + "epoch": 14.3, + "grad_norm": 0.76953125, + "learning_rate": 0.0002684687941373905, + "loss": 0.1868, + "step": 345150 + }, + { + "epoch": 14.3, + "grad_norm": 1.140625, + "learning_rate": 0.0002684579785597393, + "loss": 0.1921, + "step": 345160 + }, + { + "epoch": 14.3, + "grad_norm": 1.3984375, + "learning_rate": 0.0002684471629473522, + "loss": 0.2433, + "step": 345170 + }, + { + "epoch": 14.3, + "grad_norm": 0.53125, + "learning_rate": 0.00026843634730024953, + "loss": 0.1993, + "step": 345180 + }, + { + "epoch": 14.3, + "grad_norm": 0.6640625, + "learning_rate": 0.00026842553161845155, + "loss": 0.2312, + "step": 345190 + }, + { + "epoch": 14.3, + "grad_norm": 1.1015625, + "learning_rate": 0.00026841471590197883, + "loss": 0.2114, + "step": 345200 + }, + { + "epoch": 14.3, + "grad_norm": 0.58203125, + "learning_rate": 0.00026840390015085155, + "loss": 0.1717, + "step": 345210 + }, + { + "epoch": 14.3, + "grad_norm": 1.046875, + "learning_rate": 0.0002683930843650901, + "loss": 0.1439, + "step": 345220 + }, + { + "epoch": 14.3, + "grad_norm": 0.55859375, + "learning_rate": 0.00026838226854471485, + "loss": 0.1704, + "step": 345230 + }, + { + "epoch": 14.3, + "grad_norm": 0.59765625, + "learning_rate": 0.00026837145268974614, + "loss": 0.17, + "step": 345240 + }, + { + "epoch": 14.3, + "grad_norm": 0.55078125, + "learning_rate": 0.00026836063680020446, + "loss": 0.2093, + "step": 345250 + }, + { + "epoch": 14.3, + "grad_norm": 1.390625, + "learning_rate": 0.00026834982087610995, + "loss": 0.2209, + "step": 345260 + }, + { + "epoch": 14.3, + "grad_norm": 0.625, + "learning_rate": 0.00026833900491748307, + "loss": 0.1475, + "step": 345270 + }, + { + "epoch": 14.3, + "grad_norm": 0.376953125, + "learning_rate": 0.00026832818892434413, + "loss": 0.1956, + "step": 345280 + }, + { + "epoch": 14.3, + "grad_norm": 1.40625, + "learning_rate": 0.0002683173728967136, + "loss": 0.2227, + "step": 345290 + }, + { + "epoch": 14.3, + "grad_norm": 0.69921875, + "learning_rate": 0.0002683065568346117, + "loss": 0.2221, + "step": 345300 + }, + { + "epoch": 14.3, + "grad_norm": 0.9453125, + "learning_rate": 0.00026829574073805883, + "loss": 0.2266, + "step": 345310 + }, + { + "epoch": 14.3, + "grad_norm": 1.046875, + "learning_rate": 0.00026828492460707535, + "loss": 0.1768, + "step": 345320 + }, + { + "epoch": 14.3, + "grad_norm": 0.478515625, + "learning_rate": 0.0002682741084416817, + "loss": 0.1777, + "step": 345330 + }, + { + "epoch": 14.3, + "grad_norm": 0.373046875, + "learning_rate": 0.00026826329224189803, + "loss": 0.2562, + "step": 345340 + }, + { + "epoch": 14.3, + "grad_norm": 0.57421875, + "learning_rate": 0.00026825247600774485, + "loss": 0.1865, + "step": 345350 + }, + { + "epoch": 14.3, + "grad_norm": 1.171875, + "learning_rate": 0.00026824165973924247, + "loss": 0.1589, + "step": 345360 + }, + { + "epoch": 14.31, + "grad_norm": 0.76953125, + "learning_rate": 0.0002682308434364113, + "loss": 0.1942, + "step": 345370 + }, + { + "epoch": 14.31, + "grad_norm": 0.8828125, + "learning_rate": 0.00026822002709927167, + "loss": 0.1933, + "step": 345380 + }, + { + "epoch": 14.31, + "grad_norm": 0.5078125, + "learning_rate": 0.00026820921072784386, + "loss": 0.151, + "step": 345390 + }, + { + "epoch": 14.31, + "grad_norm": 1.59375, + "learning_rate": 0.0002681983943221483, + "loss": 0.1798, + "step": 345400 + }, + { + "epoch": 14.31, + "grad_norm": 1.09375, + "learning_rate": 0.00026818757788220533, + "loss": 0.2091, + "step": 345410 + }, + { + "epoch": 14.31, + "grad_norm": 0.87109375, + "learning_rate": 0.0002681767614080353, + "loss": 0.1777, + "step": 345420 + }, + { + "epoch": 14.31, + "grad_norm": 0.69140625, + "learning_rate": 0.0002681659448996586, + "loss": 0.1778, + "step": 345430 + }, + { + "epoch": 14.31, + "grad_norm": 0.78515625, + "learning_rate": 0.00026815512835709547, + "loss": 0.239, + "step": 345440 + }, + { + "epoch": 14.31, + "grad_norm": 0.3671875, + "learning_rate": 0.0002681443117803664, + "loss": 0.1956, + "step": 345450 + }, + { + "epoch": 14.31, + "grad_norm": 1.9140625, + "learning_rate": 0.00026813349516949177, + "loss": 0.1729, + "step": 345460 + }, + { + "epoch": 14.31, + "grad_norm": 1.0234375, + "learning_rate": 0.0002681226785244917, + "loss": 0.1976, + "step": 345470 + }, + { + "epoch": 14.31, + "grad_norm": 0.7421875, + "learning_rate": 0.00026811186184538685, + "loss": 0.1903, + "step": 345480 + }, + { + "epoch": 14.31, + "grad_norm": 0.369140625, + "learning_rate": 0.0002681010451321974, + "loss": 0.187, + "step": 345490 + }, + { + "epoch": 14.31, + "grad_norm": 0.50390625, + "learning_rate": 0.00026809022838494373, + "loss": 0.1781, + "step": 345500 + }, + { + "epoch": 14.31, + "grad_norm": 0.87109375, + "learning_rate": 0.0002680794116036462, + "loss": 0.1761, + "step": 345510 + }, + { + "epoch": 14.31, + "grad_norm": 1.0078125, + "learning_rate": 0.00026806859478832514, + "loss": 0.1699, + "step": 345520 + }, + { + "epoch": 14.31, + "grad_norm": 0.365234375, + "learning_rate": 0.00026805777793900085, + "loss": 0.1985, + "step": 345530 + }, + { + "epoch": 14.31, + "grad_norm": 0.71484375, + "learning_rate": 0.000268046961055694, + "loss": 0.2264, + "step": 345540 + }, + { + "epoch": 14.31, + "grad_norm": 1.1640625, + "learning_rate": 0.00026803614413842454, + "loss": 0.214, + "step": 345550 + }, + { + "epoch": 14.31, + "grad_norm": 1.3828125, + "learning_rate": 0.00026802532718721306, + "loss": 0.2317, + "step": 345560 + }, + { + "epoch": 14.31, + "grad_norm": 0.93359375, + "learning_rate": 0.0002680145102020798, + "loss": 0.2395, + "step": 345570 + }, + { + "epoch": 14.31, + "grad_norm": 0.5234375, + "learning_rate": 0.0002680036931830453, + "loss": 0.1843, + "step": 345580 + }, + { + "epoch": 14.31, + "grad_norm": 0.7109375, + "learning_rate": 0.00026799287613012975, + "loss": 0.1987, + "step": 345590 + }, + { + "epoch": 14.31, + "grad_norm": 1.328125, + "learning_rate": 0.0002679820590433535, + "loss": 0.1832, + "step": 345600 + }, + { + "epoch": 14.32, + "grad_norm": 0.828125, + "learning_rate": 0.000267971241922737, + "loss": 0.1738, + "step": 345610 + }, + { + "epoch": 14.32, + "grad_norm": 1.8515625, + "learning_rate": 0.00026796042476830053, + "loss": 0.1899, + "step": 345620 + }, + { + "epoch": 14.32, + "grad_norm": 0.5703125, + "learning_rate": 0.0002679496075800645, + "loss": 0.2453, + "step": 345630 + }, + { + "epoch": 14.32, + "grad_norm": 0.6015625, + "learning_rate": 0.00026793879035804925, + "loss": 0.1792, + "step": 345640 + }, + { + "epoch": 14.32, + "grad_norm": 0.59765625, + "learning_rate": 0.00026792797310227514, + "loss": 0.1817, + "step": 345650 + }, + { + "epoch": 14.32, + "grad_norm": 0.7734375, + "learning_rate": 0.00026791715581276254, + "loss": 0.1791, + "step": 345660 + }, + { + "epoch": 14.32, + "grad_norm": 0.5625, + "learning_rate": 0.00026790633848953175, + "loss": 0.17, + "step": 345670 + }, + { + "epoch": 14.32, + "grad_norm": 0.482421875, + "learning_rate": 0.0002678955211326032, + "loss": 0.1991, + "step": 345680 + }, + { + "epoch": 14.32, + "grad_norm": 0.60546875, + "learning_rate": 0.00026788470374199716, + "loss": 0.1954, + "step": 345690 + }, + { + "epoch": 14.32, + "grad_norm": 0.71484375, + "learning_rate": 0.000267873886317734, + "loss": 0.1877, + "step": 345700 + }, + { + "epoch": 14.32, + "grad_norm": 0.455078125, + "learning_rate": 0.00026786306885983425, + "loss": 0.219, + "step": 345710 + }, + { + "epoch": 14.32, + "grad_norm": 0.494140625, + "learning_rate": 0.00026785225136831804, + "loss": 0.1748, + "step": 345720 + }, + { + "epoch": 14.32, + "grad_norm": 0.416015625, + "learning_rate": 0.0002678414338432059, + "loss": 0.1488, + "step": 345730 + }, + { + "epoch": 14.32, + "grad_norm": 0.6875, + "learning_rate": 0.00026783061628451804, + "loss": 0.1748, + "step": 345740 + }, + { + "epoch": 14.32, + "grad_norm": 0.314453125, + "learning_rate": 0.0002678197986922749, + "loss": 0.2275, + "step": 345750 + }, + { + "epoch": 14.32, + "grad_norm": 0.77734375, + "learning_rate": 0.00026780898106649683, + "loss": 0.1445, + "step": 345760 + }, + { + "epoch": 14.32, + "grad_norm": 0.69140625, + "learning_rate": 0.00026779816340720414, + "loss": 0.1861, + "step": 345770 + }, + { + "epoch": 14.32, + "grad_norm": 0.50390625, + "learning_rate": 0.0002677873457144173, + "loss": 0.2571, + "step": 345780 + }, + { + "epoch": 14.32, + "grad_norm": 1.140625, + "learning_rate": 0.00026777652798815657, + "loss": 0.2324, + "step": 345790 + }, + { + "epoch": 14.32, + "grad_norm": 0.53125, + "learning_rate": 0.00026776571022844224, + "loss": 0.1917, + "step": 345800 + }, + { + "epoch": 14.32, + "grad_norm": 0.88671875, + "learning_rate": 0.00026775489243529486, + "loss": 0.2144, + "step": 345810 + }, + { + "epoch": 14.32, + "grad_norm": 0.6640625, + "learning_rate": 0.0002677440746087347, + "loss": 0.1702, + "step": 345820 + }, + { + "epoch": 14.32, + "grad_norm": 1.0, + "learning_rate": 0.000267733256748782, + "loss": 0.1743, + "step": 345830 + }, + { + "epoch": 14.32, + "grad_norm": 1.4921875, + "learning_rate": 0.00026772243885545737, + "loss": 0.1953, + "step": 345840 + }, + { + "epoch": 14.33, + "grad_norm": 1.015625, + "learning_rate": 0.000267711620928781, + "loss": 0.1821, + "step": 345850 + }, + { + "epoch": 14.33, + "grad_norm": 3.40625, + "learning_rate": 0.0002677008029687732, + "loss": 0.1843, + "step": 345860 + }, + { + "epoch": 14.33, + "grad_norm": 1.609375, + "learning_rate": 0.0002676899849754545, + "loss": 0.2041, + "step": 345870 + }, + { + "epoch": 14.33, + "grad_norm": 0.70703125, + "learning_rate": 0.00026767916694884504, + "loss": 0.17, + "step": 345880 + }, + { + "epoch": 14.33, + "grad_norm": 0.59375, + "learning_rate": 0.00026766834888896534, + "loss": 0.1779, + "step": 345890 + }, + { + "epoch": 14.33, + "grad_norm": 1.171875, + "learning_rate": 0.00026765753079583575, + "loss": 0.2307, + "step": 345900 + }, + { + "epoch": 14.33, + "grad_norm": 0.64453125, + "learning_rate": 0.0002676467126694765, + "loss": 0.1604, + "step": 345910 + }, + { + "epoch": 14.33, + "grad_norm": 0.60546875, + "learning_rate": 0.00026763589450990816, + "loss": 0.1851, + "step": 345920 + }, + { + "epoch": 14.33, + "grad_norm": 1.28125, + "learning_rate": 0.00026762507631715096, + "loss": 0.2041, + "step": 345930 + }, + { + "epoch": 14.33, + "grad_norm": 0.86328125, + "learning_rate": 0.0002676142580912252, + "loss": 0.2044, + "step": 345940 + }, + { + "epoch": 14.33, + "grad_norm": 0.87890625, + "learning_rate": 0.0002676034398321514, + "loss": 0.1908, + "step": 345950 + }, + { + "epoch": 14.33, + "grad_norm": 0.67578125, + "learning_rate": 0.00026759262153994975, + "loss": 0.2145, + "step": 345960 + }, + { + "epoch": 14.33, + "grad_norm": 0.46875, + "learning_rate": 0.00026758180321464066, + "loss": 0.1806, + "step": 345970 + }, + { + "epoch": 14.33, + "grad_norm": 1.265625, + "learning_rate": 0.0002675709848562446, + "loss": 0.215, + "step": 345980 + }, + { + "epoch": 14.33, + "grad_norm": 1.078125, + "learning_rate": 0.0002675601664647817, + "loss": 0.2214, + "step": 345990 + }, + { + "epoch": 14.33, + "grad_norm": 0.55859375, + "learning_rate": 0.00026754934804027266, + "loss": 0.1904, + "step": 346000 + }, + { + "epoch": 14.33, + "grad_norm": 1.765625, + "learning_rate": 0.0002675385295827375, + "loss": 0.2351, + "step": 346010 + }, + { + "epoch": 14.33, + "grad_norm": 0.87109375, + "learning_rate": 0.0002675277110921968, + "loss": 0.1724, + "step": 346020 + }, + { + "epoch": 14.33, + "grad_norm": 0.5703125, + "learning_rate": 0.00026751689256867086, + "loss": 0.1703, + "step": 346030 + }, + { + "epoch": 14.33, + "grad_norm": 0.8359375, + "learning_rate": 0.00026750607401217983, + "loss": 0.1676, + "step": 346040 + }, + { + "epoch": 14.33, + "grad_norm": 1.09375, + "learning_rate": 0.0002674952554227445, + "loss": 0.1937, + "step": 346050 + }, + { + "epoch": 14.33, + "grad_norm": 0.6484375, + "learning_rate": 0.00026748443680038494, + "loss": 0.193, + "step": 346060 + }, + { + "epoch": 14.33, + "grad_norm": 0.7421875, + "learning_rate": 0.0002674736181451214, + "loss": 0.1921, + "step": 346070 + }, + { + "epoch": 14.33, + "grad_norm": 0.294921875, + "learning_rate": 0.0002674627994569746, + "loss": 0.2152, + "step": 346080 + }, + { + "epoch": 14.34, + "grad_norm": 1.078125, + "learning_rate": 0.00026745198073596454, + "loss": 0.1421, + "step": 346090 + }, + { + "epoch": 14.34, + "grad_norm": 0.96875, + "learning_rate": 0.0002674411619821118, + "loss": 0.1814, + "step": 346100 + }, + { + "epoch": 14.34, + "grad_norm": 1.3203125, + "learning_rate": 0.0002674303431954367, + "loss": 0.178, + "step": 346110 + }, + { + "epoch": 14.34, + "grad_norm": 0.75390625, + "learning_rate": 0.00026741952437595945, + "loss": 0.2156, + "step": 346120 + }, + { + "epoch": 14.34, + "grad_norm": 1.484375, + "learning_rate": 0.0002674087055237007, + "loss": 0.1374, + "step": 346130 + }, + { + "epoch": 14.34, + "grad_norm": 0.431640625, + "learning_rate": 0.00026739788663868066, + "loss": 0.1794, + "step": 346140 + }, + { + "epoch": 14.34, + "grad_norm": 0.58203125, + "learning_rate": 0.0002673870677209195, + "loss": 0.1813, + "step": 346150 + }, + { + "epoch": 14.34, + "grad_norm": 0.546875, + "learning_rate": 0.00026737624877043795, + "loss": 0.1692, + "step": 346160 + }, + { + "epoch": 14.34, + "grad_norm": 0.80078125, + "learning_rate": 0.000267365429787256, + "loss": 0.2095, + "step": 346170 + }, + { + "epoch": 14.34, + "grad_norm": 0.33984375, + "learning_rate": 0.00026735461077139426, + "loss": 0.1944, + "step": 346180 + }, + { + "epoch": 14.34, + "grad_norm": 0.734375, + "learning_rate": 0.0002673437917228731, + "loss": 0.1847, + "step": 346190 + }, + { + "epoch": 14.34, + "grad_norm": 0.76171875, + "learning_rate": 0.0002673329726417126, + "loss": 0.1941, + "step": 346200 + }, + { + "epoch": 14.34, + "grad_norm": 1.328125, + "learning_rate": 0.00026732215352793354, + "loss": 0.1223, + "step": 346210 + }, + { + "epoch": 14.34, + "grad_norm": 0.318359375, + "learning_rate": 0.000267311334381556, + "loss": 0.1813, + "step": 346220 + }, + { + "epoch": 14.34, + "grad_norm": 0.8203125, + "learning_rate": 0.00026730051520260025, + "loss": 0.1982, + "step": 346230 + }, + { + "epoch": 14.34, + "grad_norm": 1.046875, + "learning_rate": 0.00026728969599108696, + "loss": 0.1526, + "step": 346240 + }, + { + "epoch": 14.34, + "grad_norm": 0.62890625, + "learning_rate": 0.00026727887674703627, + "loss": 0.1468, + "step": 346250 + }, + { + "epoch": 14.34, + "grad_norm": 1.234375, + "learning_rate": 0.00026726805747046857, + "loss": 0.1991, + "step": 346260 + }, + { + "epoch": 14.34, + "grad_norm": 0.306640625, + "learning_rate": 0.00026725723816140435, + "loss": 0.2176, + "step": 346270 + }, + { + "epoch": 14.34, + "grad_norm": 0.6015625, + "learning_rate": 0.00026724641881986373, + "loss": 0.1903, + "step": 346280 + }, + { + "epoch": 14.34, + "grad_norm": 0.890625, + "learning_rate": 0.00026723559944586735, + "loss": 0.2284, + "step": 346290 + }, + { + "epoch": 14.34, + "grad_norm": 0.54296875, + "learning_rate": 0.0002672247800394354, + "loss": 0.2125, + "step": 346300 + }, + { + "epoch": 14.34, + "grad_norm": 1.453125, + "learning_rate": 0.00026721396060058815, + "loss": 0.2203, + "step": 346310 + }, + { + "epoch": 14.34, + "grad_norm": 1.1328125, + "learning_rate": 0.00026720314112934626, + "loss": 0.1678, + "step": 346320 + }, + { + "epoch": 14.34, + "grad_norm": 1.4609375, + "learning_rate": 0.0002671923216257298, + "loss": 0.2074, + "step": 346330 + }, + { + "epoch": 14.35, + "grad_norm": 0.703125, + "learning_rate": 0.00026718150208975927, + "loss": 0.1825, + "step": 346340 + }, + { + "epoch": 14.35, + "grad_norm": 0.796875, + "learning_rate": 0.00026717068252145505, + "loss": 0.1771, + "step": 346350 + }, + { + "epoch": 14.35, + "grad_norm": 0.51171875, + "learning_rate": 0.0002671598629208374, + "loss": 0.1992, + "step": 346360 + }, + { + "epoch": 14.35, + "grad_norm": 0.7109375, + "learning_rate": 0.0002671490432879268, + "loss": 0.1976, + "step": 346370 + }, + { + "epoch": 14.35, + "grad_norm": 0.72265625, + "learning_rate": 0.00026713822362274347, + "loss": 0.206, + "step": 346380 + }, + { + "epoch": 14.35, + "grad_norm": 0.419921875, + "learning_rate": 0.00026712740392530795, + "loss": 0.2122, + "step": 346390 + }, + { + "epoch": 14.35, + "grad_norm": 0.0, + "learning_rate": 0.00026711658419564045, + "loss": 0.2292, + "step": 346400 + }, + { + "epoch": 14.35, + "grad_norm": 1.484375, + "learning_rate": 0.00026710576443376137, + "loss": 0.1674, + "step": 346410 + }, + { + "epoch": 14.35, + "grad_norm": 0.55078125, + "learning_rate": 0.0002670949446396911, + "loss": 0.1626, + "step": 346420 + }, + { + "epoch": 14.35, + "grad_norm": 0.4453125, + "learning_rate": 0.0002670841248134501, + "loss": 0.2182, + "step": 346430 + }, + { + "epoch": 14.35, + "grad_norm": 2.671875, + "learning_rate": 0.00026707330495505843, + "loss": 0.1964, + "step": 346440 + }, + { + "epoch": 14.35, + "grad_norm": 1.34375, + "learning_rate": 0.00026706248506453686, + "loss": 0.2151, + "step": 346450 + }, + { + "epoch": 14.35, + "grad_norm": 1.03125, + "learning_rate": 0.0002670516651419054, + "loss": 0.1583, + "step": 346460 + }, + { + "epoch": 14.35, + "grad_norm": 0.3125, + "learning_rate": 0.0002670408451871845, + "loss": 0.1829, + "step": 346470 + }, + { + "epoch": 14.35, + "grad_norm": 0.90625, + "learning_rate": 0.0002670300252003947, + "loss": 0.1639, + "step": 346480 + }, + { + "epoch": 14.35, + "grad_norm": 0.859375, + "learning_rate": 0.00026701920518155617, + "loss": 0.1734, + "step": 346490 + }, + { + "epoch": 14.35, + "grad_norm": 0.3125, + "learning_rate": 0.00026700838513068933, + "loss": 0.1705, + "step": 346500 + }, + { + "epoch": 14.35, + "grad_norm": 1.859375, + "learning_rate": 0.0002669975650478146, + "loss": 0.1554, + "step": 346510 + }, + { + "epoch": 14.35, + "grad_norm": 0.75, + "learning_rate": 0.00026698674493295216, + "loss": 0.1967, + "step": 346520 + }, + { + "epoch": 14.35, + "grad_norm": 0.58203125, + "learning_rate": 0.0002669759247861226, + "loss": 0.1833, + "step": 346530 + }, + { + "epoch": 14.35, + "grad_norm": 0.71484375, + "learning_rate": 0.00026696510460734614, + "loss": 0.2116, + "step": 346540 + }, + { + "epoch": 14.35, + "grad_norm": 0.69140625, + "learning_rate": 0.0002669542843966432, + "loss": 0.2556, + "step": 346550 + }, + { + "epoch": 14.35, + "grad_norm": 1.3203125, + "learning_rate": 0.00026694346415403417, + "loss": 0.2352, + "step": 346560 + }, + { + "epoch": 14.35, + "grad_norm": 1.046875, + "learning_rate": 0.0002669326438795393, + "loss": 0.189, + "step": 346570 + }, + { + "epoch": 14.36, + "grad_norm": 0.88671875, + "learning_rate": 0.00026692182357317904, + "loss": 0.1796, + "step": 346580 + }, + { + "epoch": 14.36, + "grad_norm": 0.78125, + "learning_rate": 0.00026691100323497375, + "loss": 0.1821, + "step": 346590 + }, + { + "epoch": 14.36, + "grad_norm": 0.0, + "learning_rate": 0.00026690018286494376, + "loss": 0.2196, + "step": 346600 + }, + { + "epoch": 14.36, + "grad_norm": 0.494140625, + "learning_rate": 0.0002668893624631096, + "loss": 0.1831, + "step": 346610 + }, + { + "epoch": 14.36, + "grad_norm": 0.69921875, + "learning_rate": 0.0002668785420294913, + "loss": 0.1758, + "step": 346620 + }, + { + "epoch": 14.36, + "grad_norm": 0.486328125, + "learning_rate": 0.0002668677215641094, + "loss": 0.1248, + "step": 346630 + }, + { + "epoch": 14.36, + "grad_norm": 1.34375, + "learning_rate": 0.0002668569010669844, + "loss": 0.2072, + "step": 346640 + }, + { + "epoch": 14.36, + "grad_norm": 2.28125, + "learning_rate": 0.0002668460805381365, + "loss": 0.2099, + "step": 346650 + }, + { + "epoch": 14.36, + "grad_norm": 0.77734375, + "learning_rate": 0.000266835259977586, + "loss": 0.1854, + "step": 346660 + }, + { + "epoch": 14.36, + "grad_norm": 0.7578125, + "learning_rate": 0.0002668244393853534, + "loss": 0.1711, + "step": 346670 + }, + { + "epoch": 14.36, + "grad_norm": 0.4375, + "learning_rate": 0.00026681361876145903, + "loss": 0.2074, + "step": 346680 + }, + { + "epoch": 14.36, + "grad_norm": 1.5390625, + "learning_rate": 0.0002668027981059233, + "loss": 0.1596, + "step": 346690 + }, + { + "epoch": 14.36, + "grad_norm": 1.1640625, + "learning_rate": 0.0002667919774187665, + "loss": 0.163, + "step": 346700 + }, + { + "epoch": 14.36, + "grad_norm": 0.6953125, + "learning_rate": 0.000266781156700009, + "loss": 0.1627, + "step": 346710 + }, + { + "epoch": 14.36, + "grad_norm": 1.203125, + "learning_rate": 0.0002667703359496711, + "loss": 0.2129, + "step": 346720 + }, + { + "epoch": 14.36, + "grad_norm": 0.6640625, + "learning_rate": 0.00026675951516777333, + "loss": 0.1771, + "step": 346730 + }, + { + "epoch": 14.36, + "grad_norm": 1.4921875, + "learning_rate": 0.0002667486943543359, + "loss": 0.1262, + "step": 346740 + }, + { + "epoch": 14.36, + "grad_norm": 4.375, + "learning_rate": 0.00026673787350937927, + "loss": 0.2049, + "step": 346750 + }, + { + "epoch": 14.36, + "grad_norm": 0.3359375, + "learning_rate": 0.00026672705263292376, + "loss": 0.2323, + "step": 346760 + }, + { + "epoch": 14.36, + "grad_norm": 0.59765625, + "learning_rate": 0.0002667162317249898, + "loss": 0.2004, + "step": 346770 + }, + { + "epoch": 14.36, + "grad_norm": 1.265625, + "learning_rate": 0.00026670541078559765, + "loss": 0.1955, + "step": 346780 + }, + { + "epoch": 14.36, + "grad_norm": 1.4140625, + "learning_rate": 0.0002666945898147677, + "loss": 0.2349, + "step": 346790 + }, + { + "epoch": 14.36, + "grad_norm": 0.69140625, + "learning_rate": 0.0002666837688125203, + "loss": 0.1731, + "step": 346800 + }, + { + "epoch": 14.36, + "grad_norm": 0.69140625, + "learning_rate": 0.00026667294777887597, + "loss": 0.2077, + "step": 346810 + }, + { + "epoch": 14.37, + "grad_norm": 1.4296875, + "learning_rate": 0.0002666621267138549, + "loss": 0.2033, + "step": 346820 + }, + { + "epoch": 14.37, + "grad_norm": 0.9609375, + "learning_rate": 0.00026665130561747747, + "loss": 0.1877, + "step": 346830 + }, + { + "epoch": 14.37, + "grad_norm": 1.1328125, + "learning_rate": 0.00026664048448976407, + "loss": 0.2028, + "step": 346840 + }, + { + "epoch": 14.37, + "grad_norm": 2.765625, + "learning_rate": 0.0002666296633307352, + "loss": 0.2105, + "step": 346850 + }, + { + "epoch": 14.37, + "grad_norm": 0.671875, + "learning_rate": 0.00026661884214041094, + "loss": 0.1946, + "step": 346860 + }, + { + "epoch": 14.37, + "grad_norm": 0.90625, + "learning_rate": 0.00026660802091881187, + "loss": 0.1868, + "step": 346870 + }, + { + "epoch": 14.37, + "grad_norm": 0.60546875, + "learning_rate": 0.0002665971996659583, + "loss": 0.1075, + "step": 346880 + }, + { + "epoch": 14.37, + "grad_norm": 0.87890625, + "learning_rate": 0.0002665863783818707, + "loss": 0.1867, + "step": 346890 + }, + { + "epoch": 14.37, + "grad_norm": 0.54296875, + "learning_rate": 0.00026657555706656916, + "loss": 0.1577, + "step": 346900 + }, + { + "epoch": 14.37, + "grad_norm": 1.15625, + "learning_rate": 0.0002665647357200743, + "loss": 0.1719, + "step": 346910 + }, + { + "epoch": 14.37, + "grad_norm": 0.8203125, + "learning_rate": 0.0002665539143424063, + "loss": 0.247, + "step": 346920 + }, + { + "epoch": 14.37, + "grad_norm": 1.625, + "learning_rate": 0.00026654309293358574, + "loss": 0.2505, + "step": 346930 + }, + { + "epoch": 14.37, + "grad_norm": 0.67578125, + "learning_rate": 0.00026653227149363285, + "loss": 0.1433, + "step": 346940 + }, + { + "epoch": 14.37, + "grad_norm": 1.46875, + "learning_rate": 0.000266521450022568, + "loss": 0.1967, + "step": 346950 + }, + { + "epoch": 14.37, + "grad_norm": 0.435546875, + "learning_rate": 0.0002665106285204115, + "loss": 0.191, + "step": 346960 + }, + { + "epoch": 14.37, + "grad_norm": 1.1484375, + "learning_rate": 0.0002664998069871839, + "loss": 0.2152, + "step": 346970 + }, + { + "epoch": 14.37, + "grad_norm": 0.50390625, + "learning_rate": 0.00026648898542290533, + "loss": 0.2103, + "step": 346980 + }, + { + "epoch": 14.37, + "grad_norm": 2.484375, + "learning_rate": 0.0002664781638275963, + "loss": 0.1854, + "step": 346990 + }, + { + "epoch": 14.37, + "grad_norm": 1.6796875, + "learning_rate": 0.0002664673422012772, + "loss": 0.1971, + "step": 347000 + }, + { + "epoch": 14.37, + "grad_norm": 0.67578125, + "learning_rate": 0.00026645652054396827, + "loss": 0.1433, + "step": 347010 + }, + { + "epoch": 14.37, + "grad_norm": 1.40625, + "learning_rate": 0.00026644569885569, + "loss": 0.1998, + "step": 347020 + }, + { + "epoch": 14.37, + "grad_norm": 1.0, + "learning_rate": 0.00026643487713646266, + "loss": 0.1807, + "step": 347030 + }, + { + "epoch": 14.37, + "grad_norm": 0.86328125, + "learning_rate": 0.0002664240553863067, + "loss": 0.1621, + "step": 347040 + }, + { + "epoch": 14.37, + "grad_norm": 2.203125, + "learning_rate": 0.0002664132336052424, + "loss": 0.2428, + "step": 347050 + }, + { + "epoch": 14.38, + "grad_norm": 0.73046875, + "learning_rate": 0.00026640241179329017, + "loss": 0.2113, + "step": 347060 + }, + { + "epoch": 14.38, + "grad_norm": 0.7578125, + "learning_rate": 0.0002663915899504704, + "loss": 0.1844, + "step": 347070 + }, + { + "epoch": 14.38, + "grad_norm": 0.404296875, + "learning_rate": 0.00026638076807680337, + "loss": 0.1967, + "step": 347080 + }, + { + "epoch": 14.38, + "grad_norm": 1.0390625, + "learning_rate": 0.0002663699461723095, + "loss": 0.2113, + "step": 347090 + }, + { + "epoch": 14.38, + "grad_norm": 1.0390625, + "learning_rate": 0.00026635912423700933, + "loss": 0.1773, + "step": 347100 + }, + { + "epoch": 14.38, + "grad_norm": 0.466796875, + "learning_rate": 0.0002663483022709229, + "loss": 0.1876, + "step": 347110 + }, + { + "epoch": 14.38, + "grad_norm": 0.96875, + "learning_rate": 0.00026633748027407074, + "loss": 0.1665, + "step": 347120 + }, + { + "epoch": 14.38, + "grad_norm": 0.63671875, + "learning_rate": 0.0002663266582464732, + "loss": 0.1657, + "step": 347130 + }, + { + "epoch": 14.38, + "grad_norm": 0.75, + "learning_rate": 0.00026631583618815066, + "loss": 0.2, + "step": 347140 + }, + { + "epoch": 14.38, + "grad_norm": 0.62890625, + "learning_rate": 0.00026630501409912355, + "loss": 0.1765, + "step": 347150 + }, + { + "epoch": 14.38, + "grad_norm": 0.953125, + "learning_rate": 0.00026629419197941206, + "loss": 0.1534, + "step": 347160 + }, + { + "epoch": 14.38, + "grad_norm": 0.341796875, + "learning_rate": 0.0002662833698290367, + "loss": 0.2201, + "step": 347170 + }, + { + "epoch": 14.38, + "grad_norm": 0.828125, + "learning_rate": 0.0002662725476480179, + "loss": 0.1795, + "step": 347180 + }, + { + "epoch": 14.38, + "grad_norm": 0.87890625, + "learning_rate": 0.0002662617254363758, + "loss": 0.195, + "step": 347190 + }, + { + "epoch": 14.38, + "grad_norm": 0.337890625, + "learning_rate": 0.0002662509031941309, + "loss": 0.1868, + "step": 347200 + }, + { + "epoch": 14.38, + "grad_norm": 1.1171875, + "learning_rate": 0.00026624008092130357, + "loss": 0.2127, + "step": 347210 + }, + { + "epoch": 14.38, + "grad_norm": 0.62890625, + "learning_rate": 0.00026622925861791413, + "loss": 0.189, + "step": 347220 + }, + { + "epoch": 14.38, + "grad_norm": 0.353515625, + "learning_rate": 0.0002662184362839831, + "loss": 0.1826, + "step": 347230 + }, + { + "epoch": 14.38, + "grad_norm": 1.1796875, + "learning_rate": 0.0002662076139195306, + "loss": 0.1678, + "step": 347240 + }, + { + "epoch": 14.38, + "grad_norm": 1.515625, + "learning_rate": 0.0002661967915245772, + "loss": 0.1621, + "step": 347250 + }, + { + "epoch": 14.38, + "grad_norm": 1.3671875, + "learning_rate": 0.0002661859690991431, + "loss": 0.2102, + "step": 347260 + }, + { + "epoch": 14.38, + "grad_norm": 0.69921875, + "learning_rate": 0.0002661751466432488, + "loss": 0.1385, + "step": 347270 + }, + { + "epoch": 14.38, + "grad_norm": 0.67578125, + "learning_rate": 0.00026616432415691463, + "loss": 0.2958, + "step": 347280 + }, + { + "epoch": 14.38, + "grad_norm": 0.91796875, + "learning_rate": 0.00026615350164016087, + "loss": 0.167, + "step": 347290 + }, + { + "epoch": 14.39, + "grad_norm": 0.4609375, + "learning_rate": 0.00026614267909300804, + "loss": 0.1717, + "step": 347300 + }, + { + "epoch": 14.39, + "grad_norm": 1.625, + "learning_rate": 0.0002661318565154765, + "loss": 0.1841, + "step": 347310 + }, + { + "epoch": 14.39, + "grad_norm": 1.0, + "learning_rate": 0.00026612103390758645, + "loss": 0.1468, + "step": 347320 + }, + { + "epoch": 14.39, + "grad_norm": 1.15625, + "learning_rate": 0.00026611021126935836, + "loss": 0.1827, + "step": 347330 + }, + { + "epoch": 14.39, + "grad_norm": 0.373046875, + "learning_rate": 0.00026609938860081266, + "loss": 0.1958, + "step": 347340 + }, + { + "epoch": 14.39, + "grad_norm": 0.9375, + "learning_rate": 0.0002660885659019695, + "loss": 0.1593, + "step": 347350 + }, + { + "epoch": 14.39, + "grad_norm": 2.40625, + "learning_rate": 0.0002660777431728496, + "loss": 0.2066, + "step": 347360 + }, + { + "epoch": 14.39, + "grad_norm": 0.73828125, + "learning_rate": 0.00026606692041347295, + "loss": 0.2028, + "step": 347370 + }, + { + "epoch": 14.39, + "grad_norm": 0.384765625, + "learning_rate": 0.0002660560976238602, + "loss": 0.2067, + "step": 347380 + }, + { + "epoch": 14.39, + "grad_norm": 0.98046875, + "learning_rate": 0.00026604527480403155, + "loss": 0.2005, + "step": 347390 + }, + { + "epoch": 14.39, + "grad_norm": 0.796875, + "learning_rate": 0.00026603445195400743, + "loss": 0.1665, + "step": 347400 + }, + { + "epoch": 14.39, + "grad_norm": 0.921875, + "learning_rate": 0.00026602362907380823, + "loss": 0.177, + "step": 347410 + }, + { + "epoch": 14.39, + "grad_norm": 0.8671875, + "learning_rate": 0.0002660128061634543, + "loss": 0.2037, + "step": 347420 + }, + { + "epoch": 14.39, + "grad_norm": 0.59375, + "learning_rate": 0.0002660019832229659, + "loss": 0.195, + "step": 347430 + }, + { + "epoch": 14.39, + "grad_norm": 0.82421875, + "learning_rate": 0.00026599116025236366, + "loss": 0.1876, + "step": 347440 + }, + { + "epoch": 14.39, + "grad_norm": 0.8046875, + "learning_rate": 0.00026598033725166764, + "loss": 0.214, + "step": 347450 + }, + { + "epoch": 14.39, + "grad_norm": 1.4140625, + "learning_rate": 0.00026596951422089835, + "loss": 0.1919, + "step": 347460 + }, + { + "epoch": 14.39, + "grad_norm": 0.5703125, + "learning_rate": 0.00026595869116007633, + "loss": 0.2301, + "step": 347470 + }, + { + "epoch": 14.39, + "grad_norm": 0.73046875, + "learning_rate": 0.0002659478680692216, + "loss": 0.1885, + "step": 347480 + }, + { + "epoch": 14.39, + "grad_norm": 1.1875, + "learning_rate": 0.0002659370449483548, + "loss": 0.2101, + "step": 347490 + }, + { + "epoch": 14.39, + "grad_norm": 2.46875, + "learning_rate": 0.0002659262217974962, + "loss": 0.241, + "step": 347500 + }, + { + "epoch": 14.39, + "grad_norm": 0.80078125, + "learning_rate": 0.000265915398616666, + "loss": 0.2078, + "step": 347510 + }, + { + "epoch": 14.39, + "grad_norm": 0.83203125, + "learning_rate": 0.00026590457540588496, + "loss": 0.1427, + "step": 347520 + }, + { + "epoch": 14.39, + "grad_norm": 1.375, + "learning_rate": 0.00026589375216517315, + "loss": 0.2067, + "step": 347530 + }, + { + "epoch": 14.4, + "grad_norm": 0.78515625, + "learning_rate": 0.00026588292889455096, + "loss": 0.1796, + "step": 347540 + }, + { + "epoch": 14.4, + "grad_norm": 0.384765625, + "learning_rate": 0.0002658721055940389, + "loss": 0.1477, + "step": 347550 + }, + { + "epoch": 14.4, + "grad_norm": 0.55078125, + "learning_rate": 0.00026586128226365727, + "loss": 0.1192, + "step": 347560 + }, + { + "epoch": 14.4, + "grad_norm": 1.171875, + "learning_rate": 0.0002658504589034263, + "loss": 0.1809, + "step": 347570 + }, + { + "epoch": 14.4, + "grad_norm": 1.0703125, + "learning_rate": 0.0002658396355133666, + "loss": 0.2165, + "step": 347580 + }, + { + "epoch": 14.4, + "grad_norm": 1.015625, + "learning_rate": 0.0002658288120934983, + "loss": 0.1943, + "step": 347590 + }, + { + "epoch": 14.4, + "grad_norm": 0.62890625, + "learning_rate": 0.00026581798864384193, + "loss": 0.2109, + "step": 347600 + }, + { + "epoch": 14.4, + "grad_norm": 0.78515625, + "learning_rate": 0.00026580716516441785, + "loss": 0.1742, + "step": 347610 + }, + { + "epoch": 14.4, + "grad_norm": 1.40625, + "learning_rate": 0.00026579634165524636, + "loss": 0.199, + "step": 347620 + }, + { + "epoch": 14.4, + "grad_norm": 0.8046875, + "learning_rate": 0.0002657855181163479, + "loss": 0.1528, + "step": 347630 + }, + { + "epoch": 14.4, + "grad_norm": 0.68359375, + "learning_rate": 0.0002657746945477427, + "loss": 0.2235, + "step": 347640 + }, + { + "epoch": 14.4, + "grad_norm": 0.7734375, + "learning_rate": 0.0002657638709494513, + "loss": 0.1406, + "step": 347650 + }, + { + "epoch": 14.4, + "grad_norm": 1.375, + "learning_rate": 0.0002657530473214941, + "loss": 0.1885, + "step": 347660 + }, + { + "epoch": 14.4, + "grad_norm": 1.2421875, + "learning_rate": 0.0002657422236638912, + "loss": 0.1708, + "step": 347670 + }, + { + "epoch": 14.4, + "grad_norm": 1.140625, + "learning_rate": 0.0002657313999766632, + "loss": 0.1956, + "step": 347680 + }, + { + "epoch": 14.4, + "grad_norm": 1.640625, + "learning_rate": 0.0002657205762598304, + "loss": 0.1376, + "step": 347690 + }, + { + "epoch": 14.4, + "grad_norm": 0.4140625, + "learning_rate": 0.0002657097525134131, + "loss": 0.173, + "step": 347700 + }, + { + "epoch": 14.4, + "grad_norm": 0.53125, + "learning_rate": 0.0002656989287374319, + "loss": 0.1835, + "step": 347710 + }, + { + "epoch": 14.4, + "grad_norm": 0.83203125, + "learning_rate": 0.0002656881049319069, + "loss": 0.1834, + "step": 347720 + }, + { + "epoch": 14.4, + "grad_norm": 0.69921875, + "learning_rate": 0.0002656772810968586, + "loss": 0.1962, + "step": 347730 + }, + { + "epoch": 14.4, + "grad_norm": 0.6875, + "learning_rate": 0.00026566645723230737, + "loss": 0.1868, + "step": 347740 + }, + { + "epoch": 14.4, + "grad_norm": 1.28125, + "learning_rate": 0.0002656556333382735, + "loss": 0.19, + "step": 347750 + }, + { + "epoch": 14.4, + "grad_norm": 0.63671875, + "learning_rate": 0.0002656448094147775, + "loss": 0.2101, + "step": 347760 + }, + { + "epoch": 14.4, + "grad_norm": 0.890625, + "learning_rate": 0.0002656339854618396, + "loss": 0.1907, + "step": 347770 + }, + { + "epoch": 14.41, + "grad_norm": 0.5546875, + "learning_rate": 0.0002656231614794802, + "loss": 0.1424, + "step": 347780 + }, + { + "epoch": 14.41, + "grad_norm": 1.0703125, + "learning_rate": 0.00026561233746771987, + "loss": 0.2129, + "step": 347790 + }, + { + "epoch": 14.41, + "grad_norm": 0.5703125, + "learning_rate": 0.00026560151342657856, + "loss": 0.1893, + "step": 347800 + }, + { + "epoch": 14.41, + "grad_norm": 0.474609375, + "learning_rate": 0.00026559068935607706, + "loss": 0.1856, + "step": 347810 + }, + { + "epoch": 14.41, + "grad_norm": 0.375, + "learning_rate": 0.0002655798652562356, + "loss": 0.1718, + "step": 347820 + }, + { + "epoch": 14.41, + "grad_norm": 1.6796875, + "learning_rate": 0.0002655690411270743, + "loss": 0.1741, + "step": 347830 + }, + { + "epoch": 14.41, + "grad_norm": 0.40625, + "learning_rate": 0.0002655582169686139, + "loss": 0.1981, + "step": 347840 + }, + { + "epoch": 14.41, + "grad_norm": 0.734375, + "learning_rate": 0.00026554739278087465, + "loss": 0.1967, + "step": 347850 + }, + { + "epoch": 14.41, + "grad_norm": 1.96875, + "learning_rate": 0.00026553656856387683, + "loss": 0.1286, + "step": 347860 + }, + { + "epoch": 14.41, + "grad_norm": 0.87109375, + "learning_rate": 0.00026552574431764085, + "loss": 0.2551, + "step": 347870 + }, + { + "epoch": 14.41, + "grad_norm": 0.6640625, + "learning_rate": 0.0002655149200421871, + "loss": 0.1637, + "step": 347880 + }, + { + "epoch": 14.41, + "grad_norm": 1.125, + "learning_rate": 0.00026550409573753597, + "loss": 0.1956, + "step": 347890 + }, + { + "epoch": 14.41, + "grad_norm": 1.859375, + "learning_rate": 0.0002654932714037078, + "loss": 0.2043, + "step": 347900 + }, + { + "epoch": 14.41, + "grad_norm": 0.74609375, + "learning_rate": 0.00026548244704072295, + "loss": 0.1714, + "step": 347910 + }, + { + "epoch": 14.41, + "grad_norm": 0.734375, + "learning_rate": 0.0002654716226486018, + "loss": 0.1622, + "step": 347920 + }, + { + "epoch": 14.41, + "grad_norm": 0.330078125, + "learning_rate": 0.00026546079822736476, + "loss": 0.1111, + "step": 347930 + }, + { + "epoch": 14.41, + "grad_norm": 1.15625, + "learning_rate": 0.0002654499737770321, + "loss": 0.2117, + "step": 347940 + }, + { + "epoch": 14.41, + "grad_norm": 1.109375, + "learning_rate": 0.0002654391492976244, + "loss": 0.1836, + "step": 347950 + }, + { + "epoch": 14.41, + "grad_norm": 0.64453125, + "learning_rate": 0.0002654283247891617, + "loss": 0.1897, + "step": 347960 + }, + { + "epoch": 14.41, + "grad_norm": 1.125, + "learning_rate": 0.00026541750025166474, + "loss": 0.19, + "step": 347970 + }, + { + "epoch": 14.41, + "grad_norm": 0.61328125, + "learning_rate": 0.0002654066756851536, + "loss": 0.174, + "step": 347980 + }, + { + "epoch": 14.41, + "grad_norm": 1.7890625, + "learning_rate": 0.00026539585108964876, + "loss": 0.1773, + "step": 347990 + }, + { + "epoch": 14.41, + "grad_norm": 2.15625, + "learning_rate": 0.00026538502646517064, + "loss": 0.1899, + "step": 348000 + }, + { + "epoch": 14.41, + "grad_norm": 0.5546875, + "learning_rate": 0.00026537420181173954, + "loss": 0.2232, + "step": 348010 + }, + { + "epoch": 14.41, + "grad_norm": 1.4140625, + "learning_rate": 0.00026536337712937586, + "loss": 0.215, + "step": 348020 + }, + { + "epoch": 14.42, + "grad_norm": 0.640625, + "learning_rate": 0.0002653525524181, + "loss": 0.2244, + "step": 348030 + }, + { + "epoch": 14.42, + "grad_norm": 0.9921875, + "learning_rate": 0.0002653417276779321, + "loss": 0.211, + "step": 348040 + }, + { + "epoch": 14.42, + "grad_norm": 1.328125, + "learning_rate": 0.0002653309029088929, + "loss": 0.1767, + "step": 348050 + }, + { + "epoch": 14.42, + "grad_norm": 0.51171875, + "learning_rate": 0.0002653200781110026, + "loss": 0.1587, + "step": 348060 + }, + { + "epoch": 14.42, + "grad_norm": 0.275390625, + "learning_rate": 0.00026530925328428155, + "loss": 0.167, + "step": 348070 + }, + { + "epoch": 14.42, + "grad_norm": 0.625, + "learning_rate": 0.0002652984284287502, + "loss": 0.2121, + "step": 348080 + }, + { + "epoch": 14.42, + "grad_norm": 1.4765625, + "learning_rate": 0.0002652876035444287, + "loss": 0.2264, + "step": 348090 + }, + { + "epoch": 14.42, + "grad_norm": 1.03125, + "learning_rate": 0.0002652767786313377, + "loss": 0.2065, + "step": 348100 + }, + { + "epoch": 14.42, + "grad_norm": 0.69140625, + "learning_rate": 0.0002652659536894974, + "loss": 0.1857, + "step": 348110 + }, + { + "epoch": 14.42, + "grad_norm": 1.3203125, + "learning_rate": 0.0002652551287189282, + "loss": 0.2264, + "step": 348120 + }, + { + "epoch": 14.42, + "grad_norm": 2.375, + "learning_rate": 0.0002652443037196506, + "loss": 0.2537, + "step": 348130 + }, + { + "epoch": 14.42, + "grad_norm": 0.76171875, + "learning_rate": 0.0002652334786916848, + "loss": 0.2182, + "step": 348140 + }, + { + "epoch": 14.42, + "grad_norm": 0.6796875, + "learning_rate": 0.0002652226536350512, + "loss": 0.1908, + "step": 348150 + }, + { + "epoch": 14.42, + "grad_norm": 0.376953125, + "learning_rate": 0.0002652118285497703, + "loss": 0.1712, + "step": 348160 + }, + { + "epoch": 14.42, + "grad_norm": 0.58984375, + "learning_rate": 0.00026520100343586234, + "loss": 0.1895, + "step": 348170 + }, + { + "epoch": 14.42, + "grad_norm": 1.2109375, + "learning_rate": 0.00026519017829334775, + "loss": 0.1818, + "step": 348180 + }, + { + "epoch": 14.42, + "grad_norm": 0.6328125, + "learning_rate": 0.0002651793531222468, + "loss": 0.2069, + "step": 348190 + }, + { + "epoch": 14.42, + "grad_norm": 2.453125, + "learning_rate": 0.00026516852792258, + "loss": 0.1831, + "step": 348200 + }, + { + "epoch": 14.42, + "grad_norm": 1.375, + "learning_rate": 0.0002651577026943677, + "loss": 0.2159, + "step": 348210 + }, + { + "epoch": 14.42, + "grad_norm": 0.8046875, + "learning_rate": 0.0002651468774376302, + "loss": 0.231, + "step": 348220 + }, + { + "epoch": 14.42, + "grad_norm": 1.2578125, + "learning_rate": 0.00026513605215238795, + "loss": 0.2174, + "step": 348230 + }, + { + "epoch": 14.42, + "grad_norm": 0.8125, + "learning_rate": 0.0002651252268386613, + "loss": 0.1935, + "step": 348240 + }, + { + "epoch": 14.42, + "grad_norm": 0.6640625, + "learning_rate": 0.00026511440149647056, + "loss": 0.2314, + "step": 348250 + }, + { + "epoch": 14.42, + "grad_norm": 1.21875, + "learning_rate": 0.00026510357612583615, + "loss": 0.2115, + "step": 348260 + }, + { + "epoch": 14.43, + "grad_norm": 0.2001953125, + "learning_rate": 0.0002650927507267784, + "loss": 0.2046, + "step": 348270 + }, + { + "epoch": 14.43, + "grad_norm": 0.99609375, + "learning_rate": 0.0002650819252993178, + "loss": 0.1992, + "step": 348280 + }, + { + "epoch": 14.43, + "grad_norm": 1.234375, + "learning_rate": 0.00026507109984347466, + "loss": 0.2313, + "step": 348290 + }, + { + "epoch": 14.43, + "grad_norm": 0.5859375, + "learning_rate": 0.00026506027435926926, + "loss": 0.1594, + "step": 348300 + }, + { + "epoch": 14.43, + "grad_norm": 1.3984375, + "learning_rate": 0.0002650494488467221, + "loss": 0.2041, + "step": 348310 + }, + { + "epoch": 14.43, + "grad_norm": 0.73828125, + "learning_rate": 0.0002650386233058535, + "loss": 0.1444, + "step": 348320 + }, + { + "epoch": 14.43, + "grad_norm": 0.90625, + "learning_rate": 0.0002650277977366838, + "loss": 0.2129, + "step": 348330 + }, + { + "epoch": 14.43, + "grad_norm": 0.68359375, + "learning_rate": 0.00026501697213923345, + "loss": 0.1728, + "step": 348340 + }, + { + "epoch": 14.43, + "grad_norm": 1.4453125, + "learning_rate": 0.00026500614651352275, + "loss": 0.2081, + "step": 348350 + }, + { + "epoch": 14.43, + "grad_norm": 0.86328125, + "learning_rate": 0.0002649953208595721, + "loss": 0.183, + "step": 348360 + }, + { + "epoch": 14.43, + "grad_norm": 0.7421875, + "learning_rate": 0.0002649844951774019, + "loss": 0.1818, + "step": 348370 + }, + { + "epoch": 14.43, + "grad_norm": 0.88671875, + "learning_rate": 0.0002649736694670325, + "loss": 0.2164, + "step": 348380 + }, + { + "epoch": 14.43, + "grad_norm": 0.287109375, + "learning_rate": 0.0002649628437284842, + "loss": 0.2074, + "step": 348390 + }, + { + "epoch": 14.43, + "grad_norm": 0.60546875, + "learning_rate": 0.0002649520179617775, + "loss": 0.172, + "step": 348400 + }, + { + "epoch": 14.43, + "grad_norm": 1.0859375, + "learning_rate": 0.0002649411921669328, + "loss": 0.2123, + "step": 348410 + }, + { + "epoch": 14.43, + "grad_norm": 2.171875, + "learning_rate": 0.00026493036634397027, + "loss": 0.195, + "step": 348420 + }, + { + "epoch": 14.43, + "grad_norm": 0.5390625, + "learning_rate": 0.0002649195404929104, + "loss": 0.1751, + "step": 348430 + }, + { + "epoch": 14.43, + "grad_norm": 1.2109375, + "learning_rate": 0.00026490871461377365, + "loss": 0.1405, + "step": 348440 + }, + { + "epoch": 14.43, + "grad_norm": 0.36328125, + "learning_rate": 0.00026489788870658027, + "loss": 0.2005, + "step": 348450 + }, + { + "epoch": 14.43, + "grad_norm": 0.71875, + "learning_rate": 0.0002648870627713507, + "loss": 0.183, + "step": 348460 + }, + { + "epoch": 14.43, + "grad_norm": 0.77734375, + "learning_rate": 0.00026487623680810523, + "loss": 0.1797, + "step": 348470 + }, + { + "epoch": 14.43, + "grad_norm": 0.67578125, + "learning_rate": 0.00026486541081686434, + "loss": 0.1492, + "step": 348480 + }, + { + "epoch": 14.43, + "grad_norm": 0.388671875, + "learning_rate": 0.00026485458479764837, + "loss": 0.2184, + "step": 348490 + }, + { + "epoch": 14.43, + "grad_norm": 0.8203125, + "learning_rate": 0.00026484375875047755, + "loss": 0.1225, + "step": 348500 + }, + { + "epoch": 14.44, + "grad_norm": 0.7890625, + "learning_rate": 0.00026483293267537243, + "loss": 0.1619, + "step": 348510 + }, + { + "epoch": 14.44, + "grad_norm": 1.09375, + "learning_rate": 0.0002648221065723534, + "loss": 0.2153, + "step": 348520 + }, + { + "epoch": 14.44, + "grad_norm": 1.0390625, + "learning_rate": 0.0002648112804414407, + "loss": 0.1488, + "step": 348530 + }, + { + "epoch": 14.44, + "grad_norm": 0.41796875, + "learning_rate": 0.0002648004542826549, + "loss": 0.1568, + "step": 348540 + }, + { + "epoch": 14.44, + "grad_norm": 0.984375, + "learning_rate": 0.0002647896280960161, + "loss": 0.1871, + "step": 348550 + }, + { + "epoch": 14.44, + "grad_norm": 0.70703125, + "learning_rate": 0.00026477880188154485, + "loss": 0.2054, + "step": 348560 + }, + { + "epoch": 14.44, + "grad_norm": 0.97265625, + "learning_rate": 0.00026476797563926155, + "loss": 0.2002, + "step": 348570 + }, + { + "epoch": 14.44, + "grad_norm": 1.15625, + "learning_rate": 0.0002647571493691864, + "loss": 0.2775, + "step": 348580 + }, + { + "epoch": 14.44, + "grad_norm": 0.96484375, + "learning_rate": 0.0002647463230713399, + "loss": 0.1942, + "step": 348590 + }, + { + "epoch": 14.44, + "grad_norm": 0.72265625, + "learning_rate": 0.0002647354967457425, + "loss": 0.1856, + "step": 348600 + }, + { + "epoch": 14.44, + "grad_norm": 1.3125, + "learning_rate": 0.0002647246703924144, + "loss": 0.2003, + "step": 348610 + }, + { + "epoch": 14.44, + "grad_norm": 1.484375, + "learning_rate": 0.0002647138440113762, + "loss": 0.1854, + "step": 348620 + }, + { + "epoch": 14.44, + "grad_norm": 0.83203125, + "learning_rate": 0.00026470301760264805, + "loss": 0.1703, + "step": 348630 + }, + { + "epoch": 14.44, + "grad_norm": 0.296875, + "learning_rate": 0.0002646921911662504, + "loss": 0.1686, + "step": 348640 + }, + { + "epoch": 14.44, + "grad_norm": 0.69921875, + "learning_rate": 0.00026468136470220367, + "loss": 0.2053, + "step": 348650 + }, + { + "epoch": 14.44, + "grad_norm": 0.765625, + "learning_rate": 0.0002646705382105281, + "loss": 0.1475, + "step": 348660 + }, + { + "epoch": 14.44, + "grad_norm": 0.4765625, + "learning_rate": 0.00026465971169124423, + "loss": 0.1466, + "step": 348670 + }, + { + "epoch": 14.44, + "grad_norm": 1.3359375, + "learning_rate": 0.0002646488851443724, + "loss": 0.2386, + "step": 348680 + }, + { + "epoch": 14.44, + "grad_norm": 0.92578125, + "learning_rate": 0.0002646380585699329, + "loss": 0.1717, + "step": 348690 + }, + { + "epoch": 14.44, + "grad_norm": 0.640625, + "learning_rate": 0.0002646272319679462, + "loss": 0.1852, + "step": 348700 + }, + { + "epoch": 14.44, + "grad_norm": 0.91796875, + "learning_rate": 0.00026461640533843254, + "loss": 0.1942, + "step": 348710 + }, + { + "epoch": 14.44, + "grad_norm": 1.2890625, + "learning_rate": 0.0002646055786814124, + "loss": 0.2113, + "step": 348720 + }, + { + "epoch": 14.44, + "grad_norm": 0.90234375, + "learning_rate": 0.0002645947519969063, + "loss": 0.2019, + "step": 348730 + }, + { + "epoch": 14.44, + "grad_norm": 0.87890625, + "learning_rate": 0.00026458392528493427, + "loss": 0.1914, + "step": 348740 + }, + { + "epoch": 14.45, + "grad_norm": 1.25, + "learning_rate": 0.0002645730985455169, + "loss": 0.1565, + "step": 348750 + }, + { + "epoch": 14.45, + "grad_norm": 0.92578125, + "learning_rate": 0.0002645622717786746, + "loss": 0.2028, + "step": 348760 + }, + { + "epoch": 14.45, + "grad_norm": 0.81640625, + "learning_rate": 0.00026455144498442764, + "loss": 0.1773, + "step": 348770 + }, + { + "epoch": 14.45, + "grad_norm": 0.9296875, + "learning_rate": 0.00026454061816279646, + "loss": 0.2017, + "step": 348780 + }, + { + "epoch": 14.45, + "grad_norm": 1.5390625, + "learning_rate": 0.0002645297913138013, + "loss": 0.181, + "step": 348790 + }, + { + "epoch": 14.45, + "grad_norm": 0.82421875, + "learning_rate": 0.00026451896443746277, + "loss": 0.1376, + "step": 348800 + }, + { + "epoch": 14.45, + "grad_norm": 2.6875, + "learning_rate": 0.0002645081375338011, + "loss": 0.2027, + "step": 348810 + }, + { + "epoch": 14.45, + "grad_norm": 0.86328125, + "learning_rate": 0.00026449731060283657, + "loss": 0.1495, + "step": 348820 + }, + { + "epoch": 14.45, + "grad_norm": 1.546875, + "learning_rate": 0.0002644864836445898, + "loss": 0.1901, + "step": 348830 + }, + { + "epoch": 14.45, + "grad_norm": 0.45703125, + "learning_rate": 0.00026447565665908097, + "loss": 0.1735, + "step": 348840 + }, + { + "epoch": 14.45, + "grad_norm": 0.640625, + "learning_rate": 0.0002644648296463305, + "loss": 0.141, + "step": 348850 + }, + { + "epoch": 14.45, + "grad_norm": 1.2109375, + "learning_rate": 0.0002644540026063588, + "loss": 0.2032, + "step": 348860 + }, + { + "epoch": 14.45, + "grad_norm": 2.5625, + "learning_rate": 0.0002644431755391862, + "loss": 0.1853, + "step": 348870 + }, + { + "epoch": 14.45, + "grad_norm": 1.0625, + "learning_rate": 0.0002644323484448332, + "loss": 0.1965, + "step": 348880 + }, + { + "epoch": 14.45, + "grad_norm": 0.99609375, + "learning_rate": 0.00026442152132332005, + "loss": 0.2159, + "step": 348890 + }, + { + "epoch": 14.45, + "grad_norm": 0.62109375, + "learning_rate": 0.00026441069417466705, + "loss": 0.2076, + "step": 348900 + }, + { + "epoch": 14.45, + "grad_norm": 1.078125, + "learning_rate": 0.0002643998669988948, + "loss": 0.1838, + "step": 348910 + }, + { + "epoch": 14.45, + "grad_norm": 0.84375, + "learning_rate": 0.0002643890397960235, + "loss": 0.1595, + "step": 348920 + }, + { + "epoch": 14.45, + "grad_norm": 2.03125, + "learning_rate": 0.0002643782125660736, + "loss": 0.172, + "step": 348930 + }, + { + "epoch": 14.45, + "grad_norm": 0.80859375, + "learning_rate": 0.00026436738530906547, + "loss": 0.1977, + "step": 348940 + }, + { + "epoch": 14.45, + "grad_norm": 1.5625, + "learning_rate": 0.00026435655802501936, + "loss": 0.1628, + "step": 348950 + }, + { + "epoch": 14.45, + "grad_norm": 0.7578125, + "learning_rate": 0.0002643457307139559, + "loss": 0.1611, + "step": 348960 + }, + { + "epoch": 14.45, + "grad_norm": 0.0, + "learning_rate": 0.00026433490337589527, + "loss": 0.2187, + "step": 348970 + }, + { + "epoch": 14.45, + "grad_norm": 0.80859375, + "learning_rate": 0.00026432407601085784, + "loss": 0.1525, + "step": 348980 + }, + { + "epoch": 14.46, + "grad_norm": 0.443359375, + "learning_rate": 0.0002643132486188641, + "loss": 0.21, + "step": 348990 + }, + { + "epoch": 14.46, + "grad_norm": 0.51953125, + "learning_rate": 0.0002643024211999344, + "loss": 0.1852, + "step": 349000 + }, + { + "epoch": 14.46, + "grad_norm": 0.796875, + "learning_rate": 0.0002642915937540891, + "loss": 0.1514, + "step": 349010 + }, + { + "epoch": 14.46, + "grad_norm": 0.56640625, + "learning_rate": 0.0002642807662813486, + "loss": 0.2083, + "step": 349020 + }, + { + "epoch": 14.46, + "grad_norm": 0.73828125, + "learning_rate": 0.00026426993878173304, + "loss": 0.2092, + "step": 349030 + }, + { + "epoch": 14.46, + "grad_norm": 0.30078125, + "learning_rate": 0.0002642591112552631, + "loss": 0.1734, + "step": 349040 + }, + { + "epoch": 14.46, + "grad_norm": 0.96484375, + "learning_rate": 0.0002642482837019592, + "loss": 0.2126, + "step": 349050 + }, + { + "epoch": 14.46, + "grad_norm": 0.3046875, + "learning_rate": 0.00026423745612184137, + "loss": 0.1939, + "step": 349060 + }, + { + "epoch": 14.46, + "grad_norm": 1.3671875, + "learning_rate": 0.0002642266285149303, + "loss": 0.2342, + "step": 349070 + }, + { + "epoch": 14.46, + "grad_norm": 0.416015625, + "learning_rate": 0.0002642158008812462, + "loss": 0.1847, + "step": 349080 + }, + { + "epoch": 14.46, + "grad_norm": 0.71875, + "learning_rate": 0.0002642049732208095, + "loss": 0.186, + "step": 349090 + }, + { + "epoch": 14.46, + "grad_norm": 0.578125, + "learning_rate": 0.0002641941455336406, + "loss": 0.2548, + "step": 349100 + }, + { + "epoch": 14.46, + "grad_norm": 1.2265625, + "learning_rate": 0.0002641833178197598, + "loss": 0.2293, + "step": 349110 + }, + { + "epoch": 14.46, + "grad_norm": 1.109375, + "learning_rate": 0.0002641724900791876, + "loss": 0.1672, + "step": 349120 + }, + { + "epoch": 14.46, + "grad_norm": 0.263671875, + "learning_rate": 0.00026416166231194423, + "loss": 0.1524, + "step": 349130 + }, + { + "epoch": 14.46, + "grad_norm": 0.6640625, + "learning_rate": 0.00026415083451805015, + "loss": 0.1658, + "step": 349140 + }, + { + "epoch": 14.46, + "grad_norm": 0.71875, + "learning_rate": 0.0002641400066975258, + "loss": 0.2128, + "step": 349150 + }, + { + "epoch": 14.46, + "grad_norm": 0.6640625, + "learning_rate": 0.00026412917885039147, + "loss": 0.2299, + "step": 349160 + }, + { + "epoch": 14.46, + "grad_norm": 0.98828125, + "learning_rate": 0.00026411835097666747, + "loss": 0.2265, + "step": 349170 + }, + { + "epoch": 14.46, + "grad_norm": 0.455078125, + "learning_rate": 0.0002641075230763744, + "loss": 0.1684, + "step": 349180 + }, + { + "epoch": 14.46, + "grad_norm": 1.828125, + "learning_rate": 0.0002640966951495323, + "loss": 0.1751, + "step": 349190 + }, + { + "epoch": 14.46, + "grad_norm": 1.21875, + "learning_rate": 0.0002640858671961619, + "loss": 0.2174, + "step": 349200 + }, + { + "epoch": 14.46, + "grad_norm": 0.51953125, + "learning_rate": 0.00026407503921628333, + "loss": 0.1943, + "step": 349210 + }, + { + "epoch": 14.46, + "grad_norm": 0.92578125, + "learning_rate": 0.00026406421120991707, + "loss": 0.2047, + "step": 349220 + }, + { + "epoch": 14.47, + "grad_norm": 0.294921875, + "learning_rate": 0.00026405338317708353, + "loss": 0.2063, + "step": 349230 + }, + { + "epoch": 14.47, + "grad_norm": 0.9140625, + "learning_rate": 0.000264042555117803, + "loss": 0.2232, + "step": 349240 + }, + { + "epoch": 14.47, + "grad_norm": 0.9921875, + "learning_rate": 0.00026403172703209595, + "loss": 0.1579, + "step": 349250 + }, + { + "epoch": 14.47, + "grad_norm": 1.625, + "learning_rate": 0.00026402089891998273, + "loss": 0.1998, + "step": 349260 + }, + { + "epoch": 14.47, + "grad_norm": 0.90234375, + "learning_rate": 0.00026401007078148355, + "loss": 0.1725, + "step": 349270 + }, + { + "epoch": 14.47, + "grad_norm": 1.328125, + "learning_rate": 0.000263999242616619, + "loss": 0.1785, + "step": 349280 + }, + { + "epoch": 14.47, + "grad_norm": 0.2109375, + "learning_rate": 0.0002639884144254094, + "loss": 0.1859, + "step": 349290 + }, + { + "epoch": 14.47, + "grad_norm": 2.40625, + "learning_rate": 0.0002639775862078751, + "loss": 0.1574, + "step": 349300 + }, + { + "epoch": 14.47, + "grad_norm": 1.2734375, + "learning_rate": 0.00026396675796403656, + "loss": 0.1903, + "step": 349310 + }, + { + "epoch": 14.47, + "grad_norm": 0.82421875, + "learning_rate": 0.00026395592969391405, + "loss": 0.2012, + "step": 349320 + }, + { + "epoch": 14.47, + "grad_norm": 0.65625, + "learning_rate": 0.00026394510139752793, + "loss": 0.1601, + "step": 349330 + }, + { + "epoch": 14.47, + "grad_norm": 0.55078125, + "learning_rate": 0.0002639342730748987, + "loss": 0.1885, + "step": 349340 + }, + { + "epoch": 14.47, + "grad_norm": 0.6875, + "learning_rate": 0.00026392344472604657, + "loss": 0.192, + "step": 349350 + }, + { + "epoch": 14.47, + "grad_norm": 0.8671875, + "learning_rate": 0.00026391261635099215, + "loss": 0.1831, + "step": 349360 + }, + { + "epoch": 14.47, + "grad_norm": 0.703125, + "learning_rate": 0.0002639017879497556, + "loss": 0.2061, + "step": 349370 + }, + { + "epoch": 14.47, + "grad_norm": 0.71484375, + "learning_rate": 0.0002638909595223575, + "loss": 0.224, + "step": 349380 + }, + { + "epoch": 14.47, + "grad_norm": 0.515625, + "learning_rate": 0.00026388013106881804, + "loss": 0.1682, + "step": 349390 + }, + { + "epoch": 14.47, + "grad_norm": 0.8828125, + "learning_rate": 0.00026386930258915763, + "loss": 0.1847, + "step": 349400 + }, + { + "epoch": 14.47, + "grad_norm": 0.96484375, + "learning_rate": 0.00026385847408339673, + "loss": 0.1901, + "step": 349410 + }, + { + "epoch": 14.47, + "grad_norm": 0.70703125, + "learning_rate": 0.0002638476455515557, + "loss": 0.1939, + "step": 349420 + }, + { + "epoch": 14.47, + "grad_norm": 0.90234375, + "learning_rate": 0.00026383681699365484, + "loss": 0.1987, + "step": 349430 + }, + { + "epoch": 14.47, + "grad_norm": 2.078125, + "learning_rate": 0.0002638259884097147, + "loss": 0.2022, + "step": 349440 + }, + { + "epoch": 14.47, + "grad_norm": 0.546875, + "learning_rate": 0.0002638151597997554, + "loss": 0.2235, + "step": 349450 + }, + { + "epoch": 14.47, + "grad_norm": 0.43359375, + "learning_rate": 0.0002638043311637975, + "loss": 0.2053, + "step": 349460 + }, + { + "epoch": 14.48, + "grad_norm": 0.55078125, + "learning_rate": 0.0002637935025018614, + "loss": 0.1974, + "step": 349470 + }, + { + "epoch": 14.48, + "grad_norm": 0.66796875, + "learning_rate": 0.0002637826738139674, + "loss": 0.1811, + "step": 349480 + }, + { + "epoch": 14.48, + "grad_norm": 1.140625, + "learning_rate": 0.00026377184510013586, + "loss": 0.1954, + "step": 349490 + }, + { + "epoch": 14.48, + "grad_norm": 0.76171875, + "learning_rate": 0.00026376101636038716, + "loss": 0.1916, + "step": 349500 + }, + { + "epoch": 14.48, + "grad_norm": 1.1171875, + "learning_rate": 0.0002637501875947418, + "loss": 0.1849, + "step": 349510 + }, + { + "epoch": 14.48, + "grad_norm": 1.359375, + "learning_rate": 0.0002637393588032201, + "loss": 0.1797, + "step": 349520 + }, + { + "epoch": 14.48, + "grad_norm": 1.0078125, + "learning_rate": 0.0002637285299858423, + "loss": 0.2045, + "step": 349530 + }, + { + "epoch": 14.48, + "grad_norm": 1.5078125, + "learning_rate": 0.00026371770114262894, + "loss": 0.1942, + "step": 349540 + }, + { + "epoch": 14.48, + "grad_norm": 1.46875, + "learning_rate": 0.0002637068722736004, + "loss": 0.2097, + "step": 349550 + }, + { + "epoch": 14.48, + "grad_norm": 0.58984375, + "learning_rate": 0.000263696043378777, + "loss": 0.1803, + "step": 349560 + }, + { + "epoch": 14.48, + "grad_norm": 1.0078125, + "learning_rate": 0.000263685214458179, + "loss": 0.1455, + "step": 349570 + }, + { + "epoch": 14.48, + "grad_norm": 0.85546875, + "learning_rate": 0.0002636743855118269, + "loss": 0.1727, + "step": 349580 + }, + { + "epoch": 14.48, + "grad_norm": 0.462890625, + "learning_rate": 0.00026366355653974124, + "loss": 0.1272, + "step": 349590 + }, + { + "epoch": 14.48, + "grad_norm": 0.76171875, + "learning_rate": 0.0002636527275419422, + "loss": 0.2202, + "step": 349600 + }, + { + "epoch": 14.48, + "grad_norm": 1.0546875, + "learning_rate": 0.00026364189851845013, + "loss": 0.1631, + "step": 349610 + }, + { + "epoch": 14.48, + "grad_norm": 0.66796875, + "learning_rate": 0.0002636310694692855, + "loss": 0.1742, + "step": 349620 + }, + { + "epoch": 14.48, + "grad_norm": 0.578125, + "learning_rate": 0.0002636202403944687, + "loss": 0.2051, + "step": 349630 + }, + { + "epoch": 14.48, + "grad_norm": 1.2265625, + "learning_rate": 0.00026360941129402004, + "loss": 0.1609, + "step": 349640 + }, + { + "epoch": 14.48, + "grad_norm": 0.46875, + "learning_rate": 0.00026359858216796003, + "loss": 0.1856, + "step": 349650 + }, + { + "epoch": 14.48, + "grad_norm": 0.4921875, + "learning_rate": 0.00026358775301630884, + "loss": 0.1752, + "step": 349660 + }, + { + "epoch": 14.48, + "grad_norm": 0.62109375, + "learning_rate": 0.000263576923839087, + "loss": 0.166, + "step": 349670 + }, + { + "epoch": 14.48, + "grad_norm": 1.3984375, + "learning_rate": 0.0002635660946363149, + "loss": 0.1696, + "step": 349680 + }, + { + "epoch": 14.48, + "grad_norm": 1.625, + "learning_rate": 0.0002635552654080129, + "loss": 0.1958, + "step": 349690 + }, + { + "epoch": 14.48, + "grad_norm": 0.890625, + "learning_rate": 0.00026354443615420124, + "loss": 0.1904, + "step": 349700 + }, + { + "epoch": 14.48, + "grad_norm": 3.8623809814453125e-05, + "learning_rate": 0.00026353360687490047, + "loss": 0.203, + "step": 349710 + }, + { + "epoch": 14.49, + "grad_norm": 0.57421875, + "learning_rate": 0.0002635227775701309, + "loss": 0.1909, + "step": 349720 + }, + { + "epoch": 14.49, + "grad_norm": 0.68359375, + "learning_rate": 0.000263511948239913, + "loss": 0.1942, + "step": 349730 + }, + { + "epoch": 14.49, + "grad_norm": 0.671875, + "learning_rate": 0.000263501118884267, + "loss": 0.1793, + "step": 349740 + }, + { + "epoch": 14.49, + "grad_norm": 0.8359375, + "learning_rate": 0.00026349028950321335, + "loss": 0.186, + "step": 349750 + }, + { + "epoch": 14.49, + "grad_norm": 0.55078125, + "learning_rate": 0.0002634794600967725, + "loss": 0.2008, + "step": 349760 + }, + { + "epoch": 14.49, + "grad_norm": 1.59375, + "learning_rate": 0.0002634686306649647, + "loss": 0.2136, + "step": 349770 + }, + { + "epoch": 14.49, + "grad_norm": 1.0234375, + "learning_rate": 0.00026345780120781036, + "loss": 0.213, + "step": 349780 + }, + { + "epoch": 14.49, + "grad_norm": 0.77734375, + "learning_rate": 0.00026344697172532995, + "loss": 0.1853, + "step": 349790 + }, + { + "epoch": 14.49, + "grad_norm": 1.3515625, + "learning_rate": 0.00026343614221754376, + "loss": 0.1978, + "step": 349800 + }, + { + "epoch": 14.49, + "grad_norm": 0.921875, + "learning_rate": 0.00026342531268447225, + "loss": 0.1707, + "step": 349810 + }, + { + "epoch": 14.49, + "grad_norm": 0.4609375, + "learning_rate": 0.0002634144831261357, + "loss": 0.1369, + "step": 349820 + }, + { + "epoch": 14.49, + "grad_norm": 0.7421875, + "learning_rate": 0.00026340365354255455, + "loss": 0.184, + "step": 349830 + }, + { + "epoch": 14.49, + "grad_norm": 0.6640625, + "learning_rate": 0.0002633928239337491, + "loss": 0.2289, + "step": 349840 + }, + { + "epoch": 14.49, + "grad_norm": 0.5859375, + "learning_rate": 0.00026338199429973996, + "loss": 0.203, + "step": 349850 + }, + { + "epoch": 14.49, + "grad_norm": 0.5625, + "learning_rate": 0.00026337116464054727, + "loss": 0.1964, + "step": 349860 + }, + { + "epoch": 14.49, + "grad_norm": 0.71875, + "learning_rate": 0.0002633603349561915, + "loss": 0.1441, + "step": 349870 + }, + { + "epoch": 14.49, + "grad_norm": 1.5078125, + "learning_rate": 0.00026334950524669297, + "loss": 0.1535, + "step": 349880 + }, + { + "epoch": 14.49, + "grad_norm": 1.109375, + "learning_rate": 0.00026333867551207223, + "loss": 0.1784, + "step": 349890 + }, + { + "epoch": 14.49, + "grad_norm": 1.859375, + "learning_rate": 0.0002633278457523495, + "loss": 0.1872, + "step": 349900 + }, + { + "epoch": 14.49, + "grad_norm": 1.5625, + "learning_rate": 0.00026331701596754516, + "loss": 0.1892, + "step": 349910 + }, + { + "epoch": 14.49, + "grad_norm": 1.453125, + "learning_rate": 0.0002633061861576796, + "loss": 0.1878, + "step": 349920 + }, + { + "epoch": 14.49, + "grad_norm": 1.6953125, + "learning_rate": 0.00026329535632277335, + "loss": 0.1628, + "step": 349930 + }, + { + "epoch": 14.49, + "grad_norm": 1.4609375, + "learning_rate": 0.0002632845264628466, + "loss": 0.194, + "step": 349940 + }, + { + "epoch": 14.49, + "grad_norm": 1.453125, + "learning_rate": 0.00026327369657791976, + "loss": 0.2388, + "step": 349950 + }, + { + "epoch": 14.5, + "grad_norm": 2.203125, + "learning_rate": 0.00026326286666801334, + "loss": 0.2076, + "step": 349960 + }, + { + "epoch": 14.5, + "grad_norm": 0.462890625, + "learning_rate": 0.00026325203673314756, + "loss": 0.1639, + "step": 349970 + }, + { + "epoch": 14.5, + "grad_norm": 0.48046875, + "learning_rate": 0.000263241206773343, + "loss": 0.1599, + "step": 349980 + }, + { + "epoch": 14.5, + "grad_norm": 1.046875, + "learning_rate": 0.0002632303767886198, + "loss": 0.2077, + "step": 349990 + }, + { + "epoch": 14.5, + "grad_norm": 2.28125, + "learning_rate": 0.0002632195467789985, + "loss": 0.1669, + "step": 350000 + }, + { + "epoch": 14.5, + "grad_norm": 0.6171875, + "learning_rate": 0.00026320871674449954, + "loss": 0.2168, + "step": 350010 + }, + { + "epoch": 14.5, + "grad_norm": 1.1015625, + "learning_rate": 0.0002631978866851431, + "loss": 0.1588, + "step": 350020 + }, + { + "epoch": 14.5, + "grad_norm": 0.5, + "learning_rate": 0.00026318705660094963, + "loss": 0.1595, + "step": 350030 + }, + { + "epoch": 14.5, + "grad_norm": 1.3984375, + "learning_rate": 0.00026317622649193957, + "loss": 0.1965, + "step": 350040 + }, + { + "epoch": 14.5, + "grad_norm": 0.96484375, + "learning_rate": 0.0002631653963581333, + "loss": 0.1739, + "step": 350050 + }, + { + "epoch": 14.5, + "grad_norm": 1.3125, + "learning_rate": 0.0002631545661995512, + "loss": 0.2051, + "step": 350060 + }, + { + "epoch": 14.5, + "grad_norm": 0.466796875, + "learning_rate": 0.00026314373601621355, + "loss": 0.1829, + "step": 350070 + }, + { + "epoch": 14.5, + "grad_norm": 0.87109375, + "learning_rate": 0.00026313290580814086, + "loss": 0.1813, + "step": 350080 + }, + { + "epoch": 14.5, + "grad_norm": 0.703125, + "learning_rate": 0.0002631220755753535, + "loss": 0.1604, + "step": 350090 + }, + { + "epoch": 14.5, + "grad_norm": 0.5546875, + "learning_rate": 0.0002631112453178717, + "loss": 0.1916, + "step": 350100 + }, + { + "epoch": 14.5, + "grad_norm": 0.451171875, + "learning_rate": 0.000263100415035716, + "loss": 0.1655, + "step": 350110 + }, + { + "epoch": 14.5, + "grad_norm": 0.314453125, + "learning_rate": 0.0002630895847289068, + "loss": 0.2057, + "step": 350120 + }, + { + "epoch": 14.5, + "grad_norm": 1.2265625, + "learning_rate": 0.0002630787543974643, + "loss": 0.2154, + "step": 350130 + }, + { + "epoch": 14.5, + "grad_norm": 1.1015625, + "learning_rate": 0.0002630679240414091, + "loss": 0.2055, + "step": 350140 + }, + { + "epoch": 14.5, + "grad_norm": 1.2890625, + "learning_rate": 0.00026305709366076146, + "loss": 0.1906, + "step": 350150 + }, + { + "epoch": 14.5, + "grad_norm": 0.5, + "learning_rate": 0.0002630462632555418, + "loss": 0.2481, + "step": 350160 + }, + { + "epoch": 14.5, + "grad_norm": 1.140625, + "learning_rate": 0.0002630354328257704, + "loss": 0.2192, + "step": 350170 + }, + { + "epoch": 14.5, + "grad_norm": 0.8203125, + "learning_rate": 0.0002630246023714677, + "loss": 0.155, + "step": 350180 + }, + { + "epoch": 14.5, + "grad_norm": 0.76171875, + "learning_rate": 0.0002630137718926542, + "loss": 0.1529, + "step": 350190 + }, + { + "epoch": 14.51, + "grad_norm": 1.2109375, + "learning_rate": 0.0002630029413893502, + "loss": 0.2369, + "step": 350200 + }, + { + "epoch": 14.51, + "grad_norm": 1.4140625, + "learning_rate": 0.000262992110861576, + "loss": 0.164, + "step": 350210 + }, + { + "epoch": 14.51, + "grad_norm": 0.9140625, + "learning_rate": 0.0002629812803093521, + "loss": 0.1876, + "step": 350220 + }, + { + "epoch": 14.51, + "grad_norm": 0.84765625, + "learning_rate": 0.0002629704497326988, + "loss": 0.2141, + "step": 350230 + }, + { + "epoch": 14.51, + "grad_norm": 0.67578125, + "learning_rate": 0.0002629596191316365, + "loss": 0.1563, + "step": 350240 + }, + { + "epoch": 14.51, + "grad_norm": 1.2890625, + "learning_rate": 0.00026294878850618563, + "loss": 0.1722, + "step": 350250 + }, + { + "epoch": 14.51, + "grad_norm": 1.1328125, + "learning_rate": 0.00026293795785636647, + "loss": 0.1869, + "step": 350260 + }, + { + "epoch": 14.51, + "grad_norm": 2.15625, + "learning_rate": 0.0002629271271821996, + "loss": 0.19, + "step": 350270 + }, + { + "epoch": 14.51, + "grad_norm": 0.7734375, + "learning_rate": 0.00026291629648370513, + "loss": 0.157, + "step": 350280 + }, + { + "epoch": 14.51, + "grad_norm": 0.8984375, + "learning_rate": 0.0002629054657609036, + "loss": 0.2077, + "step": 350290 + }, + { + "epoch": 14.51, + "grad_norm": 0.609375, + "learning_rate": 0.0002628946350138155, + "loss": 0.1887, + "step": 350300 + }, + { + "epoch": 14.51, + "grad_norm": 0.037353515625, + "learning_rate": 0.00026288380424246104, + "loss": 0.1664, + "step": 350310 + }, + { + "epoch": 14.51, + "grad_norm": 1.203125, + "learning_rate": 0.0002628729734468606, + "loss": 0.1892, + "step": 350320 + }, + { + "epoch": 14.51, + "grad_norm": 1.203125, + "learning_rate": 0.0002628621426270347, + "loss": 0.2053, + "step": 350330 + }, + { + "epoch": 14.51, + "grad_norm": 1.1796875, + "learning_rate": 0.0002628513117830035, + "loss": 0.1564, + "step": 350340 + }, + { + "epoch": 14.51, + "grad_norm": 2.515625, + "learning_rate": 0.0002628404809147876, + "loss": 0.158, + "step": 350350 + }, + { + "epoch": 14.51, + "grad_norm": 0.70703125, + "learning_rate": 0.00026282965002240733, + "loss": 0.175, + "step": 350360 + }, + { + "epoch": 14.51, + "grad_norm": 0.74609375, + "learning_rate": 0.000262818819105883, + "loss": 0.2548, + "step": 350370 + }, + { + "epoch": 14.51, + "grad_norm": 0.482421875, + "learning_rate": 0.0002628079881652351, + "loss": 0.153, + "step": 350380 + }, + { + "epoch": 14.51, + "grad_norm": 1.0625, + "learning_rate": 0.0002627971572004838, + "loss": 0.1996, + "step": 350390 + }, + { + "epoch": 14.51, + "grad_norm": 0.75390625, + "learning_rate": 0.0002627863262116498, + "loss": 0.2414, + "step": 350400 + }, + { + "epoch": 14.51, + "grad_norm": 0.365234375, + "learning_rate": 0.00026277549519875334, + "loss": 0.1745, + "step": 350410 + }, + { + "epoch": 14.51, + "grad_norm": 0.490234375, + "learning_rate": 0.00026276466416181454, + "loss": 0.1645, + "step": 350420 + }, + { + "epoch": 14.51, + "grad_norm": 2.59375, + "learning_rate": 0.00026275383310085426, + "loss": 0.211, + "step": 350430 + }, + { + "epoch": 14.52, + "grad_norm": 0.71484375, + "learning_rate": 0.0002627430020158925, + "loss": 0.1391, + "step": 350440 + }, + { + "epoch": 14.52, + "grad_norm": 0.734375, + "learning_rate": 0.00026273217090694984, + "loss": 0.1981, + "step": 350450 + }, + { + "epoch": 14.52, + "grad_norm": 0.486328125, + "learning_rate": 0.0002627213397740467, + "loss": 0.197, + "step": 350460 + }, + { + "epoch": 14.52, + "grad_norm": 0.60546875, + "learning_rate": 0.00026271050861720313, + "loss": 0.1819, + "step": 350470 + }, + { + "epoch": 14.52, + "grad_norm": 0.8125, + "learning_rate": 0.00026269967743644, + "loss": 0.2049, + "step": 350480 + }, + { + "epoch": 14.52, + "grad_norm": 0.6484375, + "learning_rate": 0.0002626888462317774, + "loss": 0.1777, + "step": 350490 + }, + { + "epoch": 14.52, + "grad_norm": 0.7109375, + "learning_rate": 0.0002626780150032356, + "loss": 0.2484, + "step": 350500 + }, + { + "epoch": 14.52, + "grad_norm": 0.65625, + "learning_rate": 0.0002626671837508353, + "loss": 0.1748, + "step": 350510 + }, + { + "epoch": 14.52, + "grad_norm": 1.015625, + "learning_rate": 0.00026265635247459666, + "loss": 0.1933, + "step": 350520 + }, + { + "epoch": 14.52, + "grad_norm": 1.2578125, + "learning_rate": 0.00026264552117454015, + "loss": 0.2028, + "step": 350530 + }, + { + "epoch": 14.52, + "grad_norm": 0.59765625, + "learning_rate": 0.0002626346898506862, + "loss": 0.2033, + "step": 350540 + }, + { + "epoch": 14.52, + "grad_norm": 1.09375, + "learning_rate": 0.000262623858503055, + "loss": 0.2159, + "step": 350550 + }, + { + "epoch": 14.52, + "grad_norm": 0.86328125, + "learning_rate": 0.00026261302713166717, + "loss": 0.1807, + "step": 350560 + }, + { + "epoch": 14.52, + "grad_norm": 1.1015625, + "learning_rate": 0.000262602195736543, + "loss": 0.212, + "step": 350570 + }, + { + "epoch": 14.52, + "grad_norm": 0.93359375, + "learning_rate": 0.00026259136431770275, + "loss": 0.1508, + "step": 350580 + }, + { + "epoch": 14.52, + "grad_norm": 1.109375, + "learning_rate": 0.000262580532875167, + "loss": 0.1874, + "step": 350590 + }, + { + "epoch": 14.52, + "grad_norm": 0.76171875, + "learning_rate": 0.00026256970140895597, + "loss": 0.2254, + "step": 350600 + }, + { + "epoch": 14.52, + "grad_norm": 1.8359375, + "learning_rate": 0.0002625588699190902, + "loss": 0.1765, + "step": 350610 + }, + { + "epoch": 14.52, + "grad_norm": 1.265625, + "learning_rate": 0.00026254803840559, + "loss": 0.2463, + "step": 350620 + }, + { + "epoch": 14.52, + "grad_norm": 0.578125, + "learning_rate": 0.0002625372068684756, + "loss": 0.1895, + "step": 350630 + }, + { + "epoch": 14.52, + "grad_norm": 0.486328125, + "learning_rate": 0.0002625263753077677, + "loss": 0.2269, + "step": 350640 + }, + { + "epoch": 14.52, + "grad_norm": 1.7421875, + "learning_rate": 0.0002625155437234865, + "loss": 0.1606, + "step": 350650 + }, + { + "epoch": 14.52, + "grad_norm": 0.5, + "learning_rate": 0.00026250471211565225, + "loss": 0.184, + "step": 350660 + }, + { + "epoch": 14.52, + "grad_norm": 0.9140625, + "learning_rate": 0.0002624938804842856, + "loss": 0.193, + "step": 350670 + }, + { + "epoch": 14.53, + "grad_norm": 1.2109375, + "learning_rate": 0.0002624830488294068, + "loss": 0.1969, + "step": 350680 + }, + { + "epoch": 14.53, + "grad_norm": 1.2890625, + "learning_rate": 0.0002624722171510363, + "loss": 0.2328, + "step": 350690 + }, + { + "epoch": 14.53, + "grad_norm": 2.109375, + "learning_rate": 0.00026246138544919434, + "loss": 0.1912, + "step": 350700 + }, + { + "epoch": 14.53, + "grad_norm": 0.7890625, + "learning_rate": 0.0002624505537239014, + "loss": 0.155, + "step": 350710 + }, + { + "epoch": 14.53, + "grad_norm": 0.373046875, + "learning_rate": 0.00026243972197517794, + "loss": 0.1871, + "step": 350720 + }, + { + "epoch": 14.53, + "grad_norm": 1.3046875, + "learning_rate": 0.00026242889020304424, + "loss": 0.2507, + "step": 350730 + }, + { + "epoch": 14.53, + "grad_norm": 1.2578125, + "learning_rate": 0.0002624180584075206, + "loss": 0.2711, + "step": 350740 + }, + { + "epoch": 14.53, + "grad_norm": 0.97265625, + "learning_rate": 0.0002624072265886277, + "loss": 0.2151, + "step": 350750 + }, + { + "epoch": 14.53, + "grad_norm": 0.306640625, + "learning_rate": 0.00026239639474638567, + "loss": 0.1546, + "step": 350760 + }, + { + "epoch": 14.53, + "grad_norm": 0.86328125, + "learning_rate": 0.0002623855628808149, + "loss": 0.1961, + "step": 350770 + }, + { + "epoch": 14.53, + "grad_norm": 0.58203125, + "learning_rate": 0.00026237473099193593, + "loss": 0.2112, + "step": 350780 + }, + { + "epoch": 14.53, + "grad_norm": 1.234375, + "learning_rate": 0.0002623638990797689, + "loss": 0.2003, + "step": 350790 + }, + { + "epoch": 14.53, + "grad_norm": 0.490234375, + "learning_rate": 0.0002623530671443345, + "loss": 0.1541, + "step": 350800 + }, + { + "epoch": 14.53, + "grad_norm": 0.24609375, + "learning_rate": 0.0002623422351856529, + "loss": 0.2, + "step": 350810 + }, + { + "epoch": 14.53, + "grad_norm": 0.42578125, + "learning_rate": 0.0002623314032037446, + "loss": 0.1638, + "step": 350820 + }, + { + "epoch": 14.53, + "grad_norm": 0.408203125, + "learning_rate": 0.0002623205711986299, + "loss": 0.1729, + "step": 350830 + }, + { + "epoch": 14.53, + "grad_norm": 0.953125, + "learning_rate": 0.0002623097391703292, + "loss": 0.2204, + "step": 350840 + }, + { + "epoch": 14.53, + "grad_norm": 0.359375, + "learning_rate": 0.0002622989071188629, + "loss": 0.2172, + "step": 350850 + }, + { + "epoch": 14.53, + "grad_norm": 1.9453125, + "learning_rate": 0.0002622880750442515, + "loss": 0.195, + "step": 350860 + }, + { + "epoch": 14.53, + "grad_norm": 0.78515625, + "learning_rate": 0.00026227724294651506, + "loss": 0.2058, + "step": 350870 + }, + { + "epoch": 14.53, + "grad_norm": 0.890625, + "learning_rate": 0.0002622664108256743, + "loss": 0.1372, + "step": 350880 + }, + { + "epoch": 14.53, + "grad_norm": 2.515625, + "learning_rate": 0.0002622555786817494, + "loss": 0.1824, + "step": 350890 + }, + { + "epoch": 14.53, + "grad_norm": 1.109375, + "learning_rate": 0.0002622447465147609, + "loss": 0.2287, + "step": 350900 + }, + { + "epoch": 14.53, + "grad_norm": 0.83203125, + "learning_rate": 0.00026223391432472916, + "loss": 0.2216, + "step": 350910 + }, + { + "epoch": 14.54, + "grad_norm": 3.984375, + "learning_rate": 0.0002622230821116744, + "loss": 0.2486, + "step": 350920 + }, + { + "epoch": 14.54, + "grad_norm": 0.6640625, + "learning_rate": 0.00026221224987561716, + "loss": 0.2563, + "step": 350930 + }, + { + "epoch": 14.54, + "grad_norm": 0.828125, + "learning_rate": 0.0002622014176165778, + "loss": 0.2082, + "step": 350940 + }, + { + "epoch": 14.54, + "grad_norm": 0.828125, + "learning_rate": 0.00026219058533457663, + "loss": 0.216, + "step": 350950 + }, + { + "epoch": 14.54, + "grad_norm": 0.75390625, + "learning_rate": 0.0002621797530296342, + "loss": 0.2022, + "step": 350960 + }, + { + "epoch": 14.54, + "grad_norm": 1.296875, + "learning_rate": 0.00026216892070177075, + "loss": 0.1767, + "step": 350970 + }, + { + "epoch": 14.54, + "grad_norm": 0.953125, + "learning_rate": 0.00026215808835100665, + "loss": 0.2117, + "step": 350980 + }, + { + "epoch": 14.54, + "grad_norm": 1.0859375, + "learning_rate": 0.00026214725597736236, + "loss": 0.1792, + "step": 350990 + }, + { + "epoch": 14.54, + "grad_norm": 0.9921875, + "learning_rate": 0.0002621364235808583, + "loss": 0.2286, + "step": 351000 + }, + { + "epoch": 14.54, + "grad_norm": 0.310546875, + "learning_rate": 0.00026212559116151475, + "loss": 0.176, + "step": 351010 + }, + { + "epoch": 14.54, + "grad_norm": 0.8515625, + "learning_rate": 0.00026211475871935213, + "loss": 0.1902, + "step": 351020 + }, + { + "epoch": 14.54, + "grad_norm": 0.57421875, + "learning_rate": 0.0002621039262543909, + "loss": 0.137, + "step": 351030 + }, + { + "epoch": 14.54, + "grad_norm": 0.484375, + "learning_rate": 0.0002620930937666514, + "loss": 0.2008, + "step": 351040 + }, + { + "epoch": 14.54, + "grad_norm": 1.2578125, + "learning_rate": 0.000262082261256154, + "loss": 0.1792, + "step": 351050 + }, + { + "epoch": 14.54, + "grad_norm": 0.86328125, + "learning_rate": 0.000262071428722919, + "loss": 0.1911, + "step": 351060 + }, + { + "epoch": 14.54, + "grad_norm": 0.79296875, + "learning_rate": 0.00026206059616696697, + "loss": 0.1726, + "step": 351070 + }, + { + "epoch": 14.54, + "grad_norm": 1.0234375, + "learning_rate": 0.00026204976358831816, + "loss": 0.2498, + "step": 351080 + }, + { + "epoch": 14.54, + "grad_norm": 0.90234375, + "learning_rate": 0.000262038930986993, + "loss": 0.1769, + "step": 351090 + }, + { + "epoch": 14.54, + "grad_norm": 0.6875, + "learning_rate": 0.00026202809836301183, + "loss": 0.2011, + "step": 351100 + }, + { + "epoch": 14.54, + "grad_norm": 0.99609375, + "learning_rate": 0.0002620172657163951, + "loss": 0.2204, + "step": 351110 + }, + { + "epoch": 14.54, + "grad_norm": 0.85546875, + "learning_rate": 0.00026200643304716324, + "loss": 0.2496, + "step": 351120 + }, + { + "epoch": 14.54, + "grad_norm": 0.74609375, + "learning_rate": 0.00026199560035533646, + "loss": 0.1892, + "step": 351130 + }, + { + "epoch": 14.54, + "grad_norm": 1.109375, + "learning_rate": 0.0002619847676409354, + "loss": 0.1813, + "step": 351140 + }, + { + "epoch": 14.54, + "grad_norm": 0.95703125, + "learning_rate": 0.0002619739349039801, + "loss": 0.1959, + "step": 351150 + }, + { + "epoch": 14.55, + "grad_norm": 0.9921875, + "learning_rate": 0.00026196310214449136, + "loss": 0.1923, + "step": 351160 + }, + { + "epoch": 14.55, + "grad_norm": 0.8203125, + "learning_rate": 0.0002619522693624892, + "loss": 0.2012, + "step": 351170 + }, + { + "epoch": 14.55, + "grad_norm": 1.125, + "learning_rate": 0.00026194143655799425, + "loss": 0.1714, + "step": 351180 + }, + { + "epoch": 14.55, + "grad_norm": 0.80078125, + "learning_rate": 0.00026193060373102677, + "loss": 0.1535, + "step": 351190 + }, + { + "epoch": 14.55, + "grad_norm": 1.75, + "learning_rate": 0.00026191977088160724, + "loss": 0.1925, + "step": 351200 + }, + { + "epoch": 14.55, + "grad_norm": 0.40625, + "learning_rate": 0.0002619089380097559, + "loss": 0.1874, + "step": 351210 + }, + { + "epoch": 14.55, + "grad_norm": 0.85546875, + "learning_rate": 0.0002618981051154933, + "loss": 0.2203, + "step": 351220 + }, + { + "epoch": 14.55, + "grad_norm": 1.203125, + "learning_rate": 0.0002618872721988397, + "loss": 0.18, + "step": 351230 + }, + { + "epoch": 14.55, + "grad_norm": 0.423828125, + "learning_rate": 0.0002618764392598155, + "loss": 0.2014, + "step": 351240 + }, + { + "epoch": 14.55, + "grad_norm": 1.3984375, + "learning_rate": 0.0002618656062984412, + "loss": 0.1959, + "step": 351250 + }, + { + "epoch": 14.55, + "grad_norm": 0.99609375, + "learning_rate": 0.00026185477331473707, + "loss": 0.2079, + "step": 351260 + }, + { + "epoch": 14.55, + "grad_norm": 0.9609375, + "learning_rate": 0.00026184394030872356, + "loss": 0.1529, + "step": 351270 + }, + { + "epoch": 14.55, + "grad_norm": 0.796875, + "learning_rate": 0.000261833107280421, + "loss": 0.1458, + "step": 351280 + }, + { + "epoch": 14.55, + "grad_norm": 0.890625, + "learning_rate": 0.00026182227422984986, + "loss": 0.198, + "step": 351290 + }, + { + "epoch": 14.55, + "grad_norm": 0.455078125, + "learning_rate": 0.0002618114411570304, + "loss": 0.1835, + "step": 351300 + }, + { + "epoch": 14.55, + "grad_norm": 0.98828125, + "learning_rate": 0.0002618006080619831, + "loss": 0.1794, + "step": 351310 + }, + { + "epoch": 14.55, + "grad_norm": 0.44921875, + "learning_rate": 0.00026178977494472846, + "loss": 0.1554, + "step": 351320 + }, + { + "epoch": 14.55, + "grad_norm": 0.65234375, + "learning_rate": 0.0002617789418052866, + "loss": 0.1923, + "step": 351330 + }, + { + "epoch": 14.55, + "grad_norm": 0.54296875, + "learning_rate": 0.0002617681086436781, + "loss": 0.2007, + "step": 351340 + }, + { + "epoch": 14.55, + "grad_norm": 0.46875, + "learning_rate": 0.00026175727545992326, + "loss": 0.1678, + "step": 351350 + }, + { + "epoch": 14.55, + "grad_norm": 0.82421875, + "learning_rate": 0.0002617464422540425, + "loss": 0.1934, + "step": 351360 + }, + { + "epoch": 14.55, + "grad_norm": 0.43359375, + "learning_rate": 0.00026173560902605623, + "loss": 0.1332, + "step": 351370 + }, + { + "epoch": 14.55, + "grad_norm": 1.0234375, + "learning_rate": 0.0002617247757759848, + "loss": 0.2237, + "step": 351380 + }, + { + "epoch": 14.55, + "grad_norm": 1.0625, + "learning_rate": 0.00026171394250384863, + "loss": 0.1667, + "step": 351390 + }, + { + "epoch": 14.55, + "grad_norm": 0.236328125, + "learning_rate": 0.00026170310920966807, + "loss": 0.2266, + "step": 351400 + }, + { + "epoch": 14.56, + "grad_norm": 0.5859375, + "learning_rate": 0.0002616922758934635, + "loss": 0.1551, + "step": 351410 + }, + { + "epoch": 14.56, + "grad_norm": 0.5390625, + "learning_rate": 0.00026168144255525534, + "loss": 0.1505, + "step": 351420 + }, + { + "epoch": 14.56, + "grad_norm": 0.53125, + "learning_rate": 0.00026167060919506395, + "loss": 0.2006, + "step": 351430 + }, + { + "epoch": 14.56, + "grad_norm": 1.7265625, + "learning_rate": 0.0002616597758129097, + "loss": 0.2369, + "step": 351440 + }, + { + "epoch": 14.56, + "grad_norm": 1.0, + "learning_rate": 0.00026164894240881316, + "loss": 0.1865, + "step": 351450 + }, + { + "epoch": 14.56, + "grad_norm": 0.9921875, + "learning_rate": 0.00026163810898279443, + "loss": 0.2093, + "step": 351460 + }, + { + "epoch": 14.56, + "grad_norm": 0.98828125, + "learning_rate": 0.0002616272755348741, + "loss": 0.1998, + "step": 351470 + }, + { + "epoch": 14.56, + "grad_norm": 0.4375, + "learning_rate": 0.00026161644206507253, + "loss": 0.2204, + "step": 351480 + }, + { + "epoch": 14.56, + "grad_norm": 0.5234375, + "learning_rate": 0.0002616056085734099, + "loss": 0.1699, + "step": 351490 + }, + { + "epoch": 14.56, + "grad_norm": 0.83203125, + "learning_rate": 0.0002615947750599069, + "loss": 0.2532, + "step": 351500 + }, + { + "epoch": 14.56, + "grad_norm": 0.9921875, + "learning_rate": 0.00026158394152458374, + "loss": 0.1721, + "step": 351510 + }, + { + "epoch": 14.56, + "grad_norm": 0.84765625, + "learning_rate": 0.0002615731079674609, + "loss": 0.1678, + "step": 351520 + }, + { + "epoch": 14.56, + "grad_norm": 0.78125, + "learning_rate": 0.0002615622743885587, + "loss": 0.1308, + "step": 351530 + }, + { + "epoch": 14.56, + "grad_norm": 0.640625, + "learning_rate": 0.00026155144078789756, + "loss": 0.211, + "step": 351540 + }, + { + "epoch": 14.56, + "grad_norm": 2.234375, + "learning_rate": 0.00026154060716549783, + "loss": 0.1715, + "step": 351550 + }, + { + "epoch": 14.56, + "grad_norm": 0.345703125, + "learning_rate": 0.0002615297735213799, + "loss": 0.1992, + "step": 351560 + }, + { + "epoch": 14.56, + "grad_norm": 1.375, + "learning_rate": 0.0002615189398555642, + "loss": 0.1709, + "step": 351570 + }, + { + "epoch": 14.56, + "grad_norm": 0.921875, + "learning_rate": 0.00026150810616807116, + "loss": 0.2072, + "step": 351580 + }, + { + "epoch": 14.56, + "grad_norm": 1.1328125, + "learning_rate": 0.00026149727245892106, + "loss": 0.2457, + "step": 351590 + }, + { + "epoch": 14.56, + "grad_norm": 0.859375, + "learning_rate": 0.00026148643872813434, + "loss": 0.2048, + "step": 351600 + }, + { + "epoch": 14.56, + "grad_norm": 1.5859375, + "learning_rate": 0.0002614756049757314, + "loss": 0.1833, + "step": 351610 + }, + { + "epoch": 14.56, + "grad_norm": 0.8828125, + "learning_rate": 0.0002614647712017326, + "loss": 0.2337, + "step": 351620 + }, + { + "epoch": 14.56, + "grad_norm": 1.09375, + "learning_rate": 0.0002614539374061583, + "loss": 0.1514, + "step": 351630 + }, + { + "epoch": 14.56, + "grad_norm": 0.86328125, + "learning_rate": 0.00026144310358902894, + "loss": 0.1498, + "step": 351640 + }, + { + "epoch": 14.57, + "grad_norm": 1.0078125, + "learning_rate": 0.0002614322697503649, + "loss": 0.1978, + "step": 351650 + }, + { + "epoch": 14.57, + "grad_norm": 0.5078125, + "learning_rate": 0.00026142143589018665, + "loss": 0.1938, + "step": 351660 + }, + { + "epoch": 14.57, + "grad_norm": 0.72265625, + "learning_rate": 0.0002614106020085144, + "loss": 0.1877, + "step": 351670 + }, + { + "epoch": 14.57, + "grad_norm": 0.96875, + "learning_rate": 0.00026139976810536864, + "loss": 0.1764, + "step": 351680 + }, + { + "epoch": 14.57, + "grad_norm": 2.28125, + "learning_rate": 0.0002613889341807698, + "loss": 0.187, + "step": 351690 + }, + { + "epoch": 14.57, + "grad_norm": 1.1015625, + "learning_rate": 0.0002613781002347381, + "loss": 0.1812, + "step": 351700 + }, + { + "epoch": 14.57, + "grad_norm": 0.85546875, + "learning_rate": 0.0002613672662672941, + "loss": 0.127, + "step": 351710 + }, + { + "epoch": 14.57, + "grad_norm": 2.078125, + "learning_rate": 0.00026135643227845817, + "loss": 0.2055, + "step": 351720 + }, + { + "epoch": 14.57, + "grad_norm": 1.046875, + "learning_rate": 0.0002613455982682506, + "loss": 0.2126, + "step": 351730 + }, + { + "epoch": 14.57, + "grad_norm": 1.2578125, + "learning_rate": 0.00026133476423669194, + "loss": 0.2134, + "step": 351740 + }, + { + "epoch": 14.57, + "grad_norm": 1.09375, + "learning_rate": 0.0002613239301838024, + "loss": 0.1652, + "step": 351750 + }, + { + "epoch": 14.57, + "grad_norm": 1.0234375, + "learning_rate": 0.0002613130961096024, + "loss": 0.1911, + "step": 351760 + }, + { + "epoch": 14.57, + "grad_norm": 1.046875, + "learning_rate": 0.0002613022620141125, + "loss": 0.2103, + "step": 351770 + }, + { + "epoch": 14.57, + "grad_norm": 0.65234375, + "learning_rate": 0.00026129142789735283, + "loss": 0.1492, + "step": 351780 + }, + { + "epoch": 14.57, + "grad_norm": 0.9296875, + "learning_rate": 0.00026128059375934407, + "loss": 0.2178, + "step": 351790 + }, + { + "epoch": 14.57, + "grad_norm": 0.7734375, + "learning_rate": 0.0002612697596001063, + "loss": 0.1907, + "step": 351800 + }, + { + "epoch": 14.57, + "grad_norm": 1.3203125, + "learning_rate": 0.0002612589254196601, + "loss": 0.1804, + "step": 351810 + }, + { + "epoch": 14.57, + "grad_norm": 0.37109375, + "learning_rate": 0.0002612480912180259, + "loss": 0.2279, + "step": 351820 + }, + { + "epoch": 14.57, + "grad_norm": 0.25, + "learning_rate": 0.0002612372569952239, + "loss": 0.1383, + "step": 351830 + }, + { + "epoch": 14.57, + "grad_norm": 1.0390625, + "learning_rate": 0.0002612264227512746, + "loss": 0.2047, + "step": 351840 + }, + { + "epoch": 14.57, + "grad_norm": 0.90234375, + "learning_rate": 0.00026121558848619855, + "loss": 0.2064, + "step": 351850 + }, + { + "epoch": 14.57, + "grad_norm": 0.6796875, + "learning_rate": 0.0002612047542000157, + "loss": 0.1932, + "step": 351860 + }, + { + "epoch": 14.57, + "grad_norm": 1.046875, + "learning_rate": 0.00026119391989274694, + "loss": 0.1978, + "step": 351870 + }, + { + "epoch": 14.57, + "grad_norm": 1.9375, + "learning_rate": 0.00026118308556441234, + "loss": 0.244, + "step": 351880 + }, + { + "epoch": 14.58, + "grad_norm": 0.73828125, + "learning_rate": 0.0002611722512150324, + "loss": 0.1764, + "step": 351890 + }, + { + "epoch": 14.58, + "grad_norm": 0.91796875, + "learning_rate": 0.00026116141684462754, + "loss": 0.1671, + "step": 351900 + }, + { + "epoch": 14.58, + "grad_norm": 0.32421875, + "learning_rate": 0.000261150582453218, + "loss": 0.1724, + "step": 351910 + }, + { + "epoch": 14.58, + "grad_norm": 0.640625, + "learning_rate": 0.0002611397480408243, + "loss": 0.2102, + "step": 351920 + }, + { + "epoch": 14.58, + "grad_norm": 0.92578125, + "learning_rate": 0.0002611289136074668, + "loss": 0.2135, + "step": 351930 + }, + { + "epoch": 14.58, + "grad_norm": 1.8828125, + "learning_rate": 0.00026111807915316586, + "loss": 0.229, + "step": 351940 + }, + { + "epoch": 14.58, + "grad_norm": 0.5703125, + "learning_rate": 0.000261107244677942, + "loss": 0.1733, + "step": 351950 + }, + { + "epoch": 14.58, + "grad_norm": 0.494140625, + "learning_rate": 0.00026109641018181543, + "loss": 0.2092, + "step": 351960 + }, + { + "epoch": 14.58, + "grad_norm": 0.7265625, + "learning_rate": 0.00026108557566480655, + "loss": 0.2602, + "step": 351970 + }, + { + "epoch": 14.58, + "grad_norm": 1.6171875, + "learning_rate": 0.00026107474112693595, + "loss": 0.2491, + "step": 351980 + }, + { + "epoch": 14.58, + "grad_norm": 1.078125, + "learning_rate": 0.00026106390656822383, + "loss": 0.194, + "step": 351990 + }, + { + "epoch": 14.58, + "grad_norm": 0.94921875, + "learning_rate": 0.00026105307198869053, + "loss": 0.1728, + "step": 352000 + }, + { + "epoch": 14.58, + "grad_norm": 0.76953125, + "learning_rate": 0.0002610422373883567, + "loss": 0.2023, + "step": 352010 + }, + { + "epoch": 14.58, + "grad_norm": 0.78515625, + "learning_rate": 0.00026103140276724247, + "loss": 0.1152, + "step": 352020 + }, + { + "epoch": 14.58, + "grad_norm": 1.234375, + "learning_rate": 0.0002610205681253684, + "loss": 0.1684, + "step": 352030 + }, + { + "epoch": 14.58, + "grad_norm": 1.21875, + "learning_rate": 0.00026100973346275476, + "loss": 0.1975, + "step": 352040 + }, + { + "epoch": 14.58, + "grad_norm": 0.51953125, + "learning_rate": 0.00026099889877942195, + "loss": 0.1826, + "step": 352050 + }, + { + "epoch": 14.58, + "grad_norm": 0.9375, + "learning_rate": 0.00026098806407539053, + "loss": 0.1597, + "step": 352060 + }, + { + "epoch": 14.58, + "grad_norm": 0.259765625, + "learning_rate": 0.0002609772293506806, + "loss": 0.2169, + "step": 352070 + }, + { + "epoch": 14.58, + "grad_norm": 0.78515625, + "learning_rate": 0.00026096639460531284, + "loss": 0.1583, + "step": 352080 + }, + { + "epoch": 14.58, + "grad_norm": 0.8125, + "learning_rate": 0.0002609555598393075, + "loss": 0.1719, + "step": 352090 + }, + { + "epoch": 14.58, + "grad_norm": 0.490234375, + "learning_rate": 0.0002609447250526849, + "loss": 0.1744, + "step": 352100 + }, + { + "epoch": 14.58, + "grad_norm": 0.7890625, + "learning_rate": 0.0002609338902454656, + "loss": 0.1577, + "step": 352110 + }, + { + "epoch": 14.58, + "grad_norm": 0.26953125, + "learning_rate": 0.0002609230554176698, + "loss": 0.1998, + "step": 352120 + }, + { + "epoch": 14.59, + "grad_norm": 0.921875, + "learning_rate": 0.00026091222056931804, + "loss": 0.1677, + "step": 352130 + }, + { + "epoch": 14.59, + "grad_norm": 0.71875, + "learning_rate": 0.0002609013857004307, + "loss": 0.1528, + "step": 352140 + }, + { + "epoch": 14.59, + "grad_norm": 1.4296875, + "learning_rate": 0.00026089055081102803, + "loss": 0.1665, + "step": 352150 + }, + { + "epoch": 14.59, + "grad_norm": 0.77734375, + "learning_rate": 0.00026087971590113063, + "loss": 0.1973, + "step": 352160 + }, + { + "epoch": 14.59, + "grad_norm": 1.0546875, + "learning_rate": 0.0002608688809707588, + "loss": 0.2036, + "step": 352170 + }, + { + "epoch": 14.59, + "grad_norm": 0.77734375, + "learning_rate": 0.00026085804601993275, + "loss": 0.1684, + "step": 352180 + }, + { + "epoch": 14.59, + "grad_norm": 0.458984375, + "learning_rate": 0.00026084721104867314, + "loss": 0.1965, + "step": 352190 + }, + { + "epoch": 14.59, + "grad_norm": 0.625, + "learning_rate": 0.00026083637605700023, + "loss": 0.1592, + "step": 352200 + }, + { + "epoch": 14.59, + "grad_norm": 0.65625, + "learning_rate": 0.0002608255410449344, + "loss": 0.1678, + "step": 352210 + }, + { + "epoch": 14.59, + "grad_norm": 0.6328125, + "learning_rate": 0.0002608147060124962, + "loss": 0.2023, + "step": 352220 + }, + { + "epoch": 14.59, + "grad_norm": 0.97265625, + "learning_rate": 0.0002608038709597057, + "loss": 0.1826, + "step": 352230 + }, + { + "epoch": 14.59, + "grad_norm": 1.4453125, + "learning_rate": 0.0002607930358865836, + "loss": 0.1924, + "step": 352240 + }, + { + "epoch": 14.59, + "grad_norm": 0.7734375, + "learning_rate": 0.0002607822007931501, + "loss": 0.2187, + "step": 352250 + }, + { + "epoch": 14.59, + "grad_norm": 0.6796875, + "learning_rate": 0.00026077136567942573, + "loss": 0.1844, + "step": 352260 + }, + { + "epoch": 14.59, + "grad_norm": 0.8203125, + "learning_rate": 0.0002607605305454308, + "loss": 0.2212, + "step": 352270 + }, + { + "epoch": 14.59, + "grad_norm": 0.96875, + "learning_rate": 0.0002607496953911857, + "loss": 0.2115, + "step": 352280 + }, + { + "epoch": 14.59, + "grad_norm": 0.6171875, + "learning_rate": 0.00026073886021671085, + "loss": 0.2173, + "step": 352290 + }, + { + "epoch": 14.59, + "grad_norm": 1.0625, + "learning_rate": 0.00026072802502202666, + "loss": 0.1769, + "step": 352300 + }, + { + "epoch": 14.59, + "grad_norm": 1.6640625, + "learning_rate": 0.0002607171898071534, + "loss": 0.1835, + "step": 352310 + }, + { + "epoch": 14.59, + "grad_norm": 0.447265625, + "learning_rate": 0.00026070635457211156, + "loss": 0.2092, + "step": 352320 + }, + { + "epoch": 14.59, + "grad_norm": 1.375, + "learning_rate": 0.0002606955193169215, + "loss": 0.1903, + "step": 352330 + }, + { + "epoch": 14.59, + "grad_norm": 0.51953125, + "learning_rate": 0.0002606846840416037, + "loss": 0.1646, + "step": 352340 + }, + { + "epoch": 14.59, + "grad_norm": 1.3125, + "learning_rate": 0.0002606738487461785, + "loss": 0.1937, + "step": 352350 + }, + { + "epoch": 14.59, + "grad_norm": 0.498046875, + "learning_rate": 0.00026066301343066626, + "loss": 0.1979, + "step": 352360 + }, + { + "epoch": 14.6, + "grad_norm": 1.71875, + "learning_rate": 0.0002606521780950873, + "loss": 0.2371, + "step": 352370 + }, + { + "epoch": 14.6, + "grad_norm": 0.3359375, + "learning_rate": 0.0002606413427394622, + "loss": 0.1923, + "step": 352380 + }, + { + "epoch": 14.6, + "grad_norm": 0.71484375, + "learning_rate": 0.00026063050736381104, + "loss": 0.1569, + "step": 352390 + }, + { + "epoch": 14.6, + "grad_norm": 0.921875, + "learning_rate": 0.0002606196719681546, + "loss": 0.182, + "step": 352400 + }, + { + "epoch": 14.6, + "grad_norm": 0.578125, + "learning_rate": 0.000260608836552513, + "loss": 0.1589, + "step": 352410 + }, + { + "epoch": 14.6, + "grad_norm": 0.9375, + "learning_rate": 0.0002605980011169067, + "loss": 0.2016, + "step": 352420 + }, + { + "epoch": 14.6, + "grad_norm": 1.203125, + "learning_rate": 0.0002605871656613562, + "loss": 0.2259, + "step": 352430 + }, + { + "epoch": 14.6, + "grad_norm": 0.29296875, + "learning_rate": 0.0002605763301858818, + "loss": 0.1744, + "step": 352440 + }, + { + "epoch": 14.6, + "grad_norm": 1.078125, + "learning_rate": 0.00026056549469050377, + "loss": 0.186, + "step": 352450 + }, + { + "epoch": 14.6, + "grad_norm": 0.98828125, + "learning_rate": 0.0002605546591752427, + "loss": 0.1962, + "step": 352460 + }, + { + "epoch": 14.6, + "grad_norm": 0.79296875, + "learning_rate": 0.00026054382364011885, + "loss": 0.1453, + "step": 352470 + }, + { + "epoch": 14.6, + "grad_norm": 0.142578125, + "learning_rate": 0.00026053298808515277, + "loss": 0.186, + "step": 352480 + }, + { + "epoch": 14.6, + "grad_norm": 0.69921875, + "learning_rate": 0.00026052215251036464, + "loss": 0.1995, + "step": 352490 + }, + { + "epoch": 14.6, + "grad_norm": 0.94140625, + "learning_rate": 0.000260511316915775, + "loss": 0.1515, + "step": 352500 + }, + { + "epoch": 14.6, + "grad_norm": 1.1328125, + "learning_rate": 0.0002605004813014042, + "loss": 0.1853, + "step": 352510 + }, + { + "epoch": 14.6, + "grad_norm": 0.9765625, + "learning_rate": 0.0002604896456672726, + "loss": 0.1485, + "step": 352520 + }, + { + "epoch": 14.6, + "grad_norm": 0.83984375, + "learning_rate": 0.0002604788100134006, + "loss": 0.1974, + "step": 352530 + }, + { + "epoch": 14.6, + "grad_norm": 1.0703125, + "learning_rate": 0.00026046797433980866, + "loss": 0.2045, + "step": 352540 + }, + { + "epoch": 14.6, + "grad_norm": 0.5625, + "learning_rate": 0.0002604571386465171, + "loss": 0.1939, + "step": 352550 + }, + { + "epoch": 14.6, + "grad_norm": 0.53125, + "learning_rate": 0.00026044630293354636, + "loss": 0.1223, + "step": 352560 + }, + { + "epoch": 14.6, + "grad_norm": 1.7421875, + "learning_rate": 0.0002604354672009168, + "loss": 0.1839, + "step": 352570 + }, + { + "epoch": 14.6, + "grad_norm": 0.7578125, + "learning_rate": 0.0002604246314486487, + "loss": 0.2306, + "step": 352580 + }, + { + "epoch": 14.6, + "grad_norm": 0.08544921875, + "learning_rate": 0.00026041379567676284, + "loss": 0.1431, + "step": 352590 + }, + { + "epoch": 14.6, + "grad_norm": 0.78125, + "learning_rate": 0.00026040295988527914, + "loss": 0.238, + "step": 352600 + }, + { + "epoch": 14.61, + "grad_norm": 0.59375, + "learning_rate": 0.0002603921240742182, + "loss": 0.1559, + "step": 352610 + }, + { + "epoch": 14.61, + "grad_norm": 0.68359375, + "learning_rate": 0.0002603812882436004, + "loss": 0.1869, + "step": 352620 + }, + { + "epoch": 14.61, + "grad_norm": 0.84765625, + "learning_rate": 0.00026037045239344617, + "loss": 0.1925, + "step": 352630 + }, + { + "epoch": 14.61, + "grad_norm": 2.15625, + "learning_rate": 0.0002603596165237759, + "loss": 0.211, + "step": 352640 + }, + { + "epoch": 14.61, + "grad_norm": 0.486328125, + "learning_rate": 0.00026034878063460994, + "loss": 0.1924, + "step": 352650 + }, + { + "epoch": 14.61, + "grad_norm": 0.447265625, + "learning_rate": 0.0002603379447259686, + "loss": 0.1503, + "step": 352660 + }, + { + "epoch": 14.61, + "grad_norm": 0.30859375, + "learning_rate": 0.0002603271087978725, + "loss": 0.1858, + "step": 352670 + }, + { + "epoch": 14.61, + "grad_norm": 0.6328125, + "learning_rate": 0.00026031627285034184, + "loss": 0.2145, + "step": 352680 + }, + { + "epoch": 14.61, + "grad_norm": 0.64453125, + "learning_rate": 0.00026030543688339704, + "loss": 0.1517, + "step": 352690 + }, + { + "epoch": 14.61, + "grad_norm": 0.482421875, + "learning_rate": 0.0002602946008970585, + "loss": 0.1907, + "step": 352700 + }, + { + "epoch": 14.61, + "grad_norm": 1.484375, + "learning_rate": 0.0002602837648913467, + "loss": 0.2118, + "step": 352710 + }, + { + "epoch": 14.61, + "grad_norm": 0.71875, + "learning_rate": 0.00026027292886628194, + "loss": 0.1909, + "step": 352720 + }, + { + "epoch": 14.61, + "grad_norm": 2.03125, + "learning_rate": 0.00026026209282188466, + "loss": 0.2196, + "step": 352730 + }, + { + "epoch": 14.61, + "grad_norm": 0.671875, + "learning_rate": 0.0002602512567581752, + "loss": 0.1924, + "step": 352740 + }, + { + "epoch": 14.61, + "grad_norm": 0.90234375, + "learning_rate": 0.00026024042067517396, + "loss": 0.1708, + "step": 352750 + }, + { + "epoch": 14.61, + "grad_norm": 1.015625, + "learning_rate": 0.00026022958457290143, + "loss": 0.2376, + "step": 352760 + }, + { + "epoch": 14.61, + "grad_norm": 0.66015625, + "learning_rate": 0.0002602187484513779, + "loss": 0.2245, + "step": 352770 + }, + { + "epoch": 14.61, + "grad_norm": 2.078125, + "learning_rate": 0.00026020791231062373, + "loss": 0.2392, + "step": 352780 + }, + { + "epoch": 14.61, + "grad_norm": 0.91015625, + "learning_rate": 0.0002601970761506594, + "loss": 0.1826, + "step": 352790 + }, + { + "epoch": 14.61, + "grad_norm": 0.8359375, + "learning_rate": 0.00026018623997150526, + "loss": 0.2184, + "step": 352800 + }, + { + "epoch": 14.61, + "grad_norm": 0.8125, + "learning_rate": 0.0002601754037731818, + "loss": 0.2422, + "step": 352810 + }, + { + "epoch": 14.61, + "grad_norm": 0.76953125, + "learning_rate": 0.0002601645675557092, + "loss": 0.1932, + "step": 352820 + }, + { + "epoch": 14.61, + "grad_norm": 0.88671875, + "learning_rate": 0.0002601537313191081, + "loss": 0.1347, + "step": 352830 + }, + { + "epoch": 14.61, + "grad_norm": 0.57421875, + "learning_rate": 0.0002601428950633987, + "loss": 0.1918, + "step": 352840 + }, + { + "epoch": 14.62, + "grad_norm": 0.66796875, + "learning_rate": 0.00026013205878860153, + "loss": 0.2005, + "step": 352850 + }, + { + "epoch": 14.62, + "grad_norm": 2.125, + "learning_rate": 0.00026012122249473687, + "loss": 0.1692, + "step": 352860 + }, + { + "epoch": 14.62, + "grad_norm": 0.30078125, + "learning_rate": 0.00026011038618182515, + "loss": 0.15, + "step": 352870 + }, + { + "epoch": 14.62, + "grad_norm": 0.283203125, + "learning_rate": 0.0002600995498498868, + "loss": 0.2104, + "step": 352880 + }, + { + "epoch": 14.62, + "grad_norm": 0.322265625, + "learning_rate": 0.0002600887134989422, + "loss": 0.1967, + "step": 352890 + }, + { + "epoch": 14.62, + "grad_norm": 1.890625, + "learning_rate": 0.0002600778771290117, + "loss": 0.221, + "step": 352900 + }, + { + "epoch": 14.62, + "grad_norm": 0.7109375, + "learning_rate": 0.0002600670407401157, + "loss": 0.1915, + "step": 352910 + }, + { + "epoch": 14.62, + "grad_norm": 0.51953125, + "learning_rate": 0.0002600562043322747, + "loss": 0.1744, + "step": 352920 + }, + { + "epoch": 14.62, + "grad_norm": 1.6171875, + "learning_rate": 0.000260045367905509, + "loss": 0.1805, + "step": 352930 + }, + { + "epoch": 14.62, + "grad_norm": 1.1171875, + "learning_rate": 0.0002600345314598389, + "loss": 0.1774, + "step": 352940 + }, + { + "epoch": 14.62, + "grad_norm": 0.921875, + "learning_rate": 0.000260023694995285, + "loss": 0.1747, + "step": 352950 + }, + { + "epoch": 14.62, + "grad_norm": 1.234375, + "learning_rate": 0.0002600128585118675, + "loss": 0.1473, + "step": 352960 + }, + { + "epoch": 14.62, + "grad_norm": 1.5, + "learning_rate": 0.000260002022009607, + "loss": 0.196, + "step": 352970 + }, + { + "epoch": 14.62, + "grad_norm": 0.67578125, + "learning_rate": 0.0002599911854885237, + "loss": 0.1422, + "step": 352980 + }, + { + "epoch": 14.62, + "grad_norm": 1.1328125, + "learning_rate": 0.00025998034894863804, + "loss": 0.2203, + "step": 352990 + }, + { + "epoch": 14.62, + "grad_norm": 1.34375, + "learning_rate": 0.00025996951238997054, + "loss": 0.1507, + "step": 353000 + }, + { + "epoch": 14.62, + "grad_norm": 0.98828125, + "learning_rate": 0.00025995867581254137, + "loss": 0.1628, + "step": 353010 + }, + { + "epoch": 14.62, + "grad_norm": 0.88671875, + "learning_rate": 0.0002599478392163711, + "loss": 0.2017, + "step": 353020 + }, + { + "epoch": 14.62, + "grad_norm": 0.66796875, + "learning_rate": 0.00025993700260148014, + "loss": 0.1945, + "step": 353030 + }, + { + "epoch": 14.62, + "grad_norm": 0.7109375, + "learning_rate": 0.0002599261659678887, + "loss": 0.1633, + "step": 353040 + }, + { + "epoch": 14.62, + "grad_norm": 0.890625, + "learning_rate": 0.0002599153293156174, + "loss": 0.2256, + "step": 353050 + }, + { + "epoch": 14.62, + "grad_norm": 0.78125, + "learning_rate": 0.0002599044926446864, + "loss": 0.1904, + "step": 353060 + }, + { + "epoch": 14.62, + "grad_norm": 0.90234375, + "learning_rate": 0.00025989365595511626, + "loss": 0.2221, + "step": 353070 + }, + { + "epoch": 14.62, + "grad_norm": 0.455078125, + "learning_rate": 0.0002598828192469274, + "loss": 0.1475, + "step": 353080 + }, + { + "epoch": 14.62, + "grad_norm": 0.62890625, + "learning_rate": 0.00025987198252014007, + "loss": 0.2341, + "step": 353090 + }, + { + "epoch": 14.63, + "grad_norm": 0.53125, + "learning_rate": 0.00025986114577477475, + "loss": 0.1884, + "step": 353100 + }, + { + "epoch": 14.63, + "grad_norm": 0.375, + "learning_rate": 0.00025985030901085185, + "loss": 0.1825, + "step": 353110 + }, + { + "epoch": 14.63, + "grad_norm": 0.7578125, + "learning_rate": 0.00025983947222839163, + "loss": 0.1543, + "step": 353120 + }, + { + "epoch": 14.63, + "grad_norm": 0.25, + "learning_rate": 0.00025982863542741477, + "loss": 0.1981, + "step": 353130 + }, + { + "epoch": 14.63, + "grad_norm": 0.80859375, + "learning_rate": 0.0002598177986079413, + "loss": 0.2066, + "step": 353140 + }, + { + "epoch": 14.63, + "grad_norm": 1.0546875, + "learning_rate": 0.00025980696176999184, + "loss": 0.1929, + "step": 353150 + }, + { + "epoch": 14.63, + "grad_norm": 0.75390625, + "learning_rate": 0.0002597961249135868, + "loss": 0.1649, + "step": 353160 + }, + { + "epoch": 14.63, + "grad_norm": 1.5078125, + "learning_rate": 0.0002597852880387464, + "loss": 0.1802, + "step": 353170 + }, + { + "epoch": 14.63, + "grad_norm": 0.609375, + "learning_rate": 0.00025977445114549124, + "loss": 0.1986, + "step": 353180 + }, + { + "epoch": 14.63, + "grad_norm": 0.796875, + "learning_rate": 0.0002597636142338416, + "loss": 0.207, + "step": 353190 + }, + { + "epoch": 14.63, + "grad_norm": 1.28125, + "learning_rate": 0.0002597527773038179, + "loss": 0.1912, + "step": 353200 + }, + { + "epoch": 14.63, + "grad_norm": 0.953125, + "learning_rate": 0.0002597419403554405, + "loss": 0.1842, + "step": 353210 + }, + { + "epoch": 14.63, + "grad_norm": 1.140625, + "learning_rate": 0.0002597311033887298, + "loss": 0.1647, + "step": 353220 + }, + { + "epoch": 14.63, + "grad_norm": 0.90234375, + "learning_rate": 0.0002597202664037063, + "loss": 0.1271, + "step": 353230 + }, + { + "epoch": 14.63, + "grad_norm": 1.6171875, + "learning_rate": 0.00025970942940039025, + "loss": 0.1473, + "step": 353240 + }, + { + "epoch": 14.63, + "grad_norm": 0.71484375, + "learning_rate": 0.00025969859237880204, + "loss": 0.1554, + "step": 353250 + }, + { + "epoch": 14.63, + "grad_norm": 0.60546875, + "learning_rate": 0.00025968775533896226, + "loss": 0.2205, + "step": 353260 + }, + { + "epoch": 14.63, + "grad_norm": 0.92578125, + "learning_rate": 0.0002596769182808911, + "loss": 0.1546, + "step": 353270 + }, + { + "epoch": 14.63, + "grad_norm": 0.326171875, + "learning_rate": 0.000259666081204609, + "loss": 0.2154, + "step": 353280 + }, + { + "epoch": 14.63, + "grad_norm": 0.7890625, + "learning_rate": 0.0002596552441101365, + "loss": 0.2427, + "step": 353290 + }, + { + "epoch": 14.63, + "grad_norm": 0.6484375, + "learning_rate": 0.0002596444069974937, + "loss": 0.2023, + "step": 353300 + }, + { + "epoch": 14.63, + "grad_norm": 0.5625, + "learning_rate": 0.0002596335698667013, + "loss": 0.2119, + "step": 353310 + }, + { + "epoch": 14.63, + "grad_norm": 0.65625, + "learning_rate": 0.0002596227327177795, + "loss": 0.1866, + "step": 353320 + }, + { + "epoch": 14.63, + "grad_norm": 0.8125, + "learning_rate": 0.0002596118955507487, + "loss": 0.2093, + "step": 353330 + }, + { + "epoch": 14.64, + "grad_norm": 0.6171875, + "learning_rate": 0.0002596010583656295, + "loss": 0.2068, + "step": 353340 + }, + { + "epoch": 14.64, + "grad_norm": 0.640625, + "learning_rate": 0.000259590221162442, + "loss": 0.2094, + "step": 353350 + }, + { + "epoch": 14.64, + "grad_norm": 1.3671875, + "learning_rate": 0.0002595793839412068, + "loss": 0.2048, + "step": 353360 + }, + { + "epoch": 14.64, + "grad_norm": 2.296875, + "learning_rate": 0.0002595685467019443, + "loss": 0.2088, + "step": 353370 + }, + { + "epoch": 14.64, + "grad_norm": 0.359375, + "learning_rate": 0.00025955770944467466, + "loss": 0.1798, + "step": 353380 + }, + { + "epoch": 14.64, + "grad_norm": 0.85546875, + "learning_rate": 0.0002595468721694186, + "loss": 0.1997, + "step": 353390 + }, + { + "epoch": 14.64, + "grad_norm": 0.8203125, + "learning_rate": 0.0002595360348761963, + "loss": 0.228, + "step": 353400 + }, + { + "epoch": 14.64, + "grad_norm": 1.1484375, + "learning_rate": 0.00025952519756502817, + "loss": 0.1951, + "step": 353410 + }, + { + "epoch": 14.64, + "grad_norm": 1.359375, + "learning_rate": 0.0002595143602359348, + "loss": 0.1812, + "step": 353420 + }, + { + "epoch": 14.64, + "grad_norm": 1.5546875, + "learning_rate": 0.00025950352288893626, + "loss": 0.1865, + "step": 353430 + }, + { + "epoch": 14.64, + "grad_norm": 1.359375, + "learning_rate": 0.0002594926855240532, + "loss": 0.1939, + "step": 353440 + }, + { + "epoch": 14.64, + "grad_norm": 0.7734375, + "learning_rate": 0.00025948184814130593, + "loss": 0.2105, + "step": 353450 + }, + { + "epoch": 14.64, + "grad_norm": 0.29296875, + "learning_rate": 0.0002594710107407148, + "loss": 0.2079, + "step": 353460 + }, + { + "epoch": 14.64, + "grad_norm": 1.03125, + "learning_rate": 0.00025946017332230024, + "loss": 0.1846, + "step": 353470 + }, + { + "epoch": 14.64, + "grad_norm": 3.0, + "learning_rate": 0.0002594493358860827, + "loss": 0.1958, + "step": 353480 + }, + { + "epoch": 14.64, + "grad_norm": 1.4453125, + "learning_rate": 0.0002594384984320825, + "loss": 0.1697, + "step": 353490 + }, + { + "epoch": 14.64, + "grad_norm": 0.9140625, + "learning_rate": 0.0002594276609603202, + "loss": 0.1711, + "step": 353500 + }, + { + "epoch": 14.64, + "grad_norm": 1.3515625, + "learning_rate": 0.0002594168234708159, + "loss": 0.19, + "step": 353510 + }, + { + "epoch": 14.64, + "grad_norm": 1.25, + "learning_rate": 0.00025940598596359016, + "loss": 0.2038, + "step": 353520 + }, + { + "epoch": 14.64, + "grad_norm": 0.76953125, + "learning_rate": 0.0002593951484386635, + "loss": 0.2236, + "step": 353530 + }, + { + "epoch": 14.64, + "grad_norm": 1.4453125, + "learning_rate": 0.000259384310896056, + "loss": 0.1884, + "step": 353540 + }, + { + "epoch": 14.64, + "grad_norm": 0.62109375, + "learning_rate": 0.00025937347333578834, + "loss": 0.1624, + "step": 353550 + }, + { + "epoch": 14.64, + "grad_norm": 0.80078125, + "learning_rate": 0.0002593626357578808, + "loss": 0.1742, + "step": 353560 + }, + { + "epoch": 14.64, + "grad_norm": 0.427734375, + "learning_rate": 0.0002593517981623538, + "loss": 0.1833, + "step": 353570 + }, + { + "epoch": 14.65, + "grad_norm": 0.6328125, + "learning_rate": 0.00025934096054922777, + "loss": 0.2079, + "step": 353580 + }, + { + "epoch": 14.65, + "grad_norm": 0.328125, + "learning_rate": 0.000259330122918523, + "loss": 0.2386, + "step": 353590 + }, + { + "epoch": 14.65, + "grad_norm": 0.58984375, + "learning_rate": 0.00025931928527026, + "loss": 0.19, + "step": 353600 + }, + { + "epoch": 14.65, + "grad_norm": 1.109375, + "learning_rate": 0.00025930844760445917, + "loss": 0.2575, + "step": 353610 + }, + { + "epoch": 14.65, + "grad_norm": 1.421875, + "learning_rate": 0.00025929760992114063, + "loss": 0.1748, + "step": 353620 + }, + { + "epoch": 14.65, + "grad_norm": 0.7421875, + "learning_rate": 0.00025928677222032513, + "loss": 0.1136, + "step": 353630 + }, + { + "epoch": 14.65, + "grad_norm": 1.1328125, + "learning_rate": 0.00025927593450203294, + "loss": 0.1691, + "step": 353640 + }, + { + "epoch": 14.65, + "grad_norm": 0.64453125, + "learning_rate": 0.0002592650967662844, + "loss": 0.181, + "step": 353650 + }, + { + "epoch": 14.65, + "grad_norm": 0.76953125, + "learning_rate": 0.0002592542590131, + "loss": 0.1217, + "step": 353660 + }, + { + "epoch": 14.65, + "grad_norm": 1.75, + "learning_rate": 0.00025924342124250007, + "loss": 0.18, + "step": 353670 + }, + { + "epoch": 14.65, + "grad_norm": 1.9296875, + "learning_rate": 0.00025923258345450495, + "loss": 0.1764, + "step": 353680 + }, + { + "epoch": 14.65, + "grad_norm": 0.51953125, + "learning_rate": 0.00025922174564913523, + "loss": 0.2126, + "step": 353690 + }, + { + "epoch": 14.65, + "grad_norm": 0.83984375, + "learning_rate": 0.00025921090782641106, + "loss": 0.2198, + "step": 353700 + }, + { + "epoch": 14.65, + "grad_norm": 1.2734375, + "learning_rate": 0.00025920006998635304, + "loss": 0.2386, + "step": 353710 + }, + { + "epoch": 14.65, + "grad_norm": 0.84765625, + "learning_rate": 0.00025918923212898146, + "loss": 0.1665, + "step": 353720 + }, + { + "epoch": 14.65, + "grad_norm": 1.296875, + "learning_rate": 0.0002591783942543167, + "loss": 0.2432, + "step": 353730 + }, + { + "epoch": 14.65, + "grad_norm": 0.4140625, + "learning_rate": 0.0002591675563623792, + "loss": 0.1744, + "step": 353740 + }, + { + "epoch": 14.65, + "grad_norm": 0.7109375, + "learning_rate": 0.0002591567184531894, + "loss": 0.2351, + "step": 353750 + }, + { + "epoch": 14.65, + "grad_norm": 0.44140625, + "learning_rate": 0.00025914588052676756, + "loss": 0.1838, + "step": 353760 + }, + { + "epoch": 14.65, + "grad_norm": 1.25, + "learning_rate": 0.0002591350425831342, + "loss": 0.1805, + "step": 353770 + }, + { + "epoch": 14.65, + "grad_norm": 0.6484375, + "learning_rate": 0.00025912420462230966, + "loss": 0.1959, + "step": 353780 + }, + { + "epoch": 14.65, + "grad_norm": 0.8125, + "learning_rate": 0.00025911336664431446, + "loss": 0.2443, + "step": 353790 + }, + { + "epoch": 14.65, + "grad_norm": 0.25, + "learning_rate": 0.00025910252864916875, + "loss": 0.1921, + "step": 353800 + }, + { + "epoch": 14.65, + "grad_norm": 0.765625, + "learning_rate": 0.00025909169063689314, + "loss": 0.1922, + "step": 353810 + }, + { + "epoch": 14.66, + "grad_norm": 0.8359375, + "learning_rate": 0.00025908085260750794, + "loss": 0.2135, + "step": 353820 + }, + { + "epoch": 14.66, + "grad_norm": 0.77734375, + "learning_rate": 0.00025907001456103356, + "loss": 0.2282, + "step": 353830 + }, + { + "epoch": 14.66, + "grad_norm": 0.87890625, + "learning_rate": 0.00025905917649749036, + "loss": 0.1829, + "step": 353840 + }, + { + "epoch": 14.66, + "grad_norm": 1.609375, + "learning_rate": 0.0002590483384168988, + "loss": 0.1711, + "step": 353850 + }, + { + "epoch": 14.66, + "grad_norm": 0.8515625, + "learning_rate": 0.0002590375003192792, + "loss": 0.1719, + "step": 353860 + }, + { + "epoch": 14.66, + "grad_norm": 1.359375, + "learning_rate": 0.00025902666220465204, + "loss": 0.2097, + "step": 353870 + }, + { + "epoch": 14.66, + "grad_norm": 0.609375, + "learning_rate": 0.0002590158240730377, + "loss": 0.224, + "step": 353880 + }, + { + "epoch": 14.66, + "grad_norm": 0.71875, + "learning_rate": 0.00025900498592445646, + "loss": 0.1774, + "step": 353890 + }, + { + "epoch": 14.66, + "grad_norm": 0.78515625, + "learning_rate": 0.0002589941477589289, + "loss": 0.1981, + "step": 353900 + }, + { + "epoch": 14.66, + "grad_norm": 1.4296875, + "learning_rate": 0.00025898330957647525, + "loss": 0.1868, + "step": 353910 + }, + { + "epoch": 14.66, + "grad_norm": 1.1796875, + "learning_rate": 0.0002589724713771161, + "loss": 0.2038, + "step": 353920 + }, + { + "epoch": 14.66, + "grad_norm": 0.84765625, + "learning_rate": 0.00025896163316087165, + "loss": 0.1623, + "step": 353930 + }, + { + "epoch": 14.66, + "grad_norm": 1.9296875, + "learning_rate": 0.0002589507949277624, + "loss": 0.1867, + "step": 353940 + }, + { + "epoch": 14.66, + "grad_norm": 0.66796875, + "learning_rate": 0.00025893995667780866, + "loss": 0.1808, + "step": 353950 + }, + { + "epoch": 14.66, + "grad_norm": 0.84765625, + "learning_rate": 0.00025892911841103095, + "loss": 0.1873, + "step": 353960 + }, + { + "epoch": 14.66, + "grad_norm": 0.7265625, + "learning_rate": 0.0002589182801274496, + "loss": 0.1835, + "step": 353970 + }, + { + "epoch": 14.66, + "grad_norm": 0.63671875, + "learning_rate": 0.0002589074418270849, + "loss": 0.1903, + "step": 353980 + }, + { + "epoch": 14.66, + "grad_norm": 0.7109375, + "learning_rate": 0.0002588966035099575, + "loss": 0.2415, + "step": 353990 + }, + { + "epoch": 14.66, + "grad_norm": 0.49609375, + "learning_rate": 0.00025888576517608757, + "loss": 0.195, + "step": 354000 + }, + { + "epoch": 14.66, + "grad_norm": 0.9921875, + "learning_rate": 0.0002588749268254956, + "loss": 0.2198, + "step": 354010 + }, + { + "epoch": 14.66, + "grad_norm": 1.171875, + "learning_rate": 0.000258864088458202, + "loss": 0.1932, + "step": 354020 + }, + { + "epoch": 14.66, + "grad_norm": 0.9375, + "learning_rate": 0.00025885325007422714, + "loss": 0.223, + "step": 354030 + }, + { + "epoch": 14.66, + "grad_norm": 1.1484375, + "learning_rate": 0.00025884241167359144, + "loss": 0.2108, + "step": 354040 + }, + { + "epoch": 14.66, + "grad_norm": 0.91015625, + "learning_rate": 0.00025883157325631525, + "loss": 0.1754, + "step": 354050 + }, + { + "epoch": 14.67, + "grad_norm": 0.458984375, + "learning_rate": 0.00025882073482241894, + "loss": 0.1838, + "step": 354060 + }, + { + "epoch": 14.67, + "grad_norm": 1.171875, + "learning_rate": 0.000258809896371923, + "loss": 0.2107, + "step": 354070 + }, + { + "epoch": 14.67, + "grad_norm": 0.59765625, + "learning_rate": 0.00025879905790484784, + "loss": 0.199, + "step": 354080 + }, + { + "epoch": 14.67, + "grad_norm": 0.66796875, + "learning_rate": 0.00025878821942121376, + "loss": 0.184, + "step": 354090 + }, + { + "epoch": 14.67, + "grad_norm": 0.8515625, + "learning_rate": 0.00025877738092104126, + "loss": 0.243, + "step": 354100 + }, + { + "epoch": 14.67, + "grad_norm": 1.046875, + "learning_rate": 0.0002587665424043506, + "loss": 0.1879, + "step": 354110 + }, + { + "epoch": 14.67, + "grad_norm": 0.470703125, + "learning_rate": 0.0002587557038711623, + "loss": 0.1866, + "step": 354120 + }, + { + "epoch": 14.67, + "grad_norm": 0.75390625, + "learning_rate": 0.0002587448653214967, + "loss": 0.1756, + "step": 354130 + }, + { + "epoch": 14.67, + "grad_norm": 0.796875, + "learning_rate": 0.00025873402675537417, + "loss": 0.1866, + "step": 354140 + }, + { + "epoch": 14.67, + "grad_norm": 1.015625, + "learning_rate": 0.00025872318817281515, + "loss": 0.1977, + "step": 354150 + }, + { + "epoch": 14.67, + "grad_norm": 1.5, + "learning_rate": 0.00025871234957384006, + "loss": 0.181, + "step": 354160 + }, + { + "epoch": 14.67, + "grad_norm": 0.482421875, + "learning_rate": 0.0002587015109584693, + "loss": 0.2148, + "step": 354170 + }, + { + "epoch": 14.67, + "grad_norm": 0.8828125, + "learning_rate": 0.00025869067232672316, + "loss": 0.2445, + "step": 354180 + }, + { + "epoch": 14.67, + "grad_norm": 0.62109375, + "learning_rate": 0.0002586798336786221, + "loss": 0.1668, + "step": 354190 + }, + { + "epoch": 14.67, + "grad_norm": 1.0703125, + "learning_rate": 0.0002586689950141866, + "loss": 0.2198, + "step": 354200 + }, + { + "epoch": 14.67, + "grad_norm": 1.3515625, + "learning_rate": 0.00025865815633343696, + "loss": 0.1662, + "step": 354210 + }, + { + "epoch": 14.67, + "grad_norm": 0.48046875, + "learning_rate": 0.0002586473176363937, + "loss": 0.2519, + "step": 354220 + }, + { + "epoch": 14.67, + "grad_norm": 2.203125, + "learning_rate": 0.00025863647892307695, + "loss": 0.1457, + "step": 354230 + }, + { + "epoch": 14.67, + "grad_norm": 0.74609375, + "learning_rate": 0.00025862564019350743, + "loss": 0.2073, + "step": 354240 + }, + { + "epoch": 14.67, + "grad_norm": 0.7265625, + "learning_rate": 0.0002586148014477053, + "loss": 0.1504, + "step": 354250 + }, + { + "epoch": 14.67, + "grad_norm": 0.95703125, + "learning_rate": 0.00025860396268569103, + "loss": 0.2127, + "step": 354260 + }, + { + "epoch": 14.67, + "grad_norm": 0.953125, + "learning_rate": 0.00025859312390748507, + "loss": 0.1889, + "step": 354270 + }, + { + "epoch": 14.67, + "grad_norm": 0.578125, + "learning_rate": 0.00025858228511310775, + "loss": 0.2231, + "step": 354280 + }, + { + "epoch": 14.67, + "grad_norm": 1.0078125, + "learning_rate": 0.00025857144630257956, + "loss": 0.202, + "step": 354290 + }, + { + "epoch": 14.68, + "grad_norm": 1.234375, + "learning_rate": 0.00025856060747592077, + "loss": 0.167, + "step": 354300 + }, + { + "epoch": 14.68, + "grad_norm": 1.2734375, + "learning_rate": 0.00025854976863315187, + "loss": 0.2032, + "step": 354310 + }, + { + "epoch": 14.68, + "grad_norm": 0.8046875, + "learning_rate": 0.0002585389297742932, + "loss": 0.2021, + "step": 354320 + }, + { + "epoch": 14.68, + "grad_norm": 0.6875, + "learning_rate": 0.00025852809089936524, + "loss": 0.1827, + "step": 354330 + }, + { + "epoch": 14.68, + "grad_norm": 0.5859375, + "learning_rate": 0.00025851725200838827, + "loss": 0.212, + "step": 354340 + }, + { + "epoch": 14.68, + "grad_norm": 0.7265625, + "learning_rate": 0.0002585064131013828, + "loss": 0.1755, + "step": 354350 + }, + { + "epoch": 14.68, + "grad_norm": 0.4921875, + "learning_rate": 0.0002584955741783692, + "loss": 0.2142, + "step": 354360 + }, + { + "epoch": 14.68, + "grad_norm": 0.56640625, + "learning_rate": 0.00025848473523936783, + "loss": 0.1579, + "step": 354370 + }, + { + "epoch": 14.68, + "grad_norm": 0.330078125, + "learning_rate": 0.000258473896284399, + "loss": 0.1769, + "step": 354380 + }, + { + "epoch": 14.68, + "grad_norm": 0.7734375, + "learning_rate": 0.00025846305731348334, + "loss": 0.2063, + "step": 354390 + }, + { + "epoch": 14.68, + "grad_norm": 1.625, + "learning_rate": 0.00025845221832664105, + "loss": 0.1719, + "step": 354400 + }, + { + "epoch": 14.68, + "grad_norm": 0.59375, + "learning_rate": 0.00025844137932389265, + "loss": 0.2001, + "step": 354410 + }, + { + "epoch": 14.68, + "grad_norm": 0.83203125, + "learning_rate": 0.00025843054030525846, + "loss": 0.1984, + "step": 354420 + }, + { + "epoch": 14.68, + "grad_norm": 1.0625, + "learning_rate": 0.00025841970127075893, + "loss": 0.1401, + "step": 354430 + }, + { + "epoch": 14.68, + "grad_norm": 1.3125, + "learning_rate": 0.00025840886222041443, + "loss": 0.2055, + "step": 354440 + }, + { + "epoch": 14.68, + "grad_norm": 0.70703125, + "learning_rate": 0.0002583980231542453, + "loss": 0.1836, + "step": 354450 + }, + { + "epoch": 14.68, + "grad_norm": 0.75, + "learning_rate": 0.000258387184072272, + "loss": 0.1846, + "step": 354460 + }, + { + "epoch": 14.68, + "grad_norm": 1.71875, + "learning_rate": 0.000258376344974515, + "loss": 0.2234, + "step": 354470 + }, + { + "epoch": 14.68, + "grad_norm": 0.91796875, + "learning_rate": 0.00025836550586099457, + "loss": 0.2019, + "step": 354480 + }, + { + "epoch": 14.68, + "grad_norm": 1.1328125, + "learning_rate": 0.0002583546667317312, + "loss": 0.1602, + "step": 354490 + }, + { + "epoch": 14.68, + "grad_norm": 1.421875, + "learning_rate": 0.0002583438275867452, + "loss": 0.1759, + "step": 354500 + }, + { + "epoch": 14.68, + "grad_norm": 0.78125, + "learning_rate": 0.000258332988426057, + "loss": 0.2226, + "step": 354510 + }, + { + "epoch": 14.68, + "grad_norm": 1.0625, + "learning_rate": 0.0002583221492496871, + "loss": 0.1574, + "step": 354520 + }, + { + "epoch": 14.68, + "grad_norm": 1.546875, + "learning_rate": 0.0002583113100576557, + "loss": 0.1507, + "step": 354530 + }, + { + "epoch": 14.69, + "grad_norm": 0.7890625, + "learning_rate": 0.0002583004708499835, + "loss": 0.1325, + "step": 354540 + }, + { + "epoch": 14.69, + "grad_norm": 1.6640625, + "learning_rate": 0.00025828963162669055, + "loss": 0.2431, + "step": 354550 + }, + { + "epoch": 14.69, + "grad_norm": 1.0546875, + "learning_rate": 0.00025827879238779744, + "loss": 0.2179, + "step": 354560 + }, + { + "epoch": 14.69, + "grad_norm": 1.125, + "learning_rate": 0.00025826795313332456, + "loss": 0.1586, + "step": 354570 + }, + { + "epoch": 14.69, + "grad_norm": 1.296875, + "learning_rate": 0.00025825711386329224, + "loss": 0.1886, + "step": 354580 + }, + { + "epoch": 14.69, + "grad_norm": 1.25, + "learning_rate": 0.000258246274577721, + "loss": 0.1987, + "step": 354590 + }, + { + "epoch": 14.69, + "grad_norm": 0.73046875, + "learning_rate": 0.00025823543527663114, + "loss": 0.2128, + "step": 354600 + }, + { + "epoch": 14.69, + "grad_norm": 0.89453125, + "learning_rate": 0.000258224595960043, + "loss": 0.2085, + "step": 354610 + }, + { + "epoch": 14.69, + "grad_norm": 0.22265625, + "learning_rate": 0.00025821375662797715, + "loss": 0.1625, + "step": 354620 + }, + { + "epoch": 14.69, + "grad_norm": 0.796875, + "learning_rate": 0.00025820291728045387, + "loss": 0.2284, + "step": 354630 + }, + { + "epoch": 14.69, + "grad_norm": 0.77734375, + "learning_rate": 0.00025819207791749355, + "loss": 0.1763, + "step": 354640 + }, + { + "epoch": 14.69, + "grad_norm": 0.78515625, + "learning_rate": 0.0002581812385391167, + "loss": 0.1744, + "step": 354650 + }, + { + "epoch": 14.69, + "grad_norm": 2.1875, + "learning_rate": 0.00025817039914534357, + "loss": 0.1888, + "step": 354660 + }, + { + "epoch": 14.69, + "grad_norm": 0.875, + "learning_rate": 0.00025815955973619467, + "loss": 0.1755, + "step": 354670 + }, + { + "epoch": 14.69, + "grad_norm": 0.62890625, + "learning_rate": 0.0002581487203116903, + "loss": 0.2088, + "step": 354680 + }, + { + "epoch": 14.69, + "grad_norm": 1.6484375, + "learning_rate": 0.00025813788087185096, + "loss": 0.1792, + "step": 354690 + }, + { + "epoch": 14.69, + "grad_norm": 0.8984375, + "learning_rate": 0.0002581270414166971, + "loss": 0.1969, + "step": 354700 + }, + { + "epoch": 14.69, + "grad_norm": 0.56640625, + "learning_rate": 0.00025811620194624886, + "loss": 0.1363, + "step": 354710 + }, + { + "epoch": 14.69, + "grad_norm": 1.21875, + "learning_rate": 0.0002581053624605269, + "loss": 0.2043, + "step": 354720 + }, + { + "epoch": 14.69, + "grad_norm": 2.046875, + "learning_rate": 0.00025809452295955153, + "loss": 0.2109, + "step": 354730 + }, + { + "epoch": 14.69, + "grad_norm": 0.59375, + "learning_rate": 0.00025808368344334303, + "loss": 0.207, + "step": 354740 + }, + { + "epoch": 14.69, + "grad_norm": 0.578125, + "learning_rate": 0.00025807284391192204, + "loss": 0.2406, + "step": 354750 + }, + { + "epoch": 14.69, + "grad_norm": 1.609375, + "learning_rate": 0.0002580620043653089, + "loss": 0.2062, + "step": 354760 + }, + { + "epoch": 14.69, + "grad_norm": 0.890625, + "learning_rate": 0.0002580511648035237, + "loss": 0.1855, + "step": 354770 + }, + { + "epoch": 14.69, + "grad_norm": 0.51953125, + "learning_rate": 0.00025804032522658727, + "loss": 0.208, + "step": 354780 + }, + { + "epoch": 14.7, + "grad_norm": 0.93359375, + "learning_rate": 0.00025802948563451975, + "loss": 0.1782, + "step": 354790 + }, + { + "epoch": 14.7, + "grad_norm": 0.58203125, + "learning_rate": 0.00025801864602734153, + "loss": 0.2383, + "step": 354800 + }, + { + "epoch": 14.7, + "grad_norm": 1.3828125, + "learning_rate": 0.00025800780640507324, + "loss": 0.262, + "step": 354810 + }, + { + "epoch": 14.7, + "grad_norm": 0.90625, + "learning_rate": 0.0002579969667677349, + "loss": 0.1941, + "step": 354820 + }, + { + "epoch": 14.7, + "grad_norm": 0.3203125, + "learning_rate": 0.00025798612711534736, + "loss": 0.2401, + "step": 354830 + }, + { + "epoch": 14.7, + "grad_norm": 0.56640625, + "learning_rate": 0.00025797528744793077, + "loss": 0.1773, + "step": 354840 + }, + { + "epoch": 14.7, + "grad_norm": 0.416015625, + "learning_rate": 0.0002579644477655054, + "loss": 0.1911, + "step": 354850 + }, + { + "epoch": 14.7, + "grad_norm": 0.3046875, + "learning_rate": 0.0002579536080680919, + "loss": 0.238, + "step": 354860 + }, + { + "epoch": 14.7, + "grad_norm": 1.046875, + "learning_rate": 0.0002579427683557106, + "loss": 0.2087, + "step": 354870 + }, + { + "epoch": 14.7, + "grad_norm": 0.6875, + "learning_rate": 0.0002579319286283818, + "loss": 0.2146, + "step": 354880 + }, + { + "epoch": 14.7, + "grad_norm": 0.419921875, + "learning_rate": 0.00025792108888612604, + "loss": 0.1896, + "step": 354890 + }, + { + "epoch": 14.7, + "grad_norm": 0.6328125, + "learning_rate": 0.0002579102491289635, + "loss": 0.1895, + "step": 354900 + }, + { + "epoch": 14.7, + "grad_norm": 0.91796875, + "learning_rate": 0.00025789940935691487, + "loss": 0.1872, + "step": 354910 + }, + { + "epoch": 14.7, + "grad_norm": 1.2578125, + "learning_rate": 0.0002578885695700004, + "loss": 0.1999, + "step": 354920 + }, + { + "epoch": 14.7, + "grad_norm": 0.5234375, + "learning_rate": 0.00025787772976824036, + "loss": 0.1812, + "step": 354930 + }, + { + "epoch": 14.7, + "grad_norm": 1.046875, + "learning_rate": 0.00025786688995165543, + "loss": 0.2509, + "step": 354940 + }, + { + "epoch": 14.7, + "grad_norm": 0.8359375, + "learning_rate": 0.00025785605012026573, + "loss": 0.1884, + "step": 354950 + }, + { + "epoch": 14.7, + "grad_norm": 2.03125, + "learning_rate": 0.0002578452102740919, + "loss": 0.1966, + "step": 354960 + }, + { + "epoch": 14.7, + "grad_norm": 0.4375, + "learning_rate": 0.0002578343704131542, + "loss": 0.189, + "step": 354970 + }, + { + "epoch": 14.7, + "grad_norm": 0.00010251998901367188, + "learning_rate": 0.000257823530537473, + "loss": 0.1689, + "step": 354980 + }, + { + "epoch": 14.7, + "grad_norm": 0.953125, + "learning_rate": 0.00025781269064706884, + "loss": 0.1951, + "step": 354990 + }, + { + "epoch": 14.7, + "grad_norm": 0.5703125, + "learning_rate": 0.0002578018507419621, + "loss": 0.1883, + "step": 355000 + }, + { + "epoch": 14.7, + "grad_norm": 0.98828125, + "learning_rate": 0.00025779101082217296, + "loss": 0.1853, + "step": 355010 + }, + { + "epoch": 14.7, + "grad_norm": 0.88671875, + "learning_rate": 0.0002577801708877221, + "loss": 0.1823, + "step": 355020 + }, + { + "epoch": 14.71, + "grad_norm": 0.7890625, + "learning_rate": 0.0002577693309386298, + "loss": 0.2109, + "step": 355030 + }, + { + "epoch": 14.71, + "grad_norm": 1.0390625, + "learning_rate": 0.0002577584909749164, + "loss": 0.2161, + "step": 355040 + }, + { + "epoch": 14.71, + "grad_norm": 0.90625, + "learning_rate": 0.00025774765099660234, + "loss": 0.1574, + "step": 355050 + }, + { + "epoch": 14.71, + "grad_norm": 1.2421875, + "learning_rate": 0.00025773681100370804, + "loss": 0.2094, + "step": 355060 + }, + { + "epoch": 14.71, + "grad_norm": 0.5234375, + "learning_rate": 0.00025772597099625393, + "loss": 0.119, + "step": 355070 + }, + { + "epoch": 14.71, + "grad_norm": 0.9453125, + "learning_rate": 0.0002577151309742604, + "loss": 0.1668, + "step": 355080 + }, + { + "epoch": 14.71, + "grad_norm": 1.4609375, + "learning_rate": 0.0002577042909377478, + "loss": 0.1291, + "step": 355090 + }, + { + "epoch": 14.71, + "grad_norm": 1.1875, + "learning_rate": 0.00025769345088673656, + "loss": 0.1906, + "step": 355100 + }, + { + "epoch": 14.71, + "grad_norm": 0.5859375, + "learning_rate": 0.00025768261082124706, + "loss": 0.2169, + "step": 355110 + }, + { + "epoch": 14.71, + "grad_norm": 0.92578125, + "learning_rate": 0.00025767177074129973, + "loss": 0.1624, + "step": 355120 + }, + { + "epoch": 14.71, + "grad_norm": 0.44921875, + "learning_rate": 0.000257660930646915, + "loss": 0.2091, + "step": 355130 + }, + { + "epoch": 14.71, + "grad_norm": 0.5078125, + "learning_rate": 0.0002576500905381131, + "loss": 0.1501, + "step": 355140 + }, + { + "epoch": 14.71, + "grad_norm": 0.59375, + "learning_rate": 0.0002576392504149147, + "loss": 0.1573, + "step": 355150 + }, + { + "epoch": 14.71, + "grad_norm": 0.53515625, + "learning_rate": 0.00025762841027733997, + "loss": 0.1715, + "step": 355160 + }, + { + "epoch": 14.71, + "grad_norm": 0.490234375, + "learning_rate": 0.00025761757012540936, + "loss": 0.2834, + "step": 355170 + }, + { + "epoch": 14.71, + "grad_norm": 0.5234375, + "learning_rate": 0.00025760672995914344, + "loss": 0.135, + "step": 355180 + }, + { + "epoch": 14.71, + "grad_norm": 0.33203125, + "learning_rate": 0.0002575958897785624, + "loss": 0.1344, + "step": 355190 + }, + { + "epoch": 14.71, + "grad_norm": 0.85546875, + "learning_rate": 0.00025758504958368667, + "loss": 0.2055, + "step": 355200 + }, + { + "epoch": 14.71, + "grad_norm": 1.1875, + "learning_rate": 0.0002575742093745368, + "loss": 0.1911, + "step": 355210 + }, + { + "epoch": 14.71, + "grad_norm": 0.76171875, + "learning_rate": 0.00025756336915113293, + "loss": 0.2006, + "step": 355220 + }, + { + "epoch": 14.71, + "grad_norm": 1.2734375, + "learning_rate": 0.00025755252891349577, + "loss": 0.1389, + "step": 355230 + }, + { + "epoch": 14.71, + "grad_norm": 0.51171875, + "learning_rate": 0.0002575416886616455, + "loss": 0.1678, + "step": 355240 + }, + { + "epoch": 14.71, + "grad_norm": 1.453125, + "learning_rate": 0.0002575308483956026, + "loss": 0.2103, + "step": 355250 + }, + { + "epoch": 14.71, + "grad_norm": 0.84375, + "learning_rate": 0.0002575200081153875, + "loss": 0.1872, + "step": 355260 + }, + { + "epoch": 14.72, + "grad_norm": 0.5390625, + "learning_rate": 0.0002575091678210205, + "loss": 0.1545, + "step": 355270 + }, + { + "epoch": 14.72, + "grad_norm": 0.7421875, + "learning_rate": 0.000257498327512522, + "loss": 0.1667, + "step": 355280 + }, + { + "epoch": 14.72, + "grad_norm": 0.96484375, + "learning_rate": 0.00025748748718991253, + "loss": 0.2064, + "step": 355290 + }, + { + "epoch": 14.72, + "grad_norm": 1.2734375, + "learning_rate": 0.0002574766468532124, + "loss": 0.2391, + "step": 355300 + }, + { + "epoch": 14.72, + "grad_norm": 0.63671875, + "learning_rate": 0.00025746580650244205, + "loss": 0.2058, + "step": 355310 + }, + { + "epoch": 14.72, + "grad_norm": 0.008544921875, + "learning_rate": 0.0002574549661376219, + "loss": 0.1798, + "step": 355320 + }, + { + "epoch": 14.72, + "grad_norm": 0.55078125, + "learning_rate": 0.00025744412575877217, + "loss": 0.2045, + "step": 355330 + }, + { + "epoch": 14.72, + "grad_norm": 1.0, + "learning_rate": 0.0002574332853659135, + "loss": 0.183, + "step": 355340 + }, + { + "epoch": 14.72, + "grad_norm": 0.4609375, + "learning_rate": 0.00025742244495906627, + "loss": 0.1722, + "step": 355350 + }, + { + "epoch": 14.72, + "grad_norm": 1.203125, + "learning_rate": 0.00025741160453825066, + "loss": 0.1747, + "step": 355360 + }, + { + "epoch": 14.72, + "grad_norm": 2.0625, + "learning_rate": 0.0002574007641034872, + "loss": 0.1972, + "step": 355370 + }, + { + "epoch": 14.72, + "grad_norm": 0.275390625, + "learning_rate": 0.0002573899236547964, + "loss": 0.1575, + "step": 355380 + }, + { + "epoch": 14.72, + "grad_norm": 0.95703125, + "learning_rate": 0.00025737908319219854, + "loss": 0.1947, + "step": 355390 + }, + { + "epoch": 14.72, + "grad_norm": 0.7421875, + "learning_rate": 0.000257368242715714, + "loss": 0.1635, + "step": 355400 + }, + { + "epoch": 14.72, + "grad_norm": 0.578125, + "learning_rate": 0.00025735740222536326, + "loss": 0.1688, + "step": 355410 + }, + { + "epoch": 14.72, + "grad_norm": 0.96484375, + "learning_rate": 0.00025734656172116664, + "loss": 0.2422, + "step": 355420 + }, + { + "epoch": 14.72, + "grad_norm": 1.2109375, + "learning_rate": 0.00025733572120314465, + "loss": 0.1944, + "step": 355430 + }, + { + "epoch": 14.72, + "grad_norm": 1.046875, + "learning_rate": 0.0002573248806713175, + "loss": 0.1903, + "step": 355440 + }, + { + "epoch": 14.72, + "grad_norm": 0.384765625, + "learning_rate": 0.00025731404012570584, + "loss": 0.2394, + "step": 355450 + }, + { + "epoch": 14.72, + "grad_norm": 1.140625, + "learning_rate": 0.0002573031995663299, + "loss": 0.1732, + "step": 355460 + }, + { + "epoch": 14.72, + "grad_norm": 0.96875, + "learning_rate": 0.00025729235899321016, + "loss": 0.2245, + "step": 355470 + }, + { + "epoch": 14.72, + "grad_norm": 1.0625, + "learning_rate": 0.000257281518406367, + "loss": 0.1993, + "step": 355480 + }, + { + "epoch": 14.72, + "grad_norm": 0.40234375, + "learning_rate": 0.00025727067780582073, + "loss": 0.2031, + "step": 355490 + }, + { + "epoch": 14.72, + "grad_norm": 1.4296875, + "learning_rate": 0.00025725983719159185, + "loss": 0.2231, + "step": 355500 + }, + { + "epoch": 14.73, + "grad_norm": 0.91015625, + "learning_rate": 0.0002572489965637008, + "loss": 0.2069, + "step": 355510 + }, + { + "epoch": 14.73, + "grad_norm": 1.5234375, + "learning_rate": 0.0002572381559221679, + "loss": 0.1984, + "step": 355520 + }, + { + "epoch": 14.73, + "grad_norm": 0.9140625, + "learning_rate": 0.0002572273152670135, + "loss": 0.1547, + "step": 355530 + }, + { + "epoch": 14.73, + "grad_norm": 0.22265625, + "learning_rate": 0.00025721647459825815, + "loss": 0.2381, + "step": 355540 + }, + { + "epoch": 14.73, + "grad_norm": 0.65625, + "learning_rate": 0.0002572056339159222, + "loss": 0.1993, + "step": 355550 + }, + { + "epoch": 14.73, + "grad_norm": 1.265625, + "learning_rate": 0.00025719479322002594, + "loss": 0.2338, + "step": 355560 + }, + { + "epoch": 14.73, + "grad_norm": 0.94140625, + "learning_rate": 0.0002571839525105899, + "loss": 0.2272, + "step": 355570 + }, + { + "epoch": 14.73, + "grad_norm": 0.5625, + "learning_rate": 0.0002571731117876344, + "loss": 0.1927, + "step": 355580 + }, + { + "epoch": 14.73, + "grad_norm": 1.15625, + "learning_rate": 0.00025716227105117994, + "loss": 0.2033, + "step": 355590 + }, + { + "epoch": 14.73, + "grad_norm": 0.60546875, + "learning_rate": 0.00025715143030124684, + "loss": 0.1668, + "step": 355600 + }, + { + "epoch": 14.73, + "grad_norm": 0.52734375, + "learning_rate": 0.0002571405895378555, + "loss": 0.1703, + "step": 355610 + }, + { + "epoch": 14.73, + "grad_norm": 0.7890625, + "learning_rate": 0.0002571297487610263, + "loss": 0.2109, + "step": 355620 + }, + { + "epoch": 14.73, + "grad_norm": 0.85546875, + "learning_rate": 0.0002571189079707798, + "loss": 0.1758, + "step": 355630 + }, + { + "epoch": 14.73, + "grad_norm": 1.015625, + "learning_rate": 0.0002571080671671362, + "loss": 0.1969, + "step": 355640 + }, + { + "epoch": 14.73, + "grad_norm": 2.171875, + "learning_rate": 0.00025709722635011603, + "loss": 0.1977, + "step": 355650 + }, + { + "epoch": 14.73, + "grad_norm": 0.6484375, + "learning_rate": 0.00025708638551973957, + "loss": 0.2539, + "step": 355660 + }, + { + "epoch": 14.73, + "grad_norm": 0.369140625, + "learning_rate": 0.00025707554467602737, + "loss": 0.1757, + "step": 355670 + }, + { + "epoch": 14.73, + "grad_norm": 0.5703125, + "learning_rate": 0.0002570647038189997, + "loss": 0.1712, + "step": 355680 + }, + { + "epoch": 14.73, + "grad_norm": 0.318359375, + "learning_rate": 0.00025705386294867707, + "loss": 0.182, + "step": 355690 + }, + { + "epoch": 14.73, + "grad_norm": 1.5625, + "learning_rate": 0.0002570430220650798, + "loss": 0.1656, + "step": 355700 + }, + { + "epoch": 14.73, + "grad_norm": 1.28125, + "learning_rate": 0.00025703218116822834, + "loss": 0.1806, + "step": 355710 + }, + { + "epoch": 14.73, + "grad_norm": 0.60546875, + "learning_rate": 0.00025702134025814313, + "loss": 0.2238, + "step": 355720 + }, + { + "epoch": 14.73, + "grad_norm": 0.89453125, + "learning_rate": 0.0002570104993348444, + "loss": 0.1695, + "step": 355730 + }, + { + "epoch": 14.73, + "grad_norm": 0.5625, + "learning_rate": 0.00025699965839835276, + "loss": 0.2032, + "step": 355740 + }, + { + "epoch": 14.74, + "grad_norm": 0.390625, + "learning_rate": 0.00025698881744868853, + "loss": 0.1664, + "step": 355750 + }, + { + "epoch": 14.74, + "grad_norm": 1.3515625, + "learning_rate": 0.000256977976485872, + "loss": 0.1471, + "step": 355760 + }, + { + "epoch": 14.74, + "grad_norm": 0.67578125, + "learning_rate": 0.00025696713550992374, + "loss": 0.1971, + "step": 355770 + }, + { + "epoch": 14.74, + "grad_norm": 1.875, + "learning_rate": 0.00025695629452086405, + "loss": 0.1904, + "step": 355780 + }, + { + "epoch": 14.74, + "grad_norm": 0.486328125, + "learning_rate": 0.0002569454535187134, + "loss": 0.225, + "step": 355790 + }, + { + "epoch": 14.74, + "grad_norm": 0.765625, + "learning_rate": 0.00025693461250349216, + "loss": 0.2718, + "step": 355800 + }, + { + "epoch": 14.74, + "grad_norm": 1.03125, + "learning_rate": 0.0002569237714752207, + "loss": 0.2055, + "step": 355810 + }, + { + "epoch": 14.74, + "grad_norm": 0.96484375, + "learning_rate": 0.00025691293043391945, + "loss": 0.1739, + "step": 355820 + }, + { + "epoch": 14.74, + "grad_norm": 2.0, + "learning_rate": 0.0002569020893796088, + "loss": 0.2021, + "step": 355830 + }, + { + "epoch": 14.74, + "grad_norm": 2.0, + "learning_rate": 0.0002568912483123092, + "loss": 0.209, + "step": 355840 + }, + { + "epoch": 14.74, + "grad_norm": 0.53515625, + "learning_rate": 0.000256880407232041, + "loss": 0.1658, + "step": 355850 + }, + { + "epoch": 14.74, + "grad_norm": 1.4453125, + "learning_rate": 0.00025686956613882463, + "loss": 0.15, + "step": 355860 + }, + { + "epoch": 14.74, + "grad_norm": 1.203125, + "learning_rate": 0.0002568587250326805, + "loss": 0.2236, + "step": 355870 + }, + { + "epoch": 14.74, + "grad_norm": 1.703125, + "learning_rate": 0.00025684788391362897, + "loss": 0.166, + "step": 355880 + }, + { + "epoch": 14.74, + "grad_norm": 0.859375, + "learning_rate": 0.0002568370427816904, + "loss": 0.1796, + "step": 355890 + }, + { + "epoch": 14.74, + "grad_norm": 0.7109375, + "learning_rate": 0.0002568262016368853, + "loss": 0.146, + "step": 355900 + }, + { + "epoch": 14.74, + "grad_norm": 0.68359375, + "learning_rate": 0.00025681536047923406, + "loss": 0.1677, + "step": 355910 + }, + { + "epoch": 14.74, + "grad_norm": 0.55859375, + "learning_rate": 0.000256804519308757, + "loss": 0.1777, + "step": 355920 + }, + { + "epoch": 14.74, + "grad_norm": 0.546875, + "learning_rate": 0.0002567936781254746, + "loss": 0.169, + "step": 355930 + }, + { + "epoch": 14.74, + "grad_norm": 0.6484375, + "learning_rate": 0.0002567828369294072, + "loss": 0.2335, + "step": 355940 + }, + { + "epoch": 14.74, + "grad_norm": 0.8359375, + "learning_rate": 0.00025677199572057526, + "loss": 0.1496, + "step": 355950 + }, + { + "epoch": 14.74, + "grad_norm": 0.60546875, + "learning_rate": 0.0002567611544989992, + "loss": 0.2058, + "step": 355960 + }, + { + "epoch": 14.74, + "grad_norm": 1.53125, + "learning_rate": 0.00025675031326469926, + "loss": 0.2247, + "step": 355970 + }, + { + "epoch": 14.74, + "grad_norm": 0.79296875, + "learning_rate": 0.00025673947201769606, + "loss": 0.1725, + "step": 355980 + }, + { + "epoch": 14.75, + "grad_norm": 0.427734375, + "learning_rate": 0.0002567286307580098, + "loss": 0.1899, + "step": 355990 + }, + { + "epoch": 14.75, + "grad_norm": 0.48828125, + "learning_rate": 0.00025671778948566104, + "loss": 0.1709, + "step": 356000 + }, + { + "epoch": 14.75, + "grad_norm": 2.703125, + "learning_rate": 0.0002567069482006702, + "loss": 0.2108, + "step": 356010 + }, + { + "epoch": 14.75, + "grad_norm": 0.94140625, + "learning_rate": 0.00025669610690305753, + "loss": 0.1832, + "step": 356020 + }, + { + "epoch": 14.75, + "grad_norm": 2.3125, + "learning_rate": 0.0002566852655928435, + "loss": 0.1904, + "step": 356030 + }, + { + "epoch": 14.75, + "grad_norm": 0.3671875, + "learning_rate": 0.00025667442427004854, + "loss": 0.1735, + "step": 356040 + }, + { + "epoch": 14.75, + "grad_norm": 0.63671875, + "learning_rate": 0.000256663582934693, + "loss": 0.1965, + "step": 356050 + }, + { + "epoch": 14.75, + "grad_norm": 1.2578125, + "learning_rate": 0.0002566527415867974, + "loss": 0.1779, + "step": 356060 + }, + { + "epoch": 14.75, + "grad_norm": 1.3359375, + "learning_rate": 0.000256641900226382, + "loss": 0.2187, + "step": 356070 + }, + { + "epoch": 14.75, + "grad_norm": 0.69140625, + "learning_rate": 0.0002566310588534673, + "loss": 0.2028, + "step": 356080 + }, + { + "epoch": 14.75, + "grad_norm": 0.244140625, + "learning_rate": 0.0002566202174680736, + "loss": 0.164, + "step": 356090 + }, + { + "epoch": 14.75, + "grad_norm": 0.5703125, + "learning_rate": 0.0002566093760702214, + "loss": 0.1655, + "step": 356100 + }, + { + "epoch": 14.75, + "grad_norm": 0.53125, + "learning_rate": 0.00025659853465993105, + "loss": 0.2397, + "step": 356110 + }, + { + "epoch": 14.75, + "grad_norm": 1.546875, + "learning_rate": 0.000256587693237223, + "loss": 0.2136, + "step": 356120 + }, + { + "epoch": 14.75, + "grad_norm": 1.3046875, + "learning_rate": 0.0002565768518021176, + "loss": 0.1973, + "step": 356130 + }, + { + "epoch": 14.75, + "grad_norm": 1.203125, + "learning_rate": 0.00025656601035463534, + "loss": 0.1522, + "step": 356140 + }, + { + "epoch": 14.75, + "grad_norm": 0.734375, + "learning_rate": 0.0002565551688947965, + "loss": 0.1961, + "step": 356150 + }, + { + "epoch": 14.75, + "grad_norm": 0.65234375, + "learning_rate": 0.0002565443274226215, + "loss": 0.2191, + "step": 356160 + }, + { + "epoch": 14.75, + "grad_norm": 2.453125, + "learning_rate": 0.00025653348593813086, + "loss": 0.1947, + "step": 356170 + }, + { + "epoch": 14.75, + "grad_norm": 0.7890625, + "learning_rate": 0.0002565226444413449, + "loss": 0.1953, + "step": 356180 + }, + { + "epoch": 14.75, + "grad_norm": 0.64453125, + "learning_rate": 0.000256511802932284, + "loss": 0.1899, + "step": 356190 + }, + { + "epoch": 14.75, + "grad_norm": 0.484375, + "learning_rate": 0.00025650096141096863, + "loss": 0.1669, + "step": 356200 + }, + { + "epoch": 14.75, + "grad_norm": 0.91796875, + "learning_rate": 0.000256490119877419, + "loss": 0.1935, + "step": 356210 + }, + { + "epoch": 14.75, + "grad_norm": 0.314453125, + "learning_rate": 0.00025647927833165584, + "loss": 0.1898, + "step": 356220 + }, + { + "epoch": 14.76, + "grad_norm": 0.486328125, + "learning_rate": 0.0002564684367736993, + "loss": 0.2043, + "step": 356230 + }, + { + "epoch": 14.76, + "grad_norm": 0.89453125, + "learning_rate": 0.0002564575952035699, + "loss": 0.2151, + "step": 356240 + }, + { + "epoch": 14.76, + "grad_norm": 1.59375, + "learning_rate": 0.000256446753621288, + "loss": 0.1763, + "step": 356250 + }, + { + "epoch": 14.76, + "grad_norm": 2.484375, + "learning_rate": 0.00025643591202687395, + "loss": 0.1819, + "step": 356260 + }, + { + "epoch": 14.76, + "grad_norm": 1.2578125, + "learning_rate": 0.0002564250704203482, + "loss": 0.2079, + "step": 356270 + }, + { + "epoch": 14.76, + "grad_norm": 0.59765625, + "learning_rate": 0.0002564142288017313, + "loss": 0.2174, + "step": 356280 + }, + { + "epoch": 14.76, + "grad_norm": 0.85546875, + "learning_rate": 0.0002564033871710433, + "loss": 0.1827, + "step": 356290 + }, + { + "epoch": 14.76, + "grad_norm": 2.140625, + "learning_rate": 0.000256392545528305, + "loss": 0.1949, + "step": 356300 + }, + { + "epoch": 14.76, + "grad_norm": 0.90234375, + "learning_rate": 0.0002563817038735365, + "loss": 0.2229, + "step": 356310 + }, + { + "epoch": 14.76, + "grad_norm": 0.62890625, + "learning_rate": 0.0002563708622067584, + "loss": 0.2014, + "step": 356320 + }, + { + "epoch": 14.76, + "grad_norm": 1.0859375, + "learning_rate": 0.00025636002052799106, + "loss": 0.2272, + "step": 356330 + }, + { + "epoch": 14.76, + "grad_norm": 0.76171875, + "learning_rate": 0.00025634917883725475, + "loss": 0.1939, + "step": 356340 + }, + { + "epoch": 14.76, + "grad_norm": 0.56640625, + "learning_rate": 0.00025633833713457004, + "loss": 0.2386, + "step": 356350 + }, + { + "epoch": 14.76, + "grad_norm": 0.75390625, + "learning_rate": 0.0002563274954199573, + "loss": 0.2035, + "step": 356360 + }, + { + "epoch": 14.76, + "grad_norm": 1.5078125, + "learning_rate": 0.00025631665369343676, + "loss": 0.1817, + "step": 356370 + }, + { + "epoch": 14.76, + "grad_norm": 1.0703125, + "learning_rate": 0.00025630581195502906, + "loss": 0.1957, + "step": 356380 + }, + { + "epoch": 14.76, + "grad_norm": 0.404296875, + "learning_rate": 0.00025629497020475444, + "loss": 0.186, + "step": 356390 + }, + { + "epoch": 14.76, + "grad_norm": 3.453125, + "learning_rate": 0.00025628412844263344, + "loss": 0.1756, + "step": 356400 + }, + { + "epoch": 14.76, + "grad_norm": 2.109375, + "learning_rate": 0.00025627328666868635, + "loss": 0.2081, + "step": 356410 + }, + { + "epoch": 14.76, + "grad_norm": 0.98046875, + "learning_rate": 0.0002562624448829336, + "loss": 0.1434, + "step": 356420 + }, + { + "epoch": 14.76, + "grad_norm": 1.0546875, + "learning_rate": 0.0002562516030853956, + "loss": 0.1596, + "step": 356430 + }, + { + "epoch": 14.76, + "grad_norm": 0.71484375, + "learning_rate": 0.00025624076127609286, + "loss": 0.2222, + "step": 356440 + }, + { + "epoch": 14.76, + "grad_norm": 1.0859375, + "learning_rate": 0.00025622991945504554, + "loss": 0.221, + "step": 356450 + }, + { + "epoch": 14.76, + "grad_norm": 0.376953125, + "learning_rate": 0.00025621907762227424, + "loss": 0.1673, + "step": 356460 + }, + { + "epoch": 14.76, + "grad_norm": 0.7109375, + "learning_rate": 0.0002562082357777993, + "loss": 0.2016, + "step": 356470 + }, + { + "epoch": 14.77, + "grad_norm": 0.46875, + "learning_rate": 0.00025619739392164113, + "loss": 0.1618, + "step": 356480 + }, + { + "epoch": 14.77, + "grad_norm": 0.69921875, + "learning_rate": 0.0002561865520538202, + "loss": 0.186, + "step": 356490 + }, + { + "epoch": 14.77, + "grad_norm": 0.24609375, + "learning_rate": 0.00025617571017435675, + "loss": 0.2468, + "step": 356500 + }, + { + "epoch": 14.77, + "grad_norm": 0.83203125, + "learning_rate": 0.00025616486828327137, + "loss": 0.1639, + "step": 356510 + }, + { + "epoch": 14.77, + "grad_norm": 1.2265625, + "learning_rate": 0.0002561540263805843, + "loss": 0.1726, + "step": 356520 + }, + { + "epoch": 14.77, + "grad_norm": 3.34375, + "learning_rate": 0.000256143184466316, + "loss": 0.1908, + "step": 356530 + }, + { + "epoch": 14.77, + "grad_norm": 1.3125, + "learning_rate": 0.00025613234254048703, + "loss": 0.2306, + "step": 356540 + }, + { + "epoch": 14.77, + "grad_norm": 0.98828125, + "learning_rate": 0.0002561215006031175, + "loss": 0.2137, + "step": 356550 + }, + { + "epoch": 14.77, + "grad_norm": 0.53515625, + "learning_rate": 0.000256110658654228, + "loss": 0.2349, + "step": 356560 + }, + { + "epoch": 14.77, + "grad_norm": 0.6796875, + "learning_rate": 0.000256099816693839, + "loss": 0.1516, + "step": 356570 + }, + { + "epoch": 14.77, + "grad_norm": 1.234375, + "learning_rate": 0.00025608897472197064, + "loss": 0.2106, + "step": 356580 + }, + { + "epoch": 14.77, + "grad_norm": 1.1015625, + "learning_rate": 0.0002560781327386436, + "loss": 0.2073, + "step": 356590 + }, + { + "epoch": 14.77, + "grad_norm": 0.94140625, + "learning_rate": 0.00025606729074387815, + "loss": 0.1946, + "step": 356600 + }, + { + "epoch": 14.77, + "grad_norm": 1.015625, + "learning_rate": 0.0002560564487376947, + "loss": 0.2431, + "step": 356610 + }, + { + "epoch": 14.77, + "grad_norm": 0.49609375, + "learning_rate": 0.00025604560672011376, + "loss": 0.1599, + "step": 356620 + }, + { + "epoch": 14.77, + "grad_norm": 1.65625, + "learning_rate": 0.0002560347646911555, + "loss": 0.1811, + "step": 356630 + }, + { + "epoch": 14.77, + "grad_norm": 0.515625, + "learning_rate": 0.0002560239226508405, + "loss": 0.1752, + "step": 356640 + }, + { + "epoch": 14.77, + "grad_norm": 1.6171875, + "learning_rate": 0.0002560130805991892, + "loss": 0.2709, + "step": 356650 + }, + { + "epoch": 14.77, + "grad_norm": 1.0234375, + "learning_rate": 0.00025600223853622183, + "loss": 0.169, + "step": 356660 + }, + { + "epoch": 14.77, + "grad_norm": 0.6328125, + "learning_rate": 0.000255991396461959, + "loss": 0.1607, + "step": 356670 + }, + { + "epoch": 14.77, + "grad_norm": 1.03125, + "learning_rate": 0.0002559805543764209, + "loss": 0.2227, + "step": 356680 + }, + { + "epoch": 14.77, + "grad_norm": 0.78125, + "learning_rate": 0.0002559697122796281, + "loss": 0.1868, + "step": 356690 + }, + { + "epoch": 14.77, + "grad_norm": 0.84375, + "learning_rate": 0.00025595887017160104, + "loss": 0.1661, + "step": 356700 + }, + { + "epoch": 14.77, + "grad_norm": 0.6953125, + "learning_rate": 0.0002559480280523599, + "loss": 0.2021, + "step": 356710 + }, + { + "epoch": 14.78, + "grad_norm": 0.91015625, + "learning_rate": 0.00025593718592192523, + "loss": 0.1325, + "step": 356720 + }, + { + "epoch": 14.78, + "grad_norm": 0.421875, + "learning_rate": 0.0002559263437803175, + "loss": 0.2048, + "step": 356730 + }, + { + "epoch": 14.78, + "grad_norm": 0.91796875, + "learning_rate": 0.0002559155016275569, + "loss": 0.1959, + "step": 356740 + }, + { + "epoch": 14.78, + "grad_norm": 2.625, + "learning_rate": 0.000255904659463664, + "loss": 0.2094, + "step": 356750 + }, + { + "epoch": 14.78, + "grad_norm": 0.7421875, + "learning_rate": 0.00025589381728865926, + "loss": 0.1875, + "step": 356760 + }, + { + "epoch": 14.78, + "grad_norm": 1.5078125, + "learning_rate": 0.00025588297510256285, + "loss": 0.2144, + "step": 356770 + }, + { + "epoch": 14.78, + "grad_norm": 0.82421875, + "learning_rate": 0.00025587213290539544, + "loss": 0.2043, + "step": 356780 + }, + { + "epoch": 14.78, + "grad_norm": 0.72265625, + "learning_rate": 0.0002558612906971773, + "loss": 0.1949, + "step": 356790 + }, + { + "epoch": 14.78, + "grad_norm": 0.52734375, + "learning_rate": 0.00025585044847792874, + "loss": 0.1878, + "step": 356800 + }, + { + "epoch": 14.78, + "grad_norm": 2.9375, + "learning_rate": 0.0002558396062476704, + "loss": 0.1834, + "step": 356810 + }, + { + "epoch": 14.78, + "grad_norm": 0.66796875, + "learning_rate": 0.00025582876400642246, + "loss": 0.2032, + "step": 356820 + }, + { + "epoch": 14.78, + "grad_norm": 0.345703125, + "learning_rate": 0.0002558179217542055, + "loss": 0.2141, + "step": 356830 + }, + { + "epoch": 14.78, + "grad_norm": 1.2109375, + "learning_rate": 0.00025580707949103977, + "loss": 0.1958, + "step": 356840 + }, + { + "epoch": 14.78, + "grad_norm": 0.96875, + "learning_rate": 0.0002557962372169457, + "loss": 0.2023, + "step": 356850 + }, + { + "epoch": 14.78, + "grad_norm": 0.90234375, + "learning_rate": 0.0002557853949319439, + "loss": 0.2005, + "step": 356860 + }, + { + "epoch": 14.78, + "grad_norm": 1.3515625, + "learning_rate": 0.0002557745526360545, + "loss": 0.1502, + "step": 356870 + }, + { + "epoch": 14.78, + "grad_norm": 0.87890625, + "learning_rate": 0.000255763710329298, + "loss": 0.2049, + "step": 356880 + }, + { + "epoch": 14.78, + "grad_norm": 0.9453125, + "learning_rate": 0.00025575286801169484, + "loss": 0.1726, + "step": 356890 + }, + { + "epoch": 14.78, + "grad_norm": 0.6015625, + "learning_rate": 0.0002557420256832654, + "loss": 0.1616, + "step": 356900 + }, + { + "epoch": 14.78, + "grad_norm": 0.46875, + "learning_rate": 0.0002557311833440301, + "loss": 0.2082, + "step": 356910 + }, + { + "epoch": 14.78, + "grad_norm": 1.28125, + "learning_rate": 0.0002557203409940093, + "loss": 0.1667, + "step": 356920 + }, + { + "epoch": 14.78, + "grad_norm": 3.03125, + "learning_rate": 0.0002557094986332235, + "loss": 0.2062, + "step": 356930 + }, + { + "epoch": 14.78, + "grad_norm": 0.5859375, + "learning_rate": 0.00025569865626169305, + "loss": 0.142, + "step": 356940 + }, + { + "epoch": 14.78, + "grad_norm": 0.6875, + "learning_rate": 0.00025568781387943823, + "loss": 0.2169, + "step": 356950 + }, + { + "epoch": 14.79, + "grad_norm": 1.015625, + "learning_rate": 0.0002556769714864796, + "loss": 0.1385, + "step": 356960 + }, + { + "epoch": 14.79, + "grad_norm": 0.439453125, + "learning_rate": 0.0002556661290828376, + "loss": 0.2275, + "step": 356970 + }, + { + "epoch": 14.79, + "grad_norm": 0.78125, + "learning_rate": 0.0002556552866685325, + "loss": 0.1696, + "step": 356980 + }, + { + "epoch": 14.79, + "grad_norm": 0.875, + "learning_rate": 0.0002556444442435848, + "loss": 0.2199, + "step": 356990 + }, + { + "epoch": 14.79, + "grad_norm": 0.44140625, + "learning_rate": 0.00025563360180801484, + "loss": 0.1774, + "step": 357000 + }, + { + "epoch": 14.79, + "grad_norm": 0.44140625, + "learning_rate": 0.000255622759361843, + "loss": 0.2024, + "step": 357010 + }, + { + "epoch": 14.79, + "grad_norm": 1.2734375, + "learning_rate": 0.00025561191690508977, + "loss": 0.186, + "step": 357020 + }, + { + "epoch": 14.79, + "grad_norm": 1.375, + "learning_rate": 0.00025560107443777555, + "loss": 0.205, + "step": 357030 + }, + { + "epoch": 14.79, + "grad_norm": 2.1875, + "learning_rate": 0.00025559023195992066, + "loss": 0.1886, + "step": 357040 + }, + { + "epoch": 14.79, + "grad_norm": 0.48828125, + "learning_rate": 0.0002555793894715456, + "loss": 0.1538, + "step": 357050 + }, + { + "epoch": 14.79, + "grad_norm": 0.6796875, + "learning_rate": 0.0002555685469726707, + "loss": 0.1753, + "step": 357060 + }, + { + "epoch": 14.79, + "grad_norm": 0.703125, + "learning_rate": 0.00025555770446331646, + "loss": 0.1911, + "step": 357070 + }, + { + "epoch": 14.79, + "grad_norm": 0.94140625, + "learning_rate": 0.0002555468619435031, + "loss": 0.178, + "step": 357080 + }, + { + "epoch": 14.79, + "grad_norm": 0.6015625, + "learning_rate": 0.0002555360194132513, + "loss": 0.189, + "step": 357090 + }, + { + "epoch": 14.79, + "grad_norm": 1.5390625, + "learning_rate": 0.00025552517687258113, + "loss": 0.209, + "step": 357100 + }, + { + "epoch": 14.79, + "grad_norm": 0.83203125, + "learning_rate": 0.0002555143343215133, + "loss": 0.1585, + "step": 357110 + }, + { + "epoch": 14.79, + "grad_norm": 0.6875, + "learning_rate": 0.0002555034917600681, + "loss": 0.1458, + "step": 357120 + }, + { + "epoch": 14.79, + "grad_norm": 5.6875, + "learning_rate": 0.0002554926491882659, + "loss": 0.1951, + "step": 357130 + }, + { + "epoch": 14.79, + "grad_norm": 0.65625, + "learning_rate": 0.00025548180660612705, + "loss": 0.1683, + "step": 357140 + }, + { + "epoch": 14.79, + "grad_norm": 0.70703125, + "learning_rate": 0.00025547096401367216, + "loss": 0.1863, + "step": 357150 + }, + { + "epoch": 14.79, + "grad_norm": 0.68359375, + "learning_rate": 0.0002554601214109214, + "loss": 0.1631, + "step": 357160 + }, + { + "epoch": 14.79, + "grad_norm": 0.83984375, + "learning_rate": 0.0002554492787978953, + "loss": 0.1641, + "step": 357170 + }, + { + "epoch": 14.79, + "grad_norm": 1.2109375, + "learning_rate": 0.00025543843617461425, + "loss": 0.2125, + "step": 357180 + }, + { + "epoch": 14.79, + "grad_norm": 1.09375, + "learning_rate": 0.0002554275935410987, + "loss": 0.2507, + "step": 357190 + }, + { + "epoch": 14.8, + "grad_norm": 2.203125, + "learning_rate": 0.000255416750897369, + "loss": 0.2005, + "step": 357200 + }, + { + "epoch": 14.8, + "grad_norm": 0.5625, + "learning_rate": 0.0002554059082434455, + "loss": 0.1921, + "step": 357210 + }, + { + "epoch": 14.8, + "grad_norm": 0.84375, + "learning_rate": 0.00025539506557934865, + "loss": 0.1873, + "step": 357220 + }, + { + "epoch": 14.8, + "grad_norm": 0.78515625, + "learning_rate": 0.00025538422290509894, + "loss": 0.1965, + "step": 357230 + }, + { + "epoch": 14.8, + "grad_norm": 1.40625, + "learning_rate": 0.0002553733802207167, + "loss": 0.2496, + "step": 357240 + }, + { + "epoch": 14.8, + "grad_norm": 0.95703125, + "learning_rate": 0.0002553625375262223, + "loss": 0.2048, + "step": 357250 + }, + { + "epoch": 14.8, + "grad_norm": 0.87109375, + "learning_rate": 0.0002553516948216362, + "loss": 0.1757, + "step": 357260 + }, + { + "epoch": 14.8, + "grad_norm": 0.7734375, + "learning_rate": 0.00025534085210697884, + "loss": 0.1963, + "step": 357270 + }, + { + "epoch": 14.8, + "grad_norm": 0.279296875, + "learning_rate": 0.0002553300093822705, + "loss": 0.1736, + "step": 357280 + }, + { + "epoch": 14.8, + "grad_norm": 0.7734375, + "learning_rate": 0.0002553191666475317, + "loss": 0.1986, + "step": 357290 + }, + { + "epoch": 14.8, + "grad_norm": 0.375, + "learning_rate": 0.00025530832390278273, + "loss": 0.1745, + "step": 357300 + }, + { + "epoch": 14.8, + "grad_norm": 0.390625, + "learning_rate": 0.00025529748114804413, + "loss": 0.1966, + "step": 357310 + }, + { + "epoch": 14.8, + "grad_norm": 1.0625, + "learning_rate": 0.0002552866383833363, + "loss": 0.2016, + "step": 357320 + }, + { + "epoch": 14.8, + "grad_norm": 1.0, + "learning_rate": 0.0002552757956086795, + "loss": 0.1569, + "step": 357330 + }, + { + "epoch": 14.8, + "grad_norm": 1.5703125, + "learning_rate": 0.0002552649528240942, + "loss": 0.1934, + "step": 357340 + }, + { + "epoch": 14.8, + "grad_norm": 0.57421875, + "learning_rate": 0.00025525411002960094, + "loss": 0.1835, + "step": 357350 + }, + { + "epoch": 14.8, + "grad_norm": 0.33984375, + "learning_rate": 0.00025524326722521987, + "loss": 0.1364, + "step": 357360 + }, + { + "epoch": 14.8, + "grad_norm": 0.703125, + "learning_rate": 0.00025523242441097163, + "loss": 0.1822, + "step": 357370 + }, + { + "epoch": 14.8, + "grad_norm": 0.7109375, + "learning_rate": 0.00025522158158687645, + "loss": 0.1839, + "step": 357380 + }, + { + "epoch": 14.8, + "grad_norm": 1.625, + "learning_rate": 0.00025521073875295493, + "loss": 0.1591, + "step": 357390 + }, + { + "epoch": 14.8, + "grad_norm": 0.57421875, + "learning_rate": 0.00025519989590922735, + "loss": 0.1624, + "step": 357400 + }, + { + "epoch": 14.8, + "grad_norm": 0.59375, + "learning_rate": 0.000255189053055714, + "loss": 0.1298, + "step": 357410 + }, + { + "epoch": 14.8, + "grad_norm": 0.7890625, + "learning_rate": 0.0002551782101924355, + "loss": 0.1704, + "step": 357420 + }, + { + "epoch": 14.8, + "grad_norm": 1.0, + "learning_rate": 0.00025516736731941224, + "loss": 0.1375, + "step": 357430 + }, + { + "epoch": 14.81, + "grad_norm": 1.6015625, + "learning_rate": 0.00025515652443666443, + "loss": 0.1806, + "step": 357440 + }, + { + "epoch": 14.81, + "grad_norm": 0.640625, + "learning_rate": 0.0002551456815442127, + "loss": 0.1992, + "step": 357450 + }, + { + "epoch": 14.81, + "grad_norm": 0.796875, + "learning_rate": 0.00025513483864207724, + "loss": 0.178, + "step": 357460 + }, + { + "epoch": 14.81, + "grad_norm": 0.34765625, + "learning_rate": 0.00025512399573027867, + "loss": 0.1822, + "step": 357470 + }, + { + "epoch": 14.81, + "grad_norm": 1.21875, + "learning_rate": 0.0002551131528088373, + "loss": 0.2055, + "step": 357480 + }, + { + "epoch": 14.81, + "grad_norm": 0.921875, + "learning_rate": 0.00025510230987777345, + "loss": 0.1824, + "step": 357490 + }, + { + "epoch": 14.81, + "grad_norm": 0.68359375, + "learning_rate": 0.0002550914669371076, + "loss": 0.1778, + "step": 357500 + }, + { + "epoch": 14.81, + "grad_norm": 0.87890625, + "learning_rate": 0.00025508062398686026, + "loss": 0.1857, + "step": 357510 + }, + { + "epoch": 14.81, + "grad_norm": 0.3359375, + "learning_rate": 0.0002550697810270516, + "loss": 0.1745, + "step": 357520 + }, + { + "epoch": 14.81, + "grad_norm": 0.93359375, + "learning_rate": 0.0002550589380577023, + "loss": 0.1752, + "step": 357530 + }, + { + "epoch": 14.81, + "grad_norm": 0.94140625, + "learning_rate": 0.0002550480950788325, + "loss": 0.1708, + "step": 357540 + }, + { + "epoch": 14.81, + "grad_norm": 0.7734375, + "learning_rate": 0.0002550372520904628, + "loss": 0.1823, + "step": 357550 + }, + { + "epoch": 14.81, + "grad_norm": 1.7109375, + "learning_rate": 0.00025502640909261357, + "loss": 0.2205, + "step": 357560 + }, + { + "epoch": 14.81, + "grad_norm": 2.09375, + "learning_rate": 0.0002550155660853051, + "loss": 0.2176, + "step": 357570 + }, + { + "epoch": 14.81, + "grad_norm": 0.73828125, + "learning_rate": 0.00025500472306855786, + "loss": 0.1872, + "step": 357580 + }, + { + "epoch": 14.81, + "grad_norm": 0.609375, + "learning_rate": 0.0002549938800423924, + "loss": 0.209, + "step": 357590 + }, + { + "epoch": 14.81, + "grad_norm": 0.4765625, + "learning_rate": 0.00025498303700682883, + "loss": 0.1327, + "step": 357600 + }, + { + "epoch": 14.81, + "grad_norm": 1.34375, + "learning_rate": 0.00025497219396188785, + "loss": 0.2371, + "step": 357610 + }, + { + "epoch": 14.81, + "grad_norm": 0.0, + "learning_rate": 0.0002549613509075897, + "loss": 0.1968, + "step": 357620 + }, + { + "epoch": 14.81, + "grad_norm": 0.515625, + "learning_rate": 0.0002549505078439548, + "loss": 0.1804, + "step": 357630 + }, + { + "epoch": 14.81, + "grad_norm": 2.421875, + "learning_rate": 0.0002549396647710037, + "loss": 0.1502, + "step": 357640 + }, + { + "epoch": 14.81, + "grad_norm": 0.58203125, + "learning_rate": 0.00025492882168875645, + "loss": 0.201, + "step": 357650 + }, + { + "epoch": 14.81, + "grad_norm": 0.85546875, + "learning_rate": 0.00025491797859723386, + "loss": 0.1757, + "step": 357660 + }, + { + "epoch": 14.81, + "grad_norm": 0.71484375, + "learning_rate": 0.0002549071354964561, + "loss": 0.1844, + "step": 357670 + }, + { + "epoch": 14.82, + "grad_norm": 1.3828125, + "learning_rate": 0.0002548962923864437, + "loss": 0.2078, + "step": 357680 + }, + { + "epoch": 14.82, + "grad_norm": 0.57421875, + "learning_rate": 0.000254885449267217, + "loss": 0.1397, + "step": 357690 + }, + { + "epoch": 14.82, + "grad_norm": 1.6015625, + "learning_rate": 0.00025487460613879637, + "loss": 0.161, + "step": 357700 + }, + { + "epoch": 14.82, + "grad_norm": 1.3359375, + "learning_rate": 0.0002548637630012023, + "loss": 0.2075, + "step": 357710 + }, + { + "epoch": 14.82, + "grad_norm": 1.34375, + "learning_rate": 0.00025485291985445516, + "loss": 0.1777, + "step": 357720 + }, + { + "epoch": 14.82, + "grad_norm": 0.486328125, + "learning_rate": 0.00025484207669857525, + "loss": 0.2209, + "step": 357730 + }, + { + "epoch": 14.82, + "grad_norm": 1.2109375, + "learning_rate": 0.00025483123353358316, + "loss": 0.138, + "step": 357740 + }, + { + "epoch": 14.82, + "grad_norm": 1.59375, + "learning_rate": 0.0002548203903594992, + "loss": 0.1962, + "step": 357750 + }, + { + "epoch": 14.82, + "grad_norm": 0.49609375, + "learning_rate": 0.0002548095471763437, + "loss": 0.204, + "step": 357760 + }, + { + "epoch": 14.82, + "grad_norm": 0.75390625, + "learning_rate": 0.00025479870398413735, + "loss": 0.1997, + "step": 357770 + }, + { + "epoch": 14.82, + "grad_norm": 0.765625, + "learning_rate": 0.00025478786078290017, + "loss": 0.1788, + "step": 357780 + }, + { + "epoch": 14.82, + "grad_norm": 0.5859375, + "learning_rate": 0.00025477701757265284, + "loss": 0.208, + "step": 357790 + }, + { + "epoch": 14.82, + "grad_norm": 0.94140625, + "learning_rate": 0.0002547661743534157, + "loss": 0.1991, + "step": 357800 + }, + { + "epoch": 14.82, + "grad_norm": 0.91015625, + "learning_rate": 0.000254755331125209, + "loss": 0.2407, + "step": 357810 + }, + { + "epoch": 14.82, + "grad_norm": 0.6015625, + "learning_rate": 0.0002547444878880534, + "loss": 0.1747, + "step": 357820 + }, + { + "epoch": 14.82, + "grad_norm": 1.2109375, + "learning_rate": 0.0002547336446419692, + "loss": 0.1873, + "step": 357830 + }, + { + "epoch": 14.82, + "grad_norm": 0.326171875, + "learning_rate": 0.0002547228013869767, + "loss": 0.1824, + "step": 357840 + }, + { + "epoch": 14.82, + "grad_norm": 0.8125, + "learning_rate": 0.0002547119581230965, + "loss": 0.2136, + "step": 357850 + }, + { + "epoch": 14.82, + "grad_norm": 1.03125, + "learning_rate": 0.0002547011148503489, + "loss": 0.1938, + "step": 357860 + }, + { + "epoch": 14.82, + "grad_norm": 0.89453125, + "learning_rate": 0.0002546902715687542, + "loss": 0.2412, + "step": 357870 + }, + { + "epoch": 14.82, + "grad_norm": 0.66796875, + "learning_rate": 0.000254679428278333, + "loss": 0.1239, + "step": 357880 + }, + { + "epoch": 14.82, + "grad_norm": 0.6640625, + "learning_rate": 0.00025466858497910557, + "loss": 0.1963, + "step": 357890 + }, + { + "epoch": 14.82, + "grad_norm": 0.81640625, + "learning_rate": 0.0002546577416710924, + "loss": 0.2009, + "step": 357900 + }, + { + "epoch": 14.82, + "grad_norm": 1.1796875, + "learning_rate": 0.00025464689835431387, + "loss": 0.2095, + "step": 357910 + }, + { + "epoch": 14.83, + "grad_norm": 0.375, + "learning_rate": 0.0002546360550287903, + "loss": 0.1574, + "step": 357920 + }, + { + "epoch": 14.83, + "grad_norm": 0.6796875, + "learning_rate": 0.00025462521169454234, + "loss": 0.2021, + "step": 357930 + }, + { + "epoch": 14.83, + "grad_norm": 1.7890625, + "learning_rate": 0.00025461436835159013, + "loss": 0.1816, + "step": 357940 + }, + { + "epoch": 14.83, + "grad_norm": 0.48046875, + "learning_rate": 0.00025460352499995416, + "loss": 0.1524, + "step": 357950 + }, + { + "epoch": 14.83, + "grad_norm": 2.203125, + "learning_rate": 0.0002545926816396549, + "loss": 0.2689, + "step": 357960 + }, + { + "epoch": 14.83, + "grad_norm": 0.423828125, + "learning_rate": 0.00025458183827071264, + "loss": 0.1531, + "step": 357970 + }, + { + "epoch": 14.83, + "grad_norm": 0.71484375, + "learning_rate": 0.0002545709948931479, + "loss": 0.1802, + "step": 357980 + }, + { + "epoch": 14.83, + "grad_norm": 0.76953125, + "learning_rate": 0.0002545601515069811, + "loss": 0.2532, + "step": 357990 + }, + { + "epoch": 14.83, + "grad_norm": 1.1015625, + "learning_rate": 0.00025454930811223247, + "loss": 0.1528, + "step": 358000 + }, + { + "epoch": 14.83, + "grad_norm": 0.5546875, + "learning_rate": 0.00025453846470892266, + "loss": 0.1681, + "step": 358010 + }, + { + "epoch": 14.83, + "grad_norm": 0.458984375, + "learning_rate": 0.00025452762129707184, + "loss": 0.1394, + "step": 358020 + }, + { + "epoch": 14.83, + "grad_norm": 0.625, + "learning_rate": 0.0002545167778767006, + "loss": 0.2256, + "step": 358030 + }, + { + "epoch": 14.83, + "grad_norm": 0.5234375, + "learning_rate": 0.00025450593444782926, + "loss": 0.1899, + "step": 358040 + }, + { + "epoch": 14.83, + "grad_norm": 0.6484375, + "learning_rate": 0.0002544950910104781, + "loss": 0.1637, + "step": 358050 + }, + { + "epoch": 14.83, + "grad_norm": 0.54296875, + "learning_rate": 0.00025448424756466785, + "loss": 0.1764, + "step": 358060 + }, + { + "epoch": 14.83, + "grad_norm": 0.00066375732421875, + "learning_rate": 0.00025447340411041865, + "loss": 0.1723, + "step": 358070 + }, + { + "epoch": 14.83, + "grad_norm": 1.0078125, + "learning_rate": 0.000254462560647751, + "loss": 0.2168, + "step": 358080 + }, + { + "epoch": 14.83, + "grad_norm": 0.58203125, + "learning_rate": 0.00025445171717668533, + "loss": 0.1361, + "step": 358090 + }, + { + "epoch": 14.83, + "grad_norm": 0.484375, + "learning_rate": 0.0002544408736972419, + "loss": 0.1922, + "step": 358100 + }, + { + "epoch": 14.83, + "grad_norm": 0.283203125, + "learning_rate": 0.00025443003020944133, + "loss": 0.1789, + "step": 358110 + }, + { + "epoch": 14.83, + "grad_norm": 0.83203125, + "learning_rate": 0.00025441918671330387, + "loss": 0.1936, + "step": 358120 + }, + { + "epoch": 14.83, + "grad_norm": 0.376953125, + "learning_rate": 0.00025440834320885, + "loss": 0.1639, + "step": 358130 + }, + { + "epoch": 14.83, + "grad_norm": 0.58984375, + "learning_rate": 0.00025439749969610016, + "loss": 0.1589, + "step": 358140 + }, + { + "epoch": 14.83, + "grad_norm": 1.2734375, + "learning_rate": 0.0002543866561750746, + "loss": 0.2012, + "step": 358150 + }, + { + "epoch": 14.83, + "grad_norm": 1.1015625, + "learning_rate": 0.00025437581264579385, + "loss": 0.2073, + "step": 358160 + }, + { + "epoch": 14.84, + "grad_norm": 1.4609375, + "learning_rate": 0.0002543649691082783, + "loss": 0.1541, + "step": 358170 + }, + { + "epoch": 14.84, + "grad_norm": 0.73046875, + "learning_rate": 0.0002543541255625484, + "loss": 0.1305, + "step": 358180 + }, + { + "epoch": 14.84, + "grad_norm": 0.37890625, + "learning_rate": 0.00025434328200862443, + "loss": 0.1559, + "step": 358190 + }, + { + "epoch": 14.84, + "grad_norm": 0.765625, + "learning_rate": 0.00025433243844652693, + "loss": 0.1656, + "step": 358200 + }, + { + "epoch": 14.84, + "grad_norm": 0.5078125, + "learning_rate": 0.00025432159487627623, + "loss": 0.1874, + "step": 358210 + }, + { + "epoch": 14.84, + "grad_norm": 0.6328125, + "learning_rate": 0.00025431075129789276, + "loss": 0.1694, + "step": 358220 + }, + { + "epoch": 14.84, + "grad_norm": 1.046875, + "learning_rate": 0.00025429990771139686, + "loss": 0.2156, + "step": 358230 + }, + { + "epoch": 14.84, + "grad_norm": 0.640625, + "learning_rate": 0.0002542890641168091, + "loss": 0.2276, + "step": 358240 + }, + { + "epoch": 14.84, + "grad_norm": 1.1640625, + "learning_rate": 0.0002542782205141497, + "loss": 0.1726, + "step": 358250 + }, + { + "epoch": 14.84, + "grad_norm": 1.6640625, + "learning_rate": 0.00025426737690343915, + "loss": 0.1673, + "step": 358260 + }, + { + "epoch": 14.84, + "grad_norm": 0.72265625, + "learning_rate": 0.00025425653328469794, + "loss": 0.1183, + "step": 358270 + }, + { + "epoch": 14.84, + "grad_norm": 0.69140625, + "learning_rate": 0.00025424568965794637, + "loss": 0.1279, + "step": 358280 + }, + { + "epoch": 14.84, + "grad_norm": 0.43359375, + "learning_rate": 0.00025423484602320477, + "loss": 0.1633, + "step": 358290 + }, + { + "epoch": 14.84, + "grad_norm": 0.91015625, + "learning_rate": 0.0002542240023804938, + "loss": 0.1553, + "step": 358300 + }, + { + "epoch": 14.84, + "grad_norm": 0.625, + "learning_rate": 0.0002542131587298336, + "loss": 0.1792, + "step": 358310 + }, + { + "epoch": 14.84, + "grad_norm": 1.046875, + "learning_rate": 0.00025420231507124473, + "loss": 0.2248, + "step": 358320 + }, + { + "epoch": 14.84, + "grad_norm": 1.015625, + "learning_rate": 0.00025419147140474756, + "loss": 0.1716, + "step": 358330 + }, + { + "epoch": 14.84, + "grad_norm": 0.5078125, + "learning_rate": 0.0002541806277303625, + "loss": 0.1778, + "step": 358340 + }, + { + "epoch": 14.84, + "grad_norm": 0.95703125, + "learning_rate": 0.00025416978404810996, + "loss": 0.198, + "step": 358350 + }, + { + "epoch": 14.84, + "grad_norm": 0.6640625, + "learning_rate": 0.0002541589403580103, + "loss": 0.1594, + "step": 358360 + }, + { + "epoch": 14.84, + "grad_norm": 1.0390625, + "learning_rate": 0.00025414809666008396, + "loss": 0.1609, + "step": 358370 + }, + { + "epoch": 14.84, + "grad_norm": 0.8046875, + "learning_rate": 0.0002541372529543514, + "loss": 0.1671, + "step": 358380 + }, + { + "epoch": 14.84, + "grad_norm": 0.83984375, + "learning_rate": 0.000254126409240833, + "loss": 0.2066, + "step": 358390 + }, + { + "epoch": 14.84, + "grad_norm": 0.71484375, + "learning_rate": 0.000254115565519549, + "loss": 0.2047, + "step": 358400 + }, + { + "epoch": 14.85, + "grad_norm": 1.25, + "learning_rate": 0.00025410472179052004, + "loss": 0.2159, + "step": 358410 + }, + { + "epoch": 14.85, + "grad_norm": 1.25, + "learning_rate": 0.00025409387805376646, + "loss": 0.1826, + "step": 358420 + }, + { + "epoch": 14.85, + "grad_norm": 0.90625, + "learning_rate": 0.00025408303430930865, + "loss": 0.1484, + "step": 358430 + }, + { + "epoch": 14.85, + "grad_norm": 0.46875, + "learning_rate": 0.000254072190557167, + "loss": 0.2288, + "step": 358440 + }, + { + "epoch": 14.85, + "grad_norm": 0.86328125, + "learning_rate": 0.00025406134679736186, + "loss": 0.1836, + "step": 358450 + }, + { + "epoch": 14.85, + "grad_norm": 2.1875, + "learning_rate": 0.0002540505030299138, + "loss": 0.202, + "step": 358460 + }, + { + "epoch": 14.85, + "grad_norm": 0.88671875, + "learning_rate": 0.00025403965925484306, + "loss": 0.221, + "step": 358470 + }, + { + "epoch": 14.85, + "grad_norm": 0.345703125, + "learning_rate": 0.0002540288154721701, + "loss": 0.2146, + "step": 358480 + }, + { + "epoch": 14.85, + "grad_norm": 0.98828125, + "learning_rate": 0.00025401797168191537, + "loss": 0.1904, + "step": 358490 + }, + { + "epoch": 14.85, + "grad_norm": 0.6796875, + "learning_rate": 0.00025400712788409925, + "loss": 0.2514, + "step": 358500 + }, + { + "epoch": 14.85, + "grad_norm": 1.0234375, + "learning_rate": 0.0002539962840787422, + "loss": 0.1997, + "step": 358510 + }, + { + "epoch": 14.85, + "grad_norm": 0.248046875, + "learning_rate": 0.00025398544026586454, + "loss": 0.1807, + "step": 358520 + }, + { + "epoch": 14.85, + "grad_norm": 0.640625, + "learning_rate": 0.0002539745964454867, + "loss": 0.2131, + "step": 358530 + }, + { + "epoch": 14.85, + "grad_norm": 0.609375, + "learning_rate": 0.0002539637526176291, + "loss": 0.1603, + "step": 358540 + }, + { + "epoch": 14.85, + "grad_norm": 0.59765625, + "learning_rate": 0.0002539529087823122, + "loss": 0.1724, + "step": 358550 + }, + { + "epoch": 14.85, + "grad_norm": 0.64453125, + "learning_rate": 0.0002539420649395563, + "loss": 0.1844, + "step": 358560 + }, + { + "epoch": 14.85, + "grad_norm": 1.9921875, + "learning_rate": 0.0002539312210893818, + "loss": 0.1754, + "step": 358570 + }, + { + "epoch": 14.85, + "grad_norm": 0.765625, + "learning_rate": 0.00025392037723180925, + "loss": 0.1999, + "step": 358580 + }, + { + "epoch": 14.85, + "grad_norm": 0.62890625, + "learning_rate": 0.000253909533366859, + "loss": 0.224, + "step": 358590 + }, + { + "epoch": 14.85, + "grad_norm": 0.94921875, + "learning_rate": 0.00025389868949455137, + "loss": 0.1793, + "step": 358600 + }, + { + "epoch": 14.85, + "grad_norm": 0.69921875, + "learning_rate": 0.0002538878456149069, + "loss": 0.2121, + "step": 358610 + }, + { + "epoch": 14.85, + "grad_norm": 1.7421875, + "learning_rate": 0.0002538770017279458, + "loss": 0.1516, + "step": 358620 + }, + { + "epoch": 14.85, + "grad_norm": 1.46875, + "learning_rate": 0.0002538661578336887, + "loss": 0.1908, + "step": 358630 + }, + { + "epoch": 14.85, + "grad_norm": 0.7421875, + "learning_rate": 0.00025385531393215587, + "loss": 0.2066, + "step": 358640 + }, + { + "epoch": 14.86, + "grad_norm": 0.546875, + "learning_rate": 0.00025384447002336775, + "loss": 0.1962, + "step": 358650 + }, + { + "epoch": 14.86, + "grad_norm": 0.4375, + "learning_rate": 0.0002538336261073447, + "loss": 0.2117, + "step": 358660 + }, + { + "epoch": 14.86, + "grad_norm": 0.6328125, + "learning_rate": 0.00025382278218410737, + "loss": 0.1989, + "step": 358670 + }, + { + "epoch": 14.86, + "grad_norm": 0.78515625, + "learning_rate": 0.0002538119382536758, + "loss": 0.2051, + "step": 358680 + }, + { + "epoch": 14.86, + "grad_norm": 0.87890625, + "learning_rate": 0.00025380109431607064, + "loss": 0.181, + "step": 358690 + }, + { + "epoch": 14.86, + "grad_norm": 0.58984375, + "learning_rate": 0.0002537902503713122, + "loss": 0.2245, + "step": 358700 + }, + { + "epoch": 14.86, + "grad_norm": 0.69140625, + "learning_rate": 0.000253779406419421, + "loss": 0.2495, + "step": 358710 + }, + { + "epoch": 14.86, + "grad_norm": 0.63671875, + "learning_rate": 0.00025376856246041724, + "loss": 0.209, + "step": 358720 + }, + { + "epoch": 14.86, + "grad_norm": 0.400390625, + "learning_rate": 0.0002537577184943215, + "loss": 0.1868, + "step": 358730 + }, + { + "epoch": 14.86, + "grad_norm": 0.486328125, + "learning_rate": 0.0002537468745211541, + "loss": 0.2076, + "step": 358740 + }, + { + "epoch": 14.86, + "grad_norm": 0.7734375, + "learning_rate": 0.00025373603054093557, + "loss": 0.2117, + "step": 358750 + }, + { + "epoch": 14.86, + "grad_norm": 3.1875, + "learning_rate": 0.00025372518655368624, + "loss": 0.2006, + "step": 358760 + }, + { + "epoch": 14.86, + "grad_norm": 1.0234375, + "learning_rate": 0.0002537143425594265, + "loss": 0.202, + "step": 358770 + }, + { + "epoch": 14.86, + "grad_norm": 2.609375, + "learning_rate": 0.0002537034985581767, + "loss": 0.1715, + "step": 358780 + }, + { + "epoch": 14.86, + "grad_norm": 1.0, + "learning_rate": 0.00025369265454995744, + "loss": 0.175, + "step": 358790 + }, + { + "epoch": 14.86, + "grad_norm": 0.8984375, + "learning_rate": 0.0002536818105347889, + "loss": 0.1898, + "step": 358800 + }, + { + "epoch": 14.86, + "grad_norm": 0.439453125, + "learning_rate": 0.00025367096651269155, + "loss": 0.1529, + "step": 358810 + }, + { + "epoch": 14.86, + "grad_norm": 0.81640625, + "learning_rate": 0.0002536601224836859, + "loss": 0.1974, + "step": 358820 + }, + { + "epoch": 14.86, + "grad_norm": 0.78125, + "learning_rate": 0.00025364927844779227, + "loss": 0.2176, + "step": 358830 + }, + { + "epoch": 14.86, + "grad_norm": 1.125, + "learning_rate": 0.0002536384344050312, + "loss": 0.2036, + "step": 358840 + }, + { + "epoch": 14.86, + "grad_norm": 0.69140625, + "learning_rate": 0.0002536275903554229, + "loss": 0.1483, + "step": 358850 + }, + { + "epoch": 14.86, + "grad_norm": 0.36328125, + "learning_rate": 0.00025361674629898785, + "loss": 0.2166, + "step": 358860 + }, + { + "epoch": 14.86, + "grad_norm": 1.4921875, + "learning_rate": 0.0002536059022357465, + "loss": 0.1811, + "step": 358870 + }, + { + "epoch": 14.86, + "grad_norm": 0.62109375, + "learning_rate": 0.0002535950581657192, + "loss": 0.1628, + "step": 358880 + }, + { + "epoch": 14.87, + "grad_norm": 0.671875, + "learning_rate": 0.00025358421408892644, + "loss": 0.2123, + "step": 358890 + }, + { + "epoch": 14.87, + "grad_norm": 1.625, + "learning_rate": 0.00025357337000538857, + "loss": 0.1746, + "step": 358900 + }, + { + "epoch": 14.87, + "grad_norm": 0.275390625, + "learning_rate": 0.00025356252591512594, + "loss": 0.1505, + "step": 358910 + }, + { + "epoch": 14.87, + "grad_norm": 1.453125, + "learning_rate": 0.00025355168181815914, + "loss": 0.176, + "step": 358920 + }, + { + "epoch": 14.87, + "grad_norm": 1.328125, + "learning_rate": 0.00025354083771450835, + "loss": 0.1695, + "step": 358930 + }, + { + "epoch": 14.87, + "grad_norm": 0.890625, + "learning_rate": 0.0002535299936041941, + "loss": 0.1771, + "step": 358940 + }, + { + "epoch": 14.87, + "grad_norm": 0.73828125, + "learning_rate": 0.0002535191494872369, + "loss": 0.206, + "step": 358950 + }, + { + "epoch": 14.87, + "grad_norm": 0.345703125, + "learning_rate": 0.00025350830536365683, + "loss": 0.1936, + "step": 358960 + }, + { + "epoch": 14.87, + "grad_norm": 0.875, + "learning_rate": 0.0002534974612334747, + "loss": 0.1785, + "step": 358970 + }, + { + "epoch": 14.87, + "grad_norm": 0.95703125, + "learning_rate": 0.0002534866170967106, + "loss": 0.1574, + "step": 358980 + }, + { + "epoch": 14.87, + "grad_norm": 0.91015625, + "learning_rate": 0.0002534757729533851, + "loss": 0.2264, + "step": 358990 + }, + { + "epoch": 14.87, + "grad_norm": 1.765625, + "learning_rate": 0.00025346492880351865, + "loss": 0.1429, + "step": 359000 + }, + { + "epoch": 14.87, + "grad_norm": 2.453125, + "learning_rate": 0.0002534540846471315, + "loss": 0.177, + "step": 359010 + }, + { + "epoch": 14.87, + "grad_norm": 2.515625, + "learning_rate": 0.0002534432404842441, + "loss": 0.1626, + "step": 359020 + }, + { + "epoch": 14.87, + "grad_norm": 0.8046875, + "learning_rate": 0.00025343239631487696, + "loss": 0.1751, + "step": 359030 + }, + { + "epoch": 14.87, + "grad_norm": 1.015625, + "learning_rate": 0.0002534215521390504, + "loss": 0.1586, + "step": 359040 + }, + { + "epoch": 14.87, + "grad_norm": 1.0, + "learning_rate": 0.00025341070795678486, + "loss": 0.1668, + "step": 359050 + }, + { + "epoch": 14.87, + "grad_norm": 0.625, + "learning_rate": 0.0002533998637681007, + "loss": 0.2804, + "step": 359060 + }, + { + "epoch": 14.87, + "grad_norm": 0.54296875, + "learning_rate": 0.00025338901957301836, + "loss": 0.1531, + "step": 359070 + }, + { + "epoch": 14.87, + "grad_norm": 0.53125, + "learning_rate": 0.0002533781753715583, + "loss": 0.1799, + "step": 359080 + }, + { + "epoch": 14.87, + "grad_norm": 1.34375, + "learning_rate": 0.00025336733116374085, + "loss": 0.2043, + "step": 359090 + }, + { + "epoch": 14.87, + "grad_norm": 0.7578125, + "learning_rate": 0.00025335648694958644, + "loss": 0.1864, + "step": 359100 + }, + { + "epoch": 14.87, + "grad_norm": 0.65625, + "learning_rate": 0.00025334564272911557, + "loss": 0.1963, + "step": 359110 + }, + { + "epoch": 14.87, + "grad_norm": 0.48046875, + "learning_rate": 0.0002533347985023484, + "loss": 0.1924, + "step": 359120 + }, + { + "epoch": 14.88, + "grad_norm": 1.234375, + "learning_rate": 0.0002533239542693057, + "loss": 0.1944, + "step": 359130 + }, + { + "epoch": 14.88, + "grad_norm": 0.94921875, + "learning_rate": 0.00025331311003000753, + "loss": 0.1704, + "step": 359140 + }, + { + "epoch": 14.88, + "grad_norm": 1.015625, + "learning_rate": 0.00025330226578447446, + "loss": 0.1746, + "step": 359150 + }, + { + "epoch": 14.88, + "grad_norm": 1.2421875, + "learning_rate": 0.00025329142153272696, + "loss": 0.2239, + "step": 359160 + }, + { + "epoch": 14.88, + "grad_norm": 0.53125, + "learning_rate": 0.0002532805772747852, + "loss": 0.2413, + "step": 359170 + }, + { + "epoch": 14.88, + "grad_norm": 0.5546875, + "learning_rate": 0.00025326973301066993, + "loss": 0.1697, + "step": 359180 + }, + { + "epoch": 14.88, + "grad_norm": 0.69921875, + "learning_rate": 0.00025325888874040133, + "loss": 0.1769, + "step": 359190 + }, + { + "epoch": 14.88, + "grad_norm": 1.2265625, + "learning_rate": 0.00025324804446399973, + "loss": 0.1755, + "step": 359200 + }, + { + "epoch": 14.88, + "grad_norm": 0.67578125, + "learning_rate": 0.0002532372001814858, + "loss": 0.1935, + "step": 359210 + }, + { + "epoch": 14.88, + "grad_norm": 0.62890625, + "learning_rate": 0.00025322635589287973, + "loss": 0.1859, + "step": 359220 + }, + { + "epoch": 14.88, + "grad_norm": 0.71484375, + "learning_rate": 0.00025321551159820206, + "loss": 0.1731, + "step": 359230 + }, + { + "epoch": 14.88, + "grad_norm": 1.0546875, + "learning_rate": 0.0002532046672974732, + "loss": 0.192, + "step": 359240 + }, + { + "epoch": 14.88, + "grad_norm": 0.57421875, + "learning_rate": 0.0002531938229907133, + "loss": 0.1945, + "step": 359250 + }, + { + "epoch": 14.88, + "grad_norm": 0.8046875, + "learning_rate": 0.0002531829786779432, + "loss": 0.2061, + "step": 359260 + }, + { + "epoch": 14.88, + "grad_norm": 1.7265625, + "learning_rate": 0.000253172134359183, + "loss": 0.2329, + "step": 359270 + }, + { + "epoch": 14.88, + "grad_norm": 1.34375, + "learning_rate": 0.00025316129003445306, + "loss": 0.1909, + "step": 359280 + }, + { + "epoch": 14.88, + "grad_norm": 0.9453125, + "learning_rate": 0.00025315044570377407, + "loss": 0.1992, + "step": 359290 + }, + { + "epoch": 14.88, + "grad_norm": 0.625, + "learning_rate": 0.0002531396013671662, + "loss": 0.2165, + "step": 359300 + }, + { + "epoch": 14.88, + "grad_norm": 0.640625, + "learning_rate": 0.0002531287570246499, + "loss": 0.1835, + "step": 359310 + }, + { + "epoch": 14.88, + "grad_norm": 0.8125, + "learning_rate": 0.00025311791267624575, + "loss": 0.1744, + "step": 359320 + }, + { + "epoch": 14.88, + "grad_norm": 0.7890625, + "learning_rate": 0.00025310706832197387, + "loss": 0.2325, + "step": 359330 + }, + { + "epoch": 14.88, + "grad_norm": 0.59765625, + "learning_rate": 0.00025309622396185494, + "loss": 0.1806, + "step": 359340 + }, + { + "epoch": 14.88, + "grad_norm": 0.671875, + "learning_rate": 0.0002530853795959093, + "loss": 0.163, + "step": 359350 + }, + { + "epoch": 14.88, + "grad_norm": 0.87890625, + "learning_rate": 0.00025307453522415715, + "loss": 0.1625, + "step": 359360 + }, + { + "epoch": 14.89, + "grad_norm": 0.625, + "learning_rate": 0.00025306369084661915, + "loss": 0.1433, + "step": 359370 + }, + { + "epoch": 14.89, + "grad_norm": 0.85546875, + "learning_rate": 0.0002530528464633156, + "loss": 0.1696, + "step": 359380 + }, + { + "epoch": 14.89, + "grad_norm": 0.21484375, + "learning_rate": 0.0002530420020742669, + "loss": 0.1851, + "step": 359390 + }, + { + "epoch": 14.89, + "grad_norm": 0.90625, + "learning_rate": 0.00025303115767949356, + "loss": 0.2032, + "step": 359400 + }, + { + "epoch": 14.89, + "grad_norm": 0.828125, + "learning_rate": 0.00025302031327901575, + "loss": 0.1747, + "step": 359410 + }, + { + "epoch": 14.89, + "grad_norm": 0.466796875, + "learning_rate": 0.0002530094688728542, + "loss": 0.1978, + "step": 359420 + }, + { + "epoch": 14.89, + "grad_norm": 1.28125, + "learning_rate": 0.00025299862446102916, + "loss": 0.1921, + "step": 359430 + }, + { + "epoch": 14.89, + "grad_norm": 1.015625, + "learning_rate": 0.00025298778004356087, + "loss": 0.2023, + "step": 359440 + }, + { + "epoch": 14.89, + "grad_norm": 2.109375, + "learning_rate": 0.00025297693562047, + "loss": 0.1664, + "step": 359450 + }, + { + "epoch": 14.89, + "grad_norm": 3.125, + "learning_rate": 0.00025296609119177687, + "loss": 0.1948, + "step": 359460 + }, + { + "epoch": 14.89, + "grad_norm": 2.4375, + "learning_rate": 0.0002529552467575019, + "loss": 0.2153, + "step": 359470 + }, + { + "epoch": 14.89, + "grad_norm": 0.94921875, + "learning_rate": 0.0002529444023176654, + "loss": 0.2, + "step": 359480 + }, + { + "epoch": 14.89, + "grad_norm": 1.234375, + "learning_rate": 0.00025293355787228786, + "loss": 0.1725, + "step": 359490 + }, + { + "epoch": 14.89, + "grad_norm": 1.171875, + "learning_rate": 0.0002529227134213897, + "loss": 0.2134, + "step": 359500 + }, + { + "epoch": 14.89, + "grad_norm": 0.6875, + "learning_rate": 0.00025291186896499136, + "loss": 0.1979, + "step": 359510 + }, + { + "epoch": 14.89, + "grad_norm": 1.4453125, + "learning_rate": 0.0002529010245031131, + "loss": 0.201, + "step": 359520 + }, + { + "epoch": 14.89, + "grad_norm": 1.09375, + "learning_rate": 0.0002528901800357756, + "loss": 0.2236, + "step": 359530 + }, + { + "epoch": 14.89, + "grad_norm": 0.95703125, + "learning_rate": 0.0002528793355629989, + "loss": 0.199, + "step": 359540 + }, + { + "epoch": 14.89, + "grad_norm": 0.0, + "learning_rate": 0.0002528684910848037, + "loss": 0.1422, + "step": 359550 + }, + { + "epoch": 14.89, + "grad_norm": 0.91796875, + "learning_rate": 0.0002528576466012103, + "loss": 0.2095, + "step": 359560 + }, + { + "epoch": 14.89, + "grad_norm": 0.7109375, + "learning_rate": 0.000252846802112239, + "loss": 0.2108, + "step": 359570 + }, + { + "epoch": 14.89, + "grad_norm": 0.76171875, + "learning_rate": 0.0002528359576179105, + "loss": 0.213, + "step": 359580 + }, + { + "epoch": 14.89, + "grad_norm": 0.52734375, + "learning_rate": 0.00025282511311824495, + "loss": 0.2246, + "step": 359590 + }, + { + "epoch": 14.89, + "grad_norm": 1.453125, + "learning_rate": 0.00025281426861326284, + "loss": 0.1901, + "step": 359600 + }, + { + "epoch": 14.9, + "grad_norm": 0.63671875, + "learning_rate": 0.0002528034241029846, + "loss": 0.1519, + "step": 359610 + }, + { + "epoch": 14.9, + "grad_norm": 2.5, + "learning_rate": 0.0002527925795874306, + "loss": 0.2397, + "step": 359620 + }, + { + "epoch": 14.9, + "grad_norm": 0.423828125, + "learning_rate": 0.0002527817350666213, + "loss": 0.1527, + "step": 359630 + }, + { + "epoch": 14.9, + "grad_norm": 1.296875, + "learning_rate": 0.0002527708905405771, + "loss": 0.1789, + "step": 359640 + }, + { + "epoch": 14.9, + "grad_norm": 0.70703125, + "learning_rate": 0.0002527600460093183, + "loss": 0.1719, + "step": 359650 + }, + { + "epoch": 14.9, + "grad_norm": 0.69140625, + "learning_rate": 0.00025274920147286544, + "loss": 0.2247, + "step": 359660 + }, + { + "epoch": 14.9, + "grad_norm": 0.625, + "learning_rate": 0.00025273835693123887, + "loss": 0.2209, + "step": 359670 + }, + { + "epoch": 14.9, + "grad_norm": 0.9296875, + "learning_rate": 0.000252727512384459, + "loss": 0.2147, + "step": 359680 + }, + { + "epoch": 14.9, + "grad_norm": 2.359375, + "learning_rate": 0.00025271666783254626, + "loss": 0.2017, + "step": 359690 + }, + { + "epoch": 14.9, + "grad_norm": 0.9921875, + "learning_rate": 0.00025270582327552104, + "loss": 0.1505, + "step": 359700 + }, + { + "epoch": 14.9, + "grad_norm": 1.2421875, + "learning_rate": 0.00025269497871340375, + "loss": 0.2072, + "step": 359710 + }, + { + "epoch": 14.9, + "grad_norm": 0.9921875, + "learning_rate": 0.0002526841341462148, + "loss": 0.1733, + "step": 359720 + }, + { + "epoch": 14.9, + "grad_norm": 1.296875, + "learning_rate": 0.0002526732895739746, + "loss": 0.1803, + "step": 359730 + }, + { + "epoch": 14.9, + "grad_norm": 0.78125, + "learning_rate": 0.00025266244499670356, + "loss": 0.2306, + "step": 359740 + }, + { + "epoch": 14.9, + "grad_norm": 0.67578125, + "learning_rate": 0.00025265160041442207, + "loss": 0.2089, + "step": 359750 + }, + { + "epoch": 14.9, + "grad_norm": 0.53125, + "learning_rate": 0.0002526407558271506, + "loss": 0.1768, + "step": 359760 + }, + { + "epoch": 14.9, + "grad_norm": 1.0078125, + "learning_rate": 0.00025262991123490945, + "loss": 0.2032, + "step": 359770 + }, + { + "epoch": 14.9, + "grad_norm": 0.8828125, + "learning_rate": 0.00025261906663771915, + "loss": 0.2276, + "step": 359780 + }, + { + "epoch": 14.9, + "grad_norm": 0.84765625, + "learning_rate": 0.0002526082220356, + "loss": 0.2029, + "step": 359790 + }, + { + "epoch": 14.9, + "grad_norm": 1.234375, + "learning_rate": 0.00025259737742857247, + "loss": 0.1949, + "step": 359800 + }, + { + "epoch": 14.9, + "grad_norm": 1.234375, + "learning_rate": 0.00025258653281665696, + "loss": 0.1974, + "step": 359810 + }, + { + "epoch": 14.9, + "grad_norm": 1.8984375, + "learning_rate": 0.00025257568819987393, + "loss": 0.1751, + "step": 359820 + }, + { + "epoch": 14.9, + "grad_norm": 0.87890625, + "learning_rate": 0.00025256484357824365, + "loss": 0.2002, + "step": 359830 + }, + { + "epoch": 14.9, + "grad_norm": 0.71484375, + "learning_rate": 0.00025255399895178667, + "loss": 0.2068, + "step": 359840 + }, + { + "epoch": 14.9, + "grad_norm": 0.59375, + "learning_rate": 0.00025254315432052325, + "loss": 0.2071, + "step": 359850 + }, + { + "epoch": 14.91, + "grad_norm": 0.474609375, + "learning_rate": 0.000252532309684474, + "loss": 0.1662, + "step": 359860 + }, + { + "epoch": 14.91, + "grad_norm": 0.90234375, + "learning_rate": 0.00025252146504365914, + "loss": 0.146, + "step": 359870 + }, + { + "epoch": 14.91, + "grad_norm": 0.7890625, + "learning_rate": 0.00025251062039809917, + "loss": 0.238, + "step": 359880 + }, + { + "epoch": 14.91, + "grad_norm": 0.81640625, + "learning_rate": 0.0002524997757478145, + "loss": 0.1775, + "step": 359890 + }, + { + "epoch": 14.91, + "grad_norm": 0.486328125, + "learning_rate": 0.00025248893109282555, + "loss": 0.2188, + "step": 359900 + }, + { + "epoch": 14.91, + "grad_norm": 1.09375, + "learning_rate": 0.00025247808643315267, + "loss": 0.1719, + "step": 359910 + }, + { + "epoch": 14.91, + "grad_norm": 1.3203125, + "learning_rate": 0.0002524672417688163, + "loss": 0.1732, + "step": 359920 + }, + { + "epoch": 14.91, + "grad_norm": 1.03125, + "learning_rate": 0.0002524563970998368, + "loss": 0.1857, + "step": 359930 + }, + { + "epoch": 14.91, + "grad_norm": 1.4140625, + "learning_rate": 0.00025244555242623476, + "loss": 0.2159, + "step": 359940 + }, + { + "epoch": 14.91, + "grad_norm": 0.57421875, + "learning_rate": 0.0002524347077480303, + "loss": 0.2092, + "step": 359950 + }, + { + "epoch": 14.91, + "grad_norm": 1.1953125, + "learning_rate": 0.00025242386306524405, + "loss": 0.1888, + "step": 359960 + }, + { + "epoch": 14.91, + "grad_norm": 0.74609375, + "learning_rate": 0.00025241301837789637, + "loss": 0.206, + "step": 359970 + }, + { + "epoch": 14.91, + "grad_norm": 1.921875, + "learning_rate": 0.0002524021736860077, + "loss": 0.2136, + "step": 359980 + }, + { + "epoch": 14.91, + "grad_norm": 0.95703125, + "learning_rate": 0.0002523913289895983, + "loss": 0.1948, + "step": 359990 + }, + { + "epoch": 14.91, + "grad_norm": 0.486328125, + "learning_rate": 0.00025238048428868865, + "loss": 0.1798, + "step": 360000 + } + ], + "logging_steps": 10, + "max_steps": 724290, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 20000, + "total_flos": 2.2684460939497964e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}