{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995795432403884, "eval_steps": 500, "global_step": 1523, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000656322746710695, "grad_norm": 1.0, "learning_rate": 1.3071895424836602e-06, "loss": 2.0939, "step": 1 }, { "epoch": 0.003281613733553475, "grad_norm": 0.9765625, "learning_rate": 6.535947712418301e-06, "loss": 2.0618, "step": 5 }, { "epoch": 0.00656322746710695, "grad_norm": 0.91796875, "learning_rate": 1.3071895424836602e-05, "loss": 2.0432, "step": 10 }, { "epoch": 0.009844841200660424, "grad_norm": 0.71484375, "learning_rate": 1.9607843137254903e-05, "loss": 2.001, "step": 15 }, { "epoch": 0.0131264549342139, "grad_norm": 0.578125, "learning_rate": 2.6143790849673204e-05, "loss": 1.9571, "step": 20 }, { "epoch": 0.016408068667767374, "grad_norm": 0.5625, "learning_rate": 3.2679738562091506e-05, "loss": 1.8829, "step": 25 }, { "epoch": 0.019689682401320848, "grad_norm": 0.90234375, "learning_rate": 3.9215686274509805e-05, "loss": 1.8114, "step": 30 }, { "epoch": 0.022971296134874326, "grad_norm": 0.490234375, "learning_rate": 4.5751633986928104e-05, "loss": 1.6748, "step": 35 }, { "epoch": 0.0262529098684278, "grad_norm": 0.3828125, "learning_rate": 5.228758169934641e-05, "loss": 1.665, "step": 40 }, { "epoch": 0.029534523601981274, "grad_norm": 0.2099609375, "learning_rate": 5.882352941176471e-05, "loss": 1.6295, "step": 45 }, { "epoch": 0.03281613733553475, "grad_norm": 0.2314453125, "learning_rate": 6.535947712418301e-05, "loss": 1.6194, "step": 50 }, { "epoch": 0.036097751069088226, "grad_norm": 0.18359375, "learning_rate": 7.189542483660131e-05, "loss": 1.5773, "step": 55 }, { "epoch": 0.039379364802641696, "grad_norm": 0.140625, "learning_rate": 7.843137254901961e-05, "loss": 1.542, "step": 60 }, { "epoch": 0.042660978536195174, "grad_norm": 0.126953125, "learning_rate": 8.496732026143791e-05, "loss": 1.5303, "step": 65 }, { "epoch": 0.04594259226974865, "grad_norm": 0.1123046875, "learning_rate": 9.150326797385621e-05, "loss": 1.5031, "step": 70 }, { "epoch": 0.04922420600330212, "grad_norm": 0.10595703125, "learning_rate": 9.80392156862745e-05, "loss": 1.5177, "step": 75 }, { "epoch": 0.0525058197368556, "grad_norm": 0.09814453125, "learning_rate": 0.00010457516339869282, "loss": 1.5038, "step": 80 }, { "epoch": 0.05578743347040908, "grad_norm": 0.09130859375, "learning_rate": 0.00011111111111111112, "loss": 1.4847, "step": 85 }, { "epoch": 0.05906904720396255, "grad_norm": 0.0986328125, "learning_rate": 0.00011764705882352942, "loss": 1.5054, "step": 90 }, { "epoch": 0.062350660937516025, "grad_norm": 0.09912109375, "learning_rate": 0.00012418300653594771, "loss": 1.4672, "step": 95 }, { "epoch": 0.0656322746710695, "grad_norm": 0.10205078125, "learning_rate": 0.00013071895424836603, "loss": 1.4626, "step": 100 }, { "epoch": 0.06891388840462297, "grad_norm": 0.1416015625, "learning_rate": 0.0001372549019607843, "loss": 1.451, "step": 105 }, { "epoch": 0.07219550213817645, "grad_norm": 0.111328125, "learning_rate": 0.00014379084967320262, "loss": 1.4631, "step": 110 }, { "epoch": 0.07547711587172992, "grad_norm": 0.10302734375, "learning_rate": 0.0001503267973856209, "loss": 1.4423, "step": 115 }, { "epoch": 0.07875872960528339, "grad_norm": 0.1064453125, "learning_rate": 0.00015686274509803922, "loss": 1.4301, "step": 120 }, { "epoch": 0.08204034333883688, "grad_norm": 0.1123046875, "learning_rate": 0.00016339869281045753, "loss": 1.4287, "step": 125 }, { "epoch": 0.08532195707239035, "grad_norm": 0.162109375, "learning_rate": 0.00016993464052287582, "loss": 1.4294, "step": 130 }, { "epoch": 0.08860357080594382, "grad_norm": 0.1240234375, "learning_rate": 0.00017647058823529413, "loss": 1.4201, "step": 135 }, { "epoch": 0.0918851845394973, "grad_norm": 0.1494140625, "learning_rate": 0.00018300653594771241, "loss": 1.4044, "step": 140 }, { "epoch": 0.09516679827305077, "grad_norm": 0.1357421875, "learning_rate": 0.00018954248366013073, "loss": 1.4114, "step": 145 }, { "epoch": 0.09844841200660424, "grad_norm": 0.1552734375, "learning_rate": 0.000196078431372549, "loss": 1.3928, "step": 150 }, { "epoch": 0.10173002574015773, "grad_norm": 0.162109375, "learning_rate": 0.0001999989483097553, "loss": 1.4054, "step": 155 }, { "epoch": 0.1050116394737112, "grad_norm": 0.1337890625, "learning_rate": 0.00019998711704854725, "loss": 1.3996, "step": 160 }, { "epoch": 0.10829325320726467, "grad_norm": 0.15625, "learning_rate": 0.0001999621414738484, "loss": 1.4015, "step": 165 }, { "epoch": 0.11157486694081815, "grad_norm": 0.1455078125, "learning_rate": 0.0001999240248689495, "loss": 1.387, "step": 170 }, { "epoch": 0.11485648067437162, "grad_norm": 0.140625, "learning_rate": 0.00019987277224466215, "loss": 1.4057, "step": 175 }, { "epoch": 0.1181380944079251, "grad_norm": 0.142578125, "learning_rate": 0.00019980839033865994, "loss": 1.3844, "step": 180 }, { "epoch": 0.12141970814147857, "grad_norm": 0.146484375, "learning_rate": 0.00019973088761459287, "loss": 1.3654, "step": 185 }, { "epoch": 0.12470132187503205, "grad_norm": 0.1591796875, "learning_rate": 0.00019964027426097448, "loss": 1.3632, "step": 190 }, { "epoch": 0.12798293560858554, "grad_norm": 0.154296875, "learning_rate": 0.00019953656218984263, "loss": 1.3982, "step": 195 }, { "epoch": 0.131264549342139, "grad_norm": 0.146484375, "learning_rate": 0.0001994197650351936, "loss": 1.3842, "step": 200 }, { "epoch": 0.13454616307569248, "grad_norm": 0.1640625, "learning_rate": 0.0001992898981511896, "loss": 1.3528, "step": 205 }, { "epoch": 0.13782777680924593, "grad_norm": 0.173828125, "learning_rate": 0.0001991469786101404, "loss": 1.3811, "step": 210 }, { "epoch": 0.14110939054279942, "grad_norm": 0.162109375, "learning_rate": 0.00019899102520025896, "loss": 1.3594, "step": 215 }, { "epoch": 0.1443910042763529, "grad_norm": 0.1865234375, "learning_rate": 0.0001988220584231916, "loss": 1.3675, "step": 220 }, { "epoch": 0.14767261800990636, "grad_norm": 0.15234375, "learning_rate": 0.00019864010049132287, "loss": 1.3532, "step": 225 }, { "epoch": 0.15095423174345984, "grad_norm": 0.1455078125, "learning_rate": 0.0001984451753248553, "loss": 1.3672, "step": 230 }, { "epoch": 0.15423584547701333, "grad_norm": 0.1689453125, "learning_rate": 0.0001982373085486651, "loss": 1.3553, "step": 235 }, { "epoch": 0.15751745921056678, "grad_norm": 0.1396484375, "learning_rate": 0.00019801652748893347, "loss": 1.3303, "step": 240 }, { "epoch": 0.16079907294412027, "grad_norm": 0.1396484375, "learning_rate": 0.00019778286116955407, "loss": 1.35, "step": 245 }, { "epoch": 0.16408068667767375, "grad_norm": 0.1484375, "learning_rate": 0.00019753634030831782, "loss": 1.3451, "step": 250 }, { "epoch": 0.1673623004112272, "grad_norm": 0.1357421875, "learning_rate": 0.00019727699731287465, "loss": 1.3604, "step": 255 }, { "epoch": 0.1706439141447807, "grad_norm": 0.15625, "learning_rate": 0.00019700486627647305, "loss": 1.3418, "step": 260 }, { "epoch": 0.17392552787833418, "grad_norm": 0.171875, "learning_rate": 0.0001967199829734784, "loss": 1.3463, "step": 265 }, { "epoch": 0.17720714161188764, "grad_norm": 0.1533203125, "learning_rate": 0.00019642238485466989, "loss": 1.357, "step": 270 }, { "epoch": 0.18048875534544112, "grad_norm": 0.1455078125, "learning_rate": 0.00019611211104231724, "loss": 1.3678, "step": 275 }, { "epoch": 0.1837703690789946, "grad_norm": 0.134765625, "learning_rate": 0.0001957892023250379, "loss": 1.3296, "step": 280 }, { "epoch": 0.18705198281254806, "grad_norm": 0.154296875, "learning_rate": 0.00019545370115243462, "loss": 1.3445, "step": 285 }, { "epoch": 0.19033359654610155, "grad_norm": 0.146484375, "learning_rate": 0.00019510565162951537, "loss": 1.3407, "step": 290 }, { "epoch": 0.19361521027965503, "grad_norm": 0.1484375, "learning_rate": 0.00019474509951089507, "loss": 1.3419, "step": 295 }, { "epoch": 0.1968968240132085, "grad_norm": 0.1357421875, "learning_rate": 0.00019437209219478084, "loss": 1.349, "step": 300 }, { "epoch": 0.20017843774676197, "grad_norm": 0.1728515625, "learning_rate": 0.00019398667871674082, "loss": 1.3282, "step": 305 }, { "epoch": 0.20346005148031546, "grad_norm": 0.1669921875, "learning_rate": 0.00019358890974325817, "loss": 1.3115, "step": 310 }, { "epoch": 0.2067416652138689, "grad_norm": 0.1396484375, "learning_rate": 0.00019317883756507026, "loss": 1.339, "step": 315 }, { "epoch": 0.2100232789474224, "grad_norm": 0.1435546875, "learning_rate": 0.0001927565160902948, "loss": 1.3186, "step": 320 }, { "epoch": 0.21330489268097588, "grad_norm": 0.1494140625, "learning_rate": 0.00019232200083734265, "loss": 1.3434, "step": 325 }, { "epoch": 0.21658650641452934, "grad_norm": 0.1455078125, "learning_rate": 0.00019187534892761986, "loss": 1.3186, "step": 330 }, { "epoch": 0.21986812014808282, "grad_norm": 0.1435546875, "learning_rate": 0.0001914166190780181, "loss": 1.3394, "step": 335 }, { "epoch": 0.2231497338816363, "grad_norm": 0.1416015625, "learning_rate": 0.00019094587159319585, "loss": 1.317, "step": 340 }, { "epoch": 0.22643134761518977, "grad_norm": 0.142578125, "learning_rate": 0.00019046316835765083, "loss": 1.3344, "step": 345 }, { "epoch": 0.22971296134874325, "grad_norm": 0.1650390625, "learning_rate": 0.00018996857282758462, "loss": 1.3402, "step": 350 }, { "epoch": 0.2329945750822967, "grad_norm": 0.1455078125, "learning_rate": 0.00018946215002256061, "loss": 1.3396, "step": 355 }, { "epoch": 0.2362761888158502, "grad_norm": 0.1591796875, "learning_rate": 0.00018894396651695662, "loss": 1.3289, "step": 360 }, { "epoch": 0.23955780254940368, "grad_norm": 0.1396484375, "learning_rate": 0.00018841409043121306, "loss": 1.3274, "step": 365 }, { "epoch": 0.24283941628295713, "grad_norm": 0.1416015625, "learning_rate": 0.0001878725914228776, "loss": 1.3312, "step": 370 }, { "epoch": 0.24612103001651062, "grad_norm": 0.14453125, "learning_rate": 0.00018731954067744834, "loss": 1.3258, "step": 375 }, { "epoch": 0.2494026437500641, "grad_norm": 0.150390625, "learning_rate": 0.00018675501089901542, "loss": 1.3224, "step": 380 }, { "epoch": 0.25268425748361756, "grad_norm": 0.1474609375, "learning_rate": 0.00018617907630070352, "loss": 1.3219, "step": 385 }, { "epoch": 0.25596587121717107, "grad_norm": 0.1337890625, "learning_rate": 0.0001855918125949157, "loss": 1.3191, "step": 390 }, { "epoch": 0.2592474849507245, "grad_norm": 0.1435546875, "learning_rate": 0.00018499329698338035, "loss": 1.3183, "step": 395 }, { "epoch": 0.262529098684278, "grad_norm": 0.1474609375, "learning_rate": 0.0001843836081470022, "loss": 1.3154, "step": 400 }, { "epoch": 0.2658107124178315, "grad_norm": 0.142578125, "learning_rate": 0.0001837628262355188, "loss": 1.3131, "step": 405 }, { "epoch": 0.26909232615138495, "grad_norm": 0.14453125, "learning_rate": 0.00018313103285696425, "loss": 1.3211, "step": 410 }, { "epoch": 0.2723739398849384, "grad_norm": 0.150390625, "learning_rate": 0.00018248831106694086, "loss": 1.3082, "step": 415 }, { "epoch": 0.27565555361849187, "grad_norm": 0.15234375, "learning_rate": 0.00018183474535770068, "loss": 1.3046, "step": 420 }, { "epoch": 0.2789371673520454, "grad_norm": 0.1748046875, "learning_rate": 0.00018117042164703814, "loss": 1.3026, "step": 425 }, { "epoch": 0.28221878108559884, "grad_norm": 0.1484375, "learning_rate": 0.00018049542726699533, "loss": 1.3265, "step": 430 }, { "epoch": 0.2855003948191523, "grad_norm": 0.142578125, "learning_rate": 0.00017980985095238124, "loss": 1.2958, "step": 435 }, { "epoch": 0.2887820085527058, "grad_norm": 0.1767578125, "learning_rate": 0.00017911378282910675, "loss": 1.311, "step": 440 }, { "epoch": 0.29206362228625926, "grad_norm": 0.138671875, "learning_rate": 0.00017840731440233674, "loss": 1.3126, "step": 445 }, { "epoch": 0.2953452360198127, "grad_norm": 0.1435546875, "learning_rate": 0.00017769053854446053, "loss": 1.303, "step": 450 }, { "epoch": 0.29862684975336623, "grad_norm": 0.1484375, "learning_rate": 0.00017696354948288327, "loss": 1.3106, "step": 455 }, { "epoch": 0.3019084634869197, "grad_norm": 0.14453125, "learning_rate": 0.00017622644278763843, "loss": 1.2993, "step": 460 }, { "epoch": 0.30519007722047314, "grad_norm": 0.1455078125, "learning_rate": 0.00017547931535882445, "loss": 1.3031, "step": 465 }, { "epoch": 0.30847169095402666, "grad_norm": 0.134765625, "learning_rate": 0.000174722265413866, "loss": 1.3175, "step": 470 }, { "epoch": 0.3117533046875801, "grad_norm": 0.1484375, "learning_rate": 0.0001739553924746025, "loss": 1.3086, "step": 475 }, { "epoch": 0.31503491842113357, "grad_norm": 0.1669921875, "learning_rate": 0.0001731787973542049, "loss": 1.3364, "step": 480 }, { "epoch": 0.3183165321546871, "grad_norm": 0.150390625, "learning_rate": 0.0001723925821439227, "loss": 1.3103, "step": 485 }, { "epoch": 0.32159814588824054, "grad_norm": 0.13671875, "learning_rate": 0.00017159685019966316, "loss": 1.3312, "step": 490 }, { "epoch": 0.324879759621794, "grad_norm": 0.142578125, "learning_rate": 0.00017079170612840404, "loss": 1.3064, "step": 495 }, { "epoch": 0.3281613733553475, "grad_norm": 0.1357421875, "learning_rate": 0.00016997725577444205, "loss": 1.3109, "step": 500 }, { "epoch": 0.33144298708890096, "grad_norm": 0.1318359375, "learning_rate": 0.0001691536062054783, "loss": 1.3083, "step": 505 }, { "epoch": 0.3347246008224544, "grad_norm": 0.1376953125, "learning_rate": 0.0001683208656985436, "loss": 1.2997, "step": 510 }, { "epoch": 0.33800621455600793, "grad_norm": 0.140625, "learning_rate": 0.00016747914372576393, "loss": 1.3161, "step": 515 }, { "epoch": 0.3412878282895614, "grad_norm": 0.1376953125, "learning_rate": 0.00016662855093996945, "loss": 1.2811, "step": 520 }, { "epoch": 0.34456944202311485, "grad_norm": 0.14453125, "learning_rate": 0.00016576919916014808, "loss": 1.3146, "step": 525 }, { "epoch": 0.34785105575666836, "grad_norm": 0.13671875, "learning_rate": 0.00016490120135674566, "loss": 1.2922, "step": 530 }, { "epoch": 0.3511326694902218, "grad_norm": 0.146484375, "learning_rate": 0.00016402467163681493, "loss": 1.2962, "step": 535 }, { "epoch": 0.3544142832237753, "grad_norm": 0.138671875, "learning_rate": 0.00016313972522901491, "loss": 1.3008, "step": 540 }, { "epoch": 0.3576958969573288, "grad_norm": 0.1455078125, "learning_rate": 0.00016224647846846315, "loss": 1.3042, "step": 545 }, { "epoch": 0.36097751069088224, "grad_norm": 0.134765625, "learning_rate": 0.00016134504878144204, "loss": 1.2876, "step": 550 }, { "epoch": 0.3642591244244357, "grad_norm": 0.1474609375, "learning_rate": 0.00016043555466996206, "loss": 1.3086, "step": 555 }, { "epoch": 0.3675407381579892, "grad_norm": 0.1513671875, "learning_rate": 0.0001595181156961836, "loss": 1.2894, "step": 560 }, { "epoch": 0.37082235189154267, "grad_norm": 0.1357421875, "learning_rate": 0.00015859285246669913, "loss": 1.2933, "step": 565 }, { "epoch": 0.3741039656250961, "grad_norm": 0.1435546875, "learning_rate": 0.00015765988661667834, "loss": 1.2885, "step": 570 }, { "epoch": 0.37738557935864964, "grad_norm": 0.150390625, "learning_rate": 0.00015671934079387797, "loss": 1.2719, "step": 575 }, { "epoch": 0.3806671930922031, "grad_norm": 0.1337890625, "learning_rate": 0.00015577133864251848, "loss": 1.3012, "step": 580 }, { "epoch": 0.38394880682575655, "grad_norm": 0.1337890625, "learning_rate": 0.00015481600478702996, "loss": 1.3046, "step": 585 }, { "epoch": 0.38723042055931006, "grad_norm": 0.142578125, "learning_rate": 0.0001538534648156686, "loss": 1.3017, "step": 590 }, { "epoch": 0.3905120342928635, "grad_norm": 0.13671875, "learning_rate": 0.00015288384526400734, "loss": 1.307, "step": 595 }, { "epoch": 0.393793648026417, "grad_norm": 0.134765625, "learning_rate": 0.00015190727359830109, "loss": 1.2755, "step": 600 }, { "epoch": 0.3970752617599705, "grad_norm": 0.1357421875, "learning_rate": 0.00015092387819873014, "loss": 1.2961, "step": 605 }, { "epoch": 0.40035687549352394, "grad_norm": 0.1416015625, "learning_rate": 0.0001499337883425235, "loss": 1.2989, "step": 610 }, { "epoch": 0.4036384892270774, "grad_norm": 0.138671875, "learning_rate": 0.0001489371341869638, "loss": 1.2897, "step": 615 }, { "epoch": 0.4069201029606309, "grad_norm": 0.1435546875, "learning_rate": 0.00014793404675227684, "loss": 1.3068, "step": 620 }, { "epoch": 0.41020171669418437, "grad_norm": 0.1416015625, "learning_rate": 0.00014692465790440792, "loss": 1.28, "step": 625 }, { "epoch": 0.4134833304277378, "grad_norm": 0.13671875, "learning_rate": 0.0001459091003376865, "loss": 1.274, "step": 630 }, { "epoch": 0.41676494416129134, "grad_norm": 0.1474609375, "learning_rate": 0.00014488750755738223, "loss": 1.3007, "step": 635 }, { "epoch": 0.4200465578948448, "grad_norm": 0.14453125, "learning_rate": 0.00014386001386215434, "loss": 1.2852, "step": 640 }, { "epoch": 0.42332817162839825, "grad_norm": 0.138671875, "learning_rate": 0.0001428267543263969, "loss": 1.2861, "step": 645 }, { "epoch": 0.42660978536195177, "grad_norm": 0.1376953125, "learning_rate": 0.00014178786478248162, "loss": 1.2873, "step": 650 }, { "epoch": 0.4298913990955052, "grad_norm": 0.140625, "learning_rate": 0.0001407434818029015, "loss": 1.283, "step": 655 }, { "epoch": 0.4331730128290587, "grad_norm": 0.1376953125, "learning_rate": 0.00013969374268231713, "loss": 1.2828, "step": 660 }, { "epoch": 0.4364546265626122, "grad_norm": 0.1376953125, "learning_rate": 0.0001386387854195076, "loss": 1.2577, "step": 665 }, { "epoch": 0.43973624029616565, "grad_norm": 0.134765625, "learning_rate": 0.0001375787486992294, "loss": 1.3003, "step": 670 }, { "epoch": 0.4430178540297191, "grad_norm": 0.1435546875, "learning_rate": 0.00013651377187398492, "loss": 1.2879, "step": 675 }, { "epoch": 0.4462994677632726, "grad_norm": 0.140625, "learning_rate": 0.00013544399494570307, "loss": 1.2947, "step": 680 }, { "epoch": 0.4495810814968261, "grad_norm": 0.1376953125, "learning_rate": 0.0001343695585473346, "loss": 1.263, "step": 685 }, { "epoch": 0.45286269523037953, "grad_norm": 0.1328125, "learning_rate": 0.00013329060392436456, "loss": 1.2842, "step": 690 }, { "epoch": 0.456144308963933, "grad_norm": 0.14453125, "learning_rate": 0.00013220727291624415, "loss": 1.2789, "step": 695 }, { "epoch": 0.4594259226974865, "grad_norm": 0.1396484375, "learning_rate": 0.00013111970793774439, "loss": 1.2638, "step": 700 }, { "epoch": 0.46270753643103996, "grad_norm": 0.1318359375, "learning_rate": 0.00013002805196023448, "loss": 1.2978, "step": 705 }, { "epoch": 0.4659891501645934, "grad_norm": 0.13671875, "learning_rate": 0.0001289324484928865, "loss": 1.2863, "step": 710 }, { "epoch": 0.4692707638981469, "grad_norm": 0.1318359375, "learning_rate": 0.0001278330415638099, "loss": 1.2774, "step": 715 }, { "epoch": 0.4725523776317004, "grad_norm": 0.1357421875, "learning_rate": 0.0001267299757011175, "loss": 1.2741, "step": 720 }, { "epoch": 0.47583399136525384, "grad_norm": 0.1328125, "learning_rate": 0.00012562339591392572, "loss": 1.2904, "step": 725 }, { "epoch": 0.47911560509880735, "grad_norm": 0.1318359375, "learning_rate": 0.00012451344767329178, "loss": 1.2737, "step": 730 }, { "epoch": 0.4823972188323608, "grad_norm": 0.140625, "learning_rate": 0.00012340027689309, "loss": 1.2958, "step": 735 }, { "epoch": 0.48567883256591426, "grad_norm": 0.1435546875, "learning_rate": 0.0001222840299108301, "loss": 1.2914, "step": 740 }, { "epoch": 0.4889604462994678, "grad_norm": 0.1435546875, "learning_rate": 0.0001211648534684194, "loss": 1.3006, "step": 745 }, { "epoch": 0.49224206003302123, "grad_norm": 0.134765625, "learning_rate": 0.00012004289469287229, "loss": 1.2698, "step": 750 }, { "epoch": 0.4955236737665747, "grad_norm": 0.146484375, "learning_rate": 0.00011891830107696891, "loss": 1.2954, "step": 755 }, { "epoch": 0.4988052875001282, "grad_norm": 0.138671875, "learning_rate": 0.00011779122045986567, "loss": 1.2682, "step": 760 }, { "epoch": 0.5020869012336817, "grad_norm": 0.1337890625, "learning_rate": 0.00011666180100766036, "loss": 1.2779, "step": 765 }, { "epoch": 0.5053685149672351, "grad_norm": 0.1318359375, "learning_rate": 0.00011553019119391412, "loss": 1.2848, "step": 770 }, { "epoch": 0.5086501287007886, "grad_norm": 0.1376953125, "learning_rate": 0.00011439653978013334, "loss": 1.2788, "step": 775 }, { "epoch": 0.5119317424343421, "grad_norm": 0.146484375, "learning_rate": 0.0001132609957962131, "loss": 1.2696, "step": 780 }, { "epoch": 0.5152133561678955, "grad_norm": 0.1337890625, "learning_rate": 0.00011212370852084603, "loss": 1.272, "step": 785 }, { "epoch": 0.518494969901449, "grad_norm": 0.13671875, "learning_rate": 0.00011098482746189786, "loss": 1.2893, "step": 790 }, { "epoch": 0.5217765836350026, "grad_norm": 0.138671875, "learning_rate": 0.00010984450233675334, "loss": 1.2761, "step": 795 }, { "epoch": 0.525058197368556, "grad_norm": 0.142578125, "learning_rate": 0.0001087028830526342, "loss": 1.2472, "step": 800 }, { "epoch": 0.5283398111021095, "grad_norm": 0.1416015625, "learning_rate": 0.00010756011968689242, "loss": 1.2683, "step": 805 }, { "epoch": 0.531621424835663, "grad_norm": 0.1396484375, "learning_rate": 0.00010641636246728095, "loss": 1.2535, "step": 810 }, { "epoch": 0.5349030385692164, "grad_norm": 0.134765625, "learning_rate": 0.00010527176175220499, "loss": 1.2508, "step": 815 }, { "epoch": 0.5381846523027699, "grad_norm": 0.138671875, "learning_rate": 0.0001041264680109556, "loss": 1.2801, "step": 820 }, { "epoch": 0.5414662660363233, "grad_norm": 0.142578125, "learning_rate": 0.00010298063180392917, "loss": 1.2661, "step": 825 }, { "epoch": 0.5447478797698768, "grad_norm": 0.134765625, "learning_rate": 0.0001018344037628346, "loss": 1.2817, "step": 830 }, { "epoch": 0.5480294935034303, "grad_norm": 0.134765625, "learning_rate": 0.00010068793457089141, "loss": 1.2881, "step": 835 }, { "epoch": 0.5513111072369837, "grad_norm": 0.134765625, "learning_rate": 9.954137494302079e-05, "loss": 1.2755, "step": 840 }, { "epoch": 0.5545927209705372, "grad_norm": 0.1396484375, "learning_rate": 9.839487560603266e-05, "loss": 1.2741, "step": 845 }, { "epoch": 0.5578743347040908, "grad_norm": 0.1357421875, "learning_rate": 9.724858727881107e-05, "loss": 1.304, "step": 850 }, { "epoch": 0.5611559484376442, "grad_norm": 0.13671875, "learning_rate": 9.610266065250077e-05, "loss": 1.2742, "step": 855 }, { "epoch": 0.5644375621711977, "grad_norm": 0.13671875, "learning_rate": 9.495724637069718e-05, "loss": 1.2988, "step": 860 }, { "epoch": 0.5677191759047512, "grad_norm": 0.1337890625, "learning_rate": 9.381249500964294e-05, "loss": 1.2753, "step": 865 }, { "epoch": 0.5710007896383046, "grad_norm": 0.1376953125, "learning_rate": 9.266855705843309e-05, "loss": 1.273, "step": 870 }, { "epoch": 0.5742824033718581, "grad_norm": 0.13671875, "learning_rate": 9.152558289923177e-05, "loss": 1.2702, "step": 875 }, { "epoch": 0.5775640171054116, "grad_norm": 0.1396484375, "learning_rate": 9.038372278750287e-05, "loss": 1.2854, "step": 880 }, { "epoch": 0.580845630838965, "grad_norm": 0.140625, "learning_rate": 8.92431268322576e-05, "loss": 1.285, "step": 885 }, { "epoch": 0.5841272445725185, "grad_norm": 0.1318359375, "learning_rate": 8.810394497632102e-05, "loss": 1.2743, "step": 890 }, { "epoch": 0.587408858306072, "grad_norm": 0.142578125, "learning_rate": 8.696632697662063e-05, "loss": 1.2741, "step": 895 }, { "epoch": 0.5906904720396254, "grad_norm": 0.13671875, "learning_rate": 8.58304223844993e-05, "loss": 1.2847, "step": 900 }, { "epoch": 0.593972085773179, "grad_norm": 0.1376953125, "learning_rate": 8.469638052605513e-05, "loss": 1.2753, "step": 905 }, { "epoch": 0.5972536995067325, "grad_norm": 0.138671875, "learning_rate": 8.356435048251126e-05, "loss": 1.2679, "step": 910 }, { "epoch": 0.6005353132402859, "grad_norm": 0.1650390625, "learning_rate": 8.243448107061729e-05, "loss": 1.2631, "step": 915 }, { "epoch": 0.6038169269738394, "grad_norm": 0.134765625, "learning_rate": 8.130692082308624e-05, "loss": 1.2655, "step": 920 }, { "epoch": 0.6070985407073929, "grad_norm": 0.130859375, "learning_rate": 8.01818179690681e-05, "loss": 1.3186, "step": 925 }, { "epoch": 0.6103801544409463, "grad_norm": 0.1337890625, "learning_rate": 7.90593204146638e-05, "loss": 1.2895, "step": 930 }, { "epoch": 0.6136617681744998, "grad_norm": 0.1337890625, "learning_rate": 7.793957572348131e-05, "loss": 1.2751, "step": 935 }, { "epoch": 0.6169433819080533, "grad_norm": 0.13671875, "learning_rate": 7.682273109723712e-05, "loss": 1.2663, "step": 940 }, { "epoch": 0.6202249956416067, "grad_norm": 0.1357421875, "learning_rate": 7.570893335640487e-05, "loss": 1.2706, "step": 945 }, { "epoch": 0.6235066093751602, "grad_norm": 0.13671875, "learning_rate": 7.459832892091455e-05, "loss": 1.2638, "step": 950 }, { "epoch": 0.6267882231087137, "grad_norm": 0.134765625, "learning_rate": 7.349106379090381e-05, "loss": 1.275, "step": 955 }, { "epoch": 0.6300698368422671, "grad_norm": 0.1328125, "learning_rate": 7.23872835275252e-05, "loss": 1.272, "step": 960 }, { "epoch": 0.6333514505758207, "grad_norm": 0.1337890625, "learning_rate": 7.128713323381032e-05, "loss": 1.2768, "step": 965 }, { "epoch": 0.6366330643093742, "grad_norm": 0.1318359375, "learning_rate": 7.019075753559468e-05, "loss": 1.2743, "step": 970 }, { "epoch": 0.6399146780429276, "grad_norm": 0.13671875, "learning_rate": 6.909830056250527e-05, "loss": 1.2707, "step": 975 }, { "epoch": 0.6431962917764811, "grad_norm": 0.1337890625, "learning_rate": 6.800990592901315e-05, "loss": 1.2844, "step": 980 }, { "epoch": 0.6464779055100346, "grad_norm": 0.1376953125, "learning_rate": 6.692571671555398e-05, "loss": 1.264, "step": 985 }, { "epoch": 0.649759519243588, "grad_norm": 0.1435546875, "learning_rate": 6.584587544971854e-05, "loss": 1.2481, "step": 990 }, { "epoch": 0.6530411329771415, "grad_norm": 0.130859375, "learning_rate": 6.477052408751616e-05, "loss": 1.2738, "step": 995 }, { "epoch": 0.656322746710695, "grad_norm": 0.1337890625, "learning_rate": 6.369980399471306e-05, "loss": 1.2806, "step": 1000 }, { "epoch": 0.6596043604442484, "grad_norm": 0.13671875, "learning_rate": 6.263385592824857e-05, "loss": 1.2911, "step": 1005 }, { "epoch": 0.6628859741778019, "grad_norm": 0.134765625, "learning_rate": 6.157282001773095e-05, "loss": 1.2794, "step": 1010 }, { "epoch": 0.6661675879113554, "grad_norm": 0.134765625, "learning_rate": 6.051683574701616e-05, "loss": 1.2664, "step": 1015 }, { "epoch": 0.6694492016449088, "grad_norm": 0.142578125, "learning_rate": 5.946604193587134e-05, "loss": 1.2674, "step": 1020 }, { "epoch": 0.6727308153784624, "grad_norm": 0.1357421875, "learning_rate": 5.842057672172525e-05, "loss": 1.2696, "step": 1025 }, { "epoch": 0.6760124291120159, "grad_norm": 0.138671875, "learning_rate": 5.738057754150905e-05, "loss": 1.2657, "step": 1030 }, { "epoch": 0.6792940428455693, "grad_norm": 0.1318359375, "learning_rate": 5.634618111358865e-05, "loss": 1.2726, "step": 1035 }, { "epoch": 0.6825756565791228, "grad_norm": 0.134765625, "learning_rate": 5.531752341979173e-05, "loss": 1.2842, "step": 1040 }, { "epoch": 0.6858572703126763, "grad_norm": 0.134765625, "learning_rate": 5.429473968753157e-05, "loss": 1.265, "step": 1045 }, { "epoch": 0.6891388840462297, "grad_norm": 0.13671875, "learning_rate": 5.327796437203019e-05, "loss": 1.2795, "step": 1050 }, { "epoch": 0.6924204977797832, "grad_norm": 0.1328125, "learning_rate": 5.226733113864242e-05, "loss": 1.2817, "step": 1055 }, { "epoch": 0.6957021115133367, "grad_norm": 0.13671875, "learning_rate": 5.126297284528485e-05, "loss": 1.2538, "step": 1060 }, { "epoch": 0.6989837252468901, "grad_norm": 0.1416015625, "learning_rate": 5.0265021524969857e-05, "loss": 1.2608, "step": 1065 }, { "epoch": 0.7022653389804436, "grad_norm": 0.1318359375, "learning_rate": 4.927360836844868e-05, "loss": 1.2743, "step": 1070 }, { "epoch": 0.7055469527139971, "grad_norm": 0.1357421875, "learning_rate": 4.82888637069651e-05, "loss": 1.2725, "step": 1075 }, { "epoch": 0.7088285664475505, "grad_norm": 0.1328125, "learning_rate": 4.731091699512215e-05, "loss": 1.2578, "step": 1080 }, { "epoch": 0.7121101801811041, "grad_norm": 0.1357421875, "learning_rate": 4.6339896793863804e-05, "loss": 1.2784, "step": 1085 }, { "epoch": 0.7153917939146576, "grad_norm": 0.13671875, "learning_rate": 4.537593075357451e-05, "loss": 1.2708, "step": 1090 }, { "epoch": 0.718673407648211, "grad_norm": 0.13671875, "learning_rate": 4.441914559729825e-05, "loss": 1.2797, "step": 1095 }, { "epoch": 0.7219550213817645, "grad_norm": 0.13671875, "learning_rate": 4.346966710407937e-05, "loss": 1.3013, "step": 1100 }, { "epoch": 0.725236635115318, "grad_norm": 0.134765625, "learning_rate": 4.2527620092428e-05, "loss": 1.2535, "step": 1105 }, { "epoch": 0.7285182488488714, "grad_norm": 0.1337890625, "learning_rate": 4.159312840391086e-05, "loss": 1.2779, "step": 1110 }, { "epoch": 0.7317998625824249, "grad_norm": 0.134765625, "learning_rate": 4.066631488687166e-05, "loss": 1.2659, "step": 1115 }, { "epoch": 0.7350814763159784, "grad_norm": 0.1337890625, "learning_rate": 3.974730138028095e-05, "loss": 1.2653, "step": 1120 }, { "epoch": 0.7383630900495318, "grad_norm": 0.130859375, "learning_rate": 3.883620869771943e-05, "loss": 1.2735, "step": 1125 }, { "epoch": 0.7416447037830853, "grad_norm": 0.134765625, "learning_rate": 3.79331566114957e-05, "loss": 1.2653, "step": 1130 }, { "epoch": 0.7449263175166388, "grad_norm": 0.134765625, "learning_rate": 3.703826383690099e-05, "loss": 1.262, "step": 1135 }, { "epoch": 0.7482079312501922, "grad_norm": 0.1328125, "learning_rate": 3.6151648016602794e-05, "loss": 1.2491, "step": 1140 }, { "epoch": 0.7514895449837458, "grad_norm": 0.1357421875, "learning_rate": 3.527342570517975e-05, "loss": 1.2551, "step": 1145 }, { "epoch": 0.7547711587172993, "grad_norm": 0.1328125, "learning_rate": 3.44037123537991e-05, "loss": 1.2605, "step": 1150 }, { "epoch": 0.7580527724508527, "grad_norm": 0.134765625, "learning_rate": 3.3542622295039593e-05, "loss": 1.2621, "step": 1155 }, { "epoch": 0.7613343861844062, "grad_norm": 0.12890625, "learning_rate": 3.269026872786145e-05, "loss": 1.2798, "step": 1160 }, { "epoch": 0.7646159999179597, "grad_norm": 0.1318359375, "learning_rate": 3.184676370272488e-05, "loss": 1.2823, "step": 1165 }, { "epoch": 0.7678976136515131, "grad_norm": 0.1318359375, "learning_rate": 3.1012218106860345e-05, "loss": 1.284, "step": 1170 }, { "epoch": 0.7711792273850666, "grad_norm": 0.1357421875, "learning_rate": 3.0186741649690963e-05, "loss": 1.2825, "step": 1175 }, { "epoch": 0.7744608411186201, "grad_norm": 0.1337890625, "learning_rate": 2.937044284841026e-05, "loss": 1.2561, "step": 1180 }, { "epoch": 0.7777424548521735, "grad_norm": 0.1337890625, "learning_rate": 2.8563429013716514e-05, "loss": 1.2587, "step": 1185 }, { "epoch": 0.781024068585727, "grad_norm": 0.1328125, "learning_rate": 2.7765806235705594e-05, "loss": 1.2545, "step": 1190 }, { "epoch": 0.7843056823192806, "grad_norm": 0.1337890625, "learning_rate": 2.6977679369924357e-05, "loss": 1.2553, "step": 1195 }, { "epoch": 0.787587296052834, "grad_norm": 0.1337890625, "learning_rate": 2.6199152023586503e-05, "loss": 1.2713, "step": 1200 }, { "epoch": 0.7908689097863875, "grad_norm": 0.1318359375, "learning_rate": 2.5430326541952087e-05, "loss": 1.2593, "step": 1205 }, { "epoch": 0.794150523519941, "grad_norm": 0.13671875, "learning_rate": 2.4671303994873373e-05, "loss": 1.2509, "step": 1210 }, { "epoch": 0.7974321372534944, "grad_norm": 0.1328125, "learning_rate": 2.3922184163508254e-05, "loss": 1.2682, "step": 1215 }, { "epoch": 0.8007137509870479, "grad_norm": 0.134765625, "learning_rate": 2.3183065527202718e-05, "loss": 1.2596, "step": 1220 }, { "epoch": 0.8039953647206014, "grad_norm": 0.1337890625, "learning_rate": 2.245404525054515e-05, "loss": 1.2634, "step": 1225 }, { "epoch": 0.8072769784541548, "grad_norm": 0.130859375, "learning_rate": 2.1735219170592734e-05, "loss": 1.2717, "step": 1230 }, { "epoch": 0.8105585921877083, "grad_norm": 0.1357421875, "learning_rate": 2.1026681784272872e-05, "loss": 1.2607, "step": 1235 }, { "epoch": 0.8138402059212618, "grad_norm": 0.1337890625, "learning_rate": 2.0328526235960565e-05, "loss": 1.2733, "step": 1240 }, { "epoch": 0.8171218196548152, "grad_norm": 0.1318359375, "learning_rate": 1.9640844305233642e-05, "loss": 1.2696, "step": 1245 }, { "epoch": 0.8204034333883687, "grad_norm": 0.1318359375, "learning_rate": 1.8963726394807424e-05, "loss": 1.2779, "step": 1250 }, { "epoch": 0.8236850471219223, "grad_norm": 0.1328125, "learning_rate": 1.8297261518650456e-05, "loss": 1.2668, "step": 1255 }, { "epoch": 0.8269666608554757, "grad_norm": 0.130859375, "learning_rate": 1.7641537290282472e-05, "loss": 1.2646, "step": 1260 }, { "epoch": 0.8302482745890292, "grad_norm": 0.130859375, "learning_rate": 1.699663991125705e-05, "loss": 1.2696, "step": 1265 }, { "epoch": 0.8335298883225827, "grad_norm": 0.12890625, "learning_rate": 1.636265415982936e-05, "loss": 1.2604, "step": 1270 }, { "epoch": 0.8368115020561361, "grad_norm": 0.1328125, "learning_rate": 1.5739663379811122e-05, "loss": 1.2664, "step": 1275 }, { "epoch": 0.8400931157896896, "grad_norm": 0.130859375, "learning_rate": 1.512774946961445e-05, "loss": 1.2804, "step": 1280 }, { "epoch": 0.8433747295232431, "grad_norm": 0.1337890625, "learning_rate": 1.4526992871485345e-05, "loss": 1.2641, "step": 1285 }, { "epoch": 0.8466563432567965, "grad_norm": 0.1298828125, "learning_rate": 1.3937472560928733e-05, "loss": 1.2795, "step": 1290 }, { "epoch": 0.84993795699035, "grad_norm": 0.1318359375, "learning_rate": 1.3359266036326412e-05, "loss": 1.2659, "step": 1295 }, { "epoch": 0.8532195707239035, "grad_norm": 0.1337890625, "learning_rate": 1.2792449308749076e-05, "loss": 1.2643, "step": 1300 }, { "epoch": 0.8565011844574569, "grad_norm": 0.1298828125, "learning_rate": 1.2237096891963862e-05, "loss": 1.2812, "step": 1305 }, { "epoch": 0.8597827981910104, "grad_norm": 0.1298828125, "learning_rate": 1.1693281792638877e-05, "loss": 1.2669, "step": 1310 }, { "epoch": 0.863064411924564, "grad_norm": 0.1298828125, "learning_rate": 1.1161075500745543e-05, "loss": 1.2734, "step": 1315 }, { "epoch": 0.8663460256581174, "grad_norm": 0.1318359375, "learning_rate": 1.0640547980160742e-05, "loss": 1.2607, "step": 1320 }, { "epoch": 0.8696276393916709, "grad_norm": 0.1298828125, "learning_rate": 1.0131767659469205e-05, "loss": 1.2717, "step": 1325 }, { "epoch": 0.8729092531252244, "grad_norm": 0.1376953125, "learning_rate": 9.634801422967887e-06, "loss": 1.2767, "step": 1330 }, { "epoch": 0.8761908668587778, "grad_norm": 0.1337890625, "learning_rate": 9.149714601873516e-06, "loss": 1.274, "step": 1335 }, { "epoch": 0.8794724805923313, "grad_norm": 0.1328125, "learning_rate": 8.67657096573391e-06, "loss": 1.2553, "step": 1340 }, { "epoch": 0.8827540943258848, "grad_norm": 0.1279296875, "learning_rate": 8.215432714045024e-06, "loss": 1.2758, "step": 1345 }, { "epoch": 0.8860357080594382, "grad_norm": 0.1279296875, "learning_rate": 7.766360468074074e-06, "loss": 1.288, "step": 1350 }, { "epoch": 0.8893173217929917, "grad_norm": 0.1318359375, "learning_rate": 7.32941326289035e-06, "loss": 1.2421, "step": 1355 }, { "epoch": 0.8925989355265452, "grad_norm": 0.130859375, "learning_rate": 6.904648539604364e-06, "loss": 1.2517, "step": 1360 }, { "epoch": 0.8958805492600986, "grad_norm": 0.1328125, "learning_rate": 6.4921221378167915e-06, "loss": 1.2712, "step": 1365 }, { "epoch": 0.8991621629936521, "grad_norm": 0.12890625, "learning_rate": 6.091888288277569e-06, "loss": 1.264, "step": 1370 }, { "epoch": 0.9024437767272055, "grad_norm": 0.1328125, "learning_rate": 5.70399960575696e-06, "loss": 1.2713, "step": 1375 }, { "epoch": 0.9057253904607591, "grad_norm": 0.1337890625, "learning_rate": 5.328507082128642e-06, "loss": 1.272, "step": 1380 }, { "epoch": 0.9090070041943126, "grad_norm": 0.12890625, "learning_rate": 4.965460079666362e-06, "loss": 1.2672, "step": 1385 }, { "epoch": 0.912288617927866, "grad_norm": 0.1298828125, "learning_rate": 4.61490632455478e-06, "loss": 1.2732, "step": 1390 }, { "epoch": 0.9155702316614195, "grad_norm": 0.130859375, "learning_rate": 4.2768919006153876e-06, "loss": 1.2467, "step": 1395 }, { "epoch": 0.918851845394973, "grad_norm": 0.1328125, "learning_rate": 3.951461243248311e-06, "loss": 1.2634, "step": 1400 }, { "epoch": 0.9221334591285264, "grad_norm": 0.130859375, "learning_rate": 3.638657133590817e-06, "loss": 1.2571, "step": 1405 }, { "epoch": 0.9254150728620799, "grad_norm": 0.134765625, "learning_rate": 3.3385206928933097e-06, "loss": 1.2528, "step": 1410 }, { "epoch": 0.9286966865956334, "grad_norm": 0.130859375, "learning_rate": 3.0510913771135463e-06, "loss": 1.2647, "step": 1415 }, { "epoch": 0.9319783003291868, "grad_norm": 0.1337890625, "learning_rate": 2.7764069717297724e-06, "loss": 1.2769, "step": 1420 }, { "epoch": 0.9352599140627403, "grad_norm": 0.1318359375, "learning_rate": 2.5145035867733312e-06, "loss": 1.2616, "step": 1425 }, { "epoch": 0.9385415277962939, "grad_norm": 0.1357421875, "learning_rate": 2.265415652081804e-06, "loss": 1.2698, "step": 1430 }, { "epoch": 0.9418231415298473, "grad_norm": 0.1337890625, "learning_rate": 2.0291759127727294e-06, "loss": 1.2415, "step": 1435 }, { "epoch": 0.9451047552634008, "grad_norm": 0.1279296875, "learning_rate": 1.8058154249389502e-06, "loss": 1.2907, "step": 1440 }, { "epoch": 0.9483863689969543, "grad_norm": 0.130859375, "learning_rate": 1.5953635515660425e-06, "loss": 1.2786, "step": 1445 }, { "epoch": 0.9516679827305077, "grad_norm": 0.1337890625, "learning_rate": 1.3978479586721716e-06, "loss": 1.2634, "step": 1450 }, { "epoch": 0.9549495964640612, "grad_norm": 0.1337890625, "learning_rate": 1.2132946116711897e-06, "loss": 1.2866, "step": 1455 }, { "epoch": 0.9582312101976147, "grad_norm": 0.12890625, "learning_rate": 1.0417277719591667e-06, "loss": 1.2671, "step": 1460 }, { "epoch": 0.9615128239311681, "grad_norm": 0.1328125, "learning_rate": 8.831699937249859e-07, "loss": 1.251, "step": 1465 }, { "epoch": 0.9647944376647216, "grad_norm": 0.1318359375, "learning_rate": 7.376421209854267e-07, "loss": 1.2793, "step": 1470 }, { "epoch": 0.9680760513982751, "grad_norm": 0.126953125, "learning_rate": 6.051632848449562e-07, "loss": 1.2684, "step": 1475 }, { "epoch": 0.9713576651318285, "grad_norm": 0.1318359375, "learning_rate": 4.857509009807304e-07, "loss": 1.2605, "step": 1480 }, { "epoch": 0.974639278865382, "grad_norm": 0.130859375, "learning_rate": 3.7942066735321414e-07, "loss": 1.2608, "step": 1485 }, { "epoch": 0.9779208925989356, "grad_norm": 0.1318359375, "learning_rate": 2.861865621424431e-07, "loss": 1.2735, "step": 1490 }, { "epoch": 0.981202506332489, "grad_norm": 0.130859375, "learning_rate": 2.060608419105048e-07, "loss": 1.2788, "step": 1495 }, { "epoch": 0.9844841200660425, "grad_norm": 0.2451171875, "learning_rate": 1.3905403999024957e-07, "loss": 1.264, "step": 1500 }, { "epoch": 0.987765733799596, "grad_norm": 0.130859375, "learning_rate": 8.517496510059841e-08, "loss": 1.2673, "step": 1505 }, { "epoch": 0.9910473475331494, "grad_norm": 0.12890625, "learning_rate": 4.4430700188569095e-08, "loss": 1.2753, "step": 1510 }, { "epoch": 0.9943289612667029, "grad_norm": 0.130859375, "learning_rate": 1.6826601498098894e-08, "loss": 1.2567, "step": 1515 }, { "epoch": 0.9976105750002564, "grad_norm": 0.1337890625, "learning_rate": 2.3662978659633183e-09, "loss": 1.2568, "step": 1520 }, { "epoch": 0.9995795432403884, "eval_loss": 1.4362765550613403, "eval_runtime": 1174.833, "eval_samples_per_second": 12.068, "eval_steps_per_second": 12.068, "step": 1523 }, { "epoch": 0.9995795432403884, "step": 1523, "total_flos": 2.6010044317889987e+18, "train_loss": 1.1185581020360233, "train_runtime": 52635.226, "train_samples_per_second": 3.705, "train_steps_per_second": 0.029 } ], "logging_steps": 5, "max_steps": 1523, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 2.6010044317889987e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }