{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.85925925925926, "eval_steps": 500, "global_step": 670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014814814814814815, "grad_norm": 0.4682641327381134, "learning_rate": 2.9850746268656717e-05, "loss": 1.4595, "step": 1 }, { "epoch": 0.07407407407407407, "grad_norm": 0.30114030838012695, "learning_rate": 0.00014925373134328358, "loss": 1.4529, "step": 5 }, { "epoch": 0.14814814814814814, "grad_norm": 0.2646062970161438, "learning_rate": 0.00029850746268656717, "loss": 1.3781, "step": 10 }, { "epoch": 0.2222222222222222, "grad_norm": 0.2039109170436859, "learning_rate": 0.00044776119402985075, "loss": 1.2598, "step": 15 }, { "epoch": 0.2962962962962963, "grad_norm": 0.12383515387773514, "learning_rate": 0.0005970149253731343, "loss": 1.1834, "step": 20 }, { "epoch": 0.37037037037037035, "grad_norm": 0.1035536378622055, "learning_rate": 0.0007462686567164179, "loss": 1.1305, "step": 25 }, { "epoch": 0.4444444444444444, "grad_norm": 0.09090688824653625, "learning_rate": 0.0008955223880597015, "loss": 1.0904, "step": 30 }, { "epoch": 0.5185185185185185, "grad_norm": 0.1432366669178009, "learning_rate": 0.001044776119402985, "loss": 1.0762, "step": 35 }, { "epoch": 0.5925925925925926, "grad_norm": 0.07171270251274109, "learning_rate": 0.0011940298507462687, "loss": 1.0618, "step": 40 }, { "epoch": 0.6666666666666666, "grad_norm": 0.07491806894540787, "learning_rate": 0.0013432835820895524, "loss": 1.0421, "step": 45 }, { "epoch": 0.7407407407407407, "grad_norm": 0.06790623813867569, "learning_rate": 0.0014925373134328358, "loss": 1.0302, "step": 50 }, { "epoch": 0.8148148148148148, "grad_norm": 0.08844709396362305, "learning_rate": 0.0016417910447761195, "loss": 1.018, "step": 55 }, { "epoch": 0.8888888888888888, "grad_norm": 0.08857131749391556, "learning_rate": 0.001791044776119403, "loss": 1.0156, "step": 60 }, { "epoch": 0.9629629629629629, "grad_norm": 0.09674689918756485, "learning_rate": 0.0019402985074626867, "loss": 1.0031, "step": 65 }, { "epoch": 1.0, "eval_loss": 1.551000714302063, "eval_runtime": 0.869, "eval_samples_per_second": 4.603, "eval_steps_per_second": 1.151, "step": 68 }, { "epoch": 1.0296296296296297, "grad_norm": 0.09539825469255447, "learning_rate": 0.001999877856940653, "loss": 0.9937, "step": 70 }, { "epoch": 1.1037037037037036, "grad_norm": 0.09851890057325363, "learning_rate": 0.0019991315351855746, "loss": 0.9895, "step": 75 }, { "epoch": 1.1777777777777778, "grad_norm": 0.06911145895719528, "learning_rate": 0.0019977072547317748, "loss": 0.9817, "step": 80 }, { "epoch": 1.2518518518518518, "grad_norm": 0.06769894808530807, "learning_rate": 0.001995605982021898, "loss": 0.9762, "step": 85 }, { "epoch": 1.325925925925926, "grad_norm": 0.06828448921442032, "learning_rate": 0.001992829142870326, "loss": 0.9743, "step": 90 }, { "epoch": 1.4, "grad_norm": 0.06951478868722916, "learning_rate": 0.0019893786214956943, "loss": 0.9743, "step": 95 }, { "epoch": 1.474074074074074, "grad_norm": 0.06752126663923264, "learning_rate": 0.001985256759242359, "loss": 0.9718, "step": 100 }, { "epoch": 1.5481481481481483, "grad_norm": 0.06669533252716064, "learning_rate": 0.0019804663529916825, "loss": 0.9743, "step": 105 }, { "epoch": 1.6222222222222222, "grad_norm": 0.06977611780166626, "learning_rate": 0.001975010653264216, "loss": 0.9678, "step": 110 }, { "epoch": 1.6962962962962962, "grad_norm": 0.07217196375131607, "learning_rate": 0.0019688933620140635, "loss": 0.9694, "step": 115 }, { "epoch": 1.7703703703703704, "grad_norm": 0.06247986480593681, "learning_rate": 0.0019621186301169314, "loss": 0.9625, "step": 120 }, { "epoch": 1.8444444444444446, "grad_norm": 0.07415565848350525, "learning_rate": 0.001954691054553556, "loss": 0.9697, "step": 125 }, { "epoch": 1.9185185185185185, "grad_norm": 0.07004866003990173, "learning_rate": 0.0019466156752904343, "loss": 0.957, "step": 130 }, { "epoch": 1.9925925925925925, "grad_norm": 0.06320279091596603, "learning_rate": 0.0019378979718599645, "loss": 0.9546, "step": 135 }, { "epoch": 2.0, "eval_loss": 1.5149173736572266, "eval_runtime": 0.8697, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 136 }, { "epoch": 2.0592592592592593, "grad_norm": 0.07447217404842377, "learning_rate": 0.0019285438596423204, "loss": 0.9443, "step": 140 }, { "epoch": 2.1333333333333333, "grad_norm": 0.06741169095039368, "learning_rate": 0.0019185596858515798, "loss": 0.9371, "step": 145 }, { "epoch": 2.2074074074074073, "grad_norm": 0.06852757930755615, "learning_rate": 0.0019079522252288387, "loss": 0.9395, "step": 150 }, { "epoch": 2.2814814814814817, "grad_norm": 0.06586603075265884, "learning_rate": 0.0018967286754452213, "loss": 0.937, "step": 155 }, { "epoch": 2.3555555555555556, "grad_norm": 0.0683656558394432, "learning_rate": 0.0018848966522179167, "loss": 0.9336, "step": 160 }, { "epoch": 2.4296296296296296, "grad_norm": 0.07259602099657059, "learning_rate": 0.001872464184142548, "loss": 0.935, "step": 165 }, { "epoch": 2.5037037037037035, "grad_norm": 0.06436455249786377, "learning_rate": 0.0018594397072453856, "loss": 0.9316, "step": 170 }, { "epoch": 2.5777777777777775, "grad_norm": 0.08042966574430466, "learning_rate": 0.0018458320592590974, "loss": 0.938, "step": 175 }, { "epoch": 2.651851851851852, "grad_norm": 0.0699801966547966, "learning_rate": 0.0018316504736259254, "loss": 0.9422, "step": 180 }, { "epoch": 2.725925925925926, "grad_norm": 0.06373833864927292, "learning_rate": 0.0018169045732323492, "loss": 0.9348, "step": 185 }, { "epoch": 2.8, "grad_norm": 0.07165364176034927, "learning_rate": 0.0018016043638794975, "loss": 0.9354, "step": 190 }, { "epoch": 2.8740740740740742, "grad_norm": 0.06121128425002098, "learning_rate": 0.0017857602274937308, "loss": 0.9386, "step": 195 }, { "epoch": 2.948148148148148, "grad_norm": 0.06334740668535233, "learning_rate": 0.0017693829150820068, "loss": 0.936, "step": 200 }, { "epoch": 3.0, "eval_loss": 1.508521318435669, "eval_runtime": 0.8697, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 204 }, { "epoch": 3.0148148148148146, "grad_norm": 0.07033156603574753, "learning_rate": 0.0017524835394368066, "loss": 0.9317, "step": 205 }, { "epoch": 3.088888888888889, "grad_norm": 0.06662800908088684, "learning_rate": 0.0017350735675955695, "loss": 0.9145, "step": 210 }, { "epoch": 3.162962962962963, "grad_norm": 0.06688813865184784, "learning_rate": 0.001717164813059761, "loss": 0.9094, "step": 215 }, { "epoch": 3.237037037037037, "grad_norm": 0.07399953156709671, "learning_rate": 0.0016987694277788418, "loss": 0.9147, "step": 220 }, { "epoch": 3.311111111111111, "grad_norm": 0.06779713183641434, "learning_rate": 0.0016798998939045893, "loss": 0.9123, "step": 225 }, { "epoch": 3.3851851851851853, "grad_norm": 0.06676509976387024, "learning_rate": 0.001660569015321357, "loss": 0.9099, "step": 230 }, { "epoch": 3.4592592592592593, "grad_norm": 0.06683938950300217, "learning_rate": 0.001640789908958026, "loss": 0.9112, "step": 235 }, { "epoch": 3.533333333333333, "grad_norm": 0.06712319701910019, "learning_rate": 0.001620575995887538, "loss": 0.914, "step": 240 }, { "epoch": 3.6074074074074076, "grad_norm": 0.06718605011701584, "learning_rate": 0.001599940992220053, "loss": 0.9156, "step": 245 }, { "epoch": 3.6814814814814816, "grad_norm": 0.06765800714492798, "learning_rate": 0.0015788988997959114, "loss": 0.9168, "step": 250 }, { "epoch": 3.7555555555555555, "grad_norm": 0.06374535709619522, "learning_rate": 0.0015574639966847127, "loss": 0.9114, "step": 255 }, { "epoch": 3.8296296296296295, "grad_norm": 0.06388971954584122, "learning_rate": 0.0015356508274969594, "loss": 0.9139, "step": 260 }, { "epoch": 3.9037037037037035, "grad_norm": 0.0656428337097168, "learning_rate": 0.0015134741935148419, "loss": 0.916, "step": 265 }, { "epoch": 3.977777777777778, "grad_norm": 0.06783714145421982, "learning_rate": 0.0014909491426488577, "loss": 0.9186, "step": 270 }, { "epoch": 4.0, "eval_loss": 1.517486810684204, "eval_runtime": 0.8755, "eval_samples_per_second": 4.569, "eval_steps_per_second": 1.142, "step": 272 }, { "epoch": 4.044444444444444, "grad_norm": 0.06940994411706924, "learning_rate": 0.001468090959227082, "loss": 0.9011, "step": 275 }, { "epoch": 4.118518518518519, "grad_norm": 0.06819378584623337, "learning_rate": 0.0014449151536240167, "loss": 0.8866, "step": 280 }, { "epoch": 4.192592592592592, "grad_norm": 0.0655524805188179, "learning_rate": 0.0014214374517360576, "loss": 0.8916, "step": 285 }, { "epoch": 4.266666666666667, "grad_norm": 0.06668845564126968, "learning_rate": 0.0013976737843107202, "loss": 0.8871, "step": 290 }, { "epoch": 4.340740740740741, "grad_norm": 0.06470604240894318, "learning_rate": 0.0013736402761368597, "loss": 0.8928, "step": 295 }, { "epoch": 4.4148148148148145, "grad_norm": 0.06732232868671417, "learning_rate": 0.0013493532351032318, "loss": 0.8985, "step": 300 }, { "epoch": 4.488888888888889, "grad_norm": 0.0662841871380806, "learning_rate": 0.0013248291411328047, "loss": 0.8869, "step": 305 }, { "epoch": 4.562962962962963, "grad_norm": 0.06613945215940475, "learning_rate": 0.001300084635000341, "loss": 0.8963, "step": 310 }, { "epoch": 4.637037037037037, "grad_norm": 0.06735741347074509, "learning_rate": 0.0012751365070408334, "loss": 0.9035, "step": 315 }, { "epoch": 4.711111111111111, "grad_norm": 0.06463445723056793, "learning_rate": 0.0012500016857564585, "loss": 0.8966, "step": 320 }, { "epoch": 4.785185185185185, "grad_norm": 0.06602155417203903, "learning_rate": 0.0012246972263297718, "loss": 0.895, "step": 325 }, { "epoch": 4.859259259259259, "grad_norm": 0.06352429836988449, "learning_rate": 0.0011992402990509514, "loss": 0.894, "step": 330 }, { "epoch": 4.933333333333334, "grad_norm": 0.06808946281671524, "learning_rate": 0.0011736481776669307, "loss": 0.8969, "step": 335 }, { "epoch": 5.0, "grad_norm": 0.08401331305503845, "learning_rate": 0.0011479382276603299, "loss": 0.8948, "step": 340 }, { "epoch": 5.0, "eval_loss": 1.5301542282104492, "eval_runtime": 0.8691, "eval_samples_per_second": 4.602, "eval_steps_per_second": 1.151, "step": 340 }, { "epoch": 5.074074074074074, "grad_norm": 0.06839559227228165, "learning_rate": 0.0011221278944661473, "loss": 0.8678, "step": 345 }, { "epoch": 5.148148148148148, "grad_norm": 0.06838098913431168, "learning_rate": 0.0010962346916341904, "loss": 0.8666, "step": 350 }, { "epoch": 5.222222222222222, "grad_norm": 0.06836072355508804, "learning_rate": 0.001070276188945293, "loss": 0.8731, "step": 355 }, { "epoch": 5.296296296296296, "grad_norm": 0.06789132207632065, "learning_rate": 0.0010442700004893765, "loss": 0.8724, "step": 360 }, { "epoch": 5.37037037037037, "grad_norm": 0.06825467944145203, "learning_rate": 0.001018233772713443, "loss": 0.8757, "step": 365 }, { "epoch": 5.444444444444445, "grad_norm": 0.06852041184902191, "learning_rate": 0.000992185172447616, "loss": 0.8762, "step": 370 }, { "epoch": 5.518518518518518, "grad_norm": 0.06898131966590881, "learning_rate": 0.0009661418749173466, "loss": 0.8731, "step": 375 }, { "epoch": 5.592592592592593, "grad_norm": 0.06875770539045334, "learning_rate": 0.0009401215517499251, "loss": 0.8746, "step": 380 }, { "epoch": 5.666666666666667, "grad_norm": 0.06649214774370193, "learning_rate": 0.0009141418589834339, "loss": 0.8748, "step": 385 }, { "epoch": 5.7407407407407405, "grad_norm": 0.06804858148097992, "learning_rate": 0.0008882204250862795, "loss": 0.8783, "step": 390 }, { "epoch": 5.814814814814815, "grad_norm": 0.06907966732978821, "learning_rate": 0.0008623748389954282, "loss": 0.8822, "step": 395 }, { "epoch": 5.888888888888889, "grad_norm": 0.0679902508854866, "learning_rate": 0.0008366226381814697, "loss": 0.8777, "step": 400 }, { "epoch": 5.962962962962963, "grad_norm": 0.06677145510911942, "learning_rate": 0.0008109812967486025, "loss": 0.8742, "step": 405 }, { "epoch": 6.0, "eval_loss": 1.5502283573150635, "eval_runtime": 0.8693, "eval_samples_per_second": 4.602, "eval_steps_per_second": 1.15, "step": 408 }, { "epoch": 6.029629629629629, "grad_norm": 0.06917522847652435, "learning_rate": 0.0007854682135776132, "loss": 0.8605, "step": 410 }, { "epoch": 6.103703703703704, "grad_norm": 0.07051009684801102, "learning_rate": 0.0007601007005199021, "loss": 0.8501, "step": 415 }, { "epoch": 6.177777777777778, "grad_norm": 0.07272496819496155, "learning_rate": 0.0007348959706505627, "loss": 0.8553, "step": 420 }, { "epoch": 6.2518518518518515, "grad_norm": 0.07074420154094696, "learning_rate": 0.000709871126588481, "loss": 0.8496, "step": 425 }, { "epoch": 6.325925925925926, "grad_norm": 0.07021532952785492, "learning_rate": 0.0006850431488913895, "loss": 0.8547, "step": 430 }, { "epoch": 6.4, "grad_norm": 0.07260267436504364, "learning_rate": 0.0006604288845337453, "loss": 0.8568, "step": 435 }, { "epoch": 6.474074074074074, "grad_norm": 0.06939396262168884, "learning_rate": 0.0006360450354752458, "loss": 0.8561, "step": 440 }, { "epoch": 6.548148148148148, "grad_norm": 0.06964612007141113, "learning_rate": 0.0006119081473277501, "loss": 0.8577, "step": 445 }, { "epoch": 6.622222222222222, "grad_norm": 0.06987880170345306, "learning_rate": 0.0005880345981282876, "loss": 0.858, "step": 450 }, { "epoch": 6.696296296296296, "grad_norm": 0.06909282505512238, "learning_rate": 0.0005644405872257716, "loss": 0.8559, "step": 455 }, { "epoch": 6.770370370370371, "grad_norm": 0.0683453232049942, "learning_rate": 0.0005411421242889642, "loss": 0.8561, "step": 460 }, { "epoch": 6.844444444444444, "grad_norm": 0.0680374875664711, "learning_rate": 0.000518155018443151, "loss": 0.859, "step": 465 }, { "epoch": 6.9185185185185185, "grad_norm": 0.067069411277771, "learning_rate": 0.0004954948675428853, "loss": 0.8489, "step": 470 }, { "epoch": 6.992592592592593, "grad_norm": 0.06691515445709229, "learning_rate": 0.00047317704758809945, "loss": 0.8556, "step": 475 }, { "epoch": 7.0, "eval_loss": 1.5617406368255615, "eval_runtime": 0.8711, "eval_samples_per_second": 4.592, "eval_steps_per_second": 1.148, "step": 476 }, { "epoch": 7.059259259259259, "grad_norm": 0.08424794673919678, "learning_rate": 0.0004512167022907494, "loss": 0.8413, "step": 480 }, { "epoch": 7.133333333333334, "grad_norm": 0.07284523546695709, "learning_rate": 0.00042962873279907965, "loss": 0.8329, "step": 485 }, { "epoch": 7.207407407407407, "grad_norm": 0.06989779323339462, "learning_rate": 0.0004084277875864776, "loss": 0.8368, "step": 490 }, { "epoch": 7.281481481481482, "grad_norm": 0.0744442567229271, "learning_rate": 0.0003876282525117847, "loss": 0.831, "step": 495 }, { "epoch": 7.355555555555555, "grad_norm": 0.07233459502458572, "learning_rate": 0.0003672442410577965, "loss": 0.8344, "step": 500 }, { "epoch": 7.42962962962963, "grad_norm": 0.07147523015737534, "learning_rate": 0.0003472895847545905, "loss": 0.837, "step": 505 }, { "epoch": 7.503703703703704, "grad_norm": 0.0732484832406044, "learning_rate": 0.000327777823794168, "loss": 0.8427, "step": 510 }, { "epoch": 7.5777777777777775, "grad_norm": 0.0711125060915947, "learning_rate": 0.00030872219784278354, "loss": 0.8394, "step": 515 }, { "epoch": 7.651851851851852, "grad_norm": 0.07285265624523163, "learning_rate": 0.0002901356370571967, "loss": 0.8336, "step": 520 }, { "epoch": 7.725925925925926, "grad_norm": 0.07154905050992966, "learning_rate": 0.0002720307533109402, "loss": 0.8403, "step": 525 }, { "epoch": 7.8, "grad_norm": 0.07089488953351974, "learning_rate": 0.000254419831636557, "loss": 0.839, "step": 530 }, { "epoch": 7.874074074074074, "grad_norm": 0.0709661915898323, "learning_rate": 0.00023731482188961818, "loss": 0.8353, "step": 535 }, { "epoch": 7.948148148148148, "grad_norm": 0.07034063339233398, "learning_rate": 0.00022072733064017102, "loss": 0.8428, "step": 540 }, { "epoch": 8.0, "eval_loss": 1.596451997756958, "eval_runtime": 0.8703, "eval_samples_per_second": 4.596, "eval_steps_per_second": 1.149, "step": 544 }, { "epoch": 8.014814814814814, "grad_norm": 0.07084991037845612, "learning_rate": 0.00020466861329712473, "loss": 0.8359, "step": 545 }, { "epoch": 8.088888888888889, "grad_norm": 0.07405474036931992, "learning_rate": 0.00018914956647091496, "loss": 0.8195, "step": 550 }, { "epoch": 8.162962962962963, "grad_norm": 0.07152204215526581, "learning_rate": 0.0001741807205796314, "loss": 0.8289, "step": 555 }, { "epoch": 8.237037037037037, "grad_norm": 0.0712200403213501, "learning_rate": 0.00015977223270362194, "loss": 0.8271, "step": 560 }, { "epoch": 8.311111111111112, "grad_norm": 0.07045566290616989, "learning_rate": 0.0001459338796934293, "loss": 0.829, "step": 565 }, { "epoch": 8.385185185185184, "grad_norm": 0.0720411017537117, "learning_rate": 0.000132675051535725, "loss": 0.8265, "step": 570 }, { "epoch": 8.459259259259259, "grad_norm": 0.07052139192819595, "learning_rate": 0.00012000474498175551, "loss": 0.8226, "step": 575 }, { "epoch": 8.533333333333333, "grad_norm": 0.07078087329864502, "learning_rate": 0.00010793155744261352, "loss": 0.8241, "step": 580 }, { "epoch": 8.607407407407408, "grad_norm": 0.07028964906930923, "learning_rate": 9.646368115548232e-05, "loss": 0.8212, "step": 585 }, { "epoch": 8.681481481481482, "grad_norm": 0.0702112540602684, "learning_rate": 8.56088976248095e-05, "loss": 0.8232, "step": 590 }, { "epoch": 8.755555555555556, "grad_norm": 0.07041744887828827, "learning_rate": 7.53745723421827e-05, "loss": 0.8193, "step": 595 }, { "epoch": 8.829629629629629, "grad_norm": 0.06979186832904816, "learning_rate": 6.576764978849003e-05, "loss": 0.8186, "step": 600 }, { "epoch": 8.903703703703703, "grad_norm": 0.07058751583099365, "learning_rate": 5.679464872175666e-05, "loss": 0.8279, "step": 605 }, { "epoch": 8.977777777777778, "grad_norm": 0.07009345293045044, "learning_rate": 4.846165775385458e-05, "loss": 0.8168, "step": 610 }, { "epoch": 9.0, "eval_loss": 1.6216613054275513, "eval_runtime": 0.8697, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 612 }, { "epoch": 9.044444444444444, "grad_norm": 0.0695219412446022, "learning_rate": 4.077433121908747e-05, "loss": 0.8172, "step": 615 }, { "epoch": 9.118518518518519, "grad_norm": 0.07094599306583405, "learning_rate": 3.373788533745281e-05, "loss": 0.8207, "step": 620 }, { "epoch": 9.192592592592593, "grad_norm": 0.07142723351716995, "learning_rate": 2.7357094675186987e-05, "loss": 0.8101, "step": 625 }, { "epoch": 9.266666666666667, "grad_norm": 0.07132075726985931, "learning_rate": 2.1636288904992585e-05, "loss": 0.8137, "step": 630 }, { "epoch": 9.34074074074074, "grad_norm": 0.07085540145635605, "learning_rate": 1.6579349868147686e-05, "loss": 0.8103, "step": 635 }, { "epoch": 9.414814814814815, "grad_norm": 0.0697101578116417, "learning_rate": 1.218970894049065e-05, "loss": 0.8094, "step": 640 }, { "epoch": 9.488888888888889, "grad_norm": 0.07001277059316635, "learning_rate": 8.470344704066047e-06, "loss": 0.8233, "step": 645 }, { "epoch": 9.562962962962963, "grad_norm": 0.07051407545804977, "learning_rate": 5.42378092601481e-06, "loss": 0.8181, "step": 650 }, { "epoch": 9.637037037037038, "grad_norm": 0.07038593292236328, "learning_rate": 3.0520848460765526e-06, "loss": 0.8198, "step": 655 }, { "epoch": 9.71111111111111, "grad_norm": 0.06979399174451828, "learning_rate": 1.3568657738678436e-06, "loss": 0.8138, "step": 660 }, { "epoch": 9.785185185185185, "grad_norm": 0.07017084956169128, "learning_rate": 3.3927399688948866e-07, "loss": 0.8138, "step": 665 }, { "epoch": 9.85925925925926, "grad_norm": 0.07032209634780884, "learning_rate": 0.0, "loss": 0.8191, "step": 670 }, { "epoch": 9.85925925925926, "eval_loss": 1.630096197128296, "eval_runtime": 0.8812, "eval_samples_per_second": 4.539, "eval_steps_per_second": 1.135, "step": 670 }, { "epoch": 9.85925925925926, "step": 670, "total_flos": 2.9601022627828204e+18, "train_loss": 0.9062140895359552, "train_runtime": 3484.2972, "train_samples_per_second": 49.516, "train_steps_per_second": 0.192 } ], "logging_steps": 5, "max_steps": 670, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9601022627828204e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }