{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 192734, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010376996274658336, "grad_norm": 2.8394250869750977, "learning_rate": 0.0002998972677368809, "loss": 7.61689697265625, "step": 100 }, { "epoch": 0.0020753992549316673, "grad_norm": 1.2515239715576172, "learning_rate": 0.00029979349777413427, "loss": 6.9781201171875, "step": 200 }, { "epoch": 0.0031130988823975013, "grad_norm": 3.6369314193725586, "learning_rate": 0.0002996897278113877, "loss": 6.69011474609375, "step": 300 }, { "epoch": 0.0041507985098633345, "grad_norm": 2.6945459842681885, "learning_rate": 0.0002995859578486411, "loss": 6.55205078125, "step": 400 }, { "epoch": 0.005188498137329169, "grad_norm": 1.4870922565460205, "learning_rate": 0.0002994821878858945, "loss": 6.272236938476563, "step": 500 }, { "epoch": 0.006226197764795003, "grad_norm": 2.198580265045166, "learning_rate": 0.00029937841792314796, "loss": 6.2509613037109375, "step": 600 }, { "epoch": 0.007263897392260836, "grad_norm": 1.332912564277649, "learning_rate": 0.00029927464796040135, "loss": 6.2750787353515625, "step": 700 }, { "epoch": 0.008301597019726669, "grad_norm": 1.6891261339187622, "learning_rate": 0.0002991708779976548, "loss": 6.012156372070312, "step": 800 }, { "epoch": 0.009339296647192503, "grad_norm": 2.389779806137085, "learning_rate": 0.0002990671080349082, "loss": 6.011610717773437, "step": 900 }, { "epoch": 0.010376996274658337, "grad_norm": 3.896207332611084, "learning_rate": 0.0002989633380721616, "loss": 5.872296752929688, "step": 1000 }, { "epoch": 0.011414695902124171, "grad_norm": 1.2714102268218994, "learning_rate": 0.00029885956810941504, "loss": 5.8444580078125, "step": 1100 }, { "epoch": 0.012452395529590005, "grad_norm": 1.9793014526367188, "learning_rate": 0.00029875579814666844, "loss": 
5.780259399414063, "step": 1200 }, { "epoch": 0.01349009515705584, "grad_norm": 1.7210673093795776, "learning_rate": 0.0002986520281839219, "loss": 5.784580688476563, "step": 1300 }, { "epoch": 0.014527794784521672, "grad_norm": 3.133103609085083, "learning_rate": 0.0002985482582211753, "loss": 5.726546020507812, "step": 1400 }, { "epoch": 0.015565494411987506, "grad_norm": 3.7988669872283936, "learning_rate": 0.0002984444882584287, "loss": 5.659859619140625, "step": 1500 }, { "epoch": 0.016603194039453338, "grad_norm": 1.580628514289856, "learning_rate": 0.00029834071829568207, "loss": 5.710869140625, "step": 1600 }, { "epoch": 0.017640893666919174, "grad_norm": 2.1428017616271973, "learning_rate": 0.0002982369483329355, "loss": 5.61485107421875, "step": 1700 }, { "epoch": 0.018678593294385006, "grad_norm": 1.9413044452667236, "learning_rate": 0.00029813317837018897, "loss": 5.542117309570313, "step": 1800 }, { "epoch": 0.019716292921850842, "grad_norm": 1.9118558168411255, "learning_rate": 0.00029802940840744236, "loss": 5.524238891601563, "step": 1900 }, { "epoch": 0.020753992549316674, "grad_norm": 1.9226549863815308, "learning_rate": 0.00029792563844469576, "loss": 5.544407348632813, "step": 2000 }, { "epoch": 0.02179169217678251, "grad_norm": 3.6845390796661377, "learning_rate": 0.00029782186848194915, "loss": 5.507258911132812, "step": 2100 }, { "epoch": 0.022829391804248342, "grad_norm": 1.113272786140442, "learning_rate": 0.0002977180985192026, "loss": 5.420562133789063, "step": 2200 }, { "epoch": 0.023867091431714175, "grad_norm": 1.05723237991333, "learning_rate": 0.00029761432855645605, "loss": 5.467652587890625, "step": 2300 }, { "epoch": 0.02490479105918001, "grad_norm": 3.3967299461364746, "learning_rate": 0.00029751055859370944, "loss": 5.412258911132812, "step": 2400 }, { "epoch": 0.025942490686645843, "grad_norm": 2.4142208099365234, "learning_rate": 0.0002974067886309629, "loss": 5.421605224609375, "step": 2500 }, { "epoch": 0.02698019031411168, 
"grad_norm": 1.577314853668213, "learning_rate": 0.0002973030186682163, "loss": 5.2732666015625, "step": 2600 }, { "epoch": 0.02801788994157751, "grad_norm": 2.5680480003356934, "learning_rate": 0.0002971992487054697, "loss": 5.42623779296875, "step": 2700 }, { "epoch": 0.029055589569043343, "grad_norm": 1.665701150894165, "learning_rate": 0.0002970954787427231, "loss": 5.345192260742188, "step": 2800 }, { "epoch": 0.03009328919650918, "grad_norm": 1.3420246839523315, "learning_rate": 0.0002969917087799765, "loss": 5.259754028320312, "step": 2900 }, { "epoch": 0.03113098882397501, "grad_norm": 1.4943575859069824, "learning_rate": 0.00029688793881723, "loss": 5.325694580078125, "step": 3000 }, { "epoch": 0.032168688451440844, "grad_norm": 1.7797436714172363, "learning_rate": 0.00029678416885448337, "loss": 5.393818359375, "step": 3100 }, { "epoch": 0.033206388078906676, "grad_norm": 3.023359537124634, "learning_rate": 0.00029668039889173677, "loss": 5.23187255859375, "step": 3200 }, { "epoch": 0.034244087706372515, "grad_norm": 1.9899531602859497, "learning_rate": 0.00029657662892899016, "loss": 5.1434765625, "step": 3300 }, { "epoch": 0.03528178733383835, "grad_norm": 1.0039557218551636, "learning_rate": 0.0002964728589662436, "loss": 5.28422607421875, "step": 3400 }, { "epoch": 0.03631948696130418, "grad_norm": 1.9204686880111694, "learning_rate": 0.000296369089003497, "loss": 5.149194946289063, "step": 3500 }, { "epoch": 0.03735718658877001, "grad_norm": 1.5530883073806763, "learning_rate": 0.00029626531904075045, "loss": 5.0889456176757815, "step": 3600 }, { "epoch": 0.03839488621623585, "grad_norm": 1.4477442502975464, "learning_rate": 0.00029616154907800385, "loss": 5.225645751953125, "step": 3700 }, { "epoch": 0.039432585843701684, "grad_norm": 2.998966693878174, "learning_rate": 0.00029605777911525724, "loss": 5.127691650390625, "step": 3800 }, { "epoch": 0.040470285471167516, "grad_norm": 1.1760146617889404, "learning_rate": 0.0002959540091525107, "loss": 
5.099805908203125, "step": 3900 }, { "epoch": 0.04150798509863335, "grad_norm": 1.6684191226959229, "learning_rate": 0.0002958502391897641, "loss": 5.195625, "step": 4000 }, { "epoch": 0.04254568472609918, "grad_norm": 3.276620864868164, "learning_rate": 0.00029574646922701754, "loss": 5.0514678955078125, "step": 4100 }, { "epoch": 0.04358338435356502, "grad_norm": 1.505712628364563, "learning_rate": 0.00029564269926427093, "loss": 5.234470825195313, "step": 4200 }, { "epoch": 0.04462108398103085, "grad_norm": 1.561785101890564, "learning_rate": 0.0002955389293015243, "loss": 5.18435302734375, "step": 4300 }, { "epoch": 0.045658783608496685, "grad_norm": 2.103935956954956, "learning_rate": 0.0002954351593387778, "loss": 5.127916259765625, "step": 4400 }, { "epoch": 0.04669648323596252, "grad_norm": 1.1984394788742065, "learning_rate": 0.00029533138937603117, "loss": 5.009371032714844, "step": 4500 }, { "epoch": 0.04773418286342835, "grad_norm": 1.35122549533844, "learning_rate": 0.0002952276194132846, "loss": 4.988144836425781, "step": 4600 }, { "epoch": 0.04877188249089419, "grad_norm": 1.7199909687042236, "learning_rate": 0.000295123849450538, "loss": 5.139700317382813, "step": 4700 }, { "epoch": 0.04980958211836002, "grad_norm": 2.299783706665039, "learning_rate": 0.00029502007948779146, "loss": 5.189196166992187, "step": 4800 }, { "epoch": 0.050847281745825854, "grad_norm": 1.251342535018921, "learning_rate": 0.00029491630952504486, "loss": 5.0067724609375, "step": 4900 }, { "epoch": 0.051884981373291686, "grad_norm": 1.7228055000305176, "learning_rate": 0.00029481253956229825, "loss": 5.058696594238281, "step": 5000 }, { "epoch": 0.05292268100075752, "grad_norm": 1.2999722957611084, "learning_rate": 0.0002947087695995517, "loss": 4.953595275878906, "step": 5100 }, { "epoch": 0.05396038062822336, "grad_norm": 2.576788902282715, "learning_rate": 0.0002946049996368051, "loss": 4.935113220214844, "step": 5200 }, { "epoch": 0.05499808025568919, "grad_norm": 
3.006600856781006, "learning_rate": 0.00029450122967405854, "loss": 5.13054931640625, "step": 5300 }, { "epoch": 0.05603577988315502, "grad_norm": 1.5450797080993652, "learning_rate": 0.00029439745971131194, "loss": 4.888633117675782, "step": 5400 }, { "epoch": 0.057073479510620855, "grad_norm": 1.9071307182312012, "learning_rate": 0.00029429368974856533, "loss": 4.968219299316406, "step": 5500 }, { "epoch": 0.05811117913808669, "grad_norm": 1.2374857664108276, "learning_rate": 0.0002941899197858188, "loss": 5.0035269165039065, "step": 5600 }, { "epoch": 0.059148878765552526, "grad_norm": 1.270337462425232, "learning_rate": 0.0002940861498230722, "loss": 4.9964404296875, "step": 5700 }, { "epoch": 0.06018657839301836, "grad_norm": 2.112285614013672, "learning_rate": 0.0002939823798603256, "loss": 4.882070007324219, "step": 5800 }, { "epoch": 0.06122427802048419, "grad_norm": 1.2048200368881226, "learning_rate": 0.000293878609897579, "loss": 4.689561767578125, "step": 5900 }, { "epoch": 0.06226197764795002, "grad_norm": 1.213274359703064, "learning_rate": 0.0002937748399348324, "loss": 4.969376525878906, "step": 6000 }, { "epoch": 0.06329967727541586, "grad_norm": 1.1453360319137573, "learning_rate": 0.00029367106997208587, "loss": 4.848797302246094, "step": 6100 }, { "epoch": 0.06433737690288169, "grad_norm": 1.78568696975708, "learning_rate": 0.00029356730000933926, "loss": 4.889250793457031, "step": 6200 }, { "epoch": 0.06537507653034752, "grad_norm": 1.004668951034546, "learning_rate": 0.0002934635300465927, "loss": 4.881064758300782, "step": 6300 }, { "epoch": 0.06641277615781335, "grad_norm": 3.34089994430542, "learning_rate": 0.0002933597600838461, "loss": 4.922989501953125, "step": 6400 }, { "epoch": 0.0674504757852792, "grad_norm": 1.7132960557937622, "learning_rate": 0.00029325599012109955, "loss": 4.900790405273438, "step": 6500 }, { "epoch": 0.06848817541274503, "grad_norm": 3.6154215335845947, "learning_rate": 0.00029315222015835295, "loss": 
4.858998718261719, "step": 6600 }, { "epoch": 0.06952587504021086, "grad_norm": 2.199787139892578, "learning_rate": 0.00029304845019560634, "loss": 4.776265258789063, "step": 6700 }, { "epoch": 0.0705635746676767, "grad_norm": 1.193831443786621, "learning_rate": 0.0002929446802328598, "loss": 4.933597717285156, "step": 6800 }, { "epoch": 0.07160127429514253, "grad_norm": 1.0364950895309448, "learning_rate": 0.0002928409102701132, "loss": 4.812368469238281, "step": 6900 }, { "epoch": 0.07263897392260836, "grad_norm": 4.54287576675415, "learning_rate": 0.00029273714030736664, "loss": 4.874449157714844, "step": 7000 }, { "epoch": 0.07367667355007419, "grad_norm": 1.9481868743896484, "learning_rate": 0.00029263337034462003, "loss": 4.836025390625, "step": 7100 }, { "epoch": 0.07471437317754002, "grad_norm": 1.5283995866775513, "learning_rate": 0.0002925296003818734, "loss": 4.789447631835937, "step": 7200 }, { "epoch": 0.07575207280500586, "grad_norm": 1.1243209838867188, "learning_rate": 0.0002924258304191268, "loss": 4.771495971679688, "step": 7300 }, { "epoch": 0.0767897724324717, "grad_norm": 1.2010672092437744, "learning_rate": 0.00029232206045638027, "loss": 4.796032104492188, "step": 7400 }, { "epoch": 0.07782747205993754, "grad_norm": 1.3179821968078613, "learning_rate": 0.0002922182904936337, "loss": 4.949848022460937, "step": 7500 }, { "epoch": 0.07886517168740337, "grad_norm": 2.766585111618042, "learning_rate": 0.0002921145205308871, "loss": 4.7913055419921875, "step": 7600 }, { "epoch": 0.0799028713148692, "grad_norm": 1.301639437675476, "learning_rate": 0.0002920107505681405, "loss": 4.828057556152344, "step": 7700 }, { "epoch": 0.08094057094233503, "grad_norm": 1.205676794052124, "learning_rate": 0.0002919069806053939, "loss": 4.7562734985351565, "step": 7800 }, { "epoch": 0.08197827056980087, "grad_norm": 2.1412694454193115, "learning_rate": 0.00029180321064264735, "loss": 4.7240576171875, "step": 7900 }, { "epoch": 0.0830159701972667, "grad_norm": 
1.9297393560409546, "learning_rate": 0.0002916994406799008, "loss": 4.752750244140625, "step": 8000 }, { "epoch": 0.08405366982473253, "grad_norm": 1.5971039533615112, "learning_rate": 0.0002915956707171542, "loss": 4.7790225219726565, "step": 8100 }, { "epoch": 0.08509136945219836, "grad_norm": 1.4667614698410034, "learning_rate": 0.0002914919007544076, "loss": 4.823405151367187, "step": 8200 }, { "epoch": 0.0861290690796642, "grad_norm": 1.8018951416015625, "learning_rate": 0.000291388130791661, "loss": 4.806950378417969, "step": 8300 }, { "epoch": 0.08716676870713004, "grad_norm": 3.0917904376983643, "learning_rate": 0.00029128436082891443, "loss": 4.716513977050782, "step": 8400 }, { "epoch": 0.08820446833459587, "grad_norm": 1.8211461305618286, "learning_rate": 0.00029118059086616783, "loss": 4.803590393066406, "step": 8500 }, { "epoch": 0.0892421679620617, "grad_norm": 1.4940656423568726, "learning_rate": 0.0002910768209034213, "loss": 4.682643737792969, "step": 8600 }, { "epoch": 0.09027986758952754, "grad_norm": 1.432560682296753, "learning_rate": 0.00029097305094067473, "loss": 4.758638610839844, "step": 8700 }, { "epoch": 0.09131756721699337, "grad_norm": 1.0015602111816406, "learning_rate": 0.0002908692809779281, "loss": 4.829322204589844, "step": 8800 }, { "epoch": 0.0923552668444592, "grad_norm": 1.3050769567489624, "learning_rate": 0.0002907655110151815, "loss": 4.62219482421875, "step": 8900 }, { "epoch": 0.09339296647192503, "grad_norm": 1.0704928636550903, "learning_rate": 0.0002906617410524349, "loss": 4.6304998779296875, "step": 9000 }, { "epoch": 0.09443066609939087, "grad_norm": 2.2267684936523438, "learning_rate": 0.00029055797108968836, "loss": 4.664536437988281, "step": 9100 }, { "epoch": 0.0954683657268567, "grad_norm": 2.4608747959136963, "learning_rate": 0.00029045420112694176, "loss": 4.759125366210937, "step": 9200 }, { "epoch": 0.09650606535432253, "grad_norm": 1.5068875551223755, "learning_rate": 0.0002903504311641952, "loss": 
4.665271606445312, "step": 9300 }, { "epoch": 0.09754376498178838, "grad_norm": 2.078646421432495, "learning_rate": 0.0002902466612014486, "loss": 4.739638671875, "step": 9400 }, { "epoch": 0.09858146460925421, "grad_norm": 1.3762885332107544, "learning_rate": 0.000290142891238702, "loss": 4.698047485351562, "step": 9500 }, { "epoch": 0.09961916423672004, "grad_norm": 1.2879425287246704, "learning_rate": 0.00029003912127595544, "loss": 4.619927673339844, "step": 9600 }, { "epoch": 0.10065686386418587, "grad_norm": 1.584159016609192, "learning_rate": 0.00028993535131320884, "loss": 4.748394165039063, "step": 9700 }, { "epoch": 0.10169456349165171, "grad_norm": 1.453415870666504, "learning_rate": 0.0002898315813504623, "loss": 4.62876220703125, "step": 9800 }, { "epoch": 0.10273226311911754, "grad_norm": 0.965919017791748, "learning_rate": 0.0002897278113877157, "loss": 4.665562438964844, "step": 9900 }, { "epoch": 0.10376996274658337, "grad_norm": 1.2607330083847046, "learning_rate": 0.0002896240414249691, "loss": 4.7940805053710935, "step": 10000 }, { "epoch": 0.1048076623740492, "grad_norm": 1.0126069784164429, "learning_rate": 0.0002895202714622225, "loss": 4.7508541870117185, "step": 10100 }, { "epoch": 0.10584536200151504, "grad_norm": 1.541813850402832, "learning_rate": 0.0002894165014994759, "loss": 4.57702880859375, "step": 10200 }, { "epoch": 0.10688306162898087, "grad_norm": 2.78938889503479, "learning_rate": 0.00028931273153672937, "loss": 4.652121887207032, "step": 10300 }, { "epoch": 0.10792076125644672, "grad_norm": 2.3567938804626465, "learning_rate": 0.00028920896157398276, "loss": 4.566509094238281, "step": 10400 }, { "epoch": 0.10895846088391255, "grad_norm": 1.0480419397354126, "learning_rate": 0.0002891051916112362, "loss": 4.611513977050781, "step": 10500 }, { "epoch": 0.10999616051137838, "grad_norm": 1.577042579650879, "learning_rate": 0.0002890014216484896, "loss": 4.62977783203125, "step": 10600 }, { "epoch": 0.11103386013884421, 
"grad_norm": 1.5839786529541016, "learning_rate": 0.000288897651685743, "loss": 4.569055786132813, "step": 10700 }, { "epoch": 0.11207155976631004, "grad_norm": 3.9769680500030518, "learning_rate": 0.00028879388172299645, "loss": 4.6786282348632815, "step": 10800 }, { "epoch": 0.11310925939377588, "grad_norm": 1.8089715242385864, "learning_rate": 0.00028869011176024985, "loss": 4.630350036621094, "step": 10900 }, { "epoch": 0.11414695902124171, "grad_norm": 1.4216063022613525, "learning_rate": 0.0002885863417975033, "loss": 4.669395751953125, "step": 11000 }, { "epoch": 0.11518465864870754, "grad_norm": 1.2107151746749878, "learning_rate": 0.0002884825718347567, "loss": 4.612738342285156, "step": 11100 }, { "epoch": 0.11622235827617337, "grad_norm": 1.5037158727645874, "learning_rate": 0.0002883788018720101, "loss": 4.534631958007813, "step": 11200 }, { "epoch": 0.1172600579036392, "grad_norm": 1.1375142335891724, "learning_rate": 0.00028827503190926353, "loss": 4.803286437988281, "step": 11300 }, { "epoch": 0.11829775753110505, "grad_norm": 1.8553053140640259, "learning_rate": 0.00028817126194651693, "loss": 4.684965515136719, "step": 11400 }, { "epoch": 0.11933545715857088, "grad_norm": 5.896717071533203, "learning_rate": 0.0002880674919837704, "loss": 4.533707275390625, "step": 11500 }, { "epoch": 0.12037315678603672, "grad_norm": 0.9495351910591125, "learning_rate": 0.0002879637220210238, "loss": 4.481864013671875, "step": 11600 }, { "epoch": 0.12141085641350255, "grad_norm": 1.2148685455322266, "learning_rate": 0.00028785995205827717, "loss": 4.508511047363282, "step": 11700 }, { "epoch": 0.12244855604096838, "grad_norm": 1.2658835649490356, "learning_rate": 0.0002877561820955306, "loss": 4.453274841308594, "step": 11800 }, { "epoch": 0.12348625566843421, "grad_norm": 1.0808942317962646, "learning_rate": 0.000287652412132784, "loss": 4.470396118164063, "step": 11900 }, { "epoch": 0.12452395529590005, "grad_norm": 2.0280075073242188, "learning_rate": 
0.00028754864217003746, "loss": 4.629884643554687, "step": 12000 }, { "epoch": 0.12556165492336588, "grad_norm": 1.6987171173095703, "learning_rate": 0.00028744487220729086, "loss": 4.673434143066406, "step": 12100 }, { "epoch": 0.1265993545508317, "grad_norm": 1.076246976852417, "learning_rate": 0.00028734110224454425, "loss": 4.707933349609375, "step": 12200 }, { "epoch": 0.12763705417829754, "grad_norm": 1.4878133535385132, "learning_rate": 0.00028723733228179765, "loss": 4.649747924804688, "step": 12300 }, { "epoch": 0.12867475380576338, "grad_norm": 1.132073163986206, "learning_rate": 0.0002871335623190511, "loss": 4.510395812988281, "step": 12400 }, { "epoch": 0.1297124534332292, "grad_norm": 1.172968864440918, "learning_rate": 0.00028702979235630454, "loss": 4.7042324829101565, "step": 12500 }, { "epoch": 0.13075015306069504, "grad_norm": 1.331409215927124, "learning_rate": 0.00028692602239355794, "loss": 4.478284912109375, "step": 12600 }, { "epoch": 0.13178785268816087, "grad_norm": 0.9544440507888794, "learning_rate": 0.0002868222524308114, "loss": 4.574405517578125, "step": 12700 }, { "epoch": 0.1328255523156267, "grad_norm": 1.3560587167739868, "learning_rate": 0.0002867184824680648, "loss": 4.359691467285156, "step": 12800 }, { "epoch": 0.13386325194309256, "grad_norm": 1.4807325601577759, "learning_rate": 0.0002866147125053182, "loss": 4.541731872558594, "step": 12900 }, { "epoch": 0.1349009515705584, "grad_norm": 1.0621514320373535, "learning_rate": 0.00028651094254257157, "loss": 4.442927551269531, "step": 13000 }, { "epoch": 0.13593865119802423, "grad_norm": 0.9886642098426819, "learning_rate": 0.000286407172579825, "loss": 4.690697326660156, "step": 13100 }, { "epoch": 0.13697635082549006, "grad_norm": 1.9239803552627563, "learning_rate": 0.00028630340261707847, "loss": 4.497586669921875, "step": 13200 }, { "epoch": 0.1380140504529559, "grad_norm": 1.644500494003296, "learning_rate": 0.00028619963265433186, "loss": 4.598764038085937, "step": 13300 
}, { "epoch": 0.13905175008042173, "grad_norm": 1.3600581884384155, "learning_rate": 0.00028609586269158526, "loss": 4.550304260253906, "step": 13400 }, { "epoch": 0.14008944970788756, "grad_norm": 1.4329279661178589, "learning_rate": 0.00028599209272883865, "loss": 4.506571960449219, "step": 13500 }, { "epoch": 0.1411271493353534, "grad_norm": 1.386486291885376, "learning_rate": 0.0002858883227660921, "loss": 4.419360046386719, "step": 13600 }, { "epoch": 0.14216484896281922, "grad_norm": 0.9777548909187317, "learning_rate": 0.00028578455280334555, "loss": 4.371921691894531, "step": 13700 }, { "epoch": 0.14320254859028506, "grad_norm": 1.323614239692688, "learning_rate": 0.00028568078284059895, "loss": 4.449886474609375, "step": 13800 }, { "epoch": 0.1442402482177509, "grad_norm": 2.0104715824127197, "learning_rate": 0.00028557701287785234, "loss": 4.498194885253906, "step": 13900 }, { "epoch": 0.14527794784521672, "grad_norm": 1.040453314781189, "learning_rate": 0.00028547324291510574, "loss": 4.410159301757813, "step": 14000 }, { "epoch": 0.14631564747268255, "grad_norm": 1.6704965829849243, "learning_rate": 0.0002853694729523592, "loss": 4.4047763061523435, "step": 14100 }, { "epoch": 0.14735334710014839, "grad_norm": 1.1640102863311768, "learning_rate": 0.0002852657029896126, "loss": 4.482722778320312, "step": 14200 }, { "epoch": 0.14839104672761422, "grad_norm": 1.5910676717758179, "learning_rate": 0.00028516193302686603, "loss": 4.464485473632813, "step": 14300 }, { "epoch": 0.14942874635508005, "grad_norm": 2.349853277206421, "learning_rate": 0.0002850581630641194, "loss": 4.478161010742188, "step": 14400 }, { "epoch": 0.15046644598254588, "grad_norm": 1.6594980955123901, "learning_rate": 0.0002849543931013728, "loss": 4.524984741210938, "step": 14500 }, { "epoch": 0.15150414561001171, "grad_norm": 1.0867830514907837, "learning_rate": 0.00028485062313862627, "loss": 4.444278259277343, "step": 14600 }, { "epoch": 0.15254184523747755, "grad_norm": 
1.4026222229003906, "learning_rate": 0.00028474685317587966, "loss": 4.562846374511719, "step": 14700 }, { "epoch": 0.1535795448649434, "grad_norm": 1.7118810415267944, "learning_rate": 0.0002846430832131331, "loss": 4.434857177734375, "step": 14800 }, { "epoch": 0.15461724449240924, "grad_norm": 1.3377333879470825, "learning_rate": 0.0002845393132503865, "loss": 4.50284912109375, "step": 14900 }, { "epoch": 0.15565494411987507, "grad_norm": 1.0628588199615479, "learning_rate": 0.00028443554328763996, "loss": 4.467984924316406, "step": 15000 }, { "epoch": 0.1566926437473409, "grad_norm": 1.122900366783142, "learning_rate": 0.00028433177332489335, "loss": 4.477691650390625, "step": 15100 }, { "epoch": 0.15773034337480674, "grad_norm": 1.0721949338912964, "learning_rate": 0.00028422800336214675, "loss": 4.566653137207031, "step": 15200 }, { "epoch": 0.15876804300227257, "grad_norm": 2.0959179401397705, "learning_rate": 0.0002841242333994002, "loss": 4.459400939941406, "step": 15300 }, { "epoch": 0.1598057426297384, "grad_norm": 1.832321047782898, "learning_rate": 0.0002840204634366536, "loss": 4.441622009277344, "step": 15400 }, { "epoch": 0.16084344225720423, "grad_norm": 1.9756203889846802, "learning_rate": 0.00028391669347390704, "loss": 4.5193002319335935, "step": 15500 }, { "epoch": 0.16188114188467007, "grad_norm": 1.9734655618667603, "learning_rate": 0.00028381292351116043, "loss": 4.403963012695312, "step": 15600 }, { "epoch": 0.1629188415121359, "grad_norm": 1.0987114906311035, "learning_rate": 0.00028370915354841383, "loss": 4.3827951049804685, "step": 15700 }, { "epoch": 0.16395654113960173, "grad_norm": 1.0084813833236694, "learning_rate": 0.0002836053835856673, "loss": 4.431182861328125, "step": 15800 }, { "epoch": 0.16499424076706756, "grad_norm": 0.8771688342094421, "learning_rate": 0.00028350161362292067, "loss": 4.386305236816407, "step": 15900 }, { "epoch": 0.1660319403945334, "grad_norm": 1.960618495941162, "learning_rate": 0.0002833978436601741, 
"loss": 4.450301513671875, "step": 16000 }, { "epoch": 0.16706964002199923, "grad_norm": 2.016059398651123, "learning_rate": 0.0002832940736974275, "loss": 4.443774719238281, "step": 16100 }, { "epoch": 0.16810733964946506, "grad_norm": 2.1017072200775146, "learning_rate": 0.0002831903037346809, "loss": 4.387731323242187, "step": 16200 }, { "epoch": 0.1691450392769309, "grad_norm": 3.876704216003418, "learning_rate": 0.00028308653377193436, "loss": 4.339099731445312, "step": 16300 }, { "epoch": 0.17018273890439672, "grad_norm": 2.4443888664245605, "learning_rate": 0.00028298276380918776, "loss": 4.420601196289063, "step": 16400 }, { "epoch": 0.17122043853186256, "grad_norm": 2.2986700534820557, "learning_rate": 0.0002828789938464412, "loss": 4.574692687988281, "step": 16500 }, { "epoch": 0.1722581381593284, "grad_norm": 3.120959997177124, "learning_rate": 0.0002827752238836946, "loss": 4.3793856811523435, "step": 16600 }, { "epoch": 0.17329583778679422, "grad_norm": 3.928020715713501, "learning_rate": 0.00028267145392094805, "loss": 4.389268188476563, "step": 16700 }, { "epoch": 0.17433353741426008, "grad_norm": 1.5828691720962524, "learning_rate": 0.00028256768395820144, "loss": 4.353381652832031, "step": 16800 }, { "epoch": 0.1753712370417259, "grad_norm": 1.0565470457077026, "learning_rate": 0.00028246391399545484, "loss": 4.289037170410157, "step": 16900 }, { "epoch": 0.17640893666919175, "grad_norm": 1.7072774171829224, "learning_rate": 0.0002823601440327083, "loss": 4.325290832519531, "step": 17000 }, { "epoch": 0.17744663629665758, "grad_norm": 1.0402146577835083, "learning_rate": 0.0002822563740699617, "loss": 4.450514221191407, "step": 17100 }, { "epoch": 0.1784843359241234, "grad_norm": 1.4970057010650635, "learning_rate": 0.00028215260410721513, "loss": 4.393040161132813, "step": 17200 }, { "epoch": 0.17952203555158924, "grad_norm": 1.266546607017517, "learning_rate": 0.0002820488341444685, "loss": 4.276432800292969, "step": 17300 }, { "epoch": 
0.18055973517905508, "grad_norm": 1.751590371131897, "learning_rate": 0.0002819450641817219, "loss": 4.40036376953125, "step": 17400 }, { "epoch": 0.1815974348065209, "grad_norm": 1.5430057048797607, "learning_rate": 0.00028184129421897537, "loss": 4.279835205078125, "step": 17500 }, { "epoch": 0.18263513443398674, "grad_norm": 4.205715179443359, "learning_rate": 0.00028173752425622876, "loss": 4.501398315429688, "step": 17600 }, { "epoch": 0.18367283406145257, "grad_norm": 2.2290608882904053, "learning_rate": 0.0002816337542934822, "loss": 4.400292053222656, "step": 17700 }, { "epoch": 0.1847105336889184, "grad_norm": 1.6409145593643188, "learning_rate": 0.0002815299843307356, "loss": 4.361965026855469, "step": 17800 }, { "epoch": 0.18574823331638424, "grad_norm": 1.235737919807434, "learning_rate": 0.000281426214367989, "loss": 4.4263699340820315, "step": 17900 }, { "epoch": 0.18678593294385007, "grad_norm": 1.8182483911514282, "learning_rate": 0.0002813224444052424, "loss": 4.38103759765625, "step": 18000 }, { "epoch": 0.1878236325713159, "grad_norm": 1.725359559059143, "learning_rate": 0.00028121867444249585, "loss": 4.332106323242187, "step": 18100 }, { "epoch": 0.18886133219878173, "grad_norm": 1.9186443090438843, "learning_rate": 0.0002811149044797493, "loss": 4.354175415039062, "step": 18200 }, { "epoch": 0.18989903182624757, "grad_norm": 1.1907823085784912, "learning_rate": 0.0002810111345170027, "loss": 4.521398315429687, "step": 18300 }, { "epoch": 0.1909367314537134, "grad_norm": 2.796095609664917, "learning_rate": 0.0002809073645542561, "loss": 4.280415649414063, "step": 18400 }, { "epoch": 0.19197443108117923, "grad_norm": 2.043811798095703, "learning_rate": 0.0002808035945915095, "loss": 4.364379272460938, "step": 18500 }, { "epoch": 0.19301213070864506, "grad_norm": 6.419173240661621, "learning_rate": 0.00028069982462876293, "loss": 4.420321044921875, "step": 18600 }, { "epoch": 0.1940498303361109, "grad_norm": 2.0183868408203125, "learning_rate": 
0.0002805960546660163, "loss": 4.203153381347656, "step": 18700 }, { "epoch": 0.19508752996357676, "grad_norm": 1.1752562522888184, "learning_rate": 0.00028049228470326977, "loss": 4.362376098632812, "step": 18800 }, { "epoch": 0.1961252295910426, "grad_norm": 1.7152916193008423, "learning_rate": 0.0002803885147405232, "loss": 4.423097229003906, "step": 18900 }, { "epoch": 0.19716292921850842, "grad_norm": 0.8988032341003418, "learning_rate": 0.0002802847447777766, "loss": 4.291071166992188, "step": 19000 }, { "epoch": 0.19820062884597425, "grad_norm": 1.2874023914337158, "learning_rate": 0.00028018097481503, "loss": 4.257485046386718, "step": 19100 }, { "epoch": 0.19923832847344009, "grad_norm": 3.89581561088562, "learning_rate": 0.0002800772048522834, "loss": 4.355436401367188, "step": 19200 }, { "epoch": 0.20027602810090592, "grad_norm": 1.4264250993728638, "learning_rate": 0.00027997343488953686, "loss": 4.268387451171875, "step": 19300 }, { "epoch": 0.20131372772837175, "grad_norm": 2.3243231773376465, "learning_rate": 0.0002798696649267903, "loss": 4.248961791992188, "step": 19400 }, { "epoch": 0.20235142735583758, "grad_norm": 1.609995722770691, "learning_rate": 0.0002797658949640437, "loss": 4.299253845214844, "step": 19500 }, { "epoch": 0.20338912698330341, "grad_norm": 1.636496901512146, "learning_rate": 0.0002796621250012971, "loss": 4.379757690429687, "step": 19600 }, { "epoch": 0.20442682661076925, "grad_norm": 1.742827296257019, "learning_rate": 0.0002795583550385505, "loss": 4.298026733398437, "step": 19700 }, { "epoch": 0.20546452623823508, "grad_norm": 1.3360769748687744, "learning_rate": 0.00027945458507580394, "loss": 4.443134155273437, "step": 19800 }, { "epoch": 0.2065022258657009, "grad_norm": 1.5279536247253418, "learning_rate": 0.00027935081511305733, "loss": 4.3536380004882815, "step": 19900 }, { "epoch": 0.20753992549316674, "grad_norm": 1.2768709659576416, "learning_rate": 0.0002792470451503108, "loss": 4.420497741699219, "step": 20000 }, 
{ "epoch": 0.20857762512063258, "grad_norm": 1.1040194034576416, "learning_rate": 0.0002791432751875642, "loss": 4.308759155273438, "step": 20100 }, { "epoch": 0.2096153247480984, "grad_norm": 1.5710710287094116, "learning_rate": 0.00027903950522481757, "loss": 4.188085021972657, "step": 20200 }, { "epoch": 0.21065302437556424, "grad_norm": 0.9058725237846375, "learning_rate": 0.000278935735262071, "loss": 4.162925720214844, "step": 20300 }, { "epoch": 0.21169072400303007, "grad_norm": 2.4681508541107178, "learning_rate": 0.0002788319652993244, "loss": 4.207759704589844, "step": 20400 }, { "epoch": 0.2127284236304959, "grad_norm": 1.7522861957550049, "learning_rate": 0.00027872819533657786, "loss": 4.448352355957031, "step": 20500 }, { "epoch": 0.21376612325796174, "grad_norm": 1.8361260890960693, "learning_rate": 0.00027862442537383126, "loss": 4.27260986328125, "step": 20600 }, { "epoch": 0.2148038228854276, "grad_norm": 1.7720355987548828, "learning_rate": 0.0002785206554110847, "loss": 4.315809326171875, "step": 20700 }, { "epoch": 0.21584152251289343, "grad_norm": 2.2454731464385986, "learning_rate": 0.0002784168854483381, "loss": 4.421763916015625, "step": 20800 }, { "epoch": 0.21687922214035926, "grad_norm": 2.7393276691436768, "learning_rate": 0.0002783131154855915, "loss": 4.268560791015625, "step": 20900 }, { "epoch": 0.2179169217678251, "grad_norm": 1.8933848142623901, "learning_rate": 0.00027820934552284495, "loss": 4.316322937011718, "step": 21000 }, { "epoch": 0.21895462139529093, "grad_norm": 1.2294155359268188, "learning_rate": 0.00027810557556009834, "loss": 4.247787780761719, "step": 21100 }, { "epoch": 0.21999232102275676, "grad_norm": 1.5950024127960205, "learning_rate": 0.0002780018055973518, "loss": 4.292718811035156, "step": 21200 }, { "epoch": 0.2210300206502226, "grad_norm": 0.9710947275161743, "learning_rate": 0.0002778980356346052, "loss": 4.238976135253906, "step": 21300 }, { "epoch": 0.22206772027768842, "grad_norm": 1.3599995374679565, 
"learning_rate": 0.0002777942656718586, "loss": 4.441769409179687, "step": 21400 }, { "epoch": 0.22310541990515426, "grad_norm": 1.2248610258102417, "learning_rate": 0.00027769049570911203, "loss": 4.34153564453125, "step": 21500 }, { "epoch": 0.2241431195326201, "grad_norm": 1.07679283618927, "learning_rate": 0.0002775867257463654, "loss": 4.307798767089844, "step": 21600 }, { "epoch": 0.22518081916008592, "grad_norm": 2.6134791374206543, "learning_rate": 0.0002774829557836189, "loss": 4.170127868652344, "step": 21700 }, { "epoch": 0.22621851878755175, "grad_norm": 3.8844735622406006, "learning_rate": 0.00027737918582087227, "loss": 4.2596041870117185, "step": 21800 }, { "epoch": 0.22725621841501759, "grad_norm": 3.4798216819763184, "learning_rate": 0.00027727541585812566, "loss": 4.257220153808594, "step": 21900 }, { "epoch": 0.22829391804248342, "grad_norm": 1.0172936916351318, "learning_rate": 0.0002771716458953791, "loss": 4.342347717285156, "step": 22000 }, { "epoch": 0.22933161766994925, "grad_norm": 2.0007245540618896, "learning_rate": 0.0002770678759326325, "loss": 4.21951171875, "step": 22100 }, { "epoch": 0.23036931729741508, "grad_norm": 1.0652577877044678, "learning_rate": 0.00027696410596988596, "loss": 4.309334411621093, "step": 22200 }, { "epoch": 0.23140701692488092, "grad_norm": 1.0696879625320435, "learning_rate": 0.00027686033600713935, "loss": 4.333943481445313, "step": 22300 }, { "epoch": 0.23244471655234675, "grad_norm": 1.0693758726119995, "learning_rate": 0.00027675656604439275, "loss": 4.325413513183594, "step": 22400 }, { "epoch": 0.23348241617981258, "grad_norm": 1.3958321809768677, "learning_rate": 0.00027665279608164614, "loss": 4.1349484252929685, "step": 22500 }, { "epoch": 0.2345201158072784, "grad_norm": 1.732444167137146, "learning_rate": 0.0002765490261188996, "loss": 4.191957397460937, "step": 22600 }, { "epoch": 0.23555781543474427, "grad_norm": 1.329959750175476, "learning_rate": 0.00027644525615615304, "loss": 
4.440416870117187, "step": 22700 }, { "epoch": 0.2365955150622101, "grad_norm": 1.4088762998580933, "learning_rate": 0.00027634148619340643, "loss": 4.128535461425781, "step": 22800 }, { "epoch": 0.23763321468967594, "grad_norm": 1.167936086654663, "learning_rate": 0.0002762377162306599, "loss": 4.3338143920898435, "step": 22900 }, { "epoch": 0.23867091431714177, "grad_norm": 1.1570918560028076, "learning_rate": 0.0002761339462679133, "loss": 4.180432739257813, "step": 23000 }, { "epoch": 0.2397086139446076, "grad_norm": 1.2544199228286743, "learning_rate": 0.00027603017630516667, "loss": 4.1538671875, "step": 23100 }, { "epoch": 0.24074631357207343, "grad_norm": 1.844802975654602, "learning_rate": 0.0002759264063424201, "loss": 4.238400268554687, "step": 23200 }, { "epoch": 0.24178401319953927, "grad_norm": 2.407107353210449, "learning_rate": 0.0002758226363796735, "loss": 4.1402197265625, "step": 23300 }, { "epoch": 0.2428217128270051, "grad_norm": 1.7526997327804565, "learning_rate": 0.00027571886641692696, "loss": 4.253873901367188, "step": 23400 }, { "epoch": 0.24385941245447093, "grad_norm": 2.1768147945404053, "learning_rate": 0.00027561509645418036, "loss": 4.146066589355469, "step": 23500 }, { "epoch": 0.24489711208193676, "grad_norm": 1.0545059442520142, "learning_rate": 0.00027551132649143375, "loss": 4.199613037109375, "step": 23600 }, { "epoch": 0.2459348117094026, "grad_norm": 1.2132643461227417, "learning_rate": 0.00027540755652868715, "loss": 4.202657775878906, "step": 23700 }, { "epoch": 0.24697251133686843, "grad_norm": 2.1652746200561523, "learning_rate": 0.0002753037865659406, "loss": 4.301669311523438, "step": 23800 }, { "epoch": 0.24801021096433426, "grad_norm": 1.0687705278396606, "learning_rate": 0.00027520001660319405, "loss": 4.310574340820312, "step": 23900 }, { "epoch": 0.2490479105918001, "grad_norm": 2.6030638217926025, "learning_rate": 0.00027509624664044744, "loss": 4.220720825195312, "step": 24000 }, { "epoch": 0.25008561021926595, 
"grad_norm": 0.9720291495323181, "learning_rate": 0.00027499247667770084, "loss": 4.376803283691406, "step": 24100 }, { "epoch": 0.25112330984673176, "grad_norm": 1.398289680480957, "learning_rate": 0.00027488870671495423, "loss": 4.39901123046875, "step": 24200 }, { "epoch": 0.2521610094741976, "grad_norm": 2.2055957317352295, "learning_rate": 0.0002747849367522077, "loss": 4.196527709960938, "step": 24300 }, { "epoch": 0.2531987091016634, "grad_norm": 2.036271810531616, "learning_rate": 0.0002746811667894611, "loss": 4.274451599121094, "step": 24400 }, { "epoch": 0.2542364087291293, "grad_norm": 2.6011345386505127, "learning_rate": 0.0002745773968267145, "loss": 4.2699462890625, "step": 24500 }, { "epoch": 0.2552741083565951, "grad_norm": 1.9660414457321167, "learning_rate": 0.0002744736268639679, "loss": 4.2452325439453125, "step": 24600 }, { "epoch": 0.25631180798406095, "grad_norm": 1.2747102975845337, "learning_rate": 0.0002743698569012213, "loss": 4.348042907714844, "step": 24700 }, { "epoch": 0.25734950761152675, "grad_norm": 1.4823510646820068, "learning_rate": 0.00027426608693847476, "loss": 4.154461669921875, "step": 24800 }, { "epoch": 0.2583872072389926, "grad_norm": 1.6665210723876953, "learning_rate": 0.00027416231697572816, "loss": 4.136954956054687, "step": 24900 }, { "epoch": 0.2594249068664584, "grad_norm": 1.8465914726257324, "learning_rate": 0.0002740585470129816, "loss": 4.296747741699218, "step": 25000 }, { "epoch": 0.2604626064939243, "grad_norm": 1.0613303184509277, "learning_rate": 0.00027395477705023506, "loss": 4.209448547363281, "step": 25100 }, { "epoch": 0.2615003061213901, "grad_norm": 2.3083701133728027, "learning_rate": 0.00027385100708748845, "loss": 4.412258911132812, "step": 25200 }, { "epoch": 0.26253800574885594, "grad_norm": 1.8509588241577148, "learning_rate": 0.00027374723712474185, "loss": 4.171485595703125, "step": 25300 }, { "epoch": 0.26357570537632175, "grad_norm": 1.091736078262329, "learning_rate": 
0.00027364346716199524, "loss": 4.24049560546875, "step": 25400 }, { "epoch": 0.2646134050037876, "grad_norm": 1.201401710510254, "learning_rate": 0.0002735396971992487, "loss": 4.135834350585937, "step": 25500 }, { "epoch": 0.2656511046312534, "grad_norm": 1.5545823574066162, "learning_rate": 0.0002734359272365021, "loss": 4.291419677734375, "step": 25600 }, { "epoch": 0.26668880425871927, "grad_norm": 1.3560378551483154, "learning_rate": 0.00027333215727375553, "loss": 4.236996459960937, "step": 25700 }, { "epoch": 0.26772650388618513, "grad_norm": 1.0210782289505005, "learning_rate": 0.00027322838731100893, "loss": 4.249810791015625, "step": 25800 }, { "epoch": 0.26876420351365093, "grad_norm": 1.3093341588974, "learning_rate": 0.0002731246173482623, "loss": 4.195414428710937, "step": 25900 }, { "epoch": 0.2698019031411168, "grad_norm": 1.7895358800888062, "learning_rate": 0.00027302084738551577, "loss": 4.180751037597656, "step": 26000 }, { "epoch": 0.2708396027685826, "grad_norm": 11.451671600341797, "learning_rate": 0.00027291707742276917, "loss": 4.157826538085938, "step": 26100 }, { "epoch": 0.27187730239604846, "grad_norm": 1.9708665609359741, "learning_rate": 0.0002728133074600226, "loss": 4.128204956054687, "step": 26200 }, { "epoch": 0.27291500202351426, "grad_norm": 1.2628132104873657, "learning_rate": 0.000272709537497276, "loss": 4.281667785644531, "step": 26300 }, { "epoch": 0.2739527016509801, "grad_norm": 2.2199666500091553, "learning_rate": 0.0002726057675345294, "loss": 4.237691650390625, "step": 26400 }, { "epoch": 0.27499040127844593, "grad_norm": 2.815150022506714, "learning_rate": 0.00027250199757178285, "loss": 4.080834045410156, "step": 26500 }, { "epoch": 0.2760281009059118, "grad_norm": 1.7167062759399414, "learning_rate": 0.00027239822760903625, "loss": 4.224625549316406, "step": 26600 }, { "epoch": 0.2770658005333776, "grad_norm": 2.769949436187744, "learning_rate": 0.0002722944576462897, "loss": 4.3115145874023435, "step": 26700 }, { 
"epoch": 0.27810350016084345, "grad_norm": 1.3523616790771484, "learning_rate": 0.0002721906876835431, "loss": 4.356557006835938, "step": 26800 }, { "epoch": 0.27914119978830926, "grad_norm": 4.089077949523926, "learning_rate": 0.00027208691772079654, "loss": 4.286115112304688, "step": 26900 }, { "epoch": 0.2801788994157751, "grad_norm": 1.1650248765945435, "learning_rate": 0.00027198314775804994, "loss": 4.335249328613282, "step": 27000 }, { "epoch": 0.2812165990432409, "grad_norm": 1.8776350021362305, "learning_rate": 0.00027187937779530333, "loss": 4.274792175292969, "step": 27100 }, { "epoch": 0.2822542986707068, "grad_norm": 3.665797710418701, "learning_rate": 0.0002717756078325568, "loss": 4.347820739746094, "step": 27200 }, { "epoch": 0.2832919982981726, "grad_norm": 1.1905182600021362, "learning_rate": 0.0002716718378698102, "loss": 4.234444274902343, "step": 27300 }, { "epoch": 0.28432969792563845, "grad_norm": 1.2664549350738525, "learning_rate": 0.0002715680679070636, "loss": 4.19026123046875, "step": 27400 }, { "epoch": 0.28536739755310425, "grad_norm": 1.5952035188674927, "learning_rate": 0.000271464297944317, "loss": 4.284921569824219, "step": 27500 }, { "epoch": 0.2864050971805701, "grad_norm": 1.5898215770721436, "learning_rate": 0.0002713605279815704, "loss": 4.128340759277344, "step": 27600 }, { "epoch": 0.28744279680803597, "grad_norm": 1.701250433921814, "learning_rate": 0.00027125675801882386, "loss": 4.1064456176757815, "step": 27700 }, { "epoch": 0.2884804964355018, "grad_norm": 2.2521140575408936, "learning_rate": 0.00027115298805607726, "loss": 4.188478698730469, "step": 27800 }, { "epoch": 0.28951819606296764, "grad_norm": 1.428589105606079, "learning_rate": 0.0002710492180933307, "loss": 4.172950134277344, "step": 27900 }, { "epoch": 0.29055589569043344, "grad_norm": 1.5243910551071167, "learning_rate": 0.0002709454481305841, "loss": 4.251683044433594, "step": 28000 }, { "epoch": 0.2915935953178993, "grad_norm": 1.285276174545288, 
"learning_rate": 0.0002708416781678375, "loss": 4.291034851074219, "step": 28100 }, { "epoch": 0.2926312949453651, "grad_norm": 1.2959215641021729, "learning_rate": 0.0002707379082050909, "loss": 4.223204040527344, "step": 28200 }, { "epoch": 0.29366899457283097, "grad_norm": 1.9572069644927979, "learning_rate": 0.00027063413824234434, "loss": 4.1140069580078125, "step": 28300 }, { "epoch": 0.29470669420029677, "grad_norm": 2.5625929832458496, "learning_rate": 0.0002705303682795978, "loss": 4.2418734741210935, "step": 28400 }, { "epoch": 0.29574439382776263, "grad_norm": 1.657065510749817, "learning_rate": 0.0002704265983168512, "loss": 4.2059628295898435, "step": 28500 }, { "epoch": 0.29678209345522844, "grad_norm": 1.4735133647918701, "learning_rate": 0.0002703228283541046, "loss": 4.232904663085938, "step": 28600 }, { "epoch": 0.2978197930826943, "grad_norm": 2.643979549407959, "learning_rate": 0.000270219058391358, "loss": 4.151640930175781, "step": 28700 }, { "epoch": 0.2988574927101601, "grad_norm": 1.5147004127502441, "learning_rate": 0.0002701152884286114, "loss": 4.171849060058594, "step": 28800 }, { "epoch": 0.29989519233762596, "grad_norm": 1.4815659523010254, "learning_rate": 0.00027001151846586487, "loss": 4.120007019042969, "step": 28900 }, { "epoch": 0.30093289196509176, "grad_norm": 3.8772029876708984, "learning_rate": 0.00026990774850311827, "loss": 4.113840637207031, "step": 29000 }, { "epoch": 0.3019705915925576, "grad_norm": 1.8152740001678467, "learning_rate": 0.0002698039785403717, "loss": 4.143219604492187, "step": 29100 }, { "epoch": 0.30300829122002343, "grad_norm": 1.3441669940948486, "learning_rate": 0.0002697002085776251, "loss": 4.151035461425781, "step": 29200 }, { "epoch": 0.3040459908474893, "grad_norm": 2.0656609535217285, "learning_rate": 0.0002695964386148785, "loss": 4.229763793945312, "step": 29300 }, { "epoch": 0.3050836904749551, "grad_norm": 2.8376095294952393, "learning_rate": 0.0002694926686521319, "loss": 
4.2303158569335935, "step": 29400 }, { "epoch": 0.30612139010242095, "grad_norm": 1.9161107540130615, "learning_rate": 0.00026938889868938535, "loss": 4.252763061523438, "step": 29500 }, { "epoch": 0.3071590897298868, "grad_norm": 2.1317851543426514, "learning_rate": 0.0002692851287266388, "loss": 4.12993408203125, "step": 29600 }, { "epoch": 0.3081967893573526, "grad_norm": 2.9762330055236816, "learning_rate": 0.0002691813587638922, "loss": 4.347277221679687, "step": 29700 }, { "epoch": 0.3092344889848185, "grad_norm": 2.135929584503174, "learning_rate": 0.0002690775888011456, "loss": 4.052276611328125, "step": 29800 }, { "epoch": 0.3102721886122843, "grad_norm": 1.3577543497085571, "learning_rate": 0.000268973818838399, "loss": 4.199589233398438, "step": 29900 }, { "epoch": 0.31130988823975014, "grad_norm": 1.2834597826004028, "learning_rate": 0.00026887004887565243, "loss": 4.134565734863282, "step": 30000 }, { "epoch": 0.31234758786721595, "grad_norm": 2.093669891357422, "learning_rate": 0.00026876627891290583, "loss": 4.183307495117187, "step": 30100 }, { "epoch": 0.3133852874946818, "grad_norm": 1.1888537406921387, "learning_rate": 0.0002686625089501593, "loss": 4.022268371582031, "step": 30200 }, { "epoch": 0.3144229871221476, "grad_norm": 1.4640058279037476, "learning_rate": 0.00026855873898741267, "loss": 4.191292724609375, "step": 30300 }, { "epoch": 0.3154606867496135, "grad_norm": 0.9469636678695679, "learning_rate": 0.00026845496902466607, "loss": 4.2131259155273435, "step": 30400 }, { "epoch": 0.3164983863770793, "grad_norm": 1.5227535963058472, "learning_rate": 0.0002683511990619195, "loss": 4.24783935546875, "step": 30500 }, { "epoch": 0.31753608600454514, "grad_norm": 2.524731159210205, "learning_rate": 0.0002682474290991729, "loss": 4.206085205078125, "step": 30600 }, { "epoch": 0.31857378563201094, "grad_norm": 2.7074637413024902, "learning_rate": 0.00026814365913642636, "loss": 3.964501953125, "step": 30700 }, { "epoch": 0.3196114852594768, 
"grad_norm": 2.1479899883270264, "learning_rate": 0.00026803988917367975, "loss": 4.121002197265625, "step": 30800 }, { "epoch": 0.3206491848869426, "grad_norm": 3.6871800422668457, "learning_rate": 0.00026793611921093315, "loss": 4.290604858398438, "step": 30900 }, { "epoch": 0.32168688451440847, "grad_norm": 2.0092685222625732, "learning_rate": 0.0002678323492481866, "loss": 4.169475708007813, "step": 31000 }, { "epoch": 0.32272458414187427, "grad_norm": 1.3000237941741943, "learning_rate": 0.00026772857928544, "loss": 4.096010131835937, "step": 31100 }, { "epoch": 0.32376228376934013, "grad_norm": 2.161574125289917, "learning_rate": 0.00026762480932269344, "loss": 4.249325561523437, "step": 31200 }, { "epoch": 0.32479998339680594, "grad_norm": 1.0579701662063599, "learning_rate": 0.00026752103935994684, "loss": 4.270779724121094, "step": 31300 }, { "epoch": 0.3258376830242718, "grad_norm": 1.2264137268066406, "learning_rate": 0.0002674172693972003, "loss": 4.2367620849609375, "step": 31400 }, { "epoch": 0.3268753826517376, "grad_norm": 3.2623612880706787, "learning_rate": 0.0002673134994344537, "loss": 4.160564575195313, "step": 31500 }, { "epoch": 0.32791308227920346, "grad_norm": 2.1803345680236816, "learning_rate": 0.0002672097294717071, "loss": 4.203829040527344, "step": 31600 }, { "epoch": 0.3289507819066693, "grad_norm": 1.9515228271484375, "learning_rate": 0.0002671059595089605, "loss": 4.315436706542969, "step": 31700 }, { "epoch": 0.3299884815341351, "grad_norm": 3.0683810710906982, "learning_rate": 0.0002670021895462139, "loss": 4.155743103027344, "step": 31800 }, { "epoch": 0.331026181161601, "grad_norm": 2.6642050743103027, "learning_rate": 0.00026689841958346737, "loss": 4.222473754882812, "step": 31900 }, { "epoch": 0.3320638807890668, "grad_norm": 1.8333579301834106, "learning_rate": 0.00026679464962072076, "loss": 4.210680541992187, "step": 32000 }, { "epoch": 0.33310158041653265, "grad_norm": 2.136242151260376, "learning_rate": 
0.00026669087965797416, "loss": 4.144779357910156, "step": 32100 }, { "epoch": 0.33413928004399845, "grad_norm": 0.9694802165031433, "learning_rate": 0.0002665871096952276, "loss": 4.1770632934570315, "step": 32200 }, { "epoch": 0.3351769796714643, "grad_norm": 2.070678949356079, "learning_rate": 0.000266483339732481, "loss": 4.140425415039062, "step": 32300 }, { "epoch": 0.3362146792989301, "grad_norm": 1.3420311212539673, "learning_rate": 0.00026637956976973445, "loss": 4.117745361328125, "step": 32400 }, { "epoch": 0.337252378926396, "grad_norm": 1.7498325109481812, "learning_rate": 0.00026627579980698785, "loss": 4.090622253417969, "step": 32500 }, { "epoch": 0.3382900785538618, "grad_norm": 5.7661848068237305, "learning_rate": 0.00026617202984424124, "loss": 4.212898864746093, "step": 32600 }, { "epoch": 0.33932777818132764, "grad_norm": 1.856246829032898, "learning_rate": 0.0002660682598814947, "loss": 4.20351806640625, "step": 32700 }, { "epoch": 0.34036547780879345, "grad_norm": 5.002403259277344, "learning_rate": 0.0002659644899187481, "loss": 4.095009765625, "step": 32800 }, { "epoch": 0.3414031774362593, "grad_norm": 1.0896239280700684, "learning_rate": 0.00026586071995600153, "loss": 4.061651000976562, "step": 32900 }, { "epoch": 0.3424408770637251, "grad_norm": 1.4536166191101074, "learning_rate": 0.00026575694999325493, "loss": 4.05192626953125, "step": 33000 }, { "epoch": 0.343478576691191, "grad_norm": 3.966247081756592, "learning_rate": 0.0002656531800305084, "loss": 4.122060241699219, "step": 33100 }, { "epoch": 0.3445162763186568, "grad_norm": 2.3092470169067383, "learning_rate": 0.00026554941006776177, "loss": 4.171341247558594, "step": 33200 }, { "epoch": 0.34555397594612264, "grad_norm": 1.6187312602996826, "learning_rate": 0.00026544564010501517, "loss": 4.147681579589844, "step": 33300 }, { "epoch": 0.34659167557358844, "grad_norm": 1.4459052085876465, "learning_rate": 0.0002653418701422686, "loss": 4.12395751953125, "step": 33400 }, { 
"epoch": 0.3476293752010543, "grad_norm": 1.6370753049850464, "learning_rate": 0.000265238100179522, "loss": 4.043997192382813, "step": 33500 }, { "epoch": 0.34866707482852016, "grad_norm": 2.5965089797973633, "learning_rate": 0.00026513433021677546, "loss": 4.149281311035156, "step": 33600 }, { "epoch": 0.34970477445598597, "grad_norm": 1.4466602802276611, "learning_rate": 0.00026503056025402885, "loss": 4.153418884277344, "step": 33700 }, { "epoch": 0.3507424740834518, "grad_norm": 1.1217280626296997, "learning_rate": 0.00026492679029128225, "loss": 4.173803405761719, "step": 33800 }, { "epoch": 0.35178017371091763, "grad_norm": 2.853686809539795, "learning_rate": 0.00026482302032853564, "loss": 4.1212451171875, "step": 33900 }, { "epoch": 0.3528178733383835, "grad_norm": 1.1508560180664062, "learning_rate": 0.0002647192503657891, "loss": 4.179091186523437, "step": 34000 }, { "epoch": 0.3538555729658493, "grad_norm": 1.8668493032455444, "learning_rate": 0.00026461548040304254, "loss": 4.142960205078125, "step": 34100 }, { "epoch": 0.35489327259331516, "grad_norm": 1.7272940874099731, "learning_rate": 0.00026451171044029594, "loss": 4.127975769042969, "step": 34200 }, { "epoch": 0.35593097222078096, "grad_norm": 1.5529290437698364, "learning_rate": 0.00026440794047754933, "loss": 4.2190853881835935, "step": 34300 }, { "epoch": 0.3569686718482468, "grad_norm": 1.506499171257019, "learning_rate": 0.0002643041705148027, "loss": 4.168932800292969, "step": 34400 }, { "epoch": 0.3580063714757126, "grad_norm": 1.2258543968200684, "learning_rate": 0.0002642004005520562, "loss": 4.065081176757812, "step": 34500 }, { "epoch": 0.3590440711031785, "grad_norm": 1.4408226013183594, "learning_rate": 0.0002640966305893096, "loss": 4.102992858886719, "step": 34600 }, { "epoch": 0.3600817707306443, "grad_norm": 2.467862844467163, "learning_rate": 0.000263992860626563, "loss": 4.061658935546875, "step": 34700 }, { "epoch": 0.36111947035811015, "grad_norm": 1.3214993476867676, 
"learning_rate": 0.0002638890906638164, "loss": 4.133033752441406, "step": 34800 }, { "epoch": 0.36215716998557596, "grad_norm": 1.2223659753799438, "learning_rate": 0.0002637853207010698, "loss": 4.0944091796875, "step": 34900 }, { "epoch": 0.3631948696130418, "grad_norm": 1.5864417552947998, "learning_rate": 0.00026368155073832326, "loss": 4.031897277832031, "step": 35000 }, { "epoch": 0.3642325692405076, "grad_norm": 3.021804094314575, "learning_rate": 0.00026357778077557665, "loss": 4.253480224609375, "step": 35100 }, { "epoch": 0.3652702688679735, "grad_norm": 2.419196844100952, "learning_rate": 0.0002634740108128301, "loss": 4.060654602050781, "step": 35200 }, { "epoch": 0.3663079684954393, "grad_norm": 3.106058359146118, "learning_rate": 0.00026337024085008355, "loss": 4.167652282714844, "step": 35300 }, { "epoch": 0.36734566812290514, "grad_norm": 2.6082842350006104, "learning_rate": 0.00026326647088733695, "loss": 4.126443481445312, "step": 35400 }, { "epoch": 0.368383367750371, "grad_norm": 3.2292778491973877, "learning_rate": 0.00026316270092459034, "loss": 4.16947509765625, "step": 35500 }, { "epoch": 0.3694210673778368, "grad_norm": 3.438127279281616, "learning_rate": 0.00026305893096184374, "loss": 4.18126220703125, "step": 35600 }, { "epoch": 0.37045876700530267, "grad_norm": 1.1258721351623535, "learning_rate": 0.0002629551609990972, "loss": 4.133269348144531, "step": 35700 }, { "epoch": 0.3714964666327685, "grad_norm": 2.0176923274993896, "learning_rate": 0.00026285139103635063, "loss": 4.000823059082031, "step": 35800 }, { "epoch": 0.37253416626023433, "grad_norm": 2.162721872329712, "learning_rate": 0.00026274762107360403, "loss": 4.158842163085938, "step": 35900 }, { "epoch": 0.37357186588770014, "grad_norm": 1.3159765005111694, "learning_rate": 0.0002626438511108574, "loss": 4.156724853515625, "step": 36000 }, { "epoch": 0.374609565515166, "grad_norm": 1.8504067659378052, "learning_rate": 0.0002625400811481108, "loss": 4.074109191894531, 
"step": 36100 }, { "epoch": 0.3756472651426318, "grad_norm": 1.3491618633270264, "learning_rate": 0.00026243631118536427, "loss": 4.117833557128907, "step": 36200 }, { "epoch": 0.37668496477009766, "grad_norm": 1.1090528964996338, "learning_rate": 0.00026233254122261766, "loss": 4.0473480224609375, "step": 36300 }, { "epoch": 0.37772266439756347, "grad_norm": 4.539895057678223, "learning_rate": 0.0002622287712598711, "loss": 4.0527517700195315, "step": 36400 }, { "epoch": 0.37876036402502933, "grad_norm": 1.792636513710022, "learning_rate": 0.0002621250012971245, "loss": 3.9459353637695314, "step": 36500 }, { "epoch": 0.37979806365249513, "grad_norm": 2.4098236560821533, "learning_rate": 0.0002620212313343779, "loss": 4.144781494140625, "step": 36600 }, { "epoch": 0.380835763279961, "grad_norm": 1.8648608922958374, "learning_rate": 0.00026191746137163135, "loss": 4.104746704101562, "step": 36700 }, { "epoch": 0.3818734629074268, "grad_norm": 2.071338653564453, "learning_rate": 0.00026181369140888474, "loss": 4.074734191894532, "step": 36800 }, { "epoch": 0.38291116253489266, "grad_norm": 1.3856308460235596, "learning_rate": 0.0002617099214461382, "loss": 4.158983154296875, "step": 36900 }, { "epoch": 0.38394886216235846, "grad_norm": 2.072495698928833, "learning_rate": 0.0002616061514833916, "loss": 4.08581787109375, "step": 37000 }, { "epoch": 0.3849865617898243, "grad_norm": 1.3703645467758179, "learning_rate": 0.00026150238152064504, "loss": 4.006895446777344, "step": 37100 }, { "epoch": 0.3860242614172901, "grad_norm": 2.7975013256073, "learning_rate": 0.00026139861155789843, "loss": 4.147843627929688, "step": 37200 }, { "epoch": 0.387061961044756, "grad_norm": 3.56386661529541, "learning_rate": 0.0002612948415951518, "loss": 4.2793121337890625, "step": 37300 }, { "epoch": 0.3880996606722218, "grad_norm": 2.8237593173980713, "learning_rate": 0.0002611910716324053, "loss": 4.13215087890625, "step": 37400 }, { "epoch": 0.38913736029968765, "grad_norm": 
1.2382421493530273, "learning_rate": 0.00026108730166965867, "loss": 4.071390991210937, "step": 37500 }, { "epoch": 0.3901750599271535, "grad_norm": 1.620809555053711, "learning_rate": 0.0002609835317069121, "loss": 4.081386108398437, "step": 37600 }, { "epoch": 0.3912127595546193, "grad_norm": 1.5530173778533936, "learning_rate": 0.0002608797617441655, "loss": 4.095469970703125, "step": 37700 }, { "epoch": 0.3922504591820852, "grad_norm": 2.7742369174957275, "learning_rate": 0.0002607759917814189, "loss": 4.096160888671875, "step": 37800 }, { "epoch": 0.393288158809551, "grad_norm": 1.0493942499160767, "learning_rate": 0.00026067222181867236, "loss": 4.000921936035156, "step": 37900 }, { "epoch": 0.39432585843701684, "grad_norm": 4.1348958015441895, "learning_rate": 0.00026056845185592575, "loss": 3.991048583984375, "step": 38000 }, { "epoch": 0.39536355806448265, "grad_norm": 4.481339454650879, "learning_rate": 0.0002604646818931792, "loss": 4.004680786132813, "step": 38100 }, { "epoch": 0.3964012576919485, "grad_norm": 1.5849348306655884, "learning_rate": 0.0002603609119304326, "loss": 4.127825317382812, "step": 38200 }, { "epoch": 0.3974389573194143, "grad_norm": 1.5340007543563843, "learning_rate": 0.000260257141967686, "loss": 4.126565551757812, "step": 38300 }, { "epoch": 0.39847665694688017, "grad_norm": 1.9388331174850464, "learning_rate": 0.00026015337200493944, "loss": 4.147232666015625, "step": 38400 }, { "epoch": 0.399514356574346, "grad_norm": 1.4936273097991943, "learning_rate": 0.00026004960204219284, "loss": 4.046693115234375, "step": 38500 }, { "epoch": 0.40055205620181183, "grad_norm": 1.4128496646881104, "learning_rate": 0.0002599458320794463, "loss": 4.027592468261719, "step": 38600 }, { "epoch": 0.40158975582927764, "grad_norm": 1.2070266008377075, "learning_rate": 0.0002598420621166997, "loss": 3.9974462890625, "step": 38700 }, { "epoch": 0.4026274554567435, "grad_norm": 1.0721571445465088, "learning_rate": 0.0002597382921539531, "loss": 
4.048193054199219, "step": 38800 }, { "epoch": 0.4036651550842093, "grad_norm": 4.593639373779297, "learning_rate": 0.00025963452219120647, "loss": 3.9815548706054686, "step": 38900 }, { "epoch": 0.40470285471167516, "grad_norm": 2.84889817237854, "learning_rate": 0.0002595307522284599, "loss": 4.118370666503906, "step": 39000 }, { "epoch": 0.40574055433914097, "grad_norm": 1.6757389307022095, "learning_rate": 0.00025942698226571337, "loss": 4.095942077636718, "step": 39100 }, { "epoch": 0.40677825396660683, "grad_norm": 3.5596885681152344, "learning_rate": 0.00025932321230296676, "loss": 4.0965576171875, "step": 39200 }, { "epoch": 0.40781595359407263, "grad_norm": 1.0558372735977173, "learning_rate": 0.0002592194423402202, "loss": 4.239440307617188, "step": 39300 }, { "epoch": 0.4088536532215385, "grad_norm": 5.334078311920166, "learning_rate": 0.0002591156723774736, "loss": 4.089285888671875, "step": 39400 }, { "epoch": 0.40989135284900435, "grad_norm": 2.4086287021636963, "learning_rate": 0.000259011902414727, "loss": 4.103414611816406, "step": 39500 }, { "epoch": 0.41092905247647016, "grad_norm": 4.432836055755615, "learning_rate": 0.00025890813245198045, "loss": 4.0577630615234375, "step": 39600 }, { "epoch": 0.411966752103936, "grad_norm": 1.3129891157150269, "learning_rate": 0.00025880436248923384, "loss": 4.128912353515625, "step": 39700 }, { "epoch": 0.4130044517314018, "grad_norm": 2.148174524307251, "learning_rate": 0.0002587005925264873, "loss": 4.197516174316406, "step": 39800 }, { "epoch": 0.4140421513588677, "grad_norm": 6.447707176208496, "learning_rate": 0.0002585968225637407, "loss": 4.087812805175782, "step": 39900 }, { "epoch": 0.4150798509863335, "grad_norm": 2.721989393234253, "learning_rate": 0.0002584930526009941, "loss": 3.9460833740234373, "step": 40000 }, { "epoch": 0.41611755061379935, "grad_norm": 1.543135166168213, "learning_rate": 0.0002583892826382475, "loss": 4.02151611328125, "step": 40100 }, { "epoch": 0.41715525024126515, 
"grad_norm": 1.4670268297195435, "learning_rate": 0.0002582855126755009, "loss": 4.18778564453125, "step": 40200 }, { "epoch": 0.418192949868731, "grad_norm": 3.8556268215179443, "learning_rate": 0.0002581817427127544, "loss": 3.996910400390625, "step": 40300 }, { "epoch": 0.4192306494961968, "grad_norm": 1.702594518661499, "learning_rate": 0.00025807797275000777, "loss": 4.031709594726562, "step": 40400 }, { "epoch": 0.4202683491236627, "grad_norm": 1.2531317472457886, "learning_rate": 0.00025797420278726117, "loss": 4.188993835449219, "step": 40500 }, { "epoch": 0.4213060487511285, "grad_norm": 2.5484142303466797, "learning_rate": 0.00025787043282451456, "loss": 4.031621398925782, "step": 40600 }, { "epoch": 0.42234374837859434, "grad_norm": 1.823457956314087, "learning_rate": 0.000257766662861768, "loss": 4.001983032226563, "step": 40700 }, { "epoch": 0.42338144800606015, "grad_norm": 1.9530704021453857, "learning_rate": 0.0002576628928990214, "loss": 4.030978088378906, "step": 40800 }, { "epoch": 0.424419147633526, "grad_norm": 4.55501127243042, "learning_rate": 0.00025755912293627485, "loss": 4.062133178710938, "step": 40900 }, { "epoch": 0.4254568472609918, "grad_norm": 1.9799492359161377, "learning_rate": 0.00025745535297352825, "loss": 3.9875259399414062, "step": 41000 }, { "epoch": 0.42649454688845767, "grad_norm": 2.4329614639282227, "learning_rate": 0.00025735158301078164, "loss": 3.9634893798828124, "step": 41100 }, { "epoch": 0.4275322465159235, "grad_norm": 1.3791182041168213, "learning_rate": 0.0002572478130480351, "loss": 4.171094055175781, "step": 41200 }, { "epoch": 0.42856994614338934, "grad_norm": 1.4852691888809204, "learning_rate": 0.0002571440430852885, "loss": 4.059336547851562, "step": 41300 }, { "epoch": 0.4296076457708552, "grad_norm": 2.191392183303833, "learning_rate": 0.00025704027312254194, "loss": 3.9574560546875, "step": 41400 }, { "epoch": 0.430645345398321, "grad_norm": 3.4423017501831055, "learning_rate": 0.0002569365031597954, 
"loss": 3.990745849609375, "step": 41500 }, { "epoch": 0.43168304502578686, "grad_norm": 2.979930877685547, "learning_rate": 0.0002568327331970488, "loss": 4.166605529785156, "step": 41600 }, { "epoch": 0.43272074465325266, "grad_norm": 3.131230354309082, "learning_rate": 0.0002567289632343022, "loss": 4.026178894042968, "step": 41700 }, { "epoch": 0.4337584442807185, "grad_norm": 1.578643798828125, "learning_rate": 0.00025662519327155557, "loss": 4.10739990234375, "step": 41800 }, { "epoch": 0.43479614390818433, "grad_norm": 3.628096580505371, "learning_rate": 0.000256521423308809, "loss": 4.021985473632813, "step": 41900 }, { "epoch": 0.4358338435356502, "grad_norm": 2.235994815826416, "learning_rate": 0.0002564176533460624, "loss": 4.138570251464844, "step": 42000 }, { "epoch": 0.436871543163116, "grad_norm": 3.0459887981414795, "learning_rate": 0.00025631388338331586, "loss": 4.139791564941406, "step": 42100 }, { "epoch": 0.43790924279058185, "grad_norm": 1.0590101480484009, "learning_rate": 0.00025621011342056926, "loss": 4.018776550292968, "step": 42200 }, { "epoch": 0.43894694241804766, "grad_norm": 3.5735878944396973, "learning_rate": 0.00025610634345782265, "loss": 4.182121887207031, "step": 42300 }, { "epoch": 0.4399846420455135, "grad_norm": 1.1051421165466309, "learning_rate": 0.0002560025734950761, "loss": 4.086949157714844, "step": 42400 }, { "epoch": 0.4410223416729793, "grad_norm": 2.8680758476257324, "learning_rate": 0.0002558988035323295, "loss": 4.053037414550781, "step": 42500 }, { "epoch": 0.4420600413004452, "grad_norm": 1.6805782318115234, "learning_rate": 0.00025579503356958294, "loss": 4.041470947265625, "step": 42600 }, { "epoch": 0.443097740927911, "grad_norm": 1.7229841947555542, "learning_rate": 0.00025569126360683634, "loss": 4.1356103515625, "step": 42700 }, { "epoch": 0.44413544055537685, "grad_norm": 1.4601655006408691, "learning_rate": 0.00025558749364408973, "loss": 4.052696533203125, "step": 42800 }, { "epoch": 
0.44517314018284265, "grad_norm": 1.552959680557251, "learning_rate": 0.0002554837236813432, "loss": 4.020947875976563, "step": 42900 }, { "epoch": 0.4462108398103085, "grad_norm": 1.3446309566497803, "learning_rate": 0.0002553799537185966, "loss": 4.150856018066406, "step": 43000 }, { "epoch": 0.4472485394377743, "grad_norm": 3.128110408782959, "learning_rate": 0.00025527618375585003, "loss": 4.118401794433594, "step": 43100 }, { "epoch": 0.4482862390652402, "grad_norm": 1.328148603439331, "learning_rate": 0.0002551724137931034, "loss": 4.073428649902343, "step": 43200 }, { "epoch": 0.449323938692706, "grad_norm": 1.5910078287124634, "learning_rate": 0.00025506864383035687, "loss": 4.110806579589844, "step": 43300 }, { "epoch": 0.45036163832017184, "grad_norm": 1.2686039209365845, "learning_rate": 0.00025496487386761027, "loss": 4.007551574707032, "step": 43400 }, { "epoch": 0.4513993379476377, "grad_norm": 4.290769577026367, "learning_rate": 0.00025486110390486366, "loss": 4.068913269042969, "step": 43500 }, { "epoch": 0.4524370375751035, "grad_norm": 1.6915346384048462, "learning_rate": 0.0002547573339421171, "loss": 4.066489562988282, "step": 43600 }, { "epoch": 0.45347473720256937, "grad_norm": 1.3425647020339966, "learning_rate": 0.0002546535639793705, "loss": 4.024351806640625, "step": 43700 }, { "epoch": 0.45451243683003517, "grad_norm": 4.726262092590332, "learning_rate": 0.00025454979401662395, "loss": 4.055924987792968, "step": 43800 }, { "epoch": 0.45555013645750103, "grad_norm": 1.3767929077148438, "learning_rate": 0.00025444602405387735, "loss": 4.1108706665039065, "step": 43900 }, { "epoch": 0.45658783608496684, "grad_norm": 2.199096918106079, "learning_rate": 0.00025434225409113074, "loss": 4.032781982421875, "step": 44000 }, { "epoch": 0.4576255357124327, "grad_norm": 1.529963731765747, "learning_rate": 0.0002542384841283842, "loss": 3.9078250122070313, "step": 44100 }, { "epoch": 0.4586632353398985, "grad_norm": 2.381452798843384, "learning_rate": 
0.0002541347141656376, "loss": 4.1637747192382815, "step": 44200 }, { "epoch": 0.45970093496736436, "grad_norm": 1.3512217998504639, "learning_rate": 0.00025403094420289104, "loss": 4.1603765869140625, "step": 44300 }, { "epoch": 0.46073863459483017, "grad_norm": 1.6877330541610718, "learning_rate": 0.00025392717424014443, "loss": 3.9833114624023436, "step": 44400 }, { "epoch": 0.461776334222296, "grad_norm": 10.19050121307373, "learning_rate": 0.0002538234042773978, "loss": 4.087564086914062, "step": 44500 }, { "epoch": 0.46281403384976183, "grad_norm": 2.2430684566497803, "learning_rate": 0.0002537196343146512, "loss": 3.943908386230469, "step": 44600 }, { "epoch": 0.4638517334772277, "grad_norm": 1.8005903959274292, "learning_rate": 0.00025361586435190467, "loss": 4.026759948730469, "step": 44700 }, { "epoch": 0.4648894331046935, "grad_norm": 1.3022342920303345, "learning_rate": 0.0002535120943891581, "loss": 4.106507263183594, "step": 44800 }, { "epoch": 0.46592713273215935, "grad_norm": 1.1729425191879272, "learning_rate": 0.0002534083244264115, "loss": 4.0660693359375, "step": 44900 }, { "epoch": 0.46696483235962516, "grad_norm": 1.7224327325820923, "learning_rate": 0.0002533045544636649, "loss": 3.9855413818359375, "step": 45000 }, { "epoch": 0.468002531987091, "grad_norm": 1.6977527141571045, "learning_rate": 0.0002532007845009183, "loss": 3.813612976074219, "step": 45100 }, { "epoch": 0.4690402316145568, "grad_norm": 2.9529614448547363, "learning_rate": 0.00025309701453817175, "loss": 3.995145263671875, "step": 45200 }, { "epoch": 0.4700779312420227, "grad_norm": 3.1997270584106445, "learning_rate": 0.0002529932445754252, "loss": 4.031595153808594, "step": 45300 }, { "epoch": 0.47111563086948854, "grad_norm": 5.878026008605957, "learning_rate": 0.0002528894746126786, "loss": 4.028975524902344, "step": 45400 }, { "epoch": 0.47215333049695435, "grad_norm": 1.7146035432815552, "learning_rate": 0.00025278570464993205, "loss": 4.085393676757812, "step": 45500 
}, { "epoch": 0.4731910301244202, "grad_norm": 2.954148292541504, "learning_rate": 0.00025268193468718544, "loss": 4.039700622558594, "step": 45600 }, { "epoch": 0.474228729751886, "grad_norm": 1.9127237796783447, "learning_rate": 0.00025257816472443883, "loss": 4.100406494140625, "step": 45700 }, { "epoch": 0.4752664293793519, "grad_norm": 1.8794509172439575, "learning_rate": 0.00025247439476169223, "loss": 3.9390939331054686, "step": 45800 }, { "epoch": 0.4763041290068177, "grad_norm": 2.165816307067871, "learning_rate": 0.0002523706247989457, "loss": 4.155856628417968, "step": 45900 }, { "epoch": 0.47734182863428354, "grad_norm": 6.686591148376465, "learning_rate": 0.00025226685483619913, "loss": 4.097453918457031, "step": 46000 }, { "epoch": 0.47837952826174934, "grad_norm": 2.4973371028900146, "learning_rate": 0.0002521630848734525, "loss": 4.200291137695313, "step": 46100 }, { "epoch": 0.4794172278892152, "grad_norm": 2.1478147506713867, "learning_rate": 0.0002520593149107059, "loss": 3.899898681640625, "step": 46200 }, { "epoch": 0.480454927516681, "grad_norm": 1.6290667057037354, "learning_rate": 0.0002519555449479593, "loss": 4.157419128417969, "step": 46300 }, { "epoch": 0.48149262714414687, "grad_norm": 2.3697171211242676, "learning_rate": 0.00025185177498521276, "loss": 4.0068753051757815, "step": 46400 }, { "epoch": 0.48253032677161267, "grad_norm": 3.123157501220703, "learning_rate": 0.00025174800502246616, "loss": 3.9923574829101565, "step": 46500 }, { "epoch": 0.48356802639907853, "grad_norm": 3.4272193908691406, "learning_rate": 0.0002516442350597196, "loss": 4.144463195800781, "step": 46600 }, { "epoch": 0.48460572602654434, "grad_norm": 2.8348467350006104, "learning_rate": 0.000251540465096973, "loss": 4.055748291015625, "step": 46700 }, { "epoch": 0.4856434256540102, "grad_norm": 3.0261967182159424, "learning_rate": 0.0002514366951342264, "loss": 4.177880554199219, "step": 46800 }, { "epoch": 0.486681125281476, "grad_norm": 10.726264953613281, 
"learning_rate": 0.00025133292517147984, "loss": 3.9125796508789064, "step": 46900 }, { "epoch": 0.48771882490894186, "grad_norm": 8.811136245727539, "learning_rate": 0.00025122915520873324, "loss": 3.9216848754882814, "step": 47000 }, { "epoch": 0.48875652453640767, "grad_norm": 6.8598151206970215, "learning_rate": 0.0002511253852459867, "loss": 3.9738433837890623, "step": 47100 }, { "epoch": 0.4897942241638735, "grad_norm": 5.096536636352539, "learning_rate": 0.0002510216152832401, "loss": 3.998507080078125, "step": 47200 }, { "epoch": 0.4908319237913394, "grad_norm": 1.4742202758789062, "learning_rate": 0.00025091784532049353, "loss": 4.171350402832031, "step": 47300 }, { "epoch": 0.4918696234188052, "grad_norm": 1.88887357711792, "learning_rate": 0.0002508140753577469, "loss": 4.106647644042969, "step": 47400 }, { "epoch": 0.49290732304627105, "grad_norm": 1.6502625942230225, "learning_rate": 0.0002507103053950003, "loss": 3.877885437011719, "step": 47500 }, { "epoch": 0.49394502267373686, "grad_norm": 1.728053331375122, "learning_rate": 0.00025060653543225377, "loss": 4.064427795410157, "step": 47600 }, { "epoch": 0.4949827223012027, "grad_norm": 4.632587432861328, "learning_rate": 0.00025050276546950716, "loss": 4.113824157714844, "step": 47700 }, { "epoch": 0.4960204219286685, "grad_norm": 1.5823708772659302, "learning_rate": 0.0002503989955067606, "loss": 4.080696411132813, "step": 47800 }, { "epoch": 0.4970581215561344, "grad_norm": 1.9801136255264282, "learning_rate": 0.000250295225544014, "loss": 3.945875549316406, "step": 47900 }, { "epoch": 0.4980958211836002, "grad_norm": 1.3339368104934692, "learning_rate": 0.0002501914555812674, "loss": 3.951331787109375, "step": 48000 }, { "epoch": 0.49913352081106604, "grad_norm": 2.1013355255126953, "learning_rate": 0.00025008768561852085, "loss": 4.022156372070312, "step": 48100 }, { "epoch": 0.5001712204385319, "grad_norm": 2.7022488117218018, "learning_rate": 0.00024998391565577425, "loss": 3.9780624389648436, 
"step": 48200 }, { "epoch": 0.5012089200659977, "grad_norm": 10.230494499206543, "learning_rate": 0.0002498801456930277, "loss": 4.024637145996094, "step": 48300 }, { "epoch": 0.5022466196934635, "grad_norm": 7.242427349090576, "learning_rate": 0.0002497763757302811, "loss": 3.9954248046875, "step": 48400 }, { "epoch": 0.5032843193209293, "grad_norm": 2.742445945739746, "learning_rate": 0.0002496726057675345, "loss": 3.9637130737304687, "step": 48500 }, { "epoch": 0.5043220189483952, "grad_norm": 1.6320149898529053, "learning_rate": 0.00024956883580478794, "loss": 4.035350952148438, "step": 48600 }, { "epoch": 0.505359718575861, "grad_norm": 2.239950180053711, "learning_rate": 0.00024946506584204133, "loss": 3.961440124511719, "step": 48700 }, { "epoch": 0.5063974182033268, "grad_norm": 6.686822891235352, "learning_rate": 0.0002493612958792948, "loss": 4.003260498046875, "step": 48800 }, { "epoch": 0.5074351178307926, "grad_norm": 1.9818964004516602, "learning_rate": 0.0002492575259165482, "loss": 4.018614501953125, "step": 48900 }, { "epoch": 0.5084728174582586, "grad_norm": 1.5698004961013794, "learning_rate": 0.00024915375595380157, "loss": 4.045997314453125, "step": 49000 }, { "epoch": 0.5095105170857244, "grad_norm": 2.3865158557891846, "learning_rate": 0.000249049985991055, "loss": 4.050853576660156, "step": 49100 }, { "epoch": 0.5105482167131902, "grad_norm": 14.248946189880371, "learning_rate": 0.0002489462160283084, "loss": 3.991949462890625, "step": 49200 }, { "epoch": 0.5115859163406561, "grad_norm": 1.279118537902832, "learning_rate": 0.00024884244606556186, "loss": 3.92796875, "step": 49300 }, { "epoch": 0.5126236159681219, "grad_norm": 2.575704574584961, "learning_rate": 0.00024873867610281526, "loss": 4.12865478515625, "step": 49400 }, { "epoch": 0.5136613155955877, "grad_norm": 2.0912930965423584, "learning_rate": 0.0002486349061400687, "loss": 4.042799682617187, "step": 49500 }, { "epoch": 0.5146990152230535, "grad_norm": 2.6358580589294434, 
"learning_rate": 0.0002485311361773221, "loss": 4.069761047363281, "step": 49600 }, { "epoch": 0.5157367148505194, "grad_norm": 2.6711385250091553, "learning_rate": 0.0002484273662145755, "loss": 3.9823483276367186, "step": 49700 }, { "epoch": 0.5167744144779852, "grad_norm": 3.348376989364624, "learning_rate": 0.00024832359625182894, "loss": 4.119874572753906, "step": 49800 }, { "epoch": 0.517812114105451, "grad_norm": 1.7040736675262451, "learning_rate": 0.00024821982628908234, "loss": 4.038002319335938, "step": 49900 }, { "epoch": 0.5188498137329168, "grad_norm": 11.144097328186035, "learning_rate": 0.0002481160563263358, "loss": 3.933763122558594, "step": 50000 }, { "epoch": 0.5198875133603827, "grad_norm": 3.1529595851898193, "learning_rate": 0.0002480122863635892, "loss": 3.990421142578125, "step": 50100 }, { "epoch": 0.5209252129878486, "grad_norm": 2.3761773109436035, "learning_rate": 0.0002479085164008426, "loss": 3.9385421752929686, "step": 50200 }, { "epoch": 0.5219629126153144, "grad_norm": 14.909253120422363, "learning_rate": 0.00024780474643809597, "loss": 3.924638671875, "step": 50300 }, { "epoch": 0.5230006122427802, "grad_norm": 1.4870705604553223, "learning_rate": 0.0002477009764753494, "loss": 4.003363037109375, "step": 50400 }, { "epoch": 0.5240383118702461, "grad_norm": 2.5456697940826416, "learning_rate": 0.00024759720651260287, "loss": 4.063373413085937, "step": 50500 }, { "epoch": 0.5250760114977119, "grad_norm": 4.392611980438232, "learning_rate": 0.00024749343654985627, "loss": 4.108450927734375, "step": 50600 }, { "epoch": 0.5261137111251777, "grad_norm": 2.8420300483703613, "learning_rate": 0.00024738966658710966, "loss": 3.9724908447265626, "step": 50700 }, { "epoch": 0.5271514107526435, "grad_norm": 2.3819692134857178, "learning_rate": 0.00024728589662436306, "loss": 4.040487060546875, "step": 50800 }, { "epoch": 0.5281891103801094, "grad_norm": 2.1021909713745117, "learning_rate": 0.0002471821266616165, "loss": 4.101463623046875, 
"step": 50900 }, { "epoch": 0.5292268100075752, "grad_norm": 2.8605117797851562, "learning_rate": 0.00024707835669886995, "loss": 3.9426974487304687, "step": 51000 }, { "epoch": 0.530264509635041, "grad_norm": 1.331457257270813, "learning_rate": 0.00024697458673612335, "loss": 4.005464172363281, "step": 51100 }, { "epoch": 0.5313022092625068, "grad_norm": 2.4866714477539062, "learning_rate": 0.00024687081677337674, "loss": 4.089916687011719, "step": 51200 }, { "epoch": 0.5323399088899727, "grad_norm": 6.342608451843262, "learning_rate": 0.00024676704681063014, "loss": 3.979620361328125, "step": 51300 }, { "epoch": 0.5333776085174385, "grad_norm": 1.3954708576202393, "learning_rate": 0.0002466632768478836, "loss": 3.9805123901367185, "step": 51400 }, { "epoch": 0.5344153081449043, "grad_norm": 24.8520450592041, "learning_rate": 0.000246559506885137, "loss": 4.0105502319335935, "step": 51500 }, { "epoch": 0.5354530077723703, "grad_norm": 2.0366039276123047, "learning_rate": 0.00024645573692239043, "loss": 3.919516296386719, "step": 51600 }, { "epoch": 0.5364907073998361, "grad_norm": 1.3017858266830444, "learning_rate": 0.0002463519669596439, "loss": 3.951867980957031, "step": 51700 }, { "epoch": 0.5375284070273019, "grad_norm": 2.579885244369507, "learning_rate": 0.0002462481969968973, "loss": 3.960545959472656, "step": 51800 }, { "epoch": 0.5385661066547677, "grad_norm": 1.5787100791931152, "learning_rate": 0.00024614442703415067, "loss": 4.013999938964844, "step": 51900 }, { "epoch": 0.5396038062822336, "grad_norm": 3.9871633052825928, "learning_rate": 0.00024604065707140406, "loss": 3.950070495605469, "step": 52000 }, { "epoch": 0.5406415059096994, "grad_norm": 1.572277545928955, "learning_rate": 0.0002459368871086575, "loss": 4.086417846679687, "step": 52100 }, { "epoch": 0.5416792055371652, "grad_norm": 7.029146671295166, "learning_rate": 0.0002458331171459109, "loss": 3.8767724609375, "step": 52200 }, { "epoch": 0.542716905164631, "grad_norm": 
1.2442755699157715, "learning_rate": 0.00024572934718316436, "loss": 3.875315856933594, "step": 52300 }, { "epoch": 0.5437546047920969, "grad_norm": 3.5381152629852295, "learning_rate": 0.00024562557722041775, "loss": 4.013727416992188, "step": 52400 }, { "epoch": 0.5447923044195627, "grad_norm": 16.472898483276367, "learning_rate": 0.00024552180725767115, "loss": 4.058722839355469, "step": 52500 }, { "epoch": 0.5458300040470285, "grad_norm": 1.4836983680725098, "learning_rate": 0.0002454180372949246, "loss": 4.106039123535156, "step": 52600 }, { "epoch": 0.5468677036744943, "grad_norm": 4.735908031463623, "learning_rate": 0.000245314267332178, "loss": 4.109900817871094, "step": 52700 }, { "epoch": 0.5479054033019602, "grad_norm": 1.7438913583755493, "learning_rate": 0.00024521049736943144, "loss": 4.098789978027344, "step": 52800 }, { "epoch": 0.548943102929426, "grad_norm": 3.592564105987549, "learning_rate": 0.00024510672740668483, "loss": 3.9866278076171877, "step": 52900 }, { "epoch": 0.5499808025568919, "grad_norm": 1.9763888120651245, "learning_rate": 0.00024500295744393823, "loss": 3.9620831298828123, "step": 53000 }, { "epoch": 0.5510185021843577, "grad_norm": 1.0539793968200684, "learning_rate": 0.0002448991874811917, "loss": 4.006460266113281, "step": 53100 }, { "epoch": 0.5520562018118236, "grad_norm": 2.2474358081817627, "learning_rate": 0.00024479541751844507, "loss": 4.067258605957031, "step": 53200 }, { "epoch": 0.5530939014392894, "grad_norm": 1.5785913467407227, "learning_rate": 0.0002446916475556985, "loss": 4.057683715820312, "step": 53300 }, { "epoch": 0.5541316010667552, "grad_norm": 2.2754416465759277, "learning_rate": 0.0002445878775929519, "loss": 3.9662628173828125, "step": 53400 }, { "epoch": 0.5551693006942211, "grad_norm": 2.0118043422698975, "learning_rate": 0.00024448410763020537, "loss": 3.9848583984375, "step": 53500 }, { "epoch": 0.5562070003216869, "grad_norm": 2.3987770080566406, "learning_rate": 0.00024438033766745876, "loss": 
4.00030029296875, "step": 53600 }, { "epoch": 0.5572446999491527, "grad_norm": 2.9198148250579834, "learning_rate": 0.00024427656770471216, "loss": 3.8882846069335937, "step": 53700 }, { "epoch": 0.5582823995766185, "grad_norm": 2.0234696865081787, "learning_rate": 0.0002441727977419656, "loss": 3.9845794677734374, "step": 53800 }, { "epoch": 0.5593200992040844, "grad_norm": 1.701568841934204, "learning_rate": 0.000244069027779219, "loss": 4.01090087890625, "step": 53900 }, { "epoch": 0.5603577988315502, "grad_norm": 2.3093771934509277, "learning_rate": 0.00024396525781647242, "loss": 3.9678195190429686, "step": 54000 }, { "epoch": 0.561395498459016, "grad_norm": 2.0182909965515137, "learning_rate": 0.00024386148785372582, "loss": 4.025320434570313, "step": 54100 }, { "epoch": 0.5624331980864818, "grad_norm": 3.1341028213500977, "learning_rate": 0.00024375771789097927, "loss": 3.9826446533203126, "step": 54200 }, { "epoch": 0.5634708977139478, "grad_norm": 2.025581121444702, "learning_rate": 0.0002436539479282327, "loss": 3.906527404785156, "step": 54300 }, { "epoch": 0.5645085973414136, "grad_norm": 2.913895845413208, "learning_rate": 0.00024355017796548608, "loss": 3.970755920410156, "step": 54400 }, { "epoch": 0.5655462969688794, "grad_norm": 1.9220850467681885, "learning_rate": 0.0002434464080027395, "loss": 3.943621826171875, "step": 54500 }, { "epoch": 0.5665839965963452, "grad_norm": 1.2168983221054077, "learning_rate": 0.0002433426380399929, "loss": 3.9780545043945312, "step": 54600 }, { "epoch": 0.5676216962238111, "grad_norm": 1.5367380380630493, "learning_rate": 0.00024323886807724635, "loss": 3.8468157958984377, "step": 54700 }, { "epoch": 0.5686593958512769, "grad_norm": 2.7281689643859863, "learning_rate": 0.00024313509811449977, "loss": 3.9043319702148436, "step": 54800 }, { "epoch": 0.5696970954787427, "grad_norm": 1.1875724792480469, "learning_rate": 0.00024303132815175316, "loss": 4.029020385742188, "step": 54900 }, { "epoch": 0.5707347951062085, 
"grad_norm": 9.087173461914062, "learning_rate": 0.00024292755818900659, "loss": 3.977708740234375, "step": 55000 }, { "epoch": 0.5717724947336744, "grad_norm": 1.94620943069458, "learning_rate": 0.00024282378822626, "loss": 3.9465988159179686, "step": 55100 }, { "epoch": 0.5728101943611402, "grad_norm": 3.0396885871887207, "learning_rate": 0.00024272001826351343, "loss": 4.030888366699219, "step": 55200 }, { "epoch": 0.573847893988606, "grad_norm": 1.557199239730835, "learning_rate": 0.00024261624830076682, "loss": 3.9756591796875, "step": 55300 }, { "epoch": 0.5748855936160719, "grad_norm": 3.0625579357147217, "learning_rate": 0.00024251247833802025, "loss": 4.076784362792969, "step": 55400 }, { "epoch": 0.5759232932435377, "grad_norm": 1.9166301488876343, "learning_rate": 0.0002424087083752737, "loss": 3.9604058837890626, "step": 55500 }, { "epoch": 0.5769609928710036, "grad_norm": 1.2829216718673706, "learning_rate": 0.0002423049384125271, "loss": 3.841531066894531, "step": 55600 }, { "epoch": 0.5779986924984694, "grad_norm": 2.9800634384155273, "learning_rate": 0.0002422011684497805, "loss": 3.915208435058594, "step": 55700 }, { "epoch": 0.5790363921259353, "grad_norm": 4.931972026824951, "learning_rate": 0.0002420973984870339, "loss": 3.7610516357421875, "step": 55800 }, { "epoch": 0.5800740917534011, "grad_norm": 3.796473264694214, "learning_rate": 0.00024199362852428733, "loss": 4.009695129394531, "step": 55900 }, { "epoch": 0.5811117913808669, "grad_norm": 2.3635172843933105, "learning_rate": 0.00024188985856154075, "loss": 4.164959716796875, "step": 56000 }, { "epoch": 0.5821494910083327, "grad_norm": 2.3295187950134277, "learning_rate": 0.00024178608859879417, "loss": 4.012393493652343, "step": 56100 }, { "epoch": 0.5831871906357986, "grad_norm": 3.1501762866973877, "learning_rate": 0.0002416823186360476, "loss": 3.9226104736328127, "step": 56200 }, { "epoch": 0.5842248902632644, "grad_norm": 2.8185627460479736, "learning_rate": 0.000241578548673301, 
"loss": 3.9830364990234375, "step": 56300 }, { "epoch": 0.5852625898907302, "grad_norm": 2.39125657081604, "learning_rate": 0.00024147477871055444, "loss": 4.058615112304688, "step": 56400 }, { "epoch": 0.586300289518196, "grad_norm": 2.658254623413086, "learning_rate": 0.00024137100874780783, "loss": 3.9012820434570314, "step": 56500 }, { "epoch": 0.5873379891456619, "grad_norm": 2.873662233352661, "learning_rate": 0.00024126723878506126, "loss": 4.018562622070313, "step": 56600 }, { "epoch": 0.5883756887731277, "grad_norm": 2.0522000789642334, "learning_rate": 0.00024116346882231468, "loss": 4.0417938232421875, "step": 56700 }, { "epoch": 0.5894133884005935, "grad_norm": 2.688117742538452, "learning_rate": 0.00024105969885956807, "loss": 3.910294494628906, "step": 56800 }, { "epoch": 0.5904510880280593, "grad_norm": 3.5324251651763916, "learning_rate": 0.00024095592889682152, "loss": 4.042366027832031, "step": 56900 }, { "epoch": 0.5914887876555253, "grad_norm": 3.254483461380005, "learning_rate": 0.00024085215893407492, "loss": 3.875579833984375, "step": 57000 }, { "epoch": 0.5925264872829911, "grad_norm": 1.4469491243362427, "learning_rate": 0.00024074838897132834, "loss": 3.8468057250976564, "step": 57100 }, { "epoch": 0.5935641869104569, "grad_norm": 7.142496585845947, "learning_rate": 0.00024064461900858173, "loss": 3.9028366088867186, "step": 57200 }, { "epoch": 0.5946018865379228, "grad_norm": 2.8328020572662354, "learning_rate": 0.00024054084904583518, "loss": 4.013849182128906, "step": 57300 }, { "epoch": 0.5956395861653886, "grad_norm": 1.999799370765686, "learning_rate": 0.0002404370790830886, "loss": 3.9890103149414062, "step": 57400 }, { "epoch": 0.5966772857928544, "grad_norm": 5.142120361328125, "learning_rate": 0.000240333309120342, "loss": 3.8782421875, "step": 57500 }, { "epoch": 0.5977149854203202, "grad_norm": 2.6170506477355957, "learning_rate": 0.00024022953915759542, "loss": 3.9341799926757814, "step": 57600 }, { "epoch": 
0.5987526850477861, "grad_norm": 4.847115993499756, "learning_rate": 0.00024012576919484882, "loss": 4.028234252929687, "step": 57700 }, { "epoch": 0.5997903846752519, "grad_norm": 3.093014717102051, "learning_rate": 0.00024002199923210226, "loss": 4.02893310546875, "step": 57800 }, { "epoch": 0.6008280843027177, "grad_norm": 2.6559977531433105, "learning_rate": 0.00023991822926935566, "loss": 3.9997882080078124, "step": 57900 }, { "epoch": 0.6018657839301835, "grad_norm": 1.5972485542297363, "learning_rate": 0.00023981445930660908, "loss": 3.9749560546875, "step": 58000 }, { "epoch": 0.6029034835576494, "grad_norm": 3.777557134628296, "learning_rate": 0.0002397106893438625, "loss": 3.9969076538085937, "step": 58100 }, { "epoch": 0.6039411831851152, "grad_norm": 1.8903939723968506, "learning_rate": 0.00023960691938111593, "loss": 4.007763977050781, "step": 58200 }, { "epoch": 0.604978882812581, "grad_norm": 3.150963068008423, "learning_rate": 0.00023950314941836935, "loss": 4.019749145507813, "step": 58300 }, { "epoch": 0.6060165824400469, "grad_norm": 1.934287190437317, "learning_rate": 0.00023939937945562274, "loss": 4.014994812011719, "step": 58400 }, { "epoch": 0.6070542820675128, "grad_norm": 7.10530948638916, "learning_rate": 0.00023929560949287616, "loss": 4.050195617675781, "step": 58500 }, { "epoch": 0.6080919816949786, "grad_norm": 2.367403030395508, "learning_rate": 0.0002391918395301296, "loss": 3.8701296997070314, "step": 58600 }, { "epoch": 0.6091296813224444, "grad_norm": 1.9392305612564087, "learning_rate": 0.000239088069567383, "loss": 4.08440185546875, "step": 58700 }, { "epoch": 0.6101673809499102, "grad_norm": 2.5947983264923096, "learning_rate": 0.00023898429960463643, "loss": 4.050205078125, "step": 58800 }, { "epoch": 0.6112050805773761, "grad_norm": 2.1583032608032227, "learning_rate": 0.00023888052964188982, "loss": 3.958690490722656, "step": 58900 }, { "epoch": 0.6122427802048419, "grad_norm": 1.6529427766799927, "learning_rate": 
0.00023877675967914325, "loss": 3.9609234619140623, "step": 59000 }, { "epoch": 0.6132804798323077, "grad_norm": 2.0239171981811523, "learning_rate": 0.00023867298971639667, "loss": 4.128135986328125, "step": 59100 }, { "epoch": 0.6143181794597736, "grad_norm": 3.8679206371307373, "learning_rate": 0.0002385692197536501, "loss": 4.005528869628907, "step": 59200 }, { "epoch": 0.6153558790872394, "grad_norm": 3.305494785308838, "learning_rate": 0.0002384654497909035, "loss": 3.9134161376953127, "step": 59300 }, { "epoch": 0.6163935787147052, "grad_norm": 1.640649676322937, "learning_rate": 0.0002383616798281569, "loss": 3.92852783203125, "step": 59400 }, { "epoch": 0.617431278342171, "grad_norm": 1.7184723615646362, "learning_rate": 0.00023825790986541036, "loss": 3.8771322631835936, "step": 59500 }, { "epoch": 0.618468977969637, "grad_norm": 2.6886117458343506, "learning_rate": 0.00023815413990266375, "loss": 4.047822875976562, "step": 59600 }, { "epoch": 0.6195066775971028, "grad_norm": 2.9485394954681396, "learning_rate": 0.00023805036993991717, "loss": 4.04974853515625, "step": 59700 }, { "epoch": 0.6205443772245686, "grad_norm": 18.998411178588867, "learning_rate": 0.00023794659997717057, "loss": 3.978843994140625, "step": 59800 }, { "epoch": 0.6215820768520344, "grad_norm": 1.6347628831863403, "learning_rate": 0.000237842830014424, "loss": 3.94311279296875, "step": 59900 }, { "epoch": 0.6226197764795003, "grad_norm": 4.1301798820495605, "learning_rate": 0.00023773906005167744, "loss": 4.044434814453125, "step": 60000 }, { "epoch": 0.6236574761069661, "grad_norm": 2.7278170585632324, "learning_rate": 0.00023763529008893083, "loss": 3.9771295166015626, "step": 60100 }, { "epoch": 0.6246951757344319, "grad_norm": 3.4196488857269287, "learning_rate": 0.00023753152012618426, "loss": 3.9663619995117188, "step": 60200 }, { "epoch": 0.6257328753618977, "grad_norm": 1.3134477138519287, "learning_rate": 0.00023742775016343765, "loss": 4.089789733886719, "step": 60300 }, { 
"epoch": 0.6267705749893636, "grad_norm": 4.490455627441406, "learning_rate": 0.0002373239802006911, "loss": 3.87512939453125, "step": 60400 }, { "epoch": 0.6278082746168294, "grad_norm": 3.0652222633361816, "learning_rate": 0.00023722021023794452, "loss": 3.893270263671875, "step": 60500 }, { "epoch": 0.6288459742442952, "grad_norm": 8.751646995544434, "learning_rate": 0.00023711644027519792, "loss": 3.862340393066406, "step": 60600 }, { "epoch": 0.629883673871761, "grad_norm": 2.9108734130859375, "learning_rate": 0.00023701267031245134, "loss": 3.9849557495117187, "step": 60700 }, { "epoch": 0.630921373499227, "grad_norm": 2.250643253326416, "learning_rate": 0.00023690890034970473, "loss": 3.955241394042969, "step": 60800 }, { "epoch": 0.6319590731266927, "grad_norm": 1.4363751411437988, "learning_rate": 0.00023680513038695818, "loss": 4.0179071044921875, "step": 60900 }, { "epoch": 0.6329967727541586, "grad_norm": 1.6399027109146118, "learning_rate": 0.00023670136042421158, "loss": 3.911060485839844, "step": 61000 }, { "epoch": 0.6340344723816245, "grad_norm": 2.371727228164673, "learning_rate": 0.000236597590461465, "loss": 3.9237380981445313, "step": 61100 }, { "epoch": 0.6350721720090903, "grad_norm": 1.6354718208312988, "learning_rate": 0.00023649382049871842, "loss": 4.036581420898438, "step": 61200 }, { "epoch": 0.6361098716365561, "grad_norm": 3.147254705429077, "learning_rate": 0.00023639005053597184, "loss": 4.009747619628906, "step": 61300 }, { "epoch": 0.6371475712640219, "grad_norm": 2.9439003467559814, "learning_rate": 0.00023628628057322526, "loss": 3.965068664550781, "step": 61400 }, { "epoch": 0.6381852708914878, "grad_norm": 2.8980836868286133, "learning_rate": 0.00023618251061047866, "loss": 3.99951171875, "step": 61500 }, { "epoch": 0.6392229705189536, "grad_norm": 2.862438201904297, "learning_rate": 0.00023607874064773208, "loss": 3.8896145629882812, "step": 61600 }, { "epoch": 0.6402606701464194, "grad_norm": 1.7125756740570068, 
"learning_rate": 0.00023597497068498548, "loss": 3.9900253295898436, "step": 61700 }, { "epoch": 0.6412983697738852, "grad_norm": 13.891119956970215, "learning_rate": 0.00023587120072223892, "loss": 3.8787249755859374, "step": 61800 }, { "epoch": 0.6423360694013511, "grad_norm": 3.5258827209472656, "learning_rate": 0.00023576743075949235, "loss": 3.940326843261719, "step": 61900 }, { "epoch": 0.6433737690288169, "grad_norm": 4.297271251678467, "learning_rate": 0.00023566366079674574, "loss": 3.8732571411132812, "step": 62000 }, { "epoch": 0.6444114686562827, "grad_norm": 3.574477195739746, "learning_rate": 0.00023555989083399916, "loss": 4.078603515625, "step": 62100 }, { "epoch": 0.6454491682837485, "grad_norm": 3.2514758110046387, "learning_rate": 0.00023545612087125259, "loss": 3.956298522949219, "step": 62200 }, { "epoch": 0.6464868679112145, "grad_norm": 2.582719326019287, "learning_rate": 0.000235352350908506, "loss": 3.8729116821289065, "step": 62300 }, { "epoch": 0.6475245675386803, "grad_norm": 2.445774793624878, "learning_rate": 0.00023524858094575943, "loss": 4.064724426269532, "step": 62400 }, { "epoch": 0.6485622671661461, "grad_norm": 4.912772178649902, "learning_rate": 0.00023514481098301282, "loss": 4.02049560546875, "step": 62500 }, { "epoch": 0.6495999667936119, "grad_norm": 3.490936040878296, "learning_rate": 0.00023504104102026627, "loss": 3.912366943359375, "step": 62600 }, { "epoch": 0.6506376664210778, "grad_norm": 2.109618902206421, "learning_rate": 0.00023493727105751967, "loss": 3.963838806152344, "step": 62700 }, { "epoch": 0.6516753660485436, "grad_norm": 12.706518173217773, "learning_rate": 0.0002348335010947731, "loss": 3.901888732910156, "step": 62800 }, { "epoch": 0.6527130656760094, "grad_norm": 4.266041278839111, "learning_rate": 0.00023472973113202648, "loss": 3.902781982421875, "step": 62900 }, { "epoch": 0.6537507653034752, "grad_norm": 3.4900457859039307, "learning_rate": 0.0002346259611692799, "loss": 3.8866873168945313, 
"step": 63000 }, { "epoch": 0.6547884649309411, "grad_norm": 2.4276134967803955, "learning_rate": 0.00023452219120653336, "loss": 3.8234634399414062, "step": 63100 }, { "epoch": 0.6558261645584069, "grad_norm": 2.8377914428710938, "learning_rate": 0.00023441842124378675, "loss": 3.836332092285156, "step": 63200 }, { "epoch": 0.6568638641858727, "grad_norm": 6.935495853424072, "learning_rate": 0.00023431465128104017, "loss": 4.100373229980469, "step": 63300 }, { "epoch": 0.6579015638133386, "grad_norm": 2.90283465385437, "learning_rate": 0.00023421088131829357, "loss": 3.9408758544921874, "step": 63400 }, { "epoch": 0.6589392634408044, "grad_norm": 2.8002378940582275, "learning_rate": 0.00023410711135554702, "loss": 3.9959124755859374, "step": 63500 }, { "epoch": 0.6599769630682703, "grad_norm": 6.091791152954102, "learning_rate": 0.0002340033413928004, "loss": 3.9287460327148436, "step": 63600 }, { "epoch": 0.661014662695736, "grad_norm": 1.2786389589309692, "learning_rate": 0.00023389957143005383, "loss": 4.015799560546875, "step": 63700 }, { "epoch": 0.662052362323202, "grad_norm": 1.4586912393569946, "learning_rate": 0.00023379580146730726, "loss": 3.89241455078125, "step": 63800 }, { "epoch": 0.6630900619506678, "grad_norm": 2.502657890319824, "learning_rate": 0.00023369203150456065, "loss": 3.9217596435546875, "step": 63900 }, { "epoch": 0.6641277615781336, "grad_norm": 3.8019394874572754, "learning_rate": 0.0002335882615418141, "loss": 3.91360595703125, "step": 64000 }, { "epoch": 0.6651654612055994, "grad_norm": 1.5058764219284058, "learning_rate": 0.0002334844915790675, "loss": 4.059972839355469, "step": 64100 }, { "epoch": 0.6662031608330653, "grad_norm": 2.416229248046875, "learning_rate": 0.00023338072161632092, "loss": 3.9887905883789063, "step": 64200 }, { "epoch": 0.6672408604605311, "grad_norm": 1.8767884969711304, "learning_rate": 0.00023327695165357434, "loss": 3.8748153686523437, "step": 64300 }, { "epoch": 0.6682785600879969, "grad_norm": 
1.7000967264175415, "learning_rate": 0.00023317318169082776, "loss": 3.9118023681640626, "step": 64400 }, { "epoch": 0.6693162597154627, "grad_norm": 4.796393394470215, "learning_rate": 0.00023306941172808118, "loss": 3.9076058959960935, "step": 64500 }, { "epoch": 0.6703539593429286, "grad_norm": 3.117870807647705, "learning_rate": 0.00023296564176533458, "loss": 3.95484375, "step": 64600 }, { "epoch": 0.6713916589703944, "grad_norm": 1.6787638664245605, "learning_rate": 0.000232861871802588, "loss": 3.858246154785156, "step": 64700 }, { "epoch": 0.6724293585978602, "grad_norm": 5.671106815338135, "learning_rate": 0.0002327581018398414, "loss": 3.9156753540039064, "step": 64800 }, { "epoch": 0.673467058225326, "grad_norm": 7.058924674987793, "learning_rate": 0.00023265433187709484, "loss": 3.8724734497070314, "step": 64900 }, { "epoch": 0.674504757852792, "grad_norm": 4.8587422370910645, "learning_rate": 0.00023255056191434826, "loss": 3.958966064453125, "step": 65000 }, { "epoch": 0.6755424574802578, "grad_norm": 2.546802520751953, "learning_rate": 0.00023244679195160166, "loss": 3.9913558959960938, "step": 65100 }, { "epoch": 0.6765801571077236, "grad_norm": 1.8444024324417114, "learning_rate": 0.00023234302198885508, "loss": 4.089451293945313, "step": 65200 }, { "epoch": 0.6776178567351895, "grad_norm": 1.5202494859695435, "learning_rate": 0.0002322392520261085, "loss": 3.83590576171875, "step": 65300 }, { "epoch": 0.6786555563626553, "grad_norm": 2.554324150085449, "learning_rate": 0.00023213548206336192, "loss": 3.9957940673828123, "step": 65400 }, { "epoch": 0.6796932559901211, "grad_norm": 1.6007890701293945, "learning_rate": 0.00023203171210061532, "loss": 3.9022012329101563, "step": 65500 }, { "epoch": 0.6807309556175869, "grad_norm": 2.593081474304199, "learning_rate": 0.00023192794213786874, "loss": 3.944790954589844, "step": 65600 }, { "epoch": 0.6817686552450528, "grad_norm": 2.1474156379699707, "learning_rate": 0.0002318241721751222, "loss": 
3.78737060546875, "step": 65700 }, { "epoch": 0.6828063548725186, "grad_norm": 3.1960246562957764, "learning_rate": 0.00023172040221237559, "loss": 3.9783554077148438, "step": 65800 }, { "epoch": 0.6838440544999844, "grad_norm": 3.8228328227996826, "learning_rate": 0.000231616632249629, "loss": 3.856565246582031, "step": 65900 }, { "epoch": 0.6848817541274502, "grad_norm": 11.939492225646973, "learning_rate": 0.0002315128622868824, "loss": 3.8156298828125, "step": 66000 }, { "epoch": 0.6859194537549161, "grad_norm": 1.8741025924682617, "learning_rate": 0.00023140909232413582, "loss": 3.9566534423828124, "step": 66100 }, { "epoch": 0.686957153382382, "grad_norm": 1.682139277458191, "learning_rate": 0.00023130532236138927, "loss": 3.9164004516601563, "step": 66200 }, { "epoch": 0.6879948530098478, "grad_norm": 1.1901954412460327, "learning_rate": 0.00023120155239864267, "loss": 4.0331982421875, "step": 66300 }, { "epoch": 0.6890325526373136, "grad_norm": 2.2226786613464355, "learning_rate": 0.0002310977824358961, "loss": 3.901326904296875, "step": 66400 }, { "epoch": 0.6900702522647795, "grad_norm": 2.28139328956604, "learning_rate": 0.00023099401247314948, "loss": 3.734437255859375, "step": 66500 }, { "epoch": 0.6911079518922453, "grad_norm": 3.9518322944641113, "learning_rate": 0.00023089024251040293, "loss": 3.890718994140625, "step": 66600 }, { "epoch": 0.6921456515197111, "grad_norm": 4.689309120178223, "learning_rate": 0.00023078647254765633, "loss": 3.83462646484375, "step": 66700 }, { "epoch": 0.6931833511471769, "grad_norm": 2.5103607177734375, "learning_rate": 0.00023068270258490975, "loss": 3.8714788818359374, "step": 66800 }, { "epoch": 0.6942210507746428, "grad_norm": 2.060398578643799, "learning_rate": 0.00023057893262216317, "loss": 3.8463949584960937, "step": 66900 }, { "epoch": 0.6952587504021086, "grad_norm": 3.9058265686035156, "learning_rate": 0.00023047516265941657, "loss": 3.955802001953125, "step": 67000 }, { "epoch": 0.6962964500295744, 
"grad_norm": 2.7018091678619385, "learning_rate": 0.00023037139269667002, "loss": 4.010853271484375, "step": 67100 }, { "epoch": 0.6973341496570403, "grad_norm": 1.759364366531372, "learning_rate": 0.0002302676227339234, "loss": 3.8436270141601563, "step": 67200 }, { "epoch": 0.6983718492845061, "grad_norm": 4.264219284057617, "learning_rate": 0.00023016385277117683, "loss": 3.906452941894531, "step": 67300 }, { "epoch": 0.6994095489119719, "grad_norm": 2.064502000808716, "learning_rate": 0.00023006008280843023, "loss": 3.9249755859375, "step": 67400 }, { "epoch": 0.7004472485394377, "grad_norm": 4.326413154602051, "learning_rate": 0.00022995631284568368, "loss": 3.9763421630859375, "step": 67500 }, { "epoch": 0.7014849481669037, "grad_norm": 1.5424126386642456, "learning_rate": 0.0002298525428829371, "loss": 3.9105490112304686, "step": 67600 }, { "epoch": 0.7025226477943695, "grad_norm": 3.1067123413085938, "learning_rate": 0.0002297487729201905, "loss": 4.066288146972656, "step": 67700 }, { "epoch": 0.7035603474218353, "grad_norm": 1.3455185890197754, "learning_rate": 0.00022964500295744392, "loss": 3.906605224609375, "step": 67800 }, { "epoch": 0.7045980470493011, "grad_norm": 4.567904472351074, "learning_rate": 0.0002295412329946973, "loss": 3.8274655151367187, "step": 67900 }, { "epoch": 0.705635746676767, "grad_norm": 1.4911061525344849, "learning_rate": 0.00022943746303195076, "loss": 3.8712289428710935, "step": 68000 }, { "epoch": 0.7066734463042328, "grad_norm": 1.8636422157287598, "learning_rate": 0.00022933369306920418, "loss": 3.9435845947265626, "step": 68100 }, { "epoch": 0.7077111459316986, "grad_norm": 4.616937637329102, "learning_rate": 0.00022922992310645758, "loss": 4.073515319824219, "step": 68200 }, { "epoch": 0.7087488455591644, "grad_norm": 2.339660167694092, "learning_rate": 0.000229126153143711, "loss": 3.752909851074219, "step": 68300 }, { "epoch": 0.7097865451866303, "grad_norm": 2.2960572242736816, "learning_rate": 
0.00022902238318096442, "loss": 3.841389465332031, "step": 68400 }, { "epoch": 0.7108242448140961, "grad_norm": 1.9303183555603027, "learning_rate": 0.00022891861321821784, "loss": 4.007230529785156, "step": 68500 }, { "epoch": 0.7118619444415619, "grad_norm": 3.3750216960906982, "learning_rate": 0.00022881484325547124, "loss": 4.0530221557617185, "step": 68600 }, { "epoch": 0.7128996440690277, "grad_norm": 3.9443397521972656, "learning_rate": 0.00022871107329272466, "loss": 3.92802734375, "step": 68700 }, { "epoch": 0.7139373436964936, "grad_norm": 2.2526562213897705, "learning_rate": 0.0002286073033299781, "loss": 4.117896728515625, "step": 68800 }, { "epoch": 0.7149750433239594, "grad_norm": 3.631329298019409, "learning_rate": 0.0002285035333672315, "loss": 3.876401062011719, "step": 68900 }, { "epoch": 0.7160127429514253, "grad_norm": 2.0594444274902344, "learning_rate": 0.00022839976340448492, "loss": 3.9595294189453125, "step": 69000 }, { "epoch": 0.7170504425788912, "grad_norm": 6.801323413848877, "learning_rate": 0.00022829599344173832, "loss": 3.966697998046875, "step": 69100 }, { "epoch": 0.718088142206357, "grad_norm": 3.579699754714966, "learning_rate": 0.00022819222347899174, "loss": 3.9083868408203126, "step": 69200 }, { "epoch": 0.7191258418338228, "grad_norm": 3.9111030101776123, "learning_rate": 0.0002280884535162452, "loss": 4.020595092773437, "step": 69300 }, { "epoch": 0.7201635414612886, "grad_norm": 1.5465009212493896, "learning_rate": 0.00022798468355349858, "loss": 4.002583618164063, "step": 69400 }, { "epoch": 0.7212012410887545, "grad_norm": 2.5977070331573486, "learning_rate": 0.000227880913590752, "loss": 3.82881591796875, "step": 69500 }, { "epoch": 0.7222389407162203, "grad_norm": 3.807143211364746, "learning_rate": 0.0002277771436280054, "loss": 3.8127020263671874, "step": 69600 }, { "epoch": 0.7232766403436861, "grad_norm": 3.562692165374756, "learning_rate": 0.00022767337366525885, "loss": 3.861103820800781, "step": 69700 }, { 
"epoch": 0.7243143399711519, "grad_norm": 4.136765003204346, "learning_rate": 0.00022756960370251225, "loss": 3.817465515136719, "step": 69800 }, { "epoch": 0.7253520395986178, "grad_norm": 1.9534144401550293, "learning_rate": 0.00022746583373976567, "loss": 3.784884338378906, "step": 69900 }, { "epoch": 0.7263897392260836, "grad_norm": 2.2738490104675293, "learning_rate": 0.0002273620637770191, "loss": 3.9553741455078124, "step": 70000 }, { "epoch": 0.7274274388535494, "grad_norm": 8.41178035736084, "learning_rate": 0.00022725829381427248, "loss": 3.9581622314453124, "step": 70100 }, { "epoch": 0.7284651384810152, "grad_norm": 2.574738025665283, "learning_rate": 0.00022715452385152593, "loss": 3.865647888183594, "step": 70200 }, { "epoch": 0.7295028381084812, "grad_norm": 4.12198543548584, "learning_rate": 0.00022705075388877933, "loss": 3.8447744750976565, "step": 70300 }, { "epoch": 0.730540537735947, "grad_norm": 3.4615478515625, "learning_rate": 0.00022694698392603275, "loss": 3.8417919921875, "step": 70400 }, { "epoch": 0.7315782373634128, "grad_norm": 1.9662399291992188, "learning_rate": 0.00022684321396328614, "loss": 3.943636779785156, "step": 70500 }, { "epoch": 0.7326159369908786, "grad_norm": 6.054515361785889, "learning_rate": 0.0002267394440005396, "loss": 3.9477130126953126, "step": 70600 }, { "epoch": 0.7336536366183445, "grad_norm": 2.6368846893310547, "learning_rate": 0.00022663567403779302, "loss": 3.9134860229492188, "step": 70700 }, { "epoch": 0.7346913362458103, "grad_norm": 18.437114715576172, "learning_rate": 0.0002265319040750464, "loss": 3.9025979614257813, "step": 70800 }, { "epoch": 0.7357290358732761, "grad_norm": 3.9227664470672607, "learning_rate": 0.00022642813411229983, "loss": 3.9925546264648437, "step": 70900 }, { "epoch": 0.736766735500742, "grad_norm": 2.9096601009368896, "learning_rate": 0.00022632436414955323, "loss": 3.7520477294921877, "step": 71000 }, { "epoch": 0.7378044351282078, "grad_norm": 2.756199598312378, 
"learning_rate": 0.00022622059418680668, "loss": 3.7744400024414064, "step": 71100 }, { "epoch": 0.7388421347556736, "grad_norm": 4.398651123046875, "learning_rate": 0.0002261168242240601, "loss": 3.8754537963867186, "step": 71200 }, { "epoch": 0.7398798343831394, "grad_norm": 3.0455260276794434, "learning_rate": 0.0002260130542613135, "loss": 3.8303518676757813, "step": 71300 }, { "epoch": 0.7409175340106053, "grad_norm": 1.6435341835021973, "learning_rate": 0.00022590928429856692, "loss": 3.868741149902344, "step": 71400 }, { "epoch": 0.7419552336380711, "grad_norm": 2.460381507873535, "learning_rate": 0.00022580551433582034, "loss": 3.971143798828125, "step": 71500 }, { "epoch": 0.742992933265537, "grad_norm": 3.793260335922241, "learning_rate": 0.00022570174437307376, "loss": 3.9564599609375, "step": 71600 }, { "epoch": 0.7440306328930028, "grad_norm": 2.2400221824645996, "learning_rate": 0.00022559797441032715, "loss": 3.868074951171875, "step": 71700 }, { "epoch": 0.7450683325204687, "grad_norm": 4.521097660064697, "learning_rate": 0.00022549420444758058, "loss": 3.9104345703125, "step": 71800 }, { "epoch": 0.7461060321479345, "grad_norm": 2.454610824584961, "learning_rate": 0.00022539043448483402, "loss": 3.8415142822265627, "step": 71900 }, { "epoch": 0.7471437317754003, "grad_norm": 1.7384246587753296, "learning_rate": 0.00022528666452208742, "loss": 3.9767572021484376, "step": 72000 }, { "epoch": 0.7481814314028661, "grad_norm": 2.3506603240966797, "learning_rate": 0.00022518289455934084, "loss": 3.804529724121094, "step": 72100 }, { "epoch": 0.749219131030332, "grad_norm": 8.719681739807129, "learning_rate": 0.00022507912459659424, "loss": 3.6692437744140625, "step": 72200 }, { "epoch": 0.7502568306577978, "grad_norm": 2.188565254211426, "learning_rate": 0.00022497535463384766, "loss": 3.9966400146484373, "step": 72300 }, { "epoch": 0.7512945302852636, "grad_norm": 2.7061383724212646, "learning_rate": 0.00022487158467110108, "loss": 3.7955560302734375, 
"step": 72400 }, { "epoch": 0.7523322299127294, "grad_norm": 1.820816993713379, "learning_rate": 0.0002247678147083545, "loss": 3.800717468261719, "step": 72500 }, { "epoch": 0.7533699295401953, "grad_norm": 2.3510568141937256, "learning_rate": 0.00022466404474560792, "loss": 3.8987237548828126, "step": 72600 }, { "epoch": 0.7544076291676611, "grad_norm": 3.0852279663085938, "learning_rate": 0.00022456027478286132, "loss": 3.9560122680664063, "step": 72700 }, { "epoch": 0.7554453287951269, "grad_norm": 2.3377742767333984, "learning_rate": 0.00022445650482011477, "loss": 3.9077328491210936, "step": 72800 }, { "epoch": 0.7564830284225929, "grad_norm": 4.257030010223389, "learning_rate": 0.00022435273485736816, "loss": 3.915125732421875, "step": 72900 }, { "epoch": 0.7575207280500587, "grad_norm": 1.8238855600357056, "learning_rate": 0.00022424896489462158, "loss": 3.8456768798828125, "step": 73000 }, { "epoch": 0.7585584276775245, "grad_norm": 2.2102901935577393, "learning_rate": 0.000224145194931875, "loss": 3.9905462646484375, "step": 73100 }, { "epoch": 0.7595961273049903, "grad_norm": 6.003772735595703, "learning_rate": 0.0002240414249691284, "loss": 3.831954040527344, "step": 73200 }, { "epoch": 0.7606338269324562, "grad_norm": 2.209681272506714, "learning_rate": 0.00022393765500638185, "loss": 3.96739990234375, "step": 73300 }, { "epoch": 0.761671526559922, "grad_norm": 5.8811235427856445, "learning_rate": 0.00022383388504363525, "loss": 3.8418869018554687, "step": 73400 }, { "epoch": 0.7627092261873878, "grad_norm": 1.9358527660369873, "learning_rate": 0.00022373011508088867, "loss": 3.9846435546875, "step": 73500 }, { "epoch": 0.7637469258148536, "grad_norm": 4.668230056762695, "learning_rate": 0.00022362634511814206, "loss": 3.87702880859375, "step": 73600 }, { "epoch": 0.7647846254423195, "grad_norm": 2.1674551963806152, "learning_rate": 0.0002235225751553955, "loss": 3.9948715209960937, "step": 73700 }, { "epoch": 0.7658223250697853, "grad_norm": 
3.276775360107422, "learning_rate": 0.00022341880519264893, "loss": 3.876432189941406, "step": 73800 }, { "epoch": 0.7668600246972511, "grad_norm": 2.382432222366333, "learning_rate": 0.00022331503522990233, "loss": 3.9535626220703124, "step": 73900 }, { "epoch": 0.7678977243247169, "grad_norm": 2.288184404373169, "learning_rate": 0.00022321126526715575, "loss": 3.962213134765625, "step": 74000 }, { "epoch": 0.7689354239521828, "grad_norm": 11.535764694213867, "learning_rate": 0.00022310749530440914, "loss": 3.839007568359375, "step": 74100 }, { "epoch": 0.7699731235796486, "grad_norm": 2.520615816116333, "learning_rate": 0.0002230037253416626, "loss": 3.942041015625, "step": 74200 }, { "epoch": 0.7710108232071144, "grad_norm": 5.035190582275391, "learning_rate": 0.000222899955378916, "loss": 3.827362365722656, "step": 74300 }, { "epoch": 0.7720485228345803, "grad_norm": 2.1133370399475098, "learning_rate": 0.0002227961854161694, "loss": 3.8085946655273437, "step": 74400 }, { "epoch": 0.7730862224620462, "grad_norm": 3.3813223838806152, "learning_rate": 0.00022269241545342283, "loss": 3.8528924560546876, "step": 74500 }, { "epoch": 0.774123922089512, "grad_norm": 2.5912599563598633, "learning_rate": 0.00022258864549067625, "loss": 4.025367126464844, "step": 74600 }, { "epoch": 0.7751616217169778, "grad_norm": 8.560553550720215, "learning_rate": 0.00022248487552792968, "loss": 3.8942611694335936, "step": 74700 }, { "epoch": 0.7761993213444436, "grad_norm": 2.7210657596588135, "learning_rate": 0.00022238110556518307, "loss": 3.7450421142578123, "step": 74800 }, { "epoch": 0.7772370209719095, "grad_norm": 3.06449031829834, "learning_rate": 0.0002222773356024365, "loss": 4.058497619628906, "step": 74900 }, { "epoch": 0.7782747205993753, "grad_norm": 2.6780056953430176, "learning_rate": 0.00022217356563968994, "loss": 3.908025207519531, "step": 75000 }, { "epoch": 0.7793124202268411, "grad_norm": 2.579087257385254, "learning_rate": 0.00022206979567694334, "loss": 
3.914963684082031, "step": 75100 }, { "epoch": 0.780350119854307, "grad_norm": 6.844696998596191, "learning_rate": 0.00022196602571419676, "loss": 3.8832046508789064, "step": 75200 }, { "epoch": 0.7813878194817728, "grad_norm": 7.694204330444336, "learning_rate": 0.00022186225575145015, "loss": 3.9718392944335936, "step": 75300 }, { "epoch": 0.7824255191092386, "grad_norm": 9.200462341308594, "learning_rate": 0.00022175848578870358, "loss": 3.859333801269531, "step": 75400 }, { "epoch": 0.7834632187367044, "grad_norm": 4.622501850128174, "learning_rate": 0.000221654715825957, "loss": 3.9099847412109376, "step": 75500 }, { "epoch": 0.7845009183641704, "grad_norm": 1.9592938423156738, "learning_rate": 0.00022155094586321042, "loss": 3.8727886962890623, "step": 75600 }, { "epoch": 0.7855386179916362, "grad_norm": 4.431970119476318, "learning_rate": 0.00022144717590046384, "loss": 3.9126931762695314, "step": 75700 }, { "epoch": 0.786576317619102, "grad_norm": 4.069213390350342, "learning_rate": 0.00022134340593771724, "loss": 3.8846563720703124, "step": 75800 }, { "epoch": 0.7876140172465678, "grad_norm": 2.009706497192383, "learning_rate": 0.00022123963597497068, "loss": 3.951784362792969, "step": 75900 }, { "epoch": 0.7886517168740337, "grad_norm": 3.475999116897583, "learning_rate": 0.00022113586601222408, "loss": 3.8493191528320314, "step": 76000 }, { "epoch": 0.7896894165014995, "grad_norm": 2.45090913772583, "learning_rate": 0.0002210320960494775, "loss": 3.938821105957031, "step": 76100 }, { "epoch": 0.7907271161289653, "grad_norm": 3.2572762966156006, "learning_rate": 0.0002209283260867309, "loss": 3.8848175048828124, "step": 76200 }, { "epoch": 0.7917648157564311, "grad_norm": 2.2695441246032715, "learning_rate": 0.00022082455612398432, "loss": 3.8166204833984376, "step": 76300 }, { "epoch": 0.792802515383897, "grad_norm": 6.520568370819092, "learning_rate": 0.00022072078616123777, "loss": 3.8947482299804688, "step": 76400 }, { "epoch": 0.7938402150113628, 
"grad_norm": 9.233070373535156, "learning_rate": 0.00022061701619849116, "loss": 3.8395782470703126, "step": 76500 }, { "epoch": 0.7948779146388286, "grad_norm": 1.5229090452194214, "learning_rate": 0.00022051324623574458, "loss": 3.979128723144531, "step": 76600 }, { "epoch": 0.7959156142662944, "grad_norm": 3.9737226963043213, "learning_rate": 0.00022040947627299798, "loss": 3.890586242675781, "step": 76700 }, { "epoch": 0.7969533138937603, "grad_norm": 1.9717073440551758, "learning_rate": 0.00022030570631025143, "loss": 3.971199951171875, "step": 76800 }, { "epoch": 0.7979910135212261, "grad_norm": 3.3416688442230225, "learning_rate": 0.00022020193634750485, "loss": 3.961914367675781, "step": 76900 }, { "epoch": 0.799028713148692, "grad_norm": 2.037693738937378, "learning_rate": 0.00022009816638475824, "loss": 3.8637881469726563, "step": 77000 }, { "epoch": 0.8000664127761579, "grad_norm": 5.026768207550049, "learning_rate": 0.00021999439642201167, "loss": 3.9692828369140627, "step": 77100 }, { "epoch": 0.8011041124036237, "grad_norm": 2.230590581893921, "learning_rate": 0.00021989062645926506, "loss": 3.852244873046875, "step": 77200 }, { "epoch": 0.8021418120310895, "grad_norm": 2.0119717121124268, "learning_rate": 0.0002197868564965185, "loss": 3.9774188232421874, "step": 77300 }, { "epoch": 0.8031795116585553, "grad_norm": 5.08432674407959, "learning_rate": 0.0002196830865337719, "loss": 3.8257907104492186, "step": 77400 }, { "epoch": 0.8042172112860212, "grad_norm": 3.0086820125579834, "learning_rate": 0.00021957931657102533, "loss": 3.865489501953125, "step": 77500 }, { "epoch": 0.805254910913487, "grad_norm": 4.534199237823486, "learning_rate": 0.00021947554660827875, "loss": 3.875529479980469, "step": 77600 }, { "epoch": 0.8062926105409528, "grad_norm": 2.68324613571167, "learning_rate": 0.00021937177664553217, "loss": 3.928450927734375, "step": 77700 }, { "epoch": 0.8073303101684186, "grad_norm": 3.7302651405334473, "learning_rate": 
0.0002192680066827856, "loss": 3.9593939208984374, "step": 77800 }, { "epoch": 0.8083680097958845, "grad_norm": 2.8160176277160645, "learning_rate": 0.000219164236720039, "loss": 4.003828735351562, "step": 77900 }, { "epoch": 0.8094057094233503, "grad_norm": 2.314183473587036, "learning_rate": 0.0002190604667572924, "loss": 3.988243408203125, "step": 78000 }, { "epoch": 0.8104434090508161, "grad_norm": 2.661289691925049, "learning_rate": 0.0002189566967945458, "loss": 3.9358248901367188, "step": 78100 }, { "epoch": 0.8114811086782819, "grad_norm": 5.065707206726074, "learning_rate": 0.00021885292683179925, "loss": 3.7886788940429685, "step": 78200 }, { "epoch": 0.8125188083057479, "grad_norm": 5.173181056976318, "learning_rate": 0.00021874915686905268, "loss": 3.790332946777344, "step": 78300 }, { "epoch": 0.8135565079332137, "grad_norm": 2.573274850845337, "learning_rate": 0.00021864538690630607, "loss": 3.975767822265625, "step": 78400 }, { "epoch": 0.8145942075606795, "grad_norm": 3.010472536087036, "learning_rate": 0.0002185416169435595, "loss": 3.861507568359375, "step": 78500 }, { "epoch": 0.8156319071881453, "grad_norm": 2.632009983062744, "learning_rate": 0.00021843784698081291, "loss": 3.9550189208984374, "step": 78600 }, { "epoch": 0.8166696068156112, "grad_norm": 5.590510368347168, "learning_rate": 0.00021833407701806634, "loss": 3.924696044921875, "step": 78700 }, { "epoch": 0.817707306443077, "grad_norm": 4.052700042724609, "learning_rate": 0.00021823030705531976, "loss": 3.831592712402344, "step": 78800 }, { "epoch": 0.8187450060705428, "grad_norm": 2.7363622188568115, "learning_rate": 0.00021812653709257315, "loss": 4.028314208984375, "step": 78900 }, { "epoch": 0.8197827056980087, "grad_norm": 4.773056507110596, "learning_rate": 0.0002180227671298266, "loss": 3.7855130004882813, "step": 79000 }, { "epoch": 0.8208204053254745, "grad_norm": 2.6858768463134766, "learning_rate": 0.00021791899716708, "loss": 3.9081768798828125, "step": 79100 }, { 
"epoch": 0.8218581049529403, "grad_norm": 4.861189842224121, "learning_rate": 0.00021781522720433342, "loss": 3.99755126953125, "step": 79200 }, { "epoch": 0.8228958045804061, "grad_norm": 2.1088833808898926, "learning_rate": 0.00021771145724158681, "loss": 3.8839871215820314, "step": 79300 }, { "epoch": 0.823933504207872, "grad_norm": 2.911973237991333, "learning_rate": 0.00021760768727884024, "loss": 3.864557189941406, "step": 79400 }, { "epoch": 0.8249712038353378, "grad_norm": 6.847414016723633, "learning_rate": 0.00021750391731609368, "loss": 3.868388366699219, "step": 79500 }, { "epoch": 0.8260089034628036, "grad_norm": 2.0376992225646973, "learning_rate": 0.00021740014735334708, "loss": 3.9390859985351563, "step": 79600 }, { "epoch": 0.8270466030902694, "grad_norm": 4.972707271575928, "learning_rate": 0.0002172963773906005, "loss": 3.8582077026367188, "step": 79700 }, { "epoch": 0.8280843027177354, "grad_norm": 7.205460071563721, "learning_rate": 0.0002171926074278539, "loss": 3.839405212402344, "step": 79800 }, { "epoch": 0.8291220023452012, "grad_norm": 12.633910179138184, "learning_rate": 0.00021708883746510735, "loss": 3.831856384277344, "step": 79900 }, { "epoch": 0.830159701972667, "grad_norm": 4.479480743408203, "learning_rate": 0.00021698506750236074, "loss": 3.795959167480469, "step": 80000 }, { "epoch": 0.8311974016001328, "grad_norm": 4.281702995300293, "learning_rate": 0.00021688129753961416, "loss": 4.039653625488281, "step": 80100 }, { "epoch": 0.8322351012275987, "grad_norm": 3.5497429370880127, "learning_rate": 0.00021677752757686758, "loss": 4.000389709472656, "step": 80200 }, { "epoch": 0.8332728008550645, "grad_norm": 2.431144952774048, "learning_rate": 0.00021667375761412098, "loss": 3.9792193603515624, "step": 80300 }, { "epoch": 0.8343105004825303, "grad_norm": 13.734992980957031, "learning_rate": 0.00021656998765137443, "loss": 3.8038821411132813, "step": 80400 }, { "epoch": 0.8353482001099961, "grad_norm": 1.6895164251327515, 
"learning_rate": 0.00021646621768862782, "loss": 3.7827383422851564, "step": 80500 }, { "epoch": 0.836385899737462, "grad_norm": 3.4907968044281006, "learning_rate": 0.00021636244772588124, "loss": 3.882090759277344, "step": 80600 }, { "epoch": 0.8374235993649278, "grad_norm": 2.345144510269165, "learning_rate": 0.0002162586777631347, "loss": 3.8400167846679687, "step": 80700 }, { "epoch": 0.8384612989923936, "grad_norm": 3.4369494915008545, "learning_rate": 0.0002161549078003881, "loss": 3.7776190185546876, "step": 80800 }, { "epoch": 0.8394989986198595, "grad_norm": 5.47845983505249, "learning_rate": 0.0002160511378376415, "loss": 4.044588623046875, "step": 80900 }, { "epoch": 0.8405366982473254, "grad_norm": 1.5931683778762817, "learning_rate": 0.0002159473678748949, "loss": 3.9998703002929688, "step": 81000 }, { "epoch": 0.8415743978747912, "grad_norm": 3.1940066814422607, "learning_rate": 0.00021584359791214833, "loss": 3.8839016723632813, "step": 81100 }, { "epoch": 0.842612097502257, "grad_norm": 9.511052131652832, "learning_rate": 0.00021573982794940172, "loss": 3.9398565673828125, "step": 81200 }, { "epoch": 0.8436497971297229, "grad_norm": 1.9886616468429565, "learning_rate": 0.00021563605798665517, "loss": 3.82979736328125, "step": 81300 }, { "epoch": 0.8446874967571887, "grad_norm": 2.362103223800659, "learning_rate": 0.0002155322880239086, "loss": 3.8248995971679687, "step": 81400 }, { "epoch": 0.8457251963846545, "grad_norm": 1.7605165243148804, "learning_rate": 0.000215428518061162, "loss": 3.7230010986328126, "step": 81500 }, { "epoch": 0.8467628960121203, "grad_norm": 1.8303929567337036, "learning_rate": 0.0002153247480984154, "loss": 3.861679992675781, "step": 81600 }, { "epoch": 0.8478005956395862, "grad_norm": 4.539703845977783, "learning_rate": 0.00021522097813566883, "loss": 3.8151321411132812, "step": 81700 }, { "epoch": 0.848838295267052, "grad_norm": 1.8927255868911743, "learning_rate": 0.00021511720817292225, "loss": 3.999220886230469, 
"step": 81800 }, { "epoch": 0.8498759948945178, "grad_norm": 3.66632080078125, "learning_rate": 0.00021501343821017565, "loss": 3.9149603271484374, "step": 81900 }, { "epoch": 0.8509136945219836, "grad_norm": 6.1261887550354, "learning_rate": 0.00021490966824742907, "loss": 3.808494873046875, "step": 82000 }, { "epoch": 0.8519513941494495, "grad_norm": 2.9073901176452637, "learning_rate": 0.00021480589828468252, "loss": 3.8501129150390625, "step": 82100 }, { "epoch": 0.8529890937769153, "grad_norm": 1.9176596403121948, "learning_rate": 0.00021470212832193591, "loss": 3.9358505249023437, "step": 82200 }, { "epoch": 0.8540267934043811, "grad_norm": 2.3072047233581543, "learning_rate": 0.00021459835835918934, "loss": 3.8934945678710937, "step": 82300 }, { "epoch": 0.855064493031847, "grad_norm": 2.7599945068359375, "learning_rate": 0.00021449458839644273, "loss": 3.929814453125, "step": 82400 }, { "epoch": 0.8561021926593129, "grad_norm": 2.0721237659454346, "learning_rate": 0.00021439081843369615, "loss": 3.86040283203125, "step": 82500 }, { "epoch": 0.8571398922867787, "grad_norm": 5.156016826629639, "learning_rate": 0.0002142870484709496, "loss": 3.8864166259765627, "step": 82600 }, { "epoch": 0.8581775919142445, "grad_norm": 4.168294906616211, "learning_rate": 0.000214183278508203, "loss": 4.001069030761719, "step": 82700 }, { "epoch": 0.8592152915417104, "grad_norm": 1.7126719951629639, "learning_rate": 0.00021407950854545642, "loss": 3.946321716308594, "step": 82800 }, { "epoch": 0.8602529911691762, "grad_norm": 5.809075355529785, "learning_rate": 0.0002139757385827098, "loss": 3.82521240234375, "step": 82900 }, { "epoch": 0.861290690796642, "grad_norm": 5.8849921226501465, "learning_rate": 0.00021387196861996326, "loss": 3.7766848754882814, "step": 83000 }, { "epoch": 0.8623283904241078, "grad_norm": 2.317793607711792, "learning_rate": 0.00021376819865721666, "loss": 4.01570068359375, "step": 83100 }, { "epoch": 0.8633660900515737, "grad_norm": 
19.14999008178711, "learning_rate": 0.00021366442869447008, "loss": 3.760934143066406, "step": 83200 }, { "epoch": 0.8644037896790395, "grad_norm": 2.025818109512329, "learning_rate": 0.0002135606587317235, "loss": 3.9255300903320314, "step": 83300 }, { "epoch": 0.8654414893065053, "grad_norm": 3.068112373352051, "learning_rate": 0.0002134568887689769, "loss": 3.821394348144531, "step": 83400 }, { "epoch": 0.8664791889339711, "grad_norm": 8.730904579162598, "learning_rate": 0.00021335311880623034, "loss": 3.8662478637695314, "step": 83500 }, { "epoch": 0.867516888561437, "grad_norm": 2.9956910610198975, "learning_rate": 0.00021324934884348374, "loss": 3.8266961669921873, "step": 83600 }, { "epoch": 0.8685545881889029, "grad_norm": 2.774705410003662, "learning_rate": 0.00021314557888073716, "loss": 3.8334832763671876, "step": 83700 }, { "epoch": 0.8695922878163687, "grad_norm": 1.9926444292068481, "learning_rate": 0.00021304180891799056, "loss": 3.973898620605469, "step": 83800 }, { "epoch": 0.8706299874438345, "grad_norm": 1.8433290719985962, "learning_rate": 0.000212938038955244, "loss": 3.8273077392578125, "step": 83900 }, { "epoch": 0.8716676870713004, "grad_norm": 5.3389410972595215, "learning_rate": 0.00021283426899249743, "loss": 3.8604061889648436, "step": 84000 }, { "epoch": 0.8727053866987662, "grad_norm": 7.391428470611572, "learning_rate": 0.00021273049902975082, "loss": 3.8056671142578127, "step": 84100 }, { "epoch": 0.873743086326232, "grad_norm": 5.367404937744141, "learning_rate": 0.00021262672906700424, "loss": 3.8744406127929687, "step": 84200 }, { "epoch": 0.8747807859536978, "grad_norm": 3.1199004650115967, "learning_rate": 0.00021252295910425764, "loss": 3.8992080688476562, "step": 84300 }, { "epoch": 0.8758184855811637, "grad_norm": 1.8603098392486572, "learning_rate": 0.0002124191891415111, "loss": 3.8311639404296876, "step": 84400 }, { "epoch": 0.8768561852086295, "grad_norm": 2.5739691257476807, "learning_rate": 0.0002123154191787645, 
"loss": 3.754921569824219, "step": 84500 }, { "epoch": 0.8778938848360953, "grad_norm": 3.090057134628296, "learning_rate": 0.0002122116492160179, "loss": 3.74908935546875, "step": 84600 }, { "epoch": 0.8789315844635612, "grad_norm": 9.258840560913086, "learning_rate": 0.00021210787925327133, "loss": 3.985562744140625, "step": 84700 }, { "epoch": 0.879969284091027, "grad_norm": 3.738255262374878, "learning_rate": 0.00021200410929052475, "loss": 3.9656732177734373, "step": 84800 }, { "epoch": 0.8810069837184928, "grad_norm": 3.415017604827881, "learning_rate": 0.00021190033932777817, "loss": 3.958587341308594, "step": 84900 }, { "epoch": 0.8820446833459586, "grad_norm": 6.633699893951416, "learning_rate": 0.00021179656936503157, "loss": 3.866285705566406, "step": 85000 }, { "epoch": 0.8830823829734246, "grad_norm": 1.7935473918914795, "learning_rate": 0.000211692799402285, "loss": 3.9740695190429687, "step": 85100 }, { "epoch": 0.8841200826008904, "grad_norm": 2.706197500228882, "learning_rate": 0.00021158902943953844, "loss": 3.8669891357421875, "step": 85200 }, { "epoch": 0.8851577822283562, "grad_norm": 4.353029727935791, "learning_rate": 0.00021148525947679183, "loss": 3.881668701171875, "step": 85300 }, { "epoch": 0.886195481855822, "grad_norm": 3.0080366134643555, "learning_rate": 0.00021138148951404525, "loss": 3.8229278564453124, "step": 85400 }, { "epoch": 0.8872331814832879, "grad_norm": 7.4073028564453125, "learning_rate": 0.00021127771955129865, "loss": 4.015174560546875, "step": 85500 }, { "epoch": 0.8882708811107537, "grad_norm": 4.174534320831299, "learning_rate": 0.00021117394958855207, "loss": 3.8184585571289062, "step": 85600 }, { "epoch": 0.8893085807382195, "grad_norm": 5.683806896209717, "learning_rate": 0.0002110701796258055, "loss": 3.8243557739257814, "step": 85700 }, { "epoch": 0.8903462803656853, "grad_norm": 2.076599597930908, "learning_rate": 0.00021096640966305891, "loss": 3.71376220703125, "step": 85800 }, { "epoch": 0.8913839799931512, 
"grad_norm": 2.4622974395751953, "learning_rate": 0.00021086263970031234, "loss": 3.85018310546875, "step": 85900 }, { "epoch": 0.892421679620617, "grad_norm": 2.3247082233428955, "learning_rate": 0.00021075886973756573, "loss": 3.9427032470703125, "step": 86000 }, { "epoch": 0.8934593792480828, "grad_norm": 5.115243911743164, "learning_rate": 0.00021065509977481918, "loss": 3.6884475708007813, "step": 86100 }, { "epoch": 0.8944970788755486, "grad_norm": 5.306711196899414, "learning_rate": 0.00021055132981207257, "loss": 3.8416738891601563, "step": 86200 }, { "epoch": 0.8955347785030146, "grad_norm": 1.5796631574630737, "learning_rate": 0.000210447559849326, "loss": 3.874592590332031, "step": 86300 }, { "epoch": 0.8965724781304804, "grad_norm": 1.6183887720108032, "learning_rate": 0.00021034378988657942, "loss": 3.840068054199219, "step": 86400 }, { "epoch": 0.8976101777579462, "grad_norm": 3.1412158012390137, "learning_rate": 0.0002102400199238328, "loss": 4.0432958984375, "step": 86500 }, { "epoch": 0.898647877385412, "grad_norm": 1.6547956466674805, "learning_rate": 0.00021013624996108626, "loss": 3.829620361328125, "step": 86600 }, { "epoch": 0.8996855770128779, "grad_norm": 9.84925365447998, "learning_rate": 0.00021003247999833966, "loss": 3.74409912109375, "step": 86700 }, { "epoch": 0.9007232766403437, "grad_norm": 4.718574523925781, "learning_rate": 0.00020992871003559308, "loss": 3.8265109252929688, "step": 86800 }, { "epoch": 0.9017609762678095, "grad_norm": 4.692354679107666, "learning_rate": 0.00020982494007284647, "loss": 3.9203875732421873, "step": 86900 }, { "epoch": 0.9027986758952754, "grad_norm": 3.620683431625366, "learning_rate": 0.00020972117011009992, "loss": 3.9122955322265627, "step": 87000 }, { "epoch": 0.9038363755227412, "grad_norm": 4.431119918823242, "learning_rate": 0.00020961740014735334, "loss": 3.9402545166015623, "step": 87100 }, { "epoch": 0.904874075150207, "grad_norm": 3.734344005584717, "learning_rate": 0.00020951363018460674, 
"loss": 3.8481884765625, "step": 87200 }, { "epoch": 0.9059117747776728, "grad_norm": 3.735985279083252, "learning_rate": 0.00020940986022186016, "loss": 3.8412353515625, "step": 87300 }, { "epoch": 0.9069494744051387, "grad_norm": 2.774721145629883, "learning_rate": 0.00020930609025911356, "loss": 3.76121337890625, "step": 87400 }, { "epoch": 0.9079871740326045, "grad_norm": 13.096595764160156, "learning_rate": 0.000209202320296367, "loss": 3.9009844970703127, "step": 87500 }, { "epoch": 0.9090248736600703, "grad_norm": 5.561835765838623, "learning_rate": 0.0002090985503336204, "loss": 3.7489013671875, "step": 87600 }, { "epoch": 0.9100625732875361, "grad_norm": 5.21470832824707, "learning_rate": 0.00020899478037087382, "loss": 3.9491476440429687, "step": 87700 }, { "epoch": 0.9111002729150021, "grad_norm": 3.611980438232422, "learning_rate": 0.00020889101040812724, "loss": 3.8744741821289064, "step": 87800 }, { "epoch": 0.9121379725424679, "grad_norm": 3.670480489730835, "learning_rate": 0.00020878724044538067, "loss": 3.8484326171875, "step": 87900 }, { "epoch": 0.9131756721699337, "grad_norm": 2.46195387840271, "learning_rate": 0.0002086834704826341, "loss": 3.8545870971679688, "step": 88000 }, { "epoch": 0.9142133717973995, "grad_norm": 2.256782054901123, "learning_rate": 0.00020857970051988748, "loss": 3.788062744140625, "step": 88100 }, { "epoch": 0.9152510714248654, "grad_norm": 1.5597251653671265, "learning_rate": 0.0002084759305571409, "loss": 3.8967153930664065, "step": 88200 }, { "epoch": 0.9162887710523312, "grad_norm": 4.607747554779053, "learning_rate": 0.00020837216059439435, "loss": 3.84433837890625, "step": 88300 }, { "epoch": 0.917326470679797, "grad_norm": 2.7213637828826904, "learning_rate": 0.00020826839063164775, "loss": 3.6432476806640626, "step": 88400 }, { "epoch": 0.9183641703072628, "grad_norm": 1.6943309307098389, "learning_rate": 0.00020816462066890117, "loss": 3.942064208984375, "step": 88500 }, { "epoch": 0.9194018699347287, 
"grad_norm": 1.9761497974395752, "learning_rate": 0.00020806085070615457, "loss": 3.757283020019531, "step": 88600 }, { "epoch": 0.9204395695621945, "grad_norm": 2.720459461212158, "learning_rate": 0.000207957080743408, "loss": 3.723210754394531, "step": 88700 }, { "epoch": 0.9214772691896603, "grad_norm": 2.986565589904785, "learning_rate": 0.0002078533107806614, "loss": 3.9739913940429688, "step": 88800 }, { "epoch": 0.9225149688171262, "grad_norm": 2.682279348373413, "learning_rate": 0.00020774954081791483, "loss": 3.706415100097656, "step": 88900 }, { "epoch": 0.923552668444592, "grad_norm": 14.281532287597656, "learning_rate": 0.00020764577085516825, "loss": 3.799072570800781, "step": 89000 }, { "epoch": 0.9245903680720579, "grad_norm": 3.1239538192749023, "learning_rate": 0.00020754200089242165, "loss": 3.8822201538085936, "step": 89100 }, { "epoch": 0.9256280676995237, "grad_norm": 7.4986252784729, "learning_rate": 0.0002074382309296751, "loss": 3.852564392089844, "step": 89200 }, { "epoch": 0.9266657673269896, "grad_norm": 4.3345441818237305, "learning_rate": 0.0002073344609669285, "loss": 3.890749206542969, "step": 89300 }, { "epoch": 0.9277034669544554, "grad_norm": 2.6886496543884277, "learning_rate": 0.0002072306910041819, "loss": 3.8261907958984374, "step": 89400 }, { "epoch": 0.9287411665819212, "grad_norm": 2.2986016273498535, "learning_rate": 0.0002071269210414353, "loss": 3.8075076293945314, "step": 89500 }, { "epoch": 0.929778866209387, "grad_norm": 11.309110641479492, "learning_rate": 0.00020702315107868873, "loss": 3.829825744628906, "step": 89600 }, { "epoch": 0.9308165658368529, "grad_norm": 2.784146308898926, "learning_rate": 0.00020691938111594218, "loss": 3.7934060668945313, "step": 89700 }, { "epoch": 0.9318542654643187, "grad_norm": 2.3935048580169678, "learning_rate": 0.00020681561115319557, "loss": 3.882371826171875, "step": 89800 }, { "epoch": 0.9328919650917845, "grad_norm": 3.6735377311706543, "learning_rate": 0.000206711841190449, 
"loss": 3.842451171875, "step": 89900 }, { "epoch": 0.9339296647192503, "grad_norm": 3.037416696548462, "learning_rate": 0.0002066080712277024, "loss": 3.9087152099609375, "step": 90000 }, { "epoch": 0.9349673643467162, "grad_norm": 9.315804481506348, "learning_rate": 0.00020650430126495584, "loss": 3.773963623046875, "step": 90100 }, { "epoch": 0.936005063974182, "grad_norm": 5.039952278137207, "learning_rate": 0.00020640053130220926, "loss": 3.7935626220703127, "step": 90200 }, { "epoch": 0.9370427636016478, "grad_norm": 5.707028388977051, "learning_rate": 0.00020629676133946266, "loss": 3.775277404785156, "step": 90300 }, { "epoch": 0.9380804632291136, "grad_norm": 3.8109843730926514, "learning_rate": 0.00020619299137671608, "loss": 3.779449462890625, "step": 90400 }, { "epoch": 0.9391181628565796, "grad_norm": 2.9235146045684814, "learning_rate": 0.00020608922141396947, "loss": 3.8383111572265625, "step": 90500 }, { "epoch": 0.9401558624840454, "grad_norm": 1.6856282949447632, "learning_rate": 0.00020598545145122292, "loss": 3.8841232299804687, "step": 90600 }, { "epoch": 0.9411935621115112, "grad_norm": 7.263090133666992, "learning_rate": 0.00020588168148847632, "loss": 3.9575741577148436, "step": 90700 }, { "epoch": 0.9422312617389771, "grad_norm": 3.6679883003234863, "learning_rate": 0.00020577791152572974, "loss": 3.81220947265625, "step": 90800 }, { "epoch": 0.9432689613664429, "grad_norm": 5.708615303039551, "learning_rate": 0.0002056741415629832, "loss": 3.807239685058594, "step": 90900 }, { "epoch": 0.9443066609939087, "grad_norm": 4.463714122772217, "learning_rate": 0.00020557037160023658, "loss": 3.841280517578125, "step": 91000 }, { "epoch": 0.9453443606213745, "grad_norm": 10.150075912475586, "learning_rate": 0.00020546660163749, "loss": 3.75313232421875, "step": 91100 }, { "epoch": 0.9463820602488404, "grad_norm": 11.987652778625488, "learning_rate": 0.0002053628316747434, "loss": 3.903273620605469, "step": 91200 }, { "epoch": 0.9474197598763062, 
"grad_norm": 4.522410869598389, "learning_rate": 0.00020525906171199682, "loss": 3.760314636230469, "step": 91300 }, { "epoch": 0.948457459503772, "grad_norm": 4.449744701385498, "learning_rate": 0.00020515529174925022, "loss": 3.685667724609375, "step": 91400 }, { "epoch": 0.9494951591312378, "grad_norm": 1.8593145608901978, "learning_rate": 0.00020505152178650367, "loss": 3.7343402099609375, "step": 91500 }, { "epoch": 0.9505328587587037, "grad_norm": 2.4731132984161377, "learning_rate": 0.0002049477518237571, "loss": 3.783785705566406, "step": 91600 }, { "epoch": 0.9515705583861696, "grad_norm": 1.820862889289856, "learning_rate": 0.00020484398186101048, "loss": 3.719476318359375, "step": 91700 }, { "epoch": 0.9526082580136354, "grad_norm": 2.214238166809082, "learning_rate": 0.0002047402118982639, "loss": 3.7817031860351564, "step": 91800 }, { "epoch": 0.9536459576411012, "grad_norm": 3.6466450691223145, "learning_rate": 0.00020463644193551733, "loss": 3.7672024536132813, "step": 91900 }, { "epoch": 0.9546836572685671, "grad_norm": 5.454410076141357, "learning_rate": 0.00020453267197277075, "loss": 3.77567626953125, "step": 92000 }, { "epoch": 0.9557213568960329, "grad_norm": 20.138710021972656, "learning_rate": 0.00020442890201002417, "loss": 3.7506854248046877, "step": 92100 }, { "epoch": 0.9567590565234987, "grad_norm": 2.0090079307556152, "learning_rate": 0.00020432513204727756, "loss": 3.8082257080078126, "step": 92200 }, { "epoch": 0.9577967561509645, "grad_norm": 2.6881604194641113, "learning_rate": 0.00020422136208453101, "loss": 4.051754150390625, "step": 92300 }, { "epoch": 0.9588344557784304, "grad_norm": 3.293210029602051, "learning_rate": 0.0002041175921217844, "loss": 3.702369384765625, "step": 92400 }, { "epoch": 0.9598721554058962, "grad_norm": 5.354658126831055, "learning_rate": 0.00020401382215903783, "loss": 3.8296829223632813, "step": 92500 }, { "epoch": 0.960909855033362, "grad_norm": 2.285318374633789, "learning_rate": 
0.00020391005219629123, "loss": 3.8205487060546877, "step": 92600 }, { "epoch": 0.9619475546608279, "grad_norm": 3.3139116764068604, "learning_rate": 0.00020380628223354465, "loss": 3.9517453002929686, "step": 92700 }, { "epoch": 0.9629852542882937, "grad_norm": 4.242766380310059, "learning_rate": 0.0002037025122707981, "loss": 3.819052429199219, "step": 92800 }, { "epoch": 0.9640229539157595, "grad_norm": 11.361218452453613, "learning_rate": 0.0002035987423080515, "loss": 3.8673443603515625, "step": 92900 }, { "epoch": 0.9650606535432253, "grad_norm": 1.6263092756271362, "learning_rate": 0.0002034949723453049, "loss": 3.6743267822265624, "step": 93000 }, { "epoch": 0.9660983531706913, "grad_norm": 3.191160202026367, "learning_rate": 0.0002033912023825583, "loss": 3.85127685546875, "step": 93100 }, { "epoch": 0.9671360527981571, "grad_norm": 14.219719886779785, "learning_rate": 0.00020328743241981176, "loss": 3.8775042724609374, "step": 93200 }, { "epoch": 0.9681737524256229, "grad_norm": 2.592212200164795, "learning_rate": 0.00020318366245706515, "loss": 3.784809265136719, "step": 93300 }, { "epoch": 0.9692114520530887, "grad_norm": 2.058199644088745, "learning_rate": 0.00020307989249431857, "loss": 3.7654934692382813, "step": 93400 }, { "epoch": 0.9702491516805546, "grad_norm": 3.3060290813446045, "learning_rate": 0.000202976122531572, "loss": 3.78427734375, "step": 93500 }, { "epoch": 0.9712868513080204, "grad_norm": 5.642673492431641, "learning_rate": 0.0002028723525688254, "loss": 3.768431396484375, "step": 93600 }, { "epoch": 0.9723245509354862, "grad_norm": 2.416527271270752, "learning_rate": 0.00020276858260607884, "loss": 3.9477734375, "step": 93700 }, { "epoch": 0.973362250562952, "grad_norm": 6.023645877838135, "learning_rate": 0.00020266481264333223, "loss": 3.8290167236328125, "step": 93800 }, { "epoch": 0.9743999501904179, "grad_norm": 3.252999782562256, "learning_rate": 0.00020256104268058566, "loss": 3.959106750488281, "step": 93900 }, { "epoch": 
0.9754376498178837, "grad_norm": 2.065927743911743, "learning_rate": 0.0002024572727178391, "loss": 3.868408508300781, "step": 94000 }, { "epoch": 0.9764753494453495, "grad_norm": 3.3688645362854004, "learning_rate": 0.0002023535027550925, "loss": 3.91245361328125, "step": 94100 }, { "epoch": 0.9775130490728153, "grad_norm": 3.004783868789673, "learning_rate": 0.00020224973279234592, "loss": 3.7105670166015625, "step": 94200 }, { "epoch": 0.9785507487002812, "grad_norm": 2.6519381999969482, "learning_rate": 0.00020214596282959932, "loss": 3.8060031127929688, "step": 94300 }, { "epoch": 0.979588448327747, "grad_norm": 2.3849129676818848, "learning_rate": 0.00020204219286685274, "loss": 3.7225299072265625, "step": 94400 }, { "epoch": 0.9806261479552129, "grad_norm": 2.5238912105560303, "learning_rate": 0.00020193842290410613, "loss": 3.6197088623046874, "step": 94500 }, { "epoch": 0.9816638475826788, "grad_norm": 7.388523101806641, "learning_rate": 0.00020183465294135958, "loss": 3.6996939086914065, "step": 94600 }, { "epoch": 0.9827015472101446, "grad_norm": 10.3375883102417, "learning_rate": 0.000201730882978613, "loss": 3.7547808837890626, "step": 94700 }, { "epoch": 0.9837392468376104, "grad_norm": 2.251610040664673, "learning_rate": 0.0002016271130158664, "loss": 3.794500732421875, "step": 94800 }, { "epoch": 0.9847769464650762, "grad_norm": 3.8766162395477295, "learning_rate": 0.00020152334305311982, "loss": 3.7538128662109376, "step": 94900 }, { "epoch": 0.9858146460925421, "grad_norm": 2.7171695232391357, "learning_rate": 0.00020141957309037324, "loss": 3.7826458740234377, "step": 95000 }, { "epoch": 0.9868523457200079, "grad_norm": 3.8345425128936768, "learning_rate": 0.00020131580312762667, "loss": 3.8197344970703124, "step": 95100 }, { "epoch": 0.9878900453474737, "grad_norm": 5.732568740844727, "learning_rate": 0.00020121203316488006, "loss": 3.84238525390625, "step": 95200 }, { "epoch": 0.9889277449749395, "grad_norm": 2.933835744857788, "learning_rate": 
0.00020110826320213348, "loss": 3.8682632446289062, "step": 95300 }, { "epoch": 0.9899654446024054, "grad_norm": 6.234426021575928, "learning_rate": 0.00020100449323938693, "loss": 3.7140426635742188, "step": 95400 }, { "epoch": 0.9910031442298712, "grad_norm": 3.3652026653289795, "learning_rate": 0.00020090072327664033, "loss": 3.7597830200195315, "step": 95500 }, { "epoch": 0.992040843857337, "grad_norm": 3.030595541000366, "learning_rate": 0.00020079695331389375, "loss": 3.824953308105469, "step": 95600 }, { "epoch": 0.9930785434848028, "grad_norm": 2.6781022548675537, "learning_rate": 0.00020069318335114714, "loss": 3.71589599609375, "step": 95700 }, { "epoch": 0.9941162431122688, "grad_norm": 6.144374370574951, "learning_rate": 0.00020058941338840056, "loss": 3.856881408691406, "step": 95800 }, { "epoch": 0.9951539427397346, "grad_norm": 11.093416213989258, "learning_rate": 0.000200485643425654, "loss": 3.8529815673828125, "step": 95900 }, { "epoch": 0.9961916423672004, "grad_norm": 3.1640384197235107, "learning_rate": 0.0002003818734629074, "loss": 3.966211853027344, "step": 96000 }, { "epoch": 0.9972293419946662, "grad_norm": 4.370779037475586, "learning_rate": 0.00020027810350016083, "loss": 3.7798886108398437, "step": 96100 }, { "epoch": 0.9982670416221321, "grad_norm": 3.453723669052124, "learning_rate": 0.00020017433353741422, "loss": 3.8633013916015626, "step": 96200 }, { "epoch": 0.9993047412495979, "grad_norm": 2.1785902976989746, "learning_rate": 0.00020007056357466767, "loss": 3.7897879028320314, "step": 96300 }, { "epoch": 1.0003424408770638, "grad_norm": 7.7243971824646, "learning_rate": 0.00019996679361192107, "loss": 3.999345397949219, "step": 96400 }, { "epoch": 1.0013801405045295, "grad_norm": 4.7181925773620605, "learning_rate": 0.0001998630236491745, "loss": 3.6450360107421873, "step": 96500 }, { "epoch": 1.0024178401319954, "grad_norm": 5.74350643157959, "learning_rate": 0.0001997592536864279, "loss": 3.742356872558594, "step": 96600 }, { 
"epoch": 1.0034555397594613, "grad_norm": 4.781228065490723, "learning_rate": 0.0001996554837236813, "loss": 3.88675048828125, "step": 96700 }, { "epoch": 1.004493239386927, "grad_norm": 3.398968458175659, "learning_rate": 0.00019955171376093476, "loss": 3.604486083984375, "step": 96800 }, { "epoch": 1.005530939014393, "grad_norm": 2.33478045463562, "learning_rate": 0.00019944794379818815, "loss": 3.6777334594726563, "step": 96900 }, { "epoch": 1.0065686386418586, "grad_norm": 5.443575382232666, "learning_rate": 0.00019934417383544157, "loss": 3.71547119140625, "step": 97000 }, { "epoch": 1.0076063382693246, "grad_norm": 9.512263298034668, "learning_rate": 0.00019924040387269497, "loss": 3.7301199340820315, "step": 97100 }, { "epoch": 1.0086440378967905, "grad_norm": 7.4802985191345215, "learning_rate": 0.00019913663390994842, "loss": 3.924736328125, "step": 97200 }, { "epoch": 1.0096817375242562, "grad_norm": 3.0878612995147705, "learning_rate": 0.00019903286394720184, "loss": 3.802860107421875, "step": 97300 }, { "epoch": 1.010719437151722, "grad_norm": 3.557770252227783, "learning_rate": 0.00019892909398445523, "loss": 3.782970275878906, "step": 97400 }, { "epoch": 1.011757136779188, "grad_norm": 4.309437274932861, "learning_rate": 0.00019882532402170866, "loss": 3.7818194580078126, "step": 97500 }, { "epoch": 1.0127948364066537, "grad_norm": 9.057745933532715, "learning_rate": 0.00019872155405896205, "loss": 3.807467041015625, "step": 97600 }, { "epoch": 1.0138325360341196, "grad_norm": 3.3481385707855225, "learning_rate": 0.0001986177840962155, "loss": 3.7055014038085936, "step": 97700 }, { "epoch": 1.0148702356615853, "grad_norm": 5.001105308532715, "learning_rate": 0.00019851401413346892, "loss": 3.803979797363281, "step": 97800 }, { "epoch": 1.0159079352890512, "grad_norm": 2.7995588779449463, "learning_rate": 0.00019841024417072232, "loss": 3.784454650878906, "step": 97900 }, { "epoch": 1.0169456349165171, "grad_norm": 2.4021806716918945, "learning_rate": 
0.00019830647420797574, "loss": 3.8534210205078123, "step": 98000 }, { "epoch": 1.0179833345439828, "grad_norm": 2.6125597953796387, "learning_rate": 0.00019820270424522916, "loss": 3.6783572387695314, "step": 98100 }, { "epoch": 1.0190210341714487, "grad_norm": 12.870917320251465, "learning_rate": 0.00019809893428248258, "loss": 3.833390808105469, "step": 98200 }, { "epoch": 1.0200587337989147, "grad_norm": 5.185585021972656, "learning_rate": 0.00019799516431973598, "loss": 3.7223880004882814, "step": 98300 }, { "epoch": 1.0210964334263803, "grad_norm": 1.9634087085723877, "learning_rate": 0.0001978913943569894, "loss": 3.6614044189453123, "step": 98400 }, { "epoch": 1.0221341330538463, "grad_norm": 5.82041072845459, "learning_rate": 0.00019778762439424285, "loss": 3.729730224609375, "step": 98500 }, { "epoch": 1.0231718326813122, "grad_norm": 5.905141353607178, "learning_rate": 0.00019768385443149624, "loss": 3.8260488891601563, "step": 98600 }, { "epoch": 1.0242095323087779, "grad_norm": 3.5444912910461426, "learning_rate": 0.00019758008446874966, "loss": 3.687132568359375, "step": 98700 }, { "epoch": 1.0252472319362438, "grad_norm": 7.397883892059326, "learning_rate": 0.00019747631450600306, "loss": 3.815035400390625, "step": 98800 }, { "epoch": 1.0262849315637095, "grad_norm": 4.467862129211426, "learning_rate": 0.00019737254454325648, "loss": 3.645810241699219, "step": 98900 }, { "epoch": 1.0273226311911754, "grad_norm": 7.824927806854248, "learning_rate": 0.0001972687745805099, "loss": 3.7502801513671873, "step": 99000 }, { "epoch": 1.0283603308186413, "grad_norm": 9.055319786071777, "learning_rate": 0.00019716500461776333, "loss": 3.895949401855469, "step": 99100 }, { "epoch": 1.029398030446107, "grad_norm": 2.499072313308716, "learning_rate": 0.00019706123465501675, "loss": 3.729786071777344, "step": 99200 }, { "epoch": 1.030435730073573, "grad_norm": 2.091538667678833, "learning_rate": 0.00019695746469227014, "loss": 3.6661376953125, "step": 99300 }, { 
"epoch": 1.0314734297010388, "grad_norm": 2.9895308017730713, "learning_rate": 0.0001968536947295236, "loss": 3.7620065307617185, "step": 99400 }, { "epoch": 1.0325111293285045, "grad_norm": 3.8646888732910156, "learning_rate": 0.00019674992476677699, "loss": 3.8454522705078125, "step": 99500 }, { "epoch": 1.0335488289559704, "grad_norm": 4.3288044929504395, "learning_rate": 0.0001966461548040304, "loss": 3.682370300292969, "step": 99600 }, { "epoch": 1.0345865285834361, "grad_norm": 1.888063907623291, "learning_rate": 0.00019654238484128383, "loss": 3.7136306762695312, "step": 99700 }, { "epoch": 1.035624228210902, "grad_norm": 2.9146947860717773, "learning_rate": 0.00019643861487853722, "loss": 3.7029214477539063, "step": 99800 }, { "epoch": 1.036661927838368, "grad_norm": 3.3660199642181396, "learning_rate": 0.00019633484491579067, "loss": 3.669721984863281, "step": 99900 }, { "epoch": 1.0376996274658337, "grad_norm": 3.8642494678497314, "learning_rate": 0.00019623107495304407, "loss": 3.718172302246094, "step": 100000 }, { "epoch": 1.0387373270932996, "grad_norm": 19.524248123168945, "learning_rate": 0.0001961273049902975, "loss": 3.8097552490234374, "step": 100100 }, { "epoch": 1.0397750267207655, "grad_norm": 2.175708293914795, "learning_rate": 0.00019602353502755089, "loss": 3.7663388061523437, "step": 100200 }, { "epoch": 1.0408127263482312, "grad_norm": 2.0963635444641113, "learning_rate": 0.00019591976506480433, "loss": 3.7331805419921875, "step": 100300 }, { "epoch": 1.041850425975697, "grad_norm": 4.1156134605407715, "learning_rate": 0.00019581599510205776, "loss": 3.7513092041015623, "step": 100400 }, { "epoch": 1.042888125603163, "grad_norm": 1.9364126920700073, "learning_rate": 0.00019571222513931115, "loss": 3.7811895751953126, "step": 100500 }, { "epoch": 1.0439258252306287, "grad_norm": 3.9929726123809814, "learning_rate": 0.00019560845517656457, "loss": 3.6916510009765626, "step": 100600 }, { "epoch": 1.0449635248580946, "grad_norm": 
6.161198139190674, "learning_rate": 0.00019550468521381797, "loss": 3.735494384765625, "step": 100700 }, { "epoch": 1.0460012244855603, "grad_norm": 5.300504207611084, "learning_rate": 0.00019540091525107142, "loss": 3.6318603515625, "step": 100800 }, { "epoch": 1.0470389241130262, "grad_norm": 6.671936988830566, "learning_rate": 0.0001952971452883248, "loss": 3.753620300292969, "step": 100900 }, { "epoch": 1.0480766237404922, "grad_norm": 4.034755229949951, "learning_rate": 0.00019519337532557823, "loss": 3.6916033935546877, "step": 101000 }, { "epoch": 1.0491143233679578, "grad_norm": 2.8349599838256836, "learning_rate": 0.00019508960536283168, "loss": 3.6846957397460938, "step": 101100 }, { "epoch": 1.0501520229954238, "grad_norm": 4.222849369049072, "learning_rate": 0.00019498583540008508, "loss": 3.785768737792969, "step": 101200 }, { "epoch": 1.0511897226228897, "grad_norm": 7.210328102111816, "learning_rate": 0.0001948820654373385, "loss": 3.674949035644531, "step": 101300 }, { "epoch": 1.0522274222503554, "grad_norm": 4.031270503997803, "learning_rate": 0.0001947782954745919, "loss": 3.7858917236328127, "step": 101400 }, { "epoch": 1.0532651218778213, "grad_norm": 28.53989601135254, "learning_rate": 0.00019467452551184532, "loss": 3.8007437133789064, "step": 101500 }, { "epoch": 1.054302821505287, "grad_norm": 5.528784275054932, "learning_rate": 0.00019457075554909877, "loss": 3.624027099609375, "step": 101600 }, { "epoch": 1.055340521132753, "grad_norm": 3.1289713382720947, "learning_rate": 0.00019446698558635216, "loss": 3.7536968994140625, "step": 101700 }, { "epoch": 1.0563782207602188, "grad_norm": 2.9442858695983887, "learning_rate": 0.00019436321562360558, "loss": 3.569986572265625, "step": 101800 }, { "epoch": 1.0574159203876845, "grad_norm": 4.8674726486206055, "learning_rate": 0.00019425944566085898, "loss": 3.8215240478515624, "step": 101900 }, { "epoch": 1.0584536200151504, "grad_norm": 13.513835906982422, "learning_rate": 0.0001941556756981124, 
"loss": 3.6686697387695313, "step": 102000 }, { "epoch": 1.0594913196426163, "grad_norm": 3.146784543991089, "learning_rate": 0.00019405190573536582, "loss": 3.643824462890625, "step": 102100 }, { "epoch": 1.060529019270082, "grad_norm": 4.964068412780762, "learning_rate": 0.00019394813577261924, "loss": 3.748782043457031, "step": 102200 }, { "epoch": 1.061566718897548, "grad_norm": 3.178044557571411, "learning_rate": 0.00019384436580987266, "loss": 3.7086587524414063, "step": 102300 }, { "epoch": 1.0626044185250136, "grad_norm": 2.6959052085876465, "learning_rate": 0.00019374059584712606, "loss": 3.8190512084960937, "step": 102400 }, { "epoch": 1.0636421181524796, "grad_norm": 4.595401763916016, "learning_rate": 0.0001936368258843795, "loss": 3.6920120239257814, "step": 102500 }, { "epoch": 1.0646798177799455, "grad_norm": 3.383439064025879, "learning_rate": 0.0001935330559216329, "loss": 3.7616091918945314, "step": 102600 }, { "epoch": 1.0657175174074112, "grad_norm": 6.921218395233154, "learning_rate": 0.00019342928595888633, "loss": 3.8070159912109376, "step": 102700 }, { "epoch": 1.066755217034877, "grad_norm": 3.7757728099823, "learning_rate": 0.00019332551599613975, "loss": 3.64797119140625, "step": 102800 }, { "epoch": 1.067792916662343, "grad_norm": 5.452692985534668, "learning_rate": 0.00019322174603339314, "loss": 3.7128118896484374, "step": 102900 }, { "epoch": 1.0688306162898087, "grad_norm": 2.324277639389038, "learning_rate": 0.0001931179760706466, "loss": 3.5481451416015624, "step": 103000 }, { "epoch": 1.0698683159172746, "grad_norm": 2.998181104660034, "learning_rate": 0.00019301420610789999, "loss": 3.6443612670898435, "step": 103100 }, { "epoch": 1.0709060155447405, "grad_norm": 5.453862190246582, "learning_rate": 0.0001929104361451534, "loss": 3.7542648315429688, "step": 103200 }, { "epoch": 1.0719437151722062, "grad_norm": 7.444779396057129, "learning_rate": 0.0001928066661824068, "loss": 3.696410827636719, "step": 103300 }, { "epoch": 
1.0729814147996721, "grad_norm": 4.7863569259643555, "learning_rate": 0.00019270289621966025, "loss": 3.6802603149414064, "step": 103400 }, { "epoch": 1.0740191144271378, "grad_norm": 2.9291558265686035, "learning_rate": 0.00019259912625691367, "loss": 3.7929959106445312, "step": 103500 }, { "epoch": 1.0750568140546037, "grad_norm": 3.2032582759857178, "learning_rate": 0.00019249535629416707, "loss": 3.6861895751953124, "step": 103600 }, { "epoch": 1.0760945136820697, "grad_norm": 3.1435580253601074, "learning_rate": 0.0001923915863314205, "loss": 3.799478759765625, "step": 103700 }, { "epoch": 1.0771322133095353, "grad_norm": 2.8310792446136475, "learning_rate": 0.00019228781636867388, "loss": 3.73474365234375, "step": 103800 }, { "epoch": 1.0781699129370013, "grad_norm": 2.285276174545288, "learning_rate": 0.00019218404640592733, "loss": 3.6168304443359376, "step": 103900 }, { "epoch": 1.0792076125644672, "grad_norm": 5.524131774902344, "learning_rate": 0.00019208027644318073, "loss": 3.710784912109375, "step": 104000 }, { "epoch": 1.0802453121919329, "grad_norm": 3.545400619506836, "learning_rate": 0.00019197650648043415, "loss": 3.6640530395507813, "step": 104100 }, { "epoch": 1.0812830118193988, "grad_norm": 3.101451873779297, "learning_rate": 0.0001918727365176876, "loss": 3.7735882568359376, "step": 104200 }, { "epoch": 1.0823207114468647, "grad_norm": 2.4820311069488525, "learning_rate": 0.000191768966554941, "loss": 3.6366726684570314, "step": 104300 }, { "epoch": 1.0833584110743304, "grad_norm": 26.539804458618164, "learning_rate": 0.00019166519659219442, "loss": 3.7211334228515627, "step": 104400 }, { "epoch": 1.0843961107017963, "grad_norm": 3.41780161857605, "learning_rate": 0.0001915614266294478, "loss": 3.60020263671875, "step": 104500 }, { "epoch": 1.085433810329262, "grad_norm": 2.689753293991089, "learning_rate": 0.00019145765666670123, "loss": 3.7544232177734376, "step": 104600 }, { "epoch": 1.086471509956728, "grad_norm": 2.2958478927612305, 
"learning_rate": 0.00019135388670395468, "loss": 3.849725646972656, "step": 104700 }, { "epoch": 1.0875092095841938, "grad_norm": 3.697185754776001, "learning_rate": 0.00019125011674120808, "loss": 3.813602294921875, "step": 104800 }, { "epoch": 1.0885469092116595, "grad_norm": 2.1992783546447754, "learning_rate": 0.0001911463467784615, "loss": 3.6952606201171876, "step": 104900 }, { "epoch": 1.0895846088391254, "grad_norm": 2.1027495861053467, "learning_rate": 0.0001910425768157149, "loss": 3.6720751953125, "step": 105000 }, { "epoch": 1.0906223084665914, "grad_norm": 2.2862184047698975, "learning_rate": 0.00019093880685296832, "loss": 3.729759521484375, "step": 105100 }, { "epoch": 1.091660008094057, "grad_norm": 2.060633659362793, "learning_rate": 0.00019083503689022174, "loss": 3.7085842895507812, "step": 105200 }, { "epoch": 1.092697707721523, "grad_norm": 2.636503219604492, "learning_rate": 0.00019073126692747516, "loss": 3.6184716796875, "step": 105300 }, { "epoch": 1.0937354073489887, "grad_norm": 7.98659086227417, "learning_rate": 0.00019062749696472858, "loss": 3.875008544921875, "step": 105400 }, { "epoch": 1.0947731069764546, "grad_norm": 3.7854599952697754, "learning_rate": 0.00019052372700198198, "loss": 3.8590658569335936, "step": 105500 }, { "epoch": 1.0958108066039205, "grad_norm": 9.304828643798828, "learning_rate": 0.00019041995703923543, "loss": 3.7910305786132814, "step": 105600 }, { "epoch": 1.0968485062313862, "grad_norm": 6.323867321014404, "learning_rate": 0.00019031618707648882, "loss": 3.763433532714844, "step": 105700 }, { "epoch": 1.097886205858852, "grad_norm": 5.698137283325195, "learning_rate": 0.00019021241711374224, "loss": 3.6159381103515624, "step": 105800 }, { "epoch": 1.098923905486318, "grad_norm": 80.88331604003906, "learning_rate": 0.00019010864715099564, "loss": 3.738255920410156, "step": 105900 }, { "epoch": 1.0999616051137837, "grad_norm": 4.7448577880859375, "learning_rate": 0.00019000487718824906, "loss": 
3.6675250244140627, "step": 106000 }, { "epoch": 1.1009993047412496, "grad_norm": 5.72471809387207, "learning_rate": 0.0001899011072255025, "loss": 3.7835205078125, "step": 106100 }, { "epoch": 1.1020370043687153, "grad_norm": 3.3427250385284424, "learning_rate": 0.0001897973372627559, "loss": 3.6577874755859376, "step": 106200 }, { "epoch": 1.1030747039961812, "grad_norm": 15.587642669677734, "learning_rate": 0.00018969356730000932, "loss": 3.716649169921875, "step": 106300 }, { "epoch": 1.1041124036236472, "grad_norm": 4.485306262969971, "learning_rate": 0.00018958979733726272, "loss": 3.8367926025390626, "step": 106400 }, { "epoch": 1.1051501032511128, "grad_norm": 2.82476806640625, "learning_rate": 0.00018948602737451617, "loss": 3.7493435668945314, "step": 106500 }, { "epoch": 1.1061878028785788, "grad_norm": 15.561006546020508, "learning_rate": 0.0001893822574117696, "loss": 3.826619873046875, "step": 106600 }, { "epoch": 1.1072255025060447, "grad_norm": 2.592461109161377, "learning_rate": 0.00018927848744902299, "loss": 3.7684344482421874, "step": 106700 }, { "epoch": 1.1082632021335104, "grad_norm": 7.259844779968262, "learning_rate": 0.0001891747174862764, "loss": 3.758468017578125, "step": 106800 }, { "epoch": 1.1093009017609763, "grad_norm": 5.973848342895508, "learning_rate": 0.0001890709475235298, "loss": 3.638338317871094, "step": 106900 }, { "epoch": 1.1103386013884422, "grad_norm": 4.451427459716797, "learning_rate": 0.00018896717756078325, "loss": 3.788179626464844, "step": 107000 }, { "epoch": 1.111376301015908, "grad_norm": 4.0467143058776855, "learning_rate": 0.00018886340759803665, "loss": 3.7329791259765623, "step": 107100 }, { "epoch": 1.1124140006433738, "grad_norm": 5.440663814544678, "learning_rate": 0.00018875963763529007, "loss": 3.9233663940429686, "step": 107200 }, { "epoch": 1.1134517002708395, "grad_norm": 2.327005386352539, "learning_rate": 0.00018865586767254352, "loss": 3.688836975097656, "step": 107300 }, { "epoch": 
1.1144893998983054, "grad_norm": 2.948439598083496, "learning_rate": 0.0001885520977097969, "loss": 3.623143310546875, "step": 107400 }, { "epoch": 1.1155270995257713, "grad_norm": 8.996918678283691, "learning_rate": 0.00018844832774705033, "loss": 3.6873675537109376, "step": 107500 }, { "epoch": 1.116564799153237, "grad_norm": 13.88825798034668, "learning_rate": 0.00018834455778430373, "loss": 3.889109802246094, "step": 107600 }, { "epoch": 1.117602498780703, "grad_norm": 4.712568283081055, "learning_rate": 0.00018824078782155715, "loss": 3.8336361694335936, "step": 107700 }, { "epoch": 1.1186401984081689, "grad_norm": 9.021018028259277, "learning_rate": 0.00018813701785881055, "loss": 3.818023681640625, "step": 107800 }, { "epoch": 1.1196778980356346, "grad_norm": 4.5635294914245605, "learning_rate": 0.000188033247896064, "loss": 3.8210824584960936, "step": 107900 }, { "epoch": 1.1207155976631005, "grad_norm": 6.118738651275635, "learning_rate": 0.00018792947793331742, "loss": 3.762948303222656, "step": 108000 }, { "epoch": 1.1217532972905664, "grad_norm": 6.2977824211120605, "learning_rate": 0.0001878257079705708, "loss": 3.7840338134765625, "step": 108100 }, { "epoch": 1.122790996918032, "grad_norm": 5.161929607391357, "learning_rate": 0.00018772193800782423, "loss": 3.8385690307617186, "step": 108200 }, { "epoch": 1.123828696545498, "grad_norm": 19.5078067779541, "learning_rate": 0.00018761816804507765, "loss": 3.708250732421875, "step": 108300 }, { "epoch": 1.1248663961729637, "grad_norm": 6.583184242248535, "learning_rate": 0.00018751439808233108, "loss": 3.6731692504882814, "step": 108400 }, { "epoch": 1.1259040958004296, "grad_norm": 2.8113479614257812, "learning_rate": 0.0001874106281195845, "loss": 3.776397705078125, "step": 108500 }, { "epoch": 1.1269417954278955, "grad_norm": 3.526796340942383, "learning_rate": 0.0001873068581568379, "loss": 3.713113098144531, "step": 108600 }, { "epoch": 1.1279794950553612, "grad_norm": 4.96720027923584, 
"learning_rate": 0.00018720308819409134, "loss": 3.758629150390625, "step": 108700 }, { "epoch": 1.1290171946828271, "grad_norm": 2.3801918029785156, "learning_rate": 0.00018709931823134474, "loss": 3.931161193847656, "step": 108800 }, { "epoch": 1.1300548943102928, "grad_norm": 5.336031913757324, "learning_rate": 0.00018699554826859816, "loss": 3.7431265258789064, "step": 108900 }, { "epoch": 1.1310925939377587, "grad_norm": 3.3115835189819336, "learning_rate": 0.00018689177830585155, "loss": 3.6016845703125, "step": 109000 }, { "epoch": 1.1321302935652247, "grad_norm": 3.2625627517700195, "learning_rate": 0.00018678800834310498, "loss": 3.8173687744140623, "step": 109100 }, { "epoch": 1.1331679931926903, "grad_norm": 3.4688777923583984, "learning_rate": 0.00018668423838035843, "loss": 3.7339138793945312, "step": 109200 }, { "epoch": 1.1342056928201563, "grad_norm": 5.170476913452148, "learning_rate": 0.00018658046841761182, "loss": 3.8035733032226564, "step": 109300 }, { "epoch": 1.1352433924476222, "grad_norm": 6.003453731536865, "learning_rate": 0.00018647669845486524, "loss": 3.7767242431640624, "step": 109400 }, { "epoch": 1.1362810920750879, "grad_norm": 3.4862396717071533, "learning_rate": 0.00018637292849211864, "loss": 3.643880615234375, "step": 109500 }, { "epoch": 1.1373187917025538, "grad_norm": 5.885380268096924, "learning_rate": 0.00018626915852937209, "loss": 3.7285040283203124, "step": 109600 }, { "epoch": 1.1383564913300197, "grad_norm": 2.839015245437622, "learning_rate": 0.00018616538856662548, "loss": 3.7614910888671873, "step": 109700 }, { "epoch": 1.1393941909574854, "grad_norm": 10.154685020446777, "learning_rate": 0.0001860616186038789, "loss": 3.635873107910156, "step": 109800 }, { "epoch": 1.1404318905849513, "grad_norm": 11.110898971557617, "learning_rate": 0.00018595784864113232, "loss": 3.690367431640625, "step": 109900 }, { "epoch": 1.141469590212417, "grad_norm": 2.4880504608154297, "learning_rate": 0.00018585407867838572, "loss": 
3.69529541015625, "step": 110000 }, { "epoch": 1.142507289839883, "grad_norm": 12.104265213012695, "learning_rate": 0.00018575030871563917, "loss": 3.81604736328125, "step": 110100 }, { "epoch": 1.1435449894673488, "grad_norm": 4.529385089874268, "learning_rate": 0.00018564653875289256, "loss": 3.847531433105469, "step": 110200 }, { "epoch": 1.1445826890948145, "grad_norm": 4.51477575302124, "learning_rate": 0.00018554276879014598, "loss": 3.6786367797851565, "step": 110300 }, { "epoch": 1.1456203887222804, "grad_norm": 3.946871757507324, "learning_rate": 0.00018543899882739943, "loss": 3.7411343383789064, "step": 110400 }, { "epoch": 1.1466580883497464, "grad_norm": 24.773929595947266, "learning_rate": 0.00018533522886465283, "loss": 3.6971206665039062, "step": 110500 }, { "epoch": 1.147695787977212, "grad_norm": 4.848511695861816, "learning_rate": 0.00018523145890190625, "loss": 3.6610791015625, "step": 110600 }, { "epoch": 1.148733487604678, "grad_norm": 3.155839681625366, "learning_rate": 0.00018512768893915965, "loss": 3.6824301147460936, "step": 110700 }, { "epoch": 1.1497711872321439, "grad_norm": 3.4173624515533447, "learning_rate": 0.00018502391897641307, "loss": 3.729654541015625, "step": 110800 }, { "epoch": 1.1508088868596096, "grad_norm": 3.1743650436401367, "learning_rate": 0.00018492014901366646, "loss": 3.752574157714844, "step": 110900 }, { "epoch": 1.1518465864870755, "grad_norm": 5.655935287475586, "learning_rate": 0.0001848163790509199, "loss": 3.6886166381835936, "step": 111000 }, { "epoch": 1.1528842861145412, "grad_norm": 2.8840067386627197, "learning_rate": 0.00018471260908817333, "loss": 3.8322817993164064, "step": 111100 }, { "epoch": 1.153921985742007, "grad_norm": 4.1215057373046875, "learning_rate": 0.00018460883912542673, "loss": 3.634107971191406, "step": 111200 }, { "epoch": 1.154959685369473, "grad_norm": 8.988388061523438, "learning_rate": 0.00018450506916268018, "loss": 3.813481750488281, "step": 111300 }, { "epoch": 
1.1559973849969387, "grad_norm": 4.154327869415283, "learning_rate": 0.00018440129919993357, "loss": 3.792846374511719, "step": 111400 }, { "epoch": 1.1570350846244046, "grad_norm": 5.43167781829834, "learning_rate": 0.000184297529237187, "loss": 3.695276794433594, "step": 111500 }, { "epoch": 1.1580727842518705, "grad_norm": 2.1235880851745605, "learning_rate": 0.0001841937592744404, "loss": 3.7109506225585935, "step": 111600 }, { "epoch": 1.1591104838793362, "grad_norm": 3.2670278549194336, "learning_rate": 0.0001840899893116938, "loss": 3.779457702636719, "step": 111700 }, { "epoch": 1.1601481835068022, "grad_norm": 4.596736431121826, "learning_rate": 0.00018398621934894726, "loss": 3.690837097167969, "step": 111800 }, { "epoch": 1.161185883134268, "grad_norm": 5.063496112823486, "learning_rate": 0.00018388244938620065, "loss": 3.7899896240234376, "step": 111900 }, { "epoch": 1.1622235827617338, "grad_norm": 3.2700915336608887, "learning_rate": 0.00018377867942345408, "loss": 3.7538375854492188, "step": 112000 }, { "epoch": 1.1632612823891997, "grad_norm": 2.544558048248291, "learning_rate": 0.00018367490946070747, "loss": 3.7601394653320312, "step": 112100 }, { "epoch": 1.1642989820166654, "grad_norm": 6.950151443481445, "learning_rate": 0.0001835711394979609, "loss": 3.797687683105469, "step": 112200 }, { "epoch": 1.1653366816441313, "grad_norm": 2.161999464035034, "learning_rate": 0.00018346736953521434, "loss": 3.7408486938476564, "step": 112300 }, { "epoch": 1.1663743812715972, "grad_norm": 2.824725866317749, "learning_rate": 0.00018336359957246774, "loss": 3.708443298339844, "step": 112400 }, { "epoch": 1.167412080899063, "grad_norm": 11.807979583740234, "learning_rate": 0.00018325982960972116, "loss": 3.6650485229492187, "step": 112500 }, { "epoch": 1.1684497805265288, "grad_norm": 12.751113891601562, "learning_rate": 0.00018315605964697455, "loss": 3.5273590087890625, "step": 112600 }, { "epoch": 1.1694874801539945, "grad_norm": 3.0161349773406982, 
"learning_rate": 0.000183052289684228, "loss": 3.8431436157226564, "step": 112700 }, { "epoch": 1.1705251797814604, "grad_norm": 8.852095603942871, "learning_rate": 0.0001829485197214814, "loss": 3.6667901611328126, "step": 112800 }, { "epoch": 1.1715628794089263, "grad_norm": 16.80730438232422, "learning_rate": 0.00018284474975873482, "loss": 3.7361489868164064, "step": 112900 }, { "epoch": 1.172600579036392, "grad_norm": 4.340658187866211, "learning_rate": 0.00018274097979598824, "loss": 3.7126028442382815, "step": 113000 }, { "epoch": 1.173638278663858, "grad_norm": 2.2295515537261963, "learning_rate": 0.00018263720983324164, "loss": 3.6620779418945313, "step": 113100 }, { "epoch": 1.1746759782913239, "grad_norm": 3.5379912853240967, "learning_rate": 0.00018253343987049509, "loss": 3.60224609375, "step": 113200 }, { "epoch": 1.1757136779187896, "grad_norm": 3.174776315689087, "learning_rate": 0.00018242966990774848, "loss": 3.7180682373046876, "step": 113300 }, { "epoch": 1.1767513775462555, "grad_norm": 4.343127250671387, "learning_rate": 0.0001823258999450019, "loss": 3.7377755737304685, "step": 113400 }, { "epoch": 1.1777890771737214, "grad_norm": 21.170530319213867, "learning_rate": 0.0001822221299822553, "loss": 3.752294921875, "step": 113500 }, { "epoch": 1.178826776801187, "grad_norm": 4.612101078033447, "learning_rate": 0.00018211836001950875, "loss": 3.8363751220703124, "step": 113600 }, { "epoch": 1.179864476428653, "grad_norm": 6.276144981384277, "learning_rate": 0.00018201459005676217, "loss": 3.713616943359375, "step": 113700 }, { "epoch": 1.1809021760561187, "grad_norm": 10.716604232788086, "learning_rate": 0.00018191082009401556, "loss": 3.629880676269531, "step": 113800 }, { "epoch": 1.1819398756835846, "grad_norm": 2.2933573722839355, "learning_rate": 0.00018180705013126898, "loss": 3.8490249633789064, "step": 113900 }, { "epoch": 1.1829775753110505, "grad_norm": 4.147966384887695, "learning_rate": 0.00018170328016852238, "loss": 
3.5557064819335937, "step": 114000 }, { "epoch": 1.1840152749385162, "grad_norm": 3.122669219970703, "learning_rate": 0.00018159951020577583, "loss": 3.73438232421875, "step": 114100 }, { "epoch": 1.1850529745659821, "grad_norm": 9.210347175598145, "learning_rate": 0.00018149574024302925, "loss": 3.6972500610351564, "step": 114200 }, { "epoch": 1.186090674193448, "grad_norm": 17.161890029907227, "learning_rate": 0.00018139197028028265, "loss": 3.819235534667969, "step": 114300 }, { "epoch": 1.1871283738209137, "grad_norm": 5.225100040435791, "learning_rate": 0.0001812882003175361, "loss": 3.7081121826171874, "step": 114400 }, { "epoch": 1.1881660734483797, "grad_norm": 8.891063690185547, "learning_rate": 0.0001811844303547895, "loss": 3.7459030151367188, "step": 114500 }, { "epoch": 1.1892037730758456, "grad_norm": 3.465555429458618, "learning_rate": 0.0001810806603920429, "loss": 3.7495687866210936, "step": 114600 }, { "epoch": 1.1902414727033113, "grad_norm": 2.962984561920166, "learning_rate": 0.0001809768904292963, "loss": 3.620650329589844, "step": 114700 }, { "epoch": 1.1912791723307772, "grad_norm": 66.27200317382812, "learning_rate": 0.00018087312046654973, "loss": 3.8266671752929686, "step": 114800 }, { "epoch": 1.1923168719582429, "grad_norm": 10.21193790435791, "learning_rate": 0.00018076935050380318, "loss": 3.7377734375, "step": 114900 }, { "epoch": 1.1933545715857088, "grad_norm": 4.959332466125488, "learning_rate": 0.00018066558054105657, "loss": 3.767408752441406, "step": 115000 }, { "epoch": 1.1943922712131747, "grad_norm": 4.304464817047119, "learning_rate": 0.00018056181057831, "loss": 3.793067626953125, "step": 115100 }, { "epoch": 1.1954299708406404, "grad_norm": 4.872037887573242, "learning_rate": 0.0001804580406155634, "loss": 3.754971923828125, "step": 115200 }, { "epoch": 1.1964676704681063, "grad_norm": 5.543403625488281, "learning_rate": 0.0001803542706528168, "loss": 3.6738140869140623, "step": 115300 }, { "epoch": 1.1975053700955722, 
"grad_norm": 4.535797595977783, "learning_rate": 0.00018025050069007023, "loss": 3.6706658935546876, "step": 115400 }, { "epoch": 1.198543069723038, "grad_norm": 3.987654209136963, "learning_rate": 0.00018014673072732365, "loss": 3.7104837036132814, "step": 115500 }, { "epoch": 1.1995807693505038, "grad_norm": 4.604912757873535, "learning_rate": 0.00018004296076457708, "loss": 3.7295111083984374, "step": 115600 }, { "epoch": 1.2006184689779698, "grad_norm": 7.51154088973999, "learning_rate": 0.00017993919080183047, "loss": 3.882249755859375, "step": 115700 }, { "epoch": 1.2016561686054354, "grad_norm": 7.570425987243652, "learning_rate": 0.00017983542083908392, "loss": 3.7709280395507814, "step": 115800 }, { "epoch": 1.2026938682329014, "grad_norm": 7.528663635253906, "learning_rate": 0.00017973165087633731, "loss": 3.744920654296875, "step": 115900 }, { "epoch": 1.203731567860367, "grad_norm": 4.613593578338623, "learning_rate": 0.00017962788091359074, "loss": 3.81932373046875, "step": 116000 }, { "epoch": 1.204769267487833, "grad_norm": 4.6101508140563965, "learning_rate": 0.00017952411095084416, "loss": 3.701668701171875, "step": 116100 }, { "epoch": 1.2058069671152989, "grad_norm": 3.3336641788482666, "learning_rate": 0.00017942034098809755, "loss": 3.5936102294921874, "step": 116200 }, { "epoch": 1.2068446667427646, "grad_norm": 8.796258926391602, "learning_rate": 0.000179316571025351, "loss": 3.6812298583984373, "step": 116300 }, { "epoch": 1.2078823663702305, "grad_norm": 2.9002747535705566, "learning_rate": 0.0001792128010626044, "loss": 3.79119873046875, "step": 116400 }, { "epoch": 1.2089200659976962, "grad_norm": 3.5677108764648438, "learning_rate": 0.00017910903109985782, "loss": 3.868831787109375, "step": 116500 }, { "epoch": 1.209957765625162, "grad_norm": 10.07345199584961, "learning_rate": 0.00017900526113711121, "loss": 3.8205535888671873, "step": 116600 }, { "epoch": 1.210995465252628, "grad_norm": 2.9789609909057617, "learning_rate": 
0.00017890149117436466, "loss": 3.655535888671875, "step": 116700 }, { "epoch": 1.2120331648800937, "grad_norm": 7.362621784210205, "learning_rate": 0.00017879772121161808, "loss": 3.5663858032226563, "step": 116800 }, { "epoch": 1.2130708645075596, "grad_norm": 3.515774726867676, "learning_rate": 0.00017869395124887148, "loss": 3.64054443359375, "step": 116900 }, { "epoch": 1.2141085641350255, "grad_norm": 2.5356316566467285, "learning_rate": 0.0001785901812861249, "loss": 3.621481628417969, "step": 117000 }, { "epoch": 1.2151462637624912, "grad_norm": 4.910796642303467, "learning_rate": 0.0001784864113233783, "loss": 3.6991619873046875, "step": 117100 }, { "epoch": 1.2161839633899572, "grad_norm": 4.202451705932617, "learning_rate": 0.00017838264136063175, "loss": 3.8038519287109374, "step": 117200 }, { "epoch": 1.217221663017423, "grad_norm": 4.467262268066406, "learning_rate": 0.00017827887139788514, "loss": 3.771558837890625, "step": 117300 }, { "epoch": 1.2182593626448888, "grad_norm": 3.9160234928131104, "learning_rate": 0.00017817510143513856, "loss": 3.7639215087890623, "step": 117400 }, { "epoch": 1.2192970622723547, "grad_norm": 4.396745681762695, "learning_rate": 0.000178071331472392, "loss": 3.68260498046875, "step": 117500 }, { "epoch": 1.2203347618998204, "grad_norm": 3.5205559730529785, "learning_rate": 0.0001779675615096454, "loss": 3.6396359252929686, "step": 117600 }, { "epoch": 1.2213724615272863, "grad_norm": 3.1027088165283203, "learning_rate": 0.00017786379154689883, "loss": 3.5732858276367185, "step": 117700 }, { "epoch": 1.2224101611547522, "grad_norm": 2.6304574012756348, "learning_rate": 0.00017776002158415222, "loss": 3.508619384765625, "step": 117800 }, { "epoch": 1.223447860782218, "grad_norm": 2.9613137245178223, "learning_rate": 0.00017765625162140564, "loss": 3.65043212890625, "step": 117900 }, { "epoch": 1.2244855604096838, "grad_norm": 3.6579976081848145, "learning_rate": 0.0001775524816586591, "loss": 3.805189514160156, "step": 
118000 }, { "epoch": 1.2255232600371497, "grad_norm": 2.3908674716949463, "learning_rate": 0.0001774487116959125, "loss": 3.608123474121094, "step": 118100 }, { "epoch": 1.2265609596646154, "grad_norm": 3.335692882537842, "learning_rate": 0.0001773449417331659, "loss": 3.707095947265625, "step": 118200 }, { "epoch": 1.2275986592920813, "grad_norm": 5.722865581512451, "learning_rate": 0.0001772411717704193, "loss": 3.7158029174804685, "step": 118300 }, { "epoch": 1.2286363589195473, "grad_norm": 9.1022310256958, "learning_rate": 0.00017713740180767273, "loss": 3.7301669311523438, "step": 118400 }, { "epoch": 1.229674058547013, "grad_norm": 5.698774814605713, "learning_rate": 0.00017703363184492615, "loss": 3.638455810546875, "step": 118500 }, { "epoch": 1.2307117581744789, "grad_norm": 2.373983144760132, "learning_rate": 0.00017692986188217957, "loss": 3.6596408081054688, "step": 118600 }, { "epoch": 1.2317494578019446, "grad_norm": 8.193933486938477, "learning_rate": 0.000176826091919433, "loss": 3.670250244140625, "step": 118700 }, { "epoch": 1.2327871574294105, "grad_norm": 4.394575119018555, "learning_rate": 0.0001767223219566864, "loss": 3.7637249755859377, "step": 118800 }, { "epoch": 1.2338248570568764, "grad_norm": 8.713273048400879, "learning_rate": 0.00017661855199393984, "loss": 3.7907025146484377, "step": 118900 }, { "epoch": 1.234862556684342, "grad_norm": 2.0170185565948486, "learning_rate": 0.00017651478203119323, "loss": 3.638475036621094, "step": 119000 }, { "epoch": 1.235900256311808, "grad_norm": 14.477542877197266, "learning_rate": 0.00017641101206844665, "loss": 3.6606521606445312, "step": 119100 }, { "epoch": 1.236937955939274, "grad_norm": 3.3395235538482666, "learning_rate": 0.00017630724210570005, "loss": 3.5342837524414064, "step": 119200 }, { "epoch": 1.2379756555667396, "grad_norm": 3.269758701324463, "learning_rate": 0.00017620347214295347, "loss": 3.5976416015625, "step": 119300 }, { "epoch": 1.2390133551942055, "grad_norm": 
7.099674224853516, "learning_rate": 0.00017609970218020692, "loss": 3.599384460449219, "step": 119400 }, { "epoch": 1.2400510548216714, "grad_norm": 2.358044385910034, "learning_rate": 0.00017599593221746031, "loss": 3.4857781982421874, "step": 119500 }, { "epoch": 1.2410887544491371, "grad_norm": 5.485024929046631, "learning_rate": 0.00017589216225471374, "loss": 3.69429931640625, "step": 119600 }, { "epoch": 1.242126454076603, "grad_norm": 5.038040637969971, "learning_rate": 0.00017578839229196713, "loss": 3.599921875, "step": 119700 }, { "epoch": 1.2431641537040687, "grad_norm": 6.716040134429932, "learning_rate": 0.00017568462232922058, "loss": 3.555647888183594, "step": 119800 }, { "epoch": 1.2442018533315347, "grad_norm": 9.499709129333496, "learning_rate": 0.000175580852366474, "loss": 3.740644836425781, "step": 119900 }, { "epoch": 1.2452395529590006, "grad_norm": 2.5602540969848633, "learning_rate": 0.0001754770824037274, "loss": 3.7783831787109374, "step": 120000 }, { "epoch": 1.2462772525864663, "grad_norm": 5.06706428527832, "learning_rate": 0.00017537331244098082, "loss": 3.7457623291015625, "step": 120100 }, { "epoch": 1.2473149522139322, "grad_norm": 4.963079452514648, "learning_rate": 0.00017526954247823421, "loss": 3.726761474609375, "step": 120200 }, { "epoch": 1.2483526518413979, "grad_norm": 4.604287624359131, "learning_rate": 0.00017516577251548766, "loss": 3.8796881103515624, "step": 120300 }, { "epoch": 1.2493903514688638, "grad_norm": 7.884790897369385, "learning_rate": 0.00017506200255274106, "loss": 3.7173165893554687, "step": 120400 }, { "epoch": 1.2504280510963297, "grad_norm": 7.230984687805176, "learning_rate": 0.00017495823258999448, "loss": 3.7296737670898437, "step": 120500 }, { "epoch": 1.2514657507237956, "grad_norm": 4.4041032791137695, "learning_rate": 0.00017485446262724793, "loss": 3.695928039550781, "step": 120600 }, { "epoch": 1.2525034503512613, "grad_norm": 4.800326347351074, "learning_rate": 0.00017475069266450132, 
"loss": 3.692496032714844, "step": 120700 }, { "epoch": 1.2535411499787272, "grad_norm": 4.20355224609375, "learning_rate": 0.00017464692270175475, "loss": 3.724625549316406, "step": 120800 }, { "epoch": 1.254578849606193, "grad_norm": 8.89311408996582, "learning_rate": 0.00017454315273900814, "loss": 3.6060061645507813, "step": 120900 }, { "epoch": 1.2556165492336588, "grad_norm": 3.7018239498138428, "learning_rate": 0.00017443938277626156, "loss": 3.7614715576171873, "step": 121000 }, { "epoch": 1.2566542488611248, "grad_norm": 3.2457141876220703, "learning_rate": 0.00017433561281351496, "loss": 3.729616394042969, "step": 121100 }, { "epoch": 1.2576919484885904, "grad_norm": 9.342671394348145, "learning_rate": 0.0001742318428507684, "loss": 3.717445068359375, "step": 121200 }, { "epoch": 1.2587296481160564, "grad_norm": 3.293091058731079, "learning_rate": 0.00017412807288802183, "loss": 3.7832305908203123, "step": 121300 }, { "epoch": 1.259767347743522, "grad_norm": 4.222780704498291, "learning_rate": 0.00017402430292527522, "loss": 3.7384588623046877, "step": 121400 }, { "epoch": 1.260805047370988, "grad_norm": 3.0761492252349854, "learning_rate": 0.00017392053296252867, "loss": 3.7555526733398437, "step": 121500 }, { "epoch": 1.261842746998454, "grad_norm": 2.887803554534912, "learning_rate": 0.00017381676299978207, "loss": 3.695442810058594, "step": 121600 }, { "epoch": 1.2628804466259196, "grad_norm": 3.7166850566864014, "learning_rate": 0.0001737129930370355, "loss": 3.815606689453125, "step": 121700 }, { "epoch": 1.2639181462533855, "grad_norm": 12.183484077453613, "learning_rate": 0.0001736092230742889, "loss": 3.637664794921875, "step": 121800 }, { "epoch": 1.2649558458808512, "grad_norm": 3.1364870071411133, "learning_rate": 0.0001735054531115423, "loss": 3.6319699096679687, "step": 121900 }, { "epoch": 1.265993545508317, "grad_norm": 4.354419708251953, "learning_rate": 0.00017340168314879575, "loss": 3.786130065917969, "step": 122000 }, { "epoch": 
1.267031245135783, "grad_norm": 4.645047664642334, "learning_rate": 0.00017329791318604915, "loss": 3.7552008056640624, "step": 122100 }, { "epoch": 1.268068944763249, "grad_norm": 4.269083499908447, "learning_rate": 0.00017319414322330257, "loss": 3.7506790161132812, "step": 122200 }, { "epoch": 1.2691066443907146, "grad_norm": 5.066195011138916, "learning_rate": 0.00017309037326055597, "loss": 3.788629455566406, "step": 122300 }, { "epoch": 1.2701443440181805, "grad_norm": 5.5616021156311035, "learning_rate": 0.0001729866032978094, "loss": 3.6688613891601562, "step": 122400 }, { "epoch": 1.2711820436456462, "grad_norm": 3.1797661781311035, "learning_rate": 0.00017288283333506284, "loss": 3.718145751953125, "step": 122500 }, { "epoch": 1.2722197432731122, "grad_norm": 3.063791275024414, "learning_rate": 0.00017277906337231623, "loss": 3.66003662109375, "step": 122600 }, { "epoch": 1.273257442900578, "grad_norm": 24.703685760498047, "learning_rate": 0.00017267529340956965, "loss": 3.697345886230469, "step": 122700 }, { "epoch": 1.2742951425280438, "grad_norm": 4.573358058929443, "learning_rate": 0.00017257152344682305, "loss": 3.770580139160156, "step": 122800 }, { "epoch": 1.2753328421555097, "grad_norm": 6.073929309844971, "learning_rate": 0.0001724677534840765, "loss": 3.570367736816406, "step": 122900 }, { "epoch": 1.2763705417829754, "grad_norm": 4.804381847381592, "learning_rate": 0.0001723639835213299, "loss": 3.7930453491210936, "step": 123000 }, { "epoch": 1.2774082414104413, "grad_norm": 7.542964935302734, "learning_rate": 0.00017226021355858331, "loss": 3.6680117797851564, "step": 123100 }, { "epoch": 1.2784459410379072, "grad_norm": 7.110779285430908, "learning_rate": 0.00017215644359583674, "loss": 3.645113830566406, "step": 123200 }, { "epoch": 1.2794836406653731, "grad_norm": 5.410161018371582, "learning_rate": 0.00017205267363309013, "loss": 3.7428775024414063, "step": 123300 }, { "epoch": 1.2805213402928388, "grad_norm": 4.089752197265625, 
"learning_rate": 0.00017194890367034358, "loss": 3.7075924682617187, "step": 123400 }, { "epoch": 1.2815590399203047, "grad_norm": 5.877744197845459, "learning_rate": 0.00017184513370759697, "loss": 3.546766662597656, "step": 123500 }, { "epoch": 1.2825967395477704, "grad_norm": 4.295921802520752, "learning_rate": 0.0001717413637448504, "loss": 3.5129269409179686, "step": 123600 }, { "epoch": 1.2836344391752363, "grad_norm": 7.998104572296143, "learning_rate": 0.00017163759378210385, "loss": 3.6661138916015625, "step": 123700 }, { "epoch": 1.2846721388027023, "grad_norm": 4.939531326293945, "learning_rate": 0.00017153382381935724, "loss": 3.665038757324219, "step": 123800 }, { "epoch": 1.285709838430168, "grad_norm": 6.5936384201049805, "learning_rate": 0.00017143005385661066, "loss": 3.6241445922851563, "step": 123900 }, { "epoch": 1.2867475380576339, "grad_norm": 4.765341281890869, "learning_rate": 0.00017132628389386406, "loss": 3.651435546875, "step": 124000 }, { "epoch": 1.2877852376850996, "grad_norm": 5.4220147132873535, "learning_rate": 0.00017122251393111748, "loss": 3.8530377197265624, "step": 124100 }, { "epoch": 1.2888229373125655, "grad_norm": 5.066165447235107, "learning_rate": 0.00017111874396837087, "loss": 3.6765261840820314, "step": 124200 }, { "epoch": 1.2898606369400314, "grad_norm": 2.871612787246704, "learning_rate": 0.00017101497400562432, "loss": 3.7530276489257814, "step": 124300 }, { "epoch": 1.2908983365674973, "grad_norm": 3.5445234775543213, "learning_rate": 0.00017091120404287774, "loss": 3.65380126953125, "step": 124400 }, { "epoch": 1.291936036194963, "grad_norm": 12.712068557739258, "learning_rate": 0.00017080743408013114, "loss": 3.651844787597656, "step": 124500 }, { "epoch": 1.292973735822429, "grad_norm": 5.535710334777832, "learning_rate": 0.0001707036641173846, "loss": 3.648440246582031, "step": 124600 }, { "epoch": 1.2940114354498946, "grad_norm": 6.527225017547607, "learning_rate": 0.00017059989415463798, "loss": 
3.6168035888671874, "step": 124700 }, { "epoch": 1.2950491350773605, "grad_norm": 3.675743579864502, "learning_rate": 0.0001704961241918914, "loss": 3.689391784667969, "step": 124800 }, { "epoch": 1.2960868347048264, "grad_norm": 7.041729927062988, "learning_rate": 0.0001703923542291448, "loss": 3.6547369384765624, "step": 124900 }, { "epoch": 1.2971245343322921, "grad_norm": 2.5913071632385254, "learning_rate": 0.00017028858426639822, "loss": 3.803846740722656, "step": 125000 }, { "epoch": 1.298162233959758, "grad_norm": 5.099416732788086, "learning_rate": 0.00017018481430365167, "loss": 3.661207580566406, "step": 125100 }, { "epoch": 1.2991999335872237, "grad_norm": 3.8206946849823, "learning_rate": 0.00017008104434090507, "loss": 3.552643127441406, "step": 125200 }, { "epoch": 1.3002376332146897, "grad_norm": 3.769073247909546, "learning_rate": 0.0001699772743781585, "loss": 3.842325439453125, "step": 125300 }, { "epoch": 1.3012753328421556, "grad_norm": 2.529937744140625, "learning_rate": 0.00016987350441541188, "loss": 3.676832275390625, "step": 125400 }, { "epoch": 1.3023130324696213, "grad_norm": 7.345049858093262, "learning_rate": 0.0001697697344526653, "loss": 3.6286630249023437, "step": 125500 }, { "epoch": 1.3033507320970872, "grad_norm": 7.380908012390137, "learning_rate": 0.00016966596448991875, "loss": 3.6627023315429685, "step": 125600 }, { "epoch": 1.3043884317245529, "grad_norm": 2.8857064247131348, "learning_rate": 0.00016956219452717215, "loss": 3.641376953125, "step": 125700 }, { "epoch": 1.3054261313520188, "grad_norm": 6.945189476013184, "learning_rate": 0.00016945842456442557, "loss": 3.606731262207031, "step": 125800 }, { "epoch": 1.3064638309794847, "grad_norm": 6.422026634216309, "learning_rate": 0.00016935465460167897, "loss": 3.5785845947265624, "step": 125900 }, { "epoch": 1.3075015306069506, "grad_norm": 8.35920524597168, "learning_rate": 0.00016925088463893241, "loss": 3.6259381103515627, "step": 126000 }, { "epoch": 
1.3085392302344163, "grad_norm": 8.193489074707031, "learning_rate": 0.0001691471146761858, "loss": 3.7568353271484374, "step": 126100 }, { "epoch": 1.3095769298618822, "grad_norm": 5.267637252807617, "learning_rate": 0.00016904334471343923, "loss": 3.757891845703125, "step": 126200 }, { "epoch": 1.310614629489348, "grad_norm": 3.3981618881225586, "learning_rate": 0.00016893957475069265, "loss": 3.6808877563476563, "step": 126300 }, { "epoch": 1.3116523291168138, "grad_norm": 11.042278289794922, "learning_rate": 0.00016883580478794605, "loss": 3.5690008544921876, "step": 126400 }, { "epoch": 1.3126900287442798, "grad_norm": 12.522445678710938, "learning_rate": 0.0001687320348251995, "loss": 3.675894775390625, "step": 126500 }, { "epoch": 1.3137277283717455, "grad_norm": 4.374575138092041, "learning_rate": 0.0001686282648624529, "loss": 3.8043743896484377, "step": 126600 }, { "epoch": 1.3147654279992114, "grad_norm": 2.7740325927734375, "learning_rate": 0.00016852449489970631, "loss": 3.7183938598632813, "step": 126700 }, { "epoch": 1.315803127626677, "grad_norm": 16.38130760192871, "learning_rate": 0.0001684207249369597, "loss": 3.7160101318359375, "step": 126800 }, { "epoch": 1.316840827254143, "grad_norm": 9.450004577636719, "learning_rate": 0.00016831695497421316, "loss": 3.6377835083007812, "step": 126900 }, { "epoch": 1.317878526881609, "grad_norm": 8.669651985168457, "learning_rate": 0.00016821318501146658, "loss": 3.5026895141601564, "step": 127000 }, { "epoch": 1.3189162265090748, "grad_norm": 4.877604007720947, "learning_rate": 0.00016810941504871997, "loss": 3.6808175659179687, "step": 127100 }, { "epoch": 1.3199539261365405, "grad_norm": 9.553235054016113, "learning_rate": 0.0001680056450859734, "loss": 3.706498718261719, "step": 127200 }, { "epoch": 1.3209916257640064, "grad_norm": 4.275841236114502, "learning_rate": 0.0001679018751232268, "loss": 3.752271728515625, "step": 127300 }, { "epoch": 1.322029325391472, "grad_norm": 7.115382671356201, 
"learning_rate": 0.00016779810516048024, "loss": 3.721490783691406, "step": 127400 }, { "epoch": 1.323067025018938, "grad_norm": 3.066580057144165, "learning_rate": 0.00016769433519773366, "loss": 3.67330322265625, "step": 127500 }, { "epoch": 1.324104724646404, "grad_norm": 3.145909547805786, "learning_rate": 0.00016759056523498706, "loss": 3.7071697998046873, "step": 127600 }, { "epoch": 1.3251424242738696, "grad_norm": 3.342615842819214, "learning_rate": 0.0001674867952722405, "loss": 3.68224853515625, "step": 127700 }, { "epoch": 1.3261801239013356, "grad_norm": 4.780127048492432, "learning_rate": 0.0001673830253094939, "loss": 3.914273986816406, "step": 127800 }, { "epoch": 1.3272178235288012, "grad_norm": 8.07118034362793, "learning_rate": 0.00016727925534674732, "loss": 3.6639437866210938, "step": 127900 }, { "epoch": 1.3282555231562672, "grad_norm": 6.763175964355469, "learning_rate": 0.00016717548538400072, "loss": 3.62579345703125, "step": 128000 }, { "epoch": 1.329293222783733, "grad_norm": 12.123154640197754, "learning_rate": 0.00016707171542125414, "loss": 3.721268615722656, "step": 128100 }, { "epoch": 1.330330922411199, "grad_norm": 3.787297010421753, "learning_rate": 0.0001669679454585076, "loss": 3.7412783813476564, "step": 128200 }, { "epoch": 1.3313686220386647, "grad_norm": 2.629784107208252, "learning_rate": 0.00016686417549576098, "loss": 3.7266500854492186, "step": 128300 }, { "epoch": 1.3324063216661306, "grad_norm": 2.8463058471679688, "learning_rate": 0.0001667604055330144, "loss": 3.56947021484375, "step": 128400 }, { "epoch": 1.3334440212935963, "grad_norm": 3.5442264080047607, "learning_rate": 0.0001666566355702678, "loss": 3.6988034057617187, "step": 128500 }, { "epoch": 1.3344817209210622, "grad_norm": 3.726022243499756, "learning_rate": 0.00016655286560752122, "loss": 3.6229156494140624, "step": 128600 }, { "epoch": 1.3355194205485281, "grad_norm": 5.090481758117676, "learning_rate": 0.00016644909564477464, "loss": 
3.5555209350585937, "step": 128700 }, { "epoch": 1.3365571201759938, "grad_norm": 5.148849964141846, "learning_rate": 0.00016634532568202807, "loss": 3.723890380859375, "step": 128800 }, { "epoch": 1.3375948198034597, "grad_norm": 7.033978462219238, "learning_rate": 0.0001662415557192815, "loss": 3.6295504760742188, "step": 128900 }, { "epoch": 1.3386325194309254, "grad_norm": 5.022918701171875, "learning_rate": 0.00016613778575653488, "loss": 3.604397888183594, "step": 129000 }, { "epoch": 1.3396702190583913, "grad_norm": 3.9396724700927734, "learning_rate": 0.00016603401579378833, "loss": 3.740953369140625, "step": 129100 }, { "epoch": 1.3407079186858573, "grad_norm": 4.96920919418335, "learning_rate": 0.00016593024583104173, "loss": 3.6454959106445313, "step": 129200 }, { "epoch": 1.341745618313323, "grad_norm": 3.2997357845306396, "learning_rate": 0.00016582647586829515, "loss": 3.64101806640625, "step": 129300 }, { "epoch": 1.3427833179407889, "grad_norm": 12.793081283569336, "learning_rate": 0.00016572270590554857, "loss": 3.537852478027344, "step": 129400 }, { "epoch": 1.3438210175682546, "grad_norm": 7.696393013000488, "learning_rate": 0.00016561893594280197, "loss": 3.6636843872070313, "step": 129500 }, { "epoch": 1.3448587171957205, "grad_norm": 4.841111183166504, "learning_rate": 0.00016551516598005541, "loss": 3.6766192626953127, "step": 129600 }, { "epoch": 1.3458964168231864, "grad_norm": 2.822445869445801, "learning_rate": 0.0001654113960173088, "loss": 3.5910659790039063, "step": 129700 }, { "epoch": 1.3469341164506523, "grad_norm": 7.020183086395264, "learning_rate": 0.00016530762605456223, "loss": 3.6770706176757812, "step": 129800 }, { "epoch": 1.347971816078118, "grad_norm": 3.323997974395752, "learning_rate": 0.00016520385609181563, "loss": 3.673494567871094, "step": 129900 }, { "epoch": 1.349009515705584, "grad_norm": 12.734125137329102, "learning_rate": 0.00016510008612906907, "loss": 3.645369873046875, "step": 130000 }, { "epoch": 
1.3500472153330496, "grad_norm": 6.959007740020752, "learning_rate": 0.0001649963161663225, "loss": 3.5545895385742186, "step": 130100 }, { "epoch": 1.3510849149605155, "grad_norm": 5.492075443267822, "learning_rate": 0.0001648925462035759, "loss": 3.73270263671875, "step": 130200 }, { "epoch": 1.3521226145879814, "grad_norm": 5.578936576843262, "learning_rate": 0.0001647887762408293, "loss": 3.633159484863281, "step": 130300 }, { "epoch": 1.3531603142154471, "grad_norm": 4.073727607727051, "learning_rate": 0.0001646850062780827, "loss": 3.7094195556640623, "step": 130400 }, { "epoch": 1.354198013842913, "grad_norm": 3.7967214584350586, "learning_rate": 0.00016458123631533616, "loss": 3.6143753051757814, "step": 130500 }, { "epoch": 1.3552357134703787, "grad_norm": 5.993916034698486, "learning_rate": 0.00016447746635258955, "loss": 3.722456359863281, "step": 130600 }, { "epoch": 1.3562734130978447, "grad_norm": 4.235459327697754, "learning_rate": 0.00016437369638984297, "loss": 3.7401913452148436, "step": 130700 }, { "epoch": 1.3573111127253106, "grad_norm": 13.88862133026123, "learning_rate": 0.00016426992642709642, "loss": 3.746804504394531, "step": 130800 }, { "epoch": 1.3583488123527765, "grad_norm": 5.165769100189209, "learning_rate": 0.00016416615646434982, "loss": 3.74326416015625, "step": 130900 }, { "epoch": 1.3593865119802422, "grad_norm": 3.6813595294952393, "learning_rate": 0.00016406238650160324, "loss": 3.617030029296875, "step": 131000 }, { "epoch": 1.360424211607708, "grad_norm": 5.9350152015686035, "learning_rate": 0.00016395861653885663, "loss": 3.873332214355469, "step": 131100 }, { "epoch": 1.3614619112351738, "grad_norm": 4.220798969268799, "learning_rate": 0.00016385484657611006, "loss": 3.6584405517578125, "step": 131200 }, { "epoch": 1.3624996108626397, "grad_norm": 21.21164894104004, "learning_rate": 0.0001637510766133635, "loss": 3.617677917480469, "step": 131300 }, { "epoch": 1.3635373104901056, "grad_norm": 5.271477699279785, 
"learning_rate": 0.0001636473066506169, "loss": 3.5792852783203126, "step": 131400 }, { "epoch": 1.3645750101175713, "grad_norm": 4.747986316680908, "learning_rate": 0.00016354353668787032, "loss": 3.6235577392578127, "step": 131500 }, { "epoch": 1.3656127097450372, "grad_norm": 3.8399877548217773, "learning_rate": 0.00016343976672512372, "loss": 3.780206604003906, "step": 131600 }, { "epoch": 1.366650409372503, "grad_norm": 7.428284645080566, "learning_rate": 0.00016333599676237714, "loss": 3.600271911621094, "step": 131700 }, { "epoch": 1.3676881089999688, "grad_norm": 4.4645304679870605, "learning_rate": 0.00016323222679963056, "loss": 3.6348703002929685, "step": 131800 }, { "epoch": 1.3687258086274348, "grad_norm": 4.429653167724609, "learning_rate": 0.00016312845683688398, "loss": 3.704706726074219, "step": 131900 }, { "epoch": 1.3697635082549007, "grad_norm": 4.308233737945557, "learning_rate": 0.0001630246868741374, "loss": 3.704057312011719, "step": 132000 }, { "epoch": 1.3708012078823664, "grad_norm": 12.334646224975586, "learning_rate": 0.0001629209169113908, "loss": 3.6710003662109374, "step": 132100 }, { "epoch": 1.3718389075098323, "grad_norm": 5.286363124847412, "learning_rate": 0.00016281714694864425, "loss": 3.6472879028320313, "step": 132200 }, { "epoch": 1.372876607137298, "grad_norm": 3.0022027492523193, "learning_rate": 0.00016271337698589764, "loss": 3.867461853027344, "step": 132300 }, { "epoch": 1.373914306764764, "grad_norm": 3.6052401065826416, "learning_rate": 0.00016260960702315107, "loss": 3.465709533691406, "step": 132400 }, { "epoch": 1.3749520063922298, "grad_norm": 4.250115871429443, "learning_rate": 0.00016250583706040446, "loss": 3.6189974975585937, "step": 132500 }, { "epoch": 1.3759897060196955, "grad_norm": 4.520415306091309, "learning_rate": 0.00016240206709765788, "loss": 3.697256774902344, "step": 132600 }, { "epoch": 1.3770274056471614, "grad_norm": 3.608278751373291, "learning_rate": 0.00016229829713491133, "loss": 
3.6748687744140627, "step": 132700 }, { "epoch": 1.3780651052746271, "grad_norm": 3.6304538249969482, "learning_rate": 0.00016219452717216473, "loss": 3.6889605712890625, "step": 132800 }, { "epoch": 1.379102804902093, "grad_norm": 4.484381675720215, "learning_rate": 0.00016209075720941815, "loss": 3.667810974121094, "step": 132900 }, { "epoch": 1.380140504529559, "grad_norm": 12.79962158203125, "learning_rate": 0.00016198698724667154, "loss": 3.901937255859375, "step": 133000 }, { "epoch": 1.3811782041570246, "grad_norm": 3.6465935707092285, "learning_rate": 0.000161883217283925, "loss": 3.6334658813476564, "step": 133100 }, { "epoch": 1.3822159037844906, "grad_norm": 2.5269343852996826, "learning_rate": 0.00016177944732117841, "loss": 3.6968539428710936, "step": 133200 }, { "epoch": 1.3832536034119562, "grad_norm": 4.01210880279541, "learning_rate": 0.0001616756773584318, "loss": 3.4310296630859374, "step": 133300 }, { "epoch": 1.3842913030394222, "grad_norm": 4.493933200836182, "learning_rate": 0.00016157190739568523, "loss": 3.719140930175781, "step": 133400 }, { "epoch": 1.385329002666888, "grad_norm": 3.25607967376709, "learning_rate": 0.00016146813743293863, "loss": 3.6992584228515626, "step": 133500 }, { "epoch": 1.386366702294354, "grad_norm": 6.134942054748535, "learning_rate": 0.00016136436747019207, "loss": 3.748294677734375, "step": 133600 }, { "epoch": 1.3874044019218197, "grad_norm": 3.706012725830078, "learning_rate": 0.00016126059750744547, "loss": 3.586408996582031, "step": 133700 }, { "epoch": 1.3884421015492856, "grad_norm": 5.05728816986084, "learning_rate": 0.0001611568275446989, "loss": 3.7400482177734373, "step": 133800 }, { "epoch": 1.3894798011767513, "grad_norm": 4.292380332946777, "learning_rate": 0.00016105305758195234, "loss": 3.7132363891601563, "step": 133900 }, { "epoch": 1.3905175008042172, "grad_norm": 9.770214080810547, "learning_rate": 0.00016094928761920573, "loss": 3.5888162231445313, "step": 134000 }, { "epoch": 
1.3915552004316831, "grad_norm": 9.073437690734863, "learning_rate": 0.00016084551765645916, "loss": 3.6239898681640623, "step": 134100 }, { "epoch": 1.3925929000591488, "grad_norm": 5.210220813751221, "learning_rate": 0.00016074174769371255, "loss": 3.4854669189453125, "step": 134200 }, { "epoch": 1.3936305996866147, "grad_norm": 5.995209693908691, "learning_rate": 0.00016063797773096597, "loss": 3.6248184204101563, "step": 134300 }, { "epoch": 1.3946682993140804, "grad_norm": 8.040777206420898, "learning_rate": 0.00016053420776821937, "loss": 3.767200622558594, "step": 134400 }, { "epoch": 1.3957059989415463, "grad_norm": 6.153497695922852, "learning_rate": 0.00016043043780547282, "loss": 3.6283489990234377, "step": 134500 }, { "epoch": 1.3967436985690123, "grad_norm": 3.4162278175354004, "learning_rate": 0.00016032666784272624, "loss": 3.6065017700195314, "step": 134600 }, { "epoch": 1.3977813981964782, "grad_norm": 3.4524638652801514, "learning_rate": 0.00016022289787997963, "loss": 3.6301129150390623, "step": 134700 }, { "epoch": 1.3988190978239439, "grad_norm": 6.9367804527282715, "learning_rate": 0.00016011912791723308, "loss": 3.6796551513671876, "step": 134800 }, { "epoch": 1.3998567974514098, "grad_norm": 3.629422903060913, "learning_rate": 0.00016001535795448648, "loss": 3.745485534667969, "step": 134900 }, { "epoch": 1.4008944970788755, "grad_norm": 3.658010959625244, "learning_rate": 0.0001599115879917399, "loss": 3.6311688232421875, "step": 135000 }, { "epoch": 1.4019321967063414, "grad_norm": 16.63618278503418, "learning_rate": 0.00015980781802899332, "loss": 3.6807235717773437, "step": 135100 }, { "epoch": 1.4029698963338073, "grad_norm": 6.354872703552246, "learning_rate": 0.00015970404806624672, "loss": 3.5296261596679686, "step": 135200 }, { "epoch": 1.404007595961273, "grad_norm": 7.496634483337402, "learning_rate": 0.00015960027810350017, "loss": 3.5905780029296874, "step": 135300 }, { "epoch": 1.405045295588739, "grad_norm": 2.790278673171997, 
"learning_rate": 0.00015949650814075356, "loss": 3.544078369140625, "step": 135400 }, { "epoch": 1.4060829952162046, "grad_norm": 5.150670528411865, "learning_rate": 0.00015939273817800698, "loss": 3.7144375610351563, "step": 135500 }, { "epoch": 1.4071206948436705, "grad_norm": 5.606545448303223, "learning_rate": 0.00015928896821526038, "loss": 3.719892578125, "step": 135600 }, { "epoch": 1.4081583944711364, "grad_norm": 15.23755931854248, "learning_rate": 0.0001591851982525138, "loss": 3.649613952636719, "step": 135700 }, { "epoch": 1.4091960940986021, "grad_norm": 20.73650550842285, "learning_rate": 0.00015908142828976725, "loss": 3.6828762817382814, "step": 135800 }, { "epoch": 1.410233793726068, "grad_norm": 8.400344848632812, "learning_rate": 0.00015897765832702064, "loss": 3.6613919067382814, "step": 135900 }, { "epoch": 1.411271493353534, "grad_norm": 2.5724685192108154, "learning_rate": 0.00015887388836427407, "loss": 3.657626037597656, "step": 136000 }, { "epoch": 1.4123091929809997, "grad_norm": 19.325956344604492, "learning_rate": 0.00015877011840152746, "loss": 3.8178024291992188, "step": 136100 }, { "epoch": 1.4133468926084656, "grad_norm": 2.402404308319092, "learning_rate": 0.0001586663484387809, "loss": 3.59340576171875, "step": 136200 }, { "epoch": 1.4143845922359315, "grad_norm": 6.188352108001709, "learning_rate": 0.00015856257847603433, "loss": 3.6710971069335936, "step": 136300 }, { "epoch": 1.4154222918633972, "grad_norm": 4.21588659286499, "learning_rate": 0.00015845880851328773, "loss": 3.721273193359375, "step": 136400 }, { "epoch": 1.416459991490863, "grad_norm": 4.4968485832214355, "learning_rate": 0.00015835503855054115, "loss": 3.6669491577148436, "step": 136500 }, { "epoch": 1.4174976911183288, "grad_norm": 7.214438438415527, "learning_rate": 0.00015825126858779454, "loss": 3.799635925292969, "step": 136600 }, { "epoch": 1.4185353907457947, "grad_norm": 7.262329578399658, "learning_rate": 0.000158147498625048, "loss": 
3.807882995605469, "step": 136700 }, { "epoch": 1.4195730903732606, "grad_norm": 3.5909628868103027, "learning_rate": 0.00015804372866230139, "loss": 3.7313577270507814, "step": 136800 }, { "epoch": 1.4206107900007263, "grad_norm": 10.205459594726562, "learning_rate": 0.0001579399586995548, "loss": 3.675950622558594, "step": 136900 }, { "epoch": 1.4216484896281922, "grad_norm": 5.25307559967041, "learning_rate": 0.00015783618873680826, "loss": 3.6014810180664063, "step": 137000 }, { "epoch": 1.422686189255658, "grad_norm": 42.26997756958008, "learning_rate": 0.00015773241877406165, "loss": 3.6278192138671876, "step": 137100 }, { "epoch": 1.4237238888831238, "grad_norm": 6.092323303222656, "learning_rate": 0.00015762864881131507, "loss": 3.555603332519531, "step": 137200 }, { "epoch": 1.4247615885105898, "grad_norm": 2.74434232711792, "learning_rate": 0.00015752487884856847, "loss": 3.5426220703125, "step": 137300 }, { "epoch": 1.4257992881380557, "grad_norm": 13.12152099609375, "learning_rate": 0.0001574211088858219, "loss": 3.7107192993164064, "step": 137400 }, { "epoch": 1.4268369877655214, "grad_norm": 3.9462010860443115, "learning_rate": 0.00015731733892307529, "loss": 3.5455560302734375, "step": 137500 }, { "epoch": 1.4278746873929873, "grad_norm": 3.7687721252441406, "learning_rate": 0.00015721356896032873, "loss": 3.630052490234375, "step": 137600 }, { "epoch": 1.428912387020453, "grad_norm": 4.470894813537598, "learning_rate": 0.00015710979899758216, "loss": 3.627494201660156, "step": 137700 }, { "epoch": 1.429950086647919, "grad_norm": 4.3846259117126465, "learning_rate": 0.00015700602903483555, "loss": 3.5804782104492188, "step": 137800 }, { "epoch": 1.4309877862753848, "grad_norm": 3.9794013500213623, "learning_rate": 0.000156902259072089, "loss": 3.739950866699219, "step": 137900 }, { "epoch": 1.4320254859028505, "grad_norm": 10.886957168579102, "learning_rate": 0.0001567984891093424, "loss": 3.7072845458984376, "step": 138000 }, { "epoch": 
1.4330631855303164, "grad_norm": 4.187902927398682, "learning_rate": 0.00015669471914659582, "loss": 3.64345703125, "step": 138100 }, { "epoch": 1.4341008851577821, "grad_norm": 32.209293365478516, "learning_rate": 0.00015659094918384924, "loss": 3.6210546875, "step": 138200 }, { "epoch": 1.435138584785248, "grad_norm": 3.12260365486145, "learning_rate": 0.00015648717922110263, "loss": 3.7005911254882813, "step": 138300 }, { "epoch": 1.436176284412714, "grad_norm": 6.220150470733643, "learning_rate": 0.00015638340925835608, "loss": 3.7236618041992187, "step": 138400 }, { "epoch": 1.4372139840401799, "grad_norm": 2.38154673576355, "learning_rate": 0.00015627963929560948, "loss": 3.633033447265625, "step": 138500 }, { "epoch": 1.4382516836676456, "grad_norm": 7.884495258331299, "learning_rate": 0.0001561758693328629, "loss": 3.5666903686523437, "step": 138600 }, { "epoch": 1.4392893832951115, "grad_norm": 3.8970346450805664, "learning_rate": 0.0001560720993701163, "loss": 3.6862808227539063, "step": 138700 }, { "epoch": 1.4403270829225772, "grad_norm": 3.273268461227417, "learning_rate": 0.00015596832940736972, "loss": 3.6251177978515625, "step": 138800 }, { "epoch": 1.441364782550043, "grad_norm": 3.0285887718200684, "learning_rate": 0.00015586455944462317, "loss": 3.61291015625, "step": 138900 }, { "epoch": 1.442402482177509, "grad_norm": 3.4767589569091797, "learning_rate": 0.00015576078948187656, "loss": 3.6781646728515627, "step": 139000 }, { "epoch": 1.4434401818049747, "grad_norm": 156.1669158935547, "learning_rate": 0.00015565701951912998, "loss": 3.6272451782226565, "step": 139100 }, { "epoch": 1.4444778814324406, "grad_norm": 2.3591196537017822, "learning_rate": 0.00015555324955638338, "loss": 3.589447021484375, "step": 139200 }, { "epoch": 1.4455155810599063, "grad_norm": 3.8040847778320312, "learning_rate": 0.00015544947959363683, "loss": 3.64208251953125, "step": 139300 }, { "epoch": 1.4465532806873722, "grad_norm": 2.655759811401367, "learning_rate": 
0.00015534570963089022, "loss": 3.671148376464844, "step": 139400 }, { "epoch": 1.4475909803148381, "grad_norm": 7.29696798324585, "learning_rate": 0.00015524193966814364, "loss": 3.751770324707031, "step": 139500 }, { "epoch": 1.4486286799423038, "grad_norm": 6.334928035736084, "learning_rate": 0.00015513816970539706, "loss": 3.7970040893554686, "step": 139600 }, { "epoch": 1.4496663795697697, "grad_norm": 6.7520623207092285, "learning_rate": 0.00015503439974265046, "loss": 3.6929965209960938, "step": 139700 }, { "epoch": 1.4507040791972354, "grad_norm": 10.428074836730957, "learning_rate": 0.0001549306297799039, "loss": 3.734377136230469, "step": 139800 }, { "epoch": 1.4517417788247013, "grad_norm": 8.371795654296875, "learning_rate": 0.0001548268598171573, "loss": 3.6029412841796873, "step": 139900 }, { "epoch": 1.4527794784521673, "grad_norm": 3.291740894317627, "learning_rate": 0.00015472308985441073, "loss": 3.6670523071289063, "step": 140000 }, { "epoch": 1.4538171780796332, "grad_norm": 7.120608806610107, "learning_rate": 0.00015461931989166417, "loss": 3.638569030761719, "step": 140100 }, { "epoch": 1.4548548777070989, "grad_norm": 6.361410617828369, "learning_rate": 0.00015451554992891757, "loss": 3.661440734863281, "step": 140200 }, { "epoch": 1.4558925773345648, "grad_norm": 3.5337114334106445, "learning_rate": 0.000154411779966171, "loss": 3.69423828125, "step": 140300 }, { "epoch": 1.4569302769620305, "grad_norm": 8.946898460388184, "learning_rate": 0.00015430801000342439, "loss": 3.636510925292969, "step": 140400 }, { "epoch": 1.4579679765894964, "grad_norm": 3.5454866886138916, "learning_rate": 0.0001542042400406778, "loss": 3.833760986328125, "step": 140500 }, { "epoch": 1.4590056762169623, "grad_norm": 20.629167556762695, "learning_rate": 0.0001541004700779312, "loss": 3.740248718261719, "step": 140600 }, { "epoch": 1.460043375844428, "grad_norm": 3.0284929275512695, "learning_rate": 0.00015399670011518465, "loss": 3.6760980224609376, "step": 
140700 }, { "epoch": 1.461081075471894, "grad_norm": 4.971894264221191, "learning_rate": 0.00015389293015243807, "loss": 3.600714111328125, "step": 140800 }, { "epoch": 1.4621187750993596, "grad_norm": 3.689394950866699, "learning_rate": 0.00015378916018969147, "loss": 3.5257861328125, "step": 140900 }, { "epoch": 1.4631564747268255, "grad_norm": 4.305582523345947, "learning_rate": 0.00015368539022694492, "loss": 3.66658447265625, "step": 141000 }, { "epoch": 1.4641941743542914, "grad_norm": 12.191847801208496, "learning_rate": 0.0001535816202641983, "loss": 3.5539178466796875, "step": 141100 }, { "epoch": 1.4652318739817574, "grad_norm": 5.9276814460754395, "learning_rate": 0.00015347785030145173, "loss": 3.712036437988281, "step": 141200 }, { "epoch": 1.466269573609223, "grad_norm": 7.3767008781433105, "learning_rate": 0.00015337408033870513, "loss": 3.688995361328125, "step": 141300 }, { "epoch": 1.467307273236689, "grad_norm": 4.156796932220459, "learning_rate": 0.00015327031037595855, "loss": 3.5971023559570314, "step": 141400 }, { "epoch": 1.4683449728641547, "grad_norm": 3.876843214035034, "learning_rate": 0.000153166540413212, "loss": 3.7138726806640623, "step": 141500 }, { "epoch": 1.4693826724916206, "grad_norm": 2.5647096633911133, "learning_rate": 0.0001530627704504654, "loss": 3.575816650390625, "step": 141600 }, { "epoch": 1.4704203721190865, "grad_norm": 6.341168403625488, "learning_rate": 0.00015295900048771882, "loss": 3.675234375, "step": 141700 }, { "epoch": 1.4714580717465522, "grad_norm": 11.66984748840332, "learning_rate": 0.0001528552305249722, "loss": 3.5949581909179686, "step": 141800 }, { "epoch": 1.472495771374018, "grad_norm": 2.7472872734069824, "learning_rate": 0.00015275146056222563, "loss": 3.4315753173828125, "step": 141900 }, { "epoch": 1.4735334710014838, "grad_norm": 2.7182295322418213, "learning_rate": 0.00015264769059947908, "loss": 3.580435791015625, "step": 142000 }, { "epoch": 1.4745711706289497, "grad_norm": 
7.28167200088501, "learning_rate": 0.00015254392063673248, "loss": 3.6344500732421876, "step": 142100 }, { "epoch": 1.4756088702564156, "grad_norm": 3.1541340351104736, "learning_rate": 0.0001524401506739859, "loss": 3.6579803466796874, "step": 142200 }, { "epoch": 1.4766465698838815, "grad_norm": 4.42963171005249, "learning_rate": 0.0001523363807112393, "loss": 3.5743417358398437, "step": 142300 }, { "epoch": 1.4776842695113472, "grad_norm": 7.278059005737305, "learning_rate": 0.00015223261074849274, "loss": 3.7834173583984376, "step": 142400 }, { "epoch": 1.4787219691388132, "grad_norm": 10.52426528930664, "learning_rate": 0.00015212884078574614, "loss": 3.6968179321289063, "step": 142500 }, { "epoch": 1.4797596687662788, "grad_norm": 3.5773837566375732, "learning_rate": 0.00015202507082299956, "loss": 3.6810809326171876, "step": 142600 }, { "epoch": 1.4807973683937448, "grad_norm": 3.344587802886963, "learning_rate": 0.00015192130086025298, "loss": 3.6345669555664064, "step": 142700 }, { "epoch": 1.4818350680212107, "grad_norm": 6.329004287719727, "learning_rate": 0.00015181753089750638, "loss": 3.647319641113281, "step": 142800 }, { "epoch": 1.4828727676486764, "grad_norm": 6.577507495880127, "learning_rate": 0.00015171376093475983, "loss": 3.5769888305664064, "step": 142900 }, { "epoch": 1.4839104672761423, "grad_norm": 4.545724391937256, "learning_rate": 0.00015160999097201322, "loss": 3.583935546875, "step": 143000 }, { "epoch": 1.484948166903608, "grad_norm": 13.324125289916992, "learning_rate": 0.00015150622100926664, "loss": 3.612706604003906, "step": 143100 }, { "epoch": 1.485985866531074, "grad_norm": 4.545955657958984, "learning_rate": 0.00015140245104652004, "loss": 3.4066473388671876, "step": 143200 }, { "epoch": 1.4870235661585398, "grad_norm": 8.517041206359863, "learning_rate": 0.00015129868108377349, "loss": 3.6258444213867187, "step": 143300 }, { "epoch": 1.4880612657860055, "grad_norm": 5.813758373260498, "learning_rate": 0.0001511949111210269, 
"loss": 3.686318054199219, "step": 143400 }, { "epoch": 1.4890989654134714, "grad_norm": 6.236087322235107, "learning_rate": 0.0001510911411582803, "loss": 3.7458810424804687, "step": 143500 }, { "epoch": 1.4901366650409371, "grad_norm": 5.874231815338135, "learning_rate": 0.00015098737119553373, "loss": 3.6481814575195313, "step": 143600 }, { "epoch": 1.491174364668403, "grad_norm": 7.229684829711914, "learning_rate": 0.00015088360123278712, "loss": 3.6855035400390626, "step": 143700 }, { "epoch": 1.492212064295869, "grad_norm": 7.212390422821045, "learning_rate": 0.00015077983127004057, "loss": 3.750265808105469, "step": 143800 }, { "epoch": 1.4932497639233349, "grad_norm": 5.408252239227295, "learning_rate": 0.000150676061307294, "loss": 3.5695159912109373, "step": 143900 }, { "epoch": 1.4942874635508006, "grad_norm": 8.125064849853516, "learning_rate": 0.00015057229134454739, "loss": 3.642791442871094, "step": 144000 }, { "epoch": 1.4953251631782665, "grad_norm": 5.047210216522217, "learning_rate": 0.00015046852138180083, "loss": 3.588906555175781, "step": 144100 }, { "epoch": 1.4963628628057322, "grad_norm": 2.775951623916626, "learning_rate": 0.00015036475141905423, "loss": 3.672796325683594, "step": 144200 }, { "epoch": 1.497400562433198, "grad_norm": 7.114427089691162, "learning_rate": 0.00015026098145630765, "loss": 3.7460537719726563, "step": 144300 }, { "epoch": 1.498438262060664, "grad_norm": 4.1067585945129395, "learning_rate": 0.00015015721149356105, "loss": 3.4047305297851564, "step": 144400 }, { "epoch": 1.4994759616881297, "grad_norm": 6.3360276222229, "learning_rate": 0.00015005344153081447, "loss": 3.6055087280273437, "step": 144500 }, { "epoch": 1.5005136613155956, "grad_norm": 3.8499081134796143, "learning_rate": 0.0001499496715680679, "loss": 3.6976129150390626, "step": 144600 }, { "epoch": 1.5015513609430613, "grad_norm": 4.669349193572998, "learning_rate": 0.0001498459016053213, "loss": 3.6301043701171873, "step": 144700 }, { "epoch": 
1.5025890605705272, "grad_norm": 12.484715461730957, "learning_rate": 0.0001497421316425747, "loss": 3.6376629638671876, "step": 144800 }, { "epoch": 1.5036267601979931, "grad_norm": 3.1881167888641357, "learning_rate": 0.00014963836167982816, "loss": 3.688013000488281, "step": 144900 }, { "epoch": 1.504664459825459, "grad_norm": 3.1999073028564453, "learning_rate": 0.00014953459171708158, "loss": 3.767580871582031, "step": 145000 }, { "epoch": 1.5057021594529247, "grad_norm": 2.503138303756714, "learning_rate": 0.00014943082175433497, "loss": 3.772780456542969, "step": 145100 }, { "epoch": 1.5067398590803904, "grad_norm": 5.124083995819092, "learning_rate": 0.0001493270517915884, "loss": 3.709577941894531, "step": 145200 }, { "epoch": 1.5077775587078563, "grad_norm": 12.24608039855957, "learning_rate": 0.00014922328182884182, "loss": 3.46869140625, "step": 145300 }, { "epoch": 1.5088152583353223, "grad_norm": 11.273271560668945, "learning_rate": 0.0001491195118660952, "loss": 3.4797503662109377, "step": 145400 }, { "epoch": 1.5098529579627882, "grad_norm": 60.867916107177734, "learning_rate": 0.00014901574190334866, "loss": 3.54853515625, "step": 145500 }, { "epoch": 1.5108906575902539, "grad_norm": 4.276978969573975, "learning_rate": 0.00014891197194060206, "loss": 3.908219299316406, "step": 145600 }, { "epoch": 1.5119283572177198, "grad_norm": 2.901015281677246, "learning_rate": 0.00014880820197785548, "loss": 3.4694091796875, "step": 145700 }, { "epoch": 1.5129660568451855, "grad_norm": 2.3719887733459473, "learning_rate": 0.0001487044320151089, "loss": 3.7670758056640623, "step": 145800 }, { "epoch": 1.5140037564726514, "grad_norm": 2.4967026710510254, "learning_rate": 0.0001486006620523623, "loss": 3.635834045410156, "step": 145900 }, { "epoch": 1.5150414561001173, "grad_norm": 3.604675769805908, "learning_rate": 0.00014849689208961572, "loss": 3.5507608032226563, "step": 146000 }, { "epoch": 1.5160791557275832, "grad_norm": 5.442782402038574, 
"learning_rate": 0.00014839312212686916, "loss": 3.5730636596679686, "step": 146100 }, { "epoch": 1.517116855355049, "grad_norm": 3.7341339588165283, "learning_rate": 0.00014828935216412256, "loss": 3.569194641113281, "step": 146200 }, { "epoch": 1.5181545549825146, "grad_norm": 12.070112228393555, "learning_rate": 0.00014818558220137598, "loss": 3.60053955078125, "step": 146300 }, { "epoch": 1.5191922546099805, "grad_norm": 5.036438941955566, "learning_rate": 0.0001480818122386294, "loss": 3.7114804077148436, "step": 146400 }, { "epoch": 1.5202299542374464, "grad_norm": 10.83106803894043, "learning_rate": 0.0001479780422758828, "loss": 3.5428836059570314, "step": 146500 }, { "epoch": 1.5212676538649124, "grad_norm": 9.07150650024414, "learning_rate": 0.00014787427231313622, "loss": 3.6087515258789065, "step": 146600 }, { "epoch": 1.522305353492378, "grad_norm": 3.6539382934570312, "learning_rate": 0.00014777050235038964, "loss": 3.6974029541015625, "step": 146700 }, { "epoch": 1.523343053119844, "grad_norm": 2.5568654537200928, "learning_rate": 0.00014766673238764306, "loss": 3.7100448608398438, "step": 146800 }, { "epoch": 1.5243807527473097, "grad_norm": 5.767122745513916, "learning_rate": 0.00014756296242489649, "loss": 3.494932861328125, "step": 146900 }, { "epoch": 1.5254184523747756, "grad_norm": 5.006596088409424, "learning_rate": 0.00014745919246214988, "loss": 3.804518737792969, "step": 147000 }, { "epoch": 1.5264561520022415, "grad_norm": 3.907433271408081, "learning_rate": 0.0001473554224994033, "loss": 3.6617333984375, "step": 147100 }, { "epoch": 1.5274938516297074, "grad_norm": 6.253331184387207, "learning_rate": 0.00014725165253665672, "loss": 3.611311950683594, "step": 147200 }, { "epoch": 1.528531551257173, "grad_norm": 5.735301494598389, "learning_rate": 0.00014714788257391015, "loss": 3.605543518066406, "step": 147300 }, { "epoch": 1.5295692508846388, "grad_norm": 1.7375198602676392, "learning_rate": 0.00014704411261116357, "loss": 
3.6379776000976562, "step": 147400 }, { "epoch": 1.5306069505121047, "grad_norm": 4.913732051849365, "learning_rate": 0.000146940342648417, "loss": 3.757569580078125, "step": 147500 }, { "epoch": 1.5316446501395706, "grad_norm": 3.887519598007202, "learning_rate": 0.00014683657268567039, "loss": 3.654621887207031, "step": 147600 }, { "epoch": 1.5326823497670365, "grad_norm": 45.76445007324219, "learning_rate": 0.0001467328027229238, "loss": 3.611448059082031, "step": 147700 }, { "epoch": 1.5337200493945022, "grad_norm": 3.629575729370117, "learning_rate": 0.00014662903276017723, "loss": 3.6693844604492187, "step": 147800 }, { "epoch": 1.5347577490219682, "grad_norm": 2.453900098800659, "learning_rate": 0.00014652526279743062, "loss": 3.6880978393554686, "step": 147900 }, { "epoch": 1.5357954486494338, "grad_norm": 3.411557674407959, "learning_rate": 0.00014642149283468407, "loss": 3.656671447753906, "step": 148000 }, { "epoch": 1.5368331482768998, "grad_norm": 3.5617477893829346, "learning_rate": 0.0001463177228719375, "loss": 3.706895446777344, "step": 148100 }, { "epoch": 1.5378708479043657, "grad_norm": 3.5422544479370117, "learning_rate": 0.0001462139529091909, "loss": 3.605690612792969, "step": 148200 }, { "epoch": 1.5389085475318316, "grad_norm": 3.9814698696136475, "learning_rate": 0.0001461101829464443, "loss": 3.6530465698242187, "step": 148300 }, { "epoch": 1.5399462471592973, "grad_norm": 10.028122901916504, "learning_rate": 0.00014600641298369773, "loss": 3.623879089355469, "step": 148400 }, { "epoch": 1.540983946786763, "grad_norm": 3.4206697940826416, "learning_rate": 0.00014590264302095113, "loss": 3.517763366699219, "step": 148500 }, { "epoch": 1.542021646414229, "grad_norm": 3.4238781929016113, "learning_rate": 0.00014579887305820455, "loss": 3.52829833984375, "step": 148600 }, { "epoch": 1.5430593460416948, "grad_norm": 58.35453414916992, "learning_rate": 0.00014569510309545797, "loss": 3.682017517089844, "step": 148700 }, { "epoch": 
1.5440970456691607, "grad_norm": 4.933131217956543, "learning_rate": 0.0001455913331327114, "loss": 3.577257080078125, "step": 148800 }, { "epoch": 1.5451347452966264, "grad_norm": 17.892318725585938, "learning_rate": 0.00014548756316996482, "loss": 3.710743713378906, "step": 148900 }, { "epoch": 1.5461724449240921, "grad_norm": 6.2961249351501465, "learning_rate": 0.0001453837932072182, "loss": 3.6647821044921876, "step": 149000 }, { "epoch": 1.547210144551558, "grad_norm": 4.278889179229736, "learning_rate": 0.00014528002324447163, "loss": 3.613748779296875, "step": 149100 }, { "epoch": 1.548247844179024, "grad_norm": 3.2785260677337646, "learning_rate": 0.00014517625328172505, "loss": 3.6411376953125, "step": 149200 }, { "epoch": 1.5492855438064899, "grad_norm": 3.227151393890381, "learning_rate": 0.00014507248331897848, "loss": 3.758666687011719, "step": 149300 }, { "epoch": 1.5503232434339556, "grad_norm": 2.6391334533691406, "learning_rate": 0.0001449687133562319, "loss": 3.5469485473632814, "step": 149400 }, { "epoch": 1.5513609430614215, "grad_norm": 2.5920772552490234, "learning_rate": 0.00014486494339348532, "loss": 3.621335754394531, "step": 149500 }, { "epoch": 1.5523986426888872, "grad_norm": 2.864225387573242, "learning_rate": 0.00014476117343073872, "loss": 3.6408596801757813, "step": 149600 }, { "epoch": 1.553436342316353, "grad_norm": 4.697976112365723, "learning_rate": 0.00014465740346799214, "loss": 3.6993423461914063, "step": 149700 }, { "epoch": 1.554474041943819, "grad_norm": 4.074455738067627, "learning_rate": 0.00014455363350524556, "loss": 3.6419488525390626, "step": 149800 }, { "epoch": 1.555511741571285, "grad_norm": 2.933537721633911, "learning_rate": 0.00014444986354249898, "loss": 3.622572326660156, "step": 149900 }, { "epoch": 1.5565494411987506, "grad_norm": 5.856564521789551, "learning_rate": 0.0001443460935797524, "loss": 3.7532833862304686, "step": 150000 }, { "epoch": 1.5575871408262163, "grad_norm": 4.24385929107666, 
"learning_rate": 0.00014424232361700583, "loss": 3.67490234375, "step": 150100 }, { "epoch": 1.5586248404536822, "grad_norm": 5.053845405578613, "learning_rate": 0.00014413855365425922, "loss": 3.7350125122070312, "step": 150200 }, { "epoch": 1.5596625400811481, "grad_norm": 3.423252582550049, "learning_rate": 0.00014403478369151264, "loss": 3.522652893066406, "step": 150300 }, { "epoch": 1.560700239708614, "grad_norm": 8.40445327758789, "learning_rate": 0.00014393101372876606, "loss": 3.480498962402344, "step": 150400 }, { "epoch": 1.5617379393360797, "grad_norm": 3.1955294609069824, "learning_rate": 0.00014382724376601946, "loss": 3.6813082885742188, "step": 150500 }, { "epoch": 1.5627756389635457, "grad_norm": 6.0853681564331055, "learning_rate": 0.0001437234738032729, "loss": 3.6238223266601564, "step": 150600 }, { "epoch": 1.5638133385910113, "grad_norm": 5.178461074829102, "learning_rate": 0.0001436197038405263, "loss": 3.6469857788085935, "step": 150700 }, { "epoch": 1.5648510382184773, "grad_norm": 8.24820613861084, "learning_rate": 0.00014351593387777972, "loss": 3.6198629760742187, "step": 150800 }, { "epoch": 1.5658887378459432, "grad_norm": 4.228358745574951, "learning_rate": 0.00014341216391503315, "loss": 3.4716970825195315, "step": 150900 }, { "epoch": 1.566926437473409, "grad_norm": 3.555584192276001, "learning_rate": 0.00014330839395228654, "loss": 3.739703369140625, "step": 151000 }, { "epoch": 1.5679641371008748, "grad_norm": 5.781318187713623, "learning_rate": 0.00014320462398953996, "loss": 3.5981024169921874, "step": 151100 }, { "epoch": 1.5690018367283405, "grad_norm": 6.903919696807861, "learning_rate": 0.0001431008540267934, "loss": 3.5764788818359374, "step": 151200 }, { "epoch": 1.5700395363558064, "grad_norm": 3.584331512451172, "learning_rate": 0.0001429970840640468, "loss": 3.6005426025390626, "step": 151300 }, { "epoch": 1.5710772359832723, "grad_norm": 4.393853664398193, "learning_rate": 0.00014289331410130023, "loss": 
3.78184814453125, "step": 151400 }, { "epoch": 1.5721149356107382, "grad_norm": 2.4552299976348877, "learning_rate": 0.00014278954413855365, "loss": 3.7241311645507813, "step": 151500 }, { "epoch": 1.573152635238204, "grad_norm": 6.105810642242432, "learning_rate": 0.00014268577417580705, "loss": 3.6668280029296874, "step": 151600 }, { "epoch": 1.5741903348656698, "grad_norm": 5.4593939781188965, "learning_rate": 0.00014258200421306047, "loss": 3.6350604248046876, "step": 151700 }, { "epoch": 1.5752280344931355, "grad_norm": 8.01681900024414, "learning_rate": 0.0001424782342503139, "loss": 3.636524658203125, "step": 151800 }, { "epoch": 1.5762657341206014, "grad_norm": 27.08595848083496, "learning_rate": 0.0001423744642875673, "loss": 3.7312826538085937, "step": 151900 }, { "epoch": 1.5773034337480674, "grad_norm": 3.227189064025879, "learning_rate": 0.00014227069432482073, "loss": 3.54576416015625, "step": 152000 }, { "epoch": 1.5783411333755333, "grad_norm": 3.922788619995117, "learning_rate": 0.00014216692436207413, "loss": 3.762126770019531, "step": 152100 }, { "epoch": 1.579378833002999, "grad_norm": 11.172755241394043, "learning_rate": 0.00014206315439932755, "loss": 3.6238665771484375, "step": 152200 }, { "epoch": 1.5804165326304647, "grad_norm": 4.898155212402344, "learning_rate": 0.00014195938443658097, "loss": 3.5397454833984376, "step": 152300 }, { "epoch": 1.5814542322579306, "grad_norm": 4.228941440582275, "learning_rate": 0.0001418556144738344, "loss": 3.482630615234375, "step": 152400 }, { "epoch": 1.5824919318853965, "grad_norm": 3.2711164951324463, "learning_rate": 0.00014175184451108782, "loss": 3.55691162109375, "step": 152500 }, { "epoch": 1.5835296315128624, "grad_norm": 4.924630641937256, "learning_rate": 0.00014164807454834124, "loss": 3.6941983032226564, "step": 152600 }, { "epoch": 1.584567331140328, "grad_norm": 4.247806072235107, "learning_rate": 0.00014154430458559463, "loss": 3.704905700683594, "step": 152700 }, { "epoch": 
1.5856050307677938, "grad_norm": 5.901268482208252, "learning_rate": 0.00014144053462284805, "loss": 3.4900387573242186, "step": 152800 }, { "epoch": 1.5866427303952597, "grad_norm": 2.9829347133636475, "learning_rate": 0.00014133676466010148, "loss": 3.560227966308594, "step": 152900 }, { "epoch": 1.5876804300227256, "grad_norm": 3.3158979415893555, "learning_rate": 0.00014123299469735487, "loss": 3.6083251953125, "step": 153000 }, { "epoch": 1.5887181296501915, "grad_norm": 3.4291346073150635, "learning_rate": 0.00014112922473460832, "loss": 3.634643859863281, "step": 153100 }, { "epoch": 1.5897558292776572, "grad_norm": 6.855015754699707, "learning_rate": 0.00014102545477186174, "loss": 3.5994863891601563, "step": 153200 }, { "epoch": 1.5907935289051232, "grad_norm": 5.0481133460998535, "learning_rate": 0.00014092168480911514, "loss": 3.7016021728515627, "step": 153300 }, { "epoch": 1.5918312285325888, "grad_norm": 7.888632297515869, "learning_rate": 0.00014081791484636856, "loss": 3.531593017578125, "step": 153400 }, { "epoch": 1.5928689281600548, "grad_norm": 3.533106565475464, "learning_rate": 0.00014071414488362198, "loss": 3.671497497558594, "step": 153500 }, { "epoch": 1.5939066277875207, "grad_norm": 3.2950990200042725, "learning_rate": 0.00014061037492087538, "loss": 3.6725836181640625, "step": 153600 }, { "epoch": 1.5949443274149866, "grad_norm": 5.21208381652832, "learning_rate": 0.00014050660495812882, "loss": 3.6607846069335936, "step": 153700 }, { "epoch": 1.5959820270424523, "grad_norm": 2.718191385269165, "learning_rate": 0.00014040283499538222, "loss": 3.607443542480469, "step": 153800 }, { "epoch": 1.597019726669918, "grad_norm": 3.6571433544158936, "learning_rate": 0.00014029906503263564, "loss": 3.675062255859375, "step": 153900 }, { "epoch": 1.598057426297384, "grad_norm": 2.440661907196045, "learning_rate": 0.00014019529506988906, "loss": 3.4682305908203124, "step": 154000 }, { "epoch": 1.5990951259248498, "grad_norm": 4.171643257141113, 
"learning_rate": 0.00014009152510714246, "loss": 3.682950134277344, "step": 154100 }, { "epoch": 1.6001328255523157, "grad_norm": 7.624752998352051, "learning_rate": 0.00013998775514439588, "loss": 3.67526611328125, "step": 154200 }, { "epoch": 1.6011705251797814, "grad_norm": 7.279924392700195, "learning_rate": 0.0001398839851816493, "loss": 3.6037884521484376, "step": 154300 }, { "epoch": 1.6022082248072473, "grad_norm": 3.2470226287841797, "learning_rate": 0.00013978021521890272, "loss": 3.658772277832031, "step": 154400 }, { "epoch": 1.603245924434713, "grad_norm": 5.602239608764648, "learning_rate": 0.00013967644525615615, "loss": 3.5984457397460936, "step": 154500 }, { "epoch": 1.604283624062179, "grad_norm": 3.6453311443328857, "learning_rate": 0.00013957267529340957, "loss": 3.388334045410156, "step": 154600 }, { "epoch": 1.6053213236896449, "grad_norm": 6.957507610321045, "learning_rate": 0.00013946890533066296, "loss": 3.617900695800781, "step": 154700 }, { "epoch": 1.6063590233171108, "grad_norm": 15.978106498718262, "learning_rate": 0.00013936513536791638, "loss": 3.514501647949219, "step": 154800 }, { "epoch": 1.6073967229445765, "grad_norm": 4.719081401824951, "learning_rate": 0.0001392613654051698, "loss": 3.6095266723632813, "step": 154900 }, { "epoch": 1.6084344225720422, "grad_norm": 3.6483592987060547, "learning_rate": 0.00013915759544242323, "loss": 3.6144635009765627, "step": 155000 }, { "epoch": 1.609472122199508, "grad_norm": 3.3481674194335938, "learning_rate": 0.00013905382547967665, "loss": 3.5398931884765625, "step": 155100 }, { "epoch": 1.610509821826974, "grad_norm": 6.413243293762207, "learning_rate": 0.00013895005551693007, "loss": 3.6416336059570313, "step": 155200 }, { "epoch": 1.61154752145444, "grad_norm": 7.17488431930542, "learning_rate": 0.00013884628555418347, "loss": 3.717559814453125, "step": 155300 }, { "epoch": 1.6125852210819056, "grad_norm": 6.735267162322998, "learning_rate": 0.0001387425155914369, "loss": 
3.6701150512695313, "step": 155400 }, { "epoch": 1.6136229207093713, "grad_norm": 3.489192008972168, "learning_rate": 0.0001386387456286903, "loss": 3.607757263183594, "step": 155500 }, { "epoch": 1.6146606203368372, "grad_norm": 4.3538360595703125, "learning_rate": 0.00013853497566594373, "loss": 3.6339166259765623, "step": 155600 }, { "epoch": 1.6156983199643031, "grad_norm": 17.20830535888672, "learning_rate": 0.00013843120570319715, "loss": 3.42401611328125, "step": 155700 }, { "epoch": 1.616736019591769, "grad_norm": 2.5314135551452637, "learning_rate": 0.00013832743574045055, "loss": 3.527308349609375, "step": 155800 }, { "epoch": 1.617773719219235, "grad_norm": 4.076705455780029, "learning_rate": 0.00013822366577770397, "loss": 3.5527752685546874, "step": 155900 }, { "epoch": 1.6188114188467007, "grad_norm": 3.8894543647766113, "learning_rate": 0.0001381198958149574, "loss": 3.68035400390625, "step": 156000 }, { "epoch": 1.6198491184741663, "grad_norm": 17.054737091064453, "learning_rate": 0.0001380161258522108, "loss": 3.4780517578125, "step": 156100 }, { "epoch": 1.6208868181016323, "grad_norm": 20.06046485900879, "learning_rate": 0.0001379123558894642, "loss": 3.5436282348632813, "step": 156200 }, { "epoch": 1.6219245177290982, "grad_norm": 3.36186146736145, "learning_rate": 0.00013780858592671766, "loss": 3.6915762329101565, "step": 156300 }, { "epoch": 1.622962217356564, "grad_norm": 3.333552360534668, "learning_rate": 0.00013770481596397105, "loss": 3.551458740234375, "step": 156400 }, { "epoch": 1.6239999169840298, "grad_norm": 16.679468154907227, "learning_rate": 0.00013760104600122448, "loss": 3.5686306762695312, "step": 156500 }, { "epoch": 1.6250376166114955, "grad_norm": 3.8986880779266357, "learning_rate": 0.0001374972760384779, "loss": 3.6233151245117186, "step": 156600 }, { "epoch": 1.6260753162389614, "grad_norm": 5.065491199493408, "learning_rate": 0.0001373935060757313, "loss": 3.6737161254882813, "step": 156700 }, { "epoch": 
1.6271130158664273, "grad_norm": 16.096450805664062, "learning_rate": 0.00013728973611298471, "loss": 3.6823269653320314, "step": 156800 }, { "epoch": 1.6281507154938932, "grad_norm": 3.939023733139038, "learning_rate": 0.00013718596615023814, "loss": 3.655545349121094, "step": 156900 }, { "epoch": 1.629188415121359, "grad_norm": 5.221971035003662, "learning_rate": 0.00013708219618749156, "loss": 3.5299761962890623, "step": 157000 }, { "epoch": 1.6302261147488248, "grad_norm": 4.515364646911621, "learning_rate": 0.00013697842622474498, "loss": 3.5957623291015626, "step": 157100 }, { "epoch": 1.6312638143762905, "grad_norm": 2.1334664821624756, "learning_rate": 0.00013687465626199838, "loss": 3.5724642944335936, "step": 157200 }, { "epoch": 1.6323015140037564, "grad_norm": 3.8212311267852783, "learning_rate": 0.0001367708862992518, "loss": 3.5870269775390624, "step": 157300 }, { "epoch": 1.6333392136312224, "grad_norm": 7.132654666900635, "learning_rate": 0.00013666711633650522, "loss": 3.5734619140625, "step": 157400 }, { "epoch": 1.6343769132586883, "grad_norm": 4.568203926086426, "learning_rate": 0.00013656334637375864, "loss": 3.6052120971679686, "step": 157500 }, { "epoch": 1.635414612886154, "grad_norm": 6.630765438079834, "learning_rate": 0.00013645957641101206, "loss": 3.7074453735351565, "step": 157600 }, { "epoch": 1.6364523125136197, "grad_norm": 9.513466835021973, "learning_rate": 0.00013635580644826549, "loss": 3.4421658325195312, "step": 157700 }, { "epoch": 1.6374900121410856, "grad_norm": 3.5600993633270264, "learning_rate": 0.00013625203648551888, "loss": 3.472029724121094, "step": 157800 }, { "epoch": 1.6385277117685515, "grad_norm": 3.796132802963257, "learning_rate": 0.0001361482665227723, "loss": 3.700109558105469, "step": 157900 }, { "epoch": 1.6395654113960174, "grad_norm": 5.419138431549072, "learning_rate": 0.00013604449656002572, "loss": 3.525767517089844, "step": 158000 }, { "epoch": 1.640603111023483, "grad_norm": 7.728092193603516, 
"learning_rate": 0.00013594072659727912, "loss": 3.4612411499023437, "step": 158100 }, { "epoch": 1.641640810650949, "grad_norm": 5.094764232635498, "learning_rate": 0.00013583695663453257, "loss": 3.5728485107421877, "step": 158200 }, { "epoch": 1.6426785102784147, "grad_norm": 7.930044174194336, "learning_rate": 0.000135733186671786, "loss": 3.547598571777344, "step": 158300 }, { "epoch": 1.6437162099058806, "grad_norm": 3.853911876678467, "learning_rate": 0.00013562941670903938, "loss": 3.5331781005859373, "step": 158400 }, { "epoch": 1.6447539095333465, "grad_norm": 14.153372764587402, "learning_rate": 0.0001355256467462928, "loss": 3.5483056640625, "step": 158500 }, { "epoch": 1.6457916091608125, "grad_norm": 4.353669166564941, "learning_rate": 0.00013542187678354623, "loss": 3.5902810668945313, "step": 158600 }, { "epoch": 1.6468293087882782, "grad_norm": 3.16603946685791, "learning_rate": 0.00013531810682079962, "loss": 3.5274386596679688, "step": 158700 }, { "epoch": 1.6478670084157439, "grad_norm": 5.928895950317383, "learning_rate": 0.00013521433685805307, "loss": 3.662962646484375, "step": 158800 }, { "epoch": 1.6489047080432098, "grad_norm": 4.497453689575195, "learning_rate": 0.00013511056689530647, "loss": 3.6771749877929687, "step": 158900 }, { "epoch": 1.6499424076706757, "grad_norm": 6.737712383270264, "learning_rate": 0.0001350067969325599, "loss": 3.546751708984375, "step": 159000 }, { "epoch": 1.6509801072981416, "grad_norm": 3.984771490097046, "learning_rate": 0.0001349030269698133, "loss": 3.5879977416992186, "step": 159100 }, { "epoch": 1.6520178069256073, "grad_norm": 7.267343521118164, "learning_rate": 0.0001347992570070667, "loss": 3.5557431030273436, "step": 159200 }, { "epoch": 1.653055506553073, "grad_norm": 5.349457263946533, "learning_rate": 0.00013469548704432013, "loss": 3.6174130249023437, "step": 159300 }, { "epoch": 1.654093206180539, "grad_norm": 3.6522059440612793, "learning_rate": 0.00013459171708157358, "loss": 
3.609751892089844, "step": 159400 }, { "epoch": 1.6551309058080048, "grad_norm": 5.704461574554443, "learning_rate": 0.00013448794711882697, "loss": 3.5679837036132813, "step": 159500 }, { "epoch": 1.6561686054354707, "grad_norm": 5.23817777633667, "learning_rate": 0.0001343841771560804, "loss": 3.5738253784179688, "step": 159600 }, { "epoch": 1.6572063050629366, "grad_norm": 12.301040649414062, "learning_rate": 0.00013428040719333382, "loss": 3.587038879394531, "step": 159700 }, { "epoch": 1.6582440046904023, "grad_norm": 6.761283874511719, "learning_rate": 0.0001341766372305872, "loss": 3.521001281738281, "step": 159800 }, { "epoch": 1.659281704317868, "grad_norm": 5.411608695983887, "learning_rate": 0.00013407286726784063, "loss": 3.473619384765625, "step": 159900 }, { "epoch": 1.660319403945334, "grad_norm": 14.189502716064453, "learning_rate": 0.00013396909730509405, "loss": 3.5413604736328126, "step": 160000 }, { "epoch": 1.6613571035727999, "grad_norm": 3.0541956424713135, "learning_rate": 0.00013386532734234748, "loss": 3.5548626708984377, "step": 160100 }, { "epoch": 1.6623948032002658, "grad_norm": 3.2475764751434326, "learning_rate": 0.0001337615573796009, "loss": 3.5887530517578123, "step": 160200 }, { "epoch": 1.6634325028277315, "grad_norm": 4.810506343841553, "learning_rate": 0.00013365778741685432, "loss": 3.6068450927734377, "step": 160300 }, { "epoch": 1.6644702024551972, "grad_norm": 11.347721099853516, "learning_rate": 0.00013355401745410771, "loss": 3.663785705566406, "step": 160400 }, { "epoch": 1.665507902082663, "grad_norm": 2.9197380542755127, "learning_rate": 0.00013345024749136114, "loss": 3.6435916137695314, "step": 160500 }, { "epoch": 1.666545601710129, "grad_norm": 5.3932037353515625, "learning_rate": 0.00013334647752861456, "loss": 3.6256121826171874, "step": 160600 }, { "epoch": 1.667583301337595, "grad_norm": 3.6826651096343994, "learning_rate": 0.00013324270756586798, "loss": 3.60268798828125, "step": 160700 }, { "epoch": 
1.6686210009650606, "grad_norm": 4.883547782897949, "learning_rate": 0.0001331389376031214, "loss": 3.508822326660156, "step": 160800 }, { "epoch": 1.6696587005925265, "grad_norm": 3.1789474487304688, "learning_rate": 0.0001330351676403748, "loss": 3.5955624389648437, "step": 160900 }, { "epoch": 1.6706964002199922, "grad_norm": 3.8428354263305664, "learning_rate": 0.00013293139767762822, "loss": 3.6681442260742188, "step": 161000 }, { "epoch": 1.6717340998474581, "grad_norm": 5.440670490264893, "learning_rate": 0.00013282762771488164, "loss": 3.65127197265625, "step": 161100 }, { "epoch": 1.672771799474924, "grad_norm": 4.737522125244141, "learning_rate": 0.00013272385775213504, "loss": 3.757344055175781, "step": 161200 }, { "epoch": 1.67380949910239, "grad_norm": 5.953054428100586, "learning_rate": 0.00013262008778938848, "loss": 3.690797119140625, "step": 161300 }, { "epoch": 1.6748471987298557, "grad_norm": 8.720730781555176, "learning_rate": 0.0001325163178266419, "loss": 3.790602722167969, "step": 161400 }, { "epoch": 1.6758848983573214, "grad_norm": 3.9143240451812744, "learning_rate": 0.0001324125478638953, "loss": 3.439073486328125, "step": 161500 }, { "epoch": 1.6769225979847873, "grad_norm": 4.572363376617432, "learning_rate": 0.00013230877790114872, "loss": 3.5498342895507813, "step": 161600 }, { "epoch": 1.6779602976122532, "grad_norm": 9.166924476623535, "learning_rate": 0.00013220500793840215, "loss": 3.479727478027344, "step": 161700 }, { "epoch": 1.678997997239719, "grad_norm": 2.0057218074798584, "learning_rate": 0.00013210123797565554, "loss": 3.72489990234375, "step": 161800 }, { "epoch": 1.6800356968671848, "grad_norm": 4.892455101013184, "learning_rate": 0.000131997468012909, "loss": 3.6359210205078125, "step": 161900 }, { "epoch": 1.6810733964946507, "grad_norm": 8.374796867370605, "learning_rate": 0.00013189369805016238, "loss": 3.5657424926757812, "step": 162000 }, { "epoch": 1.6821110961221164, "grad_norm": 3.702462911605835, 
"learning_rate": 0.0001317899280874158, "loss": 3.6002679443359376, "step": 162100 }, { "epoch": 1.6831487957495823, "grad_norm": 6.6382856369018555, "learning_rate": 0.00013168615812466923, "loss": 3.5055661010742187, "step": 162200 }, { "epoch": 1.6841864953770482, "grad_norm": 4.067321300506592, "learning_rate": 0.00013158238816192262, "loss": 3.6370770263671877, "step": 162300 }, { "epoch": 1.6852241950045141, "grad_norm": 6.839338779449463, "learning_rate": 0.00013147861819917604, "loss": 3.68888671875, "step": 162400 }, { "epoch": 1.6862618946319798, "grad_norm": 4.304868221282959, "learning_rate": 0.00013137484823642947, "loss": 3.5517013549804686, "step": 162500 }, { "epoch": 1.6872995942594455, "grad_norm": 6.149030685424805, "learning_rate": 0.0001312710782736829, "loss": 3.535697326660156, "step": 162600 }, { "epoch": 1.6883372938869114, "grad_norm": 3.3684825897216797, "learning_rate": 0.0001311673083109363, "loss": 3.4286175537109376, "step": 162700 }, { "epoch": 1.6893749935143774, "grad_norm": 3.4294440746307373, "learning_rate": 0.00013106353834818973, "loss": 3.443184509277344, "step": 162800 }, { "epoch": 1.6904126931418433, "grad_norm": 4.177918434143066, "learning_rate": 0.00013095976838544313, "loss": 3.6785324096679686, "step": 162900 }, { "epoch": 1.691450392769309, "grad_norm": 3.914222478866577, "learning_rate": 0.00013085599842269655, "loss": 3.6343704223632813, "step": 163000 }, { "epoch": 1.6924880923967747, "grad_norm": 10.268918991088867, "learning_rate": 0.00013075222845994997, "loss": 3.625147399902344, "step": 163100 }, { "epoch": 1.6935257920242406, "grad_norm": 3.8632876873016357, "learning_rate": 0.0001306484584972034, "loss": 3.62834228515625, "step": 163200 }, { "epoch": 1.6945634916517065, "grad_norm": 3.8029658794403076, "learning_rate": 0.00013054468853445681, "loss": 3.4555462646484374, "step": 163300 }, { "epoch": 1.6956011912791724, "grad_norm": 3.983098030090332, "learning_rate": 0.00013044091857171024, "loss": 
3.6773056030273437, "step": 163400 }, { "epoch": 1.6966388909066383, "grad_norm": 3.1625497341156006, "learning_rate": 0.00013033714860896363, "loss": 3.525480041503906, "step": 163500 }, { "epoch": 1.697676590534104, "grad_norm": 6.201349258422852, "learning_rate": 0.00013023337864621705, "loss": 3.626365051269531, "step": 163600 }, { "epoch": 1.6987142901615697, "grad_norm": 4.032458782196045, "learning_rate": 0.00013012960868347048, "loss": 3.5092694091796877, "step": 163700 }, { "epoch": 1.6997519897890356, "grad_norm": 3.9698915481567383, "learning_rate": 0.0001300258387207239, "loss": 3.273734436035156, "step": 163800 }, { "epoch": 1.7007896894165015, "grad_norm": 9.877572059631348, "learning_rate": 0.00012992206875797732, "loss": 3.576407775878906, "step": 163900 }, { "epoch": 1.7018273890439675, "grad_norm": 14.561692237854004, "learning_rate": 0.00012981829879523071, "loss": 3.6638983154296874, "step": 164000 }, { "epoch": 1.7028650886714332, "grad_norm": 2.6718385219573975, "learning_rate": 0.00012971452883248414, "loss": 3.671317138671875, "step": 164100 }, { "epoch": 1.7039027882988989, "grad_norm": 3.6662535667419434, "learning_rate": 0.00012961075886973756, "loss": 3.648578796386719, "step": 164200 }, { "epoch": 1.7049404879263648, "grad_norm": 4.04230260848999, "learning_rate": 0.00012950698890699095, "loss": 3.4332769775390624, "step": 164300 }, { "epoch": 1.7059781875538307, "grad_norm": 9.336248397827148, "learning_rate": 0.00012940321894424437, "loss": 3.6213333129882814, "step": 164400 }, { "epoch": 1.7070158871812966, "grad_norm": 5.882486820220947, "learning_rate": 0.00012929944898149782, "loss": 3.525044250488281, "step": 164500 }, { "epoch": 1.7080535868087623, "grad_norm": 6.984238624572754, "learning_rate": 0.00012919567901875122, "loss": 3.6717626953125, "step": 164600 }, { "epoch": 1.7090912864362282, "grad_norm": 19.616052627563477, "learning_rate": 0.00012909190905600464, "loss": 3.5099832153320314, "step": 164700 }, { "epoch": 
1.710128986063694, "grad_norm": 8.419858932495117, "learning_rate": 0.00012898813909325806, "loss": 3.624603576660156, "step": 164800 }, { "epoch": 1.7111666856911598, "grad_norm": 3.145763397216797, "learning_rate": 0.00012888436913051146, "loss": 3.5627670288085938, "step": 164900 }, { "epoch": 1.7122043853186257, "grad_norm": 2.620919704437256, "learning_rate": 0.00012878059916776488, "loss": 3.556968994140625, "step": 165000 }, { "epoch": 1.7132420849460916, "grad_norm": 3.6687073707580566, "learning_rate": 0.0001286768292050183, "loss": 3.590003662109375, "step": 165100 }, { "epoch": 1.7142797845735573, "grad_norm": 3.51960825920105, "learning_rate": 0.00012857305924227172, "loss": 3.443156433105469, "step": 165200 }, { "epoch": 1.715317484201023, "grad_norm": 7.178112030029297, "learning_rate": 0.00012846928927952514, "loss": 3.5516900634765625, "step": 165300 }, { "epoch": 1.716355183828489, "grad_norm": 3.60011887550354, "learning_rate": 0.00012836551931677857, "loss": 3.5771609497070314, "step": 165400 }, { "epoch": 1.7173928834559549, "grad_norm": 5.902312278747559, "learning_rate": 0.00012826174935403196, "loss": 3.590467529296875, "step": 165500 }, { "epoch": 1.7184305830834208, "grad_norm": 2.6880180835723877, "learning_rate": 0.00012815797939128538, "loss": 3.6772579956054687, "step": 165600 }, { "epoch": 1.7194682827108865, "grad_norm": 4.136773109436035, "learning_rate": 0.0001280542094285388, "loss": 3.7336956787109377, "step": 165700 }, { "epoch": 1.7205059823383524, "grad_norm": 5.155696392059326, "learning_rate": 0.00012795043946579223, "loss": 3.4659573364257814, "step": 165800 }, { "epoch": 1.721543681965818, "grad_norm": 5.531459331512451, "learning_rate": 0.00012784666950304565, "loss": 3.4835992431640626, "step": 165900 }, { "epoch": 1.722581381593284, "grad_norm": 6.343237400054932, "learning_rate": 0.00012774289954029904, "loss": 3.5382821655273435, "step": 166000 }, { "epoch": 1.72361908122075, "grad_norm": 2.731682538986206, 
"learning_rate": 0.00012763912957755247, "loss": 3.426122131347656, "step": 166100 }, { "epoch": 1.7246567808482158, "grad_norm": 5.487903594970703, "learning_rate": 0.0001275353596148059, "loss": 3.5763626098632812, "step": 166200 }, { "epoch": 1.7256944804756815, "grad_norm": 6.798583984375, "learning_rate": 0.00012743158965205928, "loss": 3.439584045410156, "step": 166300 }, { "epoch": 1.7267321801031472, "grad_norm": 18.596773147583008, "learning_rate": 0.00012732781968931273, "loss": 3.4846591186523437, "step": 166400 }, { "epoch": 1.7277698797306131, "grad_norm": 9.826458930969238, "learning_rate": 0.00012722404972656615, "loss": 3.410422668457031, "step": 166500 }, { "epoch": 1.728807579358079, "grad_norm": 5.076817035675049, "learning_rate": 0.00012712027976381955, "loss": 3.5888720703125, "step": 166600 }, { "epoch": 1.729845278985545, "grad_norm": 2.289203405380249, "learning_rate": 0.00012701650980107297, "loss": 3.6262445068359375, "step": 166700 }, { "epoch": 1.7308829786130107, "grad_norm": 2.4246132373809814, "learning_rate": 0.0001269127398383264, "loss": 3.5331646728515627, "step": 166800 }, { "epoch": 1.7319206782404764, "grad_norm": 20.16929054260254, "learning_rate": 0.0001268089698755798, "loss": 3.3934396362304686, "step": 166900 }, { "epoch": 1.7329583778679423, "grad_norm": 4.409317970275879, "learning_rate": 0.00012670519991283324, "loss": 3.46904052734375, "step": 167000 }, { "epoch": 1.7339960774954082, "grad_norm": 3.533935308456421, "learning_rate": 0.00012660142995008663, "loss": 3.6115313720703126, "step": 167100 }, { "epoch": 1.735033777122874, "grad_norm": 3.760765790939331, "learning_rate": 0.00012649765998734005, "loss": 3.7661947631835937, "step": 167200 }, { "epoch": 1.7360714767503398, "grad_norm": 3.174926996231079, "learning_rate": 0.00012639389002459348, "loss": 3.4038616943359377, "step": 167300 }, { "epoch": 1.7371091763778057, "grad_norm": 4.701259136199951, "learning_rate": 0.00012629012006184687, "loss": 
3.575841064453125, "step": 167400 }, { "epoch": 1.7381468760052714, "grad_norm": 4.684348106384277, "learning_rate": 0.0001261863500991003, "loss": 3.650244140625, "step": 167500 }, { "epoch": 1.7391845756327373, "grad_norm": 5.04356575012207, "learning_rate": 0.00012608258013635374, "loss": 3.512914733886719, "step": 167600 }, { "epoch": 1.7402222752602032, "grad_norm": 4.33563232421875, "learning_rate": 0.00012597881017360714, "loss": 3.462794189453125, "step": 167700 }, { "epoch": 1.7412599748876691, "grad_norm": 3.108952522277832, "learning_rate": 0.00012587504021086056, "loss": 3.6481967163085938, "step": 167800 }, { "epoch": 1.7422976745151348, "grad_norm": 7.204711437225342, "learning_rate": 0.00012577127024811398, "loss": 3.3575787353515625, "step": 167900 }, { "epoch": 1.7433353741426005, "grad_norm": 9.035337448120117, "learning_rate": 0.00012566750028536737, "loss": 3.5675091552734375, "step": 168000 }, { "epoch": 1.7443730737700665, "grad_norm": 5.063663005828857, "learning_rate": 0.0001255637303226208, "loss": 3.48505615234375, "step": 168100 }, { "epoch": 1.7454107733975324, "grad_norm": 3.2425074577331543, "learning_rate": 0.00012545996035987422, "loss": 3.6897207641601564, "step": 168200 }, { "epoch": 1.7464484730249983, "grad_norm": 5.356579303741455, "learning_rate": 0.00012535619039712764, "loss": 3.5673727416992187, "step": 168300 }, { "epoch": 1.747486172652464, "grad_norm": 4.124982833862305, "learning_rate": 0.00012525242043438106, "loss": 3.512673034667969, "step": 168400 }, { "epoch": 1.74852387227993, "grad_norm": 4.768991470336914, "learning_rate": 0.00012514865047163448, "loss": 3.5959738159179686, "step": 168500 }, { "epoch": 1.7495615719073956, "grad_norm": 9.657281875610352, "learning_rate": 0.00012504488050888788, "loss": 3.528682861328125, "step": 168600 }, { "epoch": 1.7505992715348615, "grad_norm": 2.538902759552002, "learning_rate": 0.0001249411105461413, "loss": 3.4649612426757814, "step": 168700 }, { "epoch": 
1.7516369711623274, "grad_norm": 4.286279201507568, "learning_rate": 0.00012483734058339472, "loss": 3.5286309814453123, "step": 168800 }, { "epoch": 1.7526746707897933, "grad_norm": 15.081319808959961, "learning_rate": 0.00012473357062064814, "loss": 3.492412414550781, "step": 168900 }, { "epoch": 1.753712370417259, "grad_norm": 2.91190767288208, "learning_rate": 0.00012462980065790157, "loss": 3.4919317626953124, "step": 169000 }, { "epoch": 1.7547500700447247, "grad_norm": 3.788306713104248, "learning_rate": 0.00012452603069515496, "loss": 3.587347412109375, "step": 169100 }, { "epoch": 1.7557877696721906, "grad_norm": 4.830081462860107, "learning_rate": 0.00012442226073240838, "loss": 3.6080587768554686, "step": 169200 }, { "epoch": 1.7568254692996566, "grad_norm": 4.777892112731934, "learning_rate": 0.0001243184907696618, "loss": 3.653542175292969, "step": 169300 }, { "epoch": 1.7578631689271225, "grad_norm": 8.966485977172852, "learning_rate": 0.0001242147208069152, "loss": 3.55691650390625, "step": 169400 }, { "epoch": 1.7589008685545882, "grad_norm": 1.9701244831085205, "learning_rate": 0.00012411095084416865, "loss": 3.587906799316406, "step": 169500 }, { "epoch": 1.759938568182054, "grad_norm": 12.719783782958984, "learning_rate": 0.00012400718088142207, "loss": 3.413060302734375, "step": 169600 }, { "epoch": 1.7609762678095198, "grad_norm": 3.8632144927978516, "learning_rate": 0.00012390341091867547, "loss": 3.6044146728515627, "step": 169700 }, { "epoch": 1.7620139674369857, "grad_norm": 5.806576251983643, "learning_rate": 0.0001237996409559289, "loss": 3.59072509765625, "step": 169800 }, { "epoch": 1.7630516670644516, "grad_norm": 7.052939414978027, "learning_rate": 0.0001236958709931823, "loss": 3.4161257934570313, "step": 169900 }, { "epoch": 1.7640893666919175, "grad_norm": 4.090539455413818, "learning_rate": 0.0001235921010304357, "loss": 3.4862603759765625, "step": 170000 }, { "epoch": 1.7651270663193832, "grad_norm": 8.032806396484375, 
"learning_rate": 0.00012348833106768913, "loss": 3.5226229858398437, "step": 170100 }, { "epoch": 1.766164765946849, "grad_norm": 7.900229454040527, "learning_rate": 0.00012338456110494255, "loss": 3.428408203125, "step": 170200 }, { "epoch": 1.7672024655743148, "grad_norm": 3.3465304374694824, "learning_rate": 0.00012328079114219597, "loss": 3.4806304931640626, "step": 170300 }, { "epoch": 1.7682401652017807, "grad_norm": 2.737323522567749, "learning_rate": 0.0001231770211794494, "loss": 3.5239492797851564, "step": 170400 }, { "epoch": 1.7692778648292466, "grad_norm": 5.74827766418457, "learning_rate": 0.00012307325121670281, "loss": 3.5097976684570313, "step": 170500 }, { "epoch": 1.7703155644567123, "grad_norm": 6.033031463623047, "learning_rate": 0.0001229694812539562, "loss": 3.4570046997070314, "step": 170600 }, { "epoch": 1.771353264084178, "grad_norm": 8.032061576843262, "learning_rate": 0.00012286571129120963, "loss": 3.560968017578125, "step": 170700 }, { "epoch": 1.772390963711644, "grad_norm": 4.955009460449219, "learning_rate": 0.00012276194132846305, "loss": 3.54818115234375, "step": 170800 }, { "epoch": 1.7734286633391099, "grad_norm": 10.685212135314941, "learning_rate": 0.00012265817136571647, "loss": 3.5968731689453124, "step": 170900 }, { "epoch": 1.7744663629665758, "grad_norm": 6.002890110015869, "learning_rate": 0.0001225544014029699, "loss": 3.6380169677734373, "step": 171000 }, { "epoch": 1.7755040625940415, "grad_norm": 2.442901849746704, "learning_rate": 0.0001224506314402233, "loss": 3.546981201171875, "step": 171100 }, { "epoch": 1.7765417622215074, "grad_norm": 7.106812000274658, "learning_rate": 0.0001223468614774767, "loss": 3.4353497314453123, "step": 171200 }, { "epoch": 1.777579461848973, "grad_norm": 4.951285362243652, "learning_rate": 0.00012224309151473014, "loss": 3.5619387817382813, "step": 171300 }, { "epoch": 1.778617161476439, "grad_norm": 4.533148765563965, "learning_rate": 0.00012213932155198356, "loss": 
3.4085040283203125, "step": 171400 }, { "epoch": 1.779654861103905, "grad_norm": 3.1281020641326904, "learning_rate": 0.00012203555158923698, "loss": 3.5755316162109376, "step": 171500 }, { "epoch": 1.7806925607313708, "grad_norm": 3.2438437938690186, "learning_rate": 0.00012193178162649039, "loss": 3.419034118652344, "step": 171600 }, { "epoch": 1.7817302603588365, "grad_norm": 6.113760948181152, "learning_rate": 0.0001218280116637438, "loss": 3.4608731079101562, "step": 171700 }, { "epoch": 1.7827679599863022, "grad_norm": 3.805856227874756, "learning_rate": 0.00012172424170099722, "loss": 3.542497253417969, "step": 171800 }, { "epoch": 1.7838056596137681, "grad_norm": 11.923066139221191, "learning_rate": 0.00012162047173825063, "loss": 3.5580120849609376, "step": 171900 }, { "epoch": 1.784843359241234, "grad_norm": 7.653703212738037, "learning_rate": 0.00012151670177550405, "loss": 3.464120178222656, "step": 172000 }, { "epoch": 1.7858810588687, "grad_norm": 4.955140113830566, "learning_rate": 0.00012141293181275747, "loss": 3.5985858154296877, "step": 172100 }, { "epoch": 1.7869187584961657, "grad_norm": 2.7006173133850098, "learning_rate": 0.00012130916185001089, "loss": 3.608409423828125, "step": 172200 }, { "epoch": 1.7879564581236316, "grad_norm": 10.799352645874023, "learning_rate": 0.0001212053918872643, "loss": 3.5314166259765627, "step": 172300 }, { "epoch": 1.7889941577510973, "grad_norm": 2.7497682571411133, "learning_rate": 0.00012110162192451772, "loss": 3.5095343017578124, "step": 172400 }, { "epoch": 1.7900318573785632, "grad_norm": 3.47670316696167, "learning_rate": 0.00012099785196177113, "loss": 3.508272705078125, "step": 172500 }, { "epoch": 1.791069557006029, "grad_norm": 5.199550151824951, "learning_rate": 0.00012089408199902454, "loss": 3.503916015625, "step": 172600 }, { "epoch": 1.792107256633495, "grad_norm": 5.3487043380737305, "learning_rate": 0.00012079031203627797, "loss": 3.63627685546875, "step": 172700 }, { "epoch": 
1.7931449562609607, "grad_norm": 4.6182074546813965, "learning_rate": 0.00012068654207353138, "loss": 3.517708740234375, "step": 172800 }, { "epoch": 1.7941826558884264, "grad_norm": 2.607217788696289, "learning_rate": 0.0001205827721107848, "loss": 3.555519714355469, "step": 172900 }, { "epoch": 1.7952203555158923, "grad_norm": 9.180208206176758, "learning_rate": 0.00012047900214803821, "loss": 3.5748587036132813, "step": 173000 }, { "epoch": 1.7962580551433582, "grad_norm": 5.080584526062012, "learning_rate": 0.00012037523218529164, "loss": 3.5299716186523438, "step": 173100 }, { "epoch": 1.7972957547708241, "grad_norm": 2.5319409370422363, "learning_rate": 0.00012027146222254504, "loss": 3.5544561767578124, "step": 173200 }, { "epoch": 1.7983334543982898, "grad_norm": 4.81158447265625, "learning_rate": 0.00012016769225979848, "loss": 3.6039208984375, "step": 173300 }, { "epoch": 1.7993711540257558, "grad_norm": null, "learning_rate": 0.00012006392229705189, "loss": 3.439290771484375, "step": 173400 }, { "epoch": 1.8004088536532215, "grad_norm": 2.6214425563812256, "learning_rate": 0.00011996015233430531, "loss": 3.670186767578125, "step": 173500 }, { "epoch": 1.8014465532806874, "grad_norm": 2.7172493934631348, "learning_rate": 0.00011985638237155872, "loss": 3.5838592529296873, "step": 173600 }, { "epoch": 1.8024842529081533, "grad_norm": 8.898774147033691, "learning_rate": 0.00011975261240881213, "loss": 3.4800985717773436, "step": 173700 }, { "epoch": 1.8035219525356192, "grad_norm": 3.5623104572296143, "learning_rate": 0.00011964884244606555, "loss": 3.511365966796875, "step": 173800 }, { "epoch": 1.804559652163085, "grad_norm": 8.46833610534668, "learning_rate": 0.00011954507248331896, "loss": 3.7088421630859374, "step": 173900 }, { "epoch": 1.8055973517905506, "grad_norm": 5.097702980041504, "learning_rate": 0.00011944130252057239, "loss": 3.6202734375, "step": 174000 }, { "epoch": 1.8066350514180165, "grad_norm": 2.758472204208374, "learning_rate": 
0.0001193375325578258, "loss": 3.561451721191406, "step": 174100 }, { "epoch": 1.8076727510454824, "grad_norm": 10.48659610748291, "learning_rate": 0.00011923376259507922, "loss": 3.5661395263671873, "step": 174200 }, { "epoch": 1.8087104506729483, "grad_norm": 4.996297836303711, "learning_rate": 0.00011912999263233263, "loss": 3.680464782714844, "step": 174300 }, { "epoch": 1.809748150300414, "grad_norm": 3.927097797393799, "learning_rate": 0.00011902622266958605, "loss": 3.4924087524414062, "step": 174400 }, { "epoch": 1.8107858499278797, "grad_norm": 9.367024421691895, "learning_rate": 0.00011892245270683946, "loss": 3.4610064697265623, "step": 174500 }, { "epoch": 1.8118235495553456, "grad_norm": 2.7783424854278564, "learning_rate": 0.0001188186827440929, "loss": 3.411673583984375, "step": 174600 }, { "epoch": 1.8128612491828116, "grad_norm": 8.61545181274414, "learning_rate": 0.0001187149127813463, "loss": 3.5328875732421876, "step": 174700 }, { "epoch": 1.8138989488102775, "grad_norm": 7.4906182289123535, "learning_rate": 0.00011861114281859971, "loss": 3.376343078613281, "step": 174800 }, { "epoch": 1.8149366484377432, "grad_norm": 1.9939513206481934, "learning_rate": 0.00011850737285585314, "loss": 3.428880615234375, "step": 174900 }, { "epoch": 1.815974348065209, "grad_norm": 6.011395454406738, "learning_rate": 0.00011840360289310654, "loss": 3.5798504638671873, "step": 175000 }, { "epoch": 1.8170120476926748, "grad_norm": 2.0973944664001465, "learning_rate": 0.00011829983293035997, "loss": 3.5833367919921875, "step": 175100 }, { "epoch": 1.8180497473201407, "grad_norm": 4.992910861968994, "learning_rate": 0.00011819606296761339, "loss": 3.6261285400390624, "step": 175200 }, { "epoch": 1.8190874469476066, "grad_norm": 89.73089599609375, "learning_rate": 0.00011809229300486681, "loss": 3.390103454589844, "step": 175300 }, { "epoch": 1.8201251465750725, "grad_norm": 4.343557834625244, "learning_rate": 0.00011798852304212022, "loss": 3.6147576904296876, 
"step": 175400 }, { "epoch": 1.8211628462025382, "grad_norm": null, "learning_rate": 0.00011788475307937364, "loss": 3.5382080078125, "step": 175500 }, { "epoch": 1.822200545830004, "grad_norm": 8.41909408569336, "learning_rate": 0.00011778098311662705, "loss": 3.5090155029296874, "step": 175600 }, { "epoch": 1.8232382454574698, "grad_norm": 7.508602619171143, "learning_rate": 0.00011767721315388046, "loss": 3.540501708984375, "step": 175700 }, { "epoch": 1.8242759450849357, "grad_norm": 2.713555335998535, "learning_rate": 0.00011757344319113388, "loss": 3.5669735717773436, "step": 175800 }, { "epoch": 1.8253136447124017, "grad_norm": 9.780903816223145, "learning_rate": 0.0001174696732283873, "loss": 3.542893981933594, "step": 175900 }, { "epoch": 1.8263513443398673, "grad_norm": 2.6435556411743164, "learning_rate": 0.00011736590326564072, "loss": 3.6134707641601564, "step": 176000 }, { "epoch": 1.8273890439673333, "grad_norm": 3.3884384632110596, "learning_rate": 0.00011726213330289413, "loss": 3.574878845214844, "step": 176100 }, { "epoch": 1.828426743594799, "grad_norm": 4.323862552642822, "learning_rate": 0.00011715836334014755, "loss": 3.5432839965820313, "step": 176200 }, { "epoch": 1.8294644432222649, "grad_norm": 6.794419765472412, "learning_rate": 0.00011705459337740096, "loss": 3.4334552001953127, "step": 176300 }, { "epoch": 1.8305021428497308, "grad_norm": 3.3329992294311523, "learning_rate": 0.00011695082341465438, "loss": 3.511024169921875, "step": 176400 }, { "epoch": 1.8315398424771967, "grad_norm": 6.582189083099365, "learning_rate": 0.0001168470534519078, "loss": 3.4057382202148436, "step": 176500 }, { "epoch": 1.8325775421046624, "grad_norm": 3.5420665740966797, "learning_rate": 0.00011674328348916123, "loss": 3.530198974609375, "step": 176600 }, { "epoch": 1.833615241732128, "grad_norm": 3.2835450172424316, "learning_rate": 0.00011663951352641463, "loss": 3.3689605712890627, "step": 176700 }, { "epoch": 1.834652941359594, "grad_norm": 
4.352384567260742, "learning_rate": 0.00011653574356366804, "loss": 3.5228622436523436, "step": 176800 }, { "epoch": 1.83569064098706, "grad_norm": 6.940867900848389, "learning_rate": 0.00011643197360092147, "loss": 3.422699279785156, "step": 176900 }, { "epoch": 1.8367283406145258, "grad_norm": 9.627628326416016, "learning_rate": 0.00011632820363817487, "loss": 3.4203256225585936, "step": 177000 }, { "epoch": 1.8377660402419915, "grad_norm": 7.819676399230957, "learning_rate": 0.00011622443367542831, "loss": 3.569815673828125, "step": 177100 }, { "epoch": 1.8388037398694572, "grad_norm": 3.4782094955444336, "learning_rate": 0.00011612066371268172, "loss": 3.5252569580078124, "step": 177200 }, { "epoch": 1.8398414394969231, "grad_norm": 9.448952674865723, "learning_rate": 0.00011601689374993514, "loss": 3.43080322265625, "step": 177300 }, { "epoch": 1.840879139124389, "grad_norm": 5.754225730895996, "learning_rate": 0.00011591312378718855, "loss": 3.45312744140625, "step": 177400 }, { "epoch": 1.841916838751855, "grad_norm": 2.9918229579925537, "learning_rate": 0.00011580935382444197, "loss": 3.548991394042969, "step": 177500 }, { "epoch": 1.8429545383793209, "grad_norm": 4.406205177307129, "learning_rate": 0.00011570558386169538, "loss": 3.5221047973632813, "step": 177600 }, { "epoch": 1.8439922380067866, "grad_norm": 3.79978346824646, "learning_rate": 0.00011560181389894879, "loss": 3.4272702026367186, "step": 177700 }, { "epoch": 1.8450299376342523, "grad_norm": 8.362844467163086, "learning_rate": 0.00011549804393620222, "loss": 3.496314697265625, "step": 177800 }, { "epoch": 1.8460676372617182, "grad_norm": 4.00974702835083, "learning_rate": 0.00011539427397345563, "loss": 3.4739456176757812, "step": 177900 }, { "epoch": 1.847105336889184, "grad_norm": 4.4382853507995605, "learning_rate": 0.00011529050401070905, "loss": 3.637906799316406, "step": 178000 }, { "epoch": 1.84814303651665, "grad_norm": 3.4561121463775635, "learning_rate": 0.00011518673404796246, 
"loss": 3.4582669067382814, "step": 178100 }, { "epoch": 1.8491807361441157, "grad_norm": 9.542756080627441, "learning_rate": 0.00011508296408521588, "loss": 3.5665469360351563, "step": 178200 }, { "epoch": 1.8502184357715814, "grad_norm": 5.516635894775391, "learning_rate": 0.00011497919412246929, "loss": 3.5199371337890626, "step": 178300 }, { "epoch": 1.8512561353990473, "grad_norm": 10.64023494720459, "learning_rate": 0.00011487542415972273, "loss": 3.605532531738281, "step": 178400 }, { "epoch": 1.8522938350265132, "grad_norm": 3.7197024822235107, "learning_rate": 0.00011477165419697613, "loss": 3.5585647583007813, "step": 178500 }, { "epoch": 1.8533315346539792, "grad_norm": 8.84176254272461, "learning_rate": 0.00011466788423422956, "loss": 3.469338684082031, "step": 178600 }, { "epoch": 1.8543692342814448, "grad_norm": 13.789299011230469, "learning_rate": 0.00011456411427148297, "loss": 3.654618835449219, "step": 178700 }, { "epoch": 1.8554069339089108, "grad_norm": 3.7758259773254395, "learning_rate": 0.00011446034430873637, "loss": 3.511930236816406, "step": 178800 }, { "epoch": 1.8564446335363765, "grad_norm": 4.542521953582764, "learning_rate": 0.0001143565743459898, "loss": 3.572850341796875, "step": 178900 }, { "epoch": 1.8574823331638424, "grad_norm": 7.155478477478027, "learning_rate": 0.00011425280438324322, "loss": 3.6194467163085937, "step": 179000 }, { "epoch": 1.8585200327913083, "grad_norm": 5.109609603881836, "learning_rate": 0.00011414903442049664, "loss": 3.585841064453125, "step": 179100 }, { "epoch": 1.8595577324187742, "grad_norm": 4.251883506774902, "learning_rate": 0.00011404526445775005, "loss": 3.4581594848632813, "step": 179200 }, { "epoch": 1.86059543204624, "grad_norm": 22.98354148864746, "learning_rate": 0.00011394149449500347, "loss": 3.47680419921875, "step": 179300 }, { "epoch": 1.8616331316737056, "grad_norm": 4.897403240203857, "learning_rate": 0.00011383772453225688, "loss": 3.5364599609375, "step": 179400 }, { "epoch": 
1.8626708313011715, "grad_norm": 11.166070938110352, "learning_rate": 0.0001137339545695103, "loss": 3.4703445434570312, "step": 179500 }, { "epoch": 1.8637085309286374, "grad_norm": 3.64528226852417, "learning_rate": 0.00011363018460676372, "loss": 3.612529296875, "step": 179600 }, { "epoch": 1.8647462305561033, "grad_norm": 3.4828524589538574, "learning_rate": 0.00011352641464401714, "loss": 3.622635803222656, "step": 179700 }, { "epoch": 1.865783930183569, "grad_norm": 4.965012550354004, "learning_rate": 0.00011342264468127055, "loss": 3.420509033203125, "step": 179800 }, { "epoch": 1.866821629811035, "grad_norm": 6.657770156860352, "learning_rate": 0.00011331887471852396, "loss": 3.57205810546875, "step": 179900 }, { "epoch": 1.8678593294385006, "grad_norm": 6.785094738006592, "learning_rate": 0.00011321510475577738, "loss": 3.613439025878906, "step": 180000 }, { "epoch": 1.8688970290659666, "grad_norm": 3.2131218910217285, "learning_rate": 0.00011311133479303079, "loss": 3.721015625, "step": 180100 }, { "epoch": 1.8699347286934325, "grad_norm": 3.327937364578247, "learning_rate": 0.00011300756483028421, "loss": 3.47718017578125, "step": 180200 }, { "epoch": 1.8709724283208984, "grad_norm": 8.65044116973877, "learning_rate": 0.00011290379486753763, "loss": 3.6089404296875, "step": 180300 }, { "epoch": 1.872010127948364, "grad_norm": 2.0018603801727295, "learning_rate": 0.00011280002490479106, "loss": 3.3825112915039064, "step": 180400 }, { "epoch": 1.8730478275758298, "grad_norm": 2.7814066410064697, "learning_rate": 0.00011269625494204446, "loss": 3.4428082275390626, "step": 180500 }, { "epoch": 1.8740855272032957, "grad_norm": 2.5407564640045166, "learning_rate": 0.00011259248497929789, "loss": 3.6512811279296873, "step": 180600 }, { "epoch": 1.8751232268307616, "grad_norm": 3.6118102073669434, "learning_rate": 0.0001124887150165513, "loss": 3.4491305541992188, "step": 180700 }, { "epoch": 1.8761609264582275, "grad_norm": 4.681710720062256, "learning_rate": 
0.0001123849450538047, "loss": 3.5399176025390626, "step": 180800 }, { "epoch": 1.8771986260856932, "grad_norm": 5.6345062255859375, "learning_rate": 0.00011228117509105814, "loss": 3.580292053222656, "step": 180900 }, { "epoch": 1.878236325713159, "grad_norm": 4.881344318389893, "learning_rate": 0.00011217740512831155, "loss": 3.5553582763671874, "step": 181000 }, { "epoch": 1.8792740253406248, "grad_norm": 3.3916895389556885, "learning_rate": 0.00011207363516556497, "loss": 3.468414001464844, "step": 181100 }, { "epoch": 1.8803117249680907, "grad_norm": 4.611287593841553, "learning_rate": 0.00011196986520281838, "loss": 3.420959167480469, "step": 181200 }, { "epoch": 1.8813494245955567, "grad_norm": 3.4268012046813965, "learning_rate": 0.0001118660952400718, "loss": 3.614518737792969, "step": 181300 }, { "epoch": 1.8823871242230226, "grad_norm": 9.675979614257812, "learning_rate": 0.00011176232527732521, "loss": 3.460643310546875, "step": 181400 }, { "epoch": 1.8834248238504883, "grad_norm": 4.765254497528076, "learning_rate": 0.00011165855531457864, "loss": 3.5331201171875, "step": 181500 }, { "epoch": 1.884462523477954, "grad_norm": 12.958268165588379, "learning_rate": 0.00011155478535183205, "loss": 3.458702392578125, "step": 181600 }, { "epoch": 1.8855002231054199, "grad_norm": 3.9760847091674805, "learning_rate": 0.00011145101538908547, "loss": 3.5144024658203126, "step": 181700 }, { "epoch": 1.8865379227328858, "grad_norm": 3.063124656677246, "learning_rate": 0.00011134724542633888, "loss": 3.3591217041015624, "step": 181800 }, { "epoch": 1.8875756223603517, "grad_norm": 14.115145683288574, "learning_rate": 0.00011124347546359229, "loss": 3.5416494750976564, "step": 181900 }, { "epoch": 1.8886133219878174, "grad_norm": 2.602299213409424, "learning_rate": 0.00011113970550084571, "loss": 3.4190499877929685, "step": 182000 }, { "epoch": 1.889651021615283, "grad_norm": 6.7280168533325195, "learning_rate": 0.00011103593553809912, "loss": 3.3795068359375, "step": 
182100 }, { "epoch": 1.890688721242749, "grad_norm": 6.911862850189209, "learning_rate": 0.00011093216557535256, "loss": 3.5166439819335937, "step": 182200 }, { "epoch": 1.891726420870215, "grad_norm": 6.751010894775391, "learning_rate": 0.00011082839561260596, "loss": 3.4338143920898436, "step": 182300 }, { "epoch": 1.8927641204976808, "grad_norm": 4.327939510345459, "learning_rate": 0.00011072462564985939, "loss": 3.4822421264648438, "step": 182400 }, { "epoch": 1.8938018201251465, "grad_norm": 2.485795259475708, "learning_rate": 0.0001106208556871128, "loss": 3.464154052734375, "step": 182500 }, { "epoch": 1.8948395197526124, "grad_norm": 104.476318359375, "learning_rate": 0.00011051708572436622, "loss": 3.4480935668945314, "step": 182600 }, { "epoch": 1.8958772193800781, "grad_norm": 2.829188346862793, "learning_rate": 0.00011041331576161963, "loss": 3.593952331542969, "step": 182700 }, { "epoch": 1.896914919007544, "grad_norm": 4.845984935760498, "learning_rate": 0.00011030954579887306, "loss": 3.244365234375, "step": 182800 }, { "epoch": 1.89795261863501, "grad_norm": 2.055333375930786, "learning_rate": 0.00011020577583612647, "loss": 3.5465518188476564, "step": 182900 }, { "epoch": 1.8989903182624759, "grad_norm": 19.445037841796875, "learning_rate": 0.00011010200587337988, "loss": 3.5760122680664064, "step": 183000 }, { "epoch": 1.9000280178899416, "grad_norm": 3.0907251834869385, "learning_rate": 0.0001099982359106333, "loss": 3.524999084472656, "step": 183100 }, { "epoch": 1.9010657175174073, "grad_norm": 1.9697469472885132, "learning_rate": 0.00010989446594788671, "loss": 3.4634637451171875, "step": 183200 }, { "epoch": 1.9021034171448732, "grad_norm": 6.751926898956299, "learning_rate": 0.00010979069598514013, "loss": 3.4596926879882814, "step": 183300 }, { "epoch": 1.903141116772339, "grad_norm": 2.561213493347168, "learning_rate": 0.00010968692602239355, "loss": 3.5389044189453127, "step": 183400 }, { "epoch": 1.904178816399805, "grad_norm": 
6.130541801452637, "learning_rate": 0.00010958315605964697, "loss": 3.4779763793945313, "step": 183500 }, { "epoch": 1.9052165160272707, "grad_norm": 3.2996444702148438, "learning_rate": 0.00010947938609690038, "loss": 3.4853436279296877, "step": 183600 }, { "epoch": 1.9062542156547366, "grad_norm": 4.535896301269531, "learning_rate": 0.0001093756161341538, "loss": 3.4235238647460937, "step": 183700 }, { "epoch": 1.9072919152822023, "grad_norm": 4.082485675811768, "learning_rate": 0.00010927184617140721, "loss": 3.4645541381835936, "step": 183800 }, { "epoch": 1.9083296149096682, "grad_norm": 5.501161098480225, "learning_rate": 0.00010916807620866062, "loss": 3.555899658203125, "step": 183900 }, { "epoch": 1.9093673145371342, "grad_norm": 7.624723434448242, "learning_rate": 0.00010906430624591404, "loss": 3.4653219604492187, "step": 184000 }, { "epoch": 1.9104050141646, "grad_norm": 3.386392116546631, "learning_rate": 0.00010896053628316746, "loss": 3.5530450439453123, "step": 184100 }, { "epoch": 1.9114427137920658, "grad_norm": 4.087791442871094, "learning_rate": 0.00010885676632042089, "loss": 3.470418701171875, "step": 184200 }, { "epoch": 1.9124804134195315, "grad_norm": 4.145429611206055, "learning_rate": 0.0001087529963576743, "loss": 3.416697692871094, "step": 184300 }, { "epoch": 1.9135181130469974, "grad_norm": 4.366927623748779, "learning_rate": 0.00010864922639492772, "loss": 3.4999765014648436, "step": 184400 }, { "epoch": 1.9145558126744633, "grad_norm": 4.084202289581299, "learning_rate": 0.00010854545643218113, "loss": 3.435041809082031, "step": 184500 }, { "epoch": 1.9155935123019292, "grad_norm": 9.935702323913574, "learning_rate": 0.00010844168646943455, "loss": 3.54100341796875, "step": 184600 }, { "epoch": 1.916631211929395, "grad_norm": 6.931925296783447, "learning_rate": 0.00010833791650668797, "loss": 3.5136874389648436, "step": 184700 }, { "epoch": 1.9176689115568606, "grad_norm": 3.0231878757476807, "learning_rate": 0.00010823414654394139, 
"loss": 3.6150555419921875, "step": 184800 }, { "epoch": 1.9187066111843265, "grad_norm": 3.3393242359161377, "learning_rate": 0.0001081303765811948, "loss": 3.479617004394531, "step": 184900 }, { "epoch": 1.9197443108117924, "grad_norm": 1.9449257850646973, "learning_rate": 0.00010802660661844821, "loss": 3.4772000122070312, "step": 185000 }, { "epoch": 1.9207820104392583, "grad_norm": 5.924251079559326, "learning_rate": 0.00010792283665570163, "loss": 3.558631591796875, "step": 185100 }, { "epoch": 1.9218197100667243, "grad_norm": 3.7242231369018555, "learning_rate": 0.00010781906669295504, "loss": 3.4901129150390626, "step": 185200 }, { "epoch": 1.92285740969419, "grad_norm": 4.291270732879639, "learning_rate": 0.00010771529673020847, "loss": 3.4830392456054686, "step": 185300 }, { "epoch": 1.9238951093216556, "grad_norm": 8.315948486328125, "learning_rate": 0.00010761152676746188, "loss": 3.654394226074219, "step": 185400 }, { "epoch": 1.9249328089491216, "grad_norm": 3.3864219188690186, "learning_rate": 0.0001075077568047153, "loss": 3.4916171264648437, "step": 185500 }, { "epoch": 1.9259705085765875, "grad_norm": 2.4446215629577637, "learning_rate": 0.00010740398684196871, "loss": 3.5801641845703127, "step": 185600 }, { "epoch": 1.9270082082040534, "grad_norm": 4.319270133972168, "learning_rate": 0.00010730021687922213, "loss": 3.485596008300781, "step": 185700 }, { "epoch": 1.928045907831519, "grad_norm": 12.243918418884277, "learning_rate": 0.00010719644691647554, "loss": 3.297283020019531, "step": 185800 }, { "epoch": 1.9290836074589848, "grad_norm": 3.614396333694458, "learning_rate": 0.00010709267695372895, "loss": 3.4672842407226563, "step": 185900 }, { "epoch": 1.9301213070864507, "grad_norm": 7.824878692626953, "learning_rate": 0.00010698890699098239, "loss": 3.5030999755859376, "step": 186000 }, { "epoch": 1.9311590067139166, "grad_norm": 11.845438003540039, "learning_rate": 0.0001068851370282358, "loss": 3.5722430419921873, "step": 186100 }, { 
"epoch": 1.9321967063413825, "grad_norm": 8.008241653442383, "learning_rate": 0.00010678136706548922, "loss": 3.4848983764648436, "step": 186200 }, { "epoch": 1.9332344059688482, "grad_norm": 38.26485824584961, "learning_rate": 0.00010667759710274262, "loss": 3.4654171752929686, "step": 186300 }, { "epoch": 1.9342721055963141, "grad_norm": 3.587207317352295, "learning_rate": 0.00010657382713999605, "loss": 3.443753967285156, "step": 186400 }, { "epoch": 1.9353098052237798, "grad_norm": 7.548192024230957, "learning_rate": 0.00010647005717724946, "loss": 3.555989074707031, "step": 186500 }, { "epoch": 1.9363475048512457, "grad_norm": 5.652491092681885, "learning_rate": 0.00010636628721450289, "loss": 3.5138848876953124, "step": 186600 }, { "epoch": 1.9373852044787117, "grad_norm": 4.181760311126709, "learning_rate": 0.0001062625172517563, "loss": 3.4649755859375, "step": 186700 }, { "epoch": 1.9384229041061776, "grad_norm": 39.51677703857422, "learning_rate": 0.00010615874728900972, "loss": 3.4170611572265623, "step": 186800 }, { "epoch": 1.9394606037336433, "grad_norm": 5.663796901702881, "learning_rate": 0.00010605497732626313, "loss": 3.6423403930664064, "step": 186900 }, { "epoch": 1.940498303361109, "grad_norm": 49.58971405029297, "learning_rate": 0.00010595120736351654, "loss": 3.556903076171875, "step": 187000 }, { "epoch": 1.9415360029885749, "grad_norm": 4.037705421447754, "learning_rate": 0.00010584743740076996, "loss": 3.581287536621094, "step": 187100 }, { "epoch": 1.9425737026160408, "grad_norm": 2.6354784965515137, "learning_rate": 0.00010574366743802338, "loss": 3.4927523803710936, "step": 187200 }, { "epoch": 1.9436114022435067, "grad_norm": 3.8889167308807373, "learning_rate": 0.0001056398974752768, "loss": 3.485701904296875, "step": 187300 }, { "epoch": 1.9446491018709724, "grad_norm": 6.694062232971191, "learning_rate": 0.00010553612751253021, "loss": 3.3910641479492187, "step": 187400 }, { "epoch": 1.9456868014984383, "grad_norm": 
5.231113910675049, "learning_rate": 0.00010543235754978363, "loss": 3.5116064453125, "step": 187500 }, { "epoch": 1.946724501125904, "grad_norm": 13.281269073486328, "learning_rate": 0.00010532858758703704, "loss": 3.5454452514648436, "step": 187600 }, { "epoch": 1.94776220075337, "grad_norm": 5.362813472747803, "learning_rate": 0.00010522481762429046, "loss": 3.5717642211914065, "step": 187700 }, { "epoch": 1.9487999003808358, "grad_norm": 3.0265583992004395, "learning_rate": 0.00010512104766154387, "loss": 3.529712829589844, "step": 187800 }, { "epoch": 1.9498376000083018, "grad_norm": 2.4003071784973145, "learning_rate": 0.00010501727769879731, "loss": 3.5179287719726564, "step": 187900 }, { "epoch": 1.9508752996357674, "grad_norm": 3.5519869327545166, "learning_rate": 0.00010491350773605072, "loss": 3.3665447998046876, "step": 188000 }, { "epoch": 1.9519129992632331, "grad_norm": 1.9300223588943481, "learning_rate": 0.00010480973777330412, "loss": 3.5477023315429688, "step": 188100 }, { "epoch": 1.952950698890699, "grad_norm": 3.3745410442352295, "learning_rate": 0.00010470596781055755, "loss": 3.5283209228515626, "step": 188200 }, { "epoch": 1.953988398518165, "grad_norm": 18.314775466918945, "learning_rate": 0.00010460219784781096, "loss": 3.4730484008789064, "step": 188300 }, { "epoch": 1.9550260981456309, "grad_norm": 4.006529331207275, "learning_rate": 0.00010449842788506438, "loss": 3.4675115966796874, "step": 188400 }, { "epoch": 1.9560637977730966, "grad_norm": 4.9441094398498535, "learning_rate": 0.0001043946579223178, "loss": 3.404721984863281, "step": 188500 }, { "epoch": 1.9571014974005623, "grad_norm": 3.18265962600708, "learning_rate": 0.00010429088795957122, "loss": 3.667085876464844, "step": 188600 }, { "epoch": 1.9581391970280282, "grad_norm": 3.0164151191711426, "learning_rate": 0.00010418711799682463, "loss": 3.5224847412109375, "step": 188700 }, { "epoch": 1.959176896655494, "grad_norm": 5.3650007247924805, "learning_rate": 
0.00010408334803407805, "loss": 3.4098544311523438, "step": 188800 }, { "epoch": 1.96021459628296, "grad_norm": 6.3775224685668945, "learning_rate": 0.00010397957807133146, "loss": 3.649906005859375, "step": 188900 }, { "epoch": 1.9612522959104257, "grad_norm": 18.32954978942871, "learning_rate": 0.00010387580810858487, "loss": 3.642203674316406, "step": 189000 }, { "epoch": 1.9622899955378916, "grad_norm": 3.267017126083374, "learning_rate": 0.0001037720381458383, "loss": 3.522268981933594, "step": 189100 }, { "epoch": 1.9633276951653573, "grad_norm": 3.3189854621887207, "learning_rate": 0.00010366826818309171, "loss": 3.525494384765625, "step": 189200 }, { "epoch": 1.9643653947928232, "grad_norm": 20.459917068481445, "learning_rate": 0.00010356449822034513, "loss": 3.4846673583984376, "step": 189300 }, { "epoch": 1.9654030944202892, "grad_norm": 10.600302696228027, "learning_rate": 0.00010346072825759854, "loss": 3.4710623168945314, "step": 189400 }, { "epoch": 1.966440794047755, "grad_norm": 5.836012363433838, "learning_rate": 0.00010335695829485196, "loss": 3.395472412109375, "step": 189500 }, { "epoch": 1.9674784936752208, "grad_norm": 1.8093000650405884, "learning_rate": 0.00010325318833210537, "loss": 3.4295391845703125, "step": 189600 }, { "epoch": 1.9685161933026865, "grad_norm": 3.580705165863037, "learning_rate": 0.0001031494183693588, "loss": 3.571369934082031, "step": 189700 }, { "epoch": 1.9695538929301524, "grad_norm": 4.870438575744629, "learning_rate": 0.00010304564840661222, "loss": 3.520045166015625, "step": 189800 }, { "epoch": 1.9705915925576183, "grad_norm": 3.781505823135376, "learning_rate": 0.00010294187844386564, "loss": 3.5424517822265624, "step": 189900 }, { "epoch": 1.9716292921850842, "grad_norm": 3.340085983276367, "learning_rate": 0.00010283810848111905, "loss": 3.518573913574219, "step": 190000 }, { "epoch": 1.97266699181255, "grad_norm": 5.02490234375, "learning_rate": 0.00010273433851837245, "loss": 3.3679263305664064, "step": 
190100 }, { "epoch": 1.9737046914400158, "grad_norm": 4.117876052856445, "learning_rate": 0.00010263056855562588, "loss": 3.5929489135742188, "step": 190200 }, { "epoch": 1.9747423910674815, "grad_norm": 3.8365478515625, "learning_rate": 0.00010252679859287929, "loss": 3.40560302734375, "step": 190300 }, { "epoch": 1.9757800906949474, "grad_norm": 7.205904006958008, "learning_rate": 0.00010242302863013272, "loss": 3.38099609375, "step": 190400 }, { "epoch": 1.9768177903224133, "grad_norm": 2.767961025238037, "learning_rate": 0.00010231925866738613, "loss": 3.4381674194335936, "step": 190500 }, { "epoch": 1.9778554899498793, "grad_norm": 4.335025310516357, "learning_rate": 0.00010221548870463955, "loss": 3.3964199829101562, "step": 190600 }, { "epoch": 1.978893189577345, "grad_norm": 4.294001579284668, "learning_rate": 0.00010211171874189296, "loss": 3.411571350097656, "step": 190700 }, { "epoch": 1.9799308892048106, "grad_norm": 3.6443490982055664, "learning_rate": 0.00010200794877914638, "loss": 3.4534707641601563, "step": 190800 }, { "epoch": 1.9809685888322766, "grad_norm": 4.729245662689209, "learning_rate": 0.00010190417881639979, "loss": 3.577586669921875, "step": 190900 }, { "epoch": 1.9820062884597425, "grad_norm": 3.587510108947754, "learning_rate": 0.00010180040885365323, "loss": 3.4148577880859374, "step": 191000 }, { "epoch": 1.9830439880872084, "grad_norm": 13.635988235473633, "learning_rate": 0.00010169663889090663, "loss": 3.531971435546875, "step": 191100 }, { "epoch": 1.984081687714674, "grad_norm": 4.0034356117248535, "learning_rate": 0.00010159286892816004, "loss": 3.464627685546875, "step": 191200 }, { "epoch": 1.98511938734214, "grad_norm": 4.326283931732178, "learning_rate": 0.00010148909896541346, "loss": 3.4689093017578125, "step": 191300 }, { "epoch": 1.9861570869696057, "grad_norm": 10.159041404724121, "learning_rate": 0.00010138532900266687, "loss": 3.4093603515625, "step": 191400 }, { "epoch": 1.9871947865970716, "grad_norm": 
6.295145511627197, "learning_rate": 0.0001012815590399203, "loss": 3.4013311767578127, "step": 191500 }, { "epoch": 1.9882324862245375, "grad_norm": 2.6228549480438232, "learning_rate": 0.0001011777890771737, "loss": 3.4039892578125, "step": 191600 }, { "epoch": 1.9892701858520034, "grad_norm": 2.0637784004211426, "learning_rate": 0.00010107401911442714, "loss": 3.4192919921875, "step": 191700 }, { "epoch": 1.9903078854794691, "grad_norm": 4.193583011627197, "learning_rate": 0.00010097024915168055, "loss": 3.5069757080078126, "step": 191800 }, { "epoch": 1.9913455851069348, "grad_norm": 3.6812117099761963, "learning_rate": 0.00010086647918893397, "loss": 3.421480712890625, "step": 191900 }, { "epoch": 1.9923832847344007, "grad_norm": 33.859195709228516, "learning_rate": 0.00010076270922618738, "loss": 3.506886291503906, "step": 192000 }, { "epoch": 1.9934209843618667, "grad_norm": 3.308947801589966, "learning_rate": 0.00010065893926344079, "loss": 3.424991455078125, "step": 192100 }, { "epoch": 1.9944586839893326, "grad_norm": 4.380412578582764, "learning_rate": 0.00010055516930069421, "loss": 3.4896340942382813, "step": 192200 }, { "epoch": 1.9954963836167983, "grad_norm": 3.492359161376953, "learning_rate": 0.00010045139933794763, "loss": 3.403392333984375, "step": 192300 }, { "epoch": 1.996534083244264, "grad_norm": 8.865891456604004, "learning_rate": 0.00010034762937520105, "loss": 3.60391845703125, "step": 192400 }, { "epoch": 1.9975717828717299, "grad_norm": 1.982731819152832, "learning_rate": 0.00010024385941245446, "loss": 3.5614895629882812, "step": 192500 }, { "epoch": 1.9986094824991958, "grad_norm": 2.9287161827087402, "learning_rate": 0.00010014008944970788, "loss": 3.5097760009765624, "step": 192600 }, { "epoch": 1.9996471821266617, "grad_norm": 1.8267062902450562, "learning_rate": 0.00010003631948696129, "loss": 3.4958160400390623, "step": 192700 } ], "logging_steps": 100, "max_steps": 289101, "num_input_tokens_seen": 0, "num_train_epochs": 3, 
"save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2645192822135194e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }