{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 192734,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010376996274658336,
"grad_norm": 2.8394250869750977,
"learning_rate": 0.0002998972677368809,
"loss": 7.61689697265625,
"step": 100
},
{
"epoch": 0.0020753992549316673,
"grad_norm": 1.2515239715576172,
"learning_rate": 0.00029979349777413427,
"loss": 6.9781201171875,
"step": 200
},
{
"epoch": 0.0031130988823975013,
"grad_norm": 3.6369314193725586,
"learning_rate": 0.0002996897278113877,
"loss": 6.69011474609375,
"step": 300
},
{
"epoch": 0.0041507985098633345,
"grad_norm": 2.6945459842681885,
"learning_rate": 0.0002995859578486411,
"loss": 6.55205078125,
"step": 400
},
{
"epoch": 0.005188498137329169,
"grad_norm": 1.4870922565460205,
"learning_rate": 0.0002994821878858945,
"loss": 6.272236938476563,
"step": 500
},
{
"epoch": 0.006226197764795003,
"grad_norm": 2.198580265045166,
"learning_rate": 0.00029937841792314796,
"loss": 6.2509613037109375,
"step": 600
},
{
"epoch": 0.007263897392260836,
"grad_norm": 1.332912564277649,
"learning_rate": 0.00029927464796040135,
"loss": 6.2750787353515625,
"step": 700
},
{
"epoch": 0.008301597019726669,
"grad_norm": 1.6891261339187622,
"learning_rate": 0.0002991708779976548,
"loss": 6.012156372070312,
"step": 800
},
{
"epoch": 0.009339296647192503,
"grad_norm": 2.389779806137085,
"learning_rate": 0.0002990671080349082,
"loss": 6.011610717773437,
"step": 900
},
{
"epoch": 0.010376996274658337,
"grad_norm": 3.896207332611084,
"learning_rate": 0.0002989633380721616,
"loss": 5.872296752929688,
"step": 1000
},
{
"epoch": 0.011414695902124171,
"grad_norm": 1.2714102268218994,
"learning_rate": 0.00029885956810941504,
"loss": 5.8444580078125,
"step": 1100
},
{
"epoch": 0.012452395529590005,
"grad_norm": 1.9793014526367188,
"learning_rate": 0.00029875579814666844,
"loss": 5.780259399414063,
"step": 1200
},
{
"epoch": 0.01349009515705584,
"grad_norm": 1.7210673093795776,
"learning_rate": 0.0002986520281839219,
"loss": 5.784580688476563,
"step": 1300
},
{
"epoch": 0.014527794784521672,
"grad_norm": 3.133103609085083,
"learning_rate": 0.0002985482582211753,
"loss": 5.726546020507812,
"step": 1400
},
{
"epoch": 0.015565494411987506,
"grad_norm": 3.7988669872283936,
"learning_rate": 0.0002984444882584287,
"loss": 5.659859619140625,
"step": 1500
},
{
"epoch": 0.016603194039453338,
"grad_norm": 1.580628514289856,
"learning_rate": 0.00029834071829568207,
"loss": 5.710869140625,
"step": 1600
},
{
"epoch": 0.017640893666919174,
"grad_norm": 2.1428017616271973,
"learning_rate": 0.0002982369483329355,
"loss": 5.61485107421875,
"step": 1700
},
{
"epoch": 0.018678593294385006,
"grad_norm": 1.9413044452667236,
"learning_rate": 0.00029813317837018897,
"loss": 5.542117309570313,
"step": 1800
},
{
"epoch": 0.019716292921850842,
"grad_norm": 1.9118558168411255,
"learning_rate": 0.00029802940840744236,
"loss": 5.524238891601563,
"step": 1900
},
{
"epoch": 0.020753992549316674,
"grad_norm": 1.9226549863815308,
"learning_rate": 0.00029792563844469576,
"loss": 5.544407348632813,
"step": 2000
},
{
"epoch": 0.02179169217678251,
"grad_norm": 3.6845390796661377,
"learning_rate": 0.00029782186848194915,
"loss": 5.507258911132812,
"step": 2100
},
{
"epoch": 0.022829391804248342,
"grad_norm": 1.113272786140442,
"learning_rate": 0.0002977180985192026,
"loss": 5.420562133789063,
"step": 2200
},
{
"epoch": 0.023867091431714175,
"grad_norm": 1.05723237991333,
"learning_rate": 0.00029761432855645605,
"loss": 5.467652587890625,
"step": 2300
},
{
"epoch": 0.02490479105918001,
"grad_norm": 3.3967299461364746,
"learning_rate": 0.00029751055859370944,
"loss": 5.412258911132812,
"step": 2400
},
{
"epoch": 0.025942490686645843,
"grad_norm": 2.4142208099365234,
"learning_rate": 0.0002974067886309629,
"loss": 5.421605224609375,
"step": 2500
},
{
"epoch": 0.02698019031411168,
"grad_norm": 1.577314853668213,
"learning_rate": 0.0002973030186682163,
"loss": 5.2732666015625,
"step": 2600
},
{
"epoch": 0.02801788994157751,
"grad_norm": 2.5680480003356934,
"learning_rate": 0.0002971992487054697,
"loss": 5.42623779296875,
"step": 2700
},
{
"epoch": 0.029055589569043343,
"grad_norm": 1.665701150894165,
"learning_rate": 0.0002970954787427231,
"loss": 5.345192260742188,
"step": 2800
},
{
"epoch": 0.03009328919650918,
"grad_norm": 1.3420246839523315,
"learning_rate": 0.0002969917087799765,
"loss": 5.259754028320312,
"step": 2900
},
{
"epoch": 0.03113098882397501,
"grad_norm": 1.4943575859069824,
"learning_rate": 0.00029688793881723,
"loss": 5.325694580078125,
"step": 3000
},
{
"epoch": 0.032168688451440844,
"grad_norm": 1.7797436714172363,
"learning_rate": 0.00029678416885448337,
"loss": 5.393818359375,
"step": 3100
},
{
"epoch": 0.033206388078906676,
"grad_norm": 3.023359537124634,
"learning_rate": 0.00029668039889173677,
"loss": 5.23187255859375,
"step": 3200
},
{
"epoch": 0.034244087706372515,
"grad_norm": 1.9899531602859497,
"learning_rate": 0.00029657662892899016,
"loss": 5.1434765625,
"step": 3300
},
{
"epoch": 0.03528178733383835,
"grad_norm": 1.0039557218551636,
"learning_rate": 0.0002964728589662436,
"loss": 5.28422607421875,
"step": 3400
},
{
"epoch": 0.03631948696130418,
"grad_norm": 1.9204686880111694,
"learning_rate": 0.000296369089003497,
"loss": 5.149194946289063,
"step": 3500
},
{
"epoch": 0.03735718658877001,
"grad_norm": 1.5530883073806763,
"learning_rate": 0.00029626531904075045,
"loss": 5.0889456176757815,
"step": 3600
},
{
"epoch": 0.03839488621623585,
"grad_norm": 1.4477442502975464,
"learning_rate": 0.00029616154907800385,
"loss": 5.225645751953125,
"step": 3700
},
{
"epoch": 0.039432585843701684,
"grad_norm": 2.998966693878174,
"learning_rate": 0.00029605777911525724,
"loss": 5.127691650390625,
"step": 3800
},
{
"epoch": 0.040470285471167516,
"grad_norm": 1.1760146617889404,
"learning_rate": 0.0002959540091525107,
"loss": 5.099805908203125,
"step": 3900
},
{
"epoch": 0.04150798509863335,
"grad_norm": 1.6684191226959229,
"learning_rate": 0.0002958502391897641,
"loss": 5.195625,
"step": 4000
},
{
"epoch": 0.04254568472609918,
"grad_norm": 3.276620864868164,
"learning_rate": 0.00029574646922701754,
"loss": 5.0514678955078125,
"step": 4100
},
{
"epoch": 0.04358338435356502,
"grad_norm": 1.505712628364563,
"learning_rate": 0.00029564269926427093,
"loss": 5.234470825195313,
"step": 4200
},
{
"epoch": 0.04462108398103085,
"grad_norm": 1.561785101890564,
"learning_rate": 0.0002955389293015243,
"loss": 5.18435302734375,
"step": 4300
},
{
"epoch": 0.045658783608496685,
"grad_norm": 2.103935956954956,
"learning_rate": 0.0002954351593387778,
"loss": 5.127916259765625,
"step": 4400
},
{
"epoch": 0.04669648323596252,
"grad_norm": 1.1984394788742065,
"learning_rate": 0.00029533138937603117,
"loss": 5.009371032714844,
"step": 4500
},
{
"epoch": 0.04773418286342835,
"grad_norm": 1.35122549533844,
"learning_rate": 0.0002952276194132846,
"loss": 4.988144836425781,
"step": 4600
},
{
"epoch": 0.04877188249089419,
"grad_norm": 1.7199909687042236,
"learning_rate": 0.000295123849450538,
"loss": 5.139700317382813,
"step": 4700
},
{
"epoch": 0.04980958211836002,
"grad_norm": 2.299783706665039,
"learning_rate": 0.00029502007948779146,
"loss": 5.189196166992187,
"step": 4800
},
{
"epoch": 0.050847281745825854,
"grad_norm": 1.251342535018921,
"learning_rate": 0.00029491630952504486,
"loss": 5.0067724609375,
"step": 4900
},
{
"epoch": 0.051884981373291686,
"grad_norm": 1.7228055000305176,
"learning_rate": 0.00029481253956229825,
"loss": 5.058696594238281,
"step": 5000
},
{
"epoch": 0.05292268100075752,
"grad_norm": 1.2999722957611084,
"learning_rate": 0.0002947087695995517,
"loss": 4.953595275878906,
"step": 5100
},
{
"epoch": 0.05396038062822336,
"grad_norm": 2.576788902282715,
"learning_rate": 0.0002946049996368051,
"loss": 4.935113220214844,
"step": 5200
},
{
"epoch": 0.05499808025568919,
"grad_norm": 3.006600856781006,
"learning_rate": 0.00029450122967405854,
"loss": 5.13054931640625,
"step": 5300
},
{
"epoch": 0.05603577988315502,
"grad_norm": 1.5450797080993652,
"learning_rate": 0.00029439745971131194,
"loss": 4.888633117675782,
"step": 5400
},
{
"epoch": 0.057073479510620855,
"grad_norm": 1.9071307182312012,
"learning_rate": 0.00029429368974856533,
"loss": 4.968219299316406,
"step": 5500
},
{
"epoch": 0.05811117913808669,
"grad_norm": 1.2374857664108276,
"learning_rate": 0.0002941899197858188,
"loss": 5.0035269165039065,
"step": 5600
},
{
"epoch": 0.059148878765552526,
"grad_norm": 1.270337462425232,
"learning_rate": 0.0002940861498230722,
"loss": 4.9964404296875,
"step": 5700
},
{
"epoch": 0.06018657839301836,
"grad_norm": 2.112285614013672,
"learning_rate": 0.0002939823798603256,
"loss": 4.882070007324219,
"step": 5800
},
{
"epoch": 0.06122427802048419,
"grad_norm": 1.2048200368881226,
"learning_rate": 0.000293878609897579,
"loss": 4.689561767578125,
"step": 5900
},
{
"epoch": 0.06226197764795002,
"grad_norm": 1.213274359703064,
"learning_rate": 0.0002937748399348324,
"loss": 4.969376525878906,
"step": 6000
},
{
"epoch": 0.06329967727541586,
"grad_norm": 1.1453360319137573,
"learning_rate": 0.00029367106997208587,
"loss": 4.848797302246094,
"step": 6100
},
{
"epoch": 0.06433737690288169,
"grad_norm": 1.78568696975708,
"learning_rate": 0.00029356730000933926,
"loss": 4.889250793457031,
"step": 6200
},
{
"epoch": 0.06537507653034752,
"grad_norm": 1.004668951034546,
"learning_rate": 0.0002934635300465927,
"loss": 4.881064758300782,
"step": 6300
},
{
"epoch": 0.06641277615781335,
"grad_norm": 3.34089994430542,
"learning_rate": 0.0002933597600838461,
"loss": 4.922989501953125,
"step": 6400
},
{
"epoch": 0.0674504757852792,
"grad_norm": 1.7132960557937622,
"learning_rate": 0.00029325599012109955,
"loss": 4.900790405273438,
"step": 6500
},
{
"epoch": 0.06848817541274503,
"grad_norm": 3.6154215335845947,
"learning_rate": 0.00029315222015835295,
"loss": 4.858998718261719,
"step": 6600
},
{
"epoch": 0.06952587504021086,
"grad_norm": 2.199787139892578,
"learning_rate": 0.00029304845019560634,
"loss": 4.776265258789063,
"step": 6700
},
{
"epoch": 0.0705635746676767,
"grad_norm": 1.193831443786621,
"learning_rate": 0.0002929446802328598,
"loss": 4.933597717285156,
"step": 6800
},
{
"epoch": 0.07160127429514253,
"grad_norm": 1.0364950895309448,
"learning_rate": 0.0002928409102701132,
"loss": 4.812368469238281,
"step": 6900
},
{
"epoch": 0.07263897392260836,
"grad_norm": 4.54287576675415,
"learning_rate": 0.00029273714030736664,
"loss": 4.874449157714844,
"step": 7000
},
{
"epoch": 0.07367667355007419,
"grad_norm": 1.9481868743896484,
"learning_rate": 0.00029263337034462003,
"loss": 4.836025390625,
"step": 7100
},
{
"epoch": 0.07471437317754002,
"grad_norm": 1.5283995866775513,
"learning_rate": 0.0002925296003818734,
"loss": 4.789447631835937,
"step": 7200
},
{
"epoch": 0.07575207280500586,
"grad_norm": 1.1243209838867188,
"learning_rate": 0.0002924258304191268,
"loss": 4.771495971679688,
"step": 7300
},
{
"epoch": 0.0767897724324717,
"grad_norm": 1.2010672092437744,
"learning_rate": 0.00029232206045638027,
"loss": 4.796032104492188,
"step": 7400
},
{
"epoch": 0.07782747205993754,
"grad_norm": 1.3179821968078613,
"learning_rate": 0.0002922182904936337,
"loss": 4.949848022460937,
"step": 7500
},
{
"epoch": 0.07886517168740337,
"grad_norm": 2.766585111618042,
"learning_rate": 0.0002921145205308871,
"loss": 4.7913055419921875,
"step": 7600
},
{
"epoch": 0.0799028713148692,
"grad_norm": 1.301639437675476,
"learning_rate": 0.0002920107505681405,
"loss": 4.828057556152344,
"step": 7700
},
{
"epoch": 0.08094057094233503,
"grad_norm": 1.205676794052124,
"learning_rate": 0.0002919069806053939,
"loss": 4.7562734985351565,
"step": 7800
},
{
"epoch": 0.08197827056980087,
"grad_norm": 2.1412694454193115,
"learning_rate": 0.00029180321064264735,
"loss": 4.7240576171875,
"step": 7900
},
{
"epoch": 0.0830159701972667,
"grad_norm": 1.9297393560409546,
"learning_rate": 0.0002916994406799008,
"loss": 4.752750244140625,
"step": 8000
},
{
"epoch": 0.08405366982473253,
"grad_norm": 1.5971039533615112,
"learning_rate": 0.0002915956707171542,
"loss": 4.7790225219726565,
"step": 8100
},
{
"epoch": 0.08509136945219836,
"grad_norm": 1.4667614698410034,
"learning_rate": 0.0002914919007544076,
"loss": 4.823405151367187,
"step": 8200
},
{
"epoch": 0.0861290690796642,
"grad_norm": 1.8018951416015625,
"learning_rate": 0.000291388130791661,
"loss": 4.806950378417969,
"step": 8300
},
{
"epoch": 0.08716676870713004,
"grad_norm": 3.0917904376983643,
"learning_rate": 0.00029128436082891443,
"loss": 4.716513977050782,
"step": 8400
},
{
"epoch": 0.08820446833459587,
"grad_norm": 1.8211461305618286,
"learning_rate": 0.00029118059086616783,
"loss": 4.803590393066406,
"step": 8500
},
{
"epoch": 0.0892421679620617,
"grad_norm": 1.4940656423568726,
"learning_rate": 0.0002910768209034213,
"loss": 4.682643737792969,
"step": 8600
},
{
"epoch": 0.09027986758952754,
"grad_norm": 1.432560682296753,
"learning_rate": 0.00029097305094067473,
"loss": 4.758638610839844,
"step": 8700
},
{
"epoch": 0.09131756721699337,
"grad_norm": 1.0015602111816406,
"learning_rate": 0.0002908692809779281,
"loss": 4.829322204589844,
"step": 8800
},
{
"epoch": 0.0923552668444592,
"grad_norm": 1.3050769567489624,
"learning_rate": 0.0002907655110151815,
"loss": 4.62219482421875,
"step": 8900
},
{
"epoch": 0.09339296647192503,
"grad_norm": 1.0704928636550903,
"learning_rate": 0.0002906617410524349,
"loss": 4.6304998779296875,
"step": 9000
},
{
"epoch": 0.09443066609939087,
"grad_norm": 2.2267684936523438,
"learning_rate": 0.00029055797108968836,
"loss": 4.664536437988281,
"step": 9100
},
{
"epoch": 0.0954683657268567,
"grad_norm": 2.4608747959136963,
"learning_rate": 0.00029045420112694176,
"loss": 4.759125366210937,
"step": 9200
},
{
"epoch": 0.09650606535432253,
"grad_norm": 1.5068875551223755,
"learning_rate": 0.0002903504311641952,
"loss": 4.665271606445312,
"step": 9300
},
{
"epoch": 0.09754376498178838,
"grad_norm": 2.078646421432495,
"learning_rate": 0.0002902466612014486,
"loss": 4.739638671875,
"step": 9400
},
{
"epoch": 0.09858146460925421,
"grad_norm": 1.3762885332107544,
"learning_rate": 0.000290142891238702,
"loss": 4.698047485351562,
"step": 9500
},
{
"epoch": 0.09961916423672004,
"grad_norm": 1.2879425287246704,
"learning_rate": 0.00029003912127595544,
"loss": 4.619927673339844,
"step": 9600
},
{
"epoch": 0.10065686386418587,
"grad_norm": 1.584159016609192,
"learning_rate": 0.00028993535131320884,
"loss": 4.748394165039063,
"step": 9700
},
{
"epoch": 0.10169456349165171,
"grad_norm": 1.453415870666504,
"learning_rate": 0.0002898315813504623,
"loss": 4.62876220703125,
"step": 9800
},
{
"epoch": 0.10273226311911754,
"grad_norm": 0.965919017791748,
"learning_rate": 0.0002897278113877157,
"loss": 4.665562438964844,
"step": 9900
},
{
"epoch": 0.10376996274658337,
"grad_norm": 1.2607330083847046,
"learning_rate": 0.0002896240414249691,
"loss": 4.7940805053710935,
"step": 10000
},
{
"epoch": 0.1048076623740492,
"grad_norm": 1.0126069784164429,
"learning_rate": 0.0002895202714622225,
"loss": 4.7508541870117185,
"step": 10100
},
{
"epoch": 0.10584536200151504,
"grad_norm": 1.541813850402832,
"learning_rate": 0.0002894165014994759,
"loss": 4.57702880859375,
"step": 10200
},
{
"epoch": 0.10688306162898087,
"grad_norm": 2.78938889503479,
"learning_rate": 0.00028931273153672937,
"loss": 4.652121887207032,
"step": 10300
},
{
"epoch": 0.10792076125644672,
"grad_norm": 2.3567938804626465,
"learning_rate": 0.00028920896157398276,
"loss": 4.566509094238281,
"step": 10400
},
{
"epoch": 0.10895846088391255,
"grad_norm": 1.0480419397354126,
"learning_rate": 0.0002891051916112362,
"loss": 4.611513977050781,
"step": 10500
},
{
"epoch": 0.10999616051137838,
"grad_norm": 1.577042579650879,
"learning_rate": 0.0002890014216484896,
"loss": 4.62977783203125,
"step": 10600
},
{
"epoch": 0.11103386013884421,
"grad_norm": 1.5839786529541016,
"learning_rate": 0.000288897651685743,
"loss": 4.569055786132813,
"step": 10700
},
{
"epoch": 0.11207155976631004,
"grad_norm": 3.9769680500030518,
"learning_rate": 0.00028879388172299645,
"loss": 4.6786282348632815,
"step": 10800
},
{
"epoch": 0.11310925939377588,
"grad_norm": 1.8089715242385864,
"learning_rate": 0.00028869011176024985,
"loss": 4.630350036621094,
"step": 10900
},
{
"epoch": 0.11414695902124171,
"grad_norm": 1.4216063022613525,
"learning_rate": 0.0002885863417975033,
"loss": 4.669395751953125,
"step": 11000
},
{
"epoch": 0.11518465864870754,
"grad_norm": 1.2107151746749878,
"learning_rate": 0.0002884825718347567,
"loss": 4.612738342285156,
"step": 11100
},
{
"epoch": 0.11622235827617337,
"grad_norm": 1.5037158727645874,
"learning_rate": 0.0002883788018720101,
"loss": 4.534631958007813,
"step": 11200
},
{
"epoch": 0.1172600579036392,
"grad_norm": 1.1375142335891724,
"learning_rate": 0.00028827503190926353,
"loss": 4.803286437988281,
"step": 11300
},
{
"epoch": 0.11829775753110505,
"grad_norm": 1.8553053140640259,
"learning_rate": 0.00028817126194651693,
"loss": 4.684965515136719,
"step": 11400
},
{
"epoch": 0.11933545715857088,
"grad_norm": 5.896717071533203,
"learning_rate": 0.0002880674919837704,
"loss": 4.533707275390625,
"step": 11500
},
{
"epoch": 0.12037315678603672,
"grad_norm": 0.9495351910591125,
"learning_rate": 0.0002879637220210238,
"loss": 4.481864013671875,
"step": 11600
},
{
"epoch": 0.12141085641350255,
"grad_norm": 1.2148685455322266,
"learning_rate": 0.00028785995205827717,
"loss": 4.508511047363282,
"step": 11700
},
{
"epoch": 0.12244855604096838,
"grad_norm": 1.2658835649490356,
"learning_rate": 0.0002877561820955306,
"loss": 4.453274841308594,
"step": 11800
},
{
"epoch": 0.12348625566843421,
"grad_norm": 1.0808942317962646,
"learning_rate": 0.000287652412132784,
"loss": 4.470396118164063,
"step": 11900
},
{
"epoch": 0.12452395529590005,
"grad_norm": 2.0280075073242188,
"learning_rate": 0.00028754864217003746,
"loss": 4.629884643554687,
"step": 12000
},
{
"epoch": 0.12556165492336588,
"grad_norm": 1.6987171173095703,
"learning_rate": 0.00028744487220729086,
"loss": 4.673434143066406,
"step": 12100
},
{
"epoch": 0.1265993545508317,
"grad_norm": 1.076246976852417,
"learning_rate": 0.00028734110224454425,
"loss": 4.707933349609375,
"step": 12200
},
{
"epoch": 0.12763705417829754,
"grad_norm": 1.4878133535385132,
"learning_rate": 0.00028723733228179765,
"loss": 4.649747924804688,
"step": 12300
},
{
"epoch": 0.12867475380576338,
"grad_norm": 1.132073163986206,
"learning_rate": 0.0002871335623190511,
"loss": 4.510395812988281,
"step": 12400
},
{
"epoch": 0.1297124534332292,
"grad_norm": 1.172968864440918,
"learning_rate": 0.00028702979235630454,
"loss": 4.7042324829101565,
"step": 12500
},
{
"epoch": 0.13075015306069504,
"grad_norm": 1.331409215927124,
"learning_rate": 0.00028692602239355794,
"loss": 4.478284912109375,
"step": 12600
},
{
"epoch": 0.13178785268816087,
"grad_norm": 0.9544440507888794,
"learning_rate": 0.0002868222524308114,
"loss": 4.574405517578125,
"step": 12700
},
{
"epoch": 0.1328255523156267,
"grad_norm": 1.3560587167739868,
"learning_rate": 0.0002867184824680648,
"loss": 4.359691467285156,
"step": 12800
},
{
"epoch": 0.13386325194309256,
"grad_norm": 1.4807325601577759,
"learning_rate": 0.0002866147125053182,
"loss": 4.541731872558594,
"step": 12900
},
{
"epoch": 0.1349009515705584,
"grad_norm": 1.0621514320373535,
"learning_rate": 0.00028651094254257157,
"loss": 4.442927551269531,
"step": 13000
},
{
"epoch": 0.13593865119802423,
"grad_norm": 0.9886642098426819,
"learning_rate": 0.000286407172579825,
"loss": 4.690697326660156,
"step": 13100
},
{
"epoch": 0.13697635082549006,
"grad_norm": 1.9239803552627563,
"learning_rate": 0.00028630340261707847,
"loss": 4.497586669921875,
"step": 13200
},
{
"epoch": 0.1380140504529559,
"grad_norm": 1.644500494003296,
"learning_rate": 0.00028619963265433186,
"loss": 4.598764038085937,
"step": 13300
},
{
"epoch": 0.13905175008042173,
"grad_norm": 1.3600581884384155,
"learning_rate": 0.00028609586269158526,
"loss": 4.550304260253906,
"step": 13400
},
{
"epoch": 0.14008944970788756,
"grad_norm": 1.4329279661178589,
"learning_rate": 0.00028599209272883865,
"loss": 4.506571960449219,
"step": 13500
},
{
"epoch": 0.1411271493353534,
"grad_norm": 1.386486291885376,
"learning_rate": 0.0002858883227660921,
"loss": 4.419360046386719,
"step": 13600
},
{
"epoch": 0.14216484896281922,
"grad_norm": 0.9777548909187317,
"learning_rate": 0.00028578455280334555,
"loss": 4.371921691894531,
"step": 13700
},
{
"epoch": 0.14320254859028506,
"grad_norm": 1.323614239692688,
"learning_rate": 0.00028568078284059895,
"loss": 4.449886474609375,
"step": 13800
},
{
"epoch": 0.1442402482177509,
"grad_norm": 2.0104715824127197,
"learning_rate": 0.00028557701287785234,
"loss": 4.498194885253906,
"step": 13900
},
{
"epoch": 0.14527794784521672,
"grad_norm": 1.040453314781189,
"learning_rate": 0.00028547324291510574,
"loss": 4.410159301757813,
"step": 14000
},
{
"epoch": 0.14631564747268255,
"grad_norm": 1.6704965829849243,
"learning_rate": 0.0002853694729523592,
"loss": 4.4047763061523435,
"step": 14100
},
{
"epoch": 0.14735334710014839,
"grad_norm": 1.1640102863311768,
"learning_rate": 0.0002852657029896126,
"loss": 4.482722778320312,
"step": 14200
},
{
"epoch": 0.14839104672761422,
"grad_norm": 1.5910676717758179,
"learning_rate": 0.00028516193302686603,
"loss": 4.464485473632813,
"step": 14300
},
{
"epoch": 0.14942874635508005,
"grad_norm": 2.349853277206421,
"learning_rate": 0.0002850581630641194,
"loss": 4.478161010742188,
"step": 14400
},
{
"epoch": 0.15046644598254588,
"grad_norm": 1.6594980955123901,
"learning_rate": 0.0002849543931013728,
"loss": 4.524984741210938,
"step": 14500
},
{
"epoch": 0.15150414561001171,
"grad_norm": 1.0867830514907837,
"learning_rate": 0.00028485062313862627,
"loss": 4.444278259277343,
"step": 14600
},
{
"epoch": 0.15254184523747755,
"grad_norm": 1.4026222229003906,
"learning_rate": 0.00028474685317587966,
"loss": 4.562846374511719,
"step": 14700
},
{
"epoch": 0.1535795448649434,
"grad_norm": 1.7118810415267944,
"learning_rate": 0.0002846430832131331,
"loss": 4.434857177734375,
"step": 14800
},
{
"epoch": 0.15461724449240924,
"grad_norm": 1.3377333879470825,
"learning_rate": 0.0002845393132503865,
"loss": 4.50284912109375,
"step": 14900
},
{
"epoch": 0.15565494411987507,
"grad_norm": 1.0628588199615479,
"learning_rate": 0.00028443554328763996,
"loss": 4.467984924316406,
"step": 15000
},
{
"epoch": 0.1566926437473409,
"grad_norm": 1.122900366783142,
"learning_rate": 0.00028433177332489335,
"loss": 4.477691650390625,
"step": 15100
},
{
"epoch": 0.15773034337480674,
"grad_norm": 1.0721949338912964,
"learning_rate": 0.00028422800336214675,
"loss": 4.566653137207031,
"step": 15200
},
{
"epoch": 0.15876804300227257,
"grad_norm": 2.0959179401397705,
"learning_rate": 0.0002841242333994002,
"loss": 4.459400939941406,
"step": 15300
},
{
"epoch": 0.1598057426297384,
"grad_norm": 1.832321047782898,
"learning_rate": 0.0002840204634366536,
"loss": 4.441622009277344,
"step": 15400
},
{
"epoch": 0.16084344225720423,
"grad_norm": 1.9756203889846802,
"learning_rate": 0.00028391669347390704,
"loss": 4.5193002319335935,
"step": 15500
},
{
"epoch": 0.16188114188467007,
"grad_norm": 1.9734655618667603,
"learning_rate": 0.00028381292351116043,
"loss": 4.403963012695312,
"step": 15600
},
{
"epoch": 0.1629188415121359,
"grad_norm": 1.0987114906311035,
"learning_rate": 0.00028370915354841383,
"loss": 4.3827951049804685,
"step": 15700
},
{
"epoch": 0.16395654113960173,
"grad_norm": 1.0084813833236694,
"learning_rate": 0.0002836053835856673,
"loss": 4.431182861328125,
"step": 15800
},
{
"epoch": 0.16499424076706756,
"grad_norm": 0.8771688342094421,
"learning_rate": 0.00028350161362292067,
"loss": 4.386305236816407,
"step": 15900
},
{
"epoch": 0.1660319403945334,
"grad_norm": 1.960618495941162,
"learning_rate": 0.0002833978436601741,
"loss": 4.450301513671875,
"step": 16000
},
{
"epoch": 0.16706964002199923,
"grad_norm": 2.016059398651123,
"learning_rate": 0.0002832940736974275,
"loss": 4.443774719238281,
"step": 16100
},
{
"epoch": 0.16810733964946506,
"grad_norm": 2.1017072200775146,
"learning_rate": 0.0002831903037346809,
"loss": 4.387731323242187,
"step": 16200
},
{
"epoch": 0.1691450392769309,
"grad_norm": 3.876704216003418,
"learning_rate": 0.00028308653377193436,
"loss": 4.339099731445312,
"step": 16300
},
{
"epoch": 0.17018273890439672,
"grad_norm": 2.4443888664245605,
"learning_rate": 0.00028298276380918776,
"loss": 4.420601196289063,
"step": 16400
},
{
"epoch": 0.17122043853186256,
"grad_norm": 2.2986700534820557,
"learning_rate": 0.0002828789938464412,
"loss": 4.574692687988281,
"step": 16500
},
{
"epoch": 0.1722581381593284,
"grad_norm": 3.120959997177124,
"learning_rate": 0.0002827752238836946,
"loss": 4.3793856811523435,
"step": 16600
},
{
"epoch": 0.17329583778679422,
"grad_norm": 3.928020715713501,
"learning_rate": 0.00028267145392094805,
"loss": 4.389268188476563,
"step": 16700
},
{
"epoch": 0.17433353741426008,
"grad_norm": 1.5828691720962524,
"learning_rate": 0.00028256768395820144,
"loss": 4.353381652832031,
"step": 16800
},
{
"epoch": 0.1753712370417259,
"grad_norm": 1.0565470457077026,
"learning_rate": 0.00028246391399545484,
"loss": 4.289037170410157,
"step": 16900
},
{
"epoch": 0.17640893666919175,
"grad_norm": 1.7072774171829224,
"learning_rate": 0.0002823601440327083,
"loss": 4.325290832519531,
"step": 17000
},
{
"epoch": 0.17744663629665758,
"grad_norm": 1.0402146577835083,
"learning_rate": 0.0002822563740699617,
"loss": 4.450514221191407,
"step": 17100
},
{
"epoch": 0.1784843359241234,
"grad_norm": 1.4970057010650635,
"learning_rate": 0.00028215260410721513,
"loss": 4.393040161132813,
"step": 17200
},
{
"epoch": 0.17952203555158924,
"grad_norm": 1.266546607017517,
"learning_rate": 0.0002820488341444685,
"loss": 4.276432800292969,
"step": 17300
},
{
"epoch": 0.18055973517905508,
"grad_norm": 1.751590371131897,
"learning_rate": 0.0002819450641817219,
"loss": 4.40036376953125,
"step": 17400
},
{
"epoch": 0.1815974348065209,
"grad_norm": 1.5430057048797607,
"learning_rate": 0.00028184129421897537,
"loss": 4.279835205078125,
"step": 17500
},
{
"epoch": 0.18263513443398674,
"grad_norm": 4.205715179443359,
"learning_rate": 0.00028173752425622876,
"loss": 4.501398315429688,
"step": 17600
},
{
"epoch": 0.18367283406145257,
"grad_norm": 2.2290608882904053,
"learning_rate": 0.0002816337542934822,
"loss": 4.400292053222656,
"step": 17700
},
{
"epoch": 0.1847105336889184,
"grad_norm": 1.6409145593643188,
"learning_rate": 0.0002815299843307356,
"loss": 4.361965026855469,
"step": 17800
},
{
"epoch": 0.18574823331638424,
"grad_norm": 1.235737919807434,
"learning_rate": 0.000281426214367989,
"loss": 4.4263699340820315,
"step": 17900
},
{
"epoch": 0.18678593294385007,
"grad_norm": 1.8182483911514282,
"learning_rate": 0.0002813224444052424,
"loss": 4.38103759765625,
"step": 18000
},
{
"epoch": 0.1878236325713159,
"grad_norm": 1.725359559059143,
"learning_rate": 0.00028121867444249585,
"loss": 4.332106323242187,
"step": 18100
},
{
"epoch": 0.18886133219878173,
"grad_norm": 1.9186443090438843,
"learning_rate": 0.0002811149044797493,
"loss": 4.354175415039062,
"step": 18200
},
{
"epoch": 0.18989903182624757,
"grad_norm": 1.1907823085784912,
"learning_rate": 0.0002810111345170027,
"loss": 4.521398315429687,
"step": 18300
},
{
"epoch": 0.1909367314537134,
"grad_norm": 2.796095609664917,
"learning_rate": 0.0002809073645542561,
"loss": 4.280415649414063,
"step": 18400
},
{
"epoch": 0.19197443108117923,
"grad_norm": 2.043811798095703,
"learning_rate": 0.0002808035945915095,
"loss": 4.364379272460938,
"step": 18500
},
{
"epoch": 0.19301213070864506,
"grad_norm": 6.419173240661621,
"learning_rate": 0.00028069982462876293,
"loss": 4.420321044921875,
"step": 18600
},
{
"epoch": 0.1940498303361109,
"grad_norm": 2.0183868408203125,
"learning_rate": 0.0002805960546660163,
"loss": 4.203153381347656,
"step": 18700
},
{
"epoch": 0.19508752996357676,
"grad_norm": 1.1752562522888184,
"learning_rate": 0.00028049228470326977,
"loss": 4.362376098632812,
"step": 18800
},
{
"epoch": 0.1961252295910426,
"grad_norm": 1.7152916193008423,
"learning_rate": 0.0002803885147405232,
"loss": 4.423097229003906,
"step": 18900
},
{
"epoch": 0.19716292921850842,
"grad_norm": 0.8988032341003418,
"learning_rate": 0.0002802847447777766,
"loss": 4.291071166992188,
"step": 19000
},
{
"epoch": 0.19820062884597425,
"grad_norm": 1.2874023914337158,
"learning_rate": 0.00028018097481503,
"loss": 4.257485046386718,
"step": 19100
},
{
"epoch": 0.19923832847344009,
"grad_norm": 3.89581561088562,
"learning_rate": 0.0002800772048522834,
"loss": 4.355436401367188,
"step": 19200
},
{
"epoch": 0.20027602810090592,
"grad_norm": 1.4264250993728638,
"learning_rate": 0.00027997343488953686,
"loss": 4.268387451171875,
"step": 19300
},
{
"epoch": 0.20131372772837175,
"grad_norm": 2.3243231773376465,
"learning_rate": 0.0002798696649267903,
"loss": 4.248961791992188,
"step": 19400
},
{
"epoch": 0.20235142735583758,
"grad_norm": 1.609995722770691,
"learning_rate": 0.0002797658949640437,
"loss": 4.299253845214844,
"step": 19500
},
{
"epoch": 0.20338912698330341,
"grad_norm": 1.636496901512146,
"learning_rate": 0.0002796621250012971,
"loss": 4.379757690429687,
"step": 19600
},
{
"epoch": 0.20442682661076925,
"grad_norm": 1.742827296257019,
"learning_rate": 0.0002795583550385505,
"loss": 4.298026733398437,
"step": 19700
},
{
"epoch": 0.20546452623823508,
"grad_norm": 1.3360769748687744,
"learning_rate": 0.00027945458507580394,
"loss": 4.443134155273437,
"step": 19800
},
{
"epoch": 0.2065022258657009,
"grad_norm": 1.5279536247253418,
"learning_rate": 0.00027935081511305733,
"loss": 4.3536380004882815,
"step": 19900
},
{
"epoch": 0.20753992549316674,
"grad_norm": 1.2768709659576416,
"learning_rate": 0.0002792470451503108,
"loss": 4.420497741699219,
"step": 20000
},
{
"epoch": 0.20857762512063258,
"grad_norm": 1.1040194034576416,
"learning_rate": 0.0002791432751875642,
"loss": 4.308759155273438,
"step": 20100
},
{
"epoch": 0.2096153247480984,
"grad_norm": 1.5710710287094116,
"learning_rate": 0.00027903950522481757,
"loss": 4.188085021972657,
"step": 20200
},
{
"epoch": 0.21065302437556424,
"grad_norm": 0.9058725237846375,
"learning_rate": 0.000278935735262071,
"loss": 4.162925720214844,
"step": 20300
},
{
"epoch": 0.21169072400303007,
"grad_norm": 2.4681508541107178,
"learning_rate": 0.0002788319652993244,
"loss": 4.207759704589844,
"step": 20400
},
{
"epoch": 0.2127284236304959,
"grad_norm": 1.7522861957550049,
"learning_rate": 0.00027872819533657786,
"loss": 4.448352355957031,
"step": 20500
},
{
"epoch": 0.21376612325796174,
"grad_norm": 1.8361260890960693,
"learning_rate": 0.00027862442537383126,
"loss": 4.27260986328125,
"step": 20600
},
{
"epoch": 0.2148038228854276,
"grad_norm": 1.7720355987548828,
"learning_rate": 0.0002785206554110847,
"loss": 4.315809326171875,
"step": 20700
},
{
"epoch": 0.21584152251289343,
"grad_norm": 2.2454731464385986,
"learning_rate": 0.0002784168854483381,
"loss": 4.421763916015625,
"step": 20800
},
{
"epoch": 0.21687922214035926,
"grad_norm": 2.7393276691436768,
"learning_rate": 0.0002783131154855915,
"loss": 4.268560791015625,
"step": 20900
},
{
"epoch": 0.2179169217678251,
"grad_norm": 1.8933848142623901,
"learning_rate": 0.00027820934552284495,
"loss": 4.316322937011718,
"step": 21000
},
{
"epoch": 0.21895462139529093,
"grad_norm": 1.2294155359268188,
"learning_rate": 0.00027810557556009834,
"loss": 4.247787780761719,
"step": 21100
},
{
"epoch": 0.21999232102275676,
"grad_norm": 1.5950024127960205,
"learning_rate": 0.0002780018055973518,
"loss": 4.292718811035156,
"step": 21200
},
{
"epoch": 0.2210300206502226,
"grad_norm": 0.9710947275161743,
"learning_rate": 0.0002778980356346052,
"loss": 4.238976135253906,
"step": 21300
},
{
"epoch": 0.22206772027768842,
"grad_norm": 1.3599995374679565,
"learning_rate": 0.0002777942656718586,
"loss": 4.441769409179687,
"step": 21400
},
{
"epoch": 0.22310541990515426,
"grad_norm": 1.2248610258102417,
"learning_rate": 0.00027769049570911203,
"loss": 4.34153564453125,
"step": 21500
},
{
"epoch": 0.2241431195326201,
"grad_norm": 1.07679283618927,
"learning_rate": 0.0002775867257463654,
"loss": 4.307798767089844,
"step": 21600
},
{
"epoch": 0.22518081916008592,
"grad_norm": 2.6134791374206543,
"learning_rate": 0.0002774829557836189,
"loss": 4.170127868652344,
"step": 21700
},
{
"epoch": 0.22621851878755175,
"grad_norm": 3.8844735622406006,
"learning_rate": 0.00027737918582087227,
"loss": 4.2596041870117185,
"step": 21800
},
{
"epoch": 0.22725621841501759,
"grad_norm": 3.4798216819763184,
"learning_rate": 0.00027727541585812566,
"loss": 4.257220153808594,
"step": 21900
},
{
"epoch": 0.22829391804248342,
"grad_norm": 1.0172936916351318,
"learning_rate": 0.0002771716458953791,
"loss": 4.342347717285156,
"step": 22000
},
{
"epoch": 0.22933161766994925,
"grad_norm": 2.0007245540618896,
"learning_rate": 0.0002770678759326325,
"loss": 4.21951171875,
"step": 22100
},
{
"epoch": 0.23036931729741508,
"grad_norm": 1.0652577877044678,
"learning_rate": 0.00027696410596988596,
"loss": 4.309334411621093,
"step": 22200
},
{
"epoch": 0.23140701692488092,
"grad_norm": 1.0696879625320435,
"learning_rate": 0.00027686033600713935,
"loss": 4.333943481445313,
"step": 22300
},
{
"epoch": 0.23244471655234675,
"grad_norm": 1.0693758726119995,
"learning_rate": 0.00027675656604439275,
"loss": 4.325413513183594,
"step": 22400
},
{
"epoch": 0.23348241617981258,
"grad_norm": 1.3958321809768677,
"learning_rate": 0.00027665279608164614,
"loss": 4.1349484252929685,
"step": 22500
},
{
"epoch": 0.2345201158072784,
"grad_norm": 1.732444167137146,
"learning_rate": 0.0002765490261188996,
"loss": 4.191957397460937,
"step": 22600
},
{
"epoch": 0.23555781543474427,
"grad_norm": 1.329959750175476,
"learning_rate": 0.00027644525615615304,
"loss": 4.440416870117187,
"step": 22700
},
{
"epoch": 0.2365955150622101,
"grad_norm": 1.4088762998580933,
"learning_rate": 0.00027634148619340643,
"loss": 4.128535461425781,
"step": 22800
},
{
"epoch": 0.23763321468967594,
"grad_norm": 1.167936086654663,
"learning_rate": 0.0002762377162306599,
"loss": 4.3338143920898435,
"step": 22900
},
{
"epoch": 0.23867091431714177,
"grad_norm": 1.1570918560028076,
"learning_rate": 0.0002761339462679133,
"loss": 4.180432739257813,
"step": 23000
},
{
"epoch": 0.2397086139446076,
"grad_norm": 1.2544199228286743,
"learning_rate": 0.00027603017630516667,
"loss": 4.1538671875,
"step": 23100
},
{
"epoch": 0.24074631357207343,
"grad_norm": 1.844802975654602,
"learning_rate": 0.0002759264063424201,
"loss": 4.238400268554687,
"step": 23200
},
{
"epoch": 0.24178401319953927,
"grad_norm": 2.407107353210449,
"learning_rate": 0.0002758226363796735,
"loss": 4.1402197265625,
"step": 23300
},
{
"epoch": 0.2428217128270051,
"grad_norm": 1.7526997327804565,
"learning_rate": 0.00027571886641692696,
"loss": 4.253873901367188,
"step": 23400
},
{
"epoch": 0.24385941245447093,
"grad_norm": 2.1768147945404053,
"learning_rate": 0.00027561509645418036,
"loss": 4.146066589355469,
"step": 23500
},
{
"epoch": 0.24489711208193676,
"grad_norm": 1.0545059442520142,
"learning_rate": 0.00027551132649143375,
"loss": 4.199613037109375,
"step": 23600
},
{
"epoch": 0.2459348117094026,
"grad_norm": 1.2132643461227417,
"learning_rate": 0.00027540755652868715,
"loss": 4.202657775878906,
"step": 23700
},
{
"epoch": 0.24697251133686843,
"grad_norm": 2.1652746200561523,
"learning_rate": 0.0002753037865659406,
"loss": 4.301669311523438,
"step": 23800
},
{
"epoch": 0.24801021096433426,
"grad_norm": 1.0687705278396606,
"learning_rate": 0.00027520001660319405,
"loss": 4.310574340820312,
"step": 23900
},
{
"epoch": 0.2490479105918001,
"grad_norm": 2.6030638217926025,
"learning_rate": 0.00027509624664044744,
"loss": 4.220720825195312,
"step": 24000
},
{
"epoch": 0.25008561021926595,
"grad_norm": 0.9720291495323181,
"learning_rate": 0.00027499247667770084,
"loss": 4.376803283691406,
"step": 24100
},
{
"epoch": 0.25112330984673176,
"grad_norm": 1.398289680480957,
"learning_rate": 0.00027488870671495423,
"loss": 4.39901123046875,
"step": 24200
},
{
"epoch": 0.2521610094741976,
"grad_norm": 2.2055957317352295,
"learning_rate": 0.0002747849367522077,
"loss": 4.196527709960938,
"step": 24300
},
{
"epoch": 0.2531987091016634,
"grad_norm": 2.036271810531616,
"learning_rate": 0.0002746811667894611,
"loss": 4.274451599121094,
"step": 24400
},
{
"epoch": 0.2542364087291293,
"grad_norm": 2.6011345386505127,
"learning_rate": 0.0002745773968267145,
"loss": 4.2699462890625,
"step": 24500
},
{
"epoch": 0.2552741083565951,
"grad_norm": 1.9660414457321167,
"learning_rate": 0.0002744736268639679,
"loss": 4.2452325439453125,
"step": 24600
},
{
"epoch": 0.25631180798406095,
"grad_norm": 1.2747102975845337,
"learning_rate": 0.0002743698569012213,
"loss": 4.348042907714844,
"step": 24700
},
{
"epoch": 0.25734950761152675,
"grad_norm": 1.4823510646820068,
"learning_rate": 0.00027426608693847476,
"loss": 4.154461669921875,
"step": 24800
},
{
"epoch": 0.2583872072389926,
"grad_norm": 1.6665210723876953,
"learning_rate": 0.00027416231697572816,
"loss": 4.136954956054687,
"step": 24900
},
{
"epoch": 0.2594249068664584,
"grad_norm": 1.8465914726257324,
"learning_rate": 0.0002740585470129816,
"loss": 4.296747741699218,
"step": 25000
},
{
"epoch": 0.2604626064939243,
"grad_norm": 1.0613303184509277,
"learning_rate": 0.00027395477705023506,
"loss": 4.209448547363281,
"step": 25100
},
{
"epoch": 0.2615003061213901,
"grad_norm": 2.3083701133728027,
"learning_rate": 0.00027385100708748845,
"loss": 4.412258911132812,
"step": 25200
},
{
"epoch": 0.26253800574885594,
"grad_norm": 1.8509588241577148,
"learning_rate": 0.00027374723712474185,
"loss": 4.171485595703125,
"step": 25300
},
{
"epoch": 0.26357570537632175,
"grad_norm": 1.091736078262329,
"learning_rate": 0.00027364346716199524,
"loss": 4.24049560546875,
"step": 25400
},
{
"epoch": 0.2646134050037876,
"grad_norm": 1.201401710510254,
"learning_rate": 0.0002735396971992487,
"loss": 4.135834350585937,
"step": 25500
},
{
"epoch": 0.2656511046312534,
"grad_norm": 1.5545823574066162,
"learning_rate": 0.0002734359272365021,
"loss": 4.291419677734375,
"step": 25600
},
{
"epoch": 0.26668880425871927,
"grad_norm": 1.3560378551483154,
"learning_rate": 0.00027333215727375553,
"loss": 4.236996459960937,
"step": 25700
},
{
"epoch": 0.26772650388618513,
"grad_norm": 1.0210782289505005,
"learning_rate": 0.00027322838731100893,
"loss": 4.249810791015625,
"step": 25800
},
{
"epoch": 0.26876420351365093,
"grad_norm": 1.3093341588974,
"learning_rate": 0.0002731246173482623,
"loss": 4.195414428710937,
"step": 25900
},
{
"epoch": 0.2698019031411168,
"grad_norm": 1.7895358800888062,
"learning_rate": 0.00027302084738551577,
"loss": 4.180751037597656,
"step": 26000
},
{
"epoch": 0.2708396027685826,
"grad_norm": 11.451671600341797,
"learning_rate": 0.00027291707742276917,
"loss": 4.157826538085938,
"step": 26100
},
{
"epoch": 0.27187730239604846,
"grad_norm": 1.9708665609359741,
"learning_rate": 0.0002728133074600226,
"loss": 4.128204956054687,
"step": 26200
},
{
"epoch": 0.27291500202351426,
"grad_norm": 1.2628132104873657,
"learning_rate": 0.000272709537497276,
"loss": 4.281667785644531,
"step": 26300
},
{
"epoch": 0.2739527016509801,
"grad_norm": 2.2199666500091553,
"learning_rate": 0.0002726057675345294,
"loss": 4.237691650390625,
"step": 26400
},
{
"epoch": 0.27499040127844593,
"grad_norm": 2.815150022506714,
"learning_rate": 0.00027250199757178285,
"loss": 4.080834045410156,
"step": 26500
},
{
"epoch": 0.2760281009059118,
"grad_norm": 1.7167062759399414,
"learning_rate": 0.00027239822760903625,
"loss": 4.224625549316406,
"step": 26600
},
{
"epoch": 0.2770658005333776,
"grad_norm": 2.769949436187744,
"learning_rate": 0.0002722944576462897,
"loss": 4.3115145874023435,
"step": 26700
},
{
"epoch": 0.27810350016084345,
"grad_norm": 1.3523616790771484,
"learning_rate": 0.0002721906876835431,
"loss": 4.356557006835938,
"step": 26800
},
{
"epoch": 0.27914119978830926,
"grad_norm": 4.089077949523926,
"learning_rate": 0.00027208691772079654,
"loss": 4.286115112304688,
"step": 26900
},
{
"epoch": 0.2801788994157751,
"grad_norm": 1.1650248765945435,
"learning_rate": 0.00027198314775804994,
"loss": 4.335249328613282,
"step": 27000
},
{
"epoch": 0.2812165990432409,
"grad_norm": 1.8776350021362305,
"learning_rate": 0.00027187937779530333,
"loss": 4.274792175292969,
"step": 27100
},
{
"epoch": 0.2822542986707068,
"grad_norm": 3.665797710418701,
"learning_rate": 0.0002717756078325568,
"loss": 4.347820739746094,
"step": 27200
},
{
"epoch": 0.2832919982981726,
"grad_norm": 1.1905182600021362,
"learning_rate": 0.0002716718378698102,
"loss": 4.234444274902343,
"step": 27300
},
{
"epoch": 0.28432969792563845,
"grad_norm": 1.2664549350738525,
"learning_rate": 0.0002715680679070636,
"loss": 4.19026123046875,
"step": 27400
},
{
"epoch": 0.28536739755310425,
"grad_norm": 1.5952035188674927,
"learning_rate": 0.000271464297944317,
"loss": 4.284921569824219,
"step": 27500
},
{
"epoch": 0.2864050971805701,
"grad_norm": 1.5898215770721436,
"learning_rate": 0.0002713605279815704,
"loss": 4.128340759277344,
"step": 27600
},
{
"epoch": 0.28744279680803597,
"grad_norm": 1.701250433921814,
"learning_rate": 0.00027125675801882386,
"loss": 4.1064456176757815,
"step": 27700
},
{
"epoch": 0.2884804964355018,
"grad_norm": 2.2521140575408936,
"learning_rate": 0.00027115298805607726,
"loss": 4.188478698730469,
"step": 27800
},
{
"epoch": 0.28951819606296764,
"grad_norm": 1.428589105606079,
"learning_rate": 0.0002710492180933307,
"loss": 4.172950134277344,
"step": 27900
},
{
"epoch": 0.29055589569043344,
"grad_norm": 1.5243910551071167,
"learning_rate": 0.0002709454481305841,
"loss": 4.251683044433594,
"step": 28000
},
{
"epoch": 0.2915935953178993,
"grad_norm": 1.285276174545288,
"learning_rate": 0.0002708416781678375,
"loss": 4.291034851074219,
"step": 28100
},
{
"epoch": 0.2926312949453651,
"grad_norm": 1.2959215641021729,
"learning_rate": 0.0002707379082050909,
"loss": 4.223204040527344,
"step": 28200
},
{
"epoch": 0.29366899457283097,
"grad_norm": 1.9572069644927979,
"learning_rate": 0.00027063413824234434,
"loss": 4.1140069580078125,
"step": 28300
},
{
"epoch": 0.29470669420029677,
"grad_norm": 2.5625929832458496,
"learning_rate": 0.0002705303682795978,
"loss": 4.2418734741210935,
"step": 28400
},
{
"epoch": 0.29574439382776263,
"grad_norm": 1.657065510749817,
"learning_rate": 0.0002704265983168512,
"loss": 4.2059628295898435,
"step": 28500
},
{
"epoch": 0.29678209345522844,
"grad_norm": 1.4735133647918701,
"learning_rate": 0.0002703228283541046,
"loss": 4.232904663085938,
"step": 28600
},
{
"epoch": 0.2978197930826943,
"grad_norm": 2.643979549407959,
"learning_rate": 0.000270219058391358,
"loss": 4.151640930175781,
"step": 28700
},
{
"epoch": 0.2988574927101601,
"grad_norm": 1.5147004127502441,
"learning_rate": 0.0002701152884286114,
"loss": 4.171849060058594,
"step": 28800
},
{
"epoch": 0.29989519233762596,
"grad_norm": 1.4815659523010254,
"learning_rate": 0.00027001151846586487,
"loss": 4.120007019042969,
"step": 28900
},
{
"epoch": 0.30093289196509176,
"grad_norm": 3.8772029876708984,
"learning_rate": 0.00026990774850311827,
"loss": 4.113840637207031,
"step": 29000
},
{
"epoch": 0.3019705915925576,
"grad_norm": 1.8152740001678467,
"learning_rate": 0.0002698039785403717,
"loss": 4.143219604492187,
"step": 29100
},
{
"epoch": 0.30300829122002343,
"grad_norm": 1.3441669940948486,
"learning_rate": 0.0002697002085776251,
"loss": 4.151035461425781,
"step": 29200
},
{
"epoch": 0.3040459908474893,
"grad_norm": 2.0656609535217285,
"learning_rate": 0.0002695964386148785,
"loss": 4.229763793945312,
"step": 29300
},
{
"epoch": 0.3050836904749551,
"grad_norm": 2.8376095294952393,
"learning_rate": 0.0002694926686521319,
"loss": 4.2303158569335935,
"step": 29400
},
{
"epoch": 0.30612139010242095,
"grad_norm": 1.9161107540130615,
"learning_rate": 0.00026938889868938535,
"loss": 4.252763061523438,
"step": 29500
},
{
"epoch": 0.3071590897298868,
"grad_norm": 2.1317851543426514,
"learning_rate": 0.0002692851287266388,
"loss": 4.12993408203125,
"step": 29600
},
{
"epoch": 0.3081967893573526,
"grad_norm": 2.9762330055236816,
"learning_rate": 0.0002691813587638922,
"loss": 4.347277221679687,
"step": 29700
},
{
"epoch": 0.3092344889848185,
"grad_norm": 2.135929584503174,
"learning_rate": 0.0002690775888011456,
"loss": 4.052276611328125,
"step": 29800
},
{
"epoch": 0.3102721886122843,
"grad_norm": 1.3577543497085571,
"learning_rate": 0.000268973818838399,
"loss": 4.199589233398438,
"step": 29900
},
{
"epoch": 0.31130988823975014,
"grad_norm": 1.2834597826004028,
"learning_rate": 0.00026887004887565243,
"loss": 4.134565734863282,
"step": 30000
},
{
"epoch": 0.31234758786721595,
"grad_norm": 2.093669891357422,
"learning_rate": 0.00026876627891290583,
"loss": 4.183307495117187,
"step": 30100
},
{
"epoch": 0.3133852874946818,
"grad_norm": 1.1888537406921387,
"learning_rate": 0.0002686625089501593,
"loss": 4.022268371582031,
"step": 30200
},
{
"epoch": 0.3144229871221476,
"grad_norm": 1.4640058279037476,
"learning_rate": 0.00026855873898741267,
"loss": 4.191292724609375,
"step": 30300
},
{
"epoch": 0.3154606867496135,
"grad_norm": 0.9469636678695679,
"learning_rate": 0.00026845496902466607,
"loss": 4.2131259155273435,
"step": 30400
},
{
"epoch": 0.3164983863770793,
"grad_norm": 1.5227535963058472,
"learning_rate": 0.0002683511990619195,
"loss": 4.24783935546875,
"step": 30500
},
{
"epoch": 0.31753608600454514,
"grad_norm": 2.524731159210205,
"learning_rate": 0.0002682474290991729,
"loss": 4.206085205078125,
"step": 30600
},
{
"epoch": 0.31857378563201094,
"grad_norm": 2.7074637413024902,
"learning_rate": 0.00026814365913642636,
"loss": 3.964501953125,
"step": 30700
},
{
"epoch": 0.3196114852594768,
"grad_norm": 2.1479899883270264,
"learning_rate": 0.00026803988917367975,
"loss": 4.121002197265625,
"step": 30800
},
{
"epoch": 0.3206491848869426,
"grad_norm": 3.6871800422668457,
"learning_rate": 0.00026793611921093315,
"loss": 4.290604858398438,
"step": 30900
},
{
"epoch": 0.32168688451440847,
"grad_norm": 2.0092685222625732,
"learning_rate": 0.0002678323492481866,
"loss": 4.169475708007813,
"step": 31000
},
{
"epoch": 0.32272458414187427,
"grad_norm": 1.3000237941741943,
"learning_rate": 0.00026772857928544,
"loss": 4.096010131835937,
"step": 31100
},
{
"epoch": 0.32376228376934013,
"grad_norm": 2.161574125289917,
"learning_rate": 0.00026762480932269344,
"loss": 4.249325561523437,
"step": 31200
},
{
"epoch": 0.32479998339680594,
"grad_norm": 1.0579701662063599,
"learning_rate": 0.00026752103935994684,
"loss": 4.270779724121094,
"step": 31300
},
{
"epoch": 0.3258376830242718,
"grad_norm": 1.2264137268066406,
"learning_rate": 0.0002674172693972003,
"loss": 4.2367620849609375,
"step": 31400
},
{
"epoch": 0.3268753826517376,
"grad_norm": 3.2623612880706787,
"learning_rate": 0.0002673134994344537,
"loss": 4.160564575195313,
"step": 31500
},
{
"epoch": 0.32791308227920346,
"grad_norm": 2.1803345680236816,
"learning_rate": 0.0002672097294717071,
"loss": 4.203829040527344,
"step": 31600
},
{
"epoch": 0.3289507819066693,
"grad_norm": 1.9515228271484375,
"learning_rate": 0.0002671059595089605,
"loss": 4.315436706542969,
"step": 31700
},
{
"epoch": 0.3299884815341351,
"grad_norm": 3.0683810710906982,
"learning_rate": 0.0002670021895462139,
"loss": 4.155743103027344,
"step": 31800
},
{
"epoch": 0.331026181161601,
"grad_norm": 2.6642050743103027,
"learning_rate": 0.00026689841958346737,
"loss": 4.222473754882812,
"step": 31900
},
{
"epoch": 0.3320638807890668,
"grad_norm": 1.8333579301834106,
"learning_rate": 0.00026679464962072076,
"loss": 4.210680541992187,
"step": 32000
},
{
"epoch": 0.33310158041653265,
"grad_norm": 2.136242151260376,
"learning_rate": 0.00026669087965797416,
"loss": 4.144779357910156,
"step": 32100
},
{
"epoch": 0.33413928004399845,
"grad_norm": 0.9694802165031433,
"learning_rate": 0.0002665871096952276,
"loss": 4.1770632934570315,
"step": 32200
},
{
"epoch": 0.3351769796714643,
"grad_norm": 2.070678949356079,
"learning_rate": 0.000266483339732481,
"loss": 4.140425415039062,
"step": 32300
},
{
"epoch": 0.3362146792989301,
"grad_norm": 1.3420311212539673,
"learning_rate": 0.00026637956976973445,
"loss": 4.117745361328125,
"step": 32400
},
{
"epoch": 0.337252378926396,
"grad_norm": 1.7498325109481812,
"learning_rate": 0.00026627579980698785,
"loss": 4.090622253417969,
"step": 32500
},
{
"epoch": 0.3382900785538618,
"grad_norm": 5.7661848068237305,
"learning_rate": 0.00026617202984424124,
"loss": 4.212898864746093,
"step": 32600
},
{
"epoch": 0.33932777818132764,
"grad_norm": 1.856246829032898,
"learning_rate": 0.0002660682598814947,
"loss": 4.20351806640625,
"step": 32700
},
{
"epoch": 0.34036547780879345,
"grad_norm": 5.002403259277344,
"learning_rate": 0.0002659644899187481,
"loss": 4.095009765625,
"step": 32800
},
{
"epoch": 0.3414031774362593,
"grad_norm": 1.0896239280700684,
"learning_rate": 0.00026586071995600153,
"loss": 4.061651000976562,
"step": 32900
},
{
"epoch": 0.3424408770637251,
"grad_norm": 1.4536166191101074,
"learning_rate": 0.00026575694999325493,
"loss": 4.05192626953125,
"step": 33000
},
{
"epoch": 0.343478576691191,
"grad_norm": 3.966247081756592,
"learning_rate": 0.0002656531800305084,
"loss": 4.122060241699219,
"step": 33100
},
{
"epoch": 0.3445162763186568,
"grad_norm": 2.3092470169067383,
"learning_rate": 0.00026554941006776177,
"loss": 4.171341247558594,
"step": 33200
},
{
"epoch": 0.34555397594612264,
"grad_norm": 1.6187312602996826,
"learning_rate": 0.00026544564010501517,
"loss": 4.147681579589844,
"step": 33300
},
{
"epoch": 0.34659167557358844,
"grad_norm": 1.4459052085876465,
"learning_rate": 0.0002653418701422686,
"loss": 4.12395751953125,
"step": 33400
},
{
"epoch": 0.3476293752010543,
"grad_norm": 1.6370753049850464,
"learning_rate": 0.000265238100179522,
"loss": 4.043997192382813,
"step": 33500
},
{
"epoch": 0.34866707482852016,
"grad_norm": 2.5965089797973633,
"learning_rate": 0.00026513433021677546,
"loss": 4.149281311035156,
"step": 33600
},
{
"epoch": 0.34970477445598597,
"grad_norm": 1.4466602802276611,
"learning_rate": 0.00026503056025402885,
"loss": 4.153418884277344,
"step": 33700
},
{
"epoch": 0.3507424740834518,
"grad_norm": 1.1217280626296997,
"learning_rate": 0.00026492679029128225,
"loss": 4.173803405761719,
"step": 33800
},
{
"epoch": 0.35178017371091763,
"grad_norm": 2.853686809539795,
"learning_rate": 0.00026482302032853564,
"loss": 4.1212451171875,
"step": 33900
},
{
"epoch": 0.3528178733383835,
"grad_norm": 1.1508560180664062,
"learning_rate": 0.0002647192503657891,
"loss": 4.179091186523437,
"step": 34000
},
{
"epoch": 0.3538555729658493,
"grad_norm": 1.8668493032455444,
"learning_rate": 0.00026461548040304254,
"loss": 4.142960205078125,
"step": 34100
},
{
"epoch": 0.35489327259331516,
"grad_norm": 1.7272940874099731,
"learning_rate": 0.00026451171044029594,
"loss": 4.127975769042969,
"step": 34200
},
{
"epoch": 0.35593097222078096,
"grad_norm": 1.5529290437698364,
"learning_rate": 0.00026440794047754933,
"loss": 4.2190853881835935,
"step": 34300
},
{
"epoch": 0.3569686718482468,
"grad_norm": 1.506499171257019,
"learning_rate": 0.0002643041705148027,
"loss": 4.168932800292969,
"step": 34400
},
{
"epoch": 0.3580063714757126,
"grad_norm": 1.2258543968200684,
"learning_rate": 0.0002642004005520562,
"loss": 4.065081176757812,
"step": 34500
},
{
"epoch": 0.3590440711031785,
"grad_norm": 1.4408226013183594,
"learning_rate": 0.0002640966305893096,
"loss": 4.102992858886719,
"step": 34600
},
{
"epoch": 0.3600817707306443,
"grad_norm": 2.467862844467163,
"learning_rate": 0.000263992860626563,
"loss": 4.061658935546875,
"step": 34700
},
{
"epoch": 0.36111947035811015,
"grad_norm": 1.3214993476867676,
"learning_rate": 0.0002638890906638164,
"loss": 4.133033752441406,
"step": 34800
},
{
"epoch": 0.36215716998557596,
"grad_norm": 1.2223659753799438,
"learning_rate": 0.0002637853207010698,
"loss": 4.0944091796875,
"step": 34900
},
{
"epoch": 0.3631948696130418,
"grad_norm": 1.5864417552947998,
"learning_rate": 0.00026368155073832326,
"loss": 4.031897277832031,
"step": 35000
},
{
"epoch": 0.3642325692405076,
"grad_norm": 3.021804094314575,
"learning_rate": 0.00026357778077557665,
"loss": 4.253480224609375,
"step": 35100
},
{
"epoch": 0.3652702688679735,
"grad_norm": 2.419196844100952,
"learning_rate": 0.0002634740108128301,
"loss": 4.060654602050781,
"step": 35200
},
{
"epoch": 0.3663079684954393,
"grad_norm": 3.106058359146118,
"learning_rate": 0.00026337024085008355,
"loss": 4.167652282714844,
"step": 35300
},
{
"epoch": 0.36734566812290514,
"grad_norm": 2.6082842350006104,
"learning_rate": 0.00026326647088733695,
"loss": 4.126443481445312,
"step": 35400
},
{
"epoch": 0.368383367750371,
"grad_norm": 3.2292778491973877,
"learning_rate": 0.00026316270092459034,
"loss": 4.16947509765625,
"step": 35500
},
{
"epoch": 0.3694210673778368,
"grad_norm": 3.438127279281616,
"learning_rate": 0.00026305893096184374,
"loss": 4.18126220703125,
"step": 35600
},
{
"epoch": 0.37045876700530267,
"grad_norm": 1.1258721351623535,
"learning_rate": 0.0002629551609990972,
"loss": 4.133269348144531,
"step": 35700
},
{
"epoch": 0.3714964666327685,
"grad_norm": 2.0176923274993896,
"learning_rate": 0.00026285139103635063,
"loss": 4.000823059082031,
"step": 35800
},
{
"epoch": 0.37253416626023433,
"grad_norm": 2.162721872329712,
"learning_rate": 0.00026274762107360403,
"loss": 4.158842163085938,
"step": 35900
},
{
"epoch": 0.37357186588770014,
"grad_norm": 1.3159765005111694,
"learning_rate": 0.0002626438511108574,
"loss": 4.156724853515625,
"step": 36000
},
{
"epoch": 0.374609565515166,
"grad_norm": 1.8504067659378052,
"learning_rate": 0.0002625400811481108,
"loss": 4.074109191894531,
"step": 36100
},
{
"epoch": 0.3756472651426318,
"grad_norm": 1.3491618633270264,
"learning_rate": 0.00026243631118536427,
"loss": 4.117833557128907,
"step": 36200
},
{
"epoch": 0.37668496477009766,
"grad_norm": 1.1090528964996338,
"learning_rate": 0.00026233254122261766,
"loss": 4.0473480224609375,
"step": 36300
},
{
"epoch": 0.37772266439756347,
"grad_norm": 4.539895057678223,
"learning_rate": 0.0002622287712598711,
"loss": 4.0527517700195315,
"step": 36400
},
{
"epoch": 0.37876036402502933,
"grad_norm": 1.792636513710022,
"learning_rate": 0.0002621250012971245,
"loss": 3.9459353637695314,
"step": 36500
},
{
"epoch": 0.37979806365249513,
"grad_norm": 2.4098236560821533,
"learning_rate": 0.0002620212313343779,
"loss": 4.144781494140625,
"step": 36600
},
{
"epoch": 0.380835763279961,
"grad_norm": 1.8648608922958374,
"learning_rate": 0.00026191746137163135,
"loss": 4.104746704101562,
"step": 36700
},
{
"epoch": 0.3818734629074268,
"grad_norm": 2.071338653564453,
"learning_rate": 0.00026181369140888474,
"loss": 4.074734191894532,
"step": 36800
},
{
"epoch": 0.38291116253489266,
"grad_norm": 1.3856308460235596,
"learning_rate": 0.0002617099214461382,
"loss": 4.158983154296875,
"step": 36900
},
{
"epoch": 0.38394886216235846,
"grad_norm": 2.072495698928833,
"learning_rate": 0.0002616061514833916,
"loss": 4.08581787109375,
"step": 37000
},
{
"epoch": 0.3849865617898243,
"grad_norm": 1.3703645467758179,
"learning_rate": 0.00026150238152064504,
"loss": 4.006895446777344,
"step": 37100
},
{
"epoch": 0.3860242614172901,
"grad_norm": 2.7975013256073,
"learning_rate": 0.00026139861155789843,
"loss": 4.147843627929688,
"step": 37200
},
{
"epoch": 0.387061961044756,
"grad_norm": 3.56386661529541,
"learning_rate": 0.0002612948415951518,
"loss": 4.2793121337890625,
"step": 37300
},
{
"epoch": 0.3880996606722218,
"grad_norm": 2.8237593173980713,
"learning_rate": 0.0002611910716324053,
"loss": 4.13215087890625,
"step": 37400
},
{
"epoch": 0.38913736029968765,
"grad_norm": 1.2382421493530273,
"learning_rate": 0.00026108730166965867,
"loss": 4.071390991210937,
"step": 37500
},
{
"epoch": 0.3901750599271535,
"grad_norm": 1.620809555053711,
"learning_rate": 0.0002609835317069121,
"loss": 4.081386108398437,
"step": 37600
},
{
"epoch": 0.3912127595546193,
"grad_norm": 1.5530173778533936,
"learning_rate": 0.0002608797617441655,
"loss": 4.095469970703125,
"step": 37700
},
{
"epoch": 0.3922504591820852,
"grad_norm": 2.7742369174957275,
"learning_rate": 0.0002607759917814189,
"loss": 4.096160888671875,
"step": 37800
},
{
"epoch": 0.393288158809551,
"grad_norm": 1.0493942499160767,
"learning_rate": 0.00026067222181867236,
"loss": 4.000921936035156,
"step": 37900
},
{
"epoch": 0.39432585843701684,
"grad_norm": 4.1348958015441895,
"learning_rate": 0.00026056845185592575,
"loss": 3.991048583984375,
"step": 38000
},
{
"epoch": 0.39536355806448265,
"grad_norm": 4.481339454650879,
"learning_rate": 0.0002604646818931792,
"loss": 4.004680786132813,
"step": 38100
},
{
"epoch": 0.3964012576919485,
"grad_norm": 1.5849348306655884,
"learning_rate": 0.0002603609119304326,
"loss": 4.127825317382812,
"step": 38200
},
{
"epoch": 0.3974389573194143,
"grad_norm": 1.5340007543563843,
"learning_rate": 0.000260257141967686,
"loss": 4.126565551757812,
"step": 38300
},
{
"epoch": 0.39847665694688017,
"grad_norm": 1.9388331174850464,
"learning_rate": 0.00026015337200493944,
"loss": 4.147232666015625,
"step": 38400
},
{
"epoch": 0.399514356574346,
"grad_norm": 1.4936273097991943,
"learning_rate": 0.00026004960204219284,
"loss": 4.046693115234375,
"step": 38500
},
{
"epoch": 0.40055205620181183,
"grad_norm": 1.4128496646881104,
"learning_rate": 0.0002599458320794463,
"loss": 4.027592468261719,
"step": 38600
},
{
"epoch": 0.40158975582927764,
"grad_norm": 1.2070266008377075,
"learning_rate": 0.0002598420621166997,
"loss": 3.9974462890625,
"step": 38700
},
{
"epoch": 0.4026274554567435,
"grad_norm": 1.0721571445465088,
"learning_rate": 0.0002597382921539531,
"loss": 4.048193054199219,
"step": 38800
},
{
"epoch": 0.4036651550842093,
"grad_norm": 4.593639373779297,
"learning_rate": 0.00025963452219120647,
"loss": 3.9815548706054686,
"step": 38900
},
{
"epoch": 0.40470285471167516,
"grad_norm": 2.84889817237854,
"learning_rate": 0.0002595307522284599,
"loss": 4.118370666503906,
"step": 39000
},
{
"epoch": 0.40574055433914097,
"grad_norm": 1.6757389307022095,
"learning_rate": 0.00025942698226571337,
"loss": 4.095942077636718,
"step": 39100
},
{
"epoch": 0.40677825396660683,
"grad_norm": 3.5596885681152344,
"learning_rate": 0.00025932321230296676,
"loss": 4.0965576171875,
"step": 39200
},
{
"epoch": 0.40781595359407263,
"grad_norm": 1.0558372735977173,
"learning_rate": 0.0002592194423402202,
"loss": 4.239440307617188,
"step": 39300
},
{
"epoch": 0.4088536532215385,
"grad_norm": 5.334078311920166,
"learning_rate": 0.0002591156723774736,
"loss": 4.089285888671875,
"step": 39400
},
{
"epoch": 0.40989135284900435,
"grad_norm": 2.4086287021636963,
"learning_rate": 0.000259011902414727,
"loss": 4.103414611816406,
"step": 39500
},
{
"epoch": 0.41092905247647016,
"grad_norm": 4.432836055755615,
"learning_rate": 0.00025890813245198045,
"loss": 4.0577630615234375,
"step": 39600
},
{
"epoch": 0.411966752103936,
"grad_norm": 1.3129891157150269,
"learning_rate": 0.00025880436248923384,
"loss": 4.128912353515625,
"step": 39700
},
{
"epoch": 0.4130044517314018,
"grad_norm": 2.148174524307251,
"learning_rate": 0.0002587005925264873,
"loss": 4.197516174316406,
"step": 39800
},
{
"epoch": 0.4140421513588677,
"grad_norm": 6.447707176208496,
"learning_rate": 0.0002585968225637407,
"loss": 4.087812805175782,
"step": 39900
},
{
"epoch": 0.4150798509863335,
"grad_norm": 2.721989393234253,
"learning_rate": 0.0002584930526009941,
"loss": 3.9460833740234373,
"step": 40000
},
{
"epoch": 0.41611755061379935,
"grad_norm": 1.543135166168213,
"learning_rate": 0.0002583892826382475,
"loss": 4.02151611328125,
"step": 40100
},
{
"epoch": 0.41715525024126515,
"grad_norm": 1.4670268297195435,
"learning_rate": 0.0002582855126755009,
"loss": 4.18778564453125,
"step": 40200
},
{
"epoch": 0.418192949868731,
"grad_norm": 3.8556268215179443,
"learning_rate": 0.0002581817427127544,
"loss": 3.996910400390625,
"step": 40300
},
{
"epoch": 0.4192306494961968,
"grad_norm": 1.702594518661499,
"learning_rate": 0.00025807797275000777,
"loss": 4.031709594726562,
"step": 40400
},
{
"epoch": 0.4202683491236627,
"grad_norm": 1.2531317472457886,
"learning_rate": 0.00025797420278726117,
"loss": 4.188993835449219,
"step": 40500
},
{
"epoch": 0.4213060487511285,
"grad_norm": 2.5484142303466797,
"learning_rate": 0.00025787043282451456,
"loss": 4.031621398925782,
"step": 40600
},
{
"epoch": 0.42234374837859434,
"grad_norm": 1.823457956314087,
"learning_rate": 0.000257766662861768,
"loss": 4.001983032226563,
"step": 40700
},
{
"epoch": 0.42338144800606015,
"grad_norm": 1.9530704021453857,
"learning_rate": 0.0002576628928990214,
"loss": 4.030978088378906,
"step": 40800
},
{
"epoch": 0.424419147633526,
"grad_norm": 4.55501127243042,
"learning_rate": 0.00025755912293627485,
"loss": 4.062133178710938,
"step": 40900
},
{
"epoch": 0.4254568472609918,
"grad_norm": 1.9799492359161377,
"learning_rate": 0.00025745535297352825,
"loss": 3.9875259399414062,
"step": 41000
},
{
"epoch": 0.42649454688845767,
"grad_norm": 2.4329614639282227,
"learning_rate": 0.00025735158301078164,
"loss": 3.9634893798828124,
"step": 41100
},
{
"epoch": 0.4275322465159235,
"grad_norm": 1.3791182041168213,
"learning_rate": 0.0002572478130480351,
"loss": 4.171094055175781,
"step": 41200
},
{
"epoch": 0.42856994614338934,
"grad_norm": 1.4852691888809204,
"learning_rate": 0.0002571440430852885,
"loss": 4.059336547851562,
"step": 41300
},
{
"epoch": 0.4296076457708552,
"grad_norm": 2.191392183303833,
"learning_rate": 0.00025704027312254194,
"loss": 3.9574560546875,
"step": 41400
},
{
"epoch": 0.430645345398321,
"grad_norm": 3.4423017501831055,
"learning_rate": 0.0002569365031597954,
"loss": 3.990745849609375,
"step": 41500
},
{
"epoch": 0.43168304502578686,
"grad_norm": 2.979930877685547,
"learning_rate": 0.0002568327331970488,
"loss": 4.166605529785156,
"step": 41600
},
{
"epoch": 0.43272074465325266,
"grad_norm": 3.131230354309082,
"learning_rate": 0.0002567289632343022,
"loss": 4.026178894042968,
"step": 41700
},
{
"epoch": 0.4337584442807185,
"grad_norm": 1.578643798828125,
"learning_rate": 0.00025662519327155557,
"loss": 4.10739990234375,
"step": 41800
},
{
"epoch": 0.43479614390818433,
"grad_norm": 3.628096580505371,
"learning_rate": 0.000256521423308809,
"loss": 4.021985473632813,
"step": 41900
},
{
"epoch": 0.4358338435356502,
"grad_norm": 2.235994815826416,
"learning_rate": 0.0002564176533460624,
"loss": 4.138570251464844,
"step": 42000
},
{
"epoch": 0.436871543163116,
"grad_norm": 3.0459887981414795,
"learning_rate": 0.00025631388338331586,
"loss": 4.139791564941406,
"step": 42100
},
{
"epoch": 0.43790924279058185,
"grad_norm": 1.0590101480484009,
"learning_rate": 0.00025621011342056926,
"loss": 4.018776550292968,
"step": 42200
},
{
"epoch": 0.43894694241804766,
"grad_norm": 3.5735878944396973,
"learning_rate": 0.00025610634345782265,
"loss": 4.182121887207031,
"step": 42300
},
{
"epoch": 0.4399846420455135,
"grad_norm": 1.1051421165466309,
"learning_rate": 0.0002560025734950761,
"loss": 4.086949157714844,
"step": 42400
},
{
"epoch": 0.4410223416729793,
"grad_norm": 2.8680758476257324,
"learning_rate": 0.0002558988035323295,
"loss": 4.053037414550781,
"step": 42500
},
{
"epoch": 0.4420600413004452,
"grad_norm": 1.6805782318115234,
"learning_rate": 0.00025579503356958294,
"loss": 4.041470947265625,
"step": 42600
},
{
"epoch": 0.443097740927911,
"grad_norm": 1.7229841947555542,
"learning_rate": 0.00025569126360683634,
"loss": 4.1356103515625,
"step": 42700
},
{
"epoch": 0.44413544055537685,
"grad_norm": 1.4601655006408691,
"learning_rate": 0.00025558749364408973,
"loss": 4.052696533203125,
"step": 42800
},
{
"epoch": 0.44517314018284265,
"grad_norm": 1.552959680557251,
"learning_rate": 0.0002554837236813432,
"loss": 4.020947875976563,
"step": 42900
},
{
"epoch": 0.4462108398103085,
"grad_norm": 1.3446309566497803,
"learning_rate": 0.0002553799537185966,
"loss": 4.150856018066406,
"step": 43000
},
{
"epoch": 0.4472485394377743,
"grad_norm": 3.128110408782959,
"learning_rate": 0.00025527618375585003,
"loss": 4.118401794433594,
"step": 43100
},
{
"epoch": 0.4482862390652402,
"grad_norm": 1.328148603439331,
"learning_rate": 0.0002551724137931034,
"loss": 4.073428649902343,
"step": 43200
},
{
"epoch": 0.449323938692706,
"grad_norm": 1.5910078287124634,
"learning_rate": 0.00025506864383035687,
"loss": 4.110806579589844,
"step": 43300
},
{
"epoch": 0.45036163832017184,
"grad_norm": 1.2686039209365845,
"learning_rate": 0.00025496487386761027,
"loss": 4.007551574707032,
"step": 43400
},
{
"epoch": 0.4513993379476377,
"grad_norm": 4.290769577026367,
"learning_rate": 0.00025486110390486366,
"loss": 4.068913269042969,
"step": 43500
},
{
"epoch": 0.4524370375751035,
"grad_norm": 1.6915346384048462,
"learning_rate": 0.0002547573339421171,
"loss": 4.066489562988282,
"step": 43600
},
{
"epoch": 0.45347473720256937,
"grad_norm": 1.3425647020339966,
"learning_rate": 0.0002546535639793705,
"loss": 4.024351806640625,
"step": 43700
},
{
"epoch": 0.45451243683003517,
"grad_norm": 4.726262092590332,
"learning_rate": 0.00025454979401662395,
"loss": 4.055924987792968,
"step": 43800
},
{
"epoch": 0.45555013645750103,
"grad_norm": 1.3767929077148438,
"learning_rate": 0.00025444602405387735,
"loss": 4.1108706665039065,
"step": 43900
},
{
"epoch": 0.45658783608496684,
"grad_norm": 2.199096918106079,
"learning_rate": 0.00025434225409113074,
"loss": 4.032781982421875,
"step": 44000
},
{
"epoch": 0.4576255357124327,
"grad_norm": 1.529963731765747,
"learning_rate": 0.0002542384841283842,
"loss": 3.9078250122070313,
"step": 44100
},
{
"epoch": 0.4586632353398985,
"grad_norm": 2.381452798843384,
"learning_rate": 0.0002541347141656376,
"loss": 4.1637747192382815,
"step": 44200
},
{
"epoch": 0.45970093496736436,
"grad_norm": 1.3512217998504639,
"learning_rate": 0.00025403094420289104,
"loss": 4.1603765869140625,
"step": 44300
},
{
"epoch": 0.46073863459483017,
"grad_norm": 1.6877330541610718,
"learning_rate": 0.00025392717424014443,
"loss": 3.9833114624023436,
"step": 44400
},
{
"epoch": 0.461776334222296,
"grad_norm": 10.19050121307373,
"learning_rate": 0.0002538234042773978,
"loss": 4.087564086914062,
"step": 44500
},
{
"epoch": 0.46281403384976183,
"grad_norm": 2.2430684566497803,
"learning_rate": 0.0002537196343146512,
"loss": 3.943908386230469,
"step": 44600
},
{
"epoch": 0.4638517334772277,
"grad_norm": 1.8005903959274292,
"learning_rate": 0.00025361586435190467,
"loss": 4.026759948730469,
"step": 44700
},
{
"epoch": 0.4648894331046935,
"grad_norm": 1.3022342920303345,
"learning_rate": 0.0002535120943891581,
"loss": 4.106507263183594,
"step": 44800
},
{
"epoch": 0.46592713273215935,
"grad_norm": 1.1729425191879272,
"learning_rate": 0.0002534083244264115,
"loss": 4.0660693359375,
"step": 44900
},
{
"epoch": 0.46696483235962516,
"grad_norm": 1.7224327325820923,
"learning_rate": 0.0002533045544636649,
"loss": 3.9855413818359375,
"step": 45000
},
{
"epoch": 0.468002531987091,
"grad_norm": 1.6977527141571045,
"learning_rate": 0.0002532007845009183,
"loss": 3.813612976074219,
"step": 45100
},
{
"epoch": 0.4690402316145568,
"grad_norm": 2.9529614448547363,
"learning_rate": 0.00025309701453817175,
"loss": 3.995145263671875,
"step": 45200
},
{
"epoch": 0.4700779312420227,
"grad_norm": 3.1997270584106445,
"learning_rate": 0.0002529932445754252,
"loss": 4.031595153808594,
"step": 45300
},
{
"epoch": 0.47111563086948854,
"grad_norm": 5.878026008605957,
"learning_rate": 0.0002528894746126786,
"loss": 4.028975524902344,
"step": 45400
},
{
"epoch": 0.47215333049695435,
"grad_norm": 1.7146035432815552,
"learning_rate": 0.00025278570464993205,
"loss": 4.085393676757812,
"step": 45500
},
{
"epoch": 0.4731910301244202,
"grad_norm": 2.954148292541504,
"learning_rate": 0.00025268193468718544,
"loss": 4.039700622558594,
"step": 45600
},
{
"epoch": 0.474228729751886,
"grad_norm": 1.9127237796783447,
"learning_rate": 0.00025257816472443883,
"loss": 4.100406494140625,
"step": 45700
},
{
"epoch": 0.4752664293793519,
"grad_norm": 1.8794509172439575,
"learning_rate": 0.00025247439476169223,
"loss": 3.9390939331054686,
"step": 45800
},
{
"epoch": 0.4763041290068177,
"grad_norm": 2.165816307067871,
"learning_rate": 0.0002523706247989457,
"loss": 4.155856628417968,
"step": 45900
},
{
"epoch": 0.47734182863428354,
"grad_norm": 6.686591148376465,
"learning_rate": 0.00025226685483619913,
"loss": 4.097453918457031,
"step": 46000
},
{
"epoch": 0.47837952826174934,
"grad_norm": 2.4973371028900146,
"learning_rate": 0.0002521630848734525,
"loss": 4.200291137695313,
"step": 46100
},
{
"epoch": 0.4794172278892152,
"grad_norm": 2.1478147506713867,
"learning_rate": 0.0002520593149107059,
"loss": 3.899898681640625,
"step": 46200
},
{
"epoch": 0.480454927516681,
"grad_norm": 1.6290667057037354,
"learning_rate": 0.0002519555449479593,
"loss": 4.157419128417969,
"step": 46300
},
{
"epoch": 0.48149262714414687,
"grad_norm": 2.3697171211242676,
"learning_rate": 0.00025185177498521276,
"loss": 4.0068753051757815,
"step": 46400
},
{
"epoch": 0.48253032677161267,
"grad_norm": 3.123157501220703,
"learning_rate": 0.00025174800502246616,
"loss": 3.9923574829101565,
"step": 46500
},
{
"epoch": 0.48356802639907853,
"grad_norm": 3.4272193908691406,
"learning_rate": 0.0002516442350597196,
"loss": 4.144463195800781,
"step": 46600
},
{
"epoch": 0.48460572602654434,
"grad_norm": 2.8348467350006104,
"learning_rate": 0.000251540465096973,
"loss": 4.055748291015625,
"step": 46700
},
{
"epoch": 0.4856434256540102,
"grad_norm": 3.0261967182159424,
"learning_rate": 0.0002514366951342264,
"loss": 4.177880554199219,
"step": 46800
},
{
"epoch": 0.486681125281476,
"grad_norm": 10.726264953613281,
"learning_rate": 0.00025133292517147984,
"loss": 3.9125796508789064,
"step": 46900
},
{
"epoch": 0.48771882490894186,
"grad_norm": 8.811136245727539,
"learning_rate": 0.00025122915520873324,
"loss": 3.9216848754882814,
"step": 47000
},
{
"epoch": 0.48875652453640767,
"grad_norm": 6.8598151206970215,
"learning_rate": 0.0002511253852459867,
"loss": 3.9738433837890623,
"step": 47100
},
{
"epoch": 0.4897942241638735,
"grad_norm": 5.096536636352539,
"learning_rate": 0.0002510216152832401,
"loss": 3.998507080078125,
"step": 47200
},
{
"epoch": 0.4908319237913394,
"grad_norm": 1.4742202758789062,
"learning_rate": 0.00025091784532049353,
"loss": 4.171350402832031,
"step": 47300
},
{
"epoch": 0.4918696234188052,
"grad_norm": 1.88887357711792,
"learning_rate": 0.0002508140753577469,
"loss": 4.106647644042969,
"step": 47400
},
{
"epoch": 0.49290732304627105,
"grad_norm": 1.6502625942230225,
"learning_rate": 0.0002507103053950003,
"loss": 3.877885437011719,
"step": 47500
},
{
"epoch": 0.49394502267373686,
"grad_norm": 1.728053331375122,
"learning_rate": 0.00025060653543225377,
"loss": 4.064427795410157,
"step": 47600
},
{
"epoch": 0.4949827223012027,
"grad_norm": 4.632587432861328,
"learning_rate": 0.00025050276546950716,
"loss": 4.113824157714844,
"step": 47700
},
{
"epoch": 0.4960204219286685,
"grad_norm": 1.5823708772659302,
"learning_rate": 0.0002503989955067606,
"loss": 4.080696411132813,
"step": 47800
},
{
"epoch": 0.4970581215561344,
"grad_norm": 1.9801136255264282,
"learning_rate": 0.000250295225544014,
"loss": 3.945875549316406,
"step": 47900
},
{
"epoch": 0.4980958211836002,
"grad_norm": 1.3339368104934692,
"learning_rate": 0.0002501914555812674,
"loss": 3.951331787109375,
"step": 48000
},
{
"epoch": 0.49913352081106604,
"grad_norm": 2.1013355255126953,
"learning_rate": 0.00025008768561852085,
"loss": 4.022156372070312,
"step": 48100
},
{
"epoch": 0.5001712204385319,
"grad_norm": 2.7022488117218018,
"learning_rate": 0.00024998391565577425,
"loss": 3.9780624389648436,
"step": 48200
},
{
"epoch": 0.5012089200659977,
"grad_norm": 10.230494499206543,
"learning_rate": 0.0002498801456930277,
"loss": 4.024637145996094,
"step": 48300
},
{
"epoch": 0.5022466196934635,
"grad_norm": 7.242427349090576,
"learning_rate": 0.0002497763757302811,
"loss": 3.9954248046875,
"step": 48400
},
{
"epoch": 0.5032843193209293,
"grad_norm": 2.742445945739746,
"learning_rate": 0.0002496726057675345,
"loss": 3.9637130737304687,
"step": 48500
},
{
"epoch": 0.5043220189483952,
"grad_norm": 1.6320149898529053,
"learning_rate": 0.00024956883580478794,
"loss": 4.035350952148438,
"step": 48600
},
{
"epoch": 0.505359718575861,
"grad_norm": 2.239950180053711,
"learning_rate": 0.00024946506584204133,
"loss": 3.961440124511719,
"step": 48700
},
{
"epoch": 0.5063974182033268,
"grad_norm": 6.686822891235352,
"learning_rate": 0.0002493612958792948,
"loss": 4.003260498046875,
"step": 48800
},
{
"epoch": 0.5074351178307926,
"grad_norm": 1.9818964004516602,
"learning_rate": 0.0002492575259165482,
"loss": 4.018614501953125,
"step": 48900
},
{
"epoch": 0.5084728174582586,
"grad_norm": 1.5698004961013794,
"learning_rate": 0.00024915375595380157,
"loss": 4.045997314453125,
"step": 49000
},
{
"epoch": 0.5095105170857244,
"grad_norm": 2.3865158557891846,
"learning_rate": 0.000249049985991055,
"loss": 4.050853576660156,
"step": 49100
},
{
"epoch": 0.5105482167131902,
"grad_norm": 14.248946189880371,
"learning_rate": 0.0002489462160283084,
"loss": 3.991949462890625,
"step": 49200
},
{
"epoch": 0.5115859163406561,
"grad_norm": 1.279118537902832,
"learning_rate": 0.00024884244606556186,
"loss": 3.92796875,
"step": 49300
},
{
"epoch": 0.5126236159681219,
"grad_norm": 2.575704574584961,
"learning_rate": 0.00024873867610281526,
"loss": 4.12865478515625,
"step": 49400
},
{
"epoch": 0.5136613155955877,
"grad_norm": 2.0912930965423584,
"learning_rate": 0.0002486349061400687,
"loss": 4.042799682617187,
"step": 49500
},
{
"epoch": 0.5146990152230535,
"grad_norm": 2.6358580589294434,
"learning_rate": 0.0002485311361773221,
"loss": 4.069761047363281,
"step": 49600
},
{
"epoch": 0.5157367148505194,
"grad_norm": 2.6711385250091553,
"learning_rate": 0.0002484273662145755,
"loss": 3.9823483276367186,
"step": 49700
},
{
"epoch": 0.5167744144779852,
"grad_norm": 3.348376989364624,
"learning_rate": 0.00024832359625182894,
"loss": 4.119874572753906,
"step": 49800
},
{
"epoch": 0.517812114105451,
"grad_norm": 1.7040736675262451,
"learning_rate": 0.00024821982628908234,
"loss": 4.038002319335938,
"step": 49900
},
{
"epoch": 0.5188498137329168,
"grad_norm": 11.144097328186035,
"learning_rate": 0.0002481160563263358,
"loss": 3.933763122558594,
"step": 50000
},
{
"epoch": 0.5198875133603827,
"grad_norm": 3.1529595851898193,
"learning_rate": 0.0002480122863635892,
"loss": 3.990421142578125,
"step": 50100
},
{
"epoch": 0.5209252129878486,
"grad_norm": 2.3761773109436035,
"learning_rate": 0.0002479085164008426,
"loss": 3.9385421752929686,
"step": 50200
},
{
"epoch": 0.5219629126153144,
"grad_norm": 14.909253120422363,
"learning_rate": 0.00024780474643809597,
"loss": 3.924638671875,
"step": 50300
},
{
"epoch": 0.5230006122427802,
"grad_norm": 1.4870705604553223,
"learning_rate": 0.0002477009764753494,
"loss": 4.003363037109375,
"step": 50400
},
{
"epoch": 0.5240383118702461,
"grad_norm": 2.5456697940826416,
"learning_rate": 0.00024759720651260287,
"loss": 4.063373413085937,
"step": 50500
},
{
"epoch": 0.5250760114977119,
"grad_norm": 4.392611980438232,
"learning_rate": 0.00024749343654985627,
"loss": 4.108450927734375,
"step": 50600
},
{
"epoch": 0.5261137111251777,
"grad_norm": 2.8420300483703613,
"learning_rate": 0.00024738966658710966,
"loss": 3.9724908447265626,
"step": 50700
},
{
"epoch": 0.5271514107526435,
"grad_norm": 2.3819692134857178,
"learning_rate": 0.00024728589662436306,
"loss": 4.040487060546875,
"step": 50800
},
{
"epoch": 0.5281891103801094,
"grad_norm": 2.1021909713745117,
"learning_rate": 0.0002471821266616165,
"loss": 4.101463623046875,
"step": 50900
},
{
"epoch": 0.5292268100075752,
"grad_norm": 2.8605117797851562,
"learning_rate": 0.00024707835669886995,
"loss": 3.9426974487304687,
"step": 51000
},
{
"epoch": 0.530264509635041,
"grad_norm": 1.331457257270813,
"learning_rate": 0.00024697458673612335,
"loss": 4.005464172363281,
"step": 51100
},
{
"epoch": 0.5313022092625068,
"grad_norm": 2.4866714477539062,
"learning_rate": 0.00024687081677337674,
"loss": 4.089916687011719,
"step": 51200
},
{
"epoch": 0.5323399088899727,
"grad_norm": 6.342608451843262,
"learning_rate": 0.00024676704681063014,
"loss": 3.979620361328125,
"step": 51300
},
{
"epoch": 0.5333776085174385,
"grad_norm": 1.3954708576202393,
"learning_rate": 0.0002466632768478836,
"loss": 3.9805123901367185,
"step": 51400
},
{
"epoch": 0.5344153081449043,
"grad_norm": 24.8520450592041,
"learning_rate": 0.000246559506885137,
"loss": 4.0105502319335935,
"step": 51500
},
{
"epoch": 0.5354530077723703,
"grad_norm": 2.0366039276123047,
"learning_rate": 0.00024645573692239043,
"loss": 3.919516296386719,
"step": 51600
},
{
"epoch": 0.5364907073998361,
"grad_norm": 1.3017858266830444,
"learning_rate": 0.0002463519669596439,
"loss": 3.951867980957031,
"step": 51700
},
{
"epoch": 0.5375284070273019,
"grad_norm": 2.579885244369507,
"learning_rate": 0.0002462481969968973,
"loss": 3.960545959472656,
"step": 51800
},
{
"epoch": 0.5385661066547677,
"grad_norm": 1.5787100791931152,
"learning_rate": 0.00024614442703415067,
"loss": 4.013999938964844,
"step": 51900
},
{
"epoch": 0.5396038062822336,
"grad_norm": 3.9871633052825928,
"learning_rate": 0.00024604065707140406,
"loss": 3.950070495605469,
"step": 52000
},
{
"epoch": 0.5406415059096994,
"grad_norm": 1.572277545928955,
"learning_rate": 0.0002459368871086575,
"loss": 4.086417846679687,
"step": 52100
},
{
"epoch": 0.5416792055371652,
"grad_norm": 7.029146671295166,
"learning_rate": 0.0002458331171459109,
"loss": 3.8767724609375,
"step": 52200
},
{
"epoch": 0.542716905164631,
"grad_norm": 1.2442755699157715,
"learning_rate": 0.00024572934718316436,
"loss": 3.875315856933594,
"step": 52300
},
{
"epoch": 0.5437546047920969,
"grad_norm": 3.5381152629852295,
"learning_rate": 0.00024562557722041775,
"loss": 4.013727416992188,
"step": 52400
},
{
"epoch": 0.5447923044195627,
"grad_norm": 16.472898483276367,
"learning_rate": 0.00024552180725767115,
"loss": 4.058722839355469,
"step": 52500
},
{
"epoch": 0.5458300040470285,
"grad_norm": 1.4836983680725098,
"learning_rate": 0.0002454180372949246,
"loss": 4.106039123535156,
"step": 52600
},
{
"epoch": 0.5468677036744943,
"grad_norm": 4.735908031463623,
"learning_rate": 0.000245314267332178,
"loss": 4.109900817871094,
"step": 52700
},
{
"epoch": 0.5479054033019602,
"grad_norm": 1.7438913583755493,
"learning_rate": 0.00024521049736943144,
"loss": 4.098789978027344,
"step": 52800
},
{
"epoch": 0.548943102929426,
"grad_norm": 3.592564105987549,
"learning_rate": 0.00024510672740668483,
"loss": 3.9866278076171877,
"step": 52900
},
{
"epoch": 0.5499808025568919,
"grad_norm": 1.9763888120651245,
"learning_rate": 0.00024500295744393823,
"loss": 3.9620831298828123,
"step": 53000
},
{
"epoch": 0.5510185021843577,
"grad_norm": 1.0539793968200684,
"learning_rate": 0.0002448991874811917,
"loss": 4.006460266113281,
"step": 53100
},
{
"epoch": 0.5520562018118236,
"grad_norm": 2.2474358081817627,
"learning_rate": 0.00024479541751844507,
"loss": 4.067258605957031,
"step": 53200
},
{
"epoch": 0.5530939014392894,
"grad_norm": 1.5785913467407227,
"learning_rate": 0.0002446916475556985,
"loss": 4.057683715820312,
"step": 53300
},
{
"epoch": 0.5541316010667552,
"grad_norm": 2.2754416465759277,
"learning_rate": 0.0002445878775929519,
"loss": 3.9662628173828125,
"step": 53400
},
{
"epoch": 0.5551693006942211,
"grad_norm": 2.0118043422698975,
"learning_rate": 0.00024448410763020537,
"loss": 3.9848583984375,
"step": 53500
},
{
"epoch": 0.5562070003216869,
"grad_norm": 2.3987770080566406,
"learning_rate": 0.00024438033766745876,
"loss": 4.00030029296875,
"step": 53600
},
{
"epoch": 0.5572446999491527,
"grad_norm": 2.9198148250579834,
"learning_rate": 0.00024427656770471216,
"loss": 3.8882846069335937,
"step": 53700
},
{
"epoch": 0.5582823995766185,
"grad_norm": 2.0234696865081787,
"learning_rate": 0.0002441727977419656,
"loss": 3.9845794677734374,
"step": 53800
},
{
"epoch": 0.5593200992040844,
"grad_norm": 1.701568841934204,
"learning_rate": 0.000244069027779219,
"loss": 4.01090087890625,
"step": 53900
},
{
"epoch": 0.5603577988315502,
"grad_norm": 2.3093771934509277,
"learning_rate": 0.00024396525781647242,
"loss": 3.9678195190429686,
"step": 54000
},
{
"epoch": 0.561395498459016,
"grad_norm": 2.0182909965515137,
"learning_rate": 0.00024386148785372582,
"loss": 4.025320434570313,
"step": 54100
},
{
"epoch": 0.5624331980864818,
"grad_norm": 3.1341028213500977,
"learning_rate": 0.00024375771789097927,
"loss": 3.9826446533203126,
"step": 54200
},
{
"epoch": 0.5634708977139478,
"grad_norm": 2.025581121444702,
"learning_rate": 0.0002436539479282327,
"loss": 3.906527404785156,
"step": 54300
},
{
"epoch": 0.5645085973414136,
"grad_norm": 2.913895845413208,
"learning_rate": 0.00024355017796548608,
"loss": 3.970755920410156,
"step": 54400
},
{
"epoch": 0.5655462969688794,
"grad_norm": 1.9220850467681885,
"learning_rate": 0.0002434464080027395,
"loss": 3.943621826171875,
"step": 54500
},
{
"epoch": 0.5665839965963452,
"grad_norm": 1.2168983221054077,
"learning_rate": 0.0002433426380399929,
"loss": 3.9780545043945312,
"step": 54600
},
{
"epoch": 0.5676216962238111,
"grad_norm": 1.5367380380630493,
"learning_rate": 0.00024323886807724635,
"loss": 3.8468157958984377,
"step": 54700
},
{
"epoch": 0.5686593958512769,
"grad_norm": 2.7281689643859863,
"learning_rate": 0.00024313509811449977,
"loss": 3.9043319702148436,
"step": 54800
},
{
"epoch": 0.5696970954787427,
"grad_norm": 1.1875724792480469,
"learning_rate": 0.00024303132815175316,
"loss": 4.029020385742188,
"step": 54900
},
{
"epoch": 0.5707347951062085,
"grad_norm": 9.087173461914062,
"learning_rate": 0.00024292755818900659,
"loss": 3.977708740234375,
"step": 55000
},
{
"epoch": 0.5717724947336744,
"grad_norm": 1.94620943069458,
"learning_rate": 0.00024282378822626,
"loss": 3.9465988159179686,
"step": 55100
},
{
"epoch": 0.5728101943611402,
"grad_norm": 3.0396885871887207,
"learning_rate": 0.00024272001826351343,
"loss": 4.030888366699219,
"step": 55200
},
{
"epoch": 0.573847893988606,
"grad_norm": 1.557199239730835,
"learning_rate": 0.00024261624830076682,
"loss": 3.9756591796875,
"step": 55300
},
{
"epoch": 0.5748855936160719,
"grad_norm": 3.0625579357147217,
"learning_rate": 0.00024251247833802025,
"loss": 4.076784362792969,
"step": 55400
},
{
"epoch": 0.5759232932435377,
"grad_norm": 1.9166301488876343,
"learning_rate": 0.0002424087083752737,
"loss": 3.9604058837890626,
"step": 55500
},
{
"epoch": 0.5769609928710036,
"grad_norm": 1.2829216718673706,
"learning_rate": 0.0002423049384125271,
"loss": 3.841531066894531,
"step": 55600
},
{
"epoch": 0.5779986924984694,
"grad_norm": 2.9800634384155273,
"learning_rate": 0.0002422011684497805,
"loss": 3.915208435058594,
"step": 55700
},
{
"epoch": 0.5790363921259353,
"grad_norm": 4.931972026824951,
"learning_rate": 0.0002420973984870339,
"loss": 3.7610516357421875,
"step": 55800
},
{
"epoch": 0.5800740917534011,
"grad_norm": 3.796473264694214,
"learning_rate": 0.00024199362852428733,
"loss": 4.009695129394531,
"step": 55900
},
{
"epoch": 0.5811117913808669,
"grad_norm": 2.3635172843933105,
"learning_rate": 0.00024188985856154075,
"loss": 4.164959716796875,
"step": 56000
},
{
"epoch": 0.5821494910083327,
"grad_norm": 2.3295187950134277,
"learning_rate": 0.00024178608859879417,
"loss": 4.012393493652343,
"step": 56100
},
{
"epoch": 0.5831871906357986,
"grad_norm": 3.1501762866973877,
"learning_rate": 0.0002416823186360476,
"loss": 3.9226104736328127,
"step": 56200
},
{
"epoch": 0.5842248902632644,
"grad_norm": 2.8185627460479736,
"learning_rate": 0.000241578548673301,
"loss": 3.9830364990234375,
"step": 56300
},
{
"epoch": 0.5852625898907302,
"grad_norm": 2.39125657081604,
"learning_rate": 0.00024147477871055444,
"loss": 4.058615112304688,
"step": 56400
},
{
"epoch": 0.586300289518196,
"grad_norm": 2.658254623413086,
"learning_rate": 0.00024137100874780783,
"loss": 3.9012820434570314,
"step": 56500
},
{
"epoch": 0.5873379891456619,
"grad_norm": 2.873662233352661,
"learning_rate": 0.00024126723878506126,
"loss": 4.018562622070313,
"step": 56600
},
{
"epoch": 0.5883756887731277,
"grad_norm": 2.0522000789642334,
"learning_rate": 0.00024116346882231468,
"loss": 4.0417938232421875,
"step": 56700
},
{
"epoch": 0.5894133884005935,
"grad_norm": 2.688117742538452,
"learning_rate": 0.00024105969885956807,
"loss": 3.910294494628906,
"step": 56800
},
{
"epoch": 0.5904510880280593,
"grad_norm": 3.5324251651763916,
"learning_rate": 0.00024095592889682152,
"loss": 4.042366027832031,
"step": 56900
},
{
"epoch": 0.5914887876555253,
"grad_norm": 3.254483461380005,
"learning_rate": 0.00024085215893407492,
"loss": 3.875579833984375,
"step": 57000
},
{
"epoch": 0.5925264872829911,
"grad_norm": 1.4469491243362427,
"learning_rate": 0.00024074838897132834,
"loss": 3.8468057250976564,
"step": 57100
},
{
"epoch": 0.5935641869104569,
"grad_norm": 7.142496585845947,
"learning_rate": 0.00024064461900858173,
"loss": 3.9028366088867186,
"step": 57200
},
{
"epoch": 0.5946018865379228,
"grad_norm": 2.8328020572662354,
"learning_rate": 0.00024054084904583518,
"loss": 4.013849182128906,
"step": 57300
},
{
"epoch": 0.5956395861653886,
"grad_norm": 1.999799370765686,
"learning_rate": 0.0002404370790830886,
"loss": 3.9890103149414062,
"step": 57400
},
{
"epoch": 0.5966772857928544,
"grad_norm": 5.142120361328125,
"learning_rate": 0.000240333309120342,
"loss": 3.8782421875,
"step": 57500
},
{
"epoch": 0.5977149854203202,
"grad_norm": 2.6170506477355957,
"learning_rate": 0.00024022953915759542,
"loss": 3.9341799926757814,
"step": 57600
},
{
"epoch": 0.5987526850477861,
"grad_norm": 4.847115993499756,
"learning_rate": 0.00024012576919484882,
"loss": 4.028234252929687,
"step": 57700
},
{
"epoch": 0.5997903846752519,
"grad_norm": 3.093014717102051,
"learning_rate": 0.00024002199923210226,
"loss": 4.02893310546875,
"step": 57800
},
{
"epoch": 0.6008280843027177,
"grad_norm": 2.6559977531433105,
"learning_rate": 0.00023991822926935566,
"loss": 3.9997882080078124,
"step": 57900
},
{
"epoch": 0.6018657839301835,
"grad_norm": 1.5972485542297363,
"learning_rate": 0.00023981445930660908,
"loss": 3.9749560546875,
"step": 58000
},
{
"epoch": 0.6029034835576494,
"grad_norm": 3.777557134628296,
"learning_rate": 0.0002397106893438625,
"loss": 3.9969076538085937,
"step": 58100
},
{
"epoch": 0.6039411831851152,
"grad_norm": 1.8903939723968506,
"learning_rate": 0.00023960691938111593,
"loss": 4.007763977050781,
"step": 58200
},
{
"epoch": 0.604978882812581,
"grad_norm": 3.150963068008423,
"learning_rate": 0.00023950314941836935,
"loss": 4.019749145507813,
"step": 58300
},
{
"epoch": 0.6060165824400469,
"grad_norm": 1.934287190437317,
"learning_rate": 0.00023939937945562274,
"loss": 4.014994812011719,
"step": 58400
},
{
"epoch": 0.6070542820675128,
"grad_norm": 7.10530948638916,
"learning_rate": 0.00023929560949287616,
"loss": 4.050195617675781,
"step": 58500
},
{
"epoch": 0.6080919816949786,
"grad_norm": 2.367403030395508,
"learning_rate": 0.0002391918395301296,
"loss": 3.8701296997070314,
"step": 58600
},
{
"epoch": 0.6091296813224444,
"grad_norm": 1.9392305612564087,
"learning_rate": 0.000239088069567383,
"loss": 4.08440185546875,
"step": 58700
},
{
"epoch": 0.6101673809499102,
"grad_norm": 2.5947983264923096,
"learning_rate": 0.00023898429960463643,
"loss": 4.050205078125,
"step": 58800
},
{
"epoch": 0.6112050805773761,
"grad_norm": 2.1583032608032227,
"learning_rate": 0.00023888052964188982,
"loss": 3.958690490722656,
"step": 58900
},
{
"epoch": 0.6122427802048419,
"grad_norm": 1.6529427766799927,
"learning_rate": 0.00023877675967914325,
"loss": 3.9609234619140623,
"step": 59000
},
{
"epoch": 0.6132804798323077,
"grad_norm": 2.0239171981811523,
"learning_rate": 0.00023867298971639667,
"loss": 4.128135986328125,
"step": 59100
},
{
"epoch": 0.6143181794597736,
"grad_norm": 3.8679206371307373,
"learning_rate": 0.0002385692197536501,
"loss": 4.005528869628907,
"step": 59200
},
{
"epoch": 0.6153558790872394,
"grad_norm": 3.305494785308838,
"learning_rate": 0.0002384654497909035,
"loss": 3.9134161376953127,
"step": 59300
},
{
"epoch": 0.6163935787147052,
"grad_norm": 1.640649676322937,
"learning_rate": 0.0002383616798281569,
"loss": 3.92852783203125,
"step": 59400
},
{
"epoch": 0.617431278342171,
"grad_norm": 1.7184723615646362,
"learning_rate": 0.00023825790986541036,
"loss": 3.8771322631835936,
"step": 59500
},
{
"epoch": 0.618468977969637,
"grad_norm": 2.6886117458343506,
"learning_rate": 0.00023815413990266375,
"loss": 4.047822875976562,
"step": 59600
},
{
"epoch": 0.6195066775971028,
"grad_norm": 2.9485394954681396,
"learning_rate": 0.00023805036993991717,
"loss": 4.04974853515625,
"step": 59700
},
{
"epoch": 0.6205443772245686,
"grad_norm": 18.998411178588867,
"learning_rate": 0.00023794659997717057,
"loss": 3.978843994140625,
"step": 59800
},
{
"epoch": 0.6215820768520344,
"grad_norm": 1.6347628831863403,
"learning_rate": 0.000237842830014424,
"loss": 3.94311279296875,
"step": 59900
},
{
"epoch": 0.6226197764795003,
"grad_norm": 4.1301798820495605,
"learning_rate": 0.00023773906005167744,
"loss": 4.044434814453125,
"step": 60000
},
{
"epoch": 0.6236574761069661,
"grad_norm": 2.7278170585632324,
"learning_rate": 0.00023763529008893083,
"loss": 3.9771295166015626,
"step": 60100
},
{
"epoch": 0.6246951757344319,
"grad_norm": 3.4196488857269287,
"learning_rate": 0.00023753152012618426,
"loss": 3.9663619995117188,
"step": 60200
},
{
"epoch": 0.6257328753618977,
"grad_norm": 1.3134477138519287,
"learning_rate": 0.00023742775016343765,
"loss": 4.089789733886719,
"step": 60300
},
{
"epoch": 0.6267705749893636,
"grad_norm": 4.490455627441406,
"learning_rate": 0.0002373239802006911,
"loss": 3.87512939453125,
"step": 60400
},
{
"epoch": 0.6278082746168294,
"grad_norm": 3.0652222633361816,
"learning_rate": 0.00023722021023794452,
"loss": 3.893270263671875,
"step": 60500
},
{
"epoch": 0.6288459742442952,
"grad_norm": 8.751646995544434,
"learning_rate": 0.00023711644027519792,
"loss": 3.862340393066406,
"step": 60600
},
{
"epoch": 0.629883673871761,
"grad_norm": 2.9108734130859375,
"learning_rate": 0.00023701267031245134,
"loss": 3.9849557495117187,
"step": 60700
},
{
"epoch": 0.630921373499227,
"grad_norm": 2.250643253326416,
"learning_rate": 0.00023690890034970473,
"loss": 3.955241394042969,
"step": 60800
},
{
"epoch": 0.6319590731266927,
"grad_norm": 1.4363751411437988,
"learning_rate": 0.00023680513038695818,
"loss": 4.0179071044921875,
"step": 60900
},
{
"epoch": 0.6329967727541586,
"grad_norm": 1.6399027109146118,
"learning_rate": 0.00023670136042421158,
"loss": 3.911060485839844,
"step": 61000
},
{
"epoch": 0.6340344723816245,
"grad_norm": 2.371727228164673,
"learning_rate": 0.000236597590461465,
"loss": 3.9237380981445313,
"step": 61100
},
{
"epoch": 0.6350721720090903,
"grad_norm": 1.6354718208312988,
"learning_rate": 0.00023649382049871842,
"loss": 4.036581420898438,
"step": 61200
},
{
"epoch": 0.6361098716365561,
"grad_norm": 3.147254705429077,
"learning_rate": 0.00023639005053597184,
"loss": 4.009747619628906,
"step": 61300
},
{
"epoch": 0.6371475712640219,
"grad_norm": 2.9439003467559814,
"learning_rate": 0.00023628628057322526,
"loss": 3.965068664550781,
"step": 61400
},
{
"epoch": 0.6381852708914878,
"grad_norm": 2.8980836868286133,
"learning_rate": 0.00023618251061047866,
"loss": 3.99951171875,
"step": 61500
},
{
"epoch": 0.6392229705189536,
"grad_norm": 2.862438201904297,
"learning_rate": 0.00023607874064773208,
"loss": 3.8896145629882812,
"step": 61600
},
{
"epoch": 0.6402606701464194,
"grad_norm": 1.7125756740570068,
"learning_rate": 0.00023597497068498548,
"loss": 3.9900253295898436,
"step": 61700
},
{
"epoch": 0.6412983697738852,
"grad_norm": 13.891119956970215,
"learning_rate": 0.00023587120072223892,
"loss": 3.8787249755859374,
"step": 61800
},
{
"epoch": 0.6423360694013511,
"grad_norm": 3.5258827209472656,
"learning_rate": 0.00023576743075949235,
"loss": 3.940326843261719,
"step": 61900
},
{
"epoch": 0.6433737690288169,
"grad_norm": 4.297271251678467,
"learning_rate": 0.00023566366079674574,
"loss": 3.8732571411132812,
"step": 62000
},
{
"epoch": 0.6444114686562827,
"grad_norm": 3.574477195739746,
"learning_rate": 0.00023555989083399916,
"loss": 4.078603515625,
"step": 62100
},
{
"epoch": 0.6454491682837485,
"grad_norm": 3.2514758110046387,
"learning_rate": 0.00023545612087125259,
"loss": 3.956298522949219,
"step": 62200
},
{
"epoch": 0.6464868679112145,
"grad_norm": 2.582719326019287,
"learning_rate": 0.000235352350908506,
"loss": 3.8729116821289065,
"step": 62300
},
{
"epoch": 0.6475245675386803,
"grad_norm": 2.445774793624878,
"learning_rate": 0.00023524858094575943,
"loss": 4.064724426269532,
"step": 62400
},
{
"epoch": 0.6485622671661461,
"grad_norm": 4.912772178649902,
"learning_rate": 0.00023514481098301282,
"loss": 4.02049560546875,
"step": 62500
},
{
"epoch": 0.6495999667936119,
"grad_norm": 3.490936040878296,
"learning_rate": 0.00023504104102026627,
"loss": 3.912366943359375,
"step": 62600
},
{
"epoch": 0.6506376664210778,
"grad_norm": 2.109618902206421,
"learning_rate": 0.00023493727105751967,
"loss": 3.963838806152344,
"step": 62700
},
{
"epoch": 0.6516753660485436,
"grad_norm": 12.706518173217773,
"learning_rate": 0.0002348335010947731,
"loss": 3.901888732910156,
"step": 62800
},
{
"epoch": 0.6527130656760094,
"grad_norm": 4.266041278839111,
"learning_rate": 0.00023472973113202648,
"loss": 3.902781982421875,
"step": 62900
},
{
"epoch": 0.6537507653034752,
"grad_norm": 3.4900457859039307,
"learning_rate": 0.0002346259611692799,
"loss": 3.8866873168945313,
"step": 63000
},
{
"epoch": 0.6547884649309411,
"grad_norm": 2.4276134967803955,
"learning_rate": 0.00023452219120653336,
"loss": 3.8234634399414062,
"step": 63100
},
{
"epoch": 0.6558261645584069,
"grad_norm": 2.8377914428710938,
"learning_rate": 0.00023441842124378675,
"loss": 3.836332092285156,
"step": 63200
},
{
"epoch": 0.6568638641858727,
"grad_norm": 6.935495853424072,
"learning_rate": 0.00023431465128104017,
"loss": 4.100373229980469,
"step": 63300
},
{
"epoch": 0.6579015638133386,
"grad_norm": 2.90283465385437,
"learning_rate": 0.00023421088131829357,
"loss": 3.9408758544921874,
"step": 63400
},
{
"epoch": 0.6589392634408044,
"grad_norm": 2.8002378940582275,
"learning_rate": 0.00023410711135554702,
"loss": 3.9959124755859374,
"step": 63500
},
{
"epoch": 0.6599769630682703,
"grad_norm": 6.091791152954102,
"learning_rate": 0.0002340033413928004,
"loss": 3.9287460327148436,
"step": 63600
},
{
"epoch": 0.661014662695736,
"grad_norm": 1.2786389589309692,
"learning_rate": 0.00023389957143005383,
"loss": 4.015799560546875,
"step": 63700
},
{
"epoch": 0.662052362323202,
"grad_norm": 1.4586912393569946,
"learning_rate": 0.00023379580146730726,
"loss": 3.89241455078125,
"step": 63800
},
{
"epoch": 0.6630900619506678,
"grad_norm": 2.502657890319824,
"learning_rate": 0.00023369203150456065,
"loss": 3.9217596435546875,
"step": 63900
},
{
"epoch": 0.6641277615781336,
"grad_norm": 3.8019394874572754,
"learning_rate": 0.0002335882615418141,
"loss": 3.91360595703125,
"step": 64000
},
{
"epoch": 0.6651654612055994,
"grad_norm": 1.5058764219284058,
"learning_rate": 0.0002334844915790675,
"loss": 4.059972839355469,
"step": 64100
},
{
"epoch": 0.6662031608330653,
"grad_norm": 2.416229248046875,
"learning_rate": 0.00023338072161632092,
"loss": 3.9887905883789063,
"step": 64200
},
{
"epoch": 0.6672408604605311,
"grad_norm": 1.8767884969711304,
"learning_rate": 0.00023327695165357434,
"loss": 3.8748153686523437,
"step": 64300
},
{
"epoch": 0.6682785600879969,
"grad_norm": 1.7000967264175415,
"learning_rate": 0.00023317318169082776,
"loss": 3.9118023681640626,
"step": 64400
},
{
"epoch": 0.6693162597154627,
"grad_norm": 4.796393394470215,
"learning_rate": 0.00023306941172808118,
"loss": 3.9076058959960935,
"step": 64500
},
{
"epoch": 0.6703539593429286,
"grad_norm": 3.117870807647705,
"learning_rate": 0.00023296564176533458,
"loss": 3.95484375,
"step": 64600
},
{
"epoch": 0.6713916589703944,
"grad_norm": 1.6787638664245605,
"learning_rate": 0.000232861871802588,
"loss": 3.858246154785156,
"step": 64700
},
{
"epoch": 0.6724293585978602,
"grad_norm": 5.671106815338135,
"learning_rate": 0.0002327581018398414,
"loss": 3.9156753540039064,
"step": 64800
},
{
"epoch": 0.673467058225326,
"grad_norm": 7.058924674987793,
"learning_rate": 0.00023265433187709484,
"loss": 3.8724734497070314,
"step": 64900
},
{
"epoch": 0.674504757852792,
"grad_norm": 4.8587422370910645,
"learning_rate": 0.00023255056191434826,
"loss": 3.958966064453125,
"step": 65000
},
{
"epoch": 0.6755424574802578,
"grad_norm": 2.546802520751953,
"learning_rate": 0.00023244679195160166,
"loss": 3.9913558959960938,
"step": 65100
},
{
"epoch": 0.6765801571077236,
"grad_norm": 1.8444024324417114,
"learning_rate": 0.00023234302198885508,
"loss": 4.089451293945313,
"step": 65200
},
{
"epoch": 0.6776178567351895,
"grad_norm": 1.5202494859695435,
"learning_rate": 0.0002322392520261085,
"loss": 3.83590576171875,
"step": 65300
},
{
"epoch": 0.6786555563626553,
"grad_norm": 2.554324150085449,
"learning_rate": 0.00023213548206336192,
"loss": 3.9957940673828123,
"step": 65400
},
{
"epoch": 0.6796932559901211,
"grad_norm": 1.6007890701293945,
"learning_rate": 0.00023203171210061532,
"loss": 3.9022012329101563,
"step": 65500
},
{
"epoch": 0.6807309556175869,
"grad_norm": 2.593081474304199,
"learning_rate": 0.00023192794213786874,
"loss": 3.944790954589844,
"step": 65600
},
{
"epoch": 0.6817686552450528,
"grad_norm": 2.1474156379699707,
"learning_rate": 0.0002318241721751222,
"loss": 3.78737060546875,
"step": 65700
},
{
"epoch": 0.6828063548725186,
"grad_norm": 3.1960246562957764,
"learning_rate": 0.00023172040221237559,
"loss": 3.9783554077148438,
"step": 65800
},
{
"epoch": 0.6838440544999844,
"grad_norm": 3.8228328227996826,
"learning_rate": 0.000231616632249629,
"loss": 3.856565246582031,
"step": 65900
},
{
"epoch": 0.6848817541274502,
"grad_norm": 11.939492225646973,
"learning_rate": 0.0002315128622868824,
"loss": 3.8156298828125,
"step": 66000
},
{
"epoch": 0.6859194537549161,
"grad_norm": 1.8741025924682617,
"learning_rate": 0.00023140909232413582,
"loss": 3.9566534423828124,
"step": 66100
},
{
"epoch": 0.686957153382382,
"grad_norm": 1.682139277458191,
"learning_rate": 0.00023130532236138927,
"loss": 3.9164004516601563,
"step": 66200
},
{
"epoch": 0.6879948530098478,
"grad_norm": 1.1901954412460327,
"learning_rate": 0.00023120155239864267,
"loss": 4.0331982421875,
"step": 66300
},
{
"epoch": 0.6890325526373136,
"grad_norm": 2.2226786613464355,
"learning_rate": 0.0002310977824358961,
"loss": 3.901326904296875,
"step": 66400
},
{
"epoch": 0.6900702522647795,
"grad_norm": 2.28139328956604,
"learning_rate": 0.00023099401247314948,
"loss": 3.734437255859375,
"step": 66500
},
{
"epoch": 0.6911079518922453,
"grad_norm": 3.9518322944641113,
"learning_rate": 0.00023089024251040293,
"loss": 3.890718994140625,
"step": 66600
},
{
"epoch": 0.6921456515197111,
"grad_norm": 4.689309120178223,
"learning_rate": 0.00023078647254765633,
"loss": 3.83462646484375,
"step": 66700
},
{
"epoch": 0.6931833511471769,
"grad_norm": 2.5103607177734375,
"learning_rate": 0.00023068270258490975,
"loss": 3.8714788818359374,
"step": 66800
},
{
"epoch": 0.6942210507746428,
"grad_norm": 2.060398578643799,
"learning_rate": 0.00023057893262216317,
"loss": 3.8463949584960937,
"step": 66900
},
{
"epoch": 0.6952587504021086,
"grad_norm": 3.9058265686035156,
"learning_rate": 0.00023047516265941657,
"loss": 3.955802001953125,
"step": 67000
},
{
"epoch": 0.6962964500295744,
"grad_norm": 2.7018091678619385,
"learning_rate": 0.00023037139269667002,
"loss": 4.010853271484375,
"step": 67100
},
{
"epoch": 0.6973341496570403,
"grad_norm": 1.759364366531372,
"learning_rate": 0.0002302676227339234,
"loss": 3.8436270141601563,
"step": 67200
},
{
"epoch": 0.6983718492845061,
"grad_norm": 4.264219284057617,
"learning_rate": 0.00023016385277117683,
"loss": 3.906452941894531,
"step": 67300
},
{
"epoch": 0.6994095489119719,
"grad_norm": 2.064502000808716,
"learning_rate": 0.00023006008280843023,
"loss": 3.9249755859375,
"step": 67400
},
{
"epoch": 0.7004472485394377,
"grad_norm": 4.326413154602051,
"learning_rate": 0.00022995631284568368,
"loss": 3.9763421630859375,
"step": 67500
},
{
"epoch": 0.7014849481669037,
"grad_norm": 1.5424126386642456,
"learning_rate": 0.0002298525428829371,
"loss": 3.9105490112304686,
"step": 67600
},
{
"epoch": 0.7025226477943695,
"grad_norm": 3.1067123413085938,
"learning_rate": 0.0002297487729201905,
"loss": 4.066288146972656,
"step": 67700
},
{
"epoch": 0.7035603474218353,
"grad_norm": 1.3455185890197754,
"learning_rate": 0.00022964500295744392,
"loss": 3.906605224609375,
"step": 67800
},
{
"epoch": 0.7045980470493011,
"grad_norm": 4.567904472351074,
"learning_rate": 0.0002295412329946973,
"loss": 3.8274655151367187,
"step": 67900
},
{
"epoch": 0.705635746676767,
"grad_norm": 1.4911061525344849,
"learning_rate": 0.00022943746303195076,
"loss": 3.8712289428710935,
"step": 68000
},
{
"epoch": 0.7066734463042328,
"grad_norm": 1.8636422157287598,
"learning_rate": 0.00022933369306920418,
"loss": 3.9435845947265626,
"step": 68100
},
{
"epoch": 0.7077111459316986,
"grad_norm": 4.616937637329102,
"learning_rate": 0.00022922992310645758,
"loss": 4.073515319824219,
"step": 68200
},
{
"epoch": 0.7087488455591644,
"grad_norm": 2.339660167694092,
"learning_rate": 0.000229126153143711,
"loss": 3.752909851074219,
"step": 68300
},
{
"epoch": 0.7097865451866303,
"grad_norm": 2.2960572242736816,
"learning_rate": 0.00022902238318096442,
"loss": 3.841389465332031,
"step": 68400
},
{
"epoch": 0.7108242448140961,
"grad_norm": 1.9303183555603027,
"learning_rate": 0.00022891861321821784,
"loss": 4.007230529785156,
"step": 68500
},
{
"epoch": 0.7118619444415619,
"grad_norm": 3.3750216960906982,
"learning_rate": 0.00022881484325547124,
"loss": 4.0530221557617185,
"step": 68600
},
{
"epoch": 0.7128996440690277,
"grad_norm": 3.9443397521972656,
"learning_rate": 0.00022871107329272466,
"loss": 3.92802734375,
"step": 68700
},
{
"epoch": 0.7139373436964936,
"grad_norm": 2.2526562213897705,
"learning_rate": 0.0002286073033299781,
"loss": 4.117896728515625,
"step": 68800
},
{
"epoch": 0.7149750433239594,
"grad_norm": 3.631329298019409,
"learning_rate": 0.0002285035333672315,
"loss": 3.876401062011719,
"step": 68900
},
{
"epoch": 0.7160127429514253,
"grad_norm": 2.0594444274902344,
"learning_rate": 0.00022839976340448492,
"loss": 3.9595294189453125,
"step": 69000
},
{
"epoch": 0.7170504425788912,
"grad_norm": 6.801323413848877,
"learning_rate": 0.00022829599344173832,
"loss": 3.966697998046875,
"step": 69100
},
{
"epoch": 0.718088142206357,
"grad_norm": 3.579699754714966,
"learning_rate": 0.00022819222347899174,
"loss": 3.9083868408203126,
"step": 69200
},
{
"epoch": 0.7191258418338228,
"grad_norm": 3.9111030101776123,
"learning_rate": 0.0002280884535162452,
"loss": 4.020595092773437,
"step": 69300
},
{
"epoch": 0.7201635414612886,
"grad_norm": 1.5465009212493896,
"learning_rate": 0.00022798468355349858,
"loss": 4.002583618164063,
"step": 69400
},
{
"epoch": 0.7212012410887545,
"grad_norm": 2.5977070331573486,
"learning_rate": 0.000227880913590752,
"loss": 3.82881591796875,
"step": 69500
},
{
"epoch": 0.7222389407162203,
"grad_norm": 3.807143211364746,
"learning_rate": 0.0002277771436280054,
"loss": 3.8127020263671874,
"step": 69600
},
{
"epoch": 0.7232766403436861,
"grad_norm": 3.562692165374756,
"learning_rate": 0.00022767337366525885,
"loss": 3.861103820800781,
"step": 69700
},
{
"epoch": 0.7243143399711519,
"grad_norm": 4.136765003204346,
"learning_rate": 0.00022756960370251225,
"loss": 3.817465515136719,
"step": 69800
},
{
"epoch": 0.7253520395986178,
"grad_norm": 1.9534144401550293,
"learning_rate": 0.00022746583373976567,
"loss": 3.784884338378906,
"step": 69900
},
{
"epoch": 0.7263897392260836,
"grad_norm": 2.2738490104675293,
"learning_rate": 0.0002273620637770191,
"loss": 3.9553741455078124,
"step": 70000
},
{
"epoch": 0.7274274388535494,
"grad_norm": 8.41178035736084,
"learning_rate": 0.00022725829381427248,
"loss": 3.9581622314453124,
"step": 70100
},
{
"epoch": 0.7284651384810152,
"grad_norm": 2.574738025665283,
"learning_rate": 0.00022715452385152593,
"loss": 3.865647888183594,
"step": 70200
},
{
"epoch": 0.7295028381084812,
"grad_norm": 4.12198543548584,
"learning_rate": 0.00022705075388877933,
"loss": 3.8447744750976565,
"step": 70300
},
{
"epoch": 0.730540537735947,
"grad_norm": 3.4615478515625,
"learning_rate": 0.00022694698392603275,
"loss": 3.8417919921875,
"step": 70400
},
{
"epoch": 0.7315782373634128,
"grad_norm": 1.9662399291992188,
"learning_rate": 0.00022684321396328614,
"loss": 3.943636779785156,
"step": 70500
},
{
"epoch": 0.7326159369908786,
"grad_norm": 6.054515361785889,
"learning_rate": 0.0002267394440005396,
"loss": 3.9477130126953126,
"step": 70600
},
{
"epoch": 0.7336536366183445,
"grad_norm": 2.6368846893310547,
"learning_rate": 0.00022663567403779302,
"loss": 3.9134860229492188,
"step": 70700
},
{
"epoch": 0.7346913362458103,
"grad_norm": 18.437114715576172,
"learning_rate": 0.0002265319040750464,
"loss": 3.9025979614257813,
"step": 70800
},
{
"epoch": 0.7357290358732761,
"grad_norm": 3.9227664470672607,
"learning_rate": 0.00022642813411229983,
"loss": 3.9925546264648437,
"step": 70900
},
{
"epoch": 0.736766735500742,
"grad_norm": 2.9096601009368896,
"learning_rate": 0.00022632436414955323,
"loss": 3.7520477294921877,
"step": 71000
},
{
"epoch": 0.7378044351282078,
"grad_norm": 2.756199598312378,
"learning_rate": 0.00022622059418680668,
"loss": 3.7744400024414064,
"step": 71100
},
{
"epoch": 0.7388421347556736,
"grad_norm": 4.398651123046875,
"learning_rate": 0.0002261168242240601,
"loss": 3.8754537963867186,
"step": 71200
},
{
"epoch": 0.7398798343831394,
"grad_norm": 3.0455260276794434,
"learning_rate": 0.0002260130542613135,
"loss": 3.8303518676757813,
"step": 71300
},
{
"epoch": 0.7409175340106053,
"grad_norm": 1.6435341835021973,
"learning_rate": 0.00022590928429856692,
"loss": 3.868741149902344,
"step": 71400
},
{
"epoch": 0.7419552336380711,
"grad_norm": 2.460381507873535,
"learning_rate": 0.00022580551433582034,
"loss": 3.971143798828125,
"step": 71500
},
{
"epoch": 0.742992933265537,
"grad_norm": 3.793260335922241,
"learning_rate": 0.00022570174437307376,
"loss": 3.9564599609375,
"step": 71600
},
{
"epoch": 0.7440306328930028,
"grad_norm": 2.2400221824645996,
"learning_rate": 0.00022559797441032715,
"loss": 3.868074951171875,
"step": 71700
},
{
"epoch": 0.7450683325204687,
"grad_norm": 4.521097660064697,
"learning_rate": 0.00022549420444758058,
"loss": 3.9104345703125,
"step": 71800
},
{
"epoch": 0.7461060321479345,
"grad_norm": 2.454610824584961,
"learning_rate": 0.00022539043448483402,
"loss": 3.8415142822265627,
"step": 71900
},
{
"epoch": 0.7471437317754003,
"grad_norm": 1.7384246587753296,
"learning_rate": 0.00022528666452208742,
"loss": 3.9767572021484376,
"step": 72000
},
{
"epoch": 0.7481814314028661,
"grad_norm": 2.3506603240966797,
"learning_rate": 0.00022518289455934084,
"loss": 3.804529724121094,
"step": 72100
},
{
"epoch": 0.749219131030332,
"grad_norm": 8.719681739807129,
"learning_rate": 0.00022507912459659424,
"loss": 3.6692437744140625,
"step": 72200
},
{
"epoch": 0.7502568306577978,
"grad_norm": 2.188565254211426,
"learning_rate": 0.00022497535463384766,
"loss": 3.9966400146484373,
"step": 72300
},
{
"epoch": 0.7512945302852636,
"grad_norm": 2.7061383724212646,
"learning_rate": 0.00022487158467110108,
"loss": 3.7955560302734375,
"step": 72400
},
{
"epoch": 0.7523322299127294,
"grad_norm": 1.820816993713379,
"learning_rate": 0.0002247678147083545,
"loss": 3.800717468261719,
"step": 72500
},
{
"epoch": 0.7533699295401953,
"grad_norm": 2.3510568141937256,
"learning_rate": 0.00022466404474560792,
"loss": 3.8987237548828126,
"step": 72600
},
{
"epoch": 0.7544076291676611,
"grad_norm": 3.0852279663085938,
"learning_rate": 0.00022456027478286132,
"loss": 3.9560122680664063,
"step": 72700
},
{
"epoch": 0.7554453287951269,
"grad_norm": 2.3377742767333984,
"learning_rate": 0.00022445650482011477,
"loss": 3.9077328491210936,
"step": 72800
},
{
"epoch": 0.7564830284225929,
"grad_norm": 4.257030010223389,
"learning_rate": 0.00022435273485736816,
"loss": 3.915125732421875,
"step": 72900
},
{
"epoch": 0.7575207280500587,
"grad_norm": 1.8238855600357056,
"learning_rate": 0.00022424896489462158,
"loss": 3.8456768798828125,
"step": 73000
},
{
"epoch": 0.7585584276775245,
"grad_norm": 2.2102901935577393,
"learning_rate": 0.000224145194931875,
"loss": 3.9905462646484375,
"step": 73100
},
{
"epoch": 0.7595961273049903,
"grad_norm": 6.003772735595703,
"learning_rate": 0.0002240414249691284,
"loss": 3.831954040527344,
"step": 73200
},
{
"epoch": 0.7606338269324562,
"grad_norm": 2.209681272506714,
"learning_rate": 0.00022393765500638185,
"loss": 3.96739990234375,
"step": 73300
},
{
"epoch": 0.761671526559922,
"grad_norm": 5.8811235427856445,
"learning_rate": 0.00022383388504363525,
"loss": 3.8418869018554687,
"step": 73400
},
{
"epoch": 0.7627092261873878,
"grad_norm": 1.9358527660369873,
"learning_rate": 0.00022373011508088867,
"loss": 3.9846435546875,
"step": 73500
},
{
"epoch": 0.7637469258148536,
"grad_norm": 4.668230056762695,
"learning_rate": 0.00022362634511814206,
"loss": 3.87702880859375,
"step": 73600
},
{
"epoch": 0.7647846254423195,
"grad_norm": 2.1674551963806152,
"learning_rate": 0.0002235225751553955,
"loss": 3.9948715209960937,
"step": 73700
},
{
"epoch": 0.7658223250697853,
"grad_norm": 3.276775360107422,
"learning_rate": 0.00022341880519264893,
"loss": 3.876432189941406,
"step": 73800
},
{
"epoch": 0.7668600246972511,
"grad_norm": 2.382432222366333,
"learning_rate": 0.00022331503522990233,
"loss": 3.9535626220703124,
"step": 73900
},
{
"epoch": 0.7678977243247169,
"grad_norm": 2.288184404373169,
"learning_rate": 0.00022321126526715575,
"loss": 3.962213134765625,
"step": 74000
},
{
"epoch": 0.7689354239521828,
"grad_norm": 11.535764694213867,
"learning_rate": 0.00022310749530440914,
"loss": 3.839007568359375,
"step": 74100
},
{
"epoch": 0.7699731235796486,
"grad_norm": 2.520615816116333,
"learning_rate": 0.0002230037253416626,
"loss": 3.942041015625,
"step": 74200
},
{
"epoch": 0.7710108232071144,
"grad_norm": 5.035190582275391,
"learning_rate": 0.000222899955378916,
"loss": 3.827362365722656,
"step": 74300
},
{
"epoch": 0.7720485228345803,
"grad_norm": 2.1133370399475098,
"learning_rate": 0.0002227961854161694,
"loss": 3.8085946655273437,
"step": 74400
},
{
"epoch": 0.7730862224620462,
"grad_norm": 3.3813223838806152,
"learning_rate": 0.00022269241545342283,
"loss": 3.8528924560546876,
"step": 74500
},
{
"epoch": 0.774123922089512,
"grad_norm": 2.5912599563598633,
"learning_rate": 0.00022258864549067625,
"loss": 4.025367126464844,
"step": 74600
},
{
"epoch": 0.7751616217169778,
"grad_norm": 8.560553550720215,
"learning_rate": 0.00022248487552792968,
"loss": 3.8942611694335936,
"step": 74700
},
{
"epoch": 0.7761993213444436,
"grad_norm": 2.7210657596588135,
"learning_rate": 0.00022238110556518307,
"loss": 3.7450421142578123,
"step": 74800
},
{
"epoch": 0.7772370209719095,
"grad_norm": 3.06449031829834,
"learning_rate": 0.0002222773356024365,
"loss": 4.058497619628906,
"step": 74900
},
{
"epoch": 0.7782747205993753,
"grad_norm": 2.6780056953430176,
"learning_rate": 0.00022217356563968994,
"loss": 3.908025207519531,
"step": 75000
},
{
"epoch": 0.7793124202268411,
"grad_norm": 2.579087257385254,
"learning_rate": 0.00022206979567694334,
"loss": 3.914963684082031,
"step": 75100
},
{
"epoch": 0.780350119854307,
"grad_norm": 6.844696998596191,
"learning_rate": 0.00022196602571419676,
"loss": 3.8832046508789064,
"step": 75200
},
{
"epoch": 0.7813878194817728,
"grad_norm": 7.694204330444336,
"learning_rate": 0.00022186225575145015,
"loss": 3.9718392944335936,
"step": 75300
},
{
"epoch": 0.7824255191092386,
"grad_norm": 9.200462341308594,
"learning_rate": 0.00022175848578870358,
"loss": 3.859333801269531,
"step": 75400
},
{
"epoch": 0.7834632187367044,
"grad_norm": 4.622501850128174,
"learning_rate": 0.000221654715825957,
"loss": 3.9099847412109376,
"step": 75500
},
{
"epoch": 0.7845009183641704,
"grad_norm": 1.9592938423156738,
"learning_rate": 0.00022155094586321042,
"loss": 3.8727886962890623,
"step": 75600
},
{
"epoch": 0.7855386179916362,
"grad_norm": 4.431970119476318,
"learning_rate": 0.00022144717590046384,
"loss": 3.9126931762695314,
"step": 75700
},
{
"epoch": 0.786576317619102,
"grad_norm": 4.069213390350342,
"learning_rate": 0.00022134340593771724,
"loss": 3.8846563720703124,
"step": 75800
},
{
"epoch": 0.7876140172465678,
"grad_norm": 2.009706497192383,
"learning_rate": 0.00022123963597497068,
"loss": 3.951784362792969,
"step": 75900
},
{
"epoch": 0.7886517168740337,
"grad_norm": 3.475999116897583,
"learning_rate": 0.00022113586601222408,
"loss": 3.8493191528320314,
"step": 76000
},
{
"epoch": 0.7896894165014995,
"grad_norm": 2.45090913772583,
"learning_rate": 0.0002210320960494775,
"loss": 3.938821105957031,
"step": 76100
},
{
"epoch": 0.7907271161289653,
"grad_norm": 3.2572762966156006,
"learning_rate": 0.0002209283260867309,
"loss": 3.8848175048828124,
"step": 76200
},
{
"epoch": 0.7917648157564311,
"grad_norm": 2.2695441246032715,
"learning_rate": 0.00022082455612398432,
"loss": 3.8166204833984376,
"step": 76300
},
{
"epoch": 0.792802515383897,
"grad_norm": 6.520568370819092,
"learning_rate": 0.00022072078616123777,
"loss": 3.8947482299804688,
"step": 76400
},
{
"epoch": 0.7938402150113628,
"grad_norm": 9.233070373535156,
"learning_rate": 0.00022061701619849116,
"loss": 3.8395782470703126,
"step": 76500
},
{
"epoch": 0.7948779146388286,
"grad_norm": 1.5229090452194214,
"learning_rate": 0.00022051324623574458,
"loss": 3.979128723144531,
"step": 76600
},
{
"epoch": 0.7959156142662944,
"grad_norm": 3.9737226963043213,
"learning_rate": 0.00022040947627299798,
"loss": 3.890586242675781,
"step": 76700
},
{
"epoch": 0.7969533138937603,
"grad_norm": 1.9717073440551758,
"learning_rate": 0.00022030570631025143,
"loss": 3.971199951171875,
"step": 76800
},
{
"epoch": 0.7979910135212261,
"grad_norm": 3.3416688442230225,
"learning_rate": 0.00022020193634750485,
"loss": 3.961914367675781,
"step": 76900
},
{
"epoch": 0.799028713148692,
"grad_norm": 2.037693738937378,
"learning_rate": 0.00022009816638475824,
"loss": 3.8637881469726563,
"step": 77000
},
{
"epoch": 0.8000664127761579,
"grad_norm": 5.026768207550049,
"learning_rate": 0.00021999439642201167,
"loss": 3.9692828369140627,
"step": 77100
},
{
"epoch": 0.8011041124036237,
"grad_norm": 2.230590581893921,
"learning_rate": 0.00021989062645926506,
"loss": 3.852244873046875,
"step": 77200
},
{
"epoch": 0.8021418120310895,
"grad_norm": 2.0119717121124268,
"learning_rate": 0.0002197868564965185,
"loss": 3.9774188232421874,
"step": 77300
},
{
"epoch": 0.8031795116585553,
"grad_norm": 5.08432674407959,
"learning_rate": 0.0002196830865337719,
"loss": 3.8257907104492186,
"step": 77400
},
{
"epoch": 0.8042172112860212,
"grad_norm": 3.0086820125579834,
"learning_rate": 0.00021957931657102533,
"loss": 3.865489501953125,
"step": 77500
},
{
"epoch": 0.805254910913487,
"grad_norm": 4.534199237823486,
"learning_rate": 0.00021947554660827875,
"loss": 3.875529479980469,
"step": 77600
},
{
"epoch": 0.8062926105409528,
"grad_norm": 2.68324613571167,
"learning_rate": 0.00021937177664553217,
"loss": 3.928450927734375,
"step": 77700
},
{
"epoch": 0.8073303101684186,
"grad_norm": 3.7302651405334473,
"learning_rate": 0.0002192680066827856,
"loss": 3.9593939208984374,
"step": 77800
},
{
"epoch": 0.8083680097958845,
"grad_norm": 2.8160176277160645,
"learning_rate": 0.000219164236720039,
"loss": 4.003828735351562,
"step": 77900
},
{
"epoch": 0.8094057094233503,
"grad_norm": 2.314183473587036,
"learning_rate": 0.0002190604667572924,
"loss": 3.988243408203125,
"step": 78000
},
{
"epoch": 0.8104434090508161,
"grad_norm": 2.661289691925049,
"learning_rate": 0.0002189566967945458,
"loss": 3.9358248901367188,
"step": 78100
},
{
"epoch": 0.8114811086782819,
"grad_norm": 5.065707206726074,
"learning_rate": 0.00021885292683179925,
"loss": 3.7886788940429685,
"step": 78200
},
{
"epoch": 0.8125188083057479,
"grad_norm": 5.173181056976318,
"learning_rate": 0.00021874915686905268,
"loss": 3.790332946777344,
"step": 78300
},
{
"epoch": 0.8135565079332137,
"grad_norm": 2.573274850845337,
"learning_rate": 0.00021864538690630607,
"loss": 3.975767822265625,
"step": 78400
},
{
"epoch": 0.8145942075606795,
"grad_norm": 3.010472536087036,
"learning_rate": 0.0002185416169435595,
"loss": 3.861507568359375,
"step": 78500
},
{
"epoch": 0.8156319071881453,
"grad_norm": 2.632009983062744,
"learning_rate": 0.00021843784698081291,
"loss": 3.9550189208984374,
"step": 78600
},
{
"epoch": 0.8166696068156112,
"grad_norm": 5.590510368347168,
"learning_rate": 0.00021833407701806634,
"loss": 3.924696044921875,
"step": 78700
},
{
"epoch": 0.817707306443077,
"grad_norm": 4.052700042724609,
"learning_rate": 0.00021823030705531976,
"loss": 3.831592712402344,
"step": 78800
},
{
"epoch": 0.8187450060705428,
"grad_norm": 2.7363622188568115,
"learning_rate": 0.00021812653709257315,
"loss": 4.028314208984375,
"step": 78900
},
{
"epoch": 0.8197827056980087,
"grad_norm": 4.773056507110596,
"learning_rate": 0.0002180227671298266,
"loss": 3.7855130004882813,
"step": 79000
},
{
"epoch": 0.8208204053254745,
"grad_norm": 2.6858768463134766,
"learning_rate": 0.00021791899716708,
"loss": 3.9081768798828125,
"step": 79100
},
{
"epoch": 0.8218581049529403,
"grad_norm": 4.861189842224121,
"learning_rate": 0.00021781522720433342,
"loss": 3.99755126953125,
"step": 79200
},
{
"epoch": 0.8228958045804061,
"grad_norm": 2.1088833808898926,
"learning_rate": 0.00021771145724158681,
"loss": 3.8839871215820314,
"step": 79300
},
{
"epoch": 0.823933504207872,
"grad_norm": 2.911973237991333,
"learning_rate": 0.00021760768727884024,
"loss": 3.864557189941406,
"step": 79400
},
{
"epoch": 0.8249712038353378,
"grad_norm": 6.847414016723633,
"learning_rate": 0.00021750391731609368,
"loss": 3.868388366699219,
"step": 79500
},
{
"epoch": 0.8260089034628036,
"grad_norm": 2.0376992225646973,
"learning_rate": 0.00021740014735334708,
"loss": 3.9390859985351563,
"step": 79600
},
{
"epoch": 0.8270466030902694,
"grad_norm": 4.972707271575928,
"learning_rate": 0.0002172963773906005,
"loss": 3.8582077026367188,
"step": 79700
},
{
"epoch": 0.8280843027177354,
"grad_norm": 7.205460071563721,
"learning_rate": 0.0002171926074278539,
"loss": 3.839405212402344,
"step": 79800
},
{
"epoch": 0.8291220023452012,
"grad_norm": 12.633910179138184,
"learning_rate": 0.00021708883746510735,
"loss": 3.831856384277344,
"step": 79900
},
{
"epoch": 0.830159701972667,
"grad_norm": 4.479480743408203,
"learning_rate": 0.00021698506750236074,
"loss": 3.795959167480469,
"step": 80000
},
{
"epoch": 0.8311974016001328,
"grad_norm": 4.281702995300293,
"learning_rate": 0.00021688129753961416,
"loss": 4.039653625488281,
"step": 80100
},
{
"epoch": 0.8322351012275987,
"grad_norm": 3.5497429370880127,
"learning_rate": 0.00021677752757686758,
"loss": 4.000389709472656,
"step": 80200
},
{
"epoch": 0.8332728008550645,
"grad_norm": 2.431144952774048,
"learning_rate": 0.00021667375761412098,
"loss": 3.9792193603515624,
"step": 80300
},
{
"epoch": 0.8343105004825303,
"grad_norm": 13.734992980957031,
"learning_rate": 0.00021656998765137443,
"loss": 3.8038821411132813,
"step": 80400
},
{
"epoch": 0.8353482001099961,
"grad_norm": 1.6895164251327515,
"learning_rate": 0.00021646621768862782,
"loss": 3.7827383422851564,
"step": 80500
},
{
"epoch": 0.836385899737462,
"grad_norm": 3.4907968044281006,
"learning_rate": 0.00021636244772588124,
"loss": 3.882090759277344,
"step": 80600
},
{
"epoch": 0.8374235993649278,
"grad_norm": 2.345144510269165,
"learning_rate": 0.0002162586777631347,
"loss": 3.8400167846679687,
"step": 80700
},
{
"epoch": 0.8384612989923936,
"grad_norm": 3.4369494915008545,
"learning_rate": 0.0002161549078003881,
"loss": 3.7776190185546876,
"step": 80800
},
{
"epoch": 0.8394989986198595,
"grad_norm": 5.47845983505249,
"learning_rate": 0.0002160511378376415,
"loss": 4.044588623046875,
"step": 80900
},
{
"epoch": 0.8405366982473254,
"grad_norm": 1.5931683778762817,
"learning_rate": 0.0002159473678748949,
"loss": 3.9998703002929688,
"step": 81000
},
{
"epoch": 0.8415743978747912,
"grad_norm": 3.1940066814422607,
"learning_rate": 0.00021584359791214833,
"loss": 3.8839016723632813,
"step": 81100
},
{
"epoch": 0.842612097502257,
"grad_norm": 9.511052131652832,
"learning_rate": 0.00021573982794940172,
"loss": 3.9398565673828125,
"step": 81200
},
{
"epoch": 0.8436497971297229,
"grad_norm": 1.9886616468429565,
"learning_rate": 0.00021563605798665517,
"loss": 3.82979736328125,
"step": 81300
},
{
"epoch": 0.8446874967571887,
"grad_norm": 2.362103223800659,
"learning_rate": 0.0002155322880239086,
"loss": 3.8248995971679687,
"step": 81400
},
{
"epoch": 0.8457251963846545,
"grad_norm": 1.7605165243148804,
"learning_rate": 0.000215428518061162,
"loss": 3.7230010986328126,
"step": 81500
},
{
"epoch": 0.8467628960121203,
"grad_norm": 1.8303929567337036,
"learning_rate": 0.0002153247480984154,
"loss": 3.861679992675781,
"step": 81600
},
{
"epoch": 0.8478005956395862,
"grad_norm": 4.539703845977783,
"learning_rate": 0.00021522097813566883,
"loss": 3.8151321411132812,
"step": 81700
},
{
"epoch": 0.848838295267052,
"grad_norm": 1.8927255868911743,
"learning_rate": 0.00021511720817292225,
"loss": 3.999220886230469,
"step": 81800
},
{
"epoch": 0.8498759948945178,
"grad_norm": 3.66632080078125,
"learning_rate": 0.00021501343821017565,
"loss": 3.9149603271484374,
"step": 81900
},
{
"epoch": 0.8509136945219836,
"grad_norm": 6.1261887550354,
"learning_rate": 0.00021490966824742907,
"loss": 3.808494873046875,
"step": 82000
},
{
"epoch": 0.8519513941494495,
"grad_norm": 2.9073901176452637,
"learning_rate": 0.00021480589828468252,
"loss": 3.8501129150390625,
"step": 82100
},
{
"epoch": 0.8529890937769153,
"grad_norm": 1.9176596403121948,
"learning_rate": 0.00021470212832193591,
"loss": 3.9358505249023437,
"step": 82200
},
{
"epoch": 0.8540267934043811,
"grad_norm": 2.3072047233581543,
"learning_rate": 0.00021459835835918934,
"loss": 3.8934945678710937,
"step": 82300
},
{
"epoch": 0.855064493031847,
"grad_norm": 2.7599945068359375,
"learning_rate": 0.00021449458839644273,
"loss": 3.929814453125,
"step": 82400
},
{
"epoch": 0.8561021926593129,
"grad_norm": 2.0721237659454346,
"learning_rate": 0.00021439081843369615,
"loss": 3.86040283203125,
"step": 82500
},
{
"epoch": 0.8571398922867787,
"grad_norm": 5.156016826629639,
"learning_rate": 0.0002142870484709496,
"loss": 3.8864166259765627,
"step": 82600
},
{
"epoch": 0.8581775919142445,
"grad_norm": 4.168294906616211,
"learning_rate": 0.000214183278508203,
"loss": 4.001069030761719,
"step": 82700
},
{
"epoch": 0.8592152915417104,
"grad_norm": 1.7126719951629639,
"learning_rate": 0.00021407950854545642,
"loss": 3.946321716308594,
"step": 82800
},
{
"epoch": 0.8602529911691762,
"grad_norm": 5.809075355529785,
"learning_rate": 0.0002139757385827098,
"loss": 3.82521240234375,
"step": 82900
},
{
"epoch": 0.861290690796642,
"grad_norm": 5.8849921226501465,
"learning_rate": 0.00021387196861996326,
"loss": 3.7766848754882814,
"step": 83000
},
{
"epoch": 0.8623283904241078,
"grad_norm": 2.317793607711792,
"learning_rate": 0.00021376819865721666,
"loss": 4.01570068359375,
"step": 83100
},
{
"epoch": 0.8633660900515737,
"grad_norm": 19.14999008178711,
"learning_rate": 0.00021366442869447008,
"loss": 3.760934143066406,
"step": 83200
},
{
"epoch": 0.8644037896790395,
"grad_norm": 2.025818109512329,
"learning_rate": 0.0002135606587317235,
"loss": 3.9255300903320314,
"step": 83300
},
{
"epoch": 0.8654414893065053,
"grad_norm": 3.068112373352051,
"learning_rate": 0.0002134568887689769,
"loss": 3.821394348144531,
"step": 83400
},
{
"epoch": 0.8664791889339711,
"grad_norm": 8.730904579162598,
"learning_rate": 0.00021335311880623034,
"loss": 3.8662478637695314,
"step": 83500
},
{
"epoch": 0.867516888561437,
"grad_norm": 2.9956910610198975,
"learning_rate": 0.00021324934884348374,
"loss": 3.8266961669921873,
"step": 83600
},
{
"epoch": 0.8685545881889029,
"grad_norm": 2.774705410003662,
"learning_rate": 0.00021314557888073716,
"loss": 3.8334832763671876,
"step": 83700
},
{
"epoch": 0.8695922878163687,
"grad_norm": 1.9926444292068481,
"learning_rate": 0.00021304180891799056,
"loss": 3.973898620605469,
"step": 83800
},
{
"epoch": 0.8706299874438345,
"grad_norm": 1.8433290719985962,
"learning_rate": 0.000212938038955244,
"loss": 3.8273077392578125,
"step": 83900
},
{
"epoch": 0.8716676870713004,
"grad_norm": 5.3389410972595215,
"learning_rate": 0.00021283426899249743,
"loss": 3.8604061889648436,
"step": 84000
},
{
"epoch": 0.8727053866987662,
"grad_norm": 7.391428470611572,
"learning_rate": 0.00021273049902975082,
"loss": 3.8056671142578127,
"step": 84100
},
{
"epoch": 0.873743086326232,
"grad_norm": 5.367404937744141,
"learning_rate": 0.00021262672906700424,
"loss": 3.8744406127929687,
"step": 84200
},
{
"epoch": 0.8747807859536978,
"grad_norm": 3.1199004650115967,
"learning_rate": 0.00021252295910425764,
"loss": 3.8992080688476562,
"step": 84300
},
{
"epoch": 0.8758184855811637,
"grad_norm": 1.8603098392486572,
"learning_rate": 0.0002124191891415111,
"loss": 3.8311639404296876,
"step": 84400
},
{
"epoch": 0.8768561852086295,
"grad_norm": 2.5739691257476807,
"learning_rate": 0.0002123154191787645,
"loss": 3.754921569824219,
"step": 84500
},
{
"epoch": 0.8778938848360953,
"grad_norm": 3.090057134628296,
"learning_rate": 0.0002122116492160179,
"loss": 3.74908935546875,
"step": 84600
},
{
"epoch": 0.8789315844635612,
"grad_norm": 9.258840560913086,
"learning_rate": 0.00021210787925327133,
"loss": 3.985562744140625,
"step": 84700
},
{
"epoch": 0.879969284091027,
"grad_norm": 3.738255262374878,
"learning_rate": 0.00021200410929052475,
"loss": 3.9656732177734373,
"step": 84800
},
{
"epoch": 0.8810069837184928,
"grad_norm": 3.415017604827881,
"learning_rate": 0.00021190033932777817,
"loss": 3.958587341308594,
"step": 84900
},
{
"epoch": 0.8820446833459586,
"grad_norm": 6.633699893951416,
"learning_rate": 0.00021179656936503157,
"loss": 3.866285705566406,
"step": 85000
},
{
"epoch": 0.8830823829734246,
"grad_norm": 1.7935473918914795,
"learning_rate": 0.000211692799402285,
"loss": 3.9740695190429687,
"step": 85100
},
{
"epoch": 0.8841200826008904,
"grad_norm": 2.706197500228882,
"learning_rate": 0.00021158902943953844,
"loss": 3.8669891357421875,
"step": 85200
},
{
"epoch": 0.8851577822283562,
"grad_norm": 4.353029727935791,
"learning_rate": 0.00021148525947679183,
"loss": 3.881668701171875,
"step": 85300
},
{
"epoch": 0.886195481855822,
"grad_norm": 3.0080366134643555,
"learning_rate": 0.00021138148951404525,
"loss": 3.8229278564453124,
"step": 85400
},
{
"epoch": 0.8872331814832879,
"grad_norm": 7.4073028564453125,
"learning_rate": 0.00021127771955129865,
"loss": 4.015174560546875,
"step": 85500
},
{
"epoch": 0.8882708811107537,
"grad_norm": 4.174534320831299,
"learning_rate": 0.00021117394958855207,
"loss": 3.8184585571289062,
"step": 85600
},
{
"epoch": 0.8893085807382195,
"grad_norm": 5.683806896209717,
"learning_rate": 0.0002110701796258055,
"loss": 3.8243557739257814,
"step": 85700
},
{
"epoch": 0.8903462803656853,
"grad_norm": 2.076599597930908,
"learning_rate": 0.00021096640966305891,
"loss": 3.71376220703125,
"step": 85800
},
{
"epoch": 0.8913839799931512,
"grad_norm": 2.4622974395751953,
"learning_rate": 0.00021086263970031234,
"loss": 3.85018310546875,
"step": 85900
},
{
"epoch": 0.892421679620617,
"grad_norm": 2.3247082233428955,
"learning_rate": 0.00021075886973756573,
"loss": 3.9427032470703125,
"step": 86000
},
{
"epoch": 0.8934593792480828,
"grad_norm": 5.115243911743164,
"learning_rate": 0.00021065509977481918,
"loss": 3.6884475708007813,
"step": 86100
},
{
"epoch": 0.8944970788755486,
"grad_norm": 5.306711196899414,
"learning_rate": 0.00021055132981207257,
"loss": 3.8416738891601563,
"step": 86200
},
{
"epoch": 0.8955347785030146,
"grad_norm": 1.5796631574630737,
"learning_rate": 0.000210447559849326,
"loss": 3.874592590332031,
"step": 86300
},
{
"epoch": 0.8965724781304804,
"grad_norm": 1.6183887720108032,
"learning_rate": 0.00021034378988657942,
"loss": 3.840068054199219,
"step": 86400
},
{
"epoch": 0.8976101777579462,
"grad_norm": 3.1412158012390137,
"learning_rate": 0.0002102400199238328,
"loss": 4.0432958984375,
"step": 86500
},
{
"epoch": 0.898647877385412,
"grad_norm": 1.6547956466674805,
"learning_rate": 0.00021013624996108626,
"loss": 3.829620361328125,
"step": 86600
},
{
"epoch": 0.8996855770128779,
"grad_norm": 9.84925365447998,
"learning_rate": 0.00021003247999833966,
"loss": 3.74409912109375,
"step": 86700
},
{
"epoch": 0.9007232766403437,
"grad_norm": 4.718574523925781,
"learning_rate": 0.00020992871003559308,
"loss": 3.8265109252929688,
"step": 86800
},
{
"epoch": 0.9017609762678095,
"grad_norm": 4.692354679107666,
"learning_rate": 0.00020982494007284647,
"loss": 3.9203875732421873,
"step": 86900
},
{
"epoch": 0.9027986758952754,
"grad_norm": 3.620683431625366,
"learning_rate": 0.00020972117011009992,
"loss": 3.9122955322265627,
"step": 87000
},
{
"epoch": 0.9038363755227412,
"grad_norm": 4.431119918823242,
"learning_rate": 0.00020961740014735334,
"loss": 3.9402545166015623,
"step": 87100
},
{
"epoch": 0.904874075150207,
"grad_norm": 3.734344005584717,
"learning_rate": 0.00020951363018460674,
"loss": 3.8481884765625,
"step": 87200
},
{
"epoch": 0.9059117747776728,
"grad_norm": 3.735985279083252,
"learning_rate": 0.00020940986022186016,
"loss": 3.8412353515625,
"step": 87300
},
{
"epoch": 0.9069494744051387,
"grad_norm": 2.774721145629883,
"learning_rate": 0.00020930609025911356,
"loss": 3.76121337890625,
"step": 87400
},
{
"epoch": 0.9079871740326045,
"grad_norm": 13.096595764160156,
"learning_rate": 0.000209202320296367,
"loss": 3.9009844970703127,
"step": 87500
},
{
"epoch": 0.9090248736600703,
"grad_norm": 5.561835765838623,
"learning_rate": 0.0002090985503336204,
"loss": 3.7489013671875,
"step": 87600
},
{
"epoch": 0.9100625732875361,
"grad_norm": 5.21470832824707,
"learning_rate": 0.00020899478037087382,
"loss": 3.9491476440429687,
"step": 87700
},
{
"epoch": 0.9111002729150021,
"grad_norm": 3.611980438232422,
"learning_rate": 0.00020889101040812724,
"loss": 3.8744741821289064,
"step": 87800
},
{
"epoch": 0.9121379725424679,
"grad_norm": 3.670480489730835,
"learning_rate": 0.00020878724044538067,
"loss": 3.8484326171875,
"step": 87900
},
{
"epoch": 0.9131756721699337,
"grad_norm": 2.46195387840271,
"learning_rate": 0.0002086834704826341,
"loss": 3.8545870971679688,
"step": 88000
},
{
"epoch": 0.9142133717973995,
"grad_norm": 2.256782054901123,
"learning_rate": 0.00020857970051988748,
"loss": 3.788062744140625,
"step": 88100
},
{
"epoch": 0.9152510714248654,
"grad_norm": 1.5597251653671265,
"learning_rate": 0.0002084759305571409,
"loss": 3.8967153930664065,
"step": 88200
},
{
"epoch": 0.9162887710523312,
"grad_norm": 4.607747554779053,
"learning_rate": 0.00020837216059439435,
"loss": 3.84433837890625,
"step": 88300
},
{
"epoch": 0.917326470679797,
"grad_norm": 2.7213637828826904,
"learning_rate": 0.00020826839063164775,
"loss": 3.6432476806640626,
"step": 88400
},
{
"epoch": 0.9183641703072628,
"grad_norm": 1.6943309307098389,
"learning_rate": 0.00020816462066890117,
"loss": 3.942064208984375,
"step": 88500
},
{
"epoch": 0.9194018699347287,
"grad_norm": 1.9761497974395752,
"learning_rate": 0.00020806085070615457,
"loss": 3.757283020019531,
"step": 88600
},
{
"epoch": 0.9204395695621945,
"grad_norm": 2.720459461212158,
"learning_rate": 0.000207957080743408,
"loss": 3.723210754394531,
"step": 88700
},
{
"epoch": 0.9214772691896603,
"grad_norm": 2.986565589904785,
"learning_rate": 0.0002078533107806614,
"loss": 3.9739913940429688,
"step": 88800
},
{
"epoch": 0.9225149688171262,
"grad_norm": 2.682279348373413,
"learning_rate": 0.00020774954081791483,
"loss": 3.706415100097656,
"step": 88900
},
{
"epoch": 0.923552668444592,
"grad_norm": 14.281532287597656,
"learning_rate": 0.00020764577085516825,
"loss": 3.799072570800781,
"step": 89000
},
{
"epoch": 0.9245903680720579,
"grad_norm": 3.1239538192749023,
"learning_rate": 0.00020754200089242165,
"loss": 3.8822201538085936,
"step": 89100
},
{
"epoch": 0.9256280676995237,
"grad_norm": 7.4986252784729,
"learning_rate": 0.0002074382309296751,
"loss": 3.852564392089844,
"step": 89200
},
{
"epoch": 0.9266657673269896,
"grad_norm": 4.3345441818237305,
"learning_rate": 0.0002073344609669285,
"loss": 3.890749206542969,
"step": 89300
},
{
"epoch": 0.9277034669544554,
"grad_norm": 2.6886496543884277,
"learning_rate": 0.0002072306910041819,
"loss": 3.8261907958984374,
"step": 89400
},
{
"epoch": 0.9287411665819212,
"grad_norm": 2.2986016273498535,
"learning_rate": 0.0002071269210414353,
"loss": 3.8075076293945314,
"step": 89500
},
{
"epoch": 0.929778866209387,
"grad_norm": 11.309110641479492,
"learning_rate": 0.00020702315107868873,
"loss": 3.829825744628906,
"step": 89600
},
{
"epoch": 0.9308165658368529,
"grad_norm": 2.784146308898926,
"learning_rate": 0.00020691938111594218,
"loss": 3.7934060668945313,
"step": 89700
},
{
"epoch": 0.9318542654643187,
"grad_norm": 2.3935048580169678,
"learning_rate": 0.00020681561115319557,
"loss": 3.882371826171875,
"step": 89800
},
{
"epoch": 0.9328919650917845,
"grad_norm": 3.6735377311706543,
"learning_rate": 0.000206711841190449,
"loss": 3.842451171875,
"step": 89900
},
{
"epoch": 0.9339296647192503,
"grad_norm": 3.037416696548462,
"learning_rate": 0.0002066080712277024,
"loss": 3.9087152099609375,
"step": 90000
},
{
"epoch": 0.9349673643467162,
"grad_norm": 9.315804481506348,
"learning_rate": 0.00020650430126495584,
"loss": 3.773963623046875,
"step": 90100
},
{
"epoch": 0.936005063974182,
"grad_norm": 5.039952278137207,
"learning_rate": 0.00020640053130220926,
"loss": 3.7935626220703127,
"step": 90200
},
{
"epoch": 0.9370427636016478,
"grad_norm": 5.707028388977051,
"learning_rate": 0.00020629676133946266,
"loss": 3.775277404785156,
"step": 90300
},
{
"epoch": 0.9380804632291136,
"grad_norm": 3.8109843730926514,
"learning_rate": 0.00020619299137671608,
"loss": 3.779449462890625,
"step": 90400
},
{
"epoch": 0.9391181628565796,
"grad_norm": 2.9235146045684814,
"learning_rate": 0.00020608922141396947,
"loss": 3.8383111572265625,
"step": 90500
},
{
"epoch": 0.9401558624840454,
"grad_norm": 1.6856282949447632,
"learning_rate": 0.00020598545145122292,
"loss": 3.8841232299804687,
"step": 90600
},
{
"epoch": 0.9411935621115112,
"grad_norm": 7.263090133666992,
"learning_rate": 0.00020588168148847632,
"loss": 3.9575741577148436,
"step": 90700
},
{
"epoch": 0.9422312617389771,
"grad_norm": 3.6679883003234863,
"learning_rate": 0.00020577791152572974,
"loss": 3.81220947265625,
"step": 90800
},
{
"epoch": 0.9432689613664429,
"grad_norm": 5.708615303039551,
"learning_rate": 0.0002056741415629832,
"loss": 3.807239685058594,
"step": 90900
},
{
"epoch": 0.9443066609939087,
"grad_norm": 4.463714122772217,
"learning_rate": 0.00020557037160023658,
"loss": 3.841280517578125,
"step": 91000
},
{
"epoch": 0.9453443606213745,
"grad_norm": 10.150075912475586,
"learning_rate": 0.00020546660163749,
"loss": 3.75313232421875,
"step": 91100
},
{
"epoch": 0.9463820602488404,
"grad_norm": 11.987652778625488,
"learning_rate": 0.0002053628316747434,
"loss": 3.903273620605469,
"step": 91200
},
{
"epoch": 0.9474197598763062,
"grad_norm": 4.522410869598389,
"learning_rate": 0.00020525906171199682,
"loss": 3.760314636230469,
"step": 91300
},
{
"epoch": 0.948457459503772,
"grad_norm": 4.449744701385498,
"learning_rate": 0.00020515529174925022,
"loss": 3.685667724609375,
"step": 91400
},
{
"epoch": 0.9494951591312378,
"grad_norm": 1.8593145608901978,
"learning_rate": 0.00020505152178650367,
"loss": 3.7343402099609375,
"step": 91500
},
{
"epoch": 0.9505328587587037,
"grad_norm": 2.4731132984161377,
"learning_rate": 0.0002049477518237571,
"loss": 3.783785705566406,
"step": 91600
},
{
"epoch": 0.9515705583861696,
"grad_norm": 1.820862889289856,
"learning_rate": 0.00020484398186101048,
"loss": 3.719476318359375,
"step": 91700
},
{
"epoch": 0.9526082580136354,
"grad_norm": 2.214238166809082,
"learning_rate": 0.0002047402118982639,
"loss": 3.7817031860351564,
"step": 91800
},
{
"epoch": 0.9536459576411012,
"grad_norm": 3.6466450691223145,
"learning_rate": 0.00020463644193551733,
"loss": 3.7672024536132813,
"step": 91900
},
{
"epoch": 0.9546836572685671,
"grad_norm": 5.454410076141357,
"learning_rate": 0.00020453267197277075,
"loss": 3.77567626953125,
"step": 92000
},
{
"epoch": 0.9557213568960329,
"grad_norm": 20.138710021972656,
"learning_rate": 0.00020442890201002417,
"loss": 3.7506854248046877,
"step": 92100
},
{
"epoch": 0.9567590565234987,
"grad_norm": 2.0090079307556152,
"learning_rate": 0.00020432513204727756,
"loss": 3.8082257080078126,
"step": 92200
},
{
"epoch": 0.9577967561509645,
"grad_norm": 2.6881604194641113,
"learning_rate": 0.00020422136208453101,
"loss": 4.051754150390625,
"step": 92300
},
{
"epoch": 0.9588344557784304,
"grad_norm": 3.293210029602051,
"learning_rate": 0.0002041175921217844,
"loss": 3.702369384765625,
"step": 92400
},
{
"epoch": 0.9598721554058962,
"grad_norm": 5.354658126831055,
"learning_rate": 0.00020401382215903783,
"loss": 3.8296829223632813,
"step": 92500
},
{
"epoch": 0.960909855033362,
"grad_norm": 2.285318374633789,
"learning_rate": 0.00020391005219629123,
"loss": 3.8205487060546877,
"step": 92600
},
{
"epoch": 0.9619475546608279,
"grad_norm": 3.3139116764068604,
"learning_rate": 0.00020380628223354465,
"loss": 3.9517453002929686,
"step": 92700
},
{
"epoch": 0.9629852542882937,
"grad_norm": 4.242766380310059,
"learning_rate": 0.0002037025122707981,
"loss": 3.819052429199219,
"step": 92800
},
{
"epoch": 0.9640229539157595,
"grad_norm": 11.361218452453613,
"learning_rate": 0.0002035987423080515,
"loss": 3.8673443603515625,
"step": 92900
},
{
"epoch": 0.9650606535432253,
"grad_norm": 1.6263092756271362,
"learning_rate": 0.0002034949723453049,
"loss": 3.6743267822265624,
"step": 93000
},
{
"epoch": 0.9660983531706913,
"grad_norm": 3.191160202026367,
"learning_rate": 0.0002033912023825583,
"loss": 3.85127685546875,
"step": 93100
},
{
"epoch": 0.9671360527981571,
"grad_norm": 14.219719886779785,
"learning_rate": 0.00020328743241981176,
"loss": 3.8775042724609374,
"step": 93200
},
{
"epoch": 0.9681737524256229,
"grad_norm": 2.592212200164795,
"learning_rate": 0.00020318366245706515,
"loss": 3.784809265136719,
"step": 93300
},
{
"epoch": 0.9692114520530887,
"grad_norm": 2.058199644088745,
"learning_rate": 0.00020307989249431857,
"loss": 3.7654934692382813,
"step": 93400
},
{
"epoch": 0.9702491516805546,
"grad_norm": 3.3060290813446045,
"learning_rate": 0.000202976122531572,
"loss": 3.78427734375,
"step": 93500
},
{
"epoch": 0.9712868513080204,
"grad_norm": 5.642673492431641,
"learning_rate": 0.0002028723525688254,
"loss": 3.768431396484375,
"step": 93600
},
{
"epoch": 0.9723245509354862,
"grad_norm": 2.416527271270752,
"learning_rate": 0.00020276858260607884,
"loss": 3.9477734375,
"step": 93700
},
{
"epoch": 0.973362250562952,
"grad_norm": 6.023645877838135,
"learning_rate": 0.00020266481264333223,
"loss": 3.8290167236328125,
"step": 93800
},
{
"epoch": 0.9743999501904179,
"grad_norm": 3.252999782562256,
"learning_rate": 0.00020256104268058566,
"loss": 3.959106750488281,
"step": 93900
},
{
"epoch": 0.9754376498178837,
"grad_norm": 2.065927743911743,
"learning_rate": 0.0002024572727178391,
"loss": 3.868408508300781,
"step": 94000
},
{
"epoch": 0.9764753494453495,
"grad_norm": 3.3688645362854004,
"learning_rate": 0.0002023535027550925,
"loss": 3.91245361328125,
"step": 94100
},
{
"epoch": 0.9775130490728153,
"grad_norm": 3.004783868789673,
"learning_rate": 0.00020224973279234592,
"loss": 3.7105670166015625,
"step": 94200
},
{
"epoch": 0.9785507487002812,
"grad_norm": 2.6519381999969482,
"learning_rate": 0.00020214596282959932,
"loss": 3.8060031127929688,
"step": 94300
},
{
"epoch": 0.979588448327747,
"grad_norm": 2.3849129676818848,
"learning_rate": 0.00020204219286685274,
"loss": 3.7225299072265625,
"step": 94400
},
{
"epoch": 0.9806261479552129,
"grad_norm": 2.5238912105560303,
"learning_rate": 0.00020193842290410613,
"loss": 3.6197088623046874,
"step": 94500
},
{
"epoch": 0.9816638475826788,
"grad_norm": 7.388523101806641,
"learning_rate": 0.00020183465294135958,
"loss": 3.6996939086914065,
"step": 94600
},
{
"epoch": 0.9827015472101446,
"grad_norm": 10.3375883102417,
"learning_rate": 0.000201730882978613,
"loss": 3.7547808837890626,
"step": 94700
},
{
"epoch": 0.9837392468376104,
"grad_norm": 2.251610040664673,
"learning_rate": 0.0002016271130158664,
"loss": 3.794500732421875,
"step": 94800
},
{
"epoch": 0.9847769464650762,
"grad_norm": 3.8766162395477295,
"learning_rate": 0.00020152334305311982,
"loss": 3.7538128662109376,
"step": 94900
},
{
"epoch": 0.9858146460925421,
"grad_norm": 2.7171695232391357,
"learning_rate": 0.00020141957309037324,
"loss": 3.7826458740234377,
"step": 95000
},
{
"epoch": 0.9868523457200079,
"grad_norm": 3.8345425128936768,
"learning_rate": 0.00020131580312762667,
"loss": 3.8197344970703124,
"step": 95100
},
{
"epoch": 0.9878900453474737,
"grad_norm": 5.732568740844727,
"learning_rate": 0.00020121203316488006,
"loss": 3.84238525390625,
"step": 95200
},
{
"epoch": 0.9889277449749395,
"grad_norm": 2.933835744857788,
"learning_rate": 0.00020110826320213348,
"loss": 3.8682632446289062,
"step": 95300
},
{
"epoch": 0.9899654446024054,
"grad_norm": 6.234426021575928,
"learning_rate": 0.00020100449323938693,
"loss": 3.7140426635742188,
"step": 95400
},
{
"epoch": 0.9910031442298712,
"grad_norm": 3.3652026653289795,
"learning_rate": 0.00020090072327664033,
"loss": 3.7597830200195315,
"step": 95500
},
{
"epoch": 0.992040843857337,
"grad_norm": 3.030595541000366,
"learning_rate": 0.00020079695331389375,
"loss": 3.824953308105469,
"step": 95600
},
{
"epoch": 0.9930785434848028,
"grad_norm": 2.6781022548675537,
"learning_rate": 0.00020069318335114714,
"loss": 3.71589599609375,
"step": 95700
},
{
"epoch": 0.9941162431122688,
"grad_norm": 6.144374370574951,
"learning_rate": 0.00020058941338840056,
"loss": 3.856881408691406,
"step": 95800
},
{
"epoch": 0.9951539427397346,
"grad_norm": 11.093416213989258,
"learning_rate": 0.000200485643425654,
"loss": 3.8529815673828125,
"step": 95900
},
{
"epoch": 0.9961916423672004,
"grad_norm": 3.1640384197235107,
"learning_rate": 0.0002003818734629074,
"loss": 3.966211853027344,
"step": 96000
},
{
"epoch": 0.9972293419946662,
"grad_norm": 4.370779037475586,
"learning_rate": 0.00020027810350016083,
"loss": 3.7798886108398437,
"step": 96100
},
{
"epoch": 0.9982670416221321,
"grad_norm": 3.453723669052124,
"learning_rate": 0.00020017433353741422,
"loss": 3.8633013916015626,
"step": 96200
},
{
"epoch": 0.9993047412495979,
"grad_norm": 2.1785902976989746,
"learning_rate": 0.00020007056357466767,
"loss": 3.7897879028320314,
"step": 96300
},
{
"epoch": 1.0003424408770638,
"grad_norm": 7.7243971824646,
"learning_rate": 0.00019996679361192107,
"loss": 3.999345397949219,
"step": 96400
},
{
"epoch": 1.0013801405045295,
"grad_norm": 4.7181925773620605,
"learning_rate": 0.0001998630236491745,
"loss": 3.6450360107421873,
"step": 96500
},
{
"epoch": 1.0024178401319954,
"grad_norm": 5.74350643157959,
"learning_rate": 0.0001997592536864279,
"loss": 3.742356872558594,
"step": 96600
},
{
"epoch": 1.0034555397594613,
"grad_norm": 4.781228065490723,
"learning_rate": 0.0001996554837236813,
"loss": 3.88675048828125,
"step": 96700
},
{
"epoch": 1.004493239386927,
"grad_norm": 3.398968458175659,
"learning_rate": 0.00019955171376093476,
"loss": 3.604486083984375,
"step": 96800
},
{
"epoch": 1.005530939014393,
"grad_norm": 2.33478045463562,
"learning_rate": 0.00019944794379818815,
"loss": 3.6777334594726563,
"step": 96900
},
{
"epoch": 1.0065686386418586,
"grad_norm": 5.443575382232666,
"learning_rate": 0.00019934417383544157,
"loss": 3.71547119140625,
"step": 97000
},
{
"epoch": 1.0076063382693246,
"grad_norm": 9.512263298034668,
"learning_rate": 0.00019924040387269497,
"loss": 3.7301199340820315,
"step": 97100
},
{
"epoch": 1.0086440378967905,
"grad_norm": 7.4802985191345215,
"learning_rate": 0.00019913663390994842,
"loss": 3.924736328125,
"step": 97200
},
{
"epoch": 1.0096817375242562,
"grad_norm": 3.0878612995147705,
"learning_rate": 0.00019903286394720184,
"loss": 3.802860107421875,
"step": 97300
},
{
"epoch": 1.010719437151722,
"grad_norm": 3.557770252227783,
"learning_rate": 0.00019892909398445523,
"loss": 3.782970275878906,
"step": 97400
},
{
"epoch": 1.011757136779188,
"grad_norm": 4.309437274932861,
"learning_rate": 0.00019882532402170866,
"loss": 3.7818194580078126,
"step": 97500
},
{
"epoch": 1.0127948364066537,
"grad_norm": 9.057745933532715,
"learning_rate": 0.00019872155405896205,
"loss": 3.807467041015625,
"step": 97600
},
{
"epoch": 1.0138325360341196,
"grad_norm": 3.3481385707855225,
"learning_rate": 0.0001986177840962155,
"loss": 3.7055014038085936,
"step": 97700
},
{
"epoch": 1.0148702356615853,
"grad_norm": 5.001105308532715,
"learning_rate": 0.00019851401413346892,
"loss": 3.803979797363281,
"step": 97800
},
{
"epoch": 1.0159079352890512,
"grad_norm": 2.7995588779449463,
"learning_rate": 0.00019841024417072232,
"loss": 3.784454650878906,
"step": 97900
},
{
"epoch": 1.0169456349165171,
"grad_norm": 2.4021806716918945,
"learning_rate": 0.00019830647420797574,
"loss": 3.8534210205078123,
"step": 98000
},
{
"epoch": 1.0179833345439828,
"grad_norm": 2.6125597953796387,
"learning_rate": 0.00019820270424522916,
"loss": 3.6783572387695314,
"step": 98100
},
{
"epoch": 1.0190210341714487,
"grad_norm": 12.870917320251465,
"learning_rate": 0.00019809893428248258,
"loss": 3.833390808105469,
"step": 98200
},
{
"epoch": 1.0200587337989147,
"grad_norm": 5.185585021972656,
"learning_rate": 0.00019799516431973598,
"loss": 3.7223880004882814,
"step": 98300
},
{
"epoch": 1.0210964334263803,
"grad_norm": 1.9634087085723877,
"learning_rate": 0.0001978913943569894,
"loss": 3.6614044189453123,
"step": 98400
},
{
"epoch": 1.0221341330538463,
"grad_norm": 5.82041072845459,
"learning_rate": 0.00019778762439424285,
"loss": 3.729730224609375,
"step": 98500
},
{
"epoch": 1.0231718326813122,
"grad_norm": 5.905141353607178,
"learning_rate": 0.00019768385443149624,
"loss": 3.8260488891601563,
"step": 98600
},
{
"epoch": 1.0242095323087779,
"grad_norm": 3.5444912910461426,
"learning_rate": 0.00019758008446874966,
"loss": 3.687132568359375,
"step": 98700
},
{
"epoch": 1.0252472319362438,
"grad_norm": 7.397883892059326,
"learning_rate": 0.00019747631450600306,
"loss": 3.815035400390625,
"step": 98800
},
{
"epoch": 1.0262849315637095,
"grad_norm": 4.467862129211426,
"learning_rate": 0.00019737254454325648,
"loss": 3.645810241699219,
"step": 98900
},
{
"epoch": 1.0273226311911754,
"grad_norm": 7.824927806854248,
"learning_rate": 0.0001972687745805099,
"loss": 3.7502801513671873,
"step": 99000
},
{
"epoch": 1.0283603308186413,
"grad_norm": 9.055319786071777,
"learning_rate": 0.00019716500461776333,
"loss": 3.895949401855469,
"step": 99100
},
{
"epoch": 1.029398030446107,
"grad_norm": 2.499072313308716,
"learning_rate": 0.00019706123465501675,
"loss": 3.729786071777344,
"step": 99200
},
{
"epoch": 1.030435730073573,
"grad_norm": 2.091538667678833,
"learning_rate": 0.00019695746469227014,
"loss": 3.6661376953125,
"step": 99300
},
{
"epoch": 1.0314734297010388,
"grad_norm": 2.9895308017730713,
"learning_rate": 0.0001968536947295236,
"loss": 3.7620065307617185,
"step": 99400
},
{
"epoch": 1.0325111293285045,
"grad_norm": 3.8646888732910156,
"learning_rate": 0.00019674992476677699,
"loss": 3.8454522705078125,
"step": 99500
},
{
"epoch": 1.0335488289559704,
"grad_norm": 4.3288044929504395,
"learning_rate": 0.0001966461548040304,
"loss": 3.682370300292969,
"step": 99600
},
{
"epoch": 1.0345865285834361,
"grad_norm": 1.888063907623291,
"learning_rate": 0.00019654238484128383,
"loss": 3.7136306762695312,
"step": 99700
},
{
"epoch": 1.035624228210902,
"grad_norm": 2.9146947860717773,
"learning_rate": 0.00019643861487853722,
"loss": 3.7029214477539063,
"step": 99800
},
{
"epoch": 1.036661927838368,
"grad_norm": 3.3660199642181396,
"learning_rate": 0.00019633484491579067,
"loss": 3.669721984863281,
"step": 99900
},
{
"epoch": 1.0376996274658337,
"grad_norm": 3.8642494678497314,
"learning_rate": 0.00019623107495304407,
"loss": 3.718172302246094,
"step": 100000
},
{
"epoch": 1.0387373270932996,
"grad_norm": 19.524248123168945,
"learning_rate": 0.0001961273049902975,
"loss": 3.8097552490234374,
"step": 100100
},
{
"epoch": 1.0397750267207655,
"grad_norm": 2.175708293914795,
"learning_rate": 0.00019602353502755089,
"loss": 3.7663388061523437,
"step": 100200
},
{
"epoch": 1.0408127263482312,
"grad_norm": 2.0963635444641113,
"learning_rate": 0.00019591976506480433,
"loss": 3.7331805419921875,
"step": 100300
},
{
"epoch": 1.041850425975697,
"grad_norm": 4.1156134605407715,
"learning_rate": 0.00019581599510205776,
"loss": 3.7513092041015623,
"step": 100400
},
{
"epoch": 1.042888125603163,
"grad_norm": 1.9364126920700073,
"learning_rate": 0.00019571222513931115,
"loss": 3.7811895751953126,
"step": 100500
},
{
"epoch": 1.0439258252306287,
"grad_norm": 3.9929726123809814,
"learning_rate": 0.00019560845517656457,
"loss": 3.6916510009765626,
"step": 100600
},
{
"epoch": 1.0449635248580946,
"grad_norm": 6.161198139190674,
"learning_rate": 0.00019550468521381797,
"loss": 3.735494384765625,
"step": 100700
},
{
"epoch": 1.0460012244855603,
"grad_norm": 5.300504207611084,
"learning_rate": 0.00019540091525107142,
"loss": 3.6318603515625,
"step": 100800
},
{
"epoch": 1.0470389241130262,
"grad_norm": 6.671936988830566,
"learning_rate": 0.0001952971452883248,
"loss": 3.753620300292969,
"step": 100900
},
{
"epoch": 1.0480766237404922,
"grad_norm": 4.034755229949951,
"learning_rate": 0.00019519337532557823,
"loss": 3.6916033935546877,
"step": 101000
},
{
"epoch": 1.0491143233679578,
"grad_norm": 2.8349599838256836,
"learning_rate": 0.00019508960536283168,
"loss": 3.6846957397460938,
"step": 101100
},
{
"epoch": 1.0501520229954238,
"grad_norm": 4.222849369049072,
"learning_rate": 0.00019498583540008508,
"loss": 3.785768737792969,
"step": 101200
},
{
"epoch": 1.0511897226228897,
"grad_norm": 7.210328102111816,
"learning_rate": 0.0001948820654373385,
"loss": 3.674949035644531,
"step": 101300
},
{
"epoch": 1.0522274222503554,
"grad_norm": 4.031270503997803,
"learning_rate": 0.0001947782954745919,
"loss": 3.7858917236328127,
"step": 101400
},
{
"epoch": 1.0532651218778213,
"grad_norm": 28.53989601135254,
"learning_rate": 0.00019467452551184532,
"loss": 3.8007437133789064,
"step": 101500
},
{
"epoch": 1.054302821505287,
"grad_norm": 5.528784275054932,
"learning_rate": 0.00019457075554909877,
"loss": 3.624027099609375,
"step": 101600
},
{
"epoch": 1.055340521132753,
"grad_norm": 3.1289713382720947,
"learning_rate": 0.00019446698558635216,
"loss": 3.7536968994140625,
"step": 101700
},
{
"epoch": 1.0563782207602188,
"grad_norm": 2.9442858695983887,
"learning_rate": 0.00019436321562360558,
"loss": 3.569986572265625,
"step": 101800
},
{
"epoch": 1.0574159203876845,
"grad_norm": 4.8674726486206055,
"learning_rate": 0.00019425944566085898,
"loss": 3.8215240478515624,
"step": 101900
},
{
"epoch": 1.0584536200151504,
"grad_norm": 13.513835906982422,
"learning_rate": 0.0001941556756981124,
"loss": 3.6686697387695313,
"step": 102000
},
{
"epoch": 1.0594913196426163,
"grad_norm": 3.146784543991089,
"learning_rate": 0.00019405190573536582,
"loss": 3.643824462890625,
"step": 102100
},
{
"epoch": 1.060529019270082,
"grad_norm": 4.964068412780762,
"learning_rate": 0.00019394813577261924,
"loss": 3.748782043457031,
"step": 102200
},
{
"epoch": 1.061566718897548,
"grad_norm": 3.178044557571411,
"learning_rate": 0.00019384436580987266,
"loss": 3.7086587524414063,
"step": 102300
},
{
"epoch": 1.0626044185250136,
"grad_norm": 2.6959052085876465,
"learning_rate": 0.00019374059584712606,
"loss": 3.8190512084960937,
"step": 102400
},
{
"epoch": 1.0636421181524796,
"grad_norm": 4.595401763916016,
"learning_rate": 0.0001936368258843795,
"loss": 3.6920120239257814,
"step": 102500
},
{
"epoch": 1.0646798177799455,
"grad_norm": 3.383439064025879,
"learning_rate": 0.0001935330559216329,
"loss": 3.7616091918945314,
"step": 102600
},
{
"epoch": 1.0657175174074112,
"grad_norm": 6.921218395233154,
"learning_rate": 0.00019342928595888633,
"loss": 3.8070159912109376,
"step": 102700
},
{
"epoch": 1.066755217034877,
"grad_norm": 3.7757728099823,
"learning_rate": 0.00019332551599613975,
"loss": 3.64797119140625,
"step": 102800
},
{
"epoch": 1.067792916662343,
"grad_norm": 5.452692985534668,
"learning_rate": 0.00019322174603339314,
"loss": 3.7128118896484374,
"step": 102900
},
{
"epoch": 1.0688306162898087,
"grad_norm": 2.324277639389038,
"learning_rate": 0.0001931179760706466,
"loss": 3.5481451416015624,
"step": 103000
},
{
"epoch": 1.0698683159172746,
"grad_norm": 2.998181104660034,
"learning_rate": 0.00019301420610789999,
"loss": 3.6443612670898435,
"step": 103100
},
{
"epoch": 1.0709060155447405,
"grad_norm": 5.453862190246582,
"learning_rate": 0.0001929104361451534,
"loss": 3.7542648315429688,
"step": 103200
},
{
"epoch": 1.0719437151722062,
"grad_norm": 7.444779396057129,
"learning_rate": 0.0001928066661824068,
"loss": 3.696410827636719,
"step": 103300
},
{
"epoch": 1.0729814147996721,
"grad_norm": 4.7863569259643555,
"learning_rate": 0.00019270289621966025,
"loss": 3.6802603149414064,
"step": 103400
},
{
"epoch": 1.0740191144271378,
"grad_norm": 2.9291558265686035,
"learning_rate": 0.00019259912625691367,
"loss": 3.7929959106445312,
"step": 103500
},
{
"epoch": 1.0750568140546037,
"grad_norm": 3.2032582759857178,
"learning_rate": 0.00019249535629416707,
"loss": 3.6861895751953124,
"step": 103600
},
{
"epoch": 1.0760945136820697,
"grad_norm": 3.1435580253601074,
"learning_rate": 0.0001923915863314205,
"loss": 3.799478759765625,
"step": 103700
},
{
"epoch": 1.0771322133095353,
"grad_norm": 2.8310792446136475,
"learning_rate": 0.00019228781636867388,
"loss": 3.73474365234375,
"step": 103800
},
{
"epoch": 1.0781699129370013,
"grad_norm": 2.285276174545288,
"learning_rate": 0.00019218404640592733,
"loss": 3.6168304443359376,
"step": 103900
},
{
"epoch": 1.0792076125644672,
"grad_norm": 5.524131774902344,
"learning_rate": 0.00019208027644318073,
"loss": 3.710784912109375,
"step": 104000
},
{
"epoch": 1.0802453121919329,
"grad_norm": 3.545400619506836,
"learning_rate": 0.00019197650648043415,
"loss": 3.6640530395507813,
"step": 104100
},
{
"epoch": 1.0812830118193988,
"grad_norm": 3.101451873779297,
"learning_rate": 0.0001918727365176876,
"loss": 3.7735882568359376,
"step": 104200
},
{
"epoch": 1.0823207114468647,
"grad_norm": 2.4820311069488525,
"learning_rate": 0.000191768966554941,
"loss": 3.6366726684570314,
"step": 104300
},
{
"epoch": 1.0833584110743304,
"grad_norm": 26.539804458618164,
"learning_rate": 0.00019166519659219442,
"loss": 3.7211334228515627,
"step": 104400
},
{
"epoch": 1.0843961107017963,
"grad_norm": 3.41780161857605,
"learning_rate": 0.0001915614266294478,
"loss": 3.60020263671875,
"step": 104500
},
{
"epoch": 1.085433810329262,
"grad_norm": 2.689753293991089,
"learning_rate": 0.00019145765666670123,
"loss": 3.7544232177734376,
"step": 104600
},
{
"epoch": 1.086471509956728,
"grad_norm": 2.2958478927612305,
"learning_rate": 0.00019135388670395468,
"loss": 3.849725646972656,
"step": 104700
},
{
"epoch": 1.0875092095841938,
"grad_norm": 3.697185754776001,
"learning_rate": 0.00019125011674120808,
"loss": 3.813602294921875,
"step": 104800
},
{
"epoch": 1.0885469092116595,
"grad_norm": 2.1992783546447754,
"learning_rate": 0.0001911463467784615,
"loss": 3.6952606201171876,
"step": 104900
},
{
"epoch": 1.0895846088391254,
"grad_norm": 2.1027495861053467,
"learning_rate": 0.0001910425768157149,
"loss": 3.6720751953125,
"step": 105000
},
{
"epoch": 1.0906223084665914,
"grad_norm": 2.2862184047698975,
"learning_rate": 0.00019093880685296832,
"loss": 3.729759521484375,
"step": 105100
},
{
"epoch": 1.091660008094057,
"grad_norm": 2.060633659362793,
"learning_rate": 0.00019083503689022174,
"loss": 3.7085842895507812,
"step": 105200
},
{
"epoch": 1.092697707721523,
"grad_norm": 2.636503219604492,
"learning_rate": 0.00019073126692747516,
"loss": 3.6184716796875,
"step": 105300
},
{
"epoch": 1.0937354073489887,
"grad_norm": 7.98659086227417,
"learning_rate": 0.00019062749696472858,
"loss": 3.875008544921875,
"step": 105400
},
{
"epoch": 1.0947731069764546,
"grad_norm": 3.7854599952697754,
"learning_rate": 0.00019052372700198198,
"loss": 3.8590658569335936,
"step": 105500
},
{
"epoch": 1.0958108066039205,
"grad_norm": 9.304828643798828,
"learning_rate": 0.00019041995703923543,
"loss": 3.7910305786132814,
"step": 105600
},
{
"epoch": 1.0968485062313862,
"grad_norm": 6.323867321014404,
"learning_rate": 0.00019031618707648882,
"loss": 3.763433532714844,
"step": 105700
},
{
"epoch": 1.097886205858852,
"grad_norm": 5.698137283325195,
"learning_rate": 0.00019021241711374224,
"loss": 3.6159381103515624,
"step": 105800
},
{
"epoch": 1.098923905486318,
"grad_norm": 80.88331604003906,
"learning_rate": 0.00019010864715099564,
"loss": 3.738255920410156,
"step": 105900
},
{
"epoch": 1.0999616051137837,
"grad_norm": 4.7448577880859375,
"learning_rate": 0.00019000487718824906,
"loss": 3.6675250244140627,
"step": 106000
},
{
"epoch": 1.1009993047412496,
"grad_norm": 5.72471809387207,
"learning_rate": 0.0001899011072255025,
"loss": 3.7835205078125,
"step": 106100
},
{
"epoch": 1.1020370043687153,
"grad_norm": 3.3427250385284424,
"learning_rate": 0.0001897973372627559,
"loss": 3.6577874755859376,
"step": 106200
},
{
"epoch": 1.1030747039961812,
"grad_norm": 15.587642669677734,
"learning_rate": 0.00018969356730000932,
"loss": 3.716649169921875,
"step": 106300
},
{
"epoch": 1.1041124036236472,
"grad_norm": 4.485306262969971,
"learning_rate": 0.00018958979733726272,
"loss": 3.8367926025390626,
"step": 106400
},
{
"epoch": 1.1051501032511128,
"grad_norm": 2.82476806640625,
"learning_rate": 0.00018948602737451617,
"loss": 3.7493435668945314,
"step": 106500
},
{
"epoch": 1.1061878028785788,
"grad_norm": 15.561006546020508,
"learning_rate": 0.0001893822574117696,
"loss": 3.826619873046875,
"step": 106600
},
{
"epoch": 1.1072255025060447,
"grad_norm": 2.592461109161377,
"learning_rate": 0.00018927848744902299,
"loss": 3.7684344482421874,
"step": 106700
},
{
"epoch": 1.1082632021335104,
"grad_norm": 7.259844779968262,
"learning_rate": 0.0001891747174862764,
"loss": 3.758468017578125,
"step": 106800
},
{
"epoch": 1.1093009017609763,
"grad_norm": 5.973848342895508,
"learning_rate": 0.0001890709475235298,
"loss": 3.638338317871094,
"step": 106900
},
{
"epoch": 1.1103386013884422,
"grad_norm": 4.451427459716797,
"learning_rate": 0.00018896717756078325,
"loss": 3.788179626464844,
"step": 107000
},
{
"epoch": 1.111376301015908,
"grad_norm": 4.0467143058776855,
"learning_rate": 0.00018886340759803665,
"loss": 3.7329791259765623,
"step": 107100
},
{
"epoch": 1.1124140006433738,
"grad_norm": 5.440663814544678,
"learning_rate": 0.00018875963763529007,
"loss": 3.9233663940429686,
"step": 107200
},
{
"epoch": 1.1134517002708395,
"grad_norm": 2.327005386352539,
"learning_rate": 0.00018865586767254352,
"loss": 3.688836975097656,
"step": 107300
},
{
"epoch": 1.1144893998983054,
"grad_norm": 2.948439598083496,
"learning_rate": 0.0001885520977097969,
"loss": 3.623143310546875,
"step": 107400
},
{
"epoch": 1.1155270995257713,
"grad_norm": 8.996918678283691,
"learning_rate": 0.00018844832774705033,
"loss": 3.6873675537109376,
"step": 107500
},
{
"epoch": 1.116564799153237,
"grad_norm": 13.88825798034668,
"learning_rate": 0.00018834455778430373,
"loss": 3.889109802246094,
"step": 107600
},
{
"epoch": 1.117602498780703,
"grad_norm": 4.712568283081055,
"learning_rate": 0.00018824078782155715,
"loss": 3.8336361694335936,
"step": 107700
},
{
"epoch": 1.1186401984081689,
"grad_norm": 9.021018028259277,
"learning_rate": 0.00018813701785881055,
"loss": 3.818023681640625,
"step": 107800
},
{
"epoch": 1.1196778980356346,
"grad_norm": 4.5635294914245605,
"learning_rate": 0.000188033247896064,
"loss": 3.8210824584960936,
"step": 107900
},
{
"epoch": 1.1207155976631005,
"grad_norm": 6.118738651275635,
"learning_rate": 0.00018792947793331742,
"loss": 3.762948303222656,
"step": 108000
},
{
"epoch": 1.1217532972905664,
"grad_norm": 6.2977824211120605,
"learning_rate": 0.0001878257079705708,
"loss": 3.7840338134765625,
"step": 108100
},
{
"epoch": 1.122790996918032,
"grad_norm": 5.161929607391357,
"learning_rate": 0.00018772193800782423,
"loss": 3.8385690307617186,
"step": 108200
},
{
"epoch": 1.123828696545498,
"grad_norm": 19.5078067779541,
"learning_rate": 0.00018761816804507765,
"loss": 3.708250732421875,
"step": 108300
},
{
"epoch": 1.1248663961729637,
"grad_norm": 6.583184242248535,
"learning_rate": 0.00018751439808233108,
"loss": 3.6731692504882814,
"step": 108400
},
{
"epoch": 1.1259040958004296,
"grad_norm": 2.8113479614257812,
"learning_rate": 0.0001874106281195845,
"loss": 3.776397705078125,
"step": 108500
},
{
"epoch": 1.1269417954278955,
"grad_norm": 3.526796340942383,
"learning_rate": 0.0001873068581568379,
"loss": 3.713113098144531,
"step": 108600
},
{
"epoch": 1.1279794950553612,
"grad_norm": 4.96720027923584,
"learning_rate": 0.00018720308819409134,
"loss": 3.758629150390625,
"step": 108700
},
{
"epoch": 1.1290171946828271,
"grad_norm": 2.3801918029785156,
"learning_rate": 0.00018709931823134474,
"loss": 3.931161193847656,
"step": 108800
},
{
"epoch": 1.1300548943102928,
"grad_norm": 5.336031913757324,
"learning_rate": 0.00018699554826859816,
"loss": 3.7431265258789064,
"step": 108900
},
{
"epoch": 1.1310925939377587,
"grad_norm": 3.3115835189819336,
"learning_rate": 0.00018689177830585155,
"loss": 3.6016845703125,
"step": 109000
},
{
"epoch": 1.1321302935652247,
"grad_norm": 3.2625627517700195,
"learning_rate": 0.00018678800834310498,
"loss": 3.8173687744140623,
"step": 109100
},
{
"epoch": 1.1331679931926903,
"grad_norm": 3.4688777923583984,
"learning_rate": 0.00018668423838035843,
"loss": 3.7339138793945312,
"step": 109200
},
{
"epoch": 1.1342056928201563,
"grad_norm": 5.170476913452148,
"learning_rate": 0.00018658046841761182,
"loss": 3.8035733032226564,
"step": 109300
},
{
"epoch": 1.1352433924476222,
"grad_norm": 6.003453731536865,
"learning_rate": 0.00018647669845486524,
"loss": 3.7767242431640624,
"step": 109400
},
{
"epoch": 1.1362810920750879,
"grad_norm": 3.4862396717071533,
"learning_rate": 0.00018637292849211864,
"loss": 3.643880615234375,
"step": 109500
},
{
"epoch": 1.1373187917025538,
"grad_norm": 5.885380268096924,
"learning_rate": 0.00018626915852937209,
"loss": 3.7285040283203124,
"step": 109600
},
{
"epoch": 1.1383564913300197,
"grad_norm": 2.839015245437622,
"learning_rate": 0.00018616538856662548,
"loss": 3.7614910888671873,
"step": 109700
},
{
"epoch": 1.1393941909574854,
"grad_norm": 10.154685020446777,
"learning_rate": 0.0001860616186038789,
"loss": 3.635873107910156,
"step": 109800
},
{
"epoch": 1.1404318905849513,
"grad_norm": 11.110898971557617,
"learning_rate": 0.00018595784864113232,
"loss": 3.690367431640625,
"step": 109900
},
{
"epoch": 1.141469590212417,
"grad_norm": 2.4880504608154297,
"learning_rate": 0.00018585407867838572,
"loss": 3.69529541015625,
"step": 110000
},
{
"epoch": 1.142507289839883,
"grad_norm": 12.104265213012695,
"learning_rate": 0.00018575030871563917,
"loss": 3.81604736328125,
"step": 110100
},
{
"epoch": 1.1435449894673488,
"grad_norm": 4.529385089874268,
"learning_rate": 0.00018564653875289256,
"loss": 3.847531433105469,
"step": 110200
},
{
"epoch": 1.1445826890948145,
"grad_norm": 4.51477575302124,
"learning_rate": 0.00018554276879014598,
"loss": 3.6786367797851565,
"step": 110300
},
{
"epoch": 1.1456203887222804,
"grad_norm": 3.946871757507324,
"learning_rate": 0.00018543899882739943,
"loss": 3.7411343383789064,
"step": 110400
},
{
"epoch": 1.1466580883497464,
"grad_norm": 24.773929595947266,
"learning_rate": 0.00018533522886465283,
"loss": 3.6971206665039062,
"step": 110500
},
{
"epoch": 1.147695787977212,
"grad_norm": 4.848511695861816,
"learning_rate": 0.00018523145890190625,
"loss": 3.6610791015625,
"step": 110600
},
{
"epoch": 1.148733487604678,
"grad_norm": 3.155839681625366,
"learning_rate": 0.00018512768893915965,
"loss": 3.6824301147460936,
"step": 110700
},
{
"epoch": 1.1497711872321439,
"grad_norm": 3.4173624515533447,
"learning_rate": 0.00018502391897641307,
"loss": 3.729654541015625,
"step": 110800
},
{
"epoch": 1.1508088868596096,
"grad_norm": 3.1743650436401367,
"learning_rate": 0.00018492014901366646,
"loss": 3.752574157714844,
"step": 110900
},
{
"epoch": 1.1518465864870755,
"grad_norm": 5.655935287475586,
"learning_rate": 0.0001848163790509199,
"loss": 3.6886166381835936,
"step": 111000
},
{
"epoch": 1.1528842861145412,
"grad_norm": 2.8840067386627197,
"learning_rate": 0.00018471260908817333,
"loss": 3.8322817993164064,
"step": 111100
},
{
"epoch": 1.153921985742007,
"grad_norm": 4.1215057373046875,
"learning_rate": 0.00018460883912542673,
"loss": 3.634107971191406,
"step": 111200
},
{
"epoch": 1.154959685369473,
"grad_norm": 8.988388061523438,
"learning_rate": 0.00018450506916268018,
"loss": 3.813481750488281,
"step": 111300
},
{
"epoch": 1.1559973849969387,
"grad_norm": 4.154327869415283,
"learning_rate": 0.00018440129919993357,
"loss": 3.792846374511719,
"step": 111400
},
{
"epoch": 1.1570350846244046,
"grad_norm": 5.43167781829834,
"learning_rate": 0.000184297529237187,
"loss": 3.695276794433594,
"step": 111500
},
{
"epoch": 1.1580727842518705,
"grad_norm": 2.1235880851745605,
"learning_rate": 0.0001841937592744404,
"loss": 3.7109506225585935,
"step": 111600
},
{
"epoch": 1.1591104838793362,
"grad_norm": 3.2670278549194336,
"learning_rate": 0.0001840899893116938,
"loss": 3.779457702636719,
"step": 111700
},
{
"epoch": 1.1601481835068022,
"grad_norm": 4.596736431121826,
"learning_rate": 0.00018398621934894726,
"loss": 3.690837097167969,
"step": 111800
},
{
"epoch": 1.161185883134268,
"grad_norm": 5.063496112823486,
"learning_rate": 0.00018388244938620065,
"loss": 3.7899896240234376,
"step": 111900
},
{
"epoch": 1.1622235827617338,
"grad_norm": 3.2700915336608887,
"learning_rate": 0.00018377867942345408,
"loss": 3.7538375854492188,
"step": 112000
},
{
"epoch": 1.1632612823891997,
"grad_norm": 2.544558048248291,
"learning_rate": 0.00018367490946070747,
"loss": 3.7601394653320312,
"step": 112100
},
{
"epoch": 1.1642989820166654,
"grad_norm": 6.950151443481445,
"learning_rate": 0.0001835711394979609,
"loss": 3.797687683105469,
"step": 112200
},
{
"epoch": 1.1653366816441313,
"grad_norm": 2.161999464035034,
"learning_rate": 0.00018346736953521434,
"loss": 3.7408486938476564,
"step": 112300
},
{
"epoch": 1.1663743812715972,
"grad_norm": 2.824725866317749,
"learning_rate": 0.00018336359957246774,
"loss": 3.708443298339844,
"step": 112400
},
{
"epoch": 1.167412080899063,
"grad_norm": 11.807979583740234,
"learning_rate": 0.00018325982960972116,
"loss": 3.6650485229492187,
"step": 112500
},
{
"epoch": 1.1684497805265288,
"grad_norm": 12.751113891601562,
"learning_rate": 0.00018315605964697455,
"loss": 3.5273590087890625,
"step": 112600
},
{
"epoch": 1.1694874801539945,
"grad_norm": 3.0161349773406982,
"learning_rate": 0.000183052289684228,
"loss": 3.8431436157226564,
"step": 112700
},
{
"epoch": 1.1705251797814604,
"grad_norm": 8.852095603942871,
"learning_rate": 0.0001829485197214814,
"loss": 3.6667901611328126,
"step": 112800
},
{
"epoch": 1.1715628794089263,
"grad_norm": 16.80730438232422,
"learning_rate": 0.00018284474975873482,
"loss": 3.7361489868164064,
"step": 112900
},
{
"epoch": 1.172600579036392,
"grad_norm": 4.340658187866211,
"learning_rate": 0.00018274097979598824,
"loss": 3.7126028442382815,
"step": 113000
},
{
"epoch": 1.173638278663858,
"grad_norm": 2.2295515537261963,
"learning_rate": 0.00018263720983324164,
"loss": 3.6620779418945313,
"step": 113100
},
{
"epoch": 1.1746759782913239,
"grad_norm": 3.5379912853240967,
"learning_rate": 0.00018253343987049509,
"loss": 3.60224609375,
"step": 113200
},
{
"epoch": 1.1757136779187896,
"grad_norm": 3.174776315689087,
"learning_rate": 0.00018242966990774848,
"loss": 3.7180682373046876,
"step": 113300
},
{
"epoch": 1.1767513775462555,
"grad_norm": 4.343127250671387,
"learning_rate": 0.0001823258999450019,
"loss": 3.7377755737304685,
"step": 113400
},
{
"epoch": 1.1777890771737214,
"grad_norm": 21.170530319213867,
"learning_rate": 0.0001822221299822553,
"loss": 3.752294921875,
"step": 113500
},
{
"epoch": 1.178826776801187,
"grad_norm": 4.612101078033447,
"learning_rate": 0.00018211836001950875,
"loss": 3.8363751220703124,
"step": 113600
},
{
"epoch": 1.179864476428653,
"grad_norm": 6.276144981384277,
"learning_rate": 0.00018201459005676217,
"loss": 3.713616943359375,
"step": 113700
},
{
"epoch": 1.1809021760561187,
"grad_norm": 10.716604232788086,
"learning_rate": 0.00018191082009401556,
"loss": 3.629880676269531,
"step": 113800
},
{
"epoch": 1.1819398756835846,
"grad_norm": 2.2933573722839355,
"learning_rate": 0.00018180705013126898,
"loss": 3.8490249633789064,
"step": 113900
},
{
"epoch": 1.1829775753110505,
"grad_norm": 4.147966384887695,
"learning_rate": 0.00018170328016852238,
"loss": 3.5557064819335937,
"step": 114000
},
{
"epoch": 1.1840152749385162,
"grad_norm": 3.122669219970703,
"learning_rate": 0.00018159951020577583,
"loss": 3.73438232421875,
"step": 114100
},
{
"epoch": 1.1850529745659821,
"grad_norm": 9.210347175598145,
"learning_rate": 0.00018149574024302925,
"loss": 3.6972500610351564,
"step": 114200
},
{
"epoch": 1.186090674193448,
"grad_norm": 17.161890029907227,
"learning_rate": 0.00018139197028028265,
"loss": 3.819235534667969,
"step": 114300
},
{
"epoch": 1.1871283738209137,
"grad_norm": 5.225100040435791,
"learning_rate": 0.0001812882003175361,
"loss": 3.7081121826171874,
"step": 114400
},
{
"epoch": 1.1881660734483797,
"grad_norm": 8.891063690185547,
"learning_rate": 0.0001811844303547895,
"loss": 3.7459030151367188,
"step": 114500
},
{
"epoch": 1.1892037730758456,
"grad_norm": 3.465555429458618,
"learning_rate": 0.0001810806603920429,
"loss": 3.7495687866210936,
"step": 114600
},
{
"epoch": 1.1902414727033113,
"grad_norm": 2.962984561920166,
"learning_rate": 0.0001809768904292963,
"loss": 3.620650329589844,
"step": 114700
},
{
"epoch": 1.1912791723307772,
"grad_norm": 66.27200317382812,
"learning_rate": 0.00018087312046654973,
"loss": 3.8266671752929686,
"step": 114800
},
{
"epoch": 1.1923168719582429,
"grad_norm": 10.21193790435791,
"learning_rate": 0.00018076935050380318,
"loss": 3.7377734375,
"step": 114900
},
{
"epoch": 1.1933545715857088,
"grad_norm": 4.959332466125488,
"learning_rate": 0.00018066558054105657,
"loss": 3.767408752441406,
"step": 115000
},
{
"epoch": 1.1943922712131747,
"grad_norm": 4.304464817047119,
"learning_rate": 0.00018056181057831,
"loss": 3.793067626953125,
"step": 115100
},
{
"epoch": 1.1954299708406404,
"grad_norm": 4.872037887573242,
"learning_rate": 0.0001804580406155634,
"loss": 3.754971923828125,
"step": 115200
},
{
"epoch": 1.1964676704681063,
"grad_norm": 5.543403625488281,
"learning_rate": 0.0001803542706528168,
"loss": 3.6738140869140623,
"step": 115300
},
{
"epoch": 1.1975053700955722,
"grad_norm": 4.535797595977783,
"learning_rate": 0.00018025050069007023,
"loss": 3.6706658935546876,
"step": 115400
},
{
"epoch": 1.198543069723038,
"grad_norm": 3.987654209136963,
"learning_rate": 0.00018014673072732365,
"loss": 3.7104837036132814,
"step": 115500
},
{
"epoch": 1.1995807693505038,
"grad_norm": 4.604912757873535,
"learning_rate": 0.00018004296076457708,
"loss": 3.7295111083984374,
"step": 115600
},
{
"epoch": 1.2006184689779698,
"grad_norm": 7.51154088973999,
"learning_rate": 0.00017993919080183047,
"loss": 3.882249755859375,
"step": 115700
},
{
"epoch": 1.2016561686054354,
"grad_norm": 7.570425987243652,
"learning_rate": 0.00017983542083908392,
"loss": 3.7709280395507814,
"step": 115800
},
{
"epoch": 1.2026938682329014,
"grad_norm": 7.528663635253906,
"learning_rate": 0.00017973165087633731,
"loss": 3.744920654296875,
"step": 115900
},
{
"epoch": 1.203731567860367,
"grad_norm": 4.613593578338623,
"learning_rate": 0.00017962788091359074,
"loss": 3.81932373046875,
"step": 116000
},
{
"epoch": 1.204769267487833,
"grad_norm": 4.6101508140563965,
"learning_rate": 0.00017952411095084416,
"loss": 3.701668701171875,
"step": 116100
},
{
"epoch": 1.2058069671152989,
"grad_norm": 3.3336641788482666,
"learning_rate": 0.00017942034098809755,
"loss": 3.5936102294921874,
"step": 116200
},
{
"epoch": 1.2068446667427646,
"grad_norm": 8.796258926391602,
"learning_rate": 0.000179316571025351,
"loss": 3.6812298583984373,
"step": 116300
},
{
"epoch": 1.2078823663702305,
"grad_norm": 2.9002747535705566,
"learning_rate": 0.0001792128010626044,
"loss": 3.79119873046875,
"step": 116400
},
{
"epoch": 1.2089200659976962,
"grad_norm": 3.5677108764648438,
"learning_rate": 0.00017910903109985782,
"loss": 3.868831787109375,
"step": 116500
},
{
"epoch": 1.209957765625162,
"grad_norm": 10.07345199584961,
"learning_rate": 0.00017900526113711121,
"loss": 3.8205535888671873,
"step": 116600
},
{
"epoch": 1.210995465252628,
"grad_norm": 2.9789609909057617,
"learning_rate": 0.00017890149117436466,
"loss": 3.655535888671875,
"step": 116700
},
{
"epoch": 1.2120331648800937,
"grad_norm": 7.362621784210205,
"learning_rate": 0.00017879772121161808,
"loss": 3.5663858032226563,
"step": 116800
},
{
"epoch": 1.2130708645075596,
"grad_norm": 3.515774726867676,
"learning_rate": 0.00017869395124887148,
"loss": 3.64054443359375,
"step": 116900
},
{
"epoch": 1.2141085641350255,
"grad_norm": 2.5356316566467285,
"learning_rate": 0.0001785901812861249,
"loss": 3.621481628417969,
"step": 117000
},
{
"epoch": 1.2151462637624912,
"grad_norm": 4.910796642303467,
"learning_rate": 0.0001784864113233783,
"loss": 3.6991619873046875,
"step": 117100
},
{
"epoch": 1.2161839633899572,
"grad_norm": 4.202451705932617,
"learning_rate": 0.00017838264136063175,
"loss": 3.8038519287109374,
"step": 117200
},
{
"epoch": 1.217221663017423,
"grad_norm": 4.467262268066406,
"learning_rate": 0.00017827887139788514,
"loss": 3.771558837890625,
"step": 117300
},
{
"epoch": 1.2182593626448888,
"grad_norm": 3.9160234928131104,
"learning_rate": 0.00017817510143513856,
"loss": 3.7639215087890623,
"step": 117400
},
{
"epoch": 1.2192970622723547,
"grad_norm": 4.396745681762695,
"learning_rate": 0.000178071331472392,
"loss": 3.68260498046875,
"step": 117500
},
{
"epoch": 1.2203347618998204,
"grad_norm": 3.5205559730529785,
"learning_rate": 0.0001779675615096454,
"loss": 3.6396359252929686,
"step": 117600
},
{
"epoch": 1.2213724615272863,
"grad_norm": 3.1027088165283203,
"learning_rate": 0.00017786379154689883,
"loss": 3.5732858276367185,
"step": 117700
},
{
"epoch": 1.2224101611547522,
"grad_norm": 2.6304574012756348,
"learning_rate": 0.00017776002158415222,
"loss": 3.508619384765625,
"step": 117800
},
{
"epoch": 1.223447860782218,
"grad_norm": 2.9613137245178223,
"learning_rate": 0.00017765625162140564,
"loss": 3.65043212890625,
"step": 117900
},
{
"epoch": 1.2244855604096838,
"grad_norm": 3.6579976081848145,
"learning_rate": 0.0001775524816586591,
"loss": 3.805189514160156,
"step": 118000
},
{
"epoch": 1.2255232600371497,
"grad_norm": 2.3908674716949463,
"learning_rate": 0.0001774487116959125,
"loss": 3.608123474121094,
"step": 118100
},
{
"epoch": 1.2265609596646154,
"grad_norm": 3.335692882537842,
"learning_rate": 0.0001773449417331659,
"loss": 3.707095947265625,
"step": 118200
},
{
"epoch": 1.2275986592920813,
"grad_norm": 5.722865581512451,
"learning_rate": 0.0001772411717704193,
"loss": 3.7158029174804685,
"step": 118300
},
{
"epoch": 1.2286363589195473,
"grad_norm": 9.1022310256958,
"learning_rate": 0.00017713740180767273,
"loss": 3.7301669311523438,
"step": 118400
},
{
"epoch": 1.229674058547013,
"grad_norm": 5.698774814605713,
"learning_rate": 0.00017703363184492615,
"loss": 3.638455810546875,
"step": 118500
},
{
"epoch": 1.2307117581744789,
"grad_norm": 2.373983144760132,
"learning_rate": 0.00017692986188217957,
"loss": 3.6596408081054688,
"step": 118600
},
{
"epoch": 1.2317494578019446,
"grad_norm": 8.193933486938477,
"learning_rate": 0.000176826091919433,
"loss": 3.670250244140625,
"step": 118700
},
{
"epoch": 1.2327871574294105,
"grad_norm": 4.394575119018555,
"learning_rate": 0.0001767223219566864,
"loss": 3.7637249755859377,
"step": 118800
},
{
"epoch": 1.2338248570568764,
"grad_norm": 8.713273048400879,
"learning_rate": 0.00017661855199393984,
"loss": 3.7907025146484377,
"step": 118900
},
{
"epoch": 1.234862556684342,
"grad_norm": 2.0170185565948486,
"learning_rate": 0.00017651478203119323,
"loss": 3.638475036621094,
"step": 119000
},
{
"epoch": 1.235900256311808,
"grad_norm": 14.477542877197266,
"learning_rate": 0.00017641101206844665,
"loss": 3.6606521606445312,
"step": 119100
},
{
"epoch": 1.236937955939274,
"grad_norm": 3.3395235538482666,
"learning_rate": 0.00017630724210570005,
"loss": 3.5342837524414064,
"step": 119200
},
{
"epoch": 1.2379756555667396,
"grad_norm": 3.269758701324463,
"learning_rate": 0.00017620347214295347,
"loss": 3.5976416015625,
"step": 119300
},
{
"epoch": 1.2390133551942055,
"grad_norm": 7.099674224853516,
"learning_rate": 0.00017609970218020692,
"loss": 3.599384460449219,
"step": 119400
},
{
"epoch": 1.2400510548216714,
"grad_norm": 2.358044385910034,
"learning_rate": 0.00017599593221746031,
"loss": 3.4857781982421874,
"step": 119500
},
{
"epoch": 1.2410887544491371,
"grad_norm": 5.485024929046631,
"learning_rate": 0.00017589216225471374,
"loss": 3.69429931640625,
"step": 119600
},
{
"epoch": 1.242126454076603,
"grad_norm": 5.038040637969971,
"learning_rate": 0.00017578839229196713,
"loss": 3.599921875,
"step": 119700
},
{
"epoch": 1.2431641537040687,
"grad_norm": 6.716040134429932,
"learning_rate": 0.00017568462232922058,
"loss": 3.555647888183594,
"step": 119800
},
{
"epoch": 1.2442018533315347,
"grad_norm": 9.499709129333496,
"learning_rate": 0.000175580852366474,
"loss": 3.740644836425781,
"step": 119900
},
{
"epoch": 1.2452395529590006,
"grad_norm": 2.5602540969848633,
"learning_rate": 0.0001754770824037274,
"loss": 3.7783831787109374,
"step": 120000
},
{
"epoch": 1.2462772525864663,
"grad_norm": 5.06706428527832,
"learning_rate": 0.00017537331244098082,
"loss": 3.7457623291015625,
"step": 120100
},
{
"epoch": 1.2473149522139322,
"grad_norm": 4.963079452514648,
"learning_rate": 0.00017526954247823421,
"loss": 3.726761474609375,
"step": 120200
},
{
"epoch": 1.2483526518413979,
"grad_norm": 4.604287624359131,
"learning_rate": 0.00017516577251548766,
"loss": 3.8796881103515624,
"step": 120300
},
{
"epoch": 1.2493903514688638,
"grad_norm": 7.884790897369385,
"learning_rate": 0.00017506200255274106,
"loss": 3.7173165893554687,
"step": 120400
},
{
"epoch": 1.2504280510963297,
"grad_norm": 7.230984687805176,
"learning_rate": 0.00017495823258999448,
"loss": 3.7296737670898437,
"step": 120500
},
{
"epoch": 1.2514657507237956,
"grad_norm": 4.4041032791137695,
"learning_rate": 0.00017485446262724793,
"loss": 3.695928039550781,
"step": 120600
},
{
"epoch": 1.2525034503512613,
"grad_norm": 4.800326347351074,
"learning_rate": 0.00017475069266450132,
"loss": 3.692496032714844,
"step": 120700
},
{
"epoch": 1.2535411499787272,
"grad_norm": 4.20355224609375,
"learning_rate": 0.00017464692270175475,
"loss": 3.724625549316406,
"step": 120800
},
{
"epoch": 1.254578849606193,
"grad_norm": 8.89311408996582,
"learning_rate": 0.00017454315273900814,
"loss": 3.6060061645507813,
"step": 120900
},
{
"epoch": 1.2556165492336588,
"grad_norm": 3.7018239498138428,
"learning_rate": 0.00017443938277626156,
"loss": 3.7614715576171873,
"step": 121000
},
{
"epoch": 1.2566542488611248,
"grad_norm": 3.2457141876220703,
"learning_rate": 0.00017433561281351496,
"loss": 3.729616394042969,
"step": 121100
},
{
"epoch": 1.2576919484885904,
"grad_norm": 9.342671394348145,
"learning_rate": 0.0001742318428507684,
"loss": 3.717445068359375,
"step": 121200
},
{
"epoch": 1.2587296481160564,
"grad_norm": 3.293091058731079,
"learning_rate": 0.00017412807288802183,
"loss": 3.7832305908203123,
"step": 121300
},
{
"epoch": 1.259767347743522,
"grad_norm": 4.222780704498291,
"learning_rate": 0.00017402430292527522,
"loss": 3.7384588623046877,
"step": 121400
},
{
"epoch": 1.260805047370988,
"grad_norm": 3.0761492252349854,
"learning_rate": 0.00017392053296252867,
"loss": 3.7555526733398437,
"step": 121500
},
{
"epoch": 1.261842746998454,
"grad_norm": 2.887803554534912,
"learning_rate": 0.00017381676299978207,
"loss": 3.695442810058594,
"step": 121600
},
{
"epoch": 1.2628804466259196,
"grad_norm": 3.7166850566864014,
"learning_rate": 0.0001737129930370355,
"loss": 3.815606689453125,
"step": 121700
},
{
"epoch": 1.2639181462533855,
"grad_norm": 12.183484077453613,
"learning_rate": 0.0001736092230742889,
"loss": 3.637664794921875,
"step": 121800
},
{
"epoch": 1.2649558458808512,
"grad_norm": 3.1364870071411133,
"learning_rate": 0.0001735054531115423,
"loss": 3.6319699096679687,
"step": 121900
},
{
"epoch": 1.265993545508317,
"grad_norm": 4.354419708251953,
"learning_rate": 0.00017340168314879575,
"loss": 3.786130065917969,
"step": 122000
},
{
"epoch": 1.267031245135783,
"grad_norm": 4.645047664642334,
"learning_rate": 0.00017329791318604915,
"loss": 3.7552008056640624,
"step": 122100
},
{
"epoch": 1.268068944763249,
"grad_norm": 4.269083499908447,
"learning_rate": 0.00017319414322330257,
"loss": 3.7506790161132812,
"step": 122200
},
{
"epoch": 1.2691066443907146,
"grad_norm": 5.066195011138916,
"learning_rate": 0.00017309037326055597,
"loss": 3.788629455566406,
"step": 122300
},
{
"epoch": 1.2701443440181805,
"grad_norm": 5.5616021156311035,
"learning_rate": 0.0001729866032978094,
"loss": 3.6688613891601562,
"step": 122400
},
{
"epoch": 1.2711820436456462,
"grad_norm": 3.1797661781311035,
"learning_rate": 0.00017288283333506284,
"loss": 3.718145751953125,
"step": 122500
},
{
"epoch": 1.2722197432731122,
"grad_norm": 3.063791275024414,
"learning_rate": 0.00017277906337231623,
"loss": 3.66003662109375,
"step": 122600
},
{
"epoch": 1.273257442900578,
"grad_norm": 24.703685760498047,
"learning_rate": 0.00017267529340956965,
"loss": 3.697345886230469,
"step": 122700
},
{
"epoch": 1.2742951425280438,
"grad_norm": 4.573358058929443,
"learning_rate": 0.00017257152344682305,
"loss": 3.770580139160156,
"step": 122800
},
{
"epoch": 1.2753328421555097,
"grad_norm": 6.073929309844971,
"learning_rate": 0.0001724677534840765,
"loss": 3.570367736816406,
"step": 122900
},
{
"epoch": 1.2763705417829754,
"grad_norm": 4.804381847381592,
"learning_rate": 0.0001723639835213299,
"loss": 3.7930453491210936,
"step": 123000
},
{
"epoch": 1.2774082414104413,
"grad_norm": 7.542964935302734,
"learning_rate": 0.00017226021355858331,
"loss": 3.6680117797851564,
"step": 123100
},
{
"epoch": 1.2784459410379072,
"grad_norm": 7.110779285430908,
"learning_rate": 0.00017215644359583674,
"loss": 3.645113830566406,
"step": 123200
},
{
"epoch": 1.2794836406653731,
"grad_norm": 5.410161018371582,
"learning_rate": 0.00017205267363309013,
"loss": 3.7428775024414063,
"step": 123300
},
{
"epoch": 1.2805213402928388,
"grad_norm": 4.089752197265625,
"learning_rate": 0.00017194890367034358,
"loss": 3.7075924682617187,
"step": 123400
},
{
"epoch": 1.2815590399203047,
"grad_norm": 5.877744197845459,
"learning_rate": 0.00017184513370759697,
"loss": 3.546766662597656,
"step": 123500
},
{
"epoch": 1.2825967395477704,
"grad_norm": 4.295921802520752,
"learning_rate": 0.0001717413637448504,
"loss": 3.5129269409179686,
"step": 123600
},
{
"epoch": 1.2836344391752363,
"grad_norm": 7.998104572296143,
"learning_rate": 0.00017163759378210385,
"loss": 3.6661138916015625,
"step": 123700
},
{
"epoch": 1.2846721388027023,
"grad_norm": 4.939531326293945,
"learning_rate": 0.00017153382381935724,
"loss": 3.665038757324219,
"step": 123800
},
{
"epoch": 1.285709838430168,
"grad_norm": 6.5936384201049805,
"learning_rate": 0.00017143005385661066,
"loss": 3.6241445922851563,
"step": 123900
},
{
"epoch": 1.2867475380576339,
"grad_norm": 4.765341281890869,
"learning_rate": 0.00017132628389386406,
"loss": 3.651435546875,
"step": 124000
},
{
"epoch": 1.2877852376850996,
"grad_norm": 5.4220147132873535,
"learning_rate": 0.00017122251393111748,
"loss": 3.8530377197265624,
"step": 124100
},
{
"epoch": 1.2888229373125655,
"grad_norm": 5.066165447235107,
"learning_rate": 0.00017111874396837087,
"loss": 3.6765261840820314,
"step": 124200
},
{
"epoch": 1.2898606369400314,
"grad_norm": 2.871612787246704,
"learning_rate": 0.00017101497400562432,
"loss": 3.7530276489257814,
"step": 124300
},
{
"epoch": 1.2908983365674973,
"grad_norm": 3.5445234775543213,
"learning_rate": 0.00017091120404287774,
"loss": 3.65380126953125,
"step": 124400
},
{
"epoch": 1.291936036194963,
"grad_norm": 12.712068557739258,
"learning_rate": 0.00017080743408013114,
"loss": 3.651844787597656,
"step": 124500
},
{
"epoch": 1.292973735822429,
"grad_norm": 5.535710334777832,
"learning_rate": 0.0001707036641173846,
"loss": 3.648440246582031,
"step": 124600
},
{
"epoch": 1.2940114354498946,
"grad_norm": 6.527225017547607,
"learning_rate": 0.00017059989415463798,
"loss": 3.6168035888671874,
"step": 124700
},
{
"epoch": 1.2950491350773605,
"grad_norm": 3.675743579864502,
"learning_rate": 0.0001704961241918914,
"loss": 3.689391784667969,
"step": 124800
},
{
"epoch": 1.2960868347048264,
"grad_norm": 7.041729927062988,
"learning_rate": 0.0001703923542291448,
"loss": 3.6547369384765624,
"step": 124900
},
{
"epoch": 1.2971245343322921,
"grad_norm": 2.5913071632385254,
"learning_rate": 0.00017028858426639822,
"loss": 3.803846740722656,
"step": 125000
},
{
"epoch": 1.298162233959758,
"grad_norm": 5.099416732788086,
"learning_rate": 0.00017018481430365167,
"loss": 3.661207580566406,
"step": 125100
},
{
"epoch": 1.2991999335872237,
"grad_norm": 3.8206946849823,
"learning_rate": 0.00017008104434090507,
"loss": 3.552643127441406,
"step": 125200
},
{
"epoch": 1.3002376332146897,
"grad_norm": 3.769073247909546,
"learning_rate": 0.0001699772743781585,
"loss": 3.842325439453125,
"step": 125300
},
{
"epoch": 1.3012753328421556,
"grad_norm": 2.529937744140625,
"learning_rate": 0.00016987350441541188,
"loss": 3.676832275390625,
"step": 125400
},
{
"epoch": 1.3023130324696213,
"grad_norm": 7.345049858093262,
"learning_rate": 0.0001697697344526653,
"loss": 3.6286630249023437,
"step": 125500
},
{
"epoch": 1.3033507320970872,
"grad_norm": 7.380908012390137,
"learning_rate": 0.00016966596448991875,
"loss": 3.6627023315429685,
"step": 125600
},
{
"epoch": 1.3043884317245529,
"grad_norm": 2.8857064247131348,
"learning_rate": 0.00016956219452717215,
"loss": 3.641376953125,
"step": 125700
},
{
"epoch": 1.3054261313520188,
"grad_norm": 6.945189476013184,
"learning_rate": 0.00016945842456442557,
"loss": 3.606731262207031,
"step": 125800
},
{
"epoch": 1.3064638309794847,
"grad_norm": 6.422026634216309,
"learning_rate": 0.00016935465460167897,
"loss": 3.5785845947265624,
"step": 125900
},
{
"epoch": 1.3075015306069506,
"grad_norm": 8.35920524597168,
"learning_rate": 0.00016925088463893241,
"loss": 3.6259381103515627,
"step": 126000
},
{
"epoch": 1.3085392302344163,
"grad_norm": 8.193489074707031,
"learning_rate": 0.0001691471146761858,
"loss": 3.7568353271484374,
"step": 126100
},
{
"epoch": 1.3095769298618822,
"grad_norm": 5.267637252807617,
"learning_rate": 0.00016904334471343923,
"loss": 3.757891845703125,
"step": 126200
},
{
"epoch": 1.310614629489348,
"grad_norm": 3.3981618881225586,
"learning_rate": 0.00016893957475069265,
"loss": 3.6808877563476563,
"step": 126300
},
{
"epoch": 1.3116523291168138,
"grad_norm": 11.042278289794922,
"learning_rate": 0.00016883580478794605,
"loss": 3.5690008544921876,
"step": 126400
},
{
"epoch": 1.3126900287442798,
"grad_norm": 12.522445678710938,
"learning_rate": 0.0001687320348251995,
"loss": 3.675894775390625,
"step": 126500
},
{
"epoch": 1.3137277283717455,
"grad_norm": 4.374575138092041,
"learning_rate": 0.0001686282648624529,
"loss": 3.8043743896484377,
"step": 126600
},
{
"epoch": 1.3147654279992114,
"grad_norm": 2.7740325927734375,
"learning_rate": 0.00016852449489970631,
"loss": 3.7183938598632813,
"step": 126700
},
{
"epoch": 1.315803127626677,
"grad_norm": 16.38130760192871,
"learning_rate": 0.0001684207249369597,
"loss": 3.7160101318359375,
"step": 126800
},
{
"epoch": 1.316840827254143,
"grad_norm": 9.450004577636719,
"learning_rate": 0.00016831695497421316,
"loss": 3.6377835083007812,
"step": 126900
},
{
"epoch": 1.317878526881609,
"grad_norm": 8.669651985168457,
"learning_rate": 0.00016821318501146658,
"loss": 3.5026895141601564,
"step": 127000
},
{
"epoch": 1.3189162265090748,
"grad_norm": 4.877604007720947,
"learning_rate": 0.00016810941504871997,
"loss": 3.6808175659179687,
"step": 127100
},
{
"epoch": 1.3199539261365405,
"grad_norm": 9.553235054016113,
"learning_rate": 0.0001680056450859734,
"loss": 3.706498718261719,
"step": 127200
},
{
"epoch": 1.3209916257640064,
"grad_norm": 4.275841236114502,
"learning_rate": 0.0001679018751232268,
"loss": 3.752271728515625,
"step": 127300
},
{
"epoch": 1.322029325391472,
"grad_norm": 7.115382671356201,
"learning_rate": 0.00016779810516048024,
"loss": 3.721490783691406,
"step": 127400
},
{
"epoch": 1.323067025018938,
"grad_norm": 3.066580057144165,
"learning_rate": 0.00016769433519773366,
"loss": 3.67330322265625,
"step": 127500
},
{
"epoch": 1.324104724646404,
"grad_norm": 3.145909547805786,
"learning_rate": 0.00016759056523498706,
"loss": 3.7071697998046873,
"step": 127600
},
{
"epoch": 1.3251424242738696,
"grad_norm": 3.342615842819214,
"learning_rate": 0.0001674867952722405,
"loss": 3.68224853515625,
"step": 127700
},
{
"epoch": 1.3261801239013356,
"grad_norm": 4.780127048492432,
"learning_rate": 0.0001673830253094939,
"loss": 3.914273986816406,
"step": 127800
},
{
"epoch": 1.3272178235288012,
"grad_norm": 8.07118034362793,
"learning_rate": 0.00016727925534674732,
"loss": 3.6639437866210938,
"step": 127900
},
{
"epoch": 1.3282555231562672,
"grad_norm": 6.763175964355469,
"learning_rate": 0.00016717548538400072,
"loss": 3.62579345703125,
"step": 128000
},
{
"epoch": 1.329293222783733,
"grad_norm": 12.123154640197754,
"learning_rate": 0.00016707171542125414,
"loss": 3.721268615722656,
"step": 128100
},
{
"epoch": 1.330330922411199,
"grad_norm": 3.787297010421753,
"learning_rate": 0.0001669679454585076,
"loss": 3.7412783813476564,
"step": 128200
},
{
"epoch": 1.3313686220386647,
"grad_norm": 2.629784107208252,
"learning_rate": 0.00016686417549576098,
"loss": 3.7266500854492186,
"step": 128300
},
{
"epoch": 1.3324063216661306,
"grad_norm": 2.8463058471679688,
"learning_rate": 0.0001667604055330144,
"loss": 3.56947021484375,
"step": 128400
},
{
"epoch": 1.3334440212935963,
"grad_norm": 3.5442264080047607,
"learning_rate": 0.0001666566355702678,
"loss": 3.6988034057617187,
"step": 128500
},
{
"epoch": 1.3344817209210622,
"grad_norm": 3.726022243499756,
"learning_rate": 0.00016655286560752122,
"loss": 3.6229156494140624,
"step": 128600
},
{
"epoch": 1.3355194205485281,
"grad_norm": 5.090481758117676,
"learning_rate": 0.00016644909564477464,
"loss": 3.5555209350585937,
"step": 128700
},
{
"epoch": 1.3365571201759938,
"grad_norm": 5.148849964141846,
"learning_rate": 0.00016634532568202807,
"loss": 3.723890380859375,
"step": 128800
},
{
"epoch": 1.3375948198034597,
"grad_norm": 7.033978462219238,
"learning_rate": 0.0001662415557192815,
"loss": 3.6295504760742188,
"step": 128900
},
{
"epoch": 1.3386325194309254,
"grad_norm": 5.022918701171875,
"learning_rate": 0.00016613778575653488,
"loss": 3.604397888183594,
"step": 129000
},
{
"epoch": 1.3396702190583913,
"grad_norm": 3.9396724700927734,
"learning_rate": 0.00016603401579378833,
"loss": 3.740953369140625,
"step": 129100
},
{
"epoch": 1.3407079186858573,
"grad_norm": 4.96920919418335,
"learning_rate": 0.00016593024583104173,
"loss": 3.6454959106445313,
"step": 129200
},
{
"epoch": 1.341745618313323,
"grad_norm": 3.2997357845306396,
"learning_rate": 0.00016582647586829515,
"loss": 3.64101806640625,
"step": 129300
},
{
"epoch": 1.3427833179407889,
"grad_norm": 12.793081283569336,
"learning_rate": 0.00016572270590554857,
"loss": 3.537852478027344,
"step": 129400
},
{
"epoch": 1.3438210175682546,
"grad_norm": 7.696393013000488,
"learning_rate": 0.00016561893594280197,
"loss": 3.6636843872070313,
"step": 129500
},
{
"epoch": 1.3448587171957205,
"grad_norm": 4.841111183166504,
"learning_rate": 0.00016551516598005541,
"loss": 3.6766192626953127,
"step": 129600
},
{
"epoch": 1.3458964168231864,
"grad_norm": 2.822445869445801,
"learning_rate": 0.0001654113960173088,
"loss": 3.5910659790039063,
"step": 129700
},
{
"epoch": 1.3469341164506523,
"grad_norm": 7.020183086395264,
"learning_rate": 0.00016530762605456223,
"loss": 3.6770706176757812,
"step": 129800
},
{
"epoch": 1.347971816078118,
"grad_norm": 3.323997974395752,
"learning_rate": 0.00016520385609181563,
"loss": 3.673494567871094,
"step": 129900
},
{
"epoch": 1.349009515705584,
"grad_norm": 12.734125137329102,
"learning_rate": 0.00016510008612906907,
"loss": 3.645369873046875,
"step": 130000
},
{
"epoch": 1.3500472153330496,
"grad_norm": 6.959007740020752,
"learning_rate": 0.0001649963161663225,
"loss": 3.5545895385742186,
"step": 130100
},
{
"epoch": 1.3510849149605155,
"grad_norm": 5.492075443267822,
"learning_rate": 0.0001648925462035759,
"loss": 3.73270263671875,
"step": 130200
},
{
"epoch": 1.3521226145879814,
"grad_norm": 5.578936576843262,
"learning_rate": 0.0001647887762408293,
"loss": 3.633159484863281,
"step": 130300
},
{
"epoch": 1.3531603142154471,
"grad_norm": 4.073727607727051,
"learning_rate": 0.0001646850062780827,
"loss": 3.7094195556640623,
"step": 130400
},
{
"epoch": 1.354198013842913,
"grad_norm": 3.7967214584350586,
"learning_rate": 0.00016458123631533616,
"loss": 3.6143753051757814,
"step": 130500
},
{
"epoch": 1.3552357134703787,
"grad_norm": 5.993916034698486,
"learning_rate": 0.00016447746635258955,
"loss": 3.722456359863281,
"step": 130600
},
{
"epoch": 1.3562734130978447,
"grad_norm": 4.235459327697754,
"learning_rate": 0.00016437369638984297,
"loss": 3.7401913452148436,
"step": 130700
},
{
"epoch": 1.3573111127253106,
"grad_norm": 13.88862133026123,
"learning_rate": 0.00016426992642709642,
"loss": 3.746804504394531,
"step": 130800
},
{
"epoch": 1.3583488123527765,
"grad_norm": 5.165769100189209,
"learning_rate": 0.00016416615646434982,
"loss": 3.74326416015625,
"step": 130900
},
{
"epoch": 1.3593865119802422,
"grad_norm": 3.6813595294952393,
"learning_rate": 0.00016406238650160324,
"loss": 3.617030029296875,
"step": 131000
},
{
"epoch": 1.360424211607708,
"grad_norm": 5.9350152015686035,
"learning_rate": 0.00016395861653885663,
"loss": 3.873332214355469,
"step": 131100
},
{
"epoch": 1.3614619112351738,
"grad_norm": 4.220798969268799,
"learning_rate": 0.00016385484657611006,
"loss": 3.6584405517578125,
"step": 131200
},
{
"epoch": 1.3624996108626397,
"grad_norm": 21.21164894104004,
"learning_rate": 0.0001637510766133635,
"loss": 3.617677917480469,
"step": 131300
},
{
"epoch": 1.3635373104901056,
"grad_norm": 5.271477699279785,
"learning_rate": 0.0001636473066506169,
"loss": 3.5792852783203126,
"step": 131400
},
{
"epoch": 1.3645750101175713,
"grad_norm": 4.747986316680908,
"learning_rate": 0.00016354353668787032,
"loss": 3.6235577392578127,
"step": 131500
},
{
"epoch": 1.3656127097450372,
"grad_norm": 3.8399877548217773,
"learning_rate": 0.00016343976672512372,
"loss": 3.780206604003906,
"step": 131600
},
{
"epoch": 1.366650409372503,
"grad_norm": 7.428284645080566,
"learning_rate": 0.00016333599676237714,
"loss": 3.600271911621094,
"step": 131700
},
{
"epoch": 1.3676881089999688,
"grad_norm": 4.4645304679870605,
"learning_rate": 0.00016323222679963056,
"loss": 3.6348703002929685,
"step": 131800
},
{
"epoch": 1.3687258086274348,
"grad_norm": 4.429653167724609,
"learning_rate": 0.00016312845683688398,
"loss": 3.704706726074219,
"step": 131900
},
{
"epoch": 1.3697635082549007,
"grad_norm": 4.308233737945557,
"learning_rate": 0.0001630246868741374,
"loss": 3.704057312011719,
"step": 132000
},
{
"epoch": 1.3708012078823664,
"grad_norm": 12.334646224975586,
"learning_rate": 0.0001629209169113908,
"loss": 3.6710003662109374,
"step": 132100
},
{
"epoch": 1.3718389075098323,
"grad_norm": 5.286363124847412,
"learning_rate": 0.00016281714694864425,
"loss": 3.6472879028320313,
"step": 132200
},
{
"epoch": 1.372876607137298,
"grad_norm": 3.0022027492523193,
"learning_rate": 0.00016271337698589764,
"loss": 3.867461853027344,
"step": 132300
},
{
"epoch": 1.373914306764764,
"grad_norm": 3.6052401065826416,
"learning_rate": 0.00016260960702315107,
"loss": 3.465709533691406,
"step": 132400
},
{
"epoch": 1.3749520063922298,
"grad_norm": 4.250115871429443,
"learning_rate": 0.00016250583706040446,
"loss": 3.6189974975585937,
"step": 132500
},
{
"epoch": 1.3759897060196955,
"grad_norm": 4.520415306091309,
"learning_rate": 0.00016240206709765788,
"loss": 3.697256774902344,
"step": 132600
},
{
"epoch": 1.3770274056471614,
"grad_norm": 3.608278751373291,
"learning_rate": 0.00016229829713491133,
"loss": 3.6748687744140627,
"step": 132700
},
{
"epoch": 1.3780651052746271,
"grad_norm": 3.6304538249969482,
"learning_rate": 0.00016219452717216473,
"loss": 3.6889605712890625,
"step": 132800
},
{
"epoch": 1.379102804902093,
"grad_norm": 4.484381675720215,
"learning_rate": 0.00016209075720941815,
"loss": 3.667810974121094,
"step": 132900
},
{
"epoch": 1.380140504529559,
"grad_norm": 12.79962158203125,
"learning_rate": 0.00016198698724667154,
"loss": 3.901937255859375,
"step": 133000
},
{
"epoch": 1.3811782041570246,
"grad_norm": 3.6465935707092285,
"learning_rate": 0.000161883217283925,
"loss": 3.6334658813476564,
"step": 133100
},
{
"epoch": 1.3822159037844906,
"grad_norm": 2.5269343852996826,
"learning_rate": 0.00016177944732117841,
"loss": 3.6968539428710936,
"step": 133200
},
{
"epoch": 1.3832536034119562,
"grad_norm": 4.01210880279541,
"learning_rate": 0.0001616756773584318,
"loss": 3.4310296630859374,
"step": 133300
},
{
"epoch": 1.3842913030394222,
"grad_norm": 4.493933200836182,
"learning_rate": 0.00016157190739568523,
"loss": 3.719140930175781,
"step": 133400
},
{
"epoch": 1.385329002666888,
"grad_norm": 3.25607967376709,
"learning_rate": 0.00016146813743293863,
"loss": 3.6992584228515626,
"step": 133500
},
{
"epoch": 1.386366702294354,
"grad_norm": 6.134942054748535,
"learning_rate": 0.00016136436747019207,
"loss": 3.748294677734375,
"step": 133600
},
{
"epoch": 1.3874044019218197,
"grad_norm": 3.706012725830078,
"learning_rate": 0.00016126059750744547,
"loss": 3.586408996582031,
"step": 133700
},
{
"epoch": 1.3884421015492856,
"grad_norm": 5.05728816986084,
"learning_rate": 0.0001611568275446989,
"loss": 3.7400482177734373,
"step": 133800
},
{
"epoch": 1.3894798011767513,
"grad_norm": 4.292380332946777,
"learning_rate": 0.00016105305758195234,
"loss": 3.7132363891601563,
"step": 133900
},
{
"epoch": 1.3905175008042172,
"grad_norm": 9.770214080810547,
"learning_rate": 0.00016094928761920573,
"loss": 3.5888162231445313,
"step": 134000
},
{
"epoch": 1.3915552004316831,
"grad_norm": 9.073437690734863,
"learning_rate": 0.00016084551765645916,
"loss": 3.6239898681640623,
"step": 134100
},
{
"epoch": 1.3925929000591488,
"grad_norm": 5.210220813751221,
"learning_rate": 0.00016074174769371255,
"loss": 3.4854669189453125,
"step": 134200
},
{
"epoch": 1.3936305996866147,
"grad_norm": 5.995209693908691,
"learning_rate": 0.00016063797773096597,
"loss": 3.6248184204101563,
"step": 134300
},
{
"epoch": 1.3946682993140804,
"grad_norm": 8.040777206420898,
"learning_rate": 0.00016053420776821937,
"loss": 3.767200622558594,
"step": 134400
},
{
"epoch": 1.3957059989415463,
"grad_norm": 6.153497695922852,
"learning_rate": 0.00016043043780547282,
"loss": 3.6283489990234377,
"step": 134500
},
{
"epoch": 1.3967436985690123,
"grad_norm": 3.4162278175354004,
"learning_rate": 0.00016032666784272624,
"loss": 3.6065017700195314,
"step": 134600
},
{
"epoch": 1.3977813981964782,
"grad_norm": 3.4524638652801514,
"learning_rate": 0.00016022289787997963,
"loss": 3.6301129150390623,
"step": 134700
},
{
"epoch": 1.3988190978239439,
"grad_norm": 6.9367804527282715,
"learning_rate": 0.00016011912791723308,
"loss": 3.6796551513671876,
"step": 134800
},
{
"epoch": 1.3998567974514098,
"grad_norm": 3.629422903060913,
"learning_rate": 0.00016001535795448648,
"loss": 3.745485534667969,
"step": 134900
},
{
"epoch": 1.4008944970788755,
"grad_norm": 3.658010959625244,
"learning_rate": 0.0001599115879917399,
"loss": 3.6311688232421875,
"step": 135000
},
{
"epoch": 1.4019321967063414,
"grad_norm": 16.63618278503418,
"learning_rate": 0.00015980781802899332,
"loss": 3.6807235717773437,
"step": 135100
},
{
"epoch": 1.4029698963338073,
"grad_norm": 6.354872703552246,
"learning_rate": 0.00015970404806624672,
"loss": 3.5296261596679686,
"step": 135200
},
{
"epoch": 1.404007595961273,
"grad_norm": 7.496634483337402,
"learning_rate": 0.00015960027810350017,
"loss": 3.5905780029296874,
"step": 135300
},
{
"epoch": 1.405045295588739,
"grad_norm": 2.790278673171997,
"learning_rate": 0.00015949650814075356,
"loss": 3.544078369140625,
"step": 135400
},
{
"epoch": 1.4060829952162046,
"grad_norm": 5.150670528411865,
"learning_rate": 0.00015939273817800698,
"loss": 3.7144375610351563,
"step": 135500
},
{
"epoch": 1.4071206948436705,
"grad_norm": 5.606545448303223,
"learning_rate": 0.00015928896821526038,
"loss": 3.719892578125,
"step": 135600
},
{
"epoch": 1.4081583944711364,
"grad_norm": 15.23755931854248,
"learning_rate": 0.0001591851982525138,
"loss": 3.649613952636719,
"step": 135700
},
{
"epoch": 1.4091960940986021,
"grad_norm": 20.73650550842285,
"learning_rate": 0.00015908142828976725,
"loss": 3.6828762817382814,
"step": 135800
},
{
"epoch": 1.410233793726068,
"grad_norm": 8.400344848632812,
"learning_rate": 0.00015897765832702064,
"loss": 3.6613919067382814,
"step": 135900
},
{
"epoch": 1.411271493353534,
"grad_norm": 2.5724685192108154,
"learning_rate": 0.00015887388836427407,
"loss": 3.657626037597656,
"step": 136000
},
{
"epoch": 1.4123091929809997,
"grad_norm": 19.325956344604492,
"learning_rate": 0.00015877011840152746,
"loss": 3.8178024291992188,
"step": 136100
},
{
"epoch": 1.4133468926084656,
"grad_norm": 2.402404308319092,
"learning_rate": 0.0001586663484387809,
"loss": 3.59340576171875,
"step": 136200
},
{
"epoch": 1.4143845922359315,
"grad_norm": 6.188352108001709,
"learning_rate": 0.00015856257847603433,
"loss": 3.6710971069335936,
"step": 136300
},
{
"epoch": 1.4154222918633972,
"grad_norm": 4.21588659286499,
"learning_rate": 0.00015845880851328773,
"loss": 3.721273193359375,
"step": 136400
},
{
"epoch": 1.416459991490863,
"grad_norm": 4.4968485832214355,
"learning_rate": 0.00015835503855054115,
"loss": 3.6669491577148436,
"step": 136500
},
{
"epoch": 1.4174976911183288,
"grad_norm": 7.214438438415527,
"learning_rate": 0.00015825126858779454,
"loss": 3.799635925292969,
"step": 136600
},
{
"epoch": 1.4185353907457947,
"grad_norm": 7.262329578399658,
"learning_rate": 0.000158147498625048,
"loss": 3.807882995605469,
"step": 136700
},
{
"epoch": 1.4195730903732606,
"grad_norm": 3.5909628868103027,
"learning_rate": 0.00015804372866230139,
"loss": 3.7313577270507814,
"step": 136800
},
{
"epoch": 1.4206107900007263,
"grad_norm": 10.205459594726562,
"learning_rate": 0.0001579399586995548,
"loss": 3.675950622558594,
"step": 136900
},
{
"epoch": 1.4216484896281922,
"grad_norm": 5.25307559967041,
"learning_rate": 0.00015783618873680826,
"loss": 3.6014810180664063,
"step": 137000
},
{
"epoch": 1.422686189255658,
"grad_norm": 42.26997756958008,
"learning_rate": 0.00015773241877406165,
"loss": 3.6278192138671876,
"step": 137100
},
{
"epoch": 1.4237238888831238,
"grad_norm": 6.092323303222656,
"learning_rate": 0.00015762864881131507,
"loss": 3.555603332519531,
"step": 137200
},
{
"epoch": 1.4247615885105898,
"grad_norm": 2.74434232711792,
"learning_rate": 0.00015752487884856847,
"loss": 3.5426220703125,
"step": 137300
},
{
"epoch": 1.4257992881380557,
"grad_norm": 13.12152099609375,
"learning_rate": 0.0001574211088858219,
"loss": 3.7107192993164064,
"step": 137400
},
{
"epoch": 1.4268369877655214,
"grad_norm": 3.9462010860443115,
"learning_rate": 0.00015731733892307529,
"loss": 3.5455560302734375,
"step": 137500
},
{
"epoch": 1.4278746873929873,
"grad_norm": 3.7687721252441406,
"learning_rate": 0.00015721356896032873,
"loss": 3.630052490234375,
"step": 137600
},
{
"epoch": 1.428912387020453,
"grad_norm": 4.470894813537598,
"learning_rate": 0.00015710979899758216,
"loss": 3.627494201660156,
"step": 137700
},
{
"epoch": 1.429950086647919,
"grad_norm": 4.3846259117126465,
"learning_rate": 0.00015700602903483555,
"loss": 3.5804782104492188,
"step": 137800
},
{
"epoch": 1.4309877862753848,
"grad_norm": 3.9794013500213623,
"learning_rate": 0.000156902259072089,
"loss": 3.739950866699219,
"step": 137900
},
{
"epoch": 1.4320254859028505,
"grad_norm": 10.886957168579102,
"learning_rate": 0.0001567984891093424,
"loss": 3.7072845458984376,
"step": 138000
},
{
"epoch": 1.4330631855303164,
"grad_norm": 4.187902927398682,
"learning_rate": 0.00015669471914659582,
"loss": 3.64345703125,
"step": 138100
},
{
"epoch": 1.4341008851577821,
"grad_norm": 32.209293365478516,
"learning_rate": 0.00015659094918384924,
"loss": 3.6210546875,
"step": 138200
},
{
"epoch": 1.435138584785248,
"grad_norm": 3.12260365486145,
"learning_rate": 0.00015648717922110263,
"loss": 3.7005911254882813,
"step": 138300
},
{
"epoch": 1.436176284412714,
"grad_norm": 6.220150470733643,
"learning_rate": 0.00015638340925835608,
"loss": 3.7236618041992187,
"step": 138400
},
{
"epoch": 1.4372139840401799,
"grad_norm": 2.38154673576355,
"learning_rate": 0.00015627963929560948,
"loss": 3.633033447265625,
"step": 138500
},
{
"epoch": 1.4382516836676456,
"grad_norm": 7.884495258331299,
"learning_rate": 0.0001561758693328629,
"loss": 3.5666903686523437,
"step": 138600
},
{
"epoch": 1.4392893832951115,
"grad_norm": 3.8970346450805664,
"learning_rate": 0.0001560720993701163,
"loss": 3.6862808227539063,
"step": 138700
},
{
"epoch": 1.4403270829225772,
"grad_norm": 3.273268461227417,
"learning_rate": 0.00015596832940736972,
"loss": 3.6251177978515625,
"step": 138800
},
{
"epoch": 1.441364782550043,
"grad_norm": 3.0285887718200684,
"learning_rate": 0.00015586455944462317,
"loss": 3.61291015625,
"step": 138900
},
{
"epoch": 1.442402482177509,
"grad_norm": 3.4767589569091797,
"learning_rate": 0.00015576078948187656,
"loss": 3.6781646728515627,
"step": 139000
},
{
"epoch": 1.4434401818049747,
"grad_norm": 156.1669158935547,
"learning_rate": 0.00015565701951912998,
"loss": 3.6272451782226565,
"step": 139100
},
{
"epoch": 1.4444778814324406,
"grad_norm": 2.3591196537017822,
"learning_rate": 0.00015555324955638338,
"loss": 3.589447021484375,
"step": 139200
},
{
"epoch": 1.4455155810599063,
"grad_norm": 3.8040847778320312,
"learning_rate": 0.00015544947959363683,
"loss": 3.64208251953125,
"step": 139300
},
{
"epoch": 1.4465532806873722,
"grad_norm": 2.655759811401367,
"learning_rate": 0.00015534570963089022,
"loss": 3.671148376464844,
"step": 139400
},
{
"epoch": 1.4475909803148381,
"grad_norm": 7.29696798324585,
"learning_rate": 0.00015524193966814364,
"loss": 3.751770324707031,
"step": 139500
},
{
"epoch": 1.4486286799423038,
"grad_norm": 6.334928035736084,
"learning_rate": 0.00015513816970539706,
"loss": 3.7970040893554686,
"step": 139600
},
{
"epoch": 1.4496663795697697,
"grad_norm": 6.7520623207092285,
"learning_rate": 0.00015503439974265046,
"loss": 3.6929965209960938,
"step": 139700
},
{
"epoch": 1.4507040791972354,
"grad_norm": 10.428074836730957,
"learning_rate": 0.0001549306297799039,
"loss": 3.734377136230469,
"step": 139800
},
{
"epoch": 1.4517417788247013,
"grad_norm": 8.371795654296875,
"learning_rate": 0.0001548268598171573,
"loss": 3.6029412841796873,
"step": 139900
},
{
"epoch": 1.4527794784521673,
"grad_norm": 3.291740894317627,
"learning_rate": 0.00015472308985441073,
"loss": 3.6670523071289063,
"step": 140000
},
{
"epoch": 1.4538171780796332,
"grad_norm": 7.120608806610107,
"learning_rate": 0.00015461931989166417,
"loss": 3.638569030761719,
"step": 140100
},
{
"epoch": 1.4548548777070989,
"grad_norm": 6.361410617828369,
"learning_rate": 0.00015451554992891757,
"loss": 3.661440734863281,
"step": 140200
},
{
"epoch": 1.4558925773345648,
"grad_norm": 3.5337114334106445,
"learning_rate": 0.000154411779966171,
"loss": 3.69423828125,
"step": 140300
},
{
"epoch": 1.4569302769620305,
"grad_norm": 8.946898460388184,
"learning_rate": 0.00015430801000342439,
"loss": 3.636510925292969,
"step": 140400
},
{
"epoch": 1.4579679765894964,
"grad_norm": 3.5454866886138916,
"learning_rate": 0.0001542042400406778,
"loss": 3.833760986328125,
"step": 140500
},
{
"epoch": 1.4590056762169623,
"grad_norm": 20.629167556762695,
"learning_rate": 0.0001541004700779312,
"loss": 3.740248718261719,
"step": 140600
},
{
"epoch": 1.460043375844428,
"grad_norm": 3.0284929275512695,
"learning_rate": 0.00015399670011518465,
"loss": 3.6760980224609376,
"step": 140700
},
{
"epoch": 1.461081075471894,
"grad_norm": 4.971894264221191,
"learning_rate": 0.00015389293015243807,
"loss": 3.600714111328125,
"step": 140800
},
{
"epoch": 1.4621187750993596,
"grad_norm": 3.689394950866699,
"learning_rate": 0.00015378916018969147,
"loss": 3.5257861328125,
"step": 140900
},
{
"epoch": 1.4631564747268255,
"grad_norm": 4.305582523345947,
"learning_rate": 0.00015368539022694492,
"loss": 3.66658447265625,
"step": 141000
},
{
"epoch": 1.4641941743542914,
"grad_norm": 12.191847801208496,
"learning_rate": 0.0001535816202641983,
"loss": 3.5539178466796875,
"step": 141100
},
{
"epoch": 1.4652318739817574,
"grad_norm": 5.9276814460754395,
"learning_rate": 0.00015347785030145173,
"loss": 3.712036437988281,
"step": 141200
},
{
"epoch": 1.466269573609223,
"grad_norm": 7.3767008781433105,
"learning_rate": 0.00015337408033870513,
"loss": 3.688995361328125,
"step": 141300
},
{
"epoch": 1.467307273236689,
"grad_norm": 4.156796932220459,
"learning_rate": 0.00015327031037595855,
"loss": 3.5971023559570314,
"step": 141400
},
{
"epoch": 1.4683449728641547,
"grad_norm": 3.876843214035034,
"learning_rate": 0.000153166540413212,
"loss": 3.7138726806640623,
"step": 141500
},
{
"epoch": 1.4693826724916206,
"grad_norm": 2.5647096633911133,
"learning_rate": 0.0001530627704504654,
"loss": 3.575816650390625,
"step": 141600
},
{
"epoch": 1.4704203721190865,
"grad_norm": 6.341168403625488,
"learning_rate": 0.00015295900048771882,
"loss": 3.675234375,
"step": 141700
},
{
"epoch": 1.4714580717465522,
"grad_norm": 11.66984748840332,
"learning_rate": 0.0001528552305249722,
"loss": 3.5949581909179686,
"step": 141800
},
{
"epoch": 1.472495771374018,
"grad_norm": 2.7472872734069824,
"learning_rate": 0.00015275146056222563,
"loss": 3.4315753173828125,
"step": 141900
},
{
"epoch": 1.4735334710014838,
"grad_norm": 2.7182295322418213,
"learning_rate": 0.00015264769059947908,
"loss": 3.580435791015625,
"step": 142000
},
{
"epoch": 1.4745711706289497,
"grad_norm": 7.28167200088501,
"learning_rate": 0.00015254392063673248,
"loss": 3.6344500732421876,
"step": 142100
},
{
"epoch": 1.4756088702564156,
"grad_norm": 3.1541340351104736,
"learning_rate": 0.0001524401506739859,
"loss": 3.6579803466796874,
"step": 142200
},
{
"epoch": 1.4766465698838815,
"grad_norm": 4.42963171005249,
"learning_rate": 0.0001523363807112393,
"loss": 3.5743417358398437,
"step": 142300
},
{
"epoch": 1.4776842695113472,
"grad_norm": 7.278059005737305,
"learning_rate": 0.00015223261074849274,
"loss": 3.7834173583984376,
"step": 142400
},
{
"epoch": 1.4787219691388132,
"grad_norm": 10.52426528930664,
"learning_rate": 0.00015212884078574614,
"loss": 3.6968179321289063,
"step": 142500
},
{
"epoch": 1.4797596687662788,
"grad_norm": 3.5773837566375732,
"learning_rate": 0.00015202507082299956,
"loss": 3.6810809326171876,
"step": 142600
},
{
"epoch": 1.4807973683937448,
"grad_norm": 3.344587802886963,
"learning_rate": 0.00015192130086025298,
"loss": 3.6345669555664064,
"step": 142700
},
{
"epoch": 1.4818350680212107,
"grad_norm": 6.329004287719727,
"learning_rate": 0.00015181753089750638,
"loss": 3.647319641113281,
"step": 142800
},
{
"epoch": 1.4828727676486764,
"grad_norm": 6.577507495880127,
"learning_rate": 0.00015171376093475983,
"loss": 3.5769888305664064,
"step": 142900
},
{
"epoch": 1.4839104672761423,
"grad_norm": 4.545724391937256,
"learning_rate": 0.00015160999097201322,
"loss": 3.583935546875,
"step": 143000
},
{
"epoch": 1.484948166903608,
"grad_norm": 13.324125289916992,
"learning_rate": 0.00015150622100926664,
"loss": 3.612706604003906,
"step": 143100
},
{
"epoch": 1.485985866531074,
"grad_norm": 4.545955657958984,
"learning_rate": 0.00015140245104652004,
"loss": 3.4066473388671876,
"step": 143200
},
{
"epoch": 1.4870235661585398,
"grad_norm": 8.517041206359863,
"learning_rate": 0.00015129868108377349,
"loss": 3.6258444213867187,
"step": 143300
},
{
"epoch": 1.4880612657860055,
"grad_norm": 5.813758373260498,
"learning_rate": 0.0001511949111210269,
"loss": 3.686318054199219,
"step": 143400
},
{
"epoch": 1.4890989654134714,
"grad_norm": 6.236087322235107,
"learning_rate": 0.0001510911411582803,
"loss": 3.7458810424804687,
"step": 143500
},
{
"epoch": 1.4901366650409371,
"grad_norm": 5.874231815338135,
"learning_rate": 0.00015098737119553373,
"loss": 3.6481814575195313,
"step": 143600
},
{
"epoch": 1.491174364668403,
"grad_norm": 7.229684829711914,
"learning_rate": 0.00015088360123278712,
"loss": 3.6855035400390626,
"step": 143700
},
{
"epoch": 1.492212064295869,
"grad_norm": 7.212390422821045,
"learning_rate": 0.00015077983127004057,
"loss": 3.750265808105469,
"step": 143800
},
{
"epoch": 1.4932497639233349,
"grad_norm": 5.408252239227295,
"learning_rate": 0.000150676061307294,
"loss": 3.5695159912109373,
"step": 143900
},
{
"epoch": 1.4942874635508006,
"grad_norm": 8.125064849853516,
"learning_rate": 0.00015057229134454739,
"loss": 3.642791442871094,
"step": 144000
},
{
"epoch": 1.4953251631782665,
"grad_norm": 5.047210216522217,
"learning_rate": 0.00015046852138180083,
"loss": 3.588906555175781,
"step": 144100
},
{
"epoch": 1.4963628628057322,
"grad_norm": 2.775951623916626,
"learning_rate": 0.00015036475141905423,
"loss": 3.672796325683594,
"step": 144200
},
{
"epoch": 1.497400562433198,
"grad_norm": 7.114427089691162,
"learning_rate": 0.00015026098145630765,
"loss": 3.7460537719726563,
"step": 144300
},
{
"epoch": 1.498438262060664,
"grad_norm": 4.1067585945129395,
"learning_rate": 0.00015015721149356105,
"loss": 3.4047305297851564,
"step": 144400
},
{
"epoch": 1.4994759616881297,
"grad_norm": 6.3360276222229,
"learning_rate": 0.00015005344153081447,
"loss": 3.6055087280273437,
"step": 144500
},
{
"epoch": 1.5005136613155956,
"grad_norm": 3.8499081134796143,
"learning_rate": 0.0001499496715680679,
"loss": 3.6976129150390626,
"step": 144600
},
{
"epoch": 1.5015513609430613,
"grad_norm": 4.669349193572998,
"learning_rate": 0.0001498459016053213,
"loss": 3.6301043701171873,
"step": 144700
},
{
"epoch": 1.5025890605705272,
"grad_norm": 12.484715461730957,
"learning_rate": 0.0001497421316425747,
"loss": 3.6376629638671876,
"step": 144800
},
{
"epoch": 1.5036267601979931,
"grad_norm": 3.1881167888641357,
"learning_rate": 0.00014963836167982816,
"loss": 3.688013000488281,
"step": 144900
},
{
"epoch": 1.504664459825459,
"grad_norm": 3.1999073028564453,
"learning_rate": 0.00014953459171708158,
"loss": 3.767580871582031,
"step": 145000
},
{
"epoch": 1.5057021594529247,
"grad_norm": 2.503138303756714,
"learning_rate": 0.00014943082175433497,
"loss": 3.772780456542969,
"step": 145100
},
{
"epoch": 1.5067398590803904,
"grad_norm": 5.124083995819092,
"learning_rate": 0.0001493270517915884,
"loss": 3.709577941894531,
"step": 145200
},
{
"epoch": 1.5077775587078563,
"grad_norm": 12.24608039855957,
"learning_rate": 0.00014922328182884182,
"loss": 3.46869140625,
"step": 145300
},
{
"epoch": 1.5088152583353223,
"grad_norm": 11.273271560668945,
"learning_rate": 0.0001491195118660952,
"loss": 3.4797503662109377,
"step": 145400
},
{
"epoch": 1.5098529579627882,
"grad_norm": 60.867916107177734,
"learning_rate": 0.00014901574190334866,
"loss": 3.54853515625,
"step": 145500
},
{
"epoch": 1.5108906575902539,
"grad_norm": 4.276978969573975,
"learning_rate": 0.00014891197194060206,
"loss": 3.908219299316406,
"step": 145600
},
{
"epoch": 1.5119283572177198,
"grad_norm": 2.901015281677246,
"learning_rate": 0.00014880820197785548,
"loss": 3.4694091796875,
"step": 145700
},
{
"epoch": 1.5129660568451855,
"grad_norm": 2.3719887733459473,
"learning_rate": 0.0001487044320151089,
"loss": 3.7670758056640623,
"step": 145800
},
{
"epoch": 1.5140037564726514,
"grad_norm": 2.4967026710510254,
"learning_rate": 0.0001486006620523623,
"loss": 3.635834045410156,
"step": 145900
},
{
"epoch": 1.5150414561001173,
"grad_norm": 3.604675769805908,
"learning_rate": 0.00014849689208961572,
"loss": 3.5507608032226563,
"step": 146000
},
{
"epoch": 1.5160791557275832,
"grad_norm": 5.442782402038574,
"learning_rate": 0.00014839312212686916,
"loss": 3.5730636596679686,
"step": 146100
},
{
"epoch": 1.517116855355049,
"grad_norm": 3.7341339588165283,
"learning_rate": 0.00014828935216412256,
"loss": 3.569194641113281,
"step": 146200
},
{
"epoch": 1.5181545549825146,
"grad_norm": 12.070112228393555,
"learning_rate": 0.00014818558220137598,
"loss": 3.60053955078125,
"step": 146300
},
{
"epoch": 1.5191922546099805,
"grad_norm": 5.036438941955566,
"learning_rate": 0.0001480818122386294,
"loss": 3.7114804077148436,
"step": 146400
},
{
"epoch": 1.5202299542374464,
"grad_norm": 10.83106803894043,
"learning_rate": 0.0001479780422758828,
"loss": 3.5428836059570314,
"step": 146500
},
{
"epoch": 1.5212676538649124,
"grad_norm": 9.07150650024414,
"learning_rate": 0.00014787427231313622,
"loss": 3.6087515258789065,
"step": 146600
},
{
"epoch": 1.522305353492378,
"grad_norm": 3.6539382934570312,
"learning_rate": 0.00014777050235038964,
"loss": 3.6974029541015625,
"step": 146700
},
{
"epoch": 1.523343053119844,
"grad_norm": 2.5568654537200928,
"learning_rate": 0.00014766673238764306,
"loss": 3.7100448608398438,
"step": 146800
},
{
"epoch": 1.5243807527473097,
"grad_norm": 5.767122745513916,
"learning_rate": 0.00014756296242489649,
"loss": 3.494932861328125,
"step": 146900
},
{
"epoch": 1.5254184523747756,
"grad_norm": 5.006596088409424,
"learning_rate": 0.00014745919246214988,
"loss": 3.804518737792969,
"step": 147000
},
{
"epoch": 1.5264561520022415,
"grad_norm": 3.907433271408081,
"learning_rate": 0.0001473554224994033,
"loss": 3.6617333984375,
"step": 147100
},
{
"epoch": 1.5274938516297074,
"grad_norm": 6.253331184387207,
"learning_rate": 0.00014725165253665672,
"loss": 3.611311950683594,
"step": 147200
},
{
"epoch": 1.528531551257173,
"grad_norm": 5.735301494598389,
"learning_rate": 0.00014714788257391015,
"loss": 3.605543518066406,
"step": 147300
},
{
"epoch": 1.5295692508846388,
"grad_norm": 1.7375198602676392,
"learning_rate": 0.00014704411261116357,
"loss": 3.6379776000976562,
"step": 147400
},
{
"epoch": 1.5306069505121047,
"grad_norm": 4.913732051849365,
"learning_rate": 0.000146940342648417,
"loss": 3.757569580078125,
"step": 147500
},
{
"epoch": 1.5316446501395706,
"grad_norm": 3.887519598007202,
"learning_rate": 0.00014683657268567039,
"loss": 3.654621887207031,
"step": 147600
},
{
"epoch": 1.5326823497670365,
"grad_norm": 45.76445007324219,
"learning_rate": 0.0001467328027229238,
"loss": 3.611448059082031,
"step": 147700
},
{
"epoch": 1.5337200493945022,
"grad_norm": 3.629575729370117,
"learning_rate": 0.00014662903276017723,
"loss": 3.6693844604492187,
"step": 147800
},
{
"epoch": 1.5347577490219682,
"grad_norm": 2.453900098800659,
"learning_rate": 0.00014652526279743062,
"loss": 3.6880978393554686,
"step": 147900
},
{
"epoch": 1.5357954486494338,
"grad_norm": 3.411557674407959,
"learning_rate": 0.00014642149283468407,
"loss": 3.656671447753906,
"step": 148000
},
{
"epoch": 1.5368331482768998,
"grad_norm": 3.5617477893829346,
"learning_rate": 0.0001463177228719375,
"loss": 3.706895446777344,
"step": 148100
},
{
"epoch": 1.5378708479043657,
"grad_norm": 3.5422544479370117,
"learning_rate": 0.0001462139529091909,
"loss": 3.605690612792969,
"step": 148200
},
{
"epoch": 1.5389085475318316,
"grad_norm": 3.9814698696136475,
"learning_rate": 0.0001461101829464443,
"loss": 3.6530465698242187,
"step": 148300
},
{
"epoch": 1.5399462471592973,
"grad_norm": 10.028122901916504,
"learning_rate": 0.00014600641298369773,
"loss": 3.623879089355469,
"step": 148400
},
{
"epoch": 1.540983946786763,
"grad_norm": 3.4206697940826416,
"learning_rate": 0.00014590264302095113,
"loss": 3.517763366699219,
"step": 148500
},
{
"epoch": 1.542021646414229,
"grad_norm": 3.4238781929016113,
"learning_rate": 0.00014579887305820455,
"loss": 3.52829833984375,
"step": 148600
},
{
"epoch": 1.5430593460416948,
"grad_norm": 58.35453414916992,
"learning_rate": 0.00014569510309545797,
"loss": 3.682017517089844,
"step": 148700
},
{
"epoch": 1.5440970456691607,
"grad_norm": 4.933131217956543,
"learning_rate": 0.0001455913331327114,
"loss": 3.577257080078125,
"step": 148800
},
{
"epoch": 1.5451347452966264,
"grad_norm": 17.892318725585938,
"learning_rate": 0.00014548756316996482,
"loss": 3.710743713378906,
"step": 148900
},
{
"epoch": 1.5461724449240921,
"grad_norm": 6.2961249351501465,
"learning_rate": 0.0001453837932072182,
"loss": 3.6647821044921876,
"step": 149000
},
{
"epoch": 1.547210144551558,
"grad_norm": 4.278889179229736,
"learning_rate": 0.00014528002324447163,
"loss": 3.613748779296875,
"step": 149100
},
{
"epoch": 1.548247844179024,
"grad_norm": 3.2785260677337646,
"learning_rate": 0.00014517625328172505,
"loss": 3.6411376953125,
"step": 149200
},
{
"epoch": 1.5492855438064899,
"grad_norm": 3.227151393890381,
"learning_rate": 0.00014507248331897848,
"loss": 3.758666687011719,
"step": 149300
},
{
"epoch": 1.5503232434339556,
"grad_norm": 2.6391334533691406,
"learning_rate": 0.0001449687133562319,
"loss": 3.5469485473632814,
"step": 149400
},
{
"epoch": 1.5513609430614215,
"grad_norm": 2.5920772552490234,
"learning_rate": 0.00014486494339348532,
"loss": 3.621335754394531,
"step": 149500
},
{
"epoch": 1.5523986426888872,
"grad_norm": 2.864225387573242,
"learning_rate": 0.00014476117343073872,
"loss": 3.6408596801757813,
"step": 149600
},
{
"epoch": 1.553436342316353,
"grad_norm": 4.697976112365723,
"learning_rate": 0.00014465740346799214,
"loss": 3.6993423461914063,
"step": 149700
},
{
"epoch": 1.554474041943819,
"grad_norm": 4.074455738067627,
"learning_rate": 0.00014455363350524556,
"loss": 3.6419488525390626,
"step": 149800
},
{
"epoch": 1.555511741571285,
"grad_norm": 2.933537721633911,
"learning_rate": 0.00014444986354249898,
"loss": 3.622572326660156,
"step": 149900
},
{
"epoch": 1.5565494411987506,
"grad_norm": 5.856564521789551,
"learning_rate": 0.0001443460935797524,
"loss": 3.7532833862304686,
"step": 150000
},
{
"epoch": 1.5575871408262163,
"grad_norm": 4.24385929107666,
"learning_rate": 0.00014424232361700583,
"loss": 3.67490234375,
"step": 150100
},
{
"epoch": 1.5586248404536822,
"grad_norm": 5.053845405578613,
"learning_rate": 0.00014413855365425922,
"loss": 3.7350125122070312,
"step": 150200
},
{
"epoch": 1.5596625400811481,
"grad_norm": 3.423252582550049,
"learning_rate": 0.00014403478369151264,
"loss": 3.522652893066406,
"step": 150300
},
{
"epoch": 1.560700239708614,
"grad_norm": 8.40445327758789,
"learning_rate": 0.00014393101372876606,
"loss": 3.480498962402344,
"step": 150400
},
{
"epoch": 1.5617379393360797,
"grad_norm": 3.1955294609069824,
"learning_rate": 0.00014382724376601946,
"loss": 3.6813082885742188,
"step": 150500
},
{
"epoch": 1.5627756389635457,
"grad_norm": 6.0853681564331055,
"learning_rate": 0.0001437234738032729,
"loss": 3.6238223266601564,
"step": 150600
},
{
"epoch": 1.5638133385910113,
"grad_norm": 5.178461074829102,
"learning_rate": 0.0001436197038405263,
"loss": 3.6469857788085935,
"step": 150700
},
{
"epoch": 1.5648510382184773,
"grad_norm": 8.24820613861084,
"learning_rate": 0.00014351593387777972,
"loss": 3.6198629760742187,
"step": 150800
},
{
"epoch": 1.5658887378459432,
"grad_norm": 4.228358745574951,
"learning_rate": 0.00014341216391503315,
"loss": 3.4716970825195315,
"step": 150900
},
{
"epoch": 1.566926437473409,
"grad_norm": 3.555584192276001,
"learning_rate": 0.00014330839395228654,
"loss": 3.739703369140625,
"step": 151000
},
{
"epoch": 1.5679641371008748,
"grad_norm": 5.781318187713623,
"learning_rate": 0.00014320462398953996,
"loss": 3.5981024169921874,
"step": 151100
},
{
"epoch": 1.5690018367283405,
"grad_norm": 6.903919696807861,
"learning_rate": 0.0001431008540267934,
"loss": 3.5764788818359374,
"step": 151200
},
{
"epoch": 1.5700395363558064,
"grad_norm": 3.584331512451172,
"learning_rate": 0.0001429970840640468,
"loss": 3.6005426025390626,
"step": 151300
},
{
"epoch": 1.5710772359832723,
"grad_norm": 4.393853664398193,
"learning_rate": 0.00014289331410130023,
"loss": 3.78184814453125,
"step": 151400
},
{
"epoch": 1.5721149356107382,
"grad_norm": 2.4552299976348877,
"learning_rate": 0.00014278954413855365,
"loss": 3.7241311645507813,
"step": 151500
},
{
"epoch": 1.573152635238204,
"grad_norm": 6.105810642242432,
"learning_rate": 0.00014268577417580705,
"loss": 3.6668280029296874,
"step": 151600
},
{
"epoch": 1.5741903348656698,
"grad_norm": 5.4593939781188965,
"learning_rate": 0.00014258200421306047,
"loss": 3.6350604248046876,
"step": 151700
},
{
"epoch": 1.5752280344931355,
"grad_norm": 8.01681900024414,
"learning_rate": 0.0001424782342503139,
"loss": 3.636524658203125,
"step": 151800
},
{
"epoch": 1.5762657341206014,
"grad_norm": 27.08595848083496,
"learning_rate": 0.0001423744642875673,
"loss": 3.7312826538085937,
"step": 151900
},
{
"epoch": 1.5773034337480674,
"grad_norm": 3.227189064025879,
"learning_rate": 0.00014227069432482073,
"loss": 3.54576416015625,
"step": 152000
},
{
"epoch": 1.5783411333755333,
"grad_norm": 3.922788619995117,
"learning_rate": 0.00014216692436207413,
"loss": 3.762126770019531,
"step": 152100
},
{
"epoch": 1.579378833002999,
"grad_norm": 11.172755241394043,
"learning_rate": 0.00014206315439932755,
"loss": 3.6238665771484375,
"step": 152200
},
{
"epoch": 1.5804165326304647,
"grad_norm": 4.898155212402344,
"learning_rate": 0.00014195938443658097,
"loss": 3.5397454833984376,
"step": 152300
},
{
"epoch": 1.5814542322579306,
"grad_norm": 4.228941440582275,
"learning_rate": 0.0001418556144738344,
"loss": 3.482630615234375,
"step": 152400
},
{
"epoch": 1.5824919318853965,
"grad_norm": 3.2711164951324463,
"learning_rate": 0.00014175184451108782,
"loss": 3.55691162109375,
"step": 152500
},
{
"epoch": 1.5835296315128624,
"grad_norm": 4.924630641937256,
"learning_rate": 0.00014164807454834124,
"loss": 3.6941983032226564,
"step": 152600
},
{
"epoch": 1.584567331140328,
"grad_norm": 4.247806072235107,
"learning_rate": 0.00014154430458559463,
"loss": 3.704905700683594,
"step": 152700
},
{
"epoch": 1.5856050307677938,
"grad_norm": 5.901268482208252,
"learning_rate": 0.00014144053462284805,
"loss": 3.4900387573242186,
"step": 152800
},
{
"epoch": 1.5866427303952597,
"grad_norm": 2.9829347133636475,
"learning_rate": 0.00014133676466010148,
"loss": 3.560227966308594,
"step": 152900
},
{
"epoch": 1.5876804300227256,
"grad_norm": 3.3158979415893555,
"learning_rate": 0.00014123299469735487,
"loss": 3.6083251953125,
"step": 153000
},
{
"epoch": 1.5887181296501915,
"grad_norm": 3.4291346073150635,
"learning_rate": 0.00014112922473460832,
"loss": 3.634643859863281,
"step": 153100
},
{
"epoch": 1.5897558292776572,
"grad_norm": 6.855015754699707,
"learning_rate": 0.00014102545477186174,
"loss": 3.5994863891601563,
"step": 153200
},
{
"epoch": 1.5907935289051232,
"grad_norm": 5.0481133460998535,
"learning_rate": 0.00014092168480911514,
"loss": 3.7016021728515627,
"step": 153300
},
{
"epoch": 1.5918312285325888,
"grad_norm": 7.888632297515869,
"learning_rate": 0.00014081791484636856,
"loss": 3.531593017578125,
"step": 153400
},
{
"epoch": 1.5928689281600548,
"grad_norm": 3.533106565475464,
"learning_rate": 0.00014071414488362198,
"loss": 3.671497497558594,
"step": 153500
},
{
"epoch": 1.5939066277875207,
"grad_norm": 3.2950990200042725,
"learning_rate": 0.00014061037492087538,
"loss": 3.6725836181640625,
"step": 153600
},
{
"epoch": 1.5949443274149866,
"grad_norm": 5.21208381652832,
"learning_rate": 0.00014050660495812882,
"loss": 3.6607846069335936,
"step": 153700
},
{
"epoch": 1.5959820270424523,
"grad_norm": 2.718191385269165,
"learning_rate": 0.00014040283499538222,
"loss": 3.607443542480469,
"step": 153800
},
{
"epoch": 1.597019726669918,
"grad_norm": 3.6571433544158936,
"learning_rate": 0.00014029906503263564,
"loss": 3.675062255859375,
"step": 153900
},
{
"epoch": 1.598057426297384,
"grad_norm": 2.440661907196045,
"learning_rate": 0.00014019529506988906,
"loss": 3.4682305908203124,
"step": 154000
},
{
"epoch": 1.5990951259248498,
"grad_norm": 4.171643257141113,
"learning_rate": 0.00014009152510714246,
"loss": 3.682950134277344,
"step": 154100
},
{
"epoch": 1.6001328255523157,
"grad_norm": 7.624752998352051,
"learning_rate": 0.00013998775514439588,
"loss": 3.67526611328125,
"step": 154200
},
{
"epoch": 1.6011705251797814,
"grad_norm": 7.279924392700195,
"learning_rate": 0.0001398839851816493,
"loss": 3.6037884521484376,
"step": 154300
},
{
"epoch": 1.6022082248072473,
"grad_norm": 3.2470226287841797,
"learning_rate": 0.00013978021521890272,
"loss": 3.658772277832031,
"step": 154400
},
{
"epoch": 1.603245924434713,
"grad_norm": 5.602239608764648,
"learning_rate": 0.00013967644525615615,
"loss": 3.5984457397460936,
"step": 154500
},
{
"epoch": 1.604283624062179,
"grad_norm": 3.6453311443328857,
"learning_rate": 0.00013957267529340957,
"loss": 3.388334045410156,
"step": 154600
},
{
"epoch": 1.6053213236896449,
"grad_norm": 6.957507610321045,
"learning_rate": 0.00013946890533066296,
"loss": 3.617900695800781,
"step": 154700
},
{
"epoch": 1.6063590233171108,
"grad_norm": 15.978106498718262,
"learning_rate": 0.00013936513536791638,
"loss": 3.514501647949219,
"step": 154800
},
{
"epoch": 1.6073967229445765,
"grad_norm": 4.719081401824951,
"learning_rate": 0.0001392613654051698,
"loss": 3.6095266723632813,
"step": 154900
},
{
"epoch": 1.6084344225720422,
"grad_norm": 3.6483592987060547,
"learning_rate": 0.00013915759544242323,
"loss": 3.6144635009765627,
"step": 155000
},
{
"epoch": 1.609472122199508,
"grad_norm": 3.3481674194335938,
"learning_rate": 0.00013905382547967665,
"loss": 3.5398931884765625,
"step": 155100
},
{
"epoch": 1.610509821826974,
"grad_norm": 6.413243293762207,
"learning_rate": 0.00013895005551693007,
"loss": 3.6416336059570313,
"step": 155200
},
{
"epoch": 1.61154752145444,
"grad_norm": 7.17488431930542,
"learning_rate": 0.00013884628555418347,
"loss": 3.717559814453125,
"step": 155300
},
{
"epoch": 1.6125852210819056,
"grad_norm": 6.735267162322998,
"learning_rate": 0.0001387425155914369,
"loss": 3.6701150512695313,
"step": 155400
},
{
"epoch": 1.6136229207093713,
"grad_norm": 3.489192008972168,
"learning_rate": 0.0001386387456286903,
"loss": 3.607757263183594,
"step": 155500
},
{
"epoch": 1.6146606203368372,
"grad_norm": 4.3538360595703125,
"learning_rate": 0.00013853497566594373,
"loss": 3.6339166259765623,
"step": 155600
},
{
"epoch": 1.6156983199643031,
"grad_norm": 17.20830535888672,
"learning_rate": 0.00013843120570319715,
"loss": 3.42401611328125,
"step": 155700
},
{
"epoch": 1.616736019591769,
"grad_norm": 2.5314135551452637,
"learning_rate": 0.00013832743574045055,
"loss": 3.527308349609375,
"step": 155800
},
{
"epoch": 1.617773719219235,
"grad_norm": 4.076705455780029,
"learning_rate": 0.00013822366577770397,
"loss": 3.5527752685546874,
"step": 155900
},
{
"epoch": 1.6188114188467007,
"grad_norm": 3.8894543647766113,
"learning_rate": 0.0001381198958149574,
"loss": 3.68035400390625,
"step": 156000
},
{
"epoch": 1.6198491184741663,
"grad_norm": 17.054737091064453,
"learning_rate": 0.0001380161258522108,
"loss": 3.4780517578125,
"step": 156100
},
{
"epoch": 1.6208868181016323,
"grad_norm": 20.06046485900879,
"learning_rate": 0.0001379123558894642,
"loss": 3.5436282348632813,
"step": 156200
},
{
"epoch": 1.6219245177290982,
"grad_norm": 3.36186146736145,
"learning_rate": 0.00013780858592671766,
"loss": 3.6915762329101565,
"step": 156300
},
{
"epoch": 1.622962217356564,
"grad_norm": 3.333552360534668,
"learning_rate": 0.00013770481596397105,
"loss": 3.551458740234375,
"step": 156400
},
{
"epoch": 1.6239999169840298,
"grad_norm": 16.679468154907227,
"learning_rate": 0.00013760104600122448,
"loss": 3.5686306762695312,
"step": 156500
},
{
"epoch": 1.6250376166114955,
"grad_norm": 3.8986880779266357,
"learning_rate": 0.0001374972760384779,
"loss": 3.6233151245117186,
"step": 156600
},
{
"epoch": 1.6260753162389614,
"grad_norm": 5.065491199493408,
"learning_rate": 0.0001373935060757313,
"loss": 3.6737161254882813,
"step": 156700
},
{
"epoch": 1.6271130158664273,
"grad_norm": 16.096450805664062,
"learning_rate": 0.00013728973611298471,
"loss": 3.6823269653320314,
"step": 156800
},
{
"epoch": 1.6281507154938932,
"grad_norm": 3.939023733139038,
"learning_rate": 0.00013718596615023814,
"loss": 3.655545349121094,
"step": 156900
},
{
"epoch": 1.629188415121359,
"grad_norm": 5.221971035003662,
"learning_rate": 0.00013708219618749156,
"loss": 3.5299761962890623,
"step": 157000
},
{
"epoch": 1.6302261147488248,
"grad_norm": 4.515364646911621,
"learning_rate": 0.00013697842622474498,
"loss": 3.5957623291015626,
"step": 157100
},
{
"epoch": 1.6312638143762905,
"grad_norm": 2.1334664821624756,
"learning_rate": 0.00013687465626199838,
"loss": 3.5724642944335936,
"step": 157200
},
{
"epoch": 1.6323015140037564,
"grad_norm": 3.8212311267852783,
"learning_rate": 0.0001367708862992518,
"loss": 3.5870269775390624,
"step": 157300
},
{
"epoch": 1.6333392136312224,
"grad_norm": 7.132654666900635,
"learning_rate": 0.00013666711633650522,
"loss": 3.5734619140625,
"step": 157400
},
{
"epoch": 1.6343769132586883,
"grad_norm": 4.568203926086426,
"learning_rate": 0.00013656334637375864,
"loss": 3.6052120971679686,
"step": 157500
},
{
"epoch": 1.635414612886154,
"grad_norm": 6.630765438079834,
"learning_rate": 0.00013645957641101206,
"loss": 3.7074453735351565,
"step": 157600
},
{
"epoch": 1.6364523125136197,
"grad_norm": 9.513466835021973,
"learning_rate": 0.00013635580644826549,
"loss": 3.4421658325195312,
"step": 157700
},
{
"epoch": 1.6374900121410856,
"grad_norm": 3.5600993633270264,
"learning_rate": 0.00013625203648551888,
"loss": 3.472029724121094,
"step": 157800
},
{
"epoch": 1.6385277117685515,
"grad_norm": 3.796132802963257,
"learning_rate": 0.0001361482665227723,
"loss": 3.700109558105469,
"step": 157900
},
{
"epoch": 1.6395654113960174,
"grad_norm": 5.419138431549072,
"learning_rate": 0.00013604449656002572,
"loss": 3.525767517089844,
"step": 158000
},
{
"epoch": 1.640603111023483,
"grad_norm": 7.728092193603516,
"learning_rate": 0.00013594072659727912,
"loss": 3.4612411499023437,
"step": 158100
},
{
"epoch": 1.641640810650949,
"grad_norm": 5.094764232635498,
"learning_rate": 0.00013583695663453257,
"loss": 3.5728485107421877,
"step": 158200
},
{
"epoch": 1.6426785102784147,
"grad_norm": 7.930044174194336,
"learning_rate": 0.000135733186671786,
"loss": 3.547598571777344,
"step": 158300
},
{
"epoch": 1.6437162099058806,
"grad_norm": 3.853911876678467,
"learning_rate": 0.00013562941670903938,
"loss": 3.5331781005859373,
"step": 158400
},
{
"epoch": 1.6447539095333465,
"grad_norm": 14.153372764587402,
"learning_rate": 0.0001355256467462928,
"loss": 3.5483056640625,
"step": 158500
},
{
"epoch": 1.6457916091608125,
"grad_norm": 4.353669166564941,
"learning_rate": 0.00013542187678354623,
"loss": 3.5902810668945313,
"step": 158600
},
{
"epoch": 1.6468293087882782,
"grad_norm": 3.16603946685791,
"learning_rate": 0.00013531810682079962,
"loss": 3.5274386596679688,
"step": 158700
},
{
"epoch": 1.6478670084157439,
"grad_norm": 5.928895950317383,
"learning_rate": 0.00013521433685805307,
"loss": 3.662962646484375,
"step": 158800
},
{
"epoch": 1.6489047080432098,
"grad_norm": 4.497453689575195,
"learning_rate": 0.00013511056689530647,
"loss": 3.6771749877929687,
"step": 158900
},
{
"epoch": 1.6499424076706757,
"grad_norm": 6.737712383270264,
"learning_rate": 0.0001350067969325599,
"loss": 3.546751708984375,
"step": 159000
},
{
"epoch": 1.6509801072981416,
"grad_norm": 3.984771490097046,
"learning_rate": 0.0001349030269698133,
"loss": 3.5879977416992186,
"step": 159100
},
{
"epoch": 1.6520178069256073,
"grad_norm": 7.267343521118164,
"learning_rate": 0.0001347992570070667,
"loss": 3.5557431030273436,
"step": 159200
},
{
"epoch": 1.653055506553073,
"grad_norm": 5.349457263946533,
"learning_rate": 0.00013469548704432013,
"loss": 3.6174130249023437,
"step": 159300
},
{
"epoch": 1.654093206180539,
"grad_norm": 3.6522059440612793,
"learning_rate": 0.00013459171708157358,
"loss": 3.609751892089844,
"step": 159400
},
{
"epoch": 1.6551309058080048,
"grad_norm": 5.704461574554443,
"learning_rate": 0.00013448794711882697,
"loss": 3.5679837036132813,
"step": 159500
},
{
"epoch": 1.6561686054354707,
"grad_norm": 5.23817777633667,
"learning_rate": 0.0001343841771560804,
"loss": 3.5738253784179688,
"step": 159600
},
{
"epoch": 1.6572063050629366,
"grad_norm": 12.301040649414062,
"learning_rate": 0.00013428040719333382,
"loss": 3.587038879394531,
"step": 159700
},
{
"epoch": 1.6582440046904023,
"grad_norm": 6.761283874511719,
"learning_rate": 0.0001341766372305872,
"loss": 3.521001281738281,
"step": 159800
},
{
"epoch": 1.659281704317868,
"grad_norm": 5.411608695983887,
"learning_rate": 0.00013407286726784063,
"loss": 3.473619384765625,
"step": 159900
},
{
"epoch": 1.660319403945334,
"grad_norm": 14.189502716064453,
"learning_rate": 0.00013396909730509405,
"loss": 3.5413604736328126,
"step": 160000
},
{
"epoch": 1.6613571035727999,
"grad_norm": 3.0541956424713135,
"learning_rate": 0.00013386532734234748,
"loss": 3.5548626708984377,
"step": 160100
},
{
"epoch": 1.6623948032002658,
"grad_norm": 3.2475764751434326,
"learning_rate": 0.0001337615573796009,
"loss": 3.5887530517578123,
"step": 160200
},
{
"epoch": 1.6634325028277315,
"grad_norm": 4.810506343841553,
"learning_rate": 0.00013365778741685432,
"loss": 3.6068450927734377,
"step": 160300
},
{
"epoch": 1.6644702024551972,
"grad_norm": 11.347721099853516,
"learning_rate": 0.00013355401745410771,
"loss": 3.663785705566406,
"step": 160400
},
{
"epoch": 1.665507902082663,
"grad_norm": 2.9197380542755127,
"learning_rate": 0.00013345024749136114,
"loss": 3.6435916137695314,
"step": 160500
},
{
"epoch": 1.666545601710129,
"grad_norm": 5.3932037353515625,
"learning_rate": 0.00013334647752861456,
"loss": 3.6256121826171874,
"step": 160600
},
{
"epoch": 1.667583301337595,
"grad_norm": 3.6826651096343994,
"learning_rate": 0.00013324270756586798,
"loss": 3.60268798828125,
"step": 160700
},
{
"epoch": 1.6686210009650606,
"grad_norm": 4.883547782897949,
"learning_rate": 0.0001331389376031214,
"loss": 3.508822326660156,
"step": 160800
},
{
"epoch": 1.6696587005925265,
"grad_norm": 3.1789474487304688,
"learning_rate": 0.0001330351676403748,
"loss": 3.5955624389648437,
"step": 160900
},
{
"epoch": 1.6706964002199922,
"grad_norm": 3.8428354263305664,
"learning_rate": 0.00013293139767762822,
"loss": 3.6681442260742188,
"step": 161000
},
{
"epoch": 1.6717340998474581,
"grad_norm": 5.440670490264893,
"learning_rate": 0.00013282762771488164,
"loss": 3.65127197265625,
"step": 161100
},
{
"epoch": 1.672771799474924,
"grad_norm": 4.737522125244141,
"learning_rate": 0.00013272385775213504,
"loss": 3.757344055175781,
"step": 161200
},
{
"epoch": 1.67380949910239,
"grad_norm": 5.953054428100586,
"learning_rate": 0.00013262008778938848,
"loss": 3.690797119140625,
"step": 161300
},
{
"epoch": 1.6748471987298557,
"grad_norm": 8.720730781555176,
"learning_rate": 0.0001325163178266419,
"loss": 3.790602722167969,
"step": 161400
},
{
"epoch": 1.6758848983573214,
"grad_norm": 3.9143240451812744,
"learning_rate": 0.0001324125478638953,
"loss": 3.439073486328125,
"step": 161500
},
{
"epoch": 1.6769225979847873,
"grad_norm": 4.572363376617432,
"learning_rate": 0.00013230877790114872,
"loss": 3.5498342895507813,
"step": 161600
},
{
"epoch": 1.6779602976122532,
"grad_norm": 9.166924476623535,
"learning_rate": 0.00013220500793840215,
"loss": 3.479727478027344,
"step": 161700
},
{
"epoch": 1.678997997239719,
"grad_norm": 2.0057218074798584,
"learning_rate": 0.00013210123797565554,
"loss": 3.72489990234375,
"step": 161800
},
{
"epoch": 1.6800356968671848,
"grad_norm": 4.892455101013184,
"learning_rate": 0.000131997468012909,
"loss": 3.6359210205078125,
"step": 161900
},
{
"epoch": 1.6810733964946507,
"grad_norm": 8.374796867370605,
"learning_rate": 0.00013189369805016238,
"loss": 3.5657424926757812,
"step": 162000
},
{
"epoch": 1.6821110961221164,
"grad_norm": 3.702462911605835,
"learning_rate": 0.0001317899280874158,
"loss": 3.6002679443359376,
"step": 162100
},
{
"epoch": 1.6831487957495823,
"grad_norm": 6.6382856369018555,
"learning_rate": 0.00013168615812466923,
"loss": 3.5055661010742187,
"step": 162200
},
{
"epoch": 1.6841864953770482,
"grad_norm": 4.067321300506592,
"learning_rate": 0.00013158238816192262,
"loss": 3.6370770263671877,
"step": 162300
},
{
"epoch": 1.6852241950045141,
"grad_norm": 6.839338779449463,
"learning_rate": 0.00013147861819917604,
"loss": 3.68888671875,
"step": 162400
},
{
"epoch": 1.6862618946319798,
"grad_norm": 4.304868221282959,
"learning_rate": 0.00013137484823642947,
"loss": 3.5517013549804686,
"step": 162500
},
{
"epoch": 1.6872995942594455,
"grad_norm": 6.149030685424805,
"learning_rate": 0.0001312710782736829,
"loss": 3.535697326660156,
"step": 162600
},
{
"epoch": 1.6883372938869114,
"grad_norm": 3.3684825897216797,
"learning_rate": 0.0001311673083109363,
"loss": 3.4286175537109376,
"step": 162700
},
{
"epoch": 1.6893749935143774,
"grad_norm": 3.4294440746307373,
"learning_rate": 0.00013106353834818973,
"loss": 3.443184509277344,
"step": 162800
},
{
"epoch": 1.6904126931418433,
"grad_norm": 4.177918434143066,
"learning_rate": 0.00013095976838544313,
"loss": 3.6785324096679686,
"step": 162900
},
{
"epoch": 1.691450392769309,
"grad_norm": 3.914222478866577,
"learning_rate": 0.00013085599842269655,
"loss": 3.6343704223632813,
"step": 163000
},
{
"epoch": 1.6924880923967747,
"grad_norm": 10.268918991088867,
"learning_rate": 0.00013075222845994997,
"loss": 3.625147399902344,
"step": 163100
},
{
"epoch": 1.6935257920242406,
"grad_norm": 3.8632876873016357,
"learning_rate": 0.0001306484584972034,
"loss": 3.62834228515625,
"step": 163200
},
{
"epoch": 1.6945634916517065,
"grad_norm": 3.8029658794403076,
"learning_rate": 0.00013054468853445681,
"loss": 3.4555462646484374,
"step": 163300
},
{
"epoch": 1.6956011912791724,
"grad_norm": 3.983098030090332,
"learning_rate": 0.00013044091857171024,
"loss": 3.6773056030273437,
"step": 163400
},
{
"epoch": 1.6966388909066383,
"grad_norm": 3.1625497341156006,
"learning_rate": 0.00013033714860896363,
"loss": 3.525480041503906,
"step": 163500
},
{
"epoch": 1.697676590534104,
"grad_norm": 6.201349258422852,
"learning_rate": 0.00013023337864621705,
"loss": 3.626365051269531,
"step": 163600
},
{
"epoch": 1.6987142901615697,
"grad_norm": 4.032458782196045,
"learning_rate": 0.00013012960868347048,
"loss": 3.5092694091796877,
"step": 163700
},
{
"epoch": 1.6997519897890356,
"grad_norm": 3.9698915481567383,
"learning_rate": 0.0001300258387207239,
"loss": 3.273734436035156,
"step": 163800
},
{
"epoch": 1.7007896894165015,
"grad_norm": 9.877572059631348,
"learning_rate": 0.00012992206875797732,
"loss": 3.576407775878906,
"step": 163900
},
{
"epoch": 1.7018273890439675,
"grad_norm": 14.561692237854004,
"learning_rate": 0.00012981829879523071,
"loss": 3.6638983154296874,
"step": 164000
},
{
"epoch": 1.7028650886714332,
"grad_norm": 2.6718385219573975,
"learning_rate": 0.00012971452883248414,
"loss": 3.671317138671875,
"step": 164100
},
{
"epoch": 1.7039027882988989,
"grad_norm": 3.6662535667419434,
"learning_rate": 0.00012961075886973756,
"loss": 3.648578796386719,
"step": 164200
},
{
"epoch": 1.7049404879263648,
"grad_norm": 4.04230260848999,
"learning_rate": 0.00012950698890699095,
"loss": 3.4332769775390624,
"step": 164300
},
{
"epoch": 1.7059781875538307,
"grad_norm": 9.336248397827148,
"learning_rate": 0.00012940321894424437,
"loss": 3.6213333129882814,
"step": 164400
},
{
"epoch": 1.7070158871812966,
"grad_norm": 5.882486820220947,
"learning_rate": 0.00012929944898149782,
"loss": 3.525044250488281,
"step": 164500
},
{
"epoch": 1.7080535868087623,
"grad_norm": 6.984238624572754,
"learning_rate": 0.00012919567901875122,
"loss": 3.6717626953125,
"step": 164600
},
{
"epoch": 1.7090912864362282,
"grad_norm": 19.616052627563477,
"learning_rate": 0.00012909190905600464,
"loss": 3.5099832153320314,
"step": 164700
},
{
"epoch": 1.710128986063694,
"grad_norm": 8.419858932495117,
"learning_rate": 0.00012898813909325806,
"loss": 3.624603576660156,
"step": 164800
},
{
"epoch": 1.7111666856911598,
"grad_norm": 3.145763397216797,
"learning_rate": 0.00012888436913051146,
"loss": 3.5627670288085938,
"step": 164900
},
{
"epoch": 1.7122043853186257,
"grad_norm": 2.620919704437256,
"learning_rate": 0.00012878059916776488,
"loss": 3.556968994140625,
"step": 165000
},
{
"epoch": 1.7132420849460916,
"grad_norm": 3.6687073707580566,
"learning_rate": 0.0001286768292050183,
"loss": 3.590003662109375,
"step": 165100
},
{
"epoch": 1.7142797845735573,
"grad_norm": 3.51960825920105,
"learning_rate": 0.00012857305924227172,
"loss": 3.443156433105469,
"step": 165200
},
{
"epoch": 1.715317484201023,
"grad_norm": 7.178112030029297,
"learning_rate": 0.00012846928927952514,
"loss": 3.5516900634765625,
"step": 165300
},
{
"epoch": 1.716355183828489,
"grad_norm": 3.60011887550354,
"learning_rate": 0.00012836551931677857,
"loss": 3.5771609497070314,
"step": 165400
},
{
"epoch": 1.7173928834559549,
"grad_norm": 5.902312278747559,
"learning_rate": 0.00012826174935403196,
"loss": 3.590467529296875,
"step": 165500
},
{
"epoch": 1.7184305830834208,
"grad_norm": 2.6880180835723877,
"learning_rate": 0.00012815797939128538,
"loss": 3.6772579956054687,
"step": 165600
},
{
"epoch": 1.7194682827108865,
"grad_norm": 4.136773109436035,
"learning_rate": 0.0001280542094285388,
"loss": 3.7336956787109377,
"step": 165700
},
{
"epoch": 1.7205059823383524,
"grad_norm": 5.155696392059326,
"learning_rate": 0.00012795043946579223,
"loss": 3.4659573364257814,
"step": 165800
},
{
"epoch": 1.721543681965818,
"grad_norm": 5.531459331512451,
"learning_rate": 0.00012784666950304565,
"loss": 3.4835992431640626,
"step": 165900
},
{
"epoch": 1.722581381593284,
"grad_norm": 6.343237400054932,
"learning_rate": 0.00012774289954029904,
"loss": 3.5382821655273435,
"step": 166000
},
{
"epoch": 1.72361908122075,
"grad_norm": 2.731682538986206,
"learning_rate": 0.00012763912957755247,
"loss": 3.426122131347656,
"step": 166100
},
{
"epoch": 1.7246567808482158,
"grad_norm": 5.487903594970703,
"learning_rate": 0.0001275353596148059,
"loss": 3.5763626098632812,
"step": 166200
},
{
"epoch": 1.7256944804756815,
"grad_norm": 6.798583984375,
"learning_rate": 0.00012743158965205928,
"loss": 3.439584045410156,
"step": 166300
},
{
"epoch": 1.7267321801031472,
"grad_norm": 18.596773147583008,
"learning_rate": 0.00012732781968931273,
"loss": 3.4846591186523437,
"step": 166400
},
{
"epoch": 1.7277698797306131,
"grad_norm": 9.826458930969238,
"learning_rate": 0.00012722404972656615,
"loss": 3.410422668457031,
"step": 166500
},
{
"epoch": 1.728807579358079,
"grad_norm": 5.076817035675049,
"learning_rate": 0.00012712027976381955,
"loss": 3.5888720703125,
"step": 166600
},
{
"epoch": 1.729845278985545,
"grad_norm": 2.289203405380249,
"learning_rate": 0.00012701650980107297,
"loss": 3.6262445068359375,
"step": 166700
},
{
"epoch": 1.7308829786130107,
"grad_norm": 2.4246132373809814,
"learning_rate": 0.0001269127398383264,
"loss": 3.5331646728515627,
"step": 166800
},
{
"epoch": 1.7319206782404764,
"grad_norm": 20.16929054260254,
"learning_rate": 0.0001268089698755798,
"loss": 3.3934396362304686,
"step": 166900
},
{
"epoch": 1.7329583778679423,
"grad_norm": 4.409317970275879,
"learning_rate": 0.00012670519991283324,
"loss": 3.46904052734375,
"step": 167000
},
{
"epoch": 1.7339960774954082,
"grad_norm": 3.533935308456421,
"learning_rate": 0.00012660142995008663,
"loss": 3.6115313720703126,
"step": 167100
},
{
"epoch": 1.735033777122874,
"grad_norm": 3.760765790939331,
"learning_rate": 0.00012649765998734005,
"loss": 3.7661947631835937,
"step": 167200
},
{
"epoch": 1.7360714767503398,
"grad_norm": 3.174926996231079,
"learning_rate": 0.00012639389002459348,
"loss": 3.4038616943359377,
"step": 167300
},
{
"epoch": 1.7371091763778057,
"grad_norm": 4.701259136199951,
"learning_rate": 0.00012629012006184687,
"loss": 3.575841064453125,
"step": 167400
},
{
"epoch": 1.7381468760052714,
"grad_norm": 4.684348106384277,
"learning_rate": 0.0001261863500991003,
"loss": 3.650244140625,
"step": 167500
},
{
"epoch": 1.7391845756327373,
"grad_norm": 5.04356575012207,
"learning_rate": 0.00012608258013635374,
"loss": 3.512914733886719,
"step": 167600
},
{
"epoch": 1.7402222752602032,
"grad_norm": 4.33563232421875,
"learning_rate": 0.00012597881017360714,
"loss": 3.462794189453125,
"step": 167700
},
{
"epoch": 1.7412599748876691,
"grad_norm": 3.108952522277832,
"learning_rate": 0.00012587504021086056,
"loss": 3.6481967163085938,
"step": 167800
},
{
"epoch": 1.7422976745151348,
"grad_norm": 7.204711437225342,
"learning_rate": 0.00012577127024811398,
"loss": 3.3575787353515625,
"step": 167900
},
{
"epoch": 1.7433353741426005,
"grad_norm": 9.035337448120117,
"learning_rate": 0.00012566750028536737,
"loss": 3.5675091552734375,
"step": 168000
},
{
"epoch": 1.7443730737700665,
"grad_norm": 5.063663005828857,
"learning_rate": 0.0001255637303226208,
"loss": 3.48505615234375,
"step": 168100
},
{
"epoch": 1.7454107733975324,
"grad_norm": 3.2425074577331543,
"learning_rate": 0.00012545996035987422,
"loss": 3.6897207641601564,
"step": 168200
},
{
"epoch": 1.7464484730249983,
"grad_norm": 5.356579303741455,
"learning_rate": 0.00012535619039712764,
"loss": 3.5673727416992187,
"step": 168300
},
{
"epoch": 1.747486172652464,
"grad_norm": 4.124982833862305,
"learning_rate": 0.00012525242043438106,
"loss": 3.512673034667969,
"step": 168400
},
{
"epoch": 1.74852387227993,
"grad_norm": 4.768991470336914,
"learning_rate": 0.00012514865047163448,
"loss": 3.5959738159179686,
"step": 168500
},
{
"epoch": 1.7495615719073956,
"grad_norm": 9.657281875610352,
"learning_rate": 0.00012504488050888788,
"loss": 3.528682861328125,
"step": 168600
},
{
"epoch": 1.7505992715348615,
"grad_norm": 2.538902759552002,
"learning_rate": 0.0001249411105461413,
"loss": 3.4649612426757814,
"step": 168700
},
{
"epoch": 1.7516369711623274,
"grad_norm": 4.286279201507568,
"learning_rate": 0.00012483734058339472,
"loss": 3.5286309814453123,
"step": 168800
},
{
"epoch": 1.7526746707897933,
"grad_norm": 15.081319808959961,
"learning_rate": 0.00012473357062064814,
"loss": 3.492412414550781,
"step": 168900
},
{
"epoch": 1.753712370417259,
"grad_norm": 2.91190767288208,
"learning_rate": 0.00012462980065790157,
"loss": 3.4919317626953124,
"step": 169000
},
{
"epoch": 1.7547500700447247,
"grad_norm": 3.788306713104248,
"learning_rate": 0.00012452603069515496,
"loss": 3.587347412109375,
"step": 169100
},
{
"epoch": 1.7557877696721906,
"grad_norm": 4.830081462860107,
"learning_rate": 0.00012442226073240838,
"loss": 3.6080587768554686,
"step": 169200
},
{
"epoch": 1.7568254692996566,
"grad_norm": 4.777892112731934,
"learning_rate": 0.0001243184907696618,
"loss": 3.653542175292969,
"step": 169300
},
{
"epoch": 1.7578631689271225,
"grad_norm": 8.966485977172852,
"learning_rate": 0.0001242147208069152,
"loss": 3.55691650390625,
"step": 169400
},
{
"epoch": 1.7589008685545882,
"grad_norm": 1.9701244831085205,
"learning_rate": 0.00012411095084416865,
"loss": 3.587906799316406,
"step": 169500
},
{
"epoch": 1.759938568182054,
"grad_norm": 12.719783782958984,
"learning_rate": 0.00012400718088142207,
"loss": 3.413060302734375,
"step": 169600
},
{
"epoch": 1.7609762678095198,
"grad_norm": 3.8632144927978516,
"learning_rate": 0.00012390341091867547,
"loss": 3.6044146728515627,
"step": 169700
},
{
"epoch": 1.7620139674369857,
"grad_norm": 5.806576251983643,
"learning_rate": 0.0001237996409559289,
"loss": 3.59072509765625,
"step": 169800
},
{
"epoch": 1.7630516670644516,
"grad_norm": 7.052939414978027,
"learning_rate": 0.0001236958709931823,
"loss": 3.4161257934570313,
"step": 169900
},
{
"epoch": 1.7640893666919175,
"grad_norm": 4.090539455413818,
"learning_rate": 0.0001235921010304357,
"loss": 3.4862603759765625,
"step": 170000
},
{
"epoch": 1.7651270663193832,
"grad_norm": 8.032806396484375,
"learning_rate": 0.00012348833106768913,
"loss": 3.5226229858398437,
"step": 170100
},
{
"epoch": 1.766164765946849,
"grad_norm": 7.900229454040527,
"learning_rate": 0.00012338456110494255,
"loss": 3.428408203125,
"step": 170200
},
{
"epoch": 1.7672024655743148,
"grad_norm": 3.3465304374694824,
"learning_rate": 0.00012328079114219597,
"loss": 3.4806304931640626,
"step": 170300
},
{
"epoch": 1.7682401652017807,
"grad_norm": 2.737323522567749,
"learning_rate": 0.0001231770211794494,
"loss": 3.5239492797851564,
"step": 170400
},
{
"epoch": 1.7692778648292466,
"grad_norm": 5.74827766418457,
"learning_rate": 0.00012307325121670281,
"loss": 3.5097976684570313,
"step": 170500
},
{
"epoch": 1.7703155644567123,
"grad_norm": 6.033031463623047,
"learning_rate": 0.0001229694812539562,
"loss": 3.4570046997070314,
"step": 170600
},
{
"epoch": 1.771353264084178,
"grad_norm": 8.032061576843262,
"learning_rate": 0.00012286571129120963,
"loss": 3.560968017578125,
"step": 170700
},
{
"epoch": 1.772390963711644,
"grad_norm": 4.955009460449219,
"learning_rate": 0.00012276194132846305,
"loss": 3.54818115234375,
"step": 170800
},
{
"epoch": 1.7734286633391099,
"grad_norm": 10.685212135314941,
"learning_rate": 0.00012265817136571647,
"loss": 3.5968731689453124,
"step": 170900
},
{
"epoch": 1.7744663629665758,
"grad_norm": 6.002890110015869,
"learning_rate": 0.0001225544014029699,
"loss": 3.6380169677734373,
"step": 171000
},
{
"epoch": 1.7755040625940415,
"grad_norm": 2.442901849746704,
"learning_rate": 0.0001224506314402233,
"loss": 3.546981201171875,
"step": 171100
},
{
"epoch": 1.7765417622215074,
"grad_norm": 7.106812000274658,
"learning_rate": 0.0001223468614774767,
"loss": 3.4353497314453123,
"step": 171200
},
{
"epoch": 1.777579461848973,
"grad_norm": 4.951285362243652,
"learning_rate": 0.00012224309151473014,
"loss": 3.5619387817382813,
"step": 171300
},
{
"epoch": 1.778617161476439,
"grad_norm": 4.533148765563965,
"learning_rate": 0.00012213932155198356,
"loss": 3.4085040283203125,
"step": 171400
},
{
"epoch": 1.779654861103905,
"grad_norm": 3.1281020641326904,
"learning_rate": 0.00012203555158923698,
"loss": 3.5755316162109376,
"step": 171500
},
{
"epoch": 1.7806925607313708,
"grad_norm": 3.2438437938690186,
"learning_rate": 0.00012193178162649039,
"loss": 3.419034118652344,
"step": 171600
},
{
"epoch": 1.7817302603588365,
"grad_norm": 6.113760948181152,
"learning_rate": 0.0001218280116637438,
"loss": 3.4608731079101562,
"step": 171700
},
{
"epoch": 1.7827679599863022,
"grad_norm": 3.805856227874756,
"learning_rate": 0.00012172424170099722,
"loss": 3.542497253417969,
"step": 171800
},
{
"epoch": 1.7838056596137681,
"grad_norm": 11.923066139221191,
"learning_rate": 0.00012162047173825063,
"loss": 3.5580120849609376,
"step": 171900
},
{
"epoch": 1.784843359241234,
"grad_norm": 7.653703212738037,
"learning_rate": 0.00012151670177550405,
"loss": 3.464120178222656,
"step": 172000
},
{
"epoch": 1.7858810588687,
"grad_norm": 4.955140113830566,
"learning_rate": 0.00012141293181275747,
"loss": 3.5985858154296877,
"step": 172100
},
{
"epoch": 1.7869187584961657,
"grad_norm": 2.7006173133850098,
"learning_rate": 0.00012130916185001089,
"loss": 3.608409423828125,
"step": 172200
},
{
"epoch": 1.7879564581236316,
"grad_norm": 10.799352645874023,
"learning_rate": 0.0001212053918872643,
"loss": 3.5314166259765627,
"step": 172300
},
{
"epoch": 1.7889941577510973,
"grad_norm": 2.7497682571411133,
"learning_rate": 0.00012110162192451772,
"loss": 3.5095343017578124,
"step": 172400
},
{
"epoch": 1.7900318573785632,
"grad_norm": 3.47670316696167,
"learning_rate": 0.00012099785196177113,
"loss": 3.508272705078125,
"step": 172500
},
{
"epoch": 1.791069557006029,
"grad_norm": 5.199550151824951,
"learning_rate": 0.00012089408199902454,
"loss": 3.503916015625,
"step": 172600
},
{
"epoch": 1.792107256633495,
"grad_norm": 5.3487043380737305,
"learning_rate": 0.00012079031203627797,
"loss": 3.63627685546875,
"step": 172700
},
{
"epoch": 1.7931449562609607,
"grad_norm": 4.6182074546813965,
"learning_rate": 0.00012068654207353138,
"loss": 3.517708740234375,
"step": 172800
},
{
"epoch": 1.7941826558884264,
"grad_norm": 2.607217788696289,
"learning_rate": 0.0001205827721107848,
"loss": 3.555519714355469,
"step": 172900
},
{
"epoch": 1.7952203555158923,
"grad_norm": 9.180208206176758,
"learning_rate": 0.00012047900214803821,
"loss": 3.5748587036132813,
"step": 173000
},
{
"epoch": 1.7962580551433582,
"grad_norm": 5.080584526062012,
"learning_rate": 0.00012037523218529164,
"loss": 3.5299716186523438,
"step": 173100
},
{
"epoch": 1.7972957547708241,
"grad_norm": 2.5319409370422363,
"learning_rate": 0.00012027146222254504,
"loss": 3.5544561767578124,
"step": 173200
},
{
"epoch": 1.7983334543982898,
"grad_norm": 4.81158447265625,
"learning_rate": 0.00012016769225979848,
"loss": 3.6039208984375,
"step": 173300
},
{
"epoch": 1.7993711540257558,
"grad_norm": Infinity,
"learning_rate": 0.00012006392229705189,
"loss": 3.439290771484375,
"step": 173400
},
{
"epoch": 1.8004088536532215,
"grad_norm": 2.6214425563812256,
"learning_rate": 0.00011996015233430531,
"loss": 3.670186767578125,
"step": 173500
},
{
"epoch": 1.8014465532806874,
"grad_norm": 2.7172493934631348,
"learning_rate": 0.00011985638237155872,
"loss": 3.5838592529296873,
"step": 173600
},
{
"epoch": 1.8024842529081533,
"grad_norm": 8.898774147033691,
"learning_rate": 0.00011975261240881213,
"loss": 3.4800985717773436,
"step": 173700
},
{
"epoch": 1.8035219525356192,
"grad_norm": 3.5623104572296143,
"learning_rate": 0.00011964884244606555,
"loss": 3.511365966796875,
"step": 173800
},
{
"epoch": 1.804559652163085,
"grad_norm": 8.46833610534668,
"learning_rate": 0.00011954507248331896,
"loss": 3.7088421630859374,
"step": 173900
},
{
"epoch": 1.8055973517905506,
"grad_norm": 5.097702980041504,
"learning_rate": 0.00011944130252057239,
"loss": 3.6202734375,
"step": 174000
},
{
"epoch": 1.8066350514180165,
"grad_norm": 2.758472204208374,
"learning_rate": 0.0001193375325578258,
"loss": 3.561451721191406,
"step": 174100
},
{
"epoch": 1.8076727510454824,
"grad_norm": 10.48659610748291,
"learning_rate": 0.00011923376259507922,
"loss": 3.5661395263671873,
"step": 174200
},
{
"epoch": 1.8087104506729483,
"grad_norm": 4.996297836303711,
"learning_rate": 0.00011912999263233263,
"loss": 3.680464782714844,
"step": 174300
},
{
"epoch": 1.809748150300414,
"grad_norm": 3.927097797393799,
"learning_rate": 0.00011902622266958605,
"loss": 3.4924087524414062,
"step": 174400
},
{
"epoch": 1.8107858499278797,
"grad_norm": 9.367024421691895,
"learning_rate": 0.00011892245270683946,
"loss": 3.4610064697265623,
"step": 174500
},
{
"epoch": 1.8118235495553456,
"grad_norm": 2.7783424854278564,
"learning_rate": 0.0001188186827440929,
"loss": 3.411673583984375,
"step": 174600
},
{
"epoch": 1.8128612491828116,
"grad_norm": 8.61545181274414,
"learning_rate": 0.0001187149127813463,
"loss": 3.5328875732421876,
"step": 174700
},
{
"epoch": 1.8138989488102775,
"grad_norm": 7.4906182289123535,
"learning_rate": 0.00011861114281859971,
"loss": 3.376343078613281,
"step": 174800
},
{
"epoch": 1.8149366484377432,
"grad_norm": 1.9939513206481934,
"learning_rate": 0.00011850737285585314,
"loss": 3.428880615234375,
"step": 174900
},
{
"epoch": 1.815974348065209,
"grad_norm": 6.011395454406738,
"learning_rate": 0.00011840360289310654,
"loss": 3.5798504638671873,
"step": 175000
},
{
"epoch": 1.8170120476926748,
"grad_norm": 2.0973944664001465,
"learning_rate": 0.00011829983293035997,
"loss": 3.5833367919921875,
"step": 175100
},
{
"epoch": 1.8180497473201407,
"grad_norm": 4.992910861968994,
"learning_rate": 0.00011819606296761339,
"loss": 3.6261285400390624,
"step": 175200
},
{
"epoch": 1.8190874469476066,
"grad_norm": 89.73089599609375,
"learning_rate": 0.00011809229300486681,
"loss": 3.390103454589844,
"step": 175300
},
{
"epoch": 1.8201251465750725,
"grad_norm": 4.343557834625244,
"learning_rate": 0.00011798852304212022,
"loss": 3.6147576904296876,
"step": 175400
},
{
"epoch": 1.8211628462025382,
"grad_norm": Infinity,
"learning_rate": 0.00011788475307937364,
"loss": 3.5382080078125,
"step": 175500
},
{
"epoch": 1.822200545830004,
"grad_norm": 8.41909408569336,
"learning_rate": 0.00011778098311662705,
"loss": 3.5090155029296874,
"step": 175600
},
{
"epoch": 1.8232382454574698,
"grad_norm": 7.508602619171143,
"learning_rate": 0.00011767721315388046,
"loss": 3.540501708984375,
"step": 175700
},
{
"epoch": 1.8242759450849357,
"grad_norm": 2.713555335998535,
"learning_rate": 0.00011757344319113388,
"loss": 3.5669735717773436,
"step": 175800
},
{
"epoch": 1.8253136447124017,
"grad_norm": 9.780903816223145,
"learning_rate": 0.0001174696732283873,
"loss": 3.542893981933594,
"step": 175900
},
{
"epoch": 1.8263513443398673,
"grad_norm": 2.6435556411743164,
"learning_rate": 0.00011736590326564072,
"loss": 3.6134707641601564,
"step": 176000
},
{
"epoch": 1.8273890439673333,
"grad_norm": 3.3884384632110596,
"learning_rate": 0.00011726213330289413,
"loss": 3.574878845214844,
"step": 176100
},
{
"epoch": 1.828426743594799,
"grad_norm": 4.323862552642822,
"learning_rate": 0.00011715836334014755,
"loss": 3.5432839965820313,
"step": 176200
},
{
"epoch": 1.8294644432222649,
"grad_norm": 6.794419765472412,
"learning_rate": 0.00011705459337740096,
"loss": 3.4334552001953127,
"step": 176300
},
{
"epoch": 1.8305021428497308,
"grad_norm": 3.3329992294311523,
"learning_rate": 0.00011695082341465438,
"loss": 3.511024169921875,
"step": 176400
},
{
"epoch": 1.8315398424771967,
"grad_norm": 6.582189083099365,
"learning_rate": 0.0001168470534519078,
"loss": 3.4057382202148436,
"step": 176500
},
{
"epoch": 1.8325775421046624,
"grad_norm": 3.5420665740966797,
"learning_rate": 0.00011674328348916123,
"loss": 3.530198974609375,
"step": 176600
},
{
"epoch": 1.833615241732128,
"grad_norm": 3.2835450172424316,
"learning_rate": 0.00011663951352641463,
"loss": 3.3689605712890627,
"step": 176700
},
{
"epoch": 1.834652941359594,
"grad_norm": 4.352384567260742,
"learning_rate": 0.00011653574356366804,
"loss": 3.5228622436523436,
"step": 176800
},
{
"epoch": 1.83569064098706,
"grad_norm": 6.940867900848389,
"learning_rate": 0.00011643197360092147,
"loss": 3.422699279785156,
"step": 176900
},
{
"epoch": 1.8367283406145258,
"grad_norm": 9.627628326416016,
"learning_rate": 0.00011632820363817487,
"loss": 3.4203256225585936,
"step": 177000
},
{
"epoch": 1.8377660402419915,
"grad_norm": 7.819676399230957,
"learning_rate": 0.00011622443367542831,
"loss": 3.569815673828125,
"step": 177100
},
{
"epoch": 1.8388037398694572,
"grad_norm": 3.4782094955444336,
"learning_rate": 0.00011612066371268172,
"loss": 3.5252569580078124,
"step": 177200
},
{
"epoch": 1.8398414394969231,
"grad_norm": 9.448952674865723,
"learning_rate": 0.00011601689374993514,
"loss": 3.43080322265625,
"step": 177300
},
{
"epoch": 1.840879139124389,
"grad_norm": 5.754225730895996,
"learning_rate": 0.00011591312378718855,
"loss": 3.45312744140625,
"step": 177400
},
{
"epoch": 1.841916838751855,
"grad_norm": 2.9918229579925537,
"learning_rate": 0.00011580935382444197,
"loss": 3.548991394042969,
"step": 177500
},
{
"epoch": 1.8429545383793209,
"grad_norm": 4.406205177307129,
"learning_rate": 0.00011570558386169538,
"loss": 3.5221047973632813,
"step": 177600
},
{
"epoch": 1.8439922380067866,
"grad_norm": 3.79978346824646,
"learning_rate": 0.00011560181389894879,
"loss": 3.4272702026367186,
"step": 177700
},
{
"epoch": 1.8450299376342523,
"grad_norm": 8.362844467163086,
"learning_rate": 0.00011549804393620222,
"loss": 3.496314697265625,
"step": 177800
},
{
"epoch": 1.8460676372617182,
"grad_norm": 4.00974702835083,
"learning_rate": 0.00011539427397345563,
"loss": 3.4739456176757812,
"step": 177900
},
{
"epoch": 1.847105336889184,
"grad_norm": 4.4382853507995605,
"learning_rate": 0.00011529050401070905,
"loss": 3.637906799316406,
"step": 178000
},
{
"epoch": 1.84814303651665,
"grad_norm": 3.4561121463775635,
"learning_rate": 0.00011518673404796246,
"loss": 3.4582669067382814,
"step": 178100
},
{
"epoch": 1.8491807361441157,
"grad_norm": 9.542756080627441,
"learning_rate": 0.00011508296408521588,
"loss": 3.5665469360351563,
"step": 178200
},
{
"epoch": 1.8502184357715814,
"grad_norm": 5.516635894775391,
"learning_rate": 0.00011497919412246929,
"loss": 3.5199371337890626,
"step": 178300
},
{
"epoch": 1.8512561353990473,
"grad_norm": 10.64023494720459,
"learning_rate": 0.00011487542415972273,
"loss": 3.605532531738281,
"step": 178400
},
{
"epoch": 1.8522938350265132,
"grad_norm": 3.7197024822235107,
"learning_rate": 0.00011477165419697613,
"loss": 3.5585647583007813,
"step": 178500
},
{
"epoch": 1.8533315346539792,
"grad_norm": 8.84176254272461,
"learning_rate": 0.00011466788423422956,
"loss": 3.469338684082031,
"step": 178600
},
{
"epoch": 1.8543692342814448,
"grad_norm": 13.789299011230469,
"learning_rate": 0.00011456411427148297,
"loss": 3.654618835449219,
"step": 178700
},
{
"epoch": 1.8554069339089108,
"grad_norm": 3.7758259773254395,
"learning_rate": 0.00011446034430873637,
"loss": 3.511930236816406,
"step": 178800
},
{
"epoch": 1.8564446335363765,
"grad_norm": 4.542521953582764,
"learning_rate": 0.0001143565743459898,
"loss": 3.572850341796875,
"step": 178900
},
{
"epoch": 1.8574823331638424,
"grad_norm": 7.155478477478027,
"learning_rate": 0.00011425280438324322,
"loss": 3.6194467163085937,
"step": 179000
},
{
"epoch": 1.8585200327913083,
"grad_norm": 5.109609603881836,
"learning_rate": 0.00011414903442049664,
"loss": 3.585841064453125,
"step": 179100
},
{
"epoch": 1.8595577324187742,
"grad_norm": 4.251883506774902,
"learning_rate": 0.00011404526445775005,
"loss": 3.4581594848632813,
"step": 179200
},
{
"epoch": 1.86059543204624,
"grad_norm": 22.98354148864746,
"learning_rate": 0.00011394149449500347,
"loss": 3.47680419921875,
"step": 179300
},
{
"epoch": 1.8616331316737056,
"grad_norm": 4.897403240203857,
"learning_rate": 0.00011383772453225688,
"loss": 3.5364599609375,
"step": 179400
},
{
"epoch": 1.8626708313011715,
"grad_norm": 11.166070938110352,
"learning_rate": 0.0001137339545695103,
"loss": 3.4703445434570312,
"step": 179500
},
{
"epoch": 1.8637085309286374,
"grad_norm": 3.64528226852417,
"learning_rate": 0.00011363018460676372,
"loss": 3.612529296875,
"step": 179600
},
{
"epoch": 1.8647462305561033,
"grad_norm": 3.4828524589538574,
"learning_rate": 0.00011352641464401714,
"loss": 3.622635803222656,
"step": 179700
},
{
"epoch": 1.865783930183569,
"grad_norm": 4.965012550354004,
"learning_rate": 0.00011342264468127055,
"loss": 3.420509033203125,
"step": 179800
},
{
"epoch": 1.866821629811035,
"grad_norm": 6.657770156860352,
"learning_rate": 0.00011331887471852396,
"loss": 3.57205810546875,
"step": 179900
},
{
"epoch": 1.8678593294385006,
"grad_norm": 6.785094738006592,
"learning_rate": 0.00011321510475577738,
"loss": 3.613439025878906,
"step": 180000
},
{
"epoch": 1.8688970290659666,
"grad_norm": 3.2131218910217285,
"learning_rate": 0.00011311133479303079,
"loss": 3.721015625,
"step": 180100
},
{
"epoch": 1.8699347286934325,
"grad_norm": 3.327937364578247,
"learning_rate": 0.00011300756483028421,
"loss": 3.47718017578125,
"step": 180200
},
{
"epoch": 1.8709724283208984,
"grad_norm": 8.65044116973877,
"learning_rate": 0.00011290379486753763,
"loss": 3.6089404296875,
"step": 180300
},
{
"epoch": 1.872010127948364,
"grad_norm": 2.0018603801727295,
"learning_rate": 0.00011280002490479106,
"loss": 3.3825112915039064,
"step": 180400
},
{
"epoch": 1.8730478275758298,
"grad_norm": 2.7814066410064697,
"learning_rate": 0.00011269625494204446,
"loss": 3.4428082275390626,
"step": 180500
},
{
"epoch": 1.8740855272032957,
"grad_norm": 2.5407564640045166,
"learning_rate": 0.00011259248497929789,
"loss": 3.6512811279296873,
"step": 180600
},
{
"epoch": 1.8751232268307616,
"grad_norm": 3.6118102073669434,
"learning_rate": 0.0001124887150165513,
"loss": 3.4491305541992188,
"step": 180700
},
{
"epoch": 1.8761609264582275,
"grad_norm": 4.681710720062256,
"learning_rate": 0.0001123849450538047,
"loss": 3.5399176025390626,
"step": 180800
},
{
"epoch": 1.8771986260856932,
"grad_norm": 5.6345062255859375,
"learning_rate": 0.00011228117509105814,
"loss": 3.580292053222656,
"step": 180900
},
{
"epoch": 1.878236325713159,
"grad_norm": 4.881344318389893,
"learning_rate": 0.00011217740512831155,
"loss": 3.5553582763671874,
"step": 181000
},
{
"epoch": 1.8792740253406248,
"grad_norm": 3.3916895389556885,
"learning_rate": 0.00011207363516556497,
"loss": 3.468414001464844,
"step": 181100
},
{
"epoch": 1.8803117249680907,
"grad_norm": 4.611287593841553,
"learning_rate": 0.00011196986520281838,
"loss": 3.420959167480469,
"step": 181200
},
{
"epoch": 1.8813494245955567,
"grad_norm": 3.4268012046813965,
"learning_rate": 0.0001118660952400718,
"loss": 3.614518737792969,
"step": 181300
},
{
"epoch": 1.8823871242230226,
"grad_norm": 9.675979614257812,
"learning_rate": 0.00011176232527732521,
"loss": 3.460643310546875,
"step": 181400
},
{
"epoch": 1.8834248238504883,
"grad_norm": 4.765254497528076,
"learning_rate": 0.00011165855531457864,
"loss": 3.5331201171875,
"step": 181500
},
{
"epoch": 1.884462523477954,
"grad_norm": 12.958268165588379,
"learning_rate": 0.00011155478535183205,
"loss": 3.458702392578125,
"step": 181600
},
{
"epoch": 1.8855002231054199,
"grad_norm": 3.9760847091674805,
"learning_rate": 0.00011145101538908547,
"loss": 3.5144024658203126,
"step": 181700
},
{
"epoch": 1.8865379227328858,
"grad_norm": 3.063124656677246,
"learning_rate": 0.00011134724542633888,
"loss": 3.3591217041015624,
"step": 181800
},
{
"epoch": 1.8875756223603517,
"grad_norm": 14.115145683288574,
"learning_rate": 0.00011124347546359229,
"loss": 3.5416494750976564,
"step": 181900
},
{
"epoch": 1.8886133219878174,
"grad_norm": 2.602299213409424,
"learning_rate": 0.00011113970550084571,
"loss": 3.4190499877929685,
"step": 182000
},
{
"epoch": 1.889651021615283,
"grad_norm": 6.7280168533325195,
"learning_rate": 0.00011103593553809912,
"loss": 3.3795068359375,
"step": 182100
},
{
"epoch": 1.890688721242749,
"grad_norm": 6.911862850189209,
"learning_rate": 0.00011093216557535256,
"loss": 3.5166439819335937,
"step": 182200
},
{
"epoch": 1.891726420870215,
"grad_norm": 6.751010894775391,
"learning_rate": 0.00011082839561260596,
"loss": 3.4338143920898436,
"step": 182300
},
{
"epoch": 1.8927641204976808,
"grad_norm": 4.327939510345459,
"learning_rate": 0.00011072462564985939,
"loss": 3.4822421264648438,
"step": 182400
},
{
"epoch": 1.8938018201251465,
"grad_norm": 2.485795259475708,
"learning_rate": 0.0001106208556871128,
"loss": 3.464154052734375,
"step": 182500
},
{
"epoch": 1.8948395197526124,
"grad_norm": 104.476318359375,
"learning_rate": 0.00011051708572436622,
"loss": 3.4480935668945314,
"step": 182600
},
{
"epoch": 1.8958772193800781,
"grad_norm": 2.829188346862793,
"learning_rate": 0.00011041331576161963,
"loss": 3.593952331542969,
"step": 182700
},
{
"epoch": 1.896914919007544,
"grad_norm": 4.845984935760498,
"learning_rate": 0.00011030954579887306,
"loss": 3.244365234375,
"step": 182800
},
{
"epoch": 1.89795261863501,
"grad_norm": 2.055333375930786,
"learning_rate": 0.00011020577583612647,
"loss": 3.5465518188476564,
"step": 182900
},
{
"epoch": 1.8989903182624759,
"grad_norm": 19.445037841796875,
"learning_rate": 0.00011010200587337988,
"loss": 3.5760122680664064,
"step": 183000
},
{
"epoch": 1.9000280178899416,
"grad_norm": 3.0907251834869385,
"learning_rate": 0.0001099982359106333,
"loss": 3.524999084472656,
"step": 183100
},
{
"epoch": 1.9010657175174073,
"grad_norm": 1.9697469472885132,
"learning_rate": 0.00010989446594788671,
"loss": 3.4634637451171875,
"step": 183200
},
{
"epoch": 1.9021034171448732,
"grad_norm": 6.751926898956299,
"learning_rate": 0.00010979069598514013,
"loss": 3.4596926879882814,
"step": 183300
},
{
"epoch": 1.903141116772339,
"grad_norm": 2.561213493347168,
"learning_rate": 0.00010968692602239355,
"loss": 3.5389044189453127,
"step": 183400
},
{
"epoch": 1.904178816399805,
"grad_norm": 6.130541801452637,
"learning_rate": 0.00010958315605964697,
"loss": 3.4779763793945313,
"step": 183500
},
{
"epoch": 1.9052165160272707,
"grad_norm": 3.2996444702148438,
"learning_rate": 0.00010947938609690038,
"loss": 3.4853436279296877,
"step": 183600
},
{
"epoch": 1.9062542156547366,
"grad_norm": 4.535896301269531,
"learning_rate": 0.0001093756161341538,
"loss": 3.4235238647460937,
"step": 183700
},
{
"epoch": 1.9072919152822023,
"grad_norm": 4.082485675811768,
"learning_rate": 0.00010927184617140721,
"loss": 3.4645541381835936,
"step": 183800
},
{
"epoch": 1.9083296149096682,
"grad_norm": 5.501161098480225,
"learning_rate": 0.00010916807620866062,
"loss": 3.555899658203125,
"step": 183900
},
{
"epoch": 1.9093673145371342,
"grad_norm": 7.624723434448242,
"learning_rate": 0.00010906430624591404,
"loss": 3.4653219604492187,
"step": 184000
},
{
"epoch": 1.9104050141646,
"grad_norm": 3.386392116546631,
"learning_rate": 0.00010896053628316746,
"loss": 3.5530450439453123,
"step": 184100
},
{
"epoch": 1.9114427137920658,
"grad_norm": 4.087791442871094,
"learning_rate": 0.00010885676632042089,
"loss": 3.470418701171875,
"step": 184200
},
{
"epoch": 1.9124804134195315,
"grad_norm": 4.145429611206055,
"learning_rate": 0.0001087529963576743,
"loss": 3.416697692871094,
"step": 184300
},
{
"epoch": 1.9135181130469974,
"grad_norm": 4.366927623748779,
"learning_rate": 0.00010864922639492772,
"loss": 3.4999765014648436,
"step": 184400
},
{
"epoch": 1.9145558126744633,
"grad_norm": 4.084202289581299,
"learning_rate": 0.00010854545643218113,
"loss": 3.435041809082031,
"step": 184500
},
{
"epoch": 1.9155935123019292,
"grad_norm": 9.935702323913574,
"learning_rate": 0.00010844168646943455,
"loss": 3.54100341796875,
"step": 184600
},
{
"epoch": 1.916631211929395,
"grad_norm": 6.931925296783447,
"learning_rate": 0.00010833791650668797,
"loss": 3.5136874389648436,
"step": 184700
},
{
"epoch": 1.9176689115568606,
"grad_norm": 3.0231878757476807,
"learning_rate": 0.00010823414654394139,
"loss": 3.6150555419921875,
"step": 184800
},
{
"epoch": 1.9187066111843265,
"grad_norm": 3.3393242359161377,
"learning_rate": 0.0001081303765811948,
"loss": 3.479617004394531,
"step": 184900
},
{
"epoch": 1.9197443108117924,
"grad_norm": 1.9449257850646973,
"learning_rate": 0.00010802660661844821,
"loss": 3.4772000122070312,
"step": 185000
},
{
"epoch": 1.9207820104392583,
"grad_norm": 5.924251079559326,
"learning_rate": 0.00010792283665570163,
"loss": 3.558631591796875,
"step": 185100
},
{
"epoch": 1.9218197100667243,
"grad_norm": 3.7242231369018555,
"learning_rate": 0.00010781906669295504,
"loss": 3.4901129150390626,
"step": 185200
},
{
"epoch": 1.92285740969419,
"grad_norm": 4.291270732879639,
"learning_rate": 0.00010771529673020847,
"loss": 3.4830392456054686,
"step": 185300
},
{
"epoch": 1.9238951093216556,
"grad_norm": 8.315948486328125,
"learning_rate": 0.00010761152676746188,
"loss": 3.654394226074219,
"step": 185400
},
{
"epoch": 1.9249328089491216,
"grad_norm": 3.3864219188690186,
"learning_rate": 0.0001075077568047153,
"loss": 3.4916171264648437,
"step": 185500
},
{
"epoch": 1.9259705085765875,
"grad_norm": 2.4446215629577637,
"learning_rate": 0.00010740398684196871,
"loss": 3.5801641845703127,
"step": 185600
},
{
"epoch": 1.9270082082040534,
"grad_norm": 4.319270133972168,
"learning_rate": 0.00010730021687922213,
"loss": 3.485596008300781,
"step": 185700
},
{
"epoch": 1.928045907831519,
"grad_norm": 12.243918418884277,
"learning_rate": 0.00010719644691647554,
"loss": 3.297283020019531,
"step": 185800
},
{
"epoch": 1.9290836074589848,
"grad_norm": 3.614396333694458,
"learning_rate": 0.00010709267695372895,
"loss": 3.4672842407226563,
"step": 185900
},
{
"epoch": 1.9301213070864507,
"grad_norm": 7.824878692626953,
"learning_rate": 0.00010698890699098239,
"loss": 3.5030999755859376,
"step": 186000
},
{
"epoch": 1.9311590067139166,
"grad_norm": 11.845438003540039,
"learning_rate": 0.0001068851370282358,
"loss": 3.5722430419921873,
"step": 186100
},
{
"epoch": 1.9321967063413825,
"grad_norm": 8.008241653442383,
"learning_rate": 0.00010678136706548922,
"loss": 3.4848983764648436,
"step": 186200
},
{
"epoch": 1.9332344059688482,
"grad_norm": 38.26485824584961,
"learning_rate": 0.00010667759710274262,
"loss": 3.4654171752929686,
"step": 186300
},
{
"epoch": 1.9342721055963141,
"grad_norm": 3.587207317352295,
"learning_rate": 0.00010657382713999605,
"loss": 3.443753967285156,
"step": 186400
},
{
"epoch": 1.9353098052237798,
"grad_norm": 7.548192024230957,
"learning_rate": 0.00010647005717724946,
"loss": 3.555989074707031,
"step": 186500
},
{
"epoch": 1.9363475048512457,
"grad_norm": 5.652491092681885,
"learning_rate": 0.00010636628721450289,
"loss": 3.5138848876953124,
"step": 186600
},
{
"epoch": 1.9373852044787117,
"grad_norm": 4.181760311126709,
"learning_rate": 0.0001062625172517563,
"loss": 3.4649755859375,
"step": 186700
},
{
"epoch": 1.9384229041061776,
"grad_norm": 39.51677703857422,
"learning_rate": 0.00010615874728900972,
"loss": 3.4170611572265623,
"step": 186800
},
{
"epoch": 1.9394606037336433,
"grad_norm": 5.663796901702881,
"learning_rate": 0.00010605497732626313,
"loss": 3.6423403930664064,
"step": 186900
},
{
"epoch": 1.940498303361109,
"grad_norm": 49.58971405029297,
"learning_rate": 0.00010595120736351654,
"loss": 3.556903076171875,
"step": 187000
},
{
"epoch": 1.9415360029885749,
"grad_norm": 4.037705421447754,
"learning_rate": 0.00010584743740076996,
"loss": 3.581287536621094,
"step": 187100
},
{
"epoch": 1.9425737026160408,
"grad_norm": 2.6354784965515137,
"learning_rate": 0.00010574366743802338,
"loss": 3.4927523803710936,
"step": 187200
},
{
"epoch": 1.9436114022435067,
"grad_norm": 3.8889167308807373,
"learning_rate": 0.0001056398974752768,
"loss": 3.485701904296875,
"step": 187300
},
{
"epoch": 1.9446491018709724,
"grad_norm": 6.694062232971191,
"learning_rate": 0.00010553612751253021,
"loss": 3.3910641479492187,
"step": 187400
},
{
"epoch": 1.9456868014984383,
"grad_norm": 5.231113910675049,
"learning_rate": 0.00010543235754978363,
"loss": 3.5116064453125,
"step": 187500
},
{
"epoch": 1.946724501125904,
"grad_norm": 13.281269073486328,
"learning_rate": 0.00010532858758703704,
"loss": 3.5454452514648436,
"step": 187600
},
{
"epoch": 1.94776220075337,
"grad_norm": 5.362813472747803,
"learning_rate": 0.00010522481762429046,
"loss": 3.5717642211914065,
"step": 187700
},
{
"epoch": 1.9487999003808358,
"grad_norm": 3.0265583992004395,
"learning_rate": 0.00010512104766154387,
"loss": 3.529712829589844,
"step": 187800
},
{
"epoch": 1.9498376000083018,
"grad_norm": 2.4003071784973145,
"learning_rate": 0.00010501727769879731,
"loss": 3.5179287719726564,
"step": 187900
},
{
"epoch": 1.9508752996357674,
"grad_norm": 3.5519869327545166,
"learning_rate": 0.00010491350773605072,
"loss": 3.3665447998046876,
"step": 188000
},
{
"epoch": 1.9519129992632331,
"grad_norm": 1.9300223588943481,
"learning_rate": 0.00010480973777330412,
"loss": 3.5477023315429688,
"step": 188100
},
{
"epoch": 1.952950698890699,
"grad_norm": 3.3745410442352295,
"learning_rate": 0.00010470596781055755,
"loss": 3.5283209228515626,
"step": 188200
},
{
"epoch": 1.953988398518165,
"grad_norm": 18.314775466918945,
"learning_rate": 0.00010460219784781096,
"loss": 3.4730484008789064,
"step": 188300
},
{
"epoch": 1.9550260981456309,
"grad_norm": 4.006529331207275,
"learning_rate": 0.00010449842788506438,
"loss": 3.4675115966796874,
"step": 188400
},
{
"epoch": 1.9560637977730966,
"grad_norm": 4.9441094398498535,
"learning_rate": 0.0001043946579223178,
"loss": 3.404721984863281,
"step": 188500
},
{
"epoch": 1.9571014974005623,
"grad_norm": 3.18265962600708,
"learning_rate": 0.00010429088795957122,
"loss": 3.667085876464844,
"step": 188600
},
{
"epoch": 1.9581391970280282,
"grad_norm": 3.0164151191711426,
"learning_rate": 0.00010418711799682463,
"loss": 3.5224847412109375,
"step": 188700
},
{
"epoch": 1.959176896655494,
"grad_norm": 5.3650007247924805,
"learning_rate": 0.00010408334803407805,
"loss": 3.4098544311523438,
"step": 188800
},
{
"epoch": 1.96021459628296,
"grad_norm": 6.3775224685668945,
"learning_rate": 0.00010397957807133146,
"loss": 3.649906005859375,
"step": 188900
},
{
"epoch": 1.9612522959104257,
"grad_norm": 18.32954978942871,
"learning_rate": 0.00010387580810858487,
"loss": 3.642203674316406,
"step": 189000
},
{
"epoch": 1.9622899955378916,
"grad_norm": 3.267017126083374,
"learning_rate": 0.0001037720381458383,
"loss": 3.522268981933594,
"step": 189100
},
{
"epoch": 1.9633276951653573,
"grad_norm": 3.3189854621887207,
"learning_rate": 0.00010366826818309171,
"loss": 3.525494384765625,
"step": 189200
},
{
"epoch": 1.9643653947928232,
"grad_norm": 20.459917068481445,
"learning_rate": 0.00010356449822034513,
"loss": 3.4846673583984376,
"step": 189300
},
{
"epoch": 1.9654030944202892,
"grad_norm": 10.600302696228027,
"learning_rate": 0.00010346072825759854,
"loss": 3.4710623168945314,
"step": 189400
},
{
"epoch": 1.966440794047755,
"grad_norm": 5.836012363433838,
"learning_rate": 0.00010335695829485196,
"loss": 3.395472412109375,
"step": 189500
},
{
"epoch": 1.9674784936752208,
"grad_norm": 1.8093000650405884,
"learning_rate": 0.00010325318833210537,
"loss": 3.4295391845703125,
"step": 189600
},
{
"epoch": 1.9685161933026865,
"grad_norm": 3.580705165863037,
"learning_rate": 0.0001031494183693588,
"loss": 3.571369934082031,
"step": 189700
},
{
"epoch": 1.9695538929301524,
"grad_norm": 4.870438575744629,
"learning_rate": 0.00010304564840661222,
"loss": 3.520045166015625,
"step": 189800
},
{
"epoch": 1.9705915925576183,
"grad_norm": 3.781505823135376,
"learning_rate": 0.00010294187844386564,
"loss": 3.5424517822265624,
"step": 189900
},
{
"epoch": 1.9716292921850842,
"grad_norm": 3.340085983276367,
"learning_rate": 0.00010283810848111905,
"loss": 3.518573913574219,
"step": 190000
},
{
"epoch": 1.97266699181255,
"grad_norm": 5.02490234375,
"learning_rate": 0.00010273433851837245,
"loss": 3.3679263305664064,
"step": 190100
},
{
"epoch": 1.9737046914400158,
"grad_norm": 4.117876052856445,
"learning_rate": 0.00010263056855562588,
"loss": 3.5929489135742188,
"step": 190200
},
{
"epoch": 1.9747423910674815,
"grad_norm": 3.8365478515625,
"learning_rate": 0.00010252679859287929,
"loss": 3.40560302734375,
"step": 190300
},
{
"epoch": 1.9757800906949474,
"grad_norm": 7.205904006958008,
"learning_rate": 0.00010242302863013272,
"loss": 3.38099609375,
"step": 190400
},
{
"epoch": 1.9768177903224133,
"grad_norm": 2.767961025238037,
"learning_rate": 0.00010231925866738613,
"loss": 3.4381674194335936,
"step": 190500
},
{
"epoch": 1.9778554899498793,
"grad_norm": 4.335025310516357,
"learning_rate": 0.00010221548870463955,
"loss": 3.3964199829101562,
"step": 190600
},
{
"epoch": 1.978893189577345,
"grad_norm": 4.294001579284668,
"learning_rate": 0.00010211171874189296,
"loss": 3.411571350097656,
"step": 190700
},
{
"epoch": 1.9799308892048106,
"grad_norm": 3.6443490982055664,
"learning_rate": 0.00010200794877914638,
"loss": 3.4534707641601563,
"step": 190800
},
{
"epoch": 1.9809685888322766,
"grad_norm": 4.729245662689209,
"learning_rate": 0.00010190417881639979,
"loss": 3.577586669921875,
"step": 190900
},
{
"epoch": 1.9820062884597425,
"grad_norm": 3.587510108947754,
"learning_rate": 0.00010180040885365323,
"loss": 3.4148577880859374,
"step": 191000
},
{
"epoch": 1.9830439880872084,
"grad_norm": 13.635988235473633,
"learning_rate": 0.00010169663889090663,
"loss": 3.531971435546875,
"step": 191100
},
{
"epoch": 1.984081687714674,
"grad_norm": 4.0034356117248535,
"learning_rate": 0.00010159286892816004,
"loss": 3.464627685546875,
"step": 191200
},
{
"epoch": 1.98511938734214,
"grad_norm": 4.326283931732178,
"learning_rate": 0.00010148909896541346,
"loss": 3.4689093017578125,
"step": 191300
},
{
"epoch": 1.9861570869696057,
"grad_norm": 10.159041404724121,
"learning_rate": 0.00010138532900266687,
"loss": 3.4093603515625,
"step": 191400
},
{
"epoch": 1.9871947865970716,
"grad_norm": 6.295145511627197,
"learning_rate": 0.0001012815590399203,
"loss": 3.4013311767578127,
"step": 191500
},
{
"epoch": 1.9882324862245375,
"grad_norm": 2.6228549480438232,
"learning_rate": 0.0001011777890771737,
"loss": 3.4039892578125,
"step": 191600
},
{
"epoch": 1.9892701858520034,
"grad_norm": 2.0637784004211426,
"learning_rate": 0.00010107401911442714,
"loss": 3.4192919921875,
"step": 191700
},
{
"epoch": 1.9903078854794691,
"grad_norm": 4.193583011627197,
"learning_rate": 0.00010097024915168055,
"loss": 3.5069757080078126,
"step": 191800
},
{
"epoch": 1.9913455851069348,
"grad_norm": 3.6812117099761963,
"learning_rate": 0.00010086647918893397,
"loss": 3.421480712890625,
"step": 191900
},
{
"epoch": 1.9923832847344007,
"grad_norm": 33.859195709228516,
"learning_rate": 0.00010076270922618738,
"loss": 3.506886291503906,
"step": 192000
},
{
"epoch": 1.9934209843618667,
"grad_norm": 3.308947801589966,
"learning_rate": 0.00010065893926344079,
"loss": 3.424991455078125,
"step": 192100
},
{
"epoch": 1.9944586839893326,
"grad_norm": 4.380412578582764,
"learning_rate": 0.00010055516930069421,
"loss": 3.4896340942382813,
"step": 192200
},
{
"epoch": 1.9954963836167983,
"grad_norm": 3.492359161376953,
"learning_rate": 0.00010045139933794763,
"loss": 3.403392333984375,
"step": 192300
},
{
"epoch": 1.996534083244264,
"grad_norm": 8.865891456604004,
"learning_rate": 0.00010034762937520105,
"loss": 3.60391845703125,
"step": 192400
},
{
"epoch": 1.9975717828717299,
"grad_norm": 1.982731819152832,
"learning_rate": 0.00010024385941245446,
"loss": 3.5614895629882812,
"step": 192500
},
{
"epoch": 1.9986094824991958,
"grad_norm": 2.9287161827087402,
"learning_rate": 0.00010014008944970788,
"loss": 3.5097760009765624,
"step": 192600
},
{
"epoch": 1.9996471821266617,
"grad_norm": 1.8267062902450562,
"learning_rate": 0.00010003631948696129,
"loss": 3.4958160400390623,
"step": 192700
}
],
"logging_steps": 100,
"max_steps": 289101,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2645192822135194e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}