| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.820250284414107, | |
| "eval_steps": 200, | |
| "global_step": 5000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.7517368793487549, | |
| "learning_rate": 9.999e-07, | |
| "loss": 0.2253, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.4760167598724365, | |
| "learning_rate": 9.998e-07, | |
| "loss": 0.1588, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "eval_validation_loss": 0.1540723592042923, | |
| "eval_validation_runtime": 386.0092, | |
| "eval_validation_samples_per_second": 3.0, | |
| "eval_validation_steps_per_second": 0.376, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "eval_benchmark_loss": 0.3658876121044159, | |
| "eval_benchmark_runtime": 9.4993, | |
| "eval_benchmark_samples_per_second": 2.737, | |
| "eval_benchmark_steps_per_second": 0.421, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.4102065563201904, | |
| "learning_rate": 9.997e-07, | |
| "loss": 0.1423, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.657212495803833, | |
| "learning_rate": 9.996e-07, | |
| "loss": 0.1345, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_validation_loss": 0.13775885105133057, | |
| "eval_validation_runtime": 383.1365, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_benchmark_loss": 0.3596523106098175, | |
| "eval_benchmark_runtime": 9.4551, | |
| "eval_benchmark_samples_per_second": 2.75, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.1109451055526733, | |
| "learning_rate": 9.995e-07, | |
| "loss": 0.1278, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.219743013381958, | |
| "learning_rate": 9.994e-07, | |
| "loss": 0.128, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "eval_validation_loss": 0.1310659945011139, | |
| "eval_validation_runtime": 382.987, | |
| "eval_validation_samples_per_second": 3.024, | |
| "eval_validation_steps_per_second": 0.379, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "eval_benchmark_loss": 0.35413601994514465, | |
| "eval_benchmark_runtime": 9.452, | |
| "eval_benchmark_samples_per_second": 2.751, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.3631513118743896, | |
| "learning_rate": 9.993e-07, | |
| "loss": 0.1251, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.6779521703720093, | |
| "learning_rate": 9.992e-07, | |
| "loss": 0.1198, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "eval_validation_loss": 0.12629443407058716, | |
| "eval_validation_runtime": 383.2491, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "eval_benchmark_loss": 0.35233384370803833, | |
| "eval_benchmark_runtime": 9.4696, | |
| "eval_benchmark_samples_per_second": 2.746, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 2.0499491691589355, | |
| "learning_rate": 9.990999999999999e-07, | |
| "loss": 0.1127, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 2.0875236988067627, | |
| "learning_rate": 9.989999999999999e-07, | |
| "loss": 0.1227, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_validation_loss": 0.12267088890075684, | |
| "eval_validation_runtime": 383.4228, | |
| "eval_validation_samples_per_second": 3.02, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_benchmark_loss": 0.35131120681762695, | |
| "eval_benchmark_runtime": 9.4614, | |
| "eval_benchmark_samples_per_second": 2.748, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.061170816421509, | |
| "learning_rate": 9.988999999999999e-07, | |
| "loss": 0.1178, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.5514030456542969, | |
| "learning_rate": 9.988e-07, | |
| "loss": 0.1157, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_validation_loss": 0.12000161409378052, | |
| "eval_validation_runtime": 383.1507, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_benchmark_loss": 0.3518659174442291, | |
| "eval_benchmark_runtime": 9.4808, | |
| "eval_benchmark_samples_per_second": 2.742, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.9566336870193481, | |
| "learning_rate": 9.987e-07, | |
| "loss": 0.1114, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.365792155265808, | |
| "learning_rate": 9.986e-07, | |
| "loss": 0.115, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "eval_validation_loss": 0.117983378469944, | |
| "eval_validation_runtime": 383.1539, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "eval_benchmark_loss": 0.3509116470813751, | |
| "eval_benchmark_runtime": 9.4708, | |
| "eval_benchmark_samples_per_second": 2.745, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.6281362771987915, | |
| "learning_rate": 9.985e-07, | |
| "loss": 0.1192, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.7361140251159668, | |
| "learning_rate": 9.983999999999998e-07, | |
| "loss": 0.1114, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "eval_validation_loss": 0.11662287265062332, | |
| "eval_validation_runtime": 383.2086, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "eval_benchmark_loss": 0.3516782522201538, | |
| "eval_benchmark_runtime": 9.4592, | |
| "eval_benchmark_samples_per_second": 2.749, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 2.332932710647583, | |
| "learning_rate": 9.982999999999998e-07, | |
| "loss": 0.1128, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.4797985553741455, | |
| "learning_rate": 9.982e-07, | |
| "loss": 0.1036, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "eval_validation_loss": 0.11438746005296707, | |
| "eval_validation_runtime": 383.1904, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "eval_benchmark_loss": 0.34705713391304016, | |
| "eval_benchmark_runtime": 9.4646, | |
| "eval_benchmark_samples_per_second": 2.747, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.546353816986084, | |
| "learning_rate": 9.981e-07, | |
| "loss": 0.1132, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.1084446907043457, | |
| "learning_rate": 9.98e-07, | |
| "loss": 0.1125, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "eval_validation_loss": 0.11280551552772522, | |
| "eval_validation_runtime": 383.2179, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "eval_benchmark_loss": 0.34782856702804565, | |
| "eval_benchmark_runtime": 9.4577, | |
| "eval_benchmark_samples_per_second": 2.749, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.3358793258666992, | |
| "learning_rate": 9.979e-07, | |
| "loss": 0.1155, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.6584503650665283, | |
| "learning_rate": 9.978e-07, | |
| "loss": 0.1034, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_validation_loss": 0.11192985624074936, | |
| "eval_validation_runtime": 383.2409, | |
| "eval_validation_samples_per_second": 3.022, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_benchmark_loss": 0.3532721698284149, | |
| "eval_benchmark_runtime": 9.4718, | |
| "eval_benchmark_samples_per_second": 2.745, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.2788842916488647, | |
| "learning_rate": 9.977e-07, | |
| "loss": 0.1091, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.3015437126159668, | |
| "learning_rate": 9.976e-07, | |
| "loss": 0.1024, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "eval_validation_loss": 0.11032804101705551, | |
| "eval_validation_runtime": 383.3623, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "eval_benchmark_loss": 0.34818902611732483, | |
| "eval_benchmark_runtime": 9.4633, | |
| "eval_benchmark_samples_per_second": 2.747, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.2690401077270508, | |
| "learning_rate": 9.975e-07, | |
| "loss": 0.0981, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.4752624034881592, | |
| "learning_rate": 9.974e-07, | |
| "loss": 0.1036, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "eval_validation_loss": 0.10945109277963638, | |
| "eval_validation_runtime": 383.3638, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "eval_benchmark_loss": 0.3477522134780884, | |
| "eval_benchmark_runtime": 9.4688, | |
| "eval_benchmark_samples_per_second": 2.746, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.502576231956482, | |
| "learning_rate": 9.973e-07, | |
| "loss": 0.11, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.9190167784690857, | |
| "learning_rate": 9.972e-07, | |
| "loss": 0.086, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "eval_validation_loss": 0.11021959781646729, | |
| "eval_validation_runtime": 383.4523, | |
| "eval_validation_samples_per_second": 3.02, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "eval_benchmark_loss": 0.35620927810668945, | |
| "eval_benchmark_runtime": 9.4715, | |
| "eval_benchmark_samples_per_second": 2.745, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 2.0933382511138916, | |
| "learning_rate": 9.971e-07, | |
| "loss": 0.0849, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.020569920539856, | |
| "learning_rate": 9.97e-07, | |
| "loss": 0.0858, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "eval_validation_loss": 0.10909596085548401, | |
| "eval_validation_runtime": 383.3001, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "eval_benchmark_loss": 0.3547273278236389, | |
| "eval_benchmark_runtime": 9.4578, | |
| "eval_benchmark_samples_per_second": 2.749, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.2632180452346802, | |
| "learning_rate": 9.969e-07, | |
| "loss": 0.0834, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.0163979530334473, | |
| "learning_rate": 9.968e-07, | |
| "loss": 0.0786, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "eval_validation_loss": 0.11020828038454056, | |
| "eval_validation_runtime": 383.4418, | |
| "eval_validation_samples_per_second": 3.02, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "eval_benchmark_loss": 0.35691550374031067, | |
| "eval_benchmark_runtime": 9.4772, | |
| "eval_benchmark_samples_per_second": 2.743, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.0731770992279053, | |
| "learning_rate": 9.967e-07, | |
| "loss": 0.0817, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.4652243852615356, | |
| "learning_rate": 9.966e-07, | |
| "loss": 0.0885, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "eval_validation_loss": 0.10934263467788696, | |
| "eval_validation_runtime": 383.3702, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "eval_benchmark_loss": 0.35489001870155334, | |
| "eval_benchmark_runtime": 9.4613, | |
| "eval_benchmark_samples_per_second": 2.748, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.1992924213409424, | |
| "learning_rate": 9.965e-07, | |
| "loss": 0.078, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.3917715549468994, | |
| "learning_rate": 9.964e-07, | |
| "loss": 0.0857, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "eval_validation_loss": 0.10760623216629028, | |
| "eval_validation_runtime": 383.3793, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "eval_benchmark_loss": 0.35664084553718567, | |
| "eval_benchmark_runtime": 9.4529, | |
| "eval_benchmark_samples_per_second": 2.75, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.9880099892616272, | |
| "learning_rate": 9.962999999999999e-07, | |
| "loss": 0.084, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.15889310836792, | |
| "learning_rate": 9.961999999999999e-07, | |
| "loss": 0.0833, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "eval_validation_loss": 0.10785996913909912, | |
| "eval_validation_runtime": 383.3503, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "eval_benchmark_loss": 0.3560557961463928, | |
| "eval_benchmark_runtime": 9.457, | |
| "eval_benchmark_samples_per_second": 2.749, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.4265196323394775, | |
| "learning_rate": 9.960999999999999e-07, | |
| "loss": 0.0831, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.4567248821258545, | |
| "learning_rate": 9.959999999999999e-07, | |
| "loss": 0.084, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "eval_validation_loss": 0.1076769158244133, | |
| "eval_validation_runtime": 383.3556, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "eval_benchmark_loss": 0.3554157614707947, | |
| "eval_benchmark_runtime": 9.4598, | |
| "eval_benchmark_samples_per_second": 2.748, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.7788864374160767, | |
| "learning_rate": 9.958999999999999e-07, | |
| "loss": 0.0813, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.4983494281768799, | |
| "learning_rate": 9.958e-07, | |
| "loss": 0.0815, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "eval_validation_loss": 0.10664806514978409, | |
| "eval_validation_runtime": 383.2552, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "eval_benchmark_loss": 0.35523322224617004, | |
| "eval_benchmark_runtime": 9.4557, | |
| "eval_benchmark_samples_per_second": 2.75, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 0.7813511490821838, | |
| "learning_rate": 9.957e-07, | |
| "loss": 0.0877, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.3383861780166626, | |
| "learning_rate": 9.956e-07, | |
| "loss": 0.0812, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_validation_loss": 0.10593847930431366, | |
| "eval_validation_runtime": 383.4244, | |
| "eval_validation_samples_per_second": 3.02, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_benchmark_loss": 0.3536509871482849, | |
| "eval_benchmark_runtime": 9.4751, | |
| "eval_benchmark_samples_per_second": 2.744, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.5067020654678345, | |
| "learning_rate": 9.955e-07, | |
| "loss": 0.0821, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.46514093875885, | |
| "learning_rate": 9.953999999999998e-07, | |
| "loss": 0.0841, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "eval_validation_loss": 0.10573519766330719, | |
| "eval_validation_runtime": 383.3608, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "eval_benchmark_loss": 0.35240989923477173, | |
| "eval_benchmark_runtime": 9.4682, | |
| "eval_benchmark_samples_per_second": 2.746, | |
| "eval_benchmark_steps_per_second": 0.422, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.124515175819397, | |
| "learning_rate": 9.952999999999998e-07, | |
| "loss": 0.0833, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.2566176652908325, | |
| "learning_rate": 9.952e-07, | |
| "loss": 0.0822, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "eval_validation_loss": 0.10490844398736954, | |
| "eval_validation_runtime": 383.3754, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "eval_benchmark_loss": 0.3501981496810913, | |
| "eval_benchmark_runtime": 9.4492, | |
| "eval_benchmark_samples_per_second": 2.752, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.4075120687484741, | |
| "learning_rate": 9.951e-07, | |
| "loss": 0.0824, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.9538173079490662, | |
| "learning_rate": 9.95e-07, | |
| "loss": 0.0809, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "eval_validation_loss": 0.10531202703714371, | |
| "eval_validation_runtime": 383.2794, | |
| "eval_validation_samples_per_second": 3.021, | |
| "eval_validation_steps_per_second": 0.378, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "eval_benchmark_loss": 0.3513197898864746, | |
| "eval_benchmark_runtime": 9.4616, | |
| "eval_benchmark_samples_per_second": 2.748, | |
| "eval_benchmark_steps_per_second": 0.423, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 1000000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 365, | |
| "save_steps": 200, | |
| "total_flos": 1.617395756875776e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |