| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 290, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.7135222218930721, |
| "epoch": 0.1386481802426343, |
| "grad_norm": 2.765625, |
| "learning_rate": 3.2758620689655175e-06, |
| "loss": 1.0037, |
| "mean_token_accuracy": 0.766643451154232, |
| "num_tokens": 6832690.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.6983387872576714, |
| "epoch": 0.2772963604852686, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.724137931034484e-06, |
| "loss": 0.7837, |
| "mean_token_accuracy": 0.7993580244481564, |
| "num_tokens": 13664933.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.5203136764466763, |
| "epoch": 0.41594454072790293, |
| "grad_norm": 0.44140625, |
| "learning_rate": 9.999541586764836e-06, |
| "loss": 0.5293, |
| "mean_token_accuracy": 0.8474333696067333, |
| "num_tokens": 20500452.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.4632941197603941, |
| "epoch": 0.5545927209705372, |
| "grad_norm": 0.34765625, |
| "learning_rate": 9.799195340909569e-06, |
| "loss": 0.4664, |
| "mean_token_accuracy": 0.8605481564998627, |
| "num_tokens": 27342335.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.44385356418788435, |
| "epoch": 0.6932409012131716, |
| "grad_norm": 0.384765625, |
| "learning_rate": 9.248987682898576e-06, |
| "loss": 0.4448, |
| "mean_token_accuracy": 0.8655192881822587, |
| "num_tokens": 34182590.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.44103220105171204, |
| "epoch": 0.8318890814558059, |
| "grad_norm": 0.341796875, |
| "learning_rate": 8.389028759232816e-06, |
| "loss": 0.4425, |
| "mean_token_accuracy": 0.8660077638924122, |
| "num_tokens": 41024570.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.4328003875911236, |
| "epoch": 0.9705372616984402, |
| "grad_norm": 0.318359375, |
| "learning_rate": 7.2820095883138456e-06, |
| "loss": 0.4334, |
| "mean_token_accuracy": 0.8682045668363572, |
| "num_tokens": 47861377.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.4284165660282234, |
| "epoch": 1.1039861351819757, |
| "grad_norm": 0.326171875, |
| "learning_rate": 6.008631884264387e-06, |
| "loss": 0.4289, |
| "mean_token_accuracy": 0.868948469688366, |
| "num_tokens": 54391813.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.4236792534589767, |
| "epoch": 1.24263431542461, |
| "grad_norm": 0.341796875, |
| "learning_rate": 4.661724900761355e-06, |
| "loss": 0.4239, |
| "mean_token_accuracy": 0.8704770557582379, |
| "num_tokens": 61227970.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.42442810237407685, |
| "epoch": 1.3812824956672443, |
| "grad_norm": 0.33984375, |
| "learning_rate": 3.3394781770539406e-06, |
| "loss": 0.4245, |
| "mean_token_accuracy": 0.8702129699289799, |
| "num_tokens": 68065726.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.42482112273573874, |
| "epoch": 1.5199306759098787, |
| "grad_norm": 0.318359375, |
| "learning_rate": 2.138283519083281e-06, |
| "loss": 0.4249, |
| "mean_token_accuracy": 0.8700512439012528, |
| "num_tokens": 74903041.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.4213219854980707, |
| "epoch": 1.658578856152513, |
| "grad_norm": 0.32421875, |
| "learning_rate": 1.145708035387177e-06, |
| "loss": 0.4219, |
| "mean_token_accuracy": 0.8707552805542946, |
| "num_tokens": 81743295.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.422089908644557, |
| "epoch": 1.7972270363951472, |
| "grad_norm": 0.322265625, |
| "learning_rate": 4.341104935775442e-07, |
| "loss": 0.4229, |
| "mean_token_accuracy": 0.8708024740219116, |
| "num_tokens": 88577789.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.4224289160221815, |
| "epoch": 1.9358752166377817, |
| "grad_norm": 0.322265625, |
| "learning_rate": 5.536636509891225e-08, |
| "loss": 0.4232, |
| "mean_token_accuracy": 0.8705480195581913, |
| "num_tokens": 95417722.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.4213579764237275, |
| "epoch": 2.0, |
| "mean_token_accuracy": 0.8710838395196039, |
| "num_tokens": 98528536.0, |
| "step": 290, |
| "total_flos": 2.1561577524323942e+18, |
| "train_loss": 0.5024350297862086, |
| "train_runtime": 9862.6956, |
| "train_samples_per_second": 9.819, |
| "train_steps_per_second": 0.029 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 290, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.1561577524323942e+18, |
| "train_batch_size": 84, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|