Upload folder using huggingface_hub
Browse files- config.json +130 -0
- model.safetensors +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +17 -0
- trainer_state.json +874 -0
- training_args.bin +3 -0
config.json
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architecture_type": "uni-encoder",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GLiClassModel"
|
| 5 |
+
],
|
| 6 |
+
"class_token_index": 50368,
|
| 7 |
+
"class_token_pooling": "first",
|
| 8 |
+
"contrastive_loss_coef": 0.0,
|
| 9 |
+
"cross_encoder_config": null,
|
| 10 |
+
"dropout": 0.1,
|
| 11 |
+
"dtype": "float32",
|
| 12 |
+
"embed_class_token": true,
|
| 13 |
+
"encoder_config": {
|
| 14 |
+
"_attn_implementation_autoset": false,
|
| 15 |
+
"_name_or_path": "jhu-clsp/ettin-encoder-32m",
|
| 16 |
+
"add_cross_attention": false,
|
| 17 |
+
"architectures": [
|
| 18 |
+
"ModernBertForMaskedLM"
|
| 19 |
+
],
|
| 20 |
+
"attention_bias": false,
|
| 21 |
+
"attention_dropout": 0.0,
|
| 22 |
+
"bos_token_id": 50281,
|
| 23 |
+
"causal_mask": false,
|
| 24 |
+
"classifier_activation": "gelu",
|
| 25 |
+
"classifier_bias": false,
|
| 26 |
+
"classifier_dropout": 0.0,
|
| 27 |
+
"classifier_pooling": "mean",
|
| 28 |
+
"cls_token_id": 50281,
|
| 29 |
+
"cross_attention_hidden_size": null,
|
| 30 |
+
"decoder_bias": true,
|
| 31 |
+
"decoder_start_token_id": null,
|
| 32 |
+
"deterministic_flash_attn": false,
|
| 33 |
+
"dtype": "float32",
|
| 34 |
+
"embedding_dropout": 0.0,
|
| 35 |
+
"eos_token_id": 50282,
|
| 36 |
+
"finetuning_task": null,
|
| 37 |
+
"global_attn_every_n_layers": 3,
|
| 38 |
+
"gradient_checkpointing": false,
|
| 39 |
+
"hidden_activation": "gelu",
|
| 40 |
+
"hidden_size": 384,
|
| 41 |
+
"initializer_cutoff_factor": 2.0,
|
| 42 |
+
"initializer_range": 0.02,
|
| 43 |
+
"intermediate_size": 576,
|
| 44 |
+
"is_causal": false,
|
| 45 |
+
"is_decoder": false,
|
| 46 |
+
"layer_norm_eps": 1e-05,
|
| 47 |
+
"layer_types": [
|
| 48 |
+
"full_attention",
|
| 49 |
+
"sliding_attention",
|
| 50 |
+
"sliding_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"sliding_attention",
|
| 53 |
+
"sliding_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"sliding_attention",
|
| 56 |
+
"sliding_attention",
|
| 57 |
+
"full_attention"
|
| 58 |
+
],
|
| 59 |
+
"local_attention": 128,
|
| 60 |
+
"max_position_embeddings": 7999,
|
| 61 |
+
"mlp_bias": false,
|
| 62 |
+
"mlp_dropout": 0.0,
|
| 63 |
+
"model_type": "modernbert",
|
| 64 |
+
"norm_bias": false,
|
| 65 |
+
"norm_eps": 1e-05,
|
| 66 |
+
"num_attention_heads": 6,
|
| 67 |
+
"num_hidden_layers": 10,
|
| 68 |
+
"pad_token_id": 50283,
|
| 69 |
+
"position_embedding_type": "sans_pos",
|
| 70 |
+
"prefix": null,
|
| 71 |
+
"pruned_heads": {},
|
| 72 |
+
"repad_logits_with_grad": false,
|
| 73 |
+
"rope_parameters": {
|
| 74 |
+
"full_attention": {
|
| 75 |
+
"rope_theta": 160000.0,
|
| 76 |
+
"rope_type": "default"
|
| 77 |
+
},
|
| 78 |
+
"sliding_attention": {
|
| 79 |
+
"rope_theta": 160000.0,
|
| 80 |
+
"rope_type": "default"
|
| 81 |
+
}
|
| 82 |
+
},
|
| 83 |
+
"sep_token_id": 50282,
|
| 84 |
+
"sparse_pred_ignore_index": -100,
|
| 85 |
+
"sparse_prediction": false,
|
| 86 |
+
"task_specific_params": null,
|
| 87 |
+
"tf_legacy_loss": false,
|
| 88 |
+
"tie_encoder_decoder": false,
|
| 89 |
+
"tie_word_embeddings": true,
|
| 90 |
+
"tokenizer_class": null,
|
| 91 |
+
"torchscript": false,
|
| 92 |
+
"use_bfloat16": false,
|
| 93 |
+
"vocab_size": 50370
|
| 94 |
+
},
|
| 95 |
+
"encoder_layer_id": -1,
|
| 96 |
+
"encoder_model_name": "jhu-clsp/ettin-encoder-32m",
|
| 97 |
+
"example_token_index": 50372,
|
| 98 |
+
"extract_text_features": false,
|
| 99 |
+
"focal_loss_alpha": 0.7,
|
| 100 |
+
"focal_loss_gamma": -1,
|
| 101 |
+
"focal_loss_reduction": "none",
|
| 102 |
+
"hidden_size": 384,
|
| 103 |
+
"ignore_index": -100,
|
| 104 |
+
"initializer_range": 0.03,
|
| 105 |
+
"label_model_config": null,
|
| 106 |
+
"label_model_name": null,
|
| 107 |
+
"layer_wise": false,
|
| 108 |
+
"logit_scale_init_value": 2.6592,
|
| 109 |
+
"max_labels_alloc": "dynamic",
|
| 110 |
+
"max_num_classes": 25,
|
| 111 |
+
"model_type": "GLiClass",
|
| 112 |
+
"normalize_features": false,
|
| 113 |
+
"pad_token_id": 50283,
|
| 114 |
+
"pooling_strategy": "first",
|
| 115 |
+
"problem_type": "multi_label_classification",
|
| 116 |
+
"projector_hidden_act": "gelu",
|
| 117 |
+
"prompt_first": true,
|
| 118 |
+
"scorer_attn_dropout": 0.1,
|
| 119 |
+
"scorer_mlp_hidden_size": 1024,
|
| 120 |
+
"scorer_num_heads": 16,
|
| 121 |
+
"scorer_type": "mlp",
|
| 122 |
+
"shuffle_labels": true,
|
| 123 |
+
"squeeze_layers": false,
|
| 124 |
+
"text_token_index": 50369,
|
| 125 |
+
"transformers_version": "5.1.0",
|
| 126 |
+
"use_cache": false,
|
| 127 |
+
"use_lstm": false,
|
| 128 |
+
"use_segment_embeddings": false,
|
| 129 |
+
"vocab_size": 50370
|
| 130 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:979d72fa0026def128a9fdfccdb763c3f226d5df87883ecc205dac45be53c893
|
| 3 |
+
size 130829312
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": true,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"clean_up_tokenization_spaces": true,
|
| 5 |
+
"cls_token": "[CLS]",
|
| 6 |
+
"is_local": true,
|
| 7 |
+
"mask_token": "[MASK]",
|
| 8 |
+
"model_input_names": [
|
| 9 |
+
"input_ids",
|
| 10 |
+
"attention_mask"
|
| 11 |
+
],
|
| 12 |
+
"model_max_length": 8192,
|
| 13 |
+
"pad_token": "[PAD]",
|
| 14 |
+
"sep_token": "[SEP]",
|
| 15 |
+
"tokenizer_class": "TokenizersBackend",
|
| 16 |
+
"unk_token": "[UNK]"
|
| 17 |
+
}
|
trainer_state.json
ADDED
|
@@ -0,0 +1,874 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 9.892827699917559,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 12000,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.08244023083264633,
|
| 14 |
+
"grad_norm": 1.0573325157165527,
|
| 15 |
+
"learning_rate": 4.892915980230642e-07,
|
| 16 |
+
"loss": 0.05755834102630615,
|
| 17 |
+
"step": 100
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.16488046166529266,
|
| 21 |
+
"grad_norm": 10.260250091552734,
|
| 22 |
+
"learning_rate": 9.835255354200989e-07,
|
| 23 |
+
"loss": 0.04523322582244873,
|
| 24 |
+
"step": 200
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.247320692497939,
|
| 28 |
+
"grad_norm": 0.5175366997718811,
|
| 29 |
+
"learning_rate": 1.4777594728171335e-06,
|
| 30 |
+
"loss": 0.03658797979354858,
|
| 31 |
+
"step": 300
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3297609233305853,
|
| 35 |
+
"grad_norm": 5.0066118240356445,
|
| 36 |
+
"learning_rate": 1.971993410214168e-06,
|
| 37 |
+
"loss": 0.04241968631744385,
|
| 38 |
+
"step": 400
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.41220115416323166,
|
| 42 |
+
"grad_norm": 16.185205459594727,
|
| 43 |
+
"learning_rate": 2.466227347611203e-06,
|
| 44 |
+
"loss": 0.03233745574951172,
|
| 45 |
+
"step": 500
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.494641384995878,
|
| 49 |
+
"grad_norm": 0.4887259900569916,
|
| 50 |
+
"learning_rate": 2.9604612850082373e-06,
|
| 51 |
+
"loss": 0.04120903968811035,
|
| 52 |
+
"step": 600
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.5770816158285244,
|
| 56 |
+
"grad_norm": 1.360079050064087,
|
| 57 |
+
"learning_rate": 2.999528173020731e-06,
|
| 58 |
+
"loss": 0.03448781490325928,
|
| 59 |
+
"step": 700
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.6595218466611706,
|
| 63 |
+
"grad_norm": 1.5775821208953857,
|
| 64 |
+
"learning_rate": 2.997945372145348e-06,
|
| 65 |
+
"loss": 0.03620944499969483,
|
| 66 |
+
"step": 800
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.7419620774938169,
|
| 70 |
+
"grad_norm": 37.682884216308594,
|
| 71 |
+
"learning_rate": 2.9952492059335665e-06,
|
| 72 |
+
"loss": 0.033068180084228516,
|
| 73 |
+
"step": 900
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.8244023083264633,
|
| 77 |
+
"grad_norm": 0.5983888506889343,
|
| 78 |
+
"learning_rate": 2.99144167834231e-06,
|
| 79 |
+
"loss": 0.03487896919250488,
|
| 80 |
+
"step": 1000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.9068425391591096,
|
| 84 |
+
"grad_norm": 0.4312576651573181,
|
| 85 |
+
"learning_rate": 2.986525619360788e-06,
|
| 86 |
+
"loss": 0.032608640193939206,
|
| 87 |
+
"step": 1100
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.989282769991756,
|
| 91 |
+
"grad_norm": 1.5949877500534058,
|
| 92 |
+
"learning_rate": 2.980504682907069e-06,
|
| 93 |
+
"loss": 0.03462482452392578,
|
| 94 |
+
"step": 1200
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.0717230008244023,
|
| 98 |
+
"grad_norm": 0.5395241379737854,
|
| 99 |
+
"learning_rate": 2.9733833441122652e-06,
|
| 100 |
+
"loss": 0.029383325576782228,
|
| 101 |
+
"step": 1300
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.1541632316570487,
|
| 105 |
+
"grad_norm": 7.3991804122924805,
|
| 106 |
+
"learning_rate": 2.9651668959943407e-06,
|
| 107 |
+
"loss": 0.03298326969146728,
|
| 108 |
+
"step": 1400
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.2366034624896949,
|
| 112 |
+
"grad_norm": 3.2689311504364014,
|
| 113 |
+
"learning_rate": 2.955861445524012e-06,
|
| 114 |
+
"loss": 0.03119053363800049,
|
| 115 |
+
"step": 1500
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.3190436933223413,
|
| 119 |
+
"grad_norm": 1.3022229671478271,
|
| 120 |
+
"learning_rate": 2.9454739090856716e-06,
|
| 121 |
+
"loss": 0.03160511493682861,
|
| 122 |
+
"step": 1600
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.4014839241549877,
|
| 126 |
+
"grad_norm": 1.545177936553955,
|
| 127 |
+
"learning_rate": 2.9340120073367064e-06,
|
| 128 |
+
"loss": 0.03241936206817627,
|
| 129 |
+
"step": 1700
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.4839241549876339,
|
| 133 |
+
"grad_norm": 0.3807108998298645,
|
| 134 |
+
"learning_rate": 2.921484259469025e-06,
|
| 135 |
+
"loss": 0.028808791637420655,
|
| 136 |
+
"step": 1800
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.5663643858202803,
|
| 140 |
+
"grad_norm": 1.6713629961013794,
|
| 141 |
+
"learning_rate": 2.907899976877075e-06,
|
| 142 |
+
"loss": 0.026836631298065187,
|
| 143 |
+
"step": 1900
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.6488046166529267,
|
| 147 |
+
"grad_norm": 0.8516013622283936,
|
| 148 |
+
"learning_rate": 2.8932692562370356e-06,
|
| 149 |
+
"loss": 0.02777437925338745,
|
| 150 |
+
"step": 2000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.731244847485573,
|
| 154 |
+
"grad_norm": 6.933730602264404,
|
| 155 |
+
"learning_rate": 2.8776029720023492e-06,
|
| 156 |
+
"loss": 0.03317852735519409,
|
| 157 |
+
"step": 2100
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.8136850783182195,
|
| 161 |
+
"grad_norm": 2.6626434326171875,
|
| 162 |
+
"learning_rate": 2.8609127683211535e-06,
|
| 163 |
+
"loss": 0.030532124042510985,
|
| 164 |
+
"step": 2200
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 1.8961253091508656,
|
| 168 |
+
"grad_norm": 0.9223625659942627,
|
| 169 |
+
"learning_rate": 2.8432110503816364e-06,
|
| 170 |
+
"loss": 0.028622543811798094,
|
| 171 |
+
"step": 2300
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 1.9785655399835118,
|
| 175 |
+
"grad_norm": 1.147865891456604,
|
| 176 |
+
"learning_rate": 2.824510975191734e-06,
|
| 177 |
+
"loss": 0.0330674409866333,
|
| 178 |
+
"step": 2400
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.061005770816158,
|
| 182 |
+
"grad_norm": 0.507649838924408,
|
| 183 |
+
"learning_rate": 2.8048264418000297e-06,
|
| 184 |
+
"loss": 0.029734506607055664,
|
| 185 |
+
"step": 2500
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.1434460016488046,
|
| 189 |
+
"grad_norm": 0.648606538772583,
|
| 190 |
+
"learning_rate": 2.7841720809651287e-06,
|
| 191 |
+
"loss": 0.02952629327774048,
|
| 192 |
+
"step": 2600
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 2.225886232481451,
|
| 196 |
+
"grad_norm": 6.885851860046387,
|
| 197 |
+
"learning_rate": 2.762563244281172e-06,
|
| 198 |
+
"loss": 0.033362927436828616,
|
| 199 |
+
"step": 2700
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 2.3083264633140974,
|
| 203 |
+
"grad_norm": 0.7185697555541992,
|
| 204 |
+
"learning_rate": 2.7400159927675868e-06,
|
| 205 |
+
"loss": 0.03310190677642822,
|
| 206 |
+
"step": 2800
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 2.390766694146744,
|
| 210 |
+
"grad_norm": 0.9038472771644592,
|
| 211 |
+
"learning_rate": 2.7165470849315476e-06,
|
| 212 |
+
"loss": 0.027033202648162842,
|
| 213 |
+
"step": 2900
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 2.4732069249793898,
|
| 217 |
+
"grad_norm": 1.6981157064437866,
|
| 218 |
+
"learning_rate": 2.692173964312021e-06,
|
| 219 |
+
"loss": 0.03098677396774292,
|
| 220 |
+
"step": 3000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 2.555647155812036,
|
| 224 |
+
"grad_norm": 0.38965705037117004,
|
| 225 |
+
"learning_rate": 2.666914746514651e-06,
|
| 226 |
+
"loss": 0.025331289768218995,
|
| 227 |
+
"step": 3100
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 2.6380873866446826,
|
| 231 |
+
"grad_norm": 0.9481067061424255,
|
| 232 |
+
"learning_rate": 2.6407882057471234e-06,
|
| 233 |
+
"loss": 0.02945547103881836,
|
| 234 |
+
"step": 3200
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 2.720527617477329,
|
| 238 |
+
"grad_norm": 0.990297794342041,
|
| 239 |
+
"learning_rate": 2.61381376086502e-06,
|
| 240 |
+
"loss": 0.025877423286437988,
|
| 241 |
+
"step": 3300
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 2.8029678483099754,
|
| 245 |
+
"grad_norm": 0.7205927968025208,
|
| 246 |
+
"learning_rate": 2.586011460938527e-06,
|
| 247 |
+
"loss": 0.0348543381690979,
|
| 248 |
+
"step": 3400
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 2.8854080791426218,
|
| 252 |
+
"grad_norm": 1.2724188566207886,
|
| 253 |
+
"learning_rate": 2.5574019703507284e-06,
|
| 254 |
+
"loss": 0.025783424377441407,
|
| 255 |
+
"step": 3500
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 2.9678483099752677,
|
| 259 |
+
"grad_norm": 0.3951258063316345,
|
| 260 |
+
"learning_rate": 2.528006553438566e-06,
|
| 261 |
+
"loss": 0.024769763946533203,
|
| 262 |
+
"step": 3600
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 3.050288540807914,
|
| 266 |
+
"grad_norm": 0.7116732001304626,
|
| 267 |
+
"learning_rate": 2.4978470586878702e-06,
|
| 268 |
+
"loss": 0.0287685227394104,
|
| 269 |
+
"step": 3700
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 3.1327287716405605,
|
| 273 |
+
"grad_norm": 0.9427194595336914,
|
| 274 |
+
"learning_rate": 2.4669459024942216e-06,
|
| 275 |
+
"loss": 0.03142688512802124,
|
| 276 |
+
"step": 3800
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 3.215169002473207,
|
| 280 |
+
"grad_norm": 4.737741470336914,
|
| 281 |
+
"learning_rate": 2.4353260525017004e-06,
|
| 282 |
+
"loss": 0.025456478595733644,
|
| 283 |
+
"step": 3900
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 3.2976092333058533,
|
| 287 |
+
"grad_norm": 1.237444519996643,
|
| 288 |
+
"learning_rate": 2.4030110105319206e-06,
|
| 289 |
+
"loss": 0.02399829149246216,
|
| 290 |
+
"step": 4000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 3.3800494641384997,
|
| 294 |
+
"grad_norm": 4.982211589813232,
|
| 295 |
+
"learning_rate": 2.370024795116028e-06,
|
| 296 |
+
"loss": 0.03101783275604248,
|
| 297 |
+
"step": 4100
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 3.462489694971146,
|
| 301 |
+
"grad_norm": 5.987162113189697,
|
| 302 |
+
"learning_rate": 2.336391923642643e-06,
|
| 303 |
+
"loss": 0.023696184158325195,
|
| 304 |
+
"step": 4200
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 3.5449299258037925,
|
| 308 |
+
"grad_norm": 0.5781798362731934,
|
| 309 |
+
"learning_rate": 2.302137394135031e-06,
|
| 310 |
+
"loss": 0.026220085620880126,
|
| 311 |
+
"step": 4300
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 3.6273701566364385,
|
| 315 |
+
"grad_norm": 1.0097777843475342,
|
| 316 |
+
"learning_rate": 2.267286666671027e-06,
|
| 317 |
+
"loss": 0.029556829929351807,
|
| 318 |
+
"step": 4400
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 3.709810387469085,
|
| 322 |
+
"grad_norm": 1.0615909099578857,
|
| 323 |
+
"learning_rate": 2.2318656444595387e-06,
|
| 324 |
+
"loss": 0.02913331747055054,
|
| 325 |
+
"step": 4500
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 3.7922506183017313,
|
| 329 |
+
"grad_norm": 0.6333794593811035,
|
| 330 |
+
"learning_rate": 2.1959006545876846e-06,
|
| 331 |
+
"loss": 0.025029284954071043,
|
| 332 |
+
"step": 4600
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 3.8746908491343777,
|
| 336 |
+
"grad_norm": 0.660271167755127,
|
| 337 |
+
"learning_rate": 2.1594184284528776e-06,
|
| 338 |
+
"loss": 0.031456120014190674,
|
| 339 |
+
"step": 4700
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 3.957131079967024,
|
| 343 |
+
"grad_norm": 1.3678995370864868,
|
| 344 |
+
"learning_rate": 2.1224460818944066e-06,
|
| 345 |
+
"loss": 0.03159121990203857,
|
| 346 |
+
"step": 4800
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 4.03957131079967,
|
| 350 |
+
"grad_norm": 1.6306558847427368,
|
| 351 |
+
"learning_rate": 2.0850110950392694e-06,
|
| 352 |
+
"loss": 0.023191912174224852,
|
| 353 |
+
"step": 4900
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 4.122011541632316,
|
| 357 |
+
"grad_norm": 1.2425507307052612,
|
| 358 |
+
"learning_rate": 2.047141291877252e-06,
|
| 359 |
+
"loss": 0.030113303661346437,
|
| 360 |
+
"step": 5000
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 4.204451772464963,
|
| 364 |
+
"grad_norm": 0.47728583216667175,
|
| 365 |
+
"learning_rate": 2.00886481958042e-06,
|
| 366 |
+
"loss": 0.024720721244812012,
|
| 367 |
+
"step": 5100
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 4.286892003297609,
|
| 371 |
+
"grad_norm": 7.698798179626465,
|
| 372 |
+
"learning_rate": 1.97021012758241e-06,
|
| 373 |
+
"loss": 0.027791497707366945,
|
| 374 |
+
"step": 5200
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 4.369332234130256,
|
| 378 |
+
"grad_norm": 0.9410794973373413,
|
| 379 |
+
"learning_rate": 1.9312059464330545e-06,
|
| 380 |
+
"loss": 0.022262310981750487,
|
| 381 |
+
"step": 5300
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 4.451772464962902,
|
| 385 |
+
"grad_norm": 6.470306873321533,
|
| 386 |
+
"learning_rate": 1.8918812664440643e-06,
|
| 387 |
+
"loss": 0.023623311519622804,
|
| 388 |
+
"step": 5400
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 4.534212695795548,
|
| 392 |
+
"grad_norm": 0.5035800933837891,
|
| 393 |
+
"learning_rate": 1.8522653161416466e-06,
|
| 394 |
+
"loss": 0.02535334587097168,
|
| 395 |
+
"step": 5500
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 4.616652926628195,
|
| 399 |
+
"grad_norm": 2.6831247806549072,
|
| 400 |
+
"learning_rate": 1.8123875405420576e-06,
|
| 401 |
+
"loss": 0.022764217853546143,
|
| 402 |
+
"step": 5600
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 4.699093157460841,
|
| 406 |
+
"grad_norm": 0.7162504196166992,
|
| 407 |
+
"learning_rate": 1.7722775792662551e-06,
|
| 408 |
+
"loss": 0.027024078369140624,
|
| 409 |
+
"step": 5700
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 4.781533388293488,
|
| 413 |
+
"grad_norm": 0.8824426531791687,
|
| 414 |
+
"learning_rate": 1.7319652445099035e-06,
|
| 415 |
+
"loss": 0.02422706365585327,
|
| 416 |
+
"step": 5800
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 4.863973619126133,
|
| 420 |
+
"grad_norm": 0.9424160122871399,
|
| 421 |
+
"learning_rate": 1.6914804988851126e-06,
|
| 422 |
+
"loss": 0.030813112258911132,
|
| 423 |
+
"step": 5900
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 4.9464138499587795,
|
| 427 |
+
"grad_norm": 0.8058213591575623,
|
| 428 |
+
"learning_rate": 1.6508534331503764e-06,
|
| 429 |
+
"loss": 0.0320208215713501,
|
| 430 |
+
"step": 6000
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 5.028854080791426,
|
| 434 |
+
"grad_norm": 0.6557570099830627,
|
| 435 |
+
"learning_rate": 1.610114243845269e-06,
|
| 436 |
+
"loss": 0.032469592094421386,
|
| 437 |
+
"step": 6100
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 5.111294311624072,
|
| 441 |
+
"grad_norm": 0.6476128697395325,
|
| 442 |
+
"learning_rate": 1.569293210846512e-06,
|
| 443 |
+
"loss": 0.028678691387176512,
|
| 444 |
+
"step": 6200
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 5.193734542456719,
|
| 448 |
+
"grad_norm": 17.67458724975586,
|
| 449 |
+
"learning_rate": 1.5284206748621066e-06,
|
| 450 |
+
"loss": 0.022574949264526366,
|
| 451 |
+
"step": 6300
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 5.276174773289365,
|
| 455 |
+
"grad_norm": 0.6299949884414673,
|
| 456 |
+
"learning_rate": 1.4875270148802465e-06,
|
| 457 |
+
"loss": 0.028071470260620117,
|
| 458 |
+
"step": 6400
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 5.3586150041220115,
|
| 462 |
+
"grad_norm": 0.17750702798366547,
|
| 463 |
+
"learning_rate": 1.4466426255897827e-06,
|
| 464 |
+
"loss": 0.02460844039916992,
|
| 465 |
+
"step": 6500
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 5.441055234954658,
|
| 469 |
+
"grad_norm": 3.046633720397949,
|
| 470 |
+
"learning_rate": 1.4057978947890166e-06,
|
| 471 |
+
"loss": 0.028238294124603273,
|
| 472 |
+
"step": 6600
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 5.523495465787304,
|
| 476 |
+
"grad_norm": 1.1183128356933594,
|
| 477 |
+
"learning_rate": 1.3650231807996163e-06,
|
| 478 |
+
"loss": 0.022344279289245605,
|
| 479 |
+
"step": 6700
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"epoch": 5.605935696619951,
|
| 483 |
+
"grad_norm": 0.39727112650871277,
|
| 484 |
+
"learning_rate": 1.3243487899024401e-06,
|
| 485 |
+
"loss": 0.024733517169952392,
|
| 486 |
+
"step": 6800
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"epoch": 5.688375927452597,
|
| 490 |
+
"grad_norm": 1.1431382894515991,
|
| 491 |
+
"learning_rate": 1.2838049538120375e-06,
|
| 492 |
+
"loss": 0.024790296554565428,
|
| 493 |
+
"step": 6900
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"epoch": 5.7708161582852435,
|
| 497 |
+
"grad_norm": 0.270554780960083,
|
| 498 |
+
"learning_rate": 1.243421807206581e-06,
|
| 499 |
+
"loss": 0.023978376388549806,
|
| 500 |
+
"step": 7000
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"epoch": 5.85325638911789,
|
| 504 |
+
"grad_norm": 2.6371097564697266,
|
| 505 |
+
"learning_rate": 1.2032293653299107e-06,
|
| 506 |
+
"loss": 0.024820666313171386,
|
| 507 |
+
"step": 7100
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"epoch": 5.935696619950535,
|
| 511 |
+
"grad_norm": 0.4371972680091858,
|
| 512 |
+
"learning_rate": 1.1632575016823583e-06,
|
| 513 |
+
"loss": 0.023902339935302733,
|
| 514 |
+
"step": 7200
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 6.018136850783182,
|
| 518 |
+
"grad_norm": 0.3910028040409088,
|
| 519 |
+
"learning_rate": 1.1235359258169183e-06,
|
| 520 |
+
"loss": 0.023613801002502443,
|
| 521 |
+
"step": 7300
|
| 522 |
+
},
|
| 523 |
+
{
|
| 524 |
+
"epoch": 6.100577081615828,
|
| 525 |
+
"grad_norm": 0.5897251963615417,
|
| 526 |
+
"learning_rate": 1.0840941612572765e-06,
|
| 527 |
+
"loss": 0.025773169994354247,
|
| 528 |
+
"step": 7400
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"epoch": 6.183017312448475,
|
| 532 |
+
"grad_norm": 7.796975135803223,
|
| 533 |
+
"learning_rate": 1.0449615235541093e-06,
|
| 534 |
+
"loss": 0.028378984928131103,
|
| 535 |
+
"step": 7500
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"epoch": 6.265457543281121,
|
| 539 |
+
"grad_norm": 2.8554458618164062,
|
| 540 |
+
"learning_rate": 1.0061670984959582e-06,
|
| 541 |
+
"loss": 0.023146660327911378,
|
| 542 |
+
"step": 7600
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"epoch": 6.347897774113767,
|
| 546 |
+
"grad_norm": 1.5360387563705444,
|
| 547 |
+
"learning_rate": 9.677397204908788e-07,
|
| 548 |
+
"loss": 0.025156841278076172,
|
| 549 |
+
"step": 7700
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"epoch": 6.430338004946414,
|
| 553 |
+
"grad_norm": 0.2781471312046051,
|
| 554 |
+
"learning_rate": 9.297079511349307e-07,
|
| 555 |
+
"loss": 0.023876771926879883,
|
| 556 |
+
"step": 7800
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"epoch": 6.51277823577906,
|
| 560 |
+
"grad_norm": 0.5193030834197998,
|
| 561 |
+
"learning_rate": 8.921000579834404e-07,
|
| 562 |
+
"loss": 0.021548757553100584,
|
| 563 |
+
"step": 7900
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"epoch": 6.595218466611707,
|
| 567 |
+
"grad_norm": 1.3652324676513672,
|
| 568 |
+
"learning_rate": 8.549439935408109e-07,
|
| 569 |
+
"loss": 0.027704770565032958,
|
| 570 |
+
"step": 8000
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 6.677658697444353,
|
| 574 |
+
"grad_norm": 0.567807137966156,
|
| 575 |
+
"learning_rate": 8.182673744844971e-07,
|
| 576 |
+
"loss": 0.024193654060363768,
|
| 577 |
+
"step": 8100
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 6.760098928276999,
|
| 581 |
+
"grad_norm": 1.0867784023284912,
|
| 582 |
+
"learning_rate": 7.820974611385887e-07,
|
| 583 |
+
"loss": 0.024752085208892823,
|
| 584 |
+
"step": 8200
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"epoch": 6.842539159109646,
|
| 588 |
+
"grad_norm": 12.679245948791504,
|
| 589 |
+
"learning_rate": 7.464611372122565e-07,
|
| 590 |
+
"loss": 0.02242401838302612,
|
| 591 |
+
"step": 8300
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"epoch": 6.924979389942292,
|
| 595 |
+
"grad_norm": 1.22047758102417,
|
| 596 |
+
"learning_rate": 7.113848898181258e-07,
|
| 597 |
+
"loss": 0.031413540840148926,
|
| 598 |
+
"step": 8400
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"epoch": 7.007419620774938,
|
| 602 |
+
"grad_norm": 36.422203063964844,
|
| 603 |
+
"learning_rate": 6.76894789785417e-07,
|
| 604 |
+
"loss": 0.0245809006690979,
|
| 605 |
+
"step": 8500
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"epoch": 7.089859851607584,
|
| 609 |
+
"grad_norm": 1.0370782613754272,
|
| 610 |
+
"learning_rate": 6.430164722825002e-07,
|
| 611 |
+
"loss": 0.02020338773727417,
|
| 612 |
+
"step": 8600
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"epoch": 7.1723000824402305,
|
| 616 |
+
"grad_norm": 1.1046611070632935,
|
| 617 |
+
"learning_rate": 6.097751177632599e-07,
|
| 618 |
+
"loss": 0.02843546390533447,
|
| 619 |
+
"step": 8700
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"epoch": 7.254740313272877,
|
| 623 |
+
"grad_norm": 1.1630239486694336,
|
| 624 |
+
"learning_rate": 5.77195433251426e-07,
|
| 625 |
+
"loss": 0.023567092418670655,
|
| 626 |
+
"step": 8800
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 7.337180544105523,
|
| 630 |
+
"grad_norm": 0.6131232976913452,
|
| 631 |
+
"learning_rate": 5.453016339767939e-07,
|
| 632 |
+
"loss": 0.02418841600418091,
|
| 633 |
+
"step": 8900
|
| 634 |
+
},
|
| 635 |
+
{
|
| 636 |
+
"epoch": 7.41962077493817,
|
| 637 |
+
"grad_norm": 1.4645527601242065,
|
| 638 |
+
"learning_rate": 5.141174253769704e-07,
|
| 639 |
+
"loss": 0.0257061243057251,
|
| 640 |
+
"step": 9000
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 7.502061005770816,
|
| 644 |
+
"grad_norm": 0.9117152690887451,
|
| 645 |
+
"learning_rate": 4.836659854780321e-07,
|
| 646 |
+
"loss": 0.02661696672439575,
|
| 647 |
+
"step": 9100
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"epoch": 7.5845012366034625,
|
| 651 |
+
"grad_norm": 1.2171125411987305,
|
| 652 |
+
"learning_rate": 4.5396994766719033e-07,
|
| 653 |
+
"loss": 0.02204680919647217,
|
| 654 |
+
"step": 9200
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"epoch": 7.666941467436109,
|
| 658 |
+
"grad_norm": 0.6495709419250488,
|
| 659 |
+
"learning_rate": 4.2505138387025753e-07,
|
| 660 |
+
"loss": 0.029301033020019532,
|
| 661 |
+
"step": 9300
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"epoch": 7.749381698268755,
|
| 665 |
+
"grad_norm": 0.6626441478729248,
|
| 666 |
+
"learning_rate": 3.969317881464367e-07,
|
| 667 |
+
"loss": 0.021526577472686766,
|
| 668 |
+
"step": 9400
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"epoch": 7.831821929101402,
|
| 672 |
+
"grad_norm": 16.135684967041016,
|
| 673 |
+
"learning_rate": 3.696320607126065e-07,
|
| 674 |
+
"loss": 0.026122121810913085,
|
| 675 |
+
"step": 9500
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"epoch": 7.914262159934048,
|
| 679 |
+
"grad_norm": 0.8759784698486328,
|
| 680 |
+
"learning_rate": 3.4317249240899686e-07,
|
| 681 |
+
"loss": 0.025852499008178712,
|
| 682 |
+
"step": 9600
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 7.9967023907666945,
|
| 686 |
+
"grad_norm": 0.800774872303009,
|
| 687 |
+
"learning_rate": 3.175727496177826e-07,
|
| 688 |
+
"loss": 0.02209474563598633,
|
| 689 |
+
"step": 9700
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"epoch": 8.07914262159934,
|
| 693 |
+
"grad_norm": 0.8495875000953674,
|
| 694 |
+
"learning_rate": 2.928518596458161e-07,
|
| 695 |
+
"loss": 0.02649038314819336,
|
| 696 |
+
"step": 9800
|
| 697 |
+
},
|
| 698 |
+
{
|
| 699 |
+
"epoch": 8.161582852431987,
|
| 700 |
+
"grad_norm": 0.7733114361763,
|
| 701 |
+
"learning_rate": 2.690281965823608e-07,
|
| 702 |
+
"loss": 0.024643311500549315,
|
| 703 |
+
"step": 9900
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 8.244023083264633,
|
| 707 |
+
"grad_norm": 12.20607852935791,
|
| 708 |
+
"learning_rate": 2.461194676423352e-07,
|
| 709 |
+
"loss": 0.026993231773376467,
|
| 710 |
+
"step": 10000
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"epoch": 8.32646331409728,
|
| 714 |
+
"grad_norm": 26.955886840820312,
|
| 715 |
+
"learning_rate": 2.2414270000521946e-07,
|
| 716 |
+
"loss": 0.02212390899658203,
|
| 717 |
+
"step": 10100
|
| 718 |
+
},
|
| 719 |
+
{
|
| 720 |
+
"epoch": 8.408903544929926,
|
| 721 |
+
"grad_norm": 0.8895039558410645,
|
| 722 |
+
"learning_rate": 2.031142281594066e-07,
|
| 723 |
+
"loss": 0.02063568115234375,
|
| 724 |
+
"step": 10200
|
| 725 |
+
},
|
| 726 |
+
{
|
| 727 |
+
"epoch": 8.491343775762573,
|
| 728 |
+
"grad_norm": 0.5100795030593872,
|
| 729 |
+
"learning_rate": 1.8304968176140035e-07,
|
| 730 |
+
"loss": 0.024049296379089355,
|
| 731 |
+
"step": 10300
|
| 732 |
+
},
|
| 733 |
+
{
|
| 734 |
+
"epoch": 8.573784006595218,
|
| 735 |
+
"grad_norm": 1.8475189208984375,
|
| 736 |
+
"learning_rate": 1.6396397401889679e-07,
|
| 737 |
+
"loss": 0.02524357557296753,
|
| 738 |
+
"step": 10400
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"epoch": 8.656224237427864,
|
| 742 |
+
"grad_norm": 0.6764451861381531,
|
| 743 |
+
"learning_rate": 1.4587129060636367e-07,
|
| 744 |
+
"loss": 0.028507494926452638,
|
| 745 |
+
"step": 10500
|
| 746 |
+
},
|
| 747 |
+
{
|
| 748 |
+
"epoch": 8.738664468260511,
|
| 749 |
+
"grad_norm": 1.1165720224380493,
|
| 750 |
+
"learning_rate": 1.2878507912137682e-07,
|
| 751 |
+
"loss": 0.02533757209777832,
|
| 752 |
+
"step": 10600
|
| 753 |
+
},
|
| 754 |
+
{
|
| 755 |
+
"epoch": 8.821104699093157,
|
| 756 |
+
"grad_norm": 0.7385006546974182,
|
| 757 |
+
"learning_rate": 1.1271803908953354e-07,
|
| 758 |
+
"loss": 0.022719991207122803,
|
| 759 |
+
"step": 10700
|
| 760 |
+
},
|
| 761 |
+
{
|
| 762 |
+
"epoch": 8.903544929925804,
|
| 763 |
+
"grad_norm": 8.316965103149414,
|
| 764 |
+
"learning_rate": 9.76821125253844e-08,
|
| 765 |
+
"loss": 0.021582581996917725,
|
| 766 |
+
"step": 10800
|
| 767 |
+
},
|
| 768 |
+
{
|
| 769 |
+
"epoch": 8.98598516075845,
|
| 770 |
+
"grad_norm": 1.0215933322906494,
|
| 771 |
+
"learning_rate": 8.368847505639149e-08,
|
| 772 |
+
"loss": 0.02419511079788208,
|
| 773 |
+
"step": 10900
|
| 774 |
+
},
|
| 775 |
+
{
|
| 776 |
+
"epoch": 9.068425391591097,
|
| 777 |
+
"grad_norm": 8.314555168151855,
|
| 778 |
+
"learning_rate": 7.074752761651227e-08,
|
| 779 |
+
"loss": 0.026871368885040284,
|
| 780 |
+
"step": 11000
|
| 781 |
+
},
|
| 782 |
+
{
|
| 783 |
+
"epoch": 9.150865622423742,
|
| 784 |
+
"grad_norm": 7.914130687713623,
|
| 785 |
+
"learning_rate": 5.8868888715585325e-08,
|
| 786 |
+
"loss": 0.02440465450286865,
|
| 787 |
+
"step": 11100
|
| 788 |
+
},
|
| 789 |
+
{
|
| 790 |
+
"epoch": 9.23330585325639,
|
| 791 |
+
"grad_norm": 17.640907287597656,
|
| 792 |
+
"learning_rate": 4.806138729026111e-08,
|
| 793 |
+
"loss": 0.020871245861053468,
|
| 794 |
+
"step": 11200
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"epoch": 9.315746084089035,
|
| 798 |
+
"grad_norm": 0.4218542277812958,
|
| 799 |
+
"learning_rate": 3.833305614179061e-08,
|
| 800 |
+
"loss": 0.024367706775665285,
|
| 801 |
+
"step": 11300
|
| 802 |
+
},
|
| 803 |
+
{
|
| 804 |
+
"epoch": 9.398186314921682,
|
| 805 |
+
"grad_norm": 0.497744619846344,
|
| 806 |
+
"learning_rate": 2.9691125965554454e-08,
|
| 807 |
+
"loss": 0.027901573181152342,
|
| 808 |
+
"step": 11400
|
| 809 |
+
},
|
| 810 |
+
{
|
| 811 |
+
"epoch": 9.480626545754328,
|
| 812 |
+
"grad_norm": 0.6788234710693359,
|
| 813 |
+
"learning_rate": 2.214201997676152e-08,
|
| 814 |
+
"loss": 0.021528902053833007,
|
| 815 |
+
"step": 11500
|
| 816 |
+
},
|
| 817 |
+
{
|
| 818 |
+
"epoch": 9.563066776586975,
|
| 819 |
+
"grad_norm": 0.21850308775901794,
|
| 820 |
+
"learning_rate": 1.5691349136322697e-08,
|
| 821 |
+
"loss": 0.02456042289733887,
|
| 822 |
+
"step": 11600
|
| 823 |
+
},
|
| 824 |
+
{
|
| 825 |
+
"epoch": 9.64550700741962,
|
| 826 |
+
"grad_norm": 0.7916850447654724,
|
| 827 |
+
"learning_rate": 1.0343907980436218e-08,
|
| 828 |
+
"loss": 0.019252289533615113,
|
| 829 |
+
"step": 11700
|
| 830 |
+
},
|
| 831 |
+
{
|
| 832 |
+
"epoch": 9.727947238252266,
|
| 833 |
+
"grad_norm": 0.3894334137439728,
|
| 834 |
+
"learning_rate": 6.103671056994387e-09,
|
| 835 |
+
"loss": 0.020840485095977784,
|
| 836 |
+
"step": 11800
|
| 837 |
+
},
|
| 838 |
+
{
|
| 839 |
+
"epoch": 9.810387469084914,
|
| 840 |
+
"grad_norm": 1.1665902137756348,
|
| 841 |
+
"learning_rate": 2.9737899714539775e-09,
|
| 842 |
+
"loss": 0.030273616313934326,
|
| 843 |
+
"step": 11900
|
| 844 |
+
},
|
| 845 |
+
{
|
| 846 |
+
"epoch": 9.892827699917559,
|
| 847 |
+
"grad_norm": 0.6720395088195801,
|
| 848 |
+
"learning_rate": 9.565910443685155e-10,
|
| 849 |
+
"loss": 0.02424750566482544,
|
| 850 |
+
"step": 12000
|
| 851 |
+
}
|
| 852 |
+
],
|
| 853 |
+
"logging_steps": 100,
|
| 854 |
+
"max_steps": 12130,
|
| 855 |
+
"num_input_tokens_seen": 0,
|
| 856 |
+
"num_train_epochs": 10,
|
| 857 |
+
"save_steps": 1000,
|
| 858 |
+
"stateful_callbacks": {
|
| 859 |
+
"TrainerControl": {
|
| 860 |
+
"args": {
|
| 861 |
+
"should_epoch_stop": false,
|
| 862 |
+
"should_evaluate": false,
|
| 863 |
+
"should_log": false,
|
| 864 |
+
"should_save": true,
|
| 865 |
+
"should_training_stop": false
|
| 866 |
+
},
|
| 867 |
+
"attributes": {}
|
| 868 |
+
}
|
| 869 |
+
},
|
| 870 |
+
"total_flos": 7807792431338136.0,
|
| 871 |
+
"train_batch_size": 8,
|
| 872 |
+
"trial_name": null,
|
| 873 |
+
"trial_params": null
|
| 874 |
+
}
|
training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5baf6aa343fd5badad92c7c7d46ab369c567b17b3a2aaee0ea3b0e8bbd731d12
|
| 3 |
+
size 5329
|