Ihor commited on
Commit
4dac1e9
·
verified ·
1 Parent(s): 75b1d66

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture_type": "uni-encoder",
3
+ "architectures": [
4
+ "GLiClassModel"
5
+ ],
6
+ "class_token_index": 50368,
7
+ "class_token_pooling": "first",
8
+ "contrastive_loss_coef": 0.0,
9
+ "cross_encoder_config": null,
10
+ "dropout": 0.1,
11
+ "dtype": "float32",
12
+ "embed_class_token": true,
13
+ "encoder_config": {
14
+ "_attn_implementation_autoset": false,
15
+ "_name_or_path": "jhu-clsp/ettin-encoder-32m",
16
+ "add_cross_attention": false,
17
+ "architectures": [
18
+ "ModernBertForMaskedLM"
19
+ ],
20
+ "attention_bias": false,
21
+ "attention_dropout": 0.0,
22
+ "bos_token_id": 50281,
23
+ "causal_mask": false,
24
+ "classifier_activation": "gelu",
25
+ "classifier_bias": false,
26
+ "classifier_dropout": 0.0,
27
+ "classifier_pooling": "mean",
28
+ "cls_token_id": 50281,
29
+ "cross_attention_hidden_size": null,
30
+ "decoder_bias": true,
31
+ "decoder_start_token_id": null,
32
+ "deterministic_flash_attn": false,
33
+ "dtype": "float32",
34
+ "embedding_dropout": 0.0,
35
+ "eos_token_id": 50282,
36
+ "finetuning_task": null,
37
+ "global_attn_every_n_layers": 3,
38
+ "gradient_checkpointing": false,
39
+ "hidden_activation": "gelu",
40
+ "hidden_size": 384,
41
+ "initializer_cutoff_factor": 2.0,
42
+ "initializer_range": 0.02,
43
+ "intermediate_size": 576,
44
+ "is_causal": false,
45
+ "is_decoder": false,
46
+ "layer_norm_eps": 1e-05,
47
+ "layer_types": [
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "full_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "full_attention"
58
+ ],
59
+ "local_attention": 128,
60
+ "max_position_embeddings": 7999,
61
+ "mlp_bias": false,
62
+ "mlp_dropout": 0.0,
63
+ "model_type": "modernbert",
64
+ "norm_bias": false,
65
+ "norm_eps": 1e-05,
66
+ "num_attention_heads": 6,
67
+ "num_hidden_layers": 10,
68
+ "pad_token_id": 50283,
69
+ "position_embedding_type": "sans_pos",
70
+ "prefix": null,
71
+ "pruned_heads": {},
72
+ "repad_logits_with_grad": false,
73
+ "rope_parameters": {
74
+ "full_attention": {
75
+ "rope_theta": 160000.0,
76
+ "rope_type": "default"
77
+ },
78
+ "sliding_attention": {
79
+ "rope_theta": 160000.0,
80
+ "rope_type": "default"
81
+ }
82
+ },
83
+ "sep_token_id": 50282,
84
+ "sparse_pred_ignore_index": -100,
85
+ "sparse_prediction": false,
86
+ "task_specific_params": null,
87
+ "tf_legacy_loss": false,
88
+ "tie_encoder_decoder": false,
89
+ "tie_word_embeddings": true,
90
+ "tokenizer_class": null,
91
+ "torchscript": false,
92
+ "use_bfloat16": false,
93
+ "vocab_size": 50370
94
+ },
95
+ "encoder_layer_id": -1,
96
+ "encoder_model_name": "jhu-clsp/ettin-encoder-32m",
97
+ "example_token_index": 50372,
98
+ "extract_text_features": false,
99
+ "focal_loss_alpha": 0.7,
100
+ "focal_loss_gamma": -1,
101
+ "focal_loss_reduction": "none",
102
+ "hidden_size": 384,
103
+ "ignore_index": -100,
104
+ "initializer_range": 0.03,
105
+ "label_model_config": null,
106
+ "label_model_name": null,
107
+ "layer_wise": false,
108
+ "logit_scale_init_value": 2.6592,
109
+ "max_labels_alloc": "dynamic",
110
+ "max_num_classes": 25,
111
+ "model_type": "GLiClass",
112
+ "normalize_features": false,
113
+ "pad_token_id": 50283,
114
+ "pooling_strategy": "first",
115
+ "problem_type": "multi_label_classification",
116
+ "projector_hidden_act": "gelu",
117
+ "prompt_first": true,
118
+ "scorer_attn_dropout": 0.1,
119
+ "scorer_mlp_hidden_size": 1024,
120
+ "scorer_num_heads": 16,
121
+ "scorer_type": "mlp",
122
+ "shuffle_labels": true,
123
+ "squeeze_layers": false,
124
+ "text_token_index": 50369,
125
+ "transformers_version": "5.1.0",
126
+ "use_cache": false,
127
+ "use_lstm": false,
128
+ "use_segment_embeddings": false,
129
+ "vocab_size": 50370
130
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:979d72fa0026def128a9fdfccdb763c3f226d5df87883ecc205dac45be53c893
3
+ size 130829312
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "clean_up_tokenization_spaces": true,
5
+ "cls_token": "[CLS]",
6
+ "is_local": true,
7
+ "mask_token": "[MASK]",
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 8192,
13
+ "pad_token": "[PAD]",
14
+ "sep_token": "[SEP]",
15
+ "tokenizer_class": "TokenizersBackend",
16
+ "unk_token": "[UNK]"
17
+ }
trainer_state.json ADDED
@@ -0,0 +1,874 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 9.892827699917559,
6
+ "eval_steps": 500,
7
+ "global_step": 12000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.08244023083264633,
14
+ "grad_norm": 1.0573325157165527,
15
+ "learning_rate": 4.892915980230642e-07,
16
+ "loss": 0.05755834102630615,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.16488046166529266,
21
+ "grad_norm": 10.260250091552734,
22
+ "learning_rate": 9.835255354200989e-07,
23
+ "loss": 0.04523322582244873,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.247320692497939,
28
+ "grad_norm": 0.5175366997718811,
29
+ "learning_rate": 1.4777594728171335e-06,
30
+ "loss": 0.03658797979354858,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.3297609233305853,
35
+ "grad_norm": 5.0066118240356445,
36
+ "learning_rate": 1.971993410214168e-06,
37
+ "loss": 0.04241968631744385,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.41220115416323166,
42
+ "grad_norm": 16.185205459594727,
43
+ "learning_rate": 2.466227347611203e-06,
44
+ "loss": 0.03233745574951172,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.494641384995878,
49
+ "grad_norm": 0.4887259900569916,
50
+ "learning_rate": 2.9604612850082373e-06,
51
+ "loss": 0.04120903968811035,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.5770816158285244,
56
+ "grad_norm": 1.360079050064087,
57
+ "learning_rate": 2.999528173020731e-06,
58
+ "loss": 0.03448781490325928,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.6595218466611706,
63
+ "grad_norm": 1.5775821208953857,
64
+ "learning_rate": 2.997945372145348e-06,
65
+ "loss": 0.03620944499969483,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.7419620774938169,
70
+ "grad_norm": 37.682884216308594,
71
+ "learning_rate": 2.9952492059335665e-06,
72
+ "loss": 0.033068180084228516,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.8244023083264633,
77
+ "grad_norm": 0.5983888506889343,
78
+ "learning_rate": 2.99144167834231e-06,
79
+ "loss": 0.03487896919250488,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.9068425391591096,
84
+ "grad_norm": 0.4312576651573181,
85
+ "learning_rate": 2.986525619360788e-06,
86
+ "loss": 0.032608640193939206,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.989282769991756,
91
+ "grad_norm": 1.5949877500534058,
92
+ "learning_rate": 2.980504682907069e-06,
93
+ "loss": 0.03462482452392578,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 1.0717230008244023,
98
+ "grad_norm": 0.5395241379737854,
99
+ "learning_rate": 2.9733833441122652e-06,
100
+ "loss": 0.029383325576782228,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 1.1541632316570487,
105
+ "grad_norm": 7.3991804122924805,
106
+ "learning_rate": 2.9651668959943407e-06,
107
+ "loss": 0.03298326969146728,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 1.2366034624896949,
112
+ "grad_norm": 3.2689311504364014,
113
+ "learning_rate": 2.955861445524012e-06,
114
+ "loss": 0.03119053363800049,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 1.3190436933223413,
119
+ "grad_norm": 1.3022229671478271,
120
+ "learning_rate": 2.9454739090856716e-06,
121
+ "loss": 0.03160511493682861,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 1.4014839241549877,
126
+ "grad_norm": 1.545177936553955,
127
+ "learning_rate": 2.9340120073367064e-06,
128
+ "loss": 0.03241936206817627,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 1.4839241549876339,
133
+ "grad_norm": 0.3807108998298645,
134
+ "learning_rate": 2.921484259469025e-06,
135
+ "loss": 0.028808791637420655,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 1.5663643858202803,
140
+ "grad_norm": 1.6713629961013794,
141
+ "learning_rate": 2.907899976877075e-06,
142
+ "loss": 0.026836631298065187,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 1.6488046166529267,
147
+ "grad_norm": 0.8516013622283936,
148
+ "learning_rate": 2.8932692562370356e-06,
149
+ "loss": 0.02777437925338745,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 1.731244847485573,
154
+ "grad_norm": 6.933730602264404,
155
+ "learning_rate": 2.8776029720023492e-06,
156
+ "loss": 0.03317852735519409,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 1.8136850783182195,
161
+ "grad_norm": 2.6626434326171875,
162
+ "learning_rate": 2.8609127683211535e-06,
163
+ "loss": 0.030532124042510985,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 1.8961253091508656,
168
+ "grad_norm": 0.9223625659942627,
169
+ "learning_rate": 2.8432110503816364e-06,
170
+ "loss": 0.028622543811798094,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 1.9785655399835118,
175
+ "grad_norm": 1.147865891456604,
176
+ "learning_rate": 2.824510975191734e-06,
177
+ "loss": 0.0330674409866333,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 2.061005770816158,
182
+ "grad_norm": 0.507649838924408,
183
+ "learning_rate": 2.8048264418000297e-06,
184
+ "loss": 0.029734506607055664,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 2.1434460016488046,
189
+ "grad_norm": 0.648606538772583,
190
+ "learning_rate": 2.7841720809651287e-06,
191
+ "loss": 0.02952629327774048,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 2.225886232481451,
196
+ "grad_norm": 6.885851860046387,
197
+ "learning_rate": 2.762563244281172e-06,
198
+ "loss": 0.033362927436828616,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 2.3083264633140974,
203
+ "grad_norm": 0.7185697555541992,
204
+ "learning_rate": 2.7400159927675868e-06,
205
+ "loss": 0.03310190677642822,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 2.390766694146744,
210
+ "grad_norm": 0.9038472771644592,
211
+ "learning_rate": 2.7165470849315476e-06,
212
+ "loss": 0.027033202648162842,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 2.4732069249793898,
217
+ "grad_norm": 1.6981157064437866,
218
+ "learning_rate": 2.692173964312021e-06,
219
+ "loss": 0.03098677396774292,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 2.555647155812036,
224
+ "grad_norm": 0.38965705037117004,
225
+ "learning_rate": 2.666914746514651e-06,
226
+ "loss": 0.025331289768218995,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 2.6380873866446826,
231
+ "grad_norm": 0.9481067061424255,
232
+ "learning_rate": 2.6407882057471234e-06,
233
+ "loss": 0.02945547103881836,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 2.720527617477329,
238
+ "grad_norm": 0.990297794342041,
239
+ "learning_rate": 2.61381376086502e-06,
240
+ "loss": 0.025877423286437988,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 2.8029678483099754,
245
+ "grad_norm": 0.7205927968025208,
246
+ "learning_rate": 2.586011460938527e-06,
247
+ "loss": 0.0348543381690979,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 2.8854080791426218,
252
+ "grad_norm": 1.2724188566207886,
253
+ "learning_rate": 2.5574019703507284e-06,
254
+ "loss": 0.025783424377441407,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 2.9678483099752677,
259
+ "grad_norm": 0.3951258063316345,
260
+ "learning_rate": 2.528006553438566e-06,
261
+ "loss": 0.024769763946533203,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 3.050288540807914,
266
+ "grad_norm": 0.7116732001304626,
267
+ "learning_rate": 2.4978470586878702e-06,
268
+ "loss": 0.0287685227394104,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 3.1327287716405605,
273
+ "grad_norm": 0.9427194595336914,
274
+ "learning_rate": 2.4669459024942216e-06,
275
+ "loss": 0.03142688512802124,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 3.215169002473207,
280
+ "grad_norm": 4.737741470336914,
281
+ "learning_rate": 2.4353260525017004e-06,
282
+ "loss": 0.025456478595733644,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 3.2976092333058533,
287
+ "grad_norm": 1.237444519996643,
288
+ "learning_rate": 2.4030110105319206e-06,
289
+ "loss": 0.02399829149246216,
290
+ "step": 4000
291
+ },
292
+ {
293
+ "epoch": 3.3800494641384997,
294
+ "grad_norm": 4.982211589813232,
295
+ "learning_rate": 2.370024795116028e-06,
296
+ "loss": 0.03101783275604248,
297
+ "step": 4100
298
+ },
299
+ {
300
+ "epoch": 3.462489694971146,
301
+ "grad_norm": 5.987162113189697,
302
+ "learning_rate": 2.336391923642643e-06,
303
+ "loss": 0.023696184158325195,
304
+ "step": 4200
305
+ },
306
+ {
307
+ "epoch": 3.5449299258037925,
308
+ "grad_norm": 0.5781798362731934,
309
+ "learning_rate": 2.302137394135031e-06,
310
+ "loss": 0.026220085620880126,
311
+ "step": 4300
312
+ },
313
+ {
314
+ "epoch": 3.6273701566364385,
315
+ "grad_norm": 1.0097777843475342,
316
+ "learning_rate": 2.267286666671027e-06,
317
+ "loss": 0.029556829929351807,
318
+ "step": 4400
319
+ },
320
+ {
321
+ "epoch": 3.709810387469085,
322
+ "grad_norm": 1.0615909099578857,
323
+ "learning_rate": 2.2318656444595387e-06,
324
+ "loss": 0.02913331747055054,
325
+ "step": 4500
326
+ },
327
+ {
328
+ "epoch": 3.7922506183017313,
329
+ "grad_norm": 0.6333794593811035,
330
+ "learning_rate": 2.1959006545876846e-06,
331
+ "loss": 0.025029284954071043,
332
+ "step": 4600
333
+ },
334
+ {
335
+ "epoch": 3.8746908491343777,
336
+ "grad_norm": 0.660271167755127,
337
+ "learning_rate": 2.1594184284528776e-06,
338
+ "loss": 0.031456120014190674,
339
+ "step": 4700
340
+ },
341
+ {
342
+ "epoch": 3.957131079967024,
343
+ "grad_norm": 1.3678995370864868,
344
+ "learning_rate": 2.1224460818944066e-06,
345
+ "loss": 0.03159121990203857,
346
+ "step": 4800
347
+ },
348
+ {
349
+ "epoch": 4.03957131079967,
350
+ "grad_norm": 1.6306558847427368,
351
+ "learning_rate": 2.0850110950392694e-06,
352
+ "loss": 0.023191912174224852,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 4.122011541632316,
357
+ "grad_norm": 1.2425507307052612,
358
+ "learning_rate": 2.047141291877252e-06,
359
+ "loss": 0.030113303661346437,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 4.204451772464963,
364
+ "grad_norm": 0.47728583216667175,
365
+ "learning_rate": 2.00886481958042e-06,
366
+ "loss": 0.024720721244812012,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 4.286892003297609,
371
+ "grad_norm": 7.698798179626465,
372
+ "learning_rate": 1.97021012758241e-06,
373
+ "loss": 0.027791497707366945,
374
+ "step": 5200
375
+ },
376
+ {
377
+ "epoch": 4.369332234130256,
378
+ "grad_norm": 0.9410794973373413,
379
+ "learning_rate": 1.9312059464330545e-06,
380
+ "loss": 0.022262310981750487,
381
+ "step": 5300
382
+ },
383
+ {
384
+ "epoch": 4.451772464962902,
385
+ "grad_norm": 6.470306873321533,
386
+ "learning_rate": 1.8918812664440643e-06,
387
+ "loss": 0.023623311519622804,
388
+ "step": 5400
389
+ },
390
+ {
391
+ "epoch": 4.534212695795548,
392
+ "grad_norm": 0.5035800933837891,
393
+ "learning_rate": 1.8522653161416466e-06,
394
+ "loss": 0.02535334587097168,
395
+ "step": 5500
396
+ },
397
+ {
398
+ "epoch": 4.616652926628195,
399
+ "grad_norm": 2.6831247806549072,
400
+ "learning_rate": 1.8123875405420576e-06,
401
+ "loss": 0.022764217853546143,
402
+ "step": 5600
403
+ },
404
+ {
405
+ "epoch": 4.699093157460841,
406
+ "grad_norm": 0.7162504196166992,
407
+ "learning_rate": 1.7722775792662551e-06,
408
+ "loss": 0.027024078369140624,
409
+ "step": 5700
410
+ },
411
+ {
412
+ "epoch": 4.781533388293488,
413
+ "grad_norm": 0.8824426531791687,
414
+ "learning_rate": 1.7319652445099035e-06,
415
+ "loss": 0.02422706365585327,
416
+ "step": 5800
417
+ },
418
+ {
419
+ "epoch": 4.863973619126133,
420
+ "grad_norm": 0.9424160122871399,
421
+ "learning_rate": 1.6914804988851126e-06,
422
+ "loss": 0.030813112258911132,
423
+ "step": 5900
424
+ },
425
+ {
426
+ "epoch": 4.9464138499587795,
427
+ "grad_norm": 0.8058213591575623,
428
+ "learning_rate": 1.6508534331503764e-06,
429
+ "loss": 0.0320208215713501,
430
+ "step": 6000
431
+ },
432
+ {
433
+ "epoch": 5.028854080791426,
434
+ "grad_norm": 0.6557570099830627,
435
+ "learning_rate": 1.610114243845269e-06,
436
+ "loss": 0.032469592094421386,
437
+ "step": 6100
438
+ },
439
+ {
440
+ "epoch": 5.111294311624072,
441
+ "grad_norm": 0.6476128697395325,
442
+ "learning_rate": 1.569293210846512e-06,
443
+ "loss": 0.028678691387176512,
444
+ "step": 6200
445
+ },
446
+ {
447
+ "epoch": 5.193734542456719,
448
+ "grad_norm": 17.67458724975586,
449
+ "learning_rate": 1.5284206748621066e-06,
450
+ "loss": 0.022574949264526366,
451
+ "step": 6300
452
+ },
453
+ {
454
+ "epoch": 5.276174773289365,
455
+ "grad_norm": 0.6299949884414673,
456
+ "learning_rate": 1.4875270148802465e-06,
457
+ "loss": 0.028071470260620117,
458
+ "step": 6400
459
+ },
460
+ {
461
+ "epoch": 5.3586150041220115,
462
+ "grad_norm": 0.17750702798366547,
463
+ "learning_rate": 1.4466426255897827e-06,
464
+ "loss": 0.02460844039916992,
465
+ "step": 6500
466
+ },
467
+ {
468
+ "epoch": 5.441055234954658,
469
+ "grad_norm": 3.046633720397949,
470
+ "learning_rate": 1.4057978947890166e-06,
471
+ "loss": 0.028238294124603273,
472
+ "step": 6600
473
+ },
474
+ {
475
+ "epoch": 5.523495465787304,
476
+ "grad_norm": 1.1183128356933594,
477
+ "learning_rate": 1.3650231807996163e-06,
478
+ "loss": 0.022344279289245605,
479
+ "step": 6700
480
+ },
481
+ {
482
+ "epoch": 5.605935696619951,
483
+ "grad_norm": 0.39727112650871277,
484
+ "learning_rate": 1.3243487899024401e-06,
485
+ "loss": 0.024733517169952392,
486
+ "step": 6800
487
+ },
488
+ {
489
+ "epoch": 5.688375927452597,
490
+ "grad_norm": 1.1431382894515991,
491
+ "learning_rate": 1.2838049538120375e-06,
492
+ "loss": 0.024790296554565428,
493
+ "step": 6900
494
+ },
495
+ {
496
+ "epoch": 5.7708161582852435,
497
+ "grad_norm": 0.270554780960083,
498
+ "learning_rate": 1.243421807206581e-06,
499
+ "loss": 0.023978376388549806,
500
+ "step": 7000
501
+ },
502
+ {
503
+ "epoch": 5.85325638911789,
504
+ "grad_norm": 2.6371097564697266,
505
+ "learning_rate": 1.2032293653299107e-06,
506
+ "loss": 0.024820666313171386,
507
+ "step": 7100
508
+ },
509
+ {
510
+ "epoch": 5.935696619950535,
511
+ "grad_norm": 0.4371972680091858,
512
+ "learning_rate": 1.1632575016823583e-06,
513
+ "loss": 0.023902339935302733,
514
+ "step": 7200
515
+ },
516
+ {
517
+ "epoch": 6.018136850783182,
518
+ "grad_norm": 0.3910028040409088,
519
+ "learning_rate": 1.1235359258169183e-06,
520
+ "loss": 0.023613801002502443,
521
+ "step": 7300
522
+ },
523
+ {
524
+ "epoch": 6.100577081615828,
525
+ "grad_norm": 0.5897251963615417,
526
+ "learning_rate": 1.0840941612572765e-06,
527
+ "loss": 0.025773169994354247,
528
+ "step": 7400
529
+ },
530
+ {
531
+ "epoch": 6.183017312448475,
532
+ "grad_norm": 7.796975135803223,
533
+ "learning_rate": 1.0449615235541093e-06,
534
+ "loss": 0.028378984928131103,
535
+ "step": 7500
536
+ },
537
+ {
538
+ "epoch": 6.265457543281121,
539
+ "grad_norm": 2.8554458618164062,
540
+ "learning_rate": 1.0061670984959582e-06,
541
+ "loss": 0.023146660327911378,
542
+ "step": 7600
543
+ },
544
+ {
545
+ "epoch": 6.347897774113767,
546
+ "grad_norm": 1.5360387563705444,
547
+ "learning_rate": 9.677397204908788e-07,
548
+ "loss": 0.025156841278076172,
549
+ "step": 7700
550
+ },
551
+ {
552
+ "epoch": 6.430338004946414,
553
+ "grad_norm": 0.2781471312046051,
554
+ "learning_rate": 9.297079511349307e-07,
555
+ "loss": 0.023876771926879883,
556
+ "step": 7800
557
+ },
558
+ {
559
+ "epoch": 6.51277823577906,
560
+ "grad_norm": 0.5193030834197998,
561
+ "learning_rate": 8.921000579834404e-07,
562
+ "loss": 0.021548757553100584,
563
+ "step": 7900
564
+ },
565
+ {
566
+ "epoch": 6.595218466611707,
567
+ "grad_norm": 1.3652324676513672,
568
+ "learning_rate": 8.549439935408109e-07,
569
+ "loss": 0.027704770565032958,
570
+ "step": 8000
571
+ },
572
+ {
573
+ "epoch": 6.677658697444353,
574
+ "grad_norm": 0.567807137966156,
575
+ "learning_rate": 8.182673744844971e-07,
576
+ "loss": 0.024193654060363768,
577
+ "step": 8100
578
+ },
579
+ {
580
+ "epoch": 6.760098928276999,
581
+ "grad_norm": 1.0867784023284912,
582
+ "learning_rate": 7.820974611385887e-07,
583
+ "loss": 0.024752085208892823,
584
+ "step": 8200
585
+ },
586
+ {
587
+ "epoch": 6.842539159109646,
588
+ "grad_norm": 12.679245948791504,
589
+ "learning_rate": 7.464611372122565e-07,
590
+ "loss": 0.02242401838302612,
591
+ "step": 8300
592
+ },
593
+ {
594
+ "epoch": 6.924979389942292,
595
+ "grad_norm": 1.22047758102417,
596
+ "learning_rate": 7.113848898181258e-07,
597
+ "loss": 0.031413540840148926,
598
+ "step": 8400
599
+ },
600
+ {
601
+ "epoch": 7.007419620774938,
602
+ "grad_norm": 36.422203063964844,
603
+ "learning_rate": 6.76894789785417e-07,
604
+ "loss": 0.0245809006690979,
605
+ "step": 8500
606
+ },
607
+ {
608
+ "epoch": 7.089859851607584,
609
+ "grad_norm": 1.0370782613754272,
610
+ "learning_rate": 6.430164722825002e-07,
611
+ "loss": 0.02020338773727417,
612
+ "step": 8600
613
+ },
614
+ {
615
+ "epoch": 7.1723000824402305,
616
+ "grad_norm": 1.1046611070632935,
617
+ "learning_rate": 6.097751177632599e-07,
618
+ "loss": 0.02843546390533447,
619
+ "step": 8700
620
+ },
621
+ {
622
+ "epoch": 7.254740313272877,
623
+ "grad_norm": 1.1630239486694336,
624
+ "learning_rate": 5.77195433251426e-07,
625
+ "loss": 0.023567092418670655,
626
+ "step": 8800
627
+ },
628
+ {
629
+ "epoch": 7.337180544105523,
630
+ "grad_norm": 0.6131232976913452,
631
+ "learning_rate": 5.453016339767939e-07,
632
+ "loss": 0.02418841600418091,
633
+ "step": 8900
634
+ },
635
+ {
636
+ "epoch": 7.41962077493817,
637
+ "grad_norm": 1.4645527601242065,
638
+ "learning_rate": 5.141174253769704e-07,
639
+ "loss": 0.0257061243057251,
640
+ "step": 9000
641
+ },
642
+ {
643
+ "epoch": 7.502061005770816,
644
+ "grad_norm": 0.9117152690887451,
645
+ "learning_rate": 4.836659854780321e-07,
646
+ "loss": 0.02661696672439575,
647
+ "step": 9100
648
+ },
649
+ {
650
+ "epoch": 7.5845012366034625,
651
+ "grad_norm": 1.2171125411987305,
652
+ "learning_rate": 4.5396994766719033e-07,
653
+ "loss": 0.02204680919647217,
654
+ "step": 9200
655
+ },
656
+ {
657
+ "epoch": 7.666941467436109,
658
+ "grad_norm": 0.6495709419250488,
659
+ "learning_rate": 4.2505138387025753e-07,
660
+ "loss": 0.029301033020019532,
661
+ "step": 9300
662
+ },
663
+ {
664
+ "epoch": 7.749381698268755,
665
+ "grad_norm": 0.6626441478729248,
666
+ "learning_rate": 3.969317881464367e-07,
667
+ "loss": 0.021526577472686766,
668
+ "step": 9400
669
+ },
670
+ {
671
+ "epoch": 7.831821929101402,
672
+ "grad_norm": 16.135684967041016,
673
+ "learning_rate": 3.696320607126065e-07,
674
+ "loss": 0.026122121810913085,
675
+ "step": 9500
676
+ },
677
+ {
678
+ "epoch": 7.914262159934048,
679
+ "grad_norm": 0.8759784698486328,
680
+ "learning_rate": 3.4317249240899686e-07,
681
+ "loss": 0.025852499008178712,
682
+ "step": 9600
683
+ },
684
+ {
685
+ "epoch": 7.9967023907666945,
686
+ "grad_norm": 0.800774872303009,
687
+ "learning_rate": 3.175727496177826e-07,
688
+ "loss": 0.02209474563598633,
689
+ "step": 9700
690
+ },
691
+ {
692
+ "epoch": 8.07914262159934,
693
+ "grad_norm": 0.8495875000953674,
694
+ "learning_rate": 2.928518596458161e-07,
695
+ "loss": 0.02649038314819336,
696
+ "step": 9800
697
+ },
698
+ {
699
+ "epoch": 8.161582852431987,
700
+ "grad_norm": 0.7733114361763,
701
+ "learning_rate": 2.690281965823608e-07,
702
+ "loss": 0.024643311500549315,
703
+ "step": 9900
704
+ },
705
+ {
706
+ "epoch": 8.244023083264633,
707
+ "grad_norm": 12.20607852935791,
708
+ "learning_rate": 2.461194676423352e-07,
709
+ "loss": 0.026993231773376467,
710
+ "step": 10000
711
+ },
712
+ {
713
+ "epoch": 8.32646331409728,
714
+ "grad_norm": 26.955886840820312,
715
+ "learning_rate": 2.2414270000521946e-07,
716
+ "loss": 0.02212390899658203,
717
+ "step": 10100
718
+ },
719
+ {
720
+ "epoch": 8.408903544929926,
721
+ "grad_norm": 0.8895039558410645,
722
+ "learning_rate": 2.031142281594066e-07,
723
+ "loss": 0.02063568115234375,
724
+ "step": 10200
725
+ },
726
+ {
727
+ "epoch": 8.491343775762573,
728
+ "grad_norm": 0.5100795030593872,
729
+ "learning_rate": 1.8304968176140035e-07,
730
+ "loss": 0.024049296379089355,
731
+ "step": 10300
732
+ },
733
+ {
734
+ "epoch": 8.573784006595218,
735
+ "grad_norm": 1.8475189208984375,
736
+ "learning_rate": 1.6396397401889679e-07,
737
+ "loss": 0.02524357557296753,
738
+ "step": 10400
739
+ },
740
+ {
741
+ "epoch": 8.656224237427864,
742
+ "grad_norm": 0.6764451861381531,
743
+ "learning_rate": 1.4587129060636367e-07,
744
+ "loss": 0.028507494926452638,
745
+ "step": 10500
746
+ },
747
+ {
748
+ "epoch": 8.738664468260511,
749
+ "grad_norm": 1.1165720224380493,
750
+ "learning_rate": 1.2878507912137682e-07,
751
+ "loss": 0.02533757209777832,
752
+ "step": 10600
753
+ },
754
+ {
755
+ "epoch": 8.821104699093157,
756
+ "grad_norm": 0.7385006546974182,
757
+ "learning_rate": 1.1271803908953354e-07,
758
+ "loss": 0.022719991207122803,
759
+ "step": 10700
760
+ },
761
+ {
762
+ "epoch": 8.903544929925804,
763
+ "grad_norm": 8.316965103149414,
764
+ "learning_rate": 9.76821125253844e-08,
765
+ "loss": 0.021582581996917725,
766
+ "step": 10800
767
+ },
768
+ {
769
+ "epoch": 8.98598516075845,
770
+ "grad_norm": 1.0215933322906494,
771
+ "learning_rate": 8.368847505639149e-08,
772
+ "loss": 0.02419511079788208,
773
+ "step": 10900
774
+ },
775
+ {
776
+ "epoch": 9.068425391591097,
777
+ "grad_norm": 8.314555168151855,
778
+ "learning_rate": 7.074752761651227e-08,
779
+ "loss": 0.026871368885040284,
780
+ "step": 11000
781
+ },
782
+ {
783
+ "epoch": 9.150865622423742,
784
+ "grad_norm": 7.914130687713623,
785
+ "learning_rate": 5.8868888715585325e-08,
786
+ "loss": 0.02440465450286865,
787
+ "step": 11100
788
+ },
789
+ {
790
+ "epoch": 9.23330585325639,
791
+ "grad_norm": 17.640907287597656,
792
+ "learning_rate": 4.806138729026111e-08,
793
+ "loss": 0.020871245861053468,
794
+ "step": 11200
795
+ },
796
+ {
797
+ "epoch": 9.315746084089035,
798
+ "grad_norm": 0.4218542277812958,
799
+ "learning_rate": 3.833305614179061e-08,
800
+ "loss": 0.024367706775665285,
801
+ "step": 11300
802
+ },
803
+ {
804
+ "epoch": 9.398186314921682,
805
+ "grad_norm": 0.497744619846344,
806
+ "learning_rate": 2.9691125965554454e-08,
807
+ "loss": 0.027901573181152342,
808
+ "step": 11400
809
+ },
810
+ {
811
+ "epoch": 9.480626545754328,
812
+ "grad_norm": 0.6788234710693359,
813
+ "learning_rate": 2.214201997676152e-08,
814
+ "loss": 0.021528902053833007,
815
+ "step": 11500
816
+ },
817
+ {
818
+ "epoch": 9.563066776586975,
819
+ "grad_norm": 0.21850308775901794,
820
+ "learning_rate": 1.5691349136322697e-08,
821
+ "loss": 0.02456042289733887,
822
+ "step": 11600
823
+ },
824
+ {
825
+ "epoch": 9.64550700741962,
826
+ "grad_norm": 0.7916850447654724,
827
+ "learning_rate": 1.0343907980436218e-08,
828
+ "loss": 0.019252289533615113,
829
+ "step": 11700
830
+ },
831
+ {
832
+ "epoch": 9.727947238252266,
833
+ "grad_norm": 0.3894334137439728,
834
+ "learning_rate": 6.103671056994387e-09,
835
+ "loss": 0.020840485095977784,
836
+ "step": 11800
837
+ },
838
+ {
839
+ "epoch": 9.810387469084914,
840
+ "grad_norm": 1.1665902137756348,
841
+ "learning_rate": 2.9737899714539775e-09,
842
+ "loss": 0.030273616313934326,
843
+ "step": 11900
844
+ },
845
+ {
846
+ "epoch": 9.892827699917559,
847
+ "grad_norm": 0.6720395088195801,
848
+ "learning_rate": 9.565910443685155e-10,
849
+ "loss": 0.02424750566482544,
850
+ "step": 12000
851
+ }
852
+ ],
853
+ "logging_steps": 100,
854
+ "max_steps": 12130,
855
+ "num_input_tokens_seen": 0,
856
+ "num_train_epochs": 10,
857
+ "save_steps": 1000,
858
+ "stateful_callbacks": {
859
+ "TrainerControl": {
860
+ "args": {
861
+ "should_epoch_stop": false,
862
+ "should_evaluate": false,
863
+ "should_log": false,
864
+ "should_save": true,
865
+ "should_training_stop": false
866
+ },
867
+ "attributes": {}
868
+ }
869
+ },
870
+ "total_flos": 7807792431338136.0,
871
+ "train_batch_size": 8,
872
+ "trial_name": null,
873
+ "trial_params": null
874
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5baf6aa343fd5badad92c7c7d46ab369c567b17b3a2aaee0ea3b0e8bbd731d12
3
+ size 5329