Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

config.json +130 -0
model.safetensors +3 -0
tokenizer.json +0 -0
tokenizer_config.json +17 -0
trainer_state.json +874 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,130 @@

+{
+  "architecture_type": "uni-encoder",
+  "architectures": [
+    "GLiClassModel"
+  ],
+  "class_token_index": 50368,
+  "class_token_pooling": "first",
+  "contrastive_loss_coef": 0.0,
+  "cross_encoder_config": null,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "embed_class_token": true,
+  "encoder_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "jhu-clsp/ettin-encoder-32m",
+    "add_cross_attention": false,
+    "architectures": [
+      "ModernBertForMaskedLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 50281,
+    "causal_mask": false,
+    "classifier_activation": "gelu",
+    "classifier_bias": false,
+    "classifier_dropout": 0.0,
+    "classifier_pooling": "mean",
+    "cls_token_id": 50281,
+    "cross_attention_hidden_size": null,
+    "decoder_bias": true,
+    "decoder_start_token_id": null,
+    "deterministic_flash_attn": false,
+    "dtype": "float32",
+    "embedding_dropout": 0.0,
+    "eos_token_id": 50282,
+    "finetuning_task": null,
+    "global_attn_every_n_layers": 3,
+    "gradient_checkpointing": false,
+    "hidden_activation": "gelu",
+    "hidden_size": 384,
+    "initializer_cutoff_factor": 2.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 576,
+    "is_causal": false,
+    "is_decoder": false,
+    "layer_norm_eps": 1e-05,
+    "layer_types": [
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention"
+    ],
+    "local_attention": 128,
+    "max_position_embeddings": 7999,
+    "mlp_bias": false,
+    "mlp_dropout": 0.0,
+    "model_type": "modernbert",
+    "norm_bias": false,
+    "norm_eps": 1e-05,
+    "num_attention_heads": 6,
+    "num_hidden_layers": 10,
+    "pad_token_id": 50283,
+    "position_embedding_type": "sans_pos",
+    "prefix": null,
+    "pruned_heads": {},
+    "repad_logits_with_grad": false,
+    "rope_parameters": {
+      "full_attention": {
+        "rope_theta": 160000.0,
+        "rope_type": "default"
+      },
+      "sliding_attention": {
+        "rope_theta": 160000.0,
+        "rope_type": "default"
+      }
+    },
+    "sep_token_id": 50282,
+    "sparse_pred_ignore_index": -100,
+    "sparse_prediction": false,
+    "task_specific_params": null,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "torchscript": false,
+    "use_bfloat16": false,
+    "vocab_size": 50370
+  },
+  "encoder_layer_id": -1,
+  "encoder_model_name": "jhu-clsp/ettin-encoder-32m",
+  "example_token_index": 50372,
+  "extract_text_features": false,
+  "focal_loss_alpha": 0.7,
+  "focal_loss_gamma": -1,
+  "focal_loss_reduction": "none",
+  "hidden_size": 384,
+  "ignore_index": -100,
+  "initializer_range": 0.03,
+  "label_model_config": null,
+  "label_model_name": null,
+  "layer_wise": false,
+  "logit_scale_init_value": 2.6592,
+  "max_labels_alloc": "dynamic",
+  "max_num_classes": 25,
+  "model_type": "GLiClass",
+  "normalize_features": false,
+  "pad_token_id": 50283,
+  "pooling_strategy": "first",
+  "problem_type": "multi_label_classification",
+  "projector_hidden_act": "gelu",
+  "prompt_first": true,
+  "scorer_attn_dropout": 0.1,
+  "scorer_mlp_hidden_size": 1024,
+  "scorer_num_heads": 16,
+  "scorer_type": "mlp",
+  "shuffle_labels": true,
+  "squeeze_layers": false,
+  "text_token_index": 50369,
+  "transformers_version": "5.1.0",
+  "use_cache": false,
+  "use_lstm": false,
+  "use_segment_embeddings": false,
+  "vocab_size": 50370
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:979d72fa0026def128a9fdfccdb763c3f226d5df87883ecc205dac45be53c893
+size 130829312

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "add_prefix_space": true,
+  "backend": "tokenizers",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "is_local": true,
+  "mask_token": "[MASK]",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 8192,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "[UNK]"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,874 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 9.892827699917559,
+  "eval_steps": 500,
+  "global_step": 12000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.08244023083264633,
+      "grad_norm": 1.0573325157165527,
+      "learning_rate": 4.892915980230642e-07,
+      "loss": 0.05755834102630615,
+      "step": 100
+    },
+    {
+      "epoch": 0.16488046166529266,
+      "grad_norm": 10.260250091552734,
+      "learning_rate": 9.835255354200989e-07,
+      "loss": 0.04523322582244873,
+      "step": 200
+    },
+    {
+      "epoch": 0.247320692497939,
+      "grad_norm": 0.5175366997718811,
+      "learning_rate": 1.4777594728171335e-06,
+      "loss": 0.03658797979354858,
+      "step": 300
+    },
+    {
+      "epoch": 0.3297609233305853,
+      "grad_norm": 5.0066118240356445,
+      "learning_rate": 1.971993410214168e-06,
+      "loss": 0.04241968631744385,
+      "step": 400
+    },
+    {
+      "epoch": 0.41220115416323166,
+      "grad_norm": 16.185205459594727,
+      "learning_rate": 2.466227347611203e-06,
+      "loss": 0.03233745574951172,
+      "step": 500
+    },
+    {
+      "epoch": 0.494641384995878,
+      "grad_norm": 0.4887259900569916,
+      "learning_rate": 2.9604612850082373e-06,
+      "loss": 0.04120903968811035,
+      "step": 600
+    },
+    {
+      "epoch": 0.5770816158285244,
+      "grad_norm": 1.360079050064087,
+      "learning_rate": 2.999528173020731e-06,
+      "loss": 0.03448781490325928,
+      "step": 700
+    },
+    {
+      "epoch": 0.6595218466611706,
+      "grad_norm": 1.5775821208953857,
+      "learning_rate": 2.997945372145348e-06,
+      "loss": 0.03620944499969483,
+      "step": 800
+    },
+    {
+      "epoch": 0.7419620774938169,
+      "grad_norm": 37.682884216308594,
+      "learning_rate": 2.9952492059335665e-06,
+      "loss": 0.033068180084228516,
+      "step": 900
+    },
+    {
+      "epoch": 0.8244023083264633,
+      "grad_norm": 0.5983888506889343,
+      "learning_rate": 2.99144167834231e-06,
+      "loss": 0.03487896919250488,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9068425391591096,
+      "grad_norm": 0.4312576651573181,
+      "learning_rate": 2.986525619360788e-06,
+      "loss": 0.032608640193939206,
+      "step": 1100
+    },
+    {
+      "epoch": 0.989282769991756,
+      "grad_norm": 1.5949877500534058,
+      "learning_rate": 2.980504682907069e-06,
+      "loss": 0.03462482452392578,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0717230008244023,
+      "grad_norm": 0.5395241379737854,
+      "learning_rate": 2.9733833441122652e-06,
+      "loss": 0.029383325576782228,
+      "step": 1300
+    },
+    {
+      "epoch": 1.1541632316570487,
+      "grad_norm": 7.3991804122924805,
+      "learning_rate": 2.9651668959943407e-06,
+      "loss": 0.03298326969146728,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2366034624896949,
+      "grad_norm": 3.2689311504364014,
+      "learning_rate": 2.955861445524012e-06,
+      "loss": 0.03119053363800049,
+      "step": 1500
+    },
+    {
+      "epoch": 1.3190436933223413,
+      "grad_norm": 1.3022229671478271,
+      "learning_rate": 2.9454739090856716e-06,
+      "loss": 0.03160511493682861,
+      "step": 1600
+    },
+    {
+      "epoch": 1.4014839241549877,
+      "grad_norm": 1.545177936553955,
+      "learning_rate": 2.9340120073367064e-06,
+      "loss": 0.03241936206817627,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4839241549876339,
+      "grad_norm": 0.3807108998298645,
+      "learning_rate": 2.921484259469025e-06,
+      "loss": 0.028808791637420655,
+      "step": 1800
+    },
+    {
+      "epoch": 1.5663643858202803,
+      "grad_norm": 1.6713629961013794,
+      "learning_rate": 2.907899976877075e-06,
+      "loss": 0.026836631298065187,
+      "step": 1900
+    },
+    {
+      "epoch": 1.6488046166529267,
+      "grad_norm": 0.8516013622283936,
+      "learning_rate": 2.8932692562370356e-06,
+      "loss": 0.02777437925338745,
+      "step": 2000
+    },
+    {
+      "epoch": 1.731244847485573,
+      "grad_norm": 6.933730602264404,
+      "learning_rate": 2.8776029720023492e-06,
+      "loss": 0.03317852735519409,
+      "step": 2100
+    },
+    {
+      "epoch": 1.8136850783182195,
+      "grad_norm": 2.6626434326171875,
+      "learning_rate": 2.8609127683211535e-06,
+      "loss": 0.030532124042510985,
+      "step": 2200
+    },
+    {
+      "epoch": 1.8961253091508656,
+      "grad_norm": 0.9223625659942627,
+      "learning_rate": 2.8432110503816364e-06,
+      "loss": 0.028622543811798094,
+      "step": 2300
+    },
+    {
+      "epoch": 1.9785655399835118,
+      "grad_norm": 1.147865891456604,
+      "learning_rate": 2.824510975191734e-06,
+      "loss": 0.0330674409866333,
+      "step": 2400
+    },
+    {
+      "epoch": 2.061005770816158,
+      "grad_norm": 0.507649838924408,
+      "learning_rate": 2.8048264418000297e-06,
+      "loss": 0.029734506607055664,
+      "step": 2500
+    },
+    {
+      "epoch": 2.1434460016488046,
+      "grad_norm": 0.648606538772583,
+      "learning_rate": 2.7841720809651287e-06,
+      "loss": 0.02952629327774048,
+      "step": 2600
+    },
+    {
+      "epoch": 2.225886232481451,
+      "grad_norm": 6.885851860046387,
+      "learning_rate": 2.762563244281172e-06,
+      "loss": 0.033362927436828616,
+      "step": 2700
+    },
+    {
+      "epoch": 2.3083264633140974,
+      "grad_norm": 0.7185697555541992,
+      "learning_rate": 2.7400159927675868e-06,
+      "loss": 0.03310190677642822,
+      "step": 2800
+    },
+    {
+      "epoch": 2.390766694146744,
+      "grad_norm": 0.9038472771644592,
+      "learning_rate": 2.7165470849315476e-06,
+      "loss": 0.027033202648162842,
+      "step": 2900
+    },
+    {
+      "epoch": 2.4732069249793898,
+      "grad_norm": 1.6981157064437866,
+      "learning_rate": 2.692173964312021e-06,
+      "loss": 0.03098677396774292,
+      "step": 3000
+    },
+    {
+      "epoch": 2.555647155812036,
+      "grad_norm": 0.38965705037117004,
+      "learning_rate": 2.666914746514651e-06,
+      "loss": 0.025331289768218995,
+      "step": 3100
+    },
+    {
+      "epoch": 2.6380873866446826,
+      "grad_norm": 0.9481067061424255,
+      "learning_rate": 2.6407882057471234e-06,
+      "loss": 0.02945547103881836,
+      "step": 3200
+    },
+    {
+      "epoch": 2.720527617477329,
+      "grad_norm": 0.990297794342041,
+      "learning_rate": 2.61381376086502e-06,
+      "loss": 0.025877423286437988,
+      "step": 3300
+    },
+    {
+      "epoch": 2.8029678483099754,
+      "grad_norm": 0.7205927968025208,
+      "learning_rate": 2.586011460938527e-06,
+      "loss": 0.0348543381690979,
+      "step": 3400
+    },
+    {
+      "epoch": 2.8854080791426218,
+      "grad_norm": 1.2724188566207886,
+      "learning_rate": 2.5574019703507284e-06,
+      "loss": 0.025783424377441407,
+      "step": 3500
+    },
+    {
+      "epoch": 2.9678483099752677,
+      "grad_norm": 0.3951258063316345,
+      "learning_rate": 2.528006553438566e-06,
+      "loss": 0.024769763946533203,
+      "step": 3600
+    },
+    {
+      "epoch": 3.050288540807914,
+      "grad_norm": 0.7116732001304626,
+      "learning_rate": 2.4978470586878702e-06,
+      "loss": 0.0287685227394104,
+      "step": 3700
+    },
+    {
+      "epoch": 3.1327287716405605,
+      "grad_norm": 0.9427194595336914,
+      "learning_rate": 2.4669459024942216e-06,
+      "loss": 0.03142688512802124,
+      "step": 3800
+    },
+    {
+      "epoch": 3.215169002473207,
+      "grad_norm": 4.737741470336914,
+      "learning_rate": 2.4353260525017004e-06,
+      "loss": 0.025456478595733644,
+      "step": 3900
+    },
+    {
+      "epoch": 3.2976092333058533,
+      "grad_norm": 1.237444519996643,
+      "learning_rate": 2.4030110105319206e-06,
+      "loss": 0.02399829149246216,
+      "step": 4000
+    },
+    {
+      "epoch": 3.3800494641384997,
+      "grad_norm": 4.982211589813232,
+      "learning_rate": 2.370024795116028e-06,
+      "loss": 0.03101783275604248,
+      "step": 4100
+    },
+    {
+      "epoch": 3.462489694971146,
+      "grad_norm": 5.987162113189697,
+      "learning_rate": 2.336391923642643e-06,
+      "loss": 0.023696184158325195,
+      "step": 4200
+    },
+    {
+      "epoch": 3.5449299258037925,
+      "grad_norm": 0.5781798362731934,
+      "learning_rate": 2.302137394135031e-06,
+      "loss": 0.026220085620880126,
+      "step": 4300
+    },
+    {
+      "epoch": 3.6273701566364385,
+      "grad_norm": 1.0097777843475342,
+      "learning_rate": 2.267286666671027e-06,
+      "loss": 0.029556829929351807,
+      "step": 4400
+    },
+    {
+      "epoch": 3.709810387469085,
+      "grad_norm": 1.0615909099578857,
+      "learning_rate": 2.2318656444595387e-06,
+      "loss": 0.02913331747055054,
+      "step": 4500
+    },
+    {
+      "epoch": 3.7922506183017313,
+      "grad_norm": 0.6333794593811035,
+      "learning_rate": 2.1959006545876846e-06,
+      "loss": 0.025029284954071043,
+      "step": 4600
+    },
+    {
+      "epoch": 3.8746908491343777,
+      "grad_norm": 0.660271167755127,
+      "learning_rate": 2.1594184284528776e-06,
+      "loss": 0.031456120014190674,
+      "step": 4700
+    },
+    {
+      "epoch": 3.957131079967024,
+      "grad_norm": 1.3678995370864868,
+      "learning_rate": 2.1224460818944066e-06,
+      "loss": 0.03159121990203857,
+      "step": 4800
+    },
+    {
+      "epoch": 4.03957131079967,
+      "grad_norm": 1.6306558847427368,
+      "learning_rate": 2.0850110950392694e-06,
+      "loss": 0.023191912174224852,
+      "step": 4900
+    },
+    {
+      "epoch": 4.122011541632316,
+      "grad_norm": 1.2425507307052612,
+      "learning_rate": 2.047141291877252e-06,
+      "loss": 0.030113303661346437,
+      "step": 5000
+    },
+    {
+      "epoch": 4.204451772464963,
+      "grad_norm": 0.47728583216667175,
+      "learning_rate": 2.00886481958042e-06,
+      "loss": 0.024720721244812012,
+      "step": 5100
+    },
+    {
+      "epoch": 4.286892003297609,
+      "grad_norm": 7.698798179626465,
+      "learning_rate": 1.97021012758241e-06,
+      "loss": 0.027791497707366945,
+      "step": 5200
+    },
+    {
+      "epoch": 4.369332234130256,
+      "grad_norm": 0.9410794973373413,
+      "learning_rate": 1.9312059464330545e-06,
+      "loss": 0.022262310981750487,
+      "step": 5300
+    },
+    {
+      "epoch": 4.451772464962902,
+      "grad_norm": 6.470306873321533,
+      "learning_rate": 1.8918812664440643e-06,
+      "loss": 0.023623311519622804,
+      "step": 5400
+    },
+    {
+      "epoch": 4.534212695795548,
+      "grad_norm": 0.5035800933837891,
+      "learning_rate": 1.8522653161416466e-06,
+      "loss": 0.02535334587097168,
+      "step": 5500
+    },
+    {
+      "epoch": 4.616652926628195,
+      "grad_norm": 2.6831247806549072,
+      "learning_rate": 1.8123875405420576e-06,
+      "loss": 0.022764217853546143,
+      "step": 5600
+    },
+    {
+      "epoch": 4.699093157460841,
+      "grad_norm": 0.7162504196166992,
+      "learning_rate": 1.7722775792662551e-06,
+      "loss": 0.027024078369140624,
+      "step": 5700
+    },
+    {
+      "epoch": 4.781533388293488,
+      "grad_norm": 0.8824426531791687,
+      "learning_rate": 1.7319652445099035e-06,
+      "loss": 0.02422706365585327,
+      "step": 5800
+    },
+    {
+      "epoch": 4.863973619126133,
+      "grad_norm": 0.9424160122871399,
+      "learning_rate": 1.6914804988851126e-06,
+      "loss": 0.030813112258911132,
+      "step": 5900
+    },
+    {
+      "epoch": 4.9464138499587795,
+      "grad_norm": 0.8058213591575623,
+      "learning_rate": 1.6508534331503764e-06,
+      "loss": 0.0320208215713501,
+      "step": 6000
+    },
+    {
+      "epoch": 5.028854080791426,
+      "grad_norm": 0.6557570099830627,
+      "learning_rate": 1.610114243845269e-06,
+      "loss": 0.032469592094421386,
+      "step": 6100
+    },
+    {
+      "epoch": 5.111294311624072,
+      "grad_norm": 0.6476128697395325,
+      "learning_rate": 1.569293210846512e-06,
+      "loss": 0.028678691387176512,
+      "step": 6200
+    },
+    {
+      "epoch": 5.193734542456719,
+      "grad_norm": 17.67458724975586,
+      "learning_rate": 1.5284206748621066e-06,
+      "loss": 0.022574949264526366,
+      "step": 6300
+    },
+    {
+      "epoch": 5.276174773289365,
+      "grad_norm": 0.6299949884414673,
+      "learning_rate": 1.4875270148802465e-06,
+      "loss": 0.028071470260620117,
+      "step": 6400
+    },
+    {
+      "epoch": 5.3586150041220115,
+      "grad_norm": 0.17750702798366547,
+      "learning_rate": 1.4466426255897827e-06,
+      "loss": 0.02460844039916992,
+      "step": 6500
+    },
+    {
+      "epoch": 5.441055234954658,
+      "grad_norm": 3.046633720397949,
+      "learning_rate": 1.4057978947890166e-06,
+      "loss": 0.028238294124603273,
+      "step": 6600
+    },
+    {
+      "epoch": 5.523495465787304,
+      "grad_norm": 1.1183128356933594,
+      "learning_rate": 1.3650231807996163e-06,
+      "loss": 0.022344279289245605,
+      "step": 6700
+    },
+    {
+      "epoch": 5.605935696619951,
+      "grad_norm": 0.39727112650871277,
+      "learning_rate": 1.3243487899024401e-06,
+      "loss": 0.024733517169952392,
+      "step": 6800
+    },
+    {
+      "epoch": 5.688375927452597,
+      "grad_norm": 1.1431382894515991,
+      "learning_rate": 1.2838049538120375e-06,
+      "loss": 0.024790296554565428,
+      "step": 6900
+    },
+    {
+      "epoch": 5.7708161582852435,
+      "grad_norm": 0.270554780960083,
+      "learning_rate": 1.243421807206581e-06,
+      "loss": 0.023978376388549806,
+      "step": 7000
+    },
+    {
+      "epoch": 5.85325638911789,
+      "grad_norm": 2.6371097564697266,
+      "learning_rate": 1.2032293653299107e-06,
+      "loss": 0.024820666313171386,
+      "step": 7100
+    },
+    {
+      "epoch": 5.935696619950535,
+      "grad_norm": 0.4371972680091858,
+      "learning_rate": 1.1632575016823583e-06,
+      "loss": 0.023902339935302733,
+      "step": 7200
+    },
+    {
+      "epoch": 6.018136850783182,
+      "grad_norm": 0.3910028040409088,
+      "learning_rate": 1.1235359258169183e-06,
+      "loss": 0.023613801002502443,
+      "step": 7300
+    },
+    {
+      "epoch": 6.100577081615828,
+      "grad_norm": 0.5897251963615417,
+      "learning_rate": 1.0840941612572765e-06,
+      "loss": 0.025773169994354247,
+      "step": 7400
+    },
+    {
+      "epoch": 6.183017312448475,
+      "grad_norm": 7.796975135803223,
+      "learning_rate": 1.0449615235541093e-06,
+      "loss": 0.028378984928131103,
+      "step": 7500
+    },
+    {
+      "epoch": 6.265457543281121,
+      "grad_norm": 2.8554458618164062,
+      "learning_rate": 1.0061670984959582e-06,
+      "loss": 0.023146660327911378,
+      "step": 7600
+    },
+    {
+      "epoch": 6.347897774113767,
+      "grad_norm": 1.5360387563705444,
+      "learning_rate": 9.677397204908788e-07,
+      "loss": 0.025156841278076172,
+      "step": 7700
+    },
+    {
+      "epoch": 6.430338004946414,
+      "grad_norm": 0.2781471312046051,
+      "learning_rate": 9.297079511349307e-07,
+      "loss": 0.023876771926879883,
+      "step": 7800
+    },
+    {
+      "epoch": 6.51277823577906,
+      "grad_norm": 0.5193030834197998,
+      "learning_rate": 8.921000579834404e-07,
+      "loss": 0.021548757553100584,
+      "step": 7900
+    },
+    {
+      "epoch": 6.595218466611707,
+      "grad_norm": 1.3652324676513672,
+      "learning_rate": 8.549439935408109e-07,
+      "loss": 0.027704770565032958,
+      "step": 8000
+    },
+    {
+      "epoch": 6.677658697444353,
+      "grad_norm": 0.567807137966156,
+      "learning_rate": 8.182673744844971e-07,
+      "loss": 0.024193654060363768,
+      "step": 8100
+    },
+    {
+      "epoch": 6.760098928276999,
+      "grad_norm": 1.0867784023284912,
+      "learning_rate": 7.820974611385887e-07,
+      "loss": 0.024752085208892823,
+      "step": 8200
+    },
+    {
+      "epoch": 6.842539159109646,
+      "grad_norm": 12.679245948791504,
+      "learning_rate": 7.464611372122565e-07,
+      "loss": 0.02242401838302612,
+      "step": 8300
+    },
+    {
+      "epoch": 6.924979389942292,
+      "grad_norm": 1.22047758102417,
+      "learning_rate": 7.113848898181258e-07,
+      "loss": 0.031413540840148926,
+      "step": 8400
+    },
+    {
+      "epoch": 7.007419620774938,
+      "grad_norm": 36.422203063964844,
+      "learning_rate": 6.76894789785417e-07,
+      "loss": 0.0245809006690979,
+      "step": 8500
+    },
+    {
+      "epoch": 7.089859851607584,
+      "grad_norm": 1.0370782613754272,
+      "learning_rate": 6.430164722825002e-07,
+      "loss": 0.02020338773727417,
+      "step": 8600
+    },
+    {
+      "epoch": 7.1723000824402305,
+      "grad_norm": 1.1046611070632935,
+      "learning_rate": 6.097751177632599e-07,
+      "loss": 0.02843546390533447,
+      "step": 8700
+    },
+    {
+      "epoch": 7.254740313272877,
+      "grad_norm": 1.1630239486694336,
+      "learning_rate": 5.77195433251426e-07,
+      "loss": 0.023567092418670655,
+      "step": 8800
+    },
+    {
+      "epoch": 7.337180544105523,
+      "grad_norm": 0.6131232976913452,
+      "learning_rate": 5.453016339767939e-07,
+      "loss": 0.02418841600418091,
+      "step": 8900
+    },
+    {
+      "epoch": 7.41962077493817,
+      "grad_norm": 1.4645527601242065,
+      "learning_rate": 5.141174253769704e-07,
+      "loss": 0.0257061243057251,
+      "step": 9000
+    },
+    {
+      "epoch": 7.502061005770816,
+      "grad_norm": 0.9117152690887451,
+      "learning_rate": 4.836659854780321e-07,
+      "loss": 0.02661696672439575,
+      "step": 9100
+    },
+    {
+      "epoch": 7.5845012366034625,
+      "grad_norm": 1.2171125411987305,
+      "learning_rate": 4.5396994766719033e-07,
+      "loss": 0.02204680919647217,
+      "step": 9200
+    },
+    {
+      "epoch": 7.666941467436109,
+      "grad_norm": 0.6495709419250488,
+      "learning_rate": 4.2505138387025753e-07,
+      "loss": 0.029301033020019532,
+      "step": 9300
+    },
+    {
+      "epoch": 7.749381698268755,
+      "grad_norm": 0.6626441478729248,
+      "learning_rate": 3.969317881464367e-07,
+      "loss": 0.021526577472686766,
+      "step": 9400
+    },
+    {
+      "epoch": 7.831821929101402,
+      "grad_norm": 16.135684967041016,
+      "learning_rate": 3.696320607126065e-07,
+      "loss": 0.026122121810913085,
+      "step": 9500
+    },
+    {
+      "epoch": 7.914262159934048,
+      "grad_norm": 0.8759784698486328,
+      "learning_rate": 3.4317249240899686e-07,
+      "loss": 0.025852499008178712,
+      "step": 9600
+    },
+    {
+      "epoch": 7.9967023907666945,
+      "grad_norm": 0.800774872303009,
+      "learning_rate": 3.175727496177826e-07,
+      "loss": 0.02209474563598633,
+      "step": 9700
+    },
+    {
+      "epoch": 8.07914262159934,
+      "grad_norm": 0.8495875000953674,
+      "learning_rate": 2.928518596458161e-07,
+      "loss": 0.02649038314819336,
+      "step": 9800
+    },
+    {
+      "epoch": 8.161582852431987,
+      "grad_norm": 0.7733114361763,
+      "learning_rate": 2.690281965823608e-07,
+      "loss": 0.024643311500549315,
+      "step": 9900
+    },
+    {
+      "epoch": 8.244023083264633,
+      "grad_norm": 12.20607852935791,
+      "learning_rate": 2.461194676423352e-07,
+      "loss": 0.026993231773376467,
+      "step": 10000
+    },
+    {
+      "epoch": 8.32646331409728,
+      "grad_norm": 26.955886840820312,
+      "learning_rate": 2.2414270000521946e-07,
+      "loss": 0.02212390899658203,
+      "step": 10100
+    },
+    {
+      "epoch": 8.408903544929926,
+      "grad_norm": 0.8895039558410645,
+      "learning_rate": 2.031142281594066e-07,
+      "loss": 0.02063568115234375,
+      "step": 10200
+    },
+    {
+      "epoch": 8.491343775762573,
+      "grad_norm": 0.5100795030593872,
+      "learning_rate": 1.8304968176140035e-07,
+      "loss": 0.024049296379089355,
+      "step": 10300
+    },
+    {
+      "epoch": 8.573784006595218,
+      "grad_norm": 1.8475189208984375,
+      "learning_rate": 1.6396397401889679e-07,
+      "loss": 0.02524357557296753,
+      "step": 10400
+    },
+    {
+      "epoch": 8.656224237427864,
+      "grad_norm": 0.6764451861381531,
+      "learning_rate": 1.4587129060636367e-07,
+      "loss": 0.028507494926452638,
+      "step": 10500
+    },
+    {
+      "epoch": 8.738664468260511,
+      "grad_norm": 1.1165720224380493,
+      "learning_rate": 1.2878507912137682e-07,
+      "loss": 0.02533757209777832,
+      "step": 10600
+    },
+    {
+      "epoch": 8.821104699093157,
+      "grad_norm": 0.7385006546974182,
+      "learning_rate": 1.1271803908953354e-07,
+      "loss": 0.022719991207122803,
+      "step": 10700
+    },
+    {
+      "epoch": 8.903544929925804,
+      "grad_norm": 8.316965103149414,
+      "learning_rate": 9.76821125253844e-08,
+      "loss": 0.021582581996917725,
+      "step": 10800
+    },
+    {
+      "epoch": 8.98598516075845,
+      "grad_norm": 1.0215933322906494,
+      "learning_rate": 8.368847505639149e-08,
+      "loss": 0.02419511079788208,
+      "step": 10900
+    },
+    {
+      "epoch": 9.068425391591097,
+      "grad_norm": 8.314555168151855,
+      "learning_rate": 7.074752761651227e-08,
+      "loss": 0.026871368885040284,
+      "step": 11000
+    },
+    {
+      "epoch": 9.150865622423742,
+      "grad_norm": 7.914130687713623,
+      "learning_rate": 5.8868888715585325e-08,
+      "loss": 0.02440465450286865,
+      "step": 11100
+    },
+    {
+      "epoch": 9.23330585325639,
+      "grad_norm": 17.640907287597656,
+      "learning_rate": 4.806138729026111e-08,
+      "loss": 0.020871245861053468,
+      "step": 11200
+    },
+    {
+      "epoch": 9.315746084089035,
+      "grad_norm": 0.4218542277812958,
+      "learning_rate": 3.833305614179061e-08,
+      "loss": 0.024367706775665285,
+      "step": 11300
+    },
+    {
+      "epoch": 9.398186314921682,
+      "grad_norm": 0.497744619846344,
+      "learning_rate": 2.9691125965554454e-08,
+      "loss": 0.027901573181152342,
+      "step": 11400
+    },
+    {
+      "epoch": 9.480626545754328,
+      "grad_norm": 0.6788234710693359,
+      "learning_rate": 2.214201997676152e-08,
+      "loss": 0.021528902053833007,
+      "step": 11500
+    },
+    {
+      "epoch": 9.563066776586975,
+      "grad_norm": 0.21850308775901794,
+      "learning_rate": 1.5691349136322697e-08,
+      "loss": 0.02456042289733887,
+      "step": 11600
+    },
+    {
+      "epoch": 9.64550700741962,
+      "grad_norm": 0.7916850447654724,
+      "learning_rate": 1.0343907980436218e-08,
+      "loss": 0.019252289533615113,
+      "step": 11700
+    },
+    {
+      "epoch": 9.727947238252266,
+      "grad_norm": 0.3894334137439728,
+      "learning_rate": 6.103671056994387e-09,
+      "loss": 0.020840485095977784,
+      "step": 11800
+    },
+    {
+      "epoch": 9.810387469084914,
+      "grad_norm": 1.1665902137756348,
+      "learning_rate": 2.9737899714539775e-09,
+      "loss": 0.030273616313934326,
+      "step": 11900
+    },
+    {
+      "epoch": 9.892827699917559,
+      "grad_norm": 0.6720395088195801,
+      "learning_rate": 9.565910443685155e-10,
+      "loss": 0.02424750566482544,
+      "step": 12000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 12130,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7807792431338136.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5baf6aa343fd5badad92c7c7d46ab369c567b17b3a2aaee0ea3b0e8bbd731d12
+size 5329