AmitMY committed on
Commit
fb6b7fa
·
verified ·
1 Parent(s): 4827e94

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: mit
4
+ base_model: sbintuitions/tiny-lm
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - HuggingFaceFW/fineweb
9
+ model-index:
10
+ - name: output-tiny-lm-fineweb
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # output-tiny-lm-fineweb
18
+
19
+ This model is a fine-tuned version of [sbintuitions/tiny-lm](https://huggingface.co/sbintuitions/tiny-lm) on the HuggingFaceFW/fineweb dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0003
39
+ - train_batch_size: 128
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.95), epsilon=1e-08, and no additional optimizer arguments
43
+ - lr_scheduler_type: cosine
44
+ - lr_scheduler_warmup_ratio: 0.01
45
+ - training_steps: 20000
46
+
47
+ ### Training results
48
+
49
+
50
+
51
+ ### Framework versions
52
+
53
+ - Transformers 4.57.3
54
+ - Pytorch 2.9.1+cu130
55
+ - Datasets 4.4.1
56
+ - Tokenizers 0.22.1
all_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "num_input_tokens_seen": 655360000,
4
+ "total_flos": 1.23866185728e+16,
5
+ "train_loss": 1.4302955017089845,
6
+ "train_runtime": 4951.1263,
7
+ "train_samples": 0,
8
+ "train_samples_per_second": 517.054,
9
+ "train_steps_per_second": 4.039
10
+ }
bit_projection_weights.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b0ae34ff7325d9251433afe0add80c29dfb19c95ce713dd15d98617dd2ac420
3
+ size 9938
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {# Chat template: renders the optional tool list, then every message, then an optional generation prompt, as one string whose sections are delimited by single control characters (the hex escapes in the string literals below). All output tags use "-" whitespace control, so nothing outside the tags is emitted. #}{# -------------------------------Optional: Define available tools------------------------------- #}{%- if tools %}{%- for tool in tools %}{{- "\x11" -}} {# Start Of Tool Definition #}{{- "\x0E" -}} {# Start Of Attention Block #}{{- tool['function']['name'] + '\n' -}}{# Loop through all argument names and their descriptions; emits one "name: description" line per argument and requires every property to carry a 'description' string #}{%- for argument in tool['function']['parameters']['properties'] %}{{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' -}}{%- endfor %}{{- '\x0F' -}} {# End Of Attention Block #}{%- endfor %}{%- endif %}{# -------------------------------Main conversation message loop------------------------------- #}{% for message in messages %}{{- "\x01" -}} {# Start Of Text Block #}{# Print the role tag, e.g. user or assistant, followed by a newline #}{{- message.role + "\n" -}}{# If not the "assistant", open an attention block around the message body #}{% if message.role != "assistant" %}{{- "\x0E" -}}{% endif %}{# If the message contains normal content, print it #}{% if message.content %}{{- message.content -}}{% endif %}{# If the assistant called any tools, print those tool calls #}{% if message.tool_calls %}{% for call in message.tool_calls %}{{- "\x1A" -}} {# Start Of Tool Call #}{# Rendered as: type, space, function name, then the arguments in parentheses; the tojson filter applies to call.function.arguments only, the surrounding strings are concatenated afterwards #}{{- call.type + " " + call.function.name+ "(" + call.function.arguments | tojson + ")" -}}{{- "\x1B" -}} {# End Of Tool Call #}{% endfor %}{% endif %}{# If not the "assistant", close the attention block opened before the message body #}{% if message.role != "assistant" %}{{- "\x0F" -}}{% endif %}{{- "\x17" -}} {# End Of Text Block #}{{- "\n" -}} {# Newline after each message #}{% endfor %}{# -------------------------------Add a final assistant prompt marker to continue generation------------------------------- #}{# Emits the Start Of Text Block marker and the "assistant" role line with no End Of Text Block, leaving the block open for the model to complete #}{%- if add_generation_prompt %}{{- "\x01assistant\n" -}}{%- endif %}
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 2,
8
+ "dtype": "float32",
9
+ "eos_token_id": 3,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 256,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 640,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 4,
19
+ "num_hidden_layers": 4,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 0,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.3",
28
+ "use_cache": true,
29
+ "vocab_size": 256
30
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": [
5
+ 3,
6
+ 2
7
+ ],
8
+ "pad_token_id": 0,
9
+ "transformers_version": "4.57.3"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af0c67f5b41359f3c33dcc00a8756fa0dbb2254abb6fb8f7e324b2cddb671f7e
3
+ size 12596248
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "\u0002",
3
+ "eos_token": "\u0003",
4
+ "pad_token": "\u0000"
5
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "\u0000",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "2": {
12
+ "content": "\u0002",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "3": {
20
+ "content": "\u0003",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "bos_token": "\u0002",
29
+ "bos_token_id": 2,
30
+ "clean_up_tokenization_spaces": false,
31
+ "eos_token": "\u0003",
32
+ "eos_token_id": 3,
33
+ "extra_special_tokens": {},
34
+ "model_max_length": 1000000000000000019884624838656,
35
+ "pad_token": "\u0000",
36
+ "pad_token_id": 0,
37
+ "tokenizer_class": "UTF8Tokenizer"
38
+ }
train_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "num_input_tokens_seen": 655360000,
4
+ "total_flos": 1.23866185728e+16,
5
+ "train_loss": 1.4302955017089845,
6
+ "train_runtime": 4951.1263,
7
+ "train_samples": 0,
8
+ "train_samples_per_second": 517.054,
9
+ "train_steps_per_second": 4.039
10
+ }
trainer_state.json ADDED
@@ -0,0 +1,2044 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 20000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.005,
14
+ "grad_norm": 0.5784105658531189,
15
+ "learning_rate": 0.00014849999999999998,
16
+ "loss": 4.9555,
17
+ "num_input_tokens_seen": 3276800,
18
+ "step": 100,
19
+ "train_runtime": 30.2422,
20
+ "train_tokens_per_second": 108352.003
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "grad_norm": 1.457616925239563,
25
+ "learning_rate": 0.0002985,
26
+ "loss": 3.0038,
27
+ "num_input_tokens_seen": 6553600,
28
+ "step": 200,
29
+ "train_runtime": 55.436,
30
+ "train_tokens_per_second": 118219.297
31
+ },
32
+ {
33
+ "epoch": 0.015,
34
+ "grad_norm": 2.997375249862671,
35
+ "learning_rate": 0.0002999814948722491,
36
+ "loss": 2.4591,
37
+ "num_input_tokens_seen": 9830400,
38
+ "step": 300,
39
+ "train_runtime": 86.3647,
40
+ "train_tokens_per_second": 113824.237
41
+ },
42
+ {
43
+ "epoch": 0.02,
44
+ "grad_norm": 2.3580925464630127,
45
+ "learning_rate": 0.0002999252345933521,
46
+ "loss": 2.147,
47
+ "num_input_tokens_seen": 13107200,
48
+ "step": 400,
49
+ "train_runtime": 109.562,
50
+ "train_tokens_per_second": 119632.766
51
+ },
52
+ {
53
+ "epoch": 0.025,
54
+ "grad_norm": 2.9443857669830322,
55
+ "learning_rate": 0.000299831231438409,
56
+ "loss": 1.9844,
57
+ "num_input_tokens_seen": 16384000,
58
+ "step": 500,
59
+ "train_runtime": 133.562,
60
+ "train_tokens_per_second": 122669.618
61
+ },
62
+ {
63
+ "epoch": 0.03,
64
+ "grad_norm": 2.476471185684204,
65
+ "learning_rate": 0.0002996995090722112,
66
+ "loss": 1.8654,
67
+ "num_input_tokens_seen": 19660800,
68
+ "step": 600,
69
+ "train_runtime": 157.7502,
70
+ "train_tokens_per_second": 124632.515
71
+ },
72
+ {
73
+ "epoch": 0.035,
74
+ "grad_norm": 3.6577234268188477,
75
+ "learning_rate": 0.00029953010065516004,
76
+ "loss": 1.8002,
77
+ "num_input_tokens_seen": 22937600,
78
+ "step": 700,
79
+ "train_runtime": 182.0366,
80
+ "train_tokens_per_second": 126005.43
81
+ },
82
+ {
83
+ "epoch": 0.04,
84
+ "grad_norm": 2.630089044570923,
85
+ "learning_rate": 0.0002993230488349186,
86
+ "loss": 1.7327,
87
+ "num_input_tokens_seen": 26214400,
88
+ "step": 800,
89
+ "train_runtime": 205.9679,
90
+ "train_tokens_per_second": 127274.199
91
+ },
92
+ {
93
+ "epoch": 0.045,
94
+ "grad_norm": 2.3071346282958984,
95
+ "learning_rate": 0.00029907840573567524,
96
+ "loss": 1.7033,
97
+ "num_input_tokens_seen": 29491200,
98
+ "step": 900,
99
+ "train_runtime": 229.9223,
100
+ "train_tokens_per_second": 128265.936
101
+ },
102
+ {
103
+ "epoch": 0.05,
104
+ "grad_norm": 2.2288811206817627,
105
+ "learning_rate": 0.00029879623294502204,
106
+ "loss": 1.6789,
107
+ "num_input_tokens_seen": 32768000,
108
+ "step": 1000,
109
+ "train_runtime": 258.3246,
110
+ "train_tokens_per_second": 126848.171
111
+ },
112
+ {
113
+ "epoch": 0.055,
114
+ "grad_norm": 2.1029598712921143,
115
+ "learning_rate": 0.00029847660149844995,
116
+ "loss": 1.6491,
117
+ "num_input_tokens_seen": 36044800,
118
+ "step": 1100,
119
+ "train_runtime": 282.4776,
120
+ "train_tokens_per_second": 127602.309
121
+ },
122
+ {
123
+ "epoch": 0.06,
124
+ "grad_norm": 1.4104539155960083,
125
+ "learning_rate": 0.0002981195918614664,
126
+ "loss": 1.6318,
127
+ "num_input_tokens_seen": 39321600,
128
+ "step": 1200,
129
+ "train_runtime": 306.4453,
130
+ "train_tokens_per_second": 128315.216
131
+ },
132
+ {
133
+ "epoch": 0.065,
134
+ "grad_norm": 1.5436947345733643,
135
+ "learning_rate": 0.0002977252939093383,
136
+ "loss": 1.6277,
137
+ "num_input_tokens_seen": 42598400,
138
+ "step": 1300,
139
+ "train_runtime": 330.6399,
140
+ "train_tokens_per_second": 128836.255
141
+ },
142
+ {
143
+ "epoch": 0.07,
144
+ "grad_norm": 1.8562994003295898,
145
+ "learning_rate": 0.00029729380690446654,
146
+ "loss": 1.6016,
147
+ "num_input_tokens_seen": 45875200,
148
+ "step": 1400,
149
+ "train_runtime": 354.9473,
150
+ "train_tokens_per_second": 129245.099
151
+ },
152
+ {
153
+ "epoch": 0.075,
154
+ "grad_norm": 1.7223314046859741,
155
+ "learning_rate": 0.000296825239471397,
156
+ "loss": 1.5832,
157
+ "num_input_tokens_seen": 49152000,
158
+ "step": 1500,
159
+ "train_runtime": 380.0343,
160
+ "train_tokens_per_second": 129335.708
161
+ },
162
+ {
163
+ "epoch": 0.08,
164
+ "grad_norm": 1.294960618019104,
165
+ "learning_rate": 0.00029631970956947514,
166
+ "loss": 1.5697,
167
+ "num_input_tokens_seen": 52428800,
168
+ "step": 1600,
169
+ "train_runtime": 403.7379,
170
+ "train_tokens_per_second": 129858.509
171
+ },
172
+ {
173
+ "epoch": 0.085,
174
+ "grad_norm": 1.6293072700500488,
175
+ "learning_rate": 0.0002957773444631505,
176
+ "loss": 1.5668,
177
+ "num_input_tokens_seen": 55705600,
178
+ "step": 1700,
179
+ "train_runtime": 427.714,
180
+ "train_tokens_per_second": 130240.289
181
+ },
182
+ {
183
+ "epoch": 0.09,
184
+ "grad_norm": 1.2281590700149536,
185
+ "learning_rate": 0.000295198280689938,
186
+ "loss": 1.5866,
187
+ "num_input_tokens_seen": 58982400,
188
+ "step": 1800,
189
+ "train_runtime": 452.5238,
190
+ "train_tokens_per_second": 130340.981
191
+ },
192
+ {
193
+ "epoch": 0.095,
194
+ "grad_norm": 1.2015036344528198,
195
+ "learning_rate": 0.000294582664026046,
196
+ "loss": 1.5497,
197
+ "num_input_tokens_seen": 62259200,
198
+ "step": 1900,
199
+ "train_runtime": 476.482,
200
+ "train_tokens_per_second": 130664.34
201
+ },
202
+ {
203
+ "epoch": 0.1,
204
+ "grad_norm": 1.569770336151123,
205
+ "learning_rate": 0.00029393064944967733,
206
+ "loss": 1.5564,
207
+ "num_input_tokens_seen": 65536000,
208
+ "step": 2000,
209
+ "train_runtime": 505.6146,
210
+ "train_tokens_per_second": 129616.522
211
+ },
212
+ {
213
+ "epoch": 0.105,
214
+ "grad_norm": 1.0509306192398071,
215
+ "learning_rate": 0.0002932424011020149,
216
+ "loss": 1.5755,
217
+ "num_input_tokens_seen": 68812800,
218
+ "step": 2100,
219
+ "train_runtime": 529.3779,
220
+ "train_tokens_per_second": 129988.045
221
+ },
222
+ {
223
+ "epoch": 0.11,
224
+ "grad_norm": 1.2682929039001465,
225
+ "learning_rate": 0.0002925180922458996,
226
+ "loss": 1.532,
227
+ "num_input_tokens_seen": 72089600,
228
+ "step": 2200,
229
+ "train_runtime": 553.4037,
230
+ "train_tokens_per_second": 130265.845
231
+ },
232
+ {
233
+ "epoch": 0.115,
234
+ "grad_norm": 0.8587242960929871,
235
+ "learning_rate": 0.00029175790522221253,
236
+ "loss": 1.5221,
237
+ "num_input_tokens_seen": 75366400,
238
+ "step": 2300,
239
+ "train_runtime": 577.2313,
240
+ "train_tokens_per_second": 130565.344
241
+ },
242
+ {
243
+ "epoch": 0.12,
244
+ "grad_norm": 0.9015905261039734,
245
+ "learning_rate": 0.00029096203140397157,
246
+ "loss": 1.5253,
247
+ "num_input_tokens_seen": 78643200,
248
+ "step": 2400,
249
+ "train_runtime": 601.9877,
250
+ "train_tokens_per_second": 130639.212
251
+ },
252
+ {
253
+ "epoch": 0.125,
254
+ "grad_norm": 0.9860896468162537,
255
+ "learning_rate": 0.0002901306711481544,
256
+ "loss": 1.518,
257
+ "num_input_tokens_seen": 81920000,
258
+ "step": 2500,
259
+ "train_runtime": 625.982,
260
+ "train_tokens_per_second": 130866.382
261
+ },
262
+ {
263
+ "epoch": 0.13,
264
+ "grad_norm": 0.9716039896011353,
265
+ "learning_rate": 0.00028926403374525953,
266
+ "loss": 1.5216,
267
+ "num_input_tokens_seen": 85196800,
268
+ "step": 2600,
269
+ "train_runtime": 649.7987,
270
+ "train_tokens_per_second": 131112.604
271
+ },
272
+ {
273
+ "epoch": 0.135,
274
+ "grad_norm": 1.115633487701416,
275
+ "learning_rate": 0.00028836233736661843,
276
+ "loss": 1.504,
277
+ "num_input_tokens_seen": 88473600,
278
+ "step": 2700,
279
+ "train_runtime": 678.8811,
280
+ "train_tokens_per_second": 130322.683
281
+ },
282
+ {
283
+ "epoch": 0.14,
284
+ "grad_norm": 0.8320448398590088,
285
+ "learning_rate": 0.0002874258090094726,
286
+ "loss": 1.5041,
287
+ "num_input_tokens_seen": 91750400,
288
+ "step": 2800,
289
+ "train_runtime": 703.1036,
290
+ "train_tokens_per_second": 130493.433
291
+ },
292
+ {
293
+ "epoch": 0.145,
294
+ "grad_norm": 1.2689387798309326,
295
+ "learning_rate": 0.00028645468443982747,
296
+ "loss": 1.4988,
297
+ "num_input_tokens_seen": 95027200,
298
+ "step": 2900,
299
+ "train_runtime": 727.3731,
300
+ "train_tokens_per_second": 130644.376
301
+ },
302
+ {
303
+ "epoch": 0.15,
304
+ "grad_norm": 0.9444186091423035,
305
+ "learning_rate": 0.0002854492081331002,
306
+ "loss": 1.4737,
307
+ "num_input_tokens_seen": 98304000,
308
+ "step": 3000,
309
+ "train_runtime": 750.7174,
310
+ "train_tokens_per_second": 130946.747
311
+ },
312
+ {
313
+ "epoch": 0.155,
314
+ "grad_norm": 0.9993096590042114,
315
+ "learning_rate": 0.00028440963321257385,
316
+ "loss": 1.4902,
317
+ "num_input_tokens_seen": 101580800,
318
+ "step": 3100,
319
+ "train_runtime": 774.175,
320
+ "train_tokens_per_second": 131211.683
321
+ },
322
+ {
323
+ "epoch": 0.16,
324
+ "grad_norm": 0.8590114712715149,
325
+ "learning_rate": 0.00028333622138567544,
326
+ "loss": 1.4691,
327
+ "num_input_tokens_seen": 104857600,
328
+ "step": 3200,
329
+ "train_runtime": 804.4318,
330
+ "train_tokens_per_second": 130349.895
331
+ },
332
+ {
333
+ "epoch": 0.165,
334
+ "grad_norm": 0.8465983867645264,
335
+ "learning_rate": 0.000282229242878092,
336
+ "loss": 1.4961,
337
+ "num_input_tokens_seen": 108134400,
338
+ "step": 3300,
339
+ "train_runtime": 822.6712,
340
+ "train_tokens_per_second": 131443.042
341
+ },
342
+ {
343
+ "epoch": 0.17,
344
+ "grad_norm": 0.8805415630340576,
345
+ "learning_rate": 0.00028108897636574284,
346
+ "loss": 1.5042,
347
+ "num_input_tokens_seen": 111411200,
348
+ "step": 3400,
349
+ "train_runtime": 847.8836,
350
+ "train_tokens_per_second": 131399.175
351
+ },
352
+ {
353
+ "epoch": 0.175,
354
+ "grad_norm": 0.8398001194000244,
355
+ "learning_rate": 0.0002799157089046248,
356
+ "loss": 1.4675,
357
+ "num_input_tokens_seen": 114688000,
358
+ "step": 3500,
359
+ "train_runtime": 871.6987,
360
+ "train_tokens_per_second": 131568.398
361
+ },
362
+ {
363
+ "epoch": 0.18,
364
+ "grad_norm": 0.7862799763679504,
365
+ "learning_rate": 0.00027870973585854665,
366
+ "loss": 1.4679,
367
+ "num_input_tokens_seen": 117964800,
368
+ "step": 3600,
369
+ "train_runtime": 896.6814,
370
+ "train_tokens_per_second": 131557.096
371
+ },
372
+ {
373
+ "epoch": 0.185,
374
+ "grad_norm": 0.7606053352355957,
375
+ "learning_rate": 0.00027747136082477365,
376
+ "loss": 1.4592,
377
+ "num_input_tokens_seen": 121241600,
378
+ "step": 3700,
379
+ "train_runtime": 925.0842,
380
+ "train_tokens_per_second": 131060.069
381
+ },
382
+ {
383
+ "epoch": 0.19,
384
+ "grad_norm": 0.7865683436393738,
385
+ "learning_rate": 0.00027620089555759807,
386
+ "loss": 1.463,
387
+ "num_input_tokens_seen": 124518400,
388
+ "step": 3800,
389
+ "train_runtime": 948.479,
390
+ "train_tokens_per_second": 131282.194
391
+ },
392
+ {
393
+ "epoch": 0.195,
394
+ "grad_norm": 0.8784666657447815,
395
+ "learning_rate": 0.0002748986598898566,
396
+ "loss": 1.4662,
397
+ "num_input_tokens_seen": 127795200,
398
+ "step": 3900,
399
+ "train_runtime": 972.3642,
400
+ "train_tokens_per_second": 131427.304
401
+ },
402
+ {
403
+ "epoch": 0.2,
404
+ "grad_norm": 0.7535898685455322,
405
+ "learning_rate": 0.00027356498165241475,
406
+ "loss": 1.4661,
407
+ "num_input_tokens_seen": 131072000,
408
+ "step": 4000,
409
+ "train_runtime": 995.9673,
410
+ "train_tokens_per_second": 131602.716
411
+ },
412
+ {
413
+ "epoch": 0.205,
414
+ "grad_norm": 0.7886632084846497,
415
+ "learning_rate": 0.00027220019659163653,
416
+ "loss": 1.4523,
417
+ "num_input_tokens_seen": 134348800,
418
+ "step": 4100,
419
+ "train_runtime": 1025.0973,
420
+ "train_tokens_per_second": 131059.561
421
+ },
422
+ {
423
+ "epoch": 0.21,
424
+ "grad_norm": 0.7285377383232117,
425
+ "learning_rate": 0.00027080464828486246,
426
+ "loss": 1.4382,
427
+ "num_input_tokens_seen": 137625600,
428
+ "step": 4200,
429
+ "train_runtime": 1049.2312,
430
+ "train_tokens_per_second": 131168.036
431
+ },
432
+ {
433
+ "epoch": 0.215,
434
+ "grad_norm": 0.6270678639411926,
435
+ "learning_rate": 0.0002693786880539158,
436
+ "loss": 1.4381,
437
+ "num_input_tokens_seen": 140902400,
438
+ "step": 4300,
439
+ "train_runtime": 1072.6988,
440
+ "train_tokens_per_second": 131353.186
441
+ },
442
+ {
443
+ "epoch": 0.22,
444
+ "grad_norm": 0.9621595144271851,
445
+ "learning_rate": 0.0002679226748766589,
446
+ "loss": 1.456,
447
+ "num_input_tokens_seen": 144179200,
448
+ "step": 4400,
449
+ "train_runtime": 1097.9055,
450
+ "train_tokens_per_second": 131322.044
451
+ },
452
+ {
453
+ "epoch": 0.225,
454
+ "grad_norm": 0.6201126575469971,
455
+ "learning_rate": 0.0002664369752966228,
456
+ "loss": 1.4744,
457
+ "num_input_tokens_seen": 147456000,
458
+ "step": 4500,
459
+ "train_runtime": 1122.0946,
460
+ "train_tokens_per_second": 131411.382
461
+ },
462
+ {
463
+ "epoch": 0.23,
464
+ "grad_norm": 0.8038685917854309,
465
+ "learning_rate": 0.00026492196333073164,
466
+ "loss": 1.4537,
467
+ "num_input_tokens_seen": 150732800,
468
+ "step": 4600,
469
+ "train_runtime": 1146.3348,
470
+ "train_tokens_per_second": 131491.08
471
+ },
472
+ {
473
+ "epoch": 0.235,
474
+ "grad_norm": 0.8983737826347351,
475
+ "learning_rate": 0.0002633780203751459,
476
+ "loss": 1.4298,
477
+ "num_input_tokens_seen": 154009600,
478
+ "step": 4700,
479
+ "train_runtime": 1170.6825,
480
+ "train_tokens_per_second": 131555.396
481
+ },
482
+ {
483
+ "epoch": 0.24,
484
+ "grad_norm": 0.9596767425537109,
485
+ "learning_rate": 0.0002618055351092481,
486
+ "loss": 1.4341,
487
+ "num_input_tokens_seen": 157286400,
488
+ "step": 4800,
489
+ "train_runtime": 1194.1389,
490
+ "train_tokens_per_second": 131715.33
491
+ },
492
+ {
493
+ "epoch": 0.245,
494
+ "grad_norm": 0.734665036201477,
495
+ "learning_rate": 0.0002602049033977945,
496
+ "loss": 1.4458,
497
+ "num_input_tokens_seen": 160563200,
498
+ "step": 4900,
499
+ "train_runtime": 1217.5015,
500
+ "train_tokens_per_second": 131879.268
501
+ },
502
+ {
503
+ "epoch": 0.25,
504
+ "grad_norm": 0.5745177268981934,
505
+ "learning_rate": 0.0002585765281912588,
506
+ "loss": 1.444,
507
+ "num_input_tokens_seen": 163840000,
508
+ "step": 5000,
509
+ "train_runtime": 1247.5321,
510
+ "train_tokens_per_second": 131331.29
511
+ },
512
+ {
513
+ "epoch": 0.255,
514
+ "grad_norm": 0.7568549513816833,
515
+ "learning_rate": 0.00025692081942439113,
516
+ "loss": 1.4307,
517
+ "num_input_tokens_seen": 167116800,
518
+ "step": 5100,
519
+ "train_runtime": 1270.9694,
520
+ "train_tokens_per_second": 131487.668
521
+ },
522
+ {
523
+ "epoch": 0.26,
524
+ "grad_norm": 0.7054229378700256,
525
+ "learning_rate": 0.0002552381939130192,
526
+ "loss": 1.4325,
527
+ "num_input_tokens_seen": 170393600,
528
+ "step": 5200,
529
+ "train_runtime": 1294.8044,
530
+ "train_tokens_per_second": 131597.945
531
+ },
532
+ {
533
+ "epoch": 0.265,
534
+ "grad_norm": 0.5529087781906128,
535
+ "learning_rate": 0.00025352907524911716,
536
+ "loss": 1.4271,
537
+ "num_input_tokens_seen": 173670400,
538
+ "step": 5300,
539
+ "train_runtime": 1318.1084,
540
+ "train_tokens_per_second": 131757.299
541
+ },
542
+ {
543
+ "epoch": 0.27,
544
+ "grad_norm": 0.669465184211731,
545
+ "learning_rate": 0.00025179389369416885,
546
+ "loss": 1.4056,
547
+ "num_input_tokens_seen": 176947200,
548
+ "step": 5400,
549
+ "train_runtime": 1341.5822,
550
+ "train_tokens_per_second": 131894.417
551
+ },
552
+ {
553
+ "epoch": 0.275,
554
+ "grad_norm": 0.662200927734375,
555
+ "learning_rate": 0.0002500330860708513,
556
+ "loss": 1.4634,
557
+ "num_input_tokens_seen": 180224000,
558
+ "step": 5500,
559
+ "train_runtime": 1366.4603,
560
+ "train_tokens_per_second": 131891.134
561
+ },
562
+ {
563
+ "epoch": 0.28,
564
+ "grad_norm": 0.8842505216598511,
565
+ "learning_rate": 0.00024824709565306733,
566
+ "loss": 1.422,
567
+ "num_input_tokens_seen": 183500800,
568
+ "step": 5600,
569
+ "train_runtime": 1390.3365,
570
+ "train_tokens_per_second": 131983.017
571
+ },
572
+ {
573
+ "epoch": 0.285,
574
+ "grad_norm": 0.5874491930007935,
575
+ "learning_rate": 0.00024643637205435363,
576
+ "loss": 1.4204,
577
+ "num_input_tokens_seen": 186777600,
578
+ "step": 5700,
579
+ "train_runtime": 1420.1755,
580
+ "train_tokens_per_second": 131517.271
581
+ },
582
+ {
583
+ "epoch": 0.29,
584
+ "grad_norm": 0.6088038682937622,
585
+ "learning_rate": 0.00024460137111469296,
586
+ "loss": 1.4286,
587
+ "num_input_tokens_seen": 190054400,
588
+ "step": 5800,
589
+ "train_runtime": 1443.4282,
590
+ "train_tokens_per_second": 131668.763
591
+ },
592
+ {
593
+ "epoch": 0.295,
594
+ "grad_norm": 0.5753700733184814,
595
+ "learning_rate": 0.00024274255478575854,
596
+ "loss": 1.3925,
597
+ "num_input_tokens_seen": 193331200,
598
+ "step": 5900,
599
+ "train_runtime": 1467.8392,
600
+ "train_tokens_per_second": 131711.428
601
+ },
602
+ {
603
+ "epoch": 0.3,
604
+ "grad_norm": 1.0680171251296997,
605
+ "learning_rate": 0.00024086039101462085,
606
+ "loss": 1.4237,
607
+ "num_input_tokens_seen": 196608000,
608
+ "step": 6000,
609
+ "train_runtime": 1492.1963,
610
+ "train_tokens_per_second": 131757.468
611
+ },
612
+ {
613
+ "epoch": 0.305,
614
+ "grad_norm": 0.6099634170532227,
615
+ "learning_rate": 0.00023895535362594388,
616
+ "loss": 1.4171,
617
+ "num_input_tokens_seen": 199884800,
618
+ "step": 6100,
619
+ "train_runtime": 1515.7396,
620
+ "train_tokens_per_second": 131872.784
621
+ },
622
+ {
623
+ "epoch": 0.31,
624
+ "grad_norm": 0.6650487184524536,
625
+ "learning_rate": 0.0002370279222027026,
626
+ "loss": 1.4142,
627
+ "num_input_tokens_seen": 203161600,
628
+ "step": 6200,
629
+ "train_runtime": 1539.3684,
630
+ "train_tokens_per_second": 131977.242
631
+ },
632
+ {
633
+ "epoch": 0.315,
634
+ "grad_norm": 0.6537796258926392,
635
+ "learning_rate": 0.00023507858196545036,
636
+ "loss": 1.4055,
637
+ "num_input_tokens_seen": 206438400,
638
+ "step": 6300,
639
+ "train_runtime": 1569.0555,
640
+ "train_tokens_per_second": 131568.574
641
+ },
642
+ {
643
+ "epoch": 0.32,
644
+ "grad_norm": 0.9217991828918457,
645
+ "learning_rate": 0.00023310782365016728,
646
+ "loss": 1.4001,
647
+ "num_input_tokens_seen": 209715200,
648
+ "step": 6400,
649
+ "train_runtime": 1593.9139,
650
+ "train_tokens_per_second": 131572.477
651
+ },
652
+ {
653
+ "epoch": 0.325,
654
+ "grad_norm": 0.5520761609077454,
655
+ "learning_rate": 0.00023111614338472018,
656
+ "loss": 1.3995,
657
+ "num_input_tokens_seen": 212992000,
658
+ "step": 6500,
659
+ "train_runtime": 1618.0503,
660
+ "train_tokens_per_second": 131634.969
661
+ },
662
+ {
663
+ "epoch": 0.33,
664
+ "grad_norm": 0.6622788906097412,
665
+ "learning_rate": 0.00022910404256396485,
666
+ "loss": 1.4155,
667
+ "num_input_tokens_seen": 216268800,
668
+ "step": 6600,
669
+ "train_runtime": 1641.7748,
670
+ "train_tokens_per_second": 131728.667
671
+ },
672
+ {
673
+ "epoch": 0.335,
674
+ "grad_norm": 0.5474684834480286,
675
+ "learning_rate": 0.00022707202772352265,
676
+ "loss": 1.41,
677
+ "num_input_tokens_seen": 219545600,
678
+ "step": 6700,
679
+ "train_runtime": 1665.6292,
680
+ "train_tokens_per_second": 131809.411
681
+ },
682
+ {
683
+ "epoch": 0.34,
684
+ "grad_norm": 0.631249189376831,
685
+ "learning_rate": 0.00022502061041226298,
686
+ "loss": 1.3895,
687
+ "num_input_tokens_seen": 222822400,
688
+ "step": 6800,
689
+ "train_runtime": 1689.8435,
690
+ "train_tokens_per_second": 131859.781
691
+ },
692
+ {
693
+ "epoch": 0.345,
694
+ "grad_norm": 0.6114600300788879,
695
+ "learning_rate": 0.00022295030706352356,
696
+ "loss": 1.4042,
697
+ "num_input_tokens_seen": 226099200,
698
+ "step": 6900,
699
+ "train_runtime": 1713.7744,
700
+ "train_tokens_per_second": 131930.552
701
+ },
702
+ {
703
+ "epoch": 0.35,
704
+ "grad_norm": 0.5166604518890381,
705
+ "learning_rate": 0.00022086163886510086,
706
+ "loss": 1.3822,
707
+ "num_input_tokens_seen": 229376000,
708
+ "step": 7000,
709
+ "train_runtime": 1737.7511,
710
+ "train_tokens_per_second": 131995.885
711
+ },
712
+ {
713
+ "epoch": 0.355,
714
+ "grad_norm": 0.7250663638114929,
715
+ "learning_rate": 0.00021875513162804427,
716
+ "loss": 1.4204,
717
+ "num_input_tokens_seen": 232652800,
718
+ "step": 7100,
719
+ "train_runtime": 1762.0205,
720
+ "train_tokens_per_second": 132037.511
721
+ },
722
+ {
723
+ "epoch": 0.36,
724
+ "grad_norm": 0.7085418701171875,
725
+ "learning_rate": 0.00021663131565428554,
726
+ "loss": 1.3858,
727
+ "num_input_tokens_seen": 235929600,
728
+ "step": 7200,
729
+ "train_runtime": 1785.6176,
730
+ "train_tokens_per_second": 132127.731
731
+ },
732
+ {
733
+ "epoch": 0.365,
734
+ "grad_norm": 0.7583802938461304,
735
+ "learning_rate": 0.00021449072560313843,
736
+ "loss": 1.3885,
737
+ "num_input_tokens_seen": 239206400,
738
+ "step": 7300,
739
+ "train_runtime": 1814.7871,
740
+ "train_tokens_per_second": 131809.624
741
+ },
742
+ {
743
+ "epoch": 0.37,
744
+ "grad_norm": 0.5545434951782227,
745
+ "learning_rate": 0.00021233390035670105,
746
+ "loss": 1.3765,
747
+ "num_input_tokens_seen": 242483200,
748
+ "step": 7400,
749
+ "train_runtime": 1838.4345,
750
+ "train_tokens_per_second": 131896.568
751
+ },
752
+ {
753
+ "epoch": 0.375,
754
+ "grad_norm": 0.7899689078330994,
755
+ "learning_rate": 0.00021016138288419497,
756
+ "loss": 1.394,
757
+ "num_input_tokens_seen": 245760000,
758
+ "step": 7500,
759
+ "train_runtime": 1862.6752,
760
+ "train_tokens_per_second": 131939.27
761
+ },
762
+ {
763
+ "epoch": 0.38,
764
+ "grad_norm": 0.668602466583252,
765
+ "learning_rate": 0.0002079737201052759,
766
+ "loss": 1.3727,
767
+ "num_input_tokens_seen": 249036800,
768
+ "step": 7600,
769
+ "train_runtime": 1887.7021,
770
+ "train_tokens_per_second": 131925.902
771
+ },
772
+ {
773
+ "epoch": 0.385,
774
+ "grad_norm": 0.5739292502403259,
775
+ "learning_rate": 0.0002057714627523492,
776
+ "loss": 1.3855,
777
+ "num_input_tokens_seen": 252313600,
778
+ "step": 7700,
779
+ "train_runtime": 1911.7352,
780
+ "train_tokens_per_second": 131981.456
781
+ },
782
+ {
783
+ "epoch": 0.39,
784
+ "grad_norm": 0.5914629697799683,
785
+ "learning_rate": 0.00020355516523192651,
786
+ "loss": 1.4026,
787
+ "num_input_tokens_seen": 255590400,
788
+ "step": 7800,
789
+ "train_runtime": 1935.658,
790
+ "train_tokens_per_second": 132043.16
791
+ },
792
+ {
793
+ "epoch": 0.395,
794
+ "grad_norm": 0.6528250575065613,
795
+ "learning_rate": 0.00020132538548505634,
796
+ "loss": 1.3895,
797
+ "num_input_tokens_seen": 258867200,
798
+ "step": 7900,
799
+ "train_runtime": 1959.0455,
800
+ "train_tokens_per_second": 132139.453
801
+ },
802
+ {
803
+ "epoch": 0.4,
804
+ "grad_norm": 0.6278071403503418,
805
+ "learning_rate": 0.00019908268484686558,
806
+ "loss": 1.3902,
807
+ "num_input_tokens_seen": 262144000,
808
+ "step": 8000,
809
+ "train_runtime": 1982.9613,
810
+ "train_tokens_per_second": 132198.243
811
+ },
812
+ {
813
+ "epoch": 0.405,
814
+ "grad_norm": 1.0000662803649902,
815
+ "learning_rate": 0.00019682762790524657,
816
+ "loss": 1.3806,
817
+ "num_input_tokens_seen": 265420800,
818
+ "step": 8100,
819
+ "train_runtime": 2007.4327,
820
+ "train_tokens_per_second": 132219.03
821
+ },
822
+ {
823
+ "epoch": 0.41,
824
+ "grad_norm": 0.5706632137298584,
825
+ "learning_rate": 0.00019456078235872488,
826
+ "loss": 1.4018,
827
+ "num_input_tokens_seen": 268697600,
828
+ "step": 8200,
829
+ "train_runtime": 2037.3211,
830
+ "train_tokens_per_second": 131887.703
831
+ },
832
+ {
833
+ "epoch": 0.415,
834
+ "grad_norm": 0.52730792760849,
835
+ "learning_rate": 0.0001922827188735443,
836
+ "loss": 1.3884,
837
+ "num_input_tokens_seen": 271974400,
838
+ "step": 8300,
839
+ "train_runtime": 2061.1352,
840
+ "train_tokens_per_second": 131953.693
841
+ },
842
+ {
843
+ "epoch": 0.42,
844
+ "grad_norm": 0.7067908644676208,
845
+ "learning_rate": 0.00018999401094000473,
846
+ "loss": 1.3724,
847
+ "num_input_tokens_seen": 275251200,
848
+ "step": 8400,
849
+ "train_runtime": 2085.2036,
850
+ "train_tokens_per_second": 132002.073
851
+ },
852
+ {
853
+ "epoch": 0.425,
854
+ "grad_norm": 0.5584085583686829,
855
+ "learning_rate": 0.00018769523472808897,
856
+ "loss": 1.3828,
857
+ "num_input_tokens_seen": 278528000,
858
+ "step": 8500,
859
+ "train_runtime": 2108.8848,
860
+ "train_tokens_per_second": 132073.599
861
+ },
862
+ {
863
+ "epoch": 0.43,
864
+ "grad_norm": 0.7685338854789734,
865
+ "learning_rate": 0.0001853869689424151,
866
+ "loss": 1.384,
867
+ "num_input_tokens_seen": 281804800,
868
+ "step": 8600,
869
+ "train_runtime": 2133.0174,
870
+ "train_tokens_per_second": 132115.568
871
+ },
872
+ {
873
+ "epoch": 0.435,
874
+ "grad_norm": 0.71221923828125,
875
+ "learning_rate": 0.00018306979467655062,
876
+ "loss": 1.3768,
877
+ "num_input_tokens_seen": 285081600,
878
+ "step": 8700,
879
+ "train_runtime": 2157.3919,
880
+ "train_tokens_per_second": 132141.775
881
+ },
882
+ {
883
+ "epoch": 0.44,
884
+ "grad_norm": 0.587790310382843,
885
+ "learning_rate": 0.0001807442952667254,
886
+ "loss": 1.3646,
887
+ "num_input_tokens_seen": 288358400,
888
+ "step": 8800,
889
+ "train_runtime": 2185.9477,
890
+ "train_tokens_per_second": 131914.592
891
+ },
892
+ {
893
+ "epoch": 0.445,
894
+ "grad_norm": 0.6176910400390625,
895
+ "learning_rate": 0.00017841105614497952,
896
+ "loss": 1.3793,
897
+ "num_input_tokens_seen": 291635200,
898
+ "step": 8900,
899
+ "train_runtime": 2210.0556,
900
+ "train_tokens_per_second": 131958.309
901
+ },
902
+ {
903
+ "epoch": 0.45,
904
+ "grad_norm": 0.5924071073532104,
905
+ "learning_rate": 0.00017607066469178443,
906
+ "loss": 1.3727,
907
+ "num_input_tokens_seen": 294912000,
908
+ "step": 9000,
909
+ "train_runtime": 2233.2228,
910
+ "train_tokens_per_second": 132056.685
911
+ },
912
+ {
913
+ "epoch": 0.455,
914
+ "grad_norm": 0.6892787218093872,
915
+ "learning_rate": 0.00017372371008817256,
916
+ "loss": 1.3598,
917
+ "num_input_tokens_seen": 298188800,
918
+ "step": 9100,
919
+ "train_runtime": 2258.9526,
920
+ "train_tokens_per_second": 132003.123
921
+ },
922
+ {
923
+ "epoch": 0.46,
924
+ "grad_norm": 0.5958895087242126,
925
+ "learning_rate": 0.00017137078316741442,
926
+ "loss": 1.3913,
927
+ "num_input_tokens_seen": 301465600,
928
+ "step": 9200,
929
+ "train_runtime": 2283.0976,
930
+ "train_tokens_per_second": 132042.364
931
+ },
932
+ {
933
+ "epoch": 0.465,
934
+ "grad_norm": 0.5727524757385254,
935
+ "learning_rate": 0.00016901247626627952,
936
+ "loss": 1.3556,
937
+ "num_input_tokens_seen": 304742400,
938
+ "step": 9300,
939
+ "train_runtime": 2307.8709,
940
+ "train_tokens_per_second": 132044.818
941
+ },
942
+ {
943
+ "epoch": 0.47,
944
+ "grad_norm": 0.7339876890182495,
945
+ "learning_rate": 0.00016664938307591853,
946
+ "loss": 1.352,
947
+ "num_input_tokens_seen": 308019200,
948
+ "step": 9400,
949
+ "train_runtime": 2330.8551,
950
+ "train_tokens_per_second": 132148.583
951
+ },
952
+ {
953
+ "epoch": 0.475,
954
+ "grad_norm": 0.6451588869094849,
955
+ "learning_rate": 0.00016428209849240496,
956
+ "loss": 1.3603,
957
+ "num_input_tokens_seen": 311296000,
958
+ "step": 9500,
959
+ "train_runtime": 2354.8224,
960
+ "train_tokens_per_second": 132195.107
961
+ },
962
+ {
963
+ "epoch": 0.48,
964
+ "grad_norm": 0.5640471577644348,
965
+ "learning_rate": 0.00016191121846697302,
966
+ "loss": 1.353,
967
+ "num_input_tokens_seen": 314572800,
968
+ "step": 9600,
969
+ "train_runtime": 2384.4134,
970
+ "train_tokens_per_second": 131928.799
971
+ },
972
+ {
973
+ "epoch": 0.485,
974
+ "grad_norm": 0.7020093202590942,
975
+ "learning_rate": 0.00015953733985599023,
976
+ "loss": 1.3514,
977
+ "num_input_tokens_seen": 317849600,
978
+ "step": 9700,
979
+ "train_runtime": 2408.1089,
980
+ "train_tokens_per_second": 131991.375
981
+ },
982
+ {
983
+ "epoch": 0.49,
984
+ "grad_norm": 0.5688868761062622,
985
+ "learning_rate": 0.000157161060270702,
986
+ "loss": 1.3626,
987
+ "num_input_tokens_seen": 321126400,
988
+ "step": 9800,
989
+ "train_runtime": 2431.721,
990
+ "train_tokens_per_second": 132057.253
991
+ },
992
+ {
993
+ "epoch": 0.495,
994
+ "grad_norm": 0.702392041683197,
995
+ "learning_rate": 0.00015478297792678616,
996
+ "loss": 1.3396,
997
+ "num_input_tokens_seen": 324403200,
998
+ "step": 9900,
999
+ "train_runtime": 2455.4838,
1000
+ "train_tokens_per_second": 132113.76
1001
+ },
1002
+ {
1003
+ "epoch": 0.5,
1004
+ "grad_norm": 0.6182683110237122,
1005
+ "learning_rate": 0.00015240369149375544,
1006
+ "loss": 1.3433,
1007
+ "num_input_tokens_seen": 327680000,
1008
+ "step": 10000,
1009
+ "train_runtime": 2479.369,
1010
+ "train_tokens_per_second": 132162.661
1011
+ },
1012
+ {
1013
+ "epoch": 0.505,
1014
+ "grad_norm": 0.7529293298721313,
1015
+ "learning_rate": 0.00015002379994424547,
1016
+ "loss": 1.3443,
1017
+ "num_input_tokens_seen": 330956800,
1018
+ "step": 10100,
1019
+ "train_runtime": 2504.1688,
1020
+ "train_tokens_per_second": 132162.336
1021
+ },
1022
+ {
1023
+ "epoch": 0.51,
1024
+ "grad_norm": 0.5510791540145874,
1025
+ "learning_rate": 0.00014764390240322691,
1026
+ "loss": 1.3602,
1027
+ "num_input_tokens_seen": 334233600,
1028
+ "step": 10200,
1029
+ "train_runtime": 2529.0077,
1030
+ "train_tokens_per_second": 132159.975
1031
+ },
1032
+ {
1033
+ "epoch": 0.515,
1034
+ "grad_norm": 0.6009634137153625,
1035
+ "learning_rate": 0.00014526459799717842,
1036
+ "loss": 1.3558,
1037
+ "num_input_tokens_seen": 337510400,
1038
+ "step": 10300,
1039
+ "train_runtime": 2553.0238,
1040
+ "train_tokens_per_second": 132200.255
1041
+ },
1042
+ {
1043
+ "epoch": 0.52,
1044
+ "grad_norm": 0.6439092755317688,
1045
+ "learning_rate": 0.0001428864857032605,
1046
+ "loss": 1.3526,
1047
+ "num_input_tokens_seen": 340787200,
1048
+ "step": 10400,
1049
+ "train_runtime": 2578.3654,
1050
+ "train_tokens_per_second": 132171.799
1051
+ },
1052
+ {
1053
+ "epoch": 0.525,
1054
+ "grad_norm": 0.6969897747039795,
1055
+ "learning_rate": 0.00014051016419852538,
1056
+ "loss": 1.3567,
1057
+ "num_input_tokens_seen": 344064000,
1058
+ "step": 10500,
1059
+ "train_runtime": 2607.8404,
1060
+ "train_tokens_per_second": 131934.456
1061
+ },
1062
+ {
1063
+ "epoch": 0.53,
1064
+ "grad_norm": 0.5618866682052612,
1065
+ "learning_rate": 0.00013813623170920404,
1066
+ "loss": 1.3636,
1067
+ "num_input_tokens_seen": 347340800,
1068
+ "step": 10600,
1069
+ "train_runtime": 2631.6887,
1070
+ "train_tokens_per_second": 131984.0
1071
+ },
1072
+ {
1073
+ "epoch": 0.535,
1074
+ "grad_norm": 0.7540677785873413,
1075
+ "learning_rate": 0.00013576528586010548,
1076
+ "loss": 1.3507,
1077
+ "num_input_tokens_seen": 350617600,
1078
+ "step": 10700,
1079
+ "train_runtime": 2655.342,
1080
+ "train_tokens_per_second": 132042.35
1081
+ },
1082
+ {
1083
+ "epoch": 0.54,
1084
+ "grad_norm": 0.5177021622657776,
1085
+ "learning_rate": 0.0001333979235241679,
1086
+ "loss": 1.3391,
1087
+ "num_input_tokens_seen": 353894400,
1088
+ "step": 10800,
1089
+ "train_runtime": 2678.6561,
1090
+ "train_tokens_per_second": 132116.4
1091
+ },
1092
+ {
1093
+ "epoch": 0.545,
1094
+ "grad_norm": 0.5168038606643677,
1095
+ "learning_rate": 0.0001310347406721994,
1096
+ "loss": 1.3596,
1097
+ "num_input_tokens_seen": 357171200,
1098
+ "step": 10900,
1099
+ "train_runtime": 2701.9577,
1100
+ "train_tokens_per_second": 132189.781
1101
+ },
1102
+ {
1103
+ "epoch": 0.55,
1104
+ "grad_norm": 0.5508900284767151,
1105
+ "learning_rate": 0.00012867633222284514,
1106
+ "loss": 1.3495,
1107
+ "num_input_tokens_seen": 360448000,
1108
+ "step": 11000,
1109
+ "train_runtime": 2730.6775,
1110
+ "train_tokens_per_second": 131999.474
1111
+ },
1112
+ {
1113
+ "epoch": 0.555,
1114
+ "grad_norm": 0.4799763560295105,
1115
+ "learning_rate": 0.0001263232918928202,
1116
+ "loss": 1.3444,
1117
+ "num_input_tokens_seen": 363724800,
1118
+ "step": 11100,
1119
+ "train_runtime": 2755.3984,
1120
+ "train_tokens_per_second": 132004.431
1121
+ },
1122
+ {
1123
+ "epoch": 0.56,
1124
+ "grad_norm": 0.5217195153236389,
1125
+ "learning_rate": 0.00012397621204744406,
1126
+ "loss": 1.3813,
1127
+ "num_input_tokens_seen": 367001600,
1128
+ "step": 11200,
1129
+ "train_runtime": 2780.3769,
1130
+ "train_tokens_per_second": 131997.07
1131
+ },
1132
+ {
1133
+ "epoch": 0.565,
1134
+ "grad_norm": 0.5274467468261719,
1135
+ "learning_rate": 0.00012163568355151628,
1136
+ "loss": 1.3503,
1137
+ "num_input_tokens_seen": 370278400,
1138
+ "step": 11300,
1139
+ "train_runtime": 2804.6323,
1140
+ "train_tokens_per_second": 132023.868
1141
+ },
1142
+ {
1143
+ "epoch": 0.57,
1144
+ "grad_norm": 0.5383314490318298,
1145
+ "learning_rate": 0.00011930229562056919,
1146
+ "loss": 1.3684,
1147
+ "num_input_tokens_seen": 373555200,
1148
+ "step": 11400,
1149
+ "train_runtime": 2828.7646,
1150
+ "train_tokens_per_second": 132055.952
1151
+ },
1152
+ {
1153
+ "epoch": 0.575,
1154
+ "grad_norm": 0.5812146663665771,
1155
+ "learning_rate": 0.00011697663567253592,
1156
+ "loss": 1.3536,
1157
+ "num_input_tokens_seen": 376832000,
1158
+ "step": 11500,
1159
+ "train_runtime": 2852.3871,
1160
+ "train_tokens_per_second": 132111.102
1161
+ },
1162
+ {
1163
+ "epoch": 0.58,
1164
+ "grad_norm": 0.6360482573509216,
1165
+ "learning_rate": 0.00011465928917987139,
1166
+ "loss": 1.3566,
1167
+ "num_input_tokens_seen": 380108800,
1168
+ "step": 11600,
1169
+ "train_runtime": 2875.7456,
1170
+ "train_tokens_per_second": 132177.479
1171
+ },
1172
+ {
1173
+ "epoch": 0.585,
1174
+ "grad_norm": 0.7270791530609131,
1175
+ "learning_rate": 0.00011235083952216253,
1176
+ "loss": 1.3634,
1177
+ "num_input_tokens_seen": 383385600,
1178
+ "step": 11700,
1179
+ "train_runtime": 2899.9756,
1180
+ "train_tokens_per_second": 132203.042
1181
+ },
1182
+ {
1183
+ "epoch": 0.59,
1184
+ "grad_norm": 0.6202597618103027,
1185
+ "learning_rate": 0.00011005186783926572,
1186
+ "loss": 1.3424,
1187
+ "num_input_tokens_seen": 386662400,
1188
+ "step": 11800,
1189
+ "train_runtime": 2923.5674,
1190
+ "train_tokens_per_second": 132257.049
1191
+ },
1192
+ {
1193
+ "epoch": 0.595,
1194
+ "grad_norm": 0.6157307624816895,
1195
+ "learning_rate": 0.00010776295288500768,
1196
+ "loss": 1.3455,
1197
+ "num_input_tokens_seen": 389939200,
1198
+ "step": 11900,
1199
+ "train_runtime": 2952.7095,
1200
+ "train_tokens_per_second": 132061.485
1201
+ },
1202
+ {
1203
+ "epoch": 0.6,
1204
+ "grad_norm": 0.5824424028396606,
1205
+ "learning_rate": 0.00010548467088148766,
1206
+ "loss": 1.3406,
1207
+ "num_input_tokens_seen": 393216000,
1208
+ "step": 12000,
1209
+ "train_runtime": 2975.865,
1210
+ "train_tokens_per_second": 132135.028
1211
+ },
1212
+ {
1213
+ "epoch": 0.605,
1214
+ "grad_norm": 0.5702685713768005,
1215
+ "learning_rate": 0.00010321759537401644,
1216
+ "loss": 1.3371,
1217
+ "num_input_tokens_seen": 396492800,
1218
+ "step": 12100,
1219
+ "train_runtime": 3000.5381,
1220
+ "train_tokens_per_second": 132140.564
1221
+ },
1222
+ {
1223
+ "epoch": 0.61,
1224
+ "grad_norm": 0.5175092816352844,
1225
+ "learning_rate": 0.0001009622970867292,
1226
+ "loss": 1.3338,
1227
+ "num_input_tokens_seen": 399769600,
1228
+ "step": 12200,
1229
+ "train_runtime": 3024.9092,
1230
+ "train_tokens_per_second": 132159.207
1231
+ },
1232
+ {
1233
+ "epoch": 0.615,
1234
+ "grad_norm": 0.6169431805610657,
1235
+ "learning_rate": 9.871934377890893e-05,
1236
+ "loss": 1.3471,
1237
+ "num_input_tokens_seen": 403046400,
1238
+ "step": 12300,
1239
+ "train_runtime": 3048.4879,
1240
+ "train_tokens_per_second": 132211.909
1241
+ },
1242
+ {
1243
+ "epoch": 0.62,
1244
+ "grad_norm": 0.512392520904541,
1245
+ "learning_rate": 9.648930010205619e-05,
1246
+ "loss": 1.3327,
1247
+ "num_input_tokens_seen": 406323200,
1248
+ "step": 12400,
1249
+ "train_runtime": 3072.2957,
1250
+ "train_tokens_per_second": 132253.935
1251
+ },
1252
+ {
1253
+ "epoch": 0.625,
1254
+ "grad_norm": 0.7224143147468567,
1255
+ "learning_rate": 9.4272727457741e-05,
1256
+ "loss": 1.3438,
1257
+ "num_input_tokens_seen": 409600000,
1258
+ "step": 12500,
1259
+ "train_runtime": 3096.0293,
1260
+ "train_tokens_per_second": 132298.492
1261
+ },
1262
+ {
1263
+ "epoch": 0.63,
1264
+ "grad_norm": 0.6021597981452942,
1265
+ "learning_rate": 9.20701838562727e-05,
1266
+ "loss": 1.3284,
1267
+ "num_input_tokens_seen": 412876800,
1268
+ "step": 12600,
1269
+ "train_runtime": 3125.5109,
1270
+ "train_tokens_per_second": 132098.979
1271
+ },
1272
+ {
1273
+ "epoch": 0.635,
1274
+ "grad_norm": 0.6892530918121338,
1275
+ "learning_rate": 8.988222377622442e-05,
1276
+ "loss": 1.3507,
1277
+ "num_input_tokens_seen": 416153600,
1278
+ "step": 12700,
1279
+ "train_runtime": 3149.5923,
1280
+ "train_tokens_per_second": 132129.355
1281
+ },
1282
+ {
1283
+ "epoch": 0.64,
1284
+ "grad_norm": 1.1807767152786255,
1285
+ "learning_rate": 8.770939802484568e-05,
1286
+ "loss": 1.3229,
1287
+ "num_input_tokens_seen": 419430400,
1288
+ "step": 12800,
1289
+ "train_runtime": 3174.0646,
1290
+ "train_tokens_per_second": 132142.993
1291
+ },
1292
+ {
1293
+ "epoch": 0.645,
1294
+ "grad_norm": 1.419129490852356,
1295
+ "learning_rate": 8.555225359939956e-05,
1296
+ "loss": 1.3552,
1297
+ "num_input_tokens_seen": 422707200,
1298
+ "step": 12900,
1299
+ "train_runtime": 3197.4391,
1300
+ "train_tokens_per_second": 132201.8
1301
+ },
1302
+ {
1303
+ "epoch": 0.65,
1304
+ "grad_norm": 0.8219085931777954,
1305
+ "learning_rate": 8.341133354945939e-05,
1306
+ "loss": 1.3221,
1307
+ "num_input_tokens_seen": 425984000,
1308
+ "step": 13000,
1309
+ "train_runtime": 3220.7263,
1310
+ "train_tokens_per_second": 132263.333
1311
+ },
1312
+ {
1313
+ "epoch": 0.655,
1314
+ "grad_norm": 0.4796520173549652,
1315
+ "learning_rate": 8.12871768401986e-05,
1316
+ "loss": 1.3212,
1317
+ "num_input_tokens_seen": 429260800,
1318
+ "step": 13100,
1319
+ "train_runtime": 3250.0068,
1320
+ "train_tokens_per_second": 132079.971
1321
+ },
1322
+ {
1323
+ "epoch": 0.66,
1324
+ "grad_norm": 0.6295695900917053,
1325
+ "learning_rate": 7.918031821670926e-05,
1326
+ "loss": 1.3547,
1327
+ "num_input_tokens_seen": 432537600,
1328
+ "step": 13200,
1329
+ "train_runtime": 3273.9064,
1330
+ "train_tokens_per_second": 132116.666
1331
+ },
1332
+ {
1333
+ "epoch": 0.665,
1334
+ "grad_norm": 0.7862270474433899,
1335
+ "learning_rate": 7.709128806938292e-05,
1336
+ "loss": 1.3495,
1337
+ "num_input_tokens_seen": 435814400,
1338
+ "step": 13300,
1339
+ "train_runtime": 3298.5265,
1340
+ "train_tokens_per_second": 132123.965
1341
+ },
1342
+ {
1343
+ "epoch": 0.67,
1344
+ "grad_norm": 0.548321008682251,
1345
+ "learning_rate": 7.502061230038749e-05,
1346
+ "loss": 1.3587,
1347
+ "num_input_tokens_seen": 439091200,
1348
+ "step": 13400,
1349
+ "train_runtime": 3322.8737,
1350
+ "train_tokens_per_second": 132142.007
1351
+ },
1352
+ {
1353
+ "epoch": 0.675,
1354
+ "grad_norm": 1.1466425657272339,
1355
+ "learning_rate": 7.296881219127452e-05,
1356
+ "loss": 1.3253,
1357
+ "num_input_tokens_seen": 442368000,
1358
+ "step": 13500,
1359
+ "train_runtime": 3346.3922,
1360
+ "train_tokens_per_second": 132192.514
1361
+ },
1362
+ {
1363
+ "epoch": 0.68,
1364
+ "grad_norm": 0.6317051649093628,
1365
+ "learning_rate": 7.093640427174874e-05,
1366
+ "loss": 1.3333,
1367
+ "num_input_tokens_seen": 445644800,
1368
+ "step": 13600,
1369
+ "train_runtime": 3370.0791,
1370
+ "train_tokens_per_second": 132235.711
1371
+ },
1372
+ {
1373
+ "epoch": 0.685,
1374
+ "grad_norm": 0.7266567349433899,
1375
+ "learning_rate": 6.892390018963525e-05,
1376
+ "loss": 1.3336,
1377
+ "num_input_tokens_seen": 448921600,
1378
+ "step": 13700,
1379
+ "train_runtime": 3394.8242,
1380
+ "train_tokens_per_second": 132237.069
1381
+ },
1382
+ {
1383
+ "epoch": 0.69,
1384
+ "grad_norm": 0.54411381483078,
1385
+ "learning_rate": 6.693180658207431e-05,
1386
+ "loss": 1.3142,
1387
+ "num_input_tokens_seen": 452198400,
1388
+ "step": 13800,
1389
+ "train_runtime": 3418.9202,
1390
+ "train_tokens_per_second": 132263.513
1391
+ },
1392
+ {
1393
+ "epoch": 0.695,
1394
+ "grad_norm": 0.6269710063934326,
1395
+ "learning_rate": 6.496062494797838e-05,
1396
+ "loss": 1.3443,
1397
+ "num_input_tokens_seen": 455475200,
1398
+ "step": 13900,
1399
+ "train_runtime": 3442.671,
1400
+ "train_tokens_per_second": 132302.853
1401
+ },
1402
+ {
1403
+ "epoch": 0.7,
1404
+ "grad_norm": 0.6389264464378357,
1405
+ "learning_rate": 6.301085152178248e-05,
1406
+ "loss": 1.3257,
1407
+ "num_input_tokens_seen": 458752000,
1408
+ "step": 14000,
1409
+ "train_runtime": 3466.1379,
1410
+ "train_tokens_per_second": 132352.497
1411
+ },
1412
+ {
1413
+ "epoch": 0.705,
1414
+ "grad_norm": 0.5984766483306885,
1415
+ "learning_rate": 6.108297714851969e-05,
1416
+ "loss": 1.3206,
1417
+ "num_input_tokens_seen": 462028800,
1418
+ "step": 14100,
1419
+ "train_runtime": 3496.6779,
1420
+ "train_tokens_per_second": 132133.647
1421
+ },
1422
+ {
1423
+ "epoch": 0.71,
1424
+ "grad_norm": 0.7461857199668884,
1425
+ "learning_rate": 5.9177487160253855e-05,
1426
+ "loss": 1.3363,
1427
+ "num_input_tokens_seen": 465305600,
1428
+ "step": 14200,
1429
+ "train_runtime": 3521.4286,
1430
+ "train_tokens_per_second": 132135.464
1431
+ },
1432
+ {
1433
+ "epoch": 0.715,
1434
+ "grad_norm": 0.6152383685112,
1435
+ "learning_rate": 5.729486125389922e-05,
1436
+ "loss": 1.3509,
1437
+ "num_input_tokens_seen": 468582400,
1438
+ "step": 14300,
1439
+ "train_runtime": 3539.6668,
1440
+ "train_tokens_per_second": 132380.37
1441
+ },
1442
+ {
1443
+ "epoch": 0.72,
1444
+ "grad_norm": 1.0594799518585205,
1445
+ "learning_rate": 5.5435573370460045e-05,
1446
+ "loss": 1.3429,
1447
+ "num_input_tokens_seen": 471859200,
1448
+ "step": 14400,
1449
+ "train_runtime": 3570.9274,
1450
+ "train_tokens_per_second": 132139.118
1451
+ },
1452
+ {
1453
+ "epoch": 0.725,
1454
+ "grad_norm": 0.6609026193618774,
1455
+ "learning_rate": 5.3600091575717944e-05,
1456
+ "loss": 1.3192,
1457
+ "num_input_tokens_seen": 475136000,
1458
+ "step": 14500,
1459
+ "train_runtime": 3595.0028,
1460
+ "train_tokens_per_second": 132165.684
1461
+ },
1462
+ {
1463
+ "epoch": 0.73,
1464
+ "grad_norm": 0.6246200203895569,
1465
+ "learning_rate": 5.178887794239904e-05,
1466
+ "loss": 1.3237,
1467
+ "num_input_tokens_seen": 478412800,
1468
+ "step": 14600,
1469
+ "train_runtime": 3618.4736,
1470
+ "train_tokens_per_second": 132213.982
1471
+ },
1472
+ {
1473
+ "epoch": 0.735,
1474
+ "grad_norm": 0.6195717453956604,
1475
+ "learning_rate": 5.00023884338496e-05,
1476
+ "loss": 1.3175,
1477
+ "num_input_tokens_seen": 481689600,
1478
+ "step": 14700,
1479
+ "train_runtime": 3642.0847,
1480
+ "train_tokens_per_second": 132256.56
1481
+ },
1482
+ {
1483
+ "epoch": 0.74,
1484
+ "grad_norm": 0.7129009366035461,
1485
+ "learning_rate": 4.82410727892497e-05,
1486
+ "loss": 1.3324,
1487
+ "num_input_tokens_seen": 484966400,
1488
+ "step": 14800,
1489
+ "train_runtime": 3665.9901,
1490
+ "train_tokens_per_second": 132287.972
1491
+ },
1492
+ {
1493
+ "epoch": 0.745,
1494
+ "grad_norm": 1.336125373840332,
1495
+ "learning_rate": 4.650537441039379e-05,
1496
+ "loss": 1.3359,
1497
+ "num_input_tokens_seen": 488243200,
1498
+ "step": 14900,
1499
+ "train_runtime": 3689.171,
1500
+ "train_tokens_per_second": 132344.964
1501
+ },
1502
+ {
1503
+ "epoch": 0.75,
1504
+ "grad_norm": 0.6141300797462463,
1505
+ "learning_rate": 4.479573025006664e-05,
1506
+ "loss": 1.3461,
1507
+ "num_input_tokens_seen": 491520000,
1508
+ "step": 15000,
1509
+ "train_runtime": 3712.6329,
1510
+ "train_tokens_per_second": 132391.222
1511
+ },
1512
+ {
1513
+ "epoch": 0.755,
1514
+ "grad_norm": 0.5975731015205383,
1515
+ "learning_rate": 4.311257070204293e-05,
1516
+ "loss": 1.3182,
1517
+ "num_input_tokens_seen": 494796800,
1518
+ "step": 15100,
1519
+ "train_runtime": 3742.4523,
1520
+ "train_tokens_per_second": 132211.918
1521
+ },
1522
+ {
1523
+ "epoch": 0.76,
1524
+ "grad_norm": 0.5820591449737549,
1525
+ "learning_rate": 4.145631949273772e-05,
1526
+ "loss": 1.3308,
1527
+ "num_input_tokens_seen": 498073600,
1528
+ "step": 15200,
1529
+ "train_runtime": 3767.5955,
1530
+ "train_tokens_per_second": 132199.328
1531
+ },
1532
+ {
1533
+ "epoch": 0.765,
1534
+ "grad_norm": 0.5704116225242615,
1535
+ "learning_rate": 3.982739357453573e-05,
1536
+ "loss": 1.3205,
1537
+ "num_input_tokens_seen": 501350400,
1538
+ "step": 15300,
1539
+ "train_runtime": 3791.6072,
1540
+ "train_tokens_per_second": 132226.355
1541
+ },
1542
+ {
1543
+ "epoch": 0.77,
1544
+ "grad_norm": 0.6396017670631409,
1545
+ "learning_rate": 3.8226203020825794e-05,
1546
+ "loss": 1.3236,
1547
+ "num_input_tokens_seen": 504627200,
1548
+ "step": 15400,
1549
+ "train_runtime": 3815.7237,
1550
+ "train_tokens_per_second": 132249.409
1551
+ },
1552
+ {
1553
+ "epoch": 0.775,
1554
+ "grad_norm": 0.5288236737251282,
1555
+ "learning_rate": 3.665315092276703e-05,
1556
+ "loss": 1.3172,
1557
+ "num_input_tokens_seen": 507904000,
1558
+ "step": 15500,
1559
+ "train_runtime": 3839.9672,
1560
+ "train_tokens_per_second": 132267.796
1561
+ },
1562
+ {
1563
+ "epoch": 0.78,
1564
+ "grad_norm": 0.5525203943252563,
1565
+ "learning_rate": 3.510863328781284e-05,
1566
+ "loss": 1.3332,
1567
+ "num_input_tokens_seen": 511180800,
1568
+ "step": 15600,
1569
+ "train_runtime": 3863.4067,
1570
+ "train_tokens_per_second": 132313.486
1571
+ },
1572
+ {
1573
+ "epoch": 0.785,
1574
+ "grad_norm": 0.5880547165870667,
1575
+ "learning_rate": 3.359303894001809e-05,
1576
+ "loss": 1.3175,
1577
+ "num_input_tokens_seen": 514457600,
1578
+ "step": 15700,
1579
+ "train_runtime": 3887.3027,
1580
+ "train_tokens_per_second": 132343.077
1581
+ },
1582
+ {
1583
+ "epoch": 0.79,
1584
+ "grad_norm": 0.6689568161964417,
1585
+ "learning_rate": 3.210674942215488e-05,
1586
+ "loss": 1.3132,
1587
+ "num_input_tokens_seen": 517734400,
1588
+ "step": 15800,
1589
+ "train_runtime": 3911.3563,
1590
+ "train_tokens_per_second": 132366.975
1591
+ },
1592
+ {
1593
+ "epoch": 0.795,
1594
+ "grad_norm": 0.7104946970939636,
1595
+ "learning_rate": 3.065013889966106e-05,
1596
+ "loss": 1.3231,
1597
+ "num_input_tokens_seen": 521011200,
1598
+ "step": 15900,
1599
+ "train_runtime": 3940.5019,
1600
+ "train_tokens_per_second": 132219.501
1601
+ },
1602
+ {
1603
+ "epoch": 0.8,
1604
+ "grad_norm": 0.5700929760932922,
1605
+ "learning_rate": 2.922357406644594e-05,
1606
+ "loss": 1.3239,
1607
+ "num_input_tokens_seen": 524288000,
1608
+ "step": 16000,
1609
+ "train_runtime": 3964.1157,
1610
+ "train_tokens_per_second": 132258.5
1611
+ },
1612
+ {
1613
+ "epoch": 0.805,
1614
+ "grad_norm": 0.5516105890274048,
1615
+ "learning_rate": 2.7827414052577485e-05,
1616
+ "loss": 1.3176,
1617
+ "num_input_tokens_seen": 527564800,
1618
+ "step": 16100,
1619
+ "train_runtime": 3988.691,
1620
+ "train_tokens_per_second": 132265.148
1621
+ },
1622
+ {
1623
+ "epoch": 0.81,
1624
+ "grad_norm": 0.66648268699646,
1625
+ "learning_rate": 2.6462010333872835e-05,
1626
+ "loss": 1.3176,
1627
+ "num_input_tokens_seen": 530841600,
1628
+ "step": 16200,
1629
+ "train_runtime": 4013.8341,
1630
+ "train_tokens_per_second": 132253.001
1631
+ },
1632
+ {
1633
+ "epoch": 0.815,
1634
+ "grad_norm": 0.6033647060394287,
1635
+ "learning_rate": 2.512770664341634e-05,
1636
+ "loss": 1.3266,
1637
+ "num_input_tokens_seen": 534118400,
1638
+ "step": 16300,
1639
+ "train_runtime": 4038.866,
1640
+ "train_tokens_per_second": 132244.645
1641
+ },
1642
+ {
1643
+ "epoch": 0.82,
1644
+ "grad_norm": 0.6393758654594421,
1645
+ "learning_rate": 2.3824838885026542e-05,
1646
+ "loss": 1.3145,
1647
+ "num_input_tokens_seen": 537395200,
1648
+ "step": 16400,
1649
+ "train_runtime": 4063.16,
1650
+ "train_tokens_per_second": 132260.407
1651
+ },
1652
+ {
1653
+ "epoch": 0.825,
1654
+ "grad_norm": 0.6355799436569214,
1655
+ "learning_rate": 2.2553735048694176e-05,
1656
+ "loss": 1.3327,
1657
+ "num_input_tokens_seen": 540672000,
1658
+ "step": 16500,
1659
+ "train_runtime": 4087.2347,
1660
+ "train_tokens_per_second": 132283.082
1661
+ },
1662
+ {
1663
+ "epoch": 0.83,
1664
+ "grad_norm": 0.5949875116348267,
1665
+ "learning_rate": 2.13147151280126e-05,
1666
+ "loss": 1.3195,
1667
+ "num_input_tokens_seen": 543948800,
1668
+ "step": 16600,
1669
+ "train_runtime": 4110.6472,
1670
+ "train_tokens_per_second": 132326.802
1671
+ },
1672
+ {
1673
+ "epoch": 0.835,
1674
+ "grad_norm": 0.5466151237487793,
1675
+ "learning_rate": 2.0108091039620746e-05,
1676
+ "loss": 1.2963,
1677
+ "num_input_tokens_seen": 547225600,
1678
+ "step": 16700,
1679
+ "train_runtime": 4134.8391,
1680
+ "train_tokens_per_second": 132345.078
1681
+ },
1682
+ {
1683
+ "epoch": 0.84,
1684
+ "grad_norm": 0.6407220959663391,
1685
+ "learning_rate": 1.893416654468022e-05,
1686
+ "loss": 1.3068,
1687
+ "num_input_tokens_seen": 550502400,
1688
+ "step": 16800,
1689
+ "train_runtime": 4158.5574,
1690
+ "train_tokens_per_second": 132378.213
1691
+ },
1692
+ {
1693
+ "epoch": 0.845,
1694
+ "grad_norm": 0.8987123966217041,
1695
+ "learning_rate": 1.7793237172404756e-05,
1696
+ "loss": 1.3141,
1697
+ "num_input_tokens_seen": 553779200,
1698
+ "step": 16900,
1699
+ "train_runtime": 4182.0603,
1700
+ "train_tokens_per_second": 132417.795
1701
+ },
1702
+ {
1703
+ "epoch": 0.85,
1704
+ "grad_norm": 0.5130665898323059,
1705
+ "learning_rate": 1.6685590145662397e-05,
1706
+ "loss": 1.3224,
1707
+ "num_input_tokens_seen": 557056000,
1708
+ "step": 17000,
1709
+ "train_runtime": 4211.0014,
1710
+ "train_tokens_per_second": 132285.874
1711
+ },
1712
+ {
1713
+ "epoch": 0.855,
1714
+ "grad_norm": 0.6242457032203674,
1715
+ "learning_rate": 1.5611504308668742e-05,
1716
+ "loss": 1.3044,
1717
+ "num_input_tokens_seen": 560332800,
1718
+ "step": 17100,
1719
+ "train_runtime": 4235.4217,
1720
+ "train_tokens_per_second": 132296.813
1721
+ },
1722
+ {
1723
+ "epoch": 0.86,
1724
+ "grad_norm": 3.367483377456665,
1725
+ "learning_rate": 1.4571250056789274e-05,
1726
+ "loss": 1.3335,
1727
+ "num_input_tokens_seen": 563609600,
1728
+ "step": 17200,
1729
+ "train_runtime": 4260.3571,
1730
+ "train_tokens_per_second": 132291.634
1731
+ },
1732
+ {
1733
+ "epoch": 0.865,
1734
+ "grad_norm": 0.758712887763977,
1735
+ "learning_rate": 1.356508926846892e-05,
1736
+ "loss": 1.343,
1737
+ "num_input_tokens_seen": 566886400,
1738
+ "step": 17300,
1739
+ "train_runtime": 4284.7372,
1740
+ "train_tokens_per_second": 132303.658
1741
+ },
1742
+ {
1743
+ "epoch": 0.87,
1744
+ "grad_norm": 0.700606644153595,
1745
+ "learning_rate": 1.2593275239305378e-05,
1746
+ "loss": 1.3026,
1747
+ "num_input_tokens_seen": 570163200,
1748
+ "step": 17400,
1749
+ "train_runtime": 4309.3996,
1750
+ "train_tokens_per_second": 132306.877
1751
+ },
1752
+ {
1753
+ "epoch": 0.875,
1754
+ "grad_norm": 0.6233837008476257,
1755
+ "learning_rate": 1.1656052618283552e-05,
1756
+ "loss": 1.3068,
1757
+ "num_input_tokens_seen": 573440000,
1758
+ "step": 17500,
1759
+ "train_runtime": 4333.6553,
1760
+ "train_tokens_per_second": 132322.477
1761
+ },
1762
+ {
1763
+ "epoch": 0.88,
1764
+ "grad_norm": 0.7638545632362366,
1765
+ "learning_rate": 1.0753657346186217e-05,
1766
+ "loss": 1.3114,
1767
+ "num_input_tokens_seen": 576716800,
1768
+ "step": 17600,
1769
+ "train_runtime": 4357.7488,
1770
+ "train_tokens_per_second": 132342.827
1771
+ },
1772
+ {
1773
+ "epoch": 0.885,
1774
+ "grad_norm": 0.6683318614959717,
1775
+ "learning_rate": 9.886316596197341e-06,
1776
+ "loss": 1.3141,
1777
+ "num_input_tokens_seen": 579993600,
1778
+ "step": 17700,
1779
+ "train_runtime": 4381.4392,
1780
+ "train_tokens_per_second": 132375.134
1781
+ },
1782
+ {
1783
+ "epoch": 0.89,
1784
+ "grad_norm": 0.6613823175430298,
1785
+ "learning_rate": 9.054248716712404e-06,
1786
+ "loss": 1.3556,
1787
+ "num_input_tokens_seen": 583270400,
1788
+ "step": 17800,
1789
+ "train_runtime": 4406.3414,
1790
+ "train_tokens_per_second": 132370.679
1791
+ },
1792
+ {
1793
+ "epoch": 0.895,
1794
+ "grad_norm": 0.6166129112243652,
1795
+ "learning_rate": 8.257663176370389e-06,
1796
+ "loss": 1.2998,
1797
+ "num_input_tokens_seen": 586547200,
1798
+ "step": 17900,
1799
+ "train_runtime": 4429.85,
1800
+ "train_tokens_per_second": 132407.914
1801
+ },
1802
+ {
1803
+ "epoch": 0.9,
1804
+ "grad_norm": 0.5475023984909058,
1805
+ "learning_rate": 7.496760511321115e-06,
1806
+ "loss": 1.3117,
1807
+ "num_input_tokens_seen": 589824000,
1808
+ "step": 18000,
1809
+ "train_runtime": 4458.7377,
1810
+ "train_tokens_per_second": 132284.974
1811
+ },
1812
+ {
1813
+ "epoch": 0.905,
1814
+ "grad_norm": 0.611223042011261,
1815
+ "learning_rate": 6.7717322747414036e-06,
1816
+ "loss": 1.3084,
1817
+ "num_input_tokens_seen": 593100800,
1818
+ "step": 18100,
1819
+ "train_runtime": 4482.5986,
1820
+ "train_tokens_per_second": 132311.824
1821
+ },
1822
+ {
1823
+ "epoch": 0.91,
1824
+ "grad_norm": 0.8295536041259766,
1825
+ "learning_rate": 6.0827609886125505e-06,
1826
+ "loss": 1.3056,
1827
+ "num_input_tokens_seen": 596377600,
1828
+ "step": 18200,
1829
+ "train_runtime": 4506.3151,
1830
+ "train_tokens_per_second": 132342.632
1831
+ },
1832
+ {
1833
+ "epoch": 0.915,
1834
+ "grad_norm": 0.9818552136421204,
1835
+ "learning_rate": 5.430020097771398e-06,
1836
+ "loss": 1.3115,
1837
+ "num_input_tokens_seen": 599654400,
1838
+ "step": 18300,
1839
+ "train_runtime": 4531.2552,
1840
+ "train_tokens_per_second": 132337.372
1841
+ },
1842
+ {
1843
+ "epoch": 0.92,
1844
+ "grad_norm": 1.4284358024597168,
1845
+ "learning_rate": 4.8136739262464974e-06,
1846
+ "loss": 1.3283,
1847
+ "num_input_tokens_seen": 602931200,
1848
+ "step": 18400,
1849
+ "train_runtime": 4555.0502,
1850
+ "train_tokens_per_second": 132365.435
1851
+ },
1852
+ {
1853
+ "epoch": 0.925,
1854
+ "grad_norm": 0.6662764549255371,
1855
+ "learning_rate": 4.233877635890481e-06,
1856
+ "loss": 1.3041,
1857
+ "num_input_tokens_seen": 606208000,
1858
+ "step": 18500,
1859
+ "train_runtime": 4579.4146,
1860
+ "train_tokens_per_second": 132376.747
1861
+ },
1862
+ {
1863
+ "epoch": 0.93,
1864
+ "grad_norm": 0.5408564805984497,
1865
+ "learning_rate": 3.690777187318844e-06,
1866
+ "loss": 1.2923,
1867
+ "num_input_tokens_seen": 609484800,
1868
+ "step": 18600,
1869
+ "train_runtime": 4604.2652,
1870
+ "train_tokens_per_second": 132373.955
1871
+ },
1872
+ {
1873
+ "epoch": 0.935,
1874
+ "grad_norm": 0.7473659515380859,
1875
+ "learning_rate": 3.1845093031651335e-06,
1876
+ "loss": 1.2959,
1877
+ "num_input_tokens_seen": 612761600,
1878
+ "step": 18700,
1879
+ "train_runtime": 4627.7441,
1880
+ "train_tokens_per_second": 132410.433
1881
+ },
1882
+ {
1883
+ "epoch": 0.94,
1884
+ "grad_norm": 0.6809543371200562,
1885
+ "learning_rate": 2.715201433661801e-06,
1886
+ "loss": 1.3088,
1887
+ "num_input_tokens_seen": 616038400,
1888
+ "step": 18800,
1889
+ "train_runtime": 4652.153,
1890
+ "train_tokens_per_second": 132420.065
1891
+ },
1892
+ {
1893
+ "epoch": 0.945,
1894
+ "grad_norm": 0.5511806011199951,
1895
+ "learning_rate": 2.282971724555249e-06,
1896
+ "loss": 1.2991,
1897
+ "num_input_tokens_seen": 619315200,
1898
+ "step": 18900,
1899
+ "train_runtime": 4676.0174,
1900
+ "train_tokens_per_second": 132445.017
1901
+ },
1902
+ {
1903
+ "epoch": 0.95,
1904
+ "grad_norm": 0.5333660244941711,
1905
+ "learning_rate": 1.8879289873632907e-06,
1906
+ "loss": 1.3112,
1907
+ "num_input_tokens_seen": 622592000,
1908
+ "step": 19000,
1909
+ "train_runtime": 4705.3038,
1910
+ "train_tokens_per_second": 132317.068
1911
+ },
1912
+ {
1913
+ "epoch": 0.955,
1914
+ "grad_norm": 0.5114225745201111,
1915
+ "learning_rate": 1.530172671982427e-06,
1916
+ "loss": 1.304,
1917
+ "num_input_tokens_seen": 625868800,
1918
+ "step": 19100,
1919
+ "train_runtime": 4728.0888,
1920
+ "train_tokens_per_second": 132372.471
1921
+ },
1922
+ {
1923
+ "epoch": 0.96,
1924
+ "grad_norm": 0.7311033010482788,
1925
+ "learning_rate": 1.2097928416518577e-06,
1926
+ "loss": 1.3016,
1927
+ "num_input_tokens_seen": 629145600,
1928
+ "step": 19200,
1929
+ "train_runtime": 4751.8878,
1930
+ "train_tokens_per_second": 132399.086
1931
+ },
1932
+ {
1933
+ "epoch": 0.965,
1934
+ "grad_norm": 0.5196536183357239,
1935
+ "learning_rate": 9.268701502805509e-07,
1936
+ "loss": 1.3029,
1937
+ "num_input_tokens_seen": 632422400,
1938
+ "step": 19300,
1939
+ "train_runtime": 4776.4621,
1940
+ "train_tokens_per_second": 132403.941
1941
+ },
1942
+ {
1943
+ "epoch": 0.97,
1944
+ "grad_norm": 0.6195040345191956,
1945
+ "learning_rate": 6.814758221430683e-07,
1946
+ "loss": 1.3103,
1947
+ "num_input_tokens_seen": 635699200,
1948
+ "step": 19400,
1949
+ "train_runtime": 4800.6355,
1950
+ "train_tokens_per_second": 132419.8
1951
+ },
1952
+ {
1953
+ "epoch": 0.975,
1954
+ "grad_norm": 0.6139786243438721,
1955
+ "learning_rate": 4.7367163394922416e-07,
1956
+ "loss": 1.2979,
1957
+ "num_input_tokens_seen": 638976000,
1958
+ "step": 19500,
1959
+ "train_runtime": 4825.134,
1960
+ "train_tokens_per_second": 132426.581
1961
+ },
1962
+ {
1963
+ "epoch": 0.98,
1964
+ "grad_norm": 0.49304258823394775,
1965
+ "learning_rate": 3.035098992920926e-07,
1966
+ "loss": 1.3025,
1967
+ "num_input_tokens_seen": 642252800,
1968
+ "step": 19600,
1969
+ "train_runtime": 4849.4238,
1970
+ "train_tokens_per_second": 132438.992
1971
+ },
1972
+ {
1973
+ "epoch": 0.985,
1974
+ "grad_norm": 0.5793161988258362,
1975
+ "learning_rate": 1.710334554783921e-07,
1976
+ "loss": 1.3106,
1977
+ "num_input_tokens_seen": 645529600,
1978
+ "step": 19700,
1979
+ "train_runtime": 4879.0091,
1980
+ "train_tokens_per_second": 132307.52
1981
+ },
1982
+ {
1983
+ "epoch": 0.99,
1984
+ "grad_norm": 0.6929790377616882,
1985
+ "learning_rate": 7.627565274441194e-08,
1986
+ "loss": 1.3241,
1987
+ "num_input_tokens_seen": 648806400,
1988
+ "step": 19800,
1989
+ "train_runtime": 4902.4093,
1990
+ "train_tokens_per_second": 132344.397
1991
+ },
1992
+ {
1993
+ "epoch": 0.995,
1994
+ "grad_norm": 0.6057826280593872,
1995
+ "learning_rate": 1.9260345860244718e-08,
1996
+ "loss": 1.3207,
1997
+ "num_input_tokens_seen": 652083200,
1998
+ "step": 19900,
1999
+ "train_runtime": 4926.578,
2000
+ "train_tokens_per_second": 132360.27
2001
+ },
2002
+ {
2003
+ "epoch": 1.0,
2004
+ "grad_norm": 0.596373438835144,
2005
+ "learning_rate": 1.8881245067436934e-12,
2006
+ "loss": 1.3027,
2007
+ "num_input_tokens_seen": 655360000,
2008
+ "step": 20000,
2009
+ "train_runtime": 4950.9316,
2010
+ "train_tokens_per_second": 132371.046
2011
+ },
2012
+ {
2013
+ "epoch": 1.0,
2014
+ "num_input_tokens_seen": 655360000,
2015
+ "step": 20000,
2016
+ "total_flos": 1.23866185728e+16,
2017
+ "train_loss": 1.4302955017089845,
2018
+ "train_runtime": 4951.1263,
2019
+ "train_samples_per_second": 517.054,
2020
+ "train_steps_per_second": 4.039
2021
+ }
2022
+ ],
2023
+ "logging_steps": 100,
2024
+ "max_steps": 20000,
2025
+ "num_input_tokens_seen": 655360000,
2026
+ "num_train_epochs": 9223372036854775807,
2027
+ "save_steps": 1000,
2028
+ "stateful_callbacks": {
2029
+ "TrainerControl": {
2030
+ "args": {
2031
+ "should_epoch_stop": false,
2032
+ "should_evaluate": false,
2033
+ "should_log": false,
2034
+ "should_save": true,
2035
+ "should_training_stop": true
2036
+ },
2037
+ "attributes": {}
2038
+ }
2039
+ },
2040
+ "total_flos": 1.23866185728e+16,
2041
+ "train_batch_size": 128,
2042
+ "trial_name": null,
2043
+ "trial_params": null
2044
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5641ba09bd0c0ab3f39b0e2bc742cb033a6a6494c1a96dd5f0c0ab5c802ebc52
3
+ size 5841