amirali1985 commited on
Commit
8d6147f
·
verified ·
1 Parent(s): a31a0bb

Upload add_sub_baseline_10K

Browse files
add_sub_baseline_10K/config.json CHANGED
@@ -4,9 +4,7 @@
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
- "bos_token_id": null,
8
  "dtype": "float32",
9
- "eos_token_id": null,
10
  "head_dim": 128,
11
  "hidden_act": "silu",
12
  "hidden_size": 510,
@@ -22,15 +20,12 @@
22
  "num_attention_heads": 3,
23
  "num_hidden_layers": 2,
24
  "num_key_value_heads": 3,
25
- "pad_token_id": null,
26
  "rms_norm_eps": 1e-06,
27
- "rope_parameters": {
28
- "rope_theta": 10000.0,
29
- "rope_type": "default"
30
- },
31
  "sliding_window": null,
32
  "tie_word_embeddings": false,
33
- "transformers_version": "5.5.0",
34
  "use_cache": true,
35
  "use_sliding_window": false,
36
  "vocab_size": 151645
 
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
 
7
  "dtype": "float32",
 
8
  "head_dim": 128,
9
  "hidden_act": "silu",
10
  "hidden_size": 510,
 
20
  "num_attention_heads": 3,
21
  "num_hidden_layers": 2,
22
  "num_key_value_heads": 3,
 
23
  "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
 
 
26
  "sliding_window": null,
27
  "tie_word_embeddings": false,
28
+ "transformers_version": "4.57.6",
29
  "use_cache": true,
30
  "use_sliding_window": false,
31
  "vocab_size": 151645
add_sub_baseline_10K/generation_config.json CHANGED
@@ -1,7 +1,4 @@
1
  {
2
  "_from_model_config": true,
3
- "output_attentions": false,
4
- "output_hidden_states": false,
5
- "transformers_version": "5.5.0",
6
- "use_cache": true
7
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "transformers_version": "4.57.6"
 
 
 
4
  }
add_sub_baseline_10K/metrics.json CHANGED
@@ -3,327 +3,34 @@
3
  "step": [
4
  50,
5
  100,
6
- 150,
7
- 200,
8
- 250,
9
- 300,
10
- 350,
11
- 400,
12
- 450,
13
- 500,
14
- 550,
15
- 600,
16
- 650,
17
- 700,
18
- 750,
19
- 800,
20
- 850,
21
- 900,
22
- 950,
23
- 1000,
24
- 1050,
25
- 1100,
26
- 1150,
27
- 1200,
28
- 1250,
29
- 1300,
30
- 1350,
31
- 1400,
32
- 1450,
33
- 1500,
34
- 1550,
35
- 1600,
36
- 1650,
37
- 1700,
38
- 1750,
39
- 1800,
40
- 1850,
41
- 1900,
42
- 1950,
43
- 2000,
44
- 2050,
45
- 2100,
46
- 2150,
47
- 2200,
48
- 2250,
49
- 2300,
50
- 2350,
51
- 2400,
52
- 2450,
53
- 2500,
54
- 2550,
55
- 2600,
56
- 2650,
57
- 2700,
58
- 2750,
59
- 2800,
60
- 2850,
61
- 2900,
62
- 2950,
63
- 3000,
64
- 3050,
65
- 3100
66
  ],
67
  "loss": [
68
- 7.406970977783203,
69
- 4.287904262542725,
70
- 2.028409719467163,
71
- 1.9122778177261353,
72
- 1.7641465663909912,
73
- 1.6944198608398438,
74
- 1.5549181699752808,
75
- 1.4273613691329956,
76
- 1.1411476135253906,
77
- 0.7707713842391968,
78
- 0.652624249458313,
79
- 0.5090309977531433,
80
- 0.4863796830177307,
81
- 0.4217025935649872,
82
- 0.3141794204711914,
83
- 0.26072242856025696,
84
- 0.20925267040729523,
85
- 0.1722976416349411,
86
- 0.16701845824718475,
87
- 0.14609448611736298,
88
- 0.15161116421222687,
89
- 0.11793585866689682,
90
- 0.12784767150878906,
91
- 0.17903172969818115,
92
- 0.07992793619632721,
93
- 0.09357694536447525,
94
- 0.07849029451608658,
95
- 0.08744285255670547,
96
- 0.047625500708818436,
97
- 0.06857192516326904,
98
- 0.058801162987947464,
99
- 0.08054264634847641,
100
- 0.0829831138253212,
101
- 0.08300244808197021,
102
- 0.04127458110451698,
103
- 0.045694876462221146,
104
- 0.0545954629778862,
105
- 0.04024777188897133,
106
- 0.07766951620578766,
107
- 0.024591432884335518,
108
- 0.06888258457183838,
109
- 0.03272942453622818,
110
- 0.05081992596387863,
111
- 0.07547096163034439,
112
- 0.04898810759186745,
113
- 0.04332876205444336,
114
- 0.018458085134625435,
115
- 0.03549834340810776,
116
- 0.020787600427865982,
117
- 0.01855045184493065,
118
- 0.018782520666718483,
119
- 0.013612723909318447,
120
- 0.0217971820384264,
121
- 0.01920527033507824,
122
- 0.01275918073952198,
123
- 0.023383263498544693,
124
- 0.017817409709095955,
125
- 0.01498924009501934,
126
- 0.021258754655718803,
127
- 0.022163305431604385,
128
- 0.012838104739785194,
129
- 0.015554169192910194
130
  ],
131
  "base_loss": [
132
- 7.406970977783203,
133
- 4.287904262542725,
134
- 2.028409719467163,
135
- 1.9122778177261353,
136
- 1.7641465663909912,
137
- 1.6944198608398438,
138
- 1.5549181699752808,
139
- 1.4273613691329956,
140
- 1.1411476135253906,
141
- 0.7707713842391968,
142
- 0.652624249458313,
143
- 0.5090309977531433,
144
- 0.4863796830177307,
145
- 0.4217025935649872,
146
- 0.3141794204711914,
147
- 0.26072242856025696,
148
- 0.20925267040729523,
149
- 0.1722976416349411,
150
- 0.16701845824718475,
151
- 0.14609448611736298,
152
- 0.15161116421222687,
153
- 0.11793585866689682,
154
- 0.12784767150878906,
155
- 0.17903172969818115,
156
- 0.07992793619632721,
157
- 0.09357694536447525,
158
- 0.07849029451608658,
159
- 0.08744285255670547,
160
- 0.047625500708818436,
161
- 0.06857192516326904,
162
- 0.058801162987947464,
163
- 0.08054264634847641,
164
- 0.0829831138253212,
165
- 0.08300244808197021,
166
- 0.04127458110451698,
167
- 0.045694876462221146,
168
- 0.0545954629778862,
169
- 0.04024777188897133,
170
- 0.07766951620578766,
171
- 0.024591432884335518,
172
- 0.06888258457183838,
173
- 0.03272942453622818,
174
- 0.05081992596387863,
175
- 0.07547096163034439,
176
- 0.04898810759186745,
177
- 0.04332876205444336,
178
- 0.018458085134625435,
179
- 0.03549834340810776,
180
- 0.020787600427865982,
181
- 0.01855045184493065,
182
- 0.018782520666718483,
183
- 0.013612723909318447,
184
- 0.0217971820384264,
185
- 0.01920527033507824,
186
- 0.01275918073952198,
187
- 0.023383263498544693,
188
- 0.017817409709095955,
189
- 0.01498924009501934,
190
- 0.021258754655718803,
191
- 0.022163305431604385,
192
- 0.012838104739785194,
193
- 0.015554169192910194
194
  ],
195
  "lr": [
196
  3.9200000000000004e-05,
197
  7.92e-05,
198
- 7.994872780244471e-05,
199
- 7.979084217550451e-05,
200
- 7.952674320281786e-05,
201
- 7.915713584145437e-05,
202
- 7.868300668109943e-05,
203
- 7.810562131055899e-05,
204
- 7.742652093953451e-05,
205
- 7.664751828468545e-05,
206
- 7.57706927309605e-05,
207
- 7.47983847811137e-05,
208
- 7.373318980822093e-05,
209
- 7.25779511278734e-05,
210
- 7.133575240854014e-05,
211
- 7.000990944035905e-05,
212
- 6.860396128432721e-05,
213
- 6.712166082551651e-05,
214
- 6.556696475553043e-05,
215
- 6.394402301094192e-05,
216
- 6.225716769590408e-05,
217
- 6.0510901518502626e-05,
218
- 5.8709885771716645e-05,
219
- 5.685892789107026e-05,
220
- 5.4962968622187084e-05,
221
- 5.3027068832501364e-05,
222
- 5.1056396002328924e-05,
223
- 4.9056210431357356e-05,
224
- 4.703185119737419e-05,
225
- 4.498872190471344e-05,
226
- 4.293227626046202e-05,
227
- 4.086800351692724e-05,
228
- 3.880141381922381e-05,
229
- 3.673802349709187e-05,
230
- 3.468334034020644e-05,
231
- 3.2642848896282794e-05,
232
- 3.062199583122133e-05,
233
- 2.862617539037015e-05,
234
- 2.6660714999713274e-05,
235
- 2.4730861045419232e-05,
236
- 2.284176486970851e-05,
237
- 2.099846902042102e-05,
238
- 1.9205893790987304e-05,
239
- 1.7468824086732586e-05,
240
- 1.579189665257094e-05,
241
- 1.4179587696182778e-05,
242
- 1.2636200939713001e-05,
243
- 1.1165856131883247e-05,
244
- 9.772478051182794e-06,
245
- 8.459786029491775e-06,
246
- 7.231284024101261e-06,
247
- 6.090251264630804e-06,
248
- 5.039733499809587e-06,
249
- 4.082534867486105e-06,
250
- 3.221210409567612e-06,
251
- 2.458059251869167e-06,
252
- 1.7951184670772902e-06,
253
- 1.2341576372097185e-06,
254
- 7.766741300856728e-07,
255
- 4.23889102415056e-07,
256
- 1.7674424017557922e-07,
257
- 3.58992449786566e-08
258
  ],
259
  "eval_step": [
260
- 156,
261
- 312,
262
- 468,
263
- 624,
264
- 780,
265
- 936,
266
- 1092,
267
- 1248,
268
- 1404,
269
- 1560,
270
- 1716,
271
- 1872,
272
- 2028,
273
- 2184,
274
- 2340,
275
- 2496,
276
- 2652,
277
- 2808,
278
- 2964,
279
- 3120
280
  ],
281
  "eval_epoch": [
282
- 1,
283
- 2,
284
- 3,
285
- 4,
286
- 5,
287
- 6,
288
- 7,
289
- 8,
290
- 9,
291
- 10,
292
- 11,
293
- 12,
294
- 13,
295
- 14,
296
- 15,
297
- 16,
298
- 17,
299
- 18,
300
- 19,
301
- 20
302
  ],
303
  "eval_accuracy": [
304
- 0.005263157894736842,
305
- 0.005263157894736842,
306
- 0.023157894736842106,
307
- 0.10421052631578948,
308
- 0.4073684210526316,
309
- 0.6189473684210526,
310
- 0.6663157894736842,
311
- 0.7210526315789474,
312
- 0.7315789473684211,
313
- 0.76,
314
- 0.7884210526315789,
315
- 0.783157894736842,
316
- 0.7631578947368421,
317
- 0.8126315789473684,
318
- 0.8147368421052632,
319
- 0.8126315789473684,
320
- 0.8105263157894737,
321
- 0.8189473684210526,
322
- 0.82,
323
- 0.8094736842105263
324
  ]
325
  },
326
- "final_accuracy": 0.76,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
@@ -334,330 +41,330 @@
334
  },
335
  "splits": {
336
  "add_S0": {
337
- "full_accuracy": 0.97,
338
- "digit_accuracy": 0.9957142857142857,
339
  "n_examples": 100,
340
  "per_subtask": {
341
  "SA": {
342
- "accuracy": 0.9950413223140496,
343
  "count": 605
344
  },
345
  "SS": {
346
- "accuracy": 1.0,
347
  "count": 95
348
  }
349
  }
350
  },
351
  "add_S1": {
352
- "full_accuracy": 1.0,
353
- "digit_accuracy": 1.0,
354
  "n_examples": 100,
355
  "per_subtask": {
356
  "SA": {
357
- "accuracy": 1.0,
358
  "count": 204
359
  },
360
  "SC": {
361
- "accuracy": 1.0,
362
  "count": 169
363
  },
364
  "SS": {
365
- "accuracy": 1.0,
366
  "count": 31
367
  },
368
  "UC": {
369
- "accuracy": 1.0,
370
  "count": 296
371
  }
372
  }
373
  },
374
  "add_S2": {
375
- "full_accuracy": 0.97,
376
- "digit_accuracy": 0.9957142857142857,
377
  "n_examples": 100,
378
  "per_subtask": {
379
  "SA": {
380
- "accuracy": 1.0,
381
  "count": 163
382
  },
383
  "SC": {
384
- "accuracy": 0.9846153846153847,
385
  "count": 130
386
  },
387
  "SS": {
388
- "accuracy": 0.9885057471264368,
389
  "count": 87
390
  },
391
  "UC": {
392
- "accuracy": 1.0,
393
  "count": 203
394
  },
395
  "US": {
396
- "accuracy": 1.0,
397
  "count": 117
398
  }
399
  }
400
  },
401
  "add_S3": {
402
- "full_accuracy": 0.67,
403
- "digit_accuracy": 0.9528571428571428,
404
  "n_examples": 100,
405
  "per_subtask": {
406
  "SA": {
407
- "accuracy": 1.0,
408
  "count": 121
409
  },
410
  "SC": {
411
- "accuracy": 0.9834710743801653,
412
  "count": 121
413
  },
414
  "SS": {
415
- "accuracy": 1.0,
416
  "count": 49
417
  },
418
  "UC": {
419
- "accuracy": 0.8333333333333334,
420
  "count": 186
421
  },
422
  "US": {
423
- "accuracy": 1.0,
424
  "count": 223
425
  }
426
  }
427
  },
428
  "add_S4": {
429
- "full_accuracy": 0.6,
430
- "digit_accuracy": 0.9171428571428571,
431
  "n_examples": 100,
432
  "per_subtask": {
433
  "SA": {
434
- "accuracy": 1.0,
435
  "count": 104
436
  },
437
  "SC": {
438
- "accuracy": 1.0,
439
  "count": 106
440
  },
441
  "SS": {
442
- "accuracy": 1.0,
443
  "count": 23
444
  },
445
  "UC": {
446
- "accuracy": 0.8,
447
  "count": 160
448
  },
449
  "US": {
450
- "accuracy": 0.9153094462540716,
451
  "count": 307
452
  }
453
  }
454
  },
455
  "add_S5": {
456
- "full_accuracy": 0.5,
457
- "digit_accuracy": 0.8371428571428572,
458
  "n_examples": 100,
459
  "per_subtask": {
460
  "SA": {
461
- "accuracy": 1.0,
462
  "count": 100
463
  },
464
  "SC": {
465
- "accuracy": 1.0,
466
  "count": 100
467
  },
468
  "UC": {
469
- "accuracy": 0.65,
470
  "count": 100
471
  },
472
  "US": {
473
- "accuracy": 0.8025,
474
  "count": 400
475
  }
476
  }
477
  },
478
  "add_S6": {
479
- "full_accuracy": 0.5,
480
- "digit_accuracy": 0.8142857142857143,
481
  "n_examples": 100,
482
  "per_subtask": {
483
  "SC": {
484
- "accuracy": 1.0,
485
  "count": 100
486
  },
487
  "UC": {
488
- "accuracy": 0.76,
489
  "count": 100
490
  },
491
  "US": {
492
- "accuracy": 0.788,
493
  "count": 500
494
  }
495
  }
496
  },
497
  "add_random": {
498
- "full_accuracy": 0.97,
499
- "digit_accuracy": 0.9957142857142857,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.9955257270693513,
504
  "count": 447
505
  },
506
  "SC": {
507
- "accuracy": 0.99375,
508
  "count": 320
509
  },
510
  "SS": {
511
- "accuracy": 1.0,
512
  "count": 56
513
  },
514
  "UC": {
515
- "accuracy": 0.996219281663516,
516
  "count": 529
517
  },
518
  "US": {
519
- "accuracy": 1.0,
520
  "count": 48
521
  }
522
  }
523
  },
524
  "add_C1": {
525
- "full_accuracy": 0.99,
526
- "digit_accuracy": 0.9985714285714286,
527
  "n_examples": 100,
528
  "per_subtask": {
529
  "SA": {
530
- "accuracy": 0.998,
531
  "count": 500
532
  },
533
  "SC": {
534
- "accuracy": 1.0,
535
  "count": 100
536
  },
537
  "UC": {
538
- "accuracy": 1.0,
539
  "count": 100
540
  }
541
  }
542
  },
543
  "add_C2": {
544
- "full_accuracy": 0.93,
545
- "digit_accuracy": 0.99,
546
  "n_examples": 100,
547
  "per_subtask": {
548
  "SA": {
549
- "accuracy": 0.995,
550
  "count": 400
551
  },
552
  "SC": {
553
- "accuracy": 1.0,
554
  "count": 100
555
  },
556
  "UC": {
557
- "accuracy": 0.9743589743589743,
558
  "count": 156
559
  },
560
  "US": {
561
- "accuracy": 0.9772727272727273,
562
  "count": 44
563
  }
564
  }
565
  },
566
  "add_C3": {
567
- "full_accuracy": 0.85,
568
- "digit_accuracy": 0.9785714285714285,
569
  "n_examples": 100,
570
  "per_subtask": {
571
  "SA": {
572
- "accuracy": 1.0,
573
  "count": 300
574
  },
575
  "SC": {
576
- "accuracy": 1.0,
577
  "count": 100
578
  },
579
  "UC": {
580
- "accuracy": 0.9246231155778895,
581
  "count": 199
582
  },
583
  "US": {
584
- "accuracy": 1.0,
585
  "count": 101
586
  }
587
  }
588
  },
589
  "add_C4": {
590
- "full_accuracy": 0.78,
591
- "digit_accuracy": 0.9657142857142857,
592
  "n_examples": 100,
593
  "per_subtask": {
594
  "SA": {
595
- "accuracy": 1.0,
596
  "count": 200
597
  },
598
  "SC": {
599
- "accuracy": 1.0,
600
  "count": 100
601
  },
602
  "UC": {
603
- "accuracy": 0.9242424242424242,
604
  "count": 264
605
  },
606
  "US": {
607
- "accuracy": 0.9705882352941176,
608
  "count": 136
609
  }
610
  }
611
  },
612
  "add_C5": {
613
- "full_accuracy": 0.79,
614
- "digit_accuracy": 0.9614285714285714,
615
  "n_examples": 100,
616
  "per_subtask": {
617
  "SA": {
618
- "accuracy": 1.0,
619
  "count": 100
620
  },
621
  "SC": {
622
- "accuracy": 1.0,
623
  "count": 100
624
  },
625
  "UC": {
626
- "accuracy": 0.9354838709677419,
627
  "count": 310
628
  },
629
  "US": {
630
- "accuracy": 0.9631578947368421,
631
  "count": 190
632
  }
633
  }
634
  },
635
  "add_C6": {
636
- "full_accuracy": 0.74,
637
- "digit_accuracy": 0.9542857142857143,
638
  "n_examples": 100,
639
  "per_subtask": {
640
  "SC": {
641
- "accuracy": 1.0,
642
  "count": 100
643
  },
644
  "UC": {
645
- "accuracy": 0.9459459459459459,
646
  "count": 370
647
  },
648
  "US": {
649
- "accuracy": 0.9478260869565217,
650
  "count": 230
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
- "full_accuracy": 0.97,
656
- "digit_accuracy": 0.9957142857142857,
657
  "n_examples": 100,
658
  "per_subtask": {
659
  "MD": {
660
- "accuracy": 0.9951219512195122,
661
  "count": 615
662
  },
663
  "ME": {
@@ -667,16 +374,16 @@
667
  }
668
  },
669
  "sub_M1": {
670
- "full_accuracy": 0.97,
671
- "digit_accuracy": 0.9957142857142857,
672
  "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
675
- "accuracy": 0.9931506849315068,
676
  "count": 292
677
  },
678
  "MB": {
679
- "accuracy": 0.9930555555555556,
680
  "count": 144
681
  },
682
  "ME": {
@@ -684,49 +391,49 @@
684
  "count": 25
685
  },
686
  "UB": {
687
- "accuracy": 1.0,
688
  "count": 239
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
- "full_accuracy": 0.94,
694
- "digit_accuracy": 0.9914285714285714,
695
  "n_examples": 100,
696
  "per_subtask": {
697
  "MD": {
698
- "accuracy": 1.0,
699
  "count": 211
700
  },
701
  "MB": {
702
- "accuracy": 1.0,
703
  "count": 115
704
  },
705
  "ME": {
706
- "accuracy": 0.9882352941176471,
707
  "count": 85
708
  },
709
  "UB": {
710
- "accuracy": 0.9723756906077348,
711
  "count": 181
712
  },
713
  "UD": {
714
- "accuracy": 1.0,
715
  "count": 108
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
- "full_accuracy": 0.38,
721
- "digit_accuracy": 0.9028571428571428,
722
  "n_examples": 100,
723
  "per_subtask": {
724
  "MD": {
725
- "accuracy": 1.0,
726
  "count": 179
727
  },
728
  "MB": {
729
- "accuracy": 1.0,
730
  "count": 103
731
  },
732
  "ME": {
@@ -734,41 +441,41 @@
734
  "count": 56
735
  },
736
  "UB": {
737
- "accuracy": 0.610738255033557,
738
  "count": 149
739
  },
740
  "UD": {
741
- "accuracy": 0.9530516431924883,
742
  "count": 213
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
- "full_accuracy": 0.18,
748
- "digit_accuracy": 0.7942857142857143,
749
  "n_examples": 100,
750
  "per_subtask": {
751
  "MD": {
752
- "accuracy": 1.0,
753
  "count": 200
754
  },
755
  "MB": {
756
- "accuracy": 1.0,
757
  "count": 100
758
  },
759
  "UB": {
760
- "accuracy": 0.47,
761
  "count": 100
762
  },
763
  "UD": {
764
- "accuracy": 0.6966666666666667,
765
  "count": 300
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
- "full_accuracy": 0.13,
771
- "digit_accuracy": 0.6685714285714286,
772
  "n_examples": 100,
773
  "per_subtask": {
774
  "MD": {
@@ -776,30 +483,30 @@
776
  "count": 100
777
  },
778
  "MB": {
779
- "accuracy": 1.0,
780
  "count": 100
781
  },
782
  "UB": {
783
- "accuracy": 0.46,
784
  "count": 100
785
  },
786
  "UD": {
787
- "accuracy": 0.555,
788
  "count": 400
789
  }
790
  }
791
  },
792
  "sub_random": {
793
- "full_accuracy": 0.95,
794
- "digit_accuracy": 0.9928571428571429,
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
- "accuracy": 0.9883333333333333,
799
  "count": 600
800
  },
801
  "MB": {
802
- "accuracy": 0.9962546816479401,
803
  "count": 267
804
  },
805
  "ME": {
@@ -807,64 +514,64 @@
807
  "count": 53
808
  },
809
  "UB": {
810
- "accuracy": 0.9954441913439636,
811
  "count": 439
812
  },
813
  "UD": {
814
- "accuracy": 1.0,
815
  "count": 41
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
- "full_accuracy": 0.79,
821
- "digit_accuracy": 0.9685714285714285,
822
  "n_examples": 100,
823
  "per_subtask": {
824
  "MD": {
825
- "accuracy": 0.9966666666666667,
826
  "count": 300
827
  },
828
  "MB": {
829
- "accuracy": 1.0,
830
  "count": 100
831
  },
832
  "UB": {
833
- "accuracy": 0.8984771573604061,
834
  "count": 197
835
  },
836
  "UD": {
837
- "accuracy": 0.9902912621359223,
838
  "count": 103
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
- "full_accuracy": 0.61,
844
- "digit_accuracy": 0.9342857142857143,
845
  "n_examples": 100,
846
  "per_subtask": {
847
  "MD": {
848
- "accuracy": 0.98,
849
  "count": 200
850
  },
851
  "MB": {
852
- "accuracy": 1.0,
853
  "count": 100
854
  },
855
  "UB": {
856
- "accuracy": 0.8744939271255061,
857
  "count": 247
858
  },
859
  "UD": {
860
- "accuracy": 0.9281045751633987,
861
  "count": 153
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
- "full_accuracy": 0.66,
867
- "digit_accuracy": 0.9342857142857143,
868
  "n_examples": 100,
869
  "per_subtask": {
870
  "MD": {
@@ -872,23 +579,23 @@
872
  "count": 100
873
  },
874
  "MB": {
875
- "accuracy": 1.0,
876
  "count": 100
877
  },
878
  "UB": {
879
- "accuracy": 0.8993288590604027,
880
  "count": 298
881
  },
882
  "UD": {
883
- "accuracy": 0.9207920792079208,
884
  "count": 202
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
- "overall_accuracy": 0.76,
891
- "digit_accuracy": 0.9431318681318681,
892
  "total_examples": 2600,
893
  "n_splits": 24
894
  }
 
3
  "step": [
4
  50,
5
  100,
6
+ 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  ],
8
  "loss": [
9
+ 7.406985759735107,
10
+ 4.287901878356934,
11
+ 2.494678020477295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ],
13
  "base_loss": [
14
+ 7.406985759735107,
15
+ 4.287901878356934,
16
+ 2.494678020477295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ],
18
  "lr": [
19
  3.9200000000000004e-05,
20
  7.92e-05,
21
+ 3.825713572096903e-06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  ],
23
  "eval_step": [
24
+ 156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  ],
26
  "eval_epoch": [
27
+ 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  ],
29
  "eval_accuracy": [
30
+ 0.004210526315789474
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ]
32
  },
33
+ "final_accuracy": 0.005,
34
  "sft_eval": {
35
  "config": {
36
  "ops": "add_sub",
 
41
  },
42
  "splits": {
43
  "add_S0": {
44
+ "full_accuracy": 0.0,
45
+ "digit_accuracy": 0.31142857142857144,
46
  "n_examples": 100,
47
  "per_subtask": {
48
  "SA": {
49
+ "accuracy": 0.2049586776859504,
50
  "count": 605
51
  },
52
  "SS": {
53
+ "accuracy": 0.9894736842105263,
54
  "count": 95
55
  }
56
  }
57
  },
58
  "add_S1": {
59
+ "full_accuracy": 0.0,
60
+ "digit_accuracy": 0.19428571428571428,
61
  "n_examples": 100,
62
  "per_subtask": {
63
  "SA": {
64
+ "accuracy": 0.22549019607843138,
65
  "count": 204
66
  },
67
  "SC": {
68
+ "accuracy": 0.09467455621301775,
69
  "count": 169
70
  },
71
  "SS": {
72
+ "accuracy": 0.6774193548387096,
73
  "count": 31
74
  },
75
  "UC": {
76
+ "accuracy": 0.17905405405405406,
77
  "count": 296
78
  }
79
  }
80
  },
81
  "add_S2": {
82
+ "full_accuracy": 0.0,
83
+ "digit_accuracy": 0.31285714285714283,
84
  "n_examples": 100,
85
  "per_subtask": {
86
  "SA": {
87
+ "accuracy": 0.294478527607362,
88
  "count": 163
89
  },
90
  "SC": {
91
+ "accuracy": 0.12307692307692308,
92
  "count": 130
93
  },
94
  "SS": {
95
+ "accuracy": 0.5402298850574713,
96
  "count": 87
97
  },
98
  "UC": {
99
+ "accuracy": 0.24630541871921183,
100
  "count": 203
101
  },
102
  "US": {
103
+ "accuracy": 0.49572649572649574,
104
  "count": 117
105
  }
106
  }
107
  },
108
  "add_S3": {
109
+ "full_accuracy": 0.0,
110
+ "digit_accuracy": 0.36857142857142855,
111
  "n_examples": 100,
112
  "per_subtask": {
113
  "SA": {
114
+ "accuracy": 0.2727272727272727,
115
  "count": 121
116
  },
117
  "SC": {
118
+ "accuracy": 0.08264462809917356,
119
  "count": 121
120
  },
121
  "SS": {
122
+ "accuracy": 0.40816326530612246,
123
  "count": 49
124
  },
125
  "UC": {
126
+ "accuracy": 0.3118279569892473,
127
  "count": 186
128
  },
129
  "US": {
130
+ "accuracy": 0.6143497757847534,
131
  "count": 223
132
  }
133
  }
134
  },
135
  "add_S4": {
136
+ "full_accuracy": 0.0,
137
+ "digit_accuracy": 0.37142857142857144,
138
  "n_examples": 100,
139
  "per_subtask": {
140
  "SA": {
141
+ "accuracy": 0.38461538461538464,
142
  "count": 104
143
  },
144
  "SC": {
145
+ "accuracy": 0.09433962264150944,
146
  "count": 106
147
  },
148
  "SS": {
149
+ "accuracy": 0.6086956521739131,
150
  "count": 23
151
  },
152
  "UC": {
153
+ "accuracy": 0.325,
154
  "count": 160
155
  },
156
  "US": {
157
+ "accuracy": 0.46905537459283386,
158
  "count": 307
159
  }
160
  }
161
  },
162
  "add_S5": {
163
+ "full_accuracy": 0.0,
164
+ "digit_accuracy": 0.22,
165
  "n_examples": 100,
166
  "per_subtask": {
167
  "SA": {
168
+ "accuracy": 0.4,
169
  "count": 100
170
  },
171
  "SC": {
172
+ "accuracy": 0.05,
173
  "count": 100
174
  },
175
  "UC": {
176
+ "accuracy": 0.21,
177
  "count": 100
178
  },
179
  "US": {
180
+ "accuracy": 0.22,
181
  "count": 400
182
  }
183
  }
184
  },
185
  "add_S6": {
186
+ "full_accuracy": 0.13,
187
+ "digit_accuracy": 0.5328571428571428,
188
  "n_examples": 100,
189
  "per_subtask": {
190
  "SC": {
191
+ "accuracy": 0.13,
192
  "count": 100
193
  },
194
  "UC": {
195
+ "accuracy": 0.6,
196
  "count": 100
197
  },
198
  "US": {
199
+ "accuracy": 0.6,
200
  "count": 500
201
  }
202
  }
203
  },
204
  "add_random": {
205
+ "full_accuracy": 0.0,
206
+ "digit_accuracy": 0.19714285714285715,
207
  "n_examples": 200,
208
  "per_subtask": {
209
  "SA": {
210
+ "accuracy": 0.21029082774049218,
211
  "count": 447
212
  },
213
  "SC": {
214
+ "accuracy": 0.103125,
215
  "count": 320
216
  },
217
  "SS": {
218
+ "accuracy": 0.5535714285714286,
219
  "count": 56
220
  },
221
  "UC": {
222
+ "accuracy": 0.18714555765595464,
223
  "count": 529
224
  },
225
  "US": {
226
+ "accuracy": 0.3958333333333333,
227
  "count": 48
228
  }
229
  }
230
  },
231
  "add_C1": {
232
+ "full_accuracy": 0.0,
233
+ "digit_accuracy": 0.12142857142857143,
234
  "n_examples": 100,
235
  "per_subtask": {
236
  "SA": {
237
+ "accuracy": 0.164,
238
  "count": 500
239
  },
240
  "SC": {
241
+ "accuracy": 0.0,
242
  "count": 100
243
  },
244
  "UC": {
245
+ "accuracy": 0.03,
246
  "count": 100
247
  }
248
  }
249
  },
250
  "add_C2": {
251
+ "full_accuracy": 0.0,
252
+ "digit_accuracy": 0.1357142857142857,
253
  "n_examples": 100,
254
  "per_subtask": {
255
  "SA": {
256
+ "accuracy": 0.2075,
257
  "count": 400
258
  },
259
  "SC": {
260
+ "accuracy": 0.01,
261
  "count": 100
262
  },
263
  "UC": {
264
+ "accuracy": 0.0641025641025641,
265
  "count": 156
266
  },
267
  "US": {
268
+ "accuracy": 0.022727272727272728,
269
  "count": 44
270
  }
271
  }
272
  },
273
  "add_C3": {
274
+ "full_accuracy": 0.0,
275
+ "digit_accuracy": 0.15571428571428572,
276
  "n_examples": 100,
277
  "per_subtask": {
278
  "SA": {
279
+ "accuracy": 0.25,
280
  "count": 300
281
  },
282
  "SC": {
283
+ "accuracy": 0.07,
284
  "count": 100
285
  },
286
  "UC": {
287
+ "accuracy": 0.08040201005025126,
288
  "count": 199
289
  },
290
  "US": {
291
+ "accuracy": 0.10891089108910891,
292
  "count": 101
293
  }
294
  }
295
  },
296
  "add_C4": {
297
+ "full_accuracy": 0.0,
298
+ "digit_accuracy": 0.17285714285714285,
299
  "n_examples": 100,
300
  "per_subtask": {
301
  "SA": {
302
+ "accuracy": 0.365,
303
  "count": 200
304
  },
305
  "SC": {
306
+ "accuracy": 0.04,
307
  "count": 100
308
  },
309
  "UC": {
310
+ "accuracy": 0.07954545454545454,
311
  "count": 264
312
  },
313
  "US": {
314
+ "accuracy": 0.16911764705882354,
315
  "count": 136
316
  }
317
  }
318
  },
319
  "add_C5": {
320
+ "full_accuracy": 0.0,
321
+ "digit_accuracy": 0.19857142857142857,
322
  "n_examples": 100,
323
  "per_subtask": {
324
  "SA": {
325
+ "accuracy": 0.51,
326
  "count": 100
327
  },
328
  "SC": {
329
+ "accuracy": 0.07,
330
  "count": 100
331
  },
332
  "UC": {
333
+ "accuracy": 0.09032258064516129,
334
  "count": 310
335
  },
336
  "US": {
337
+ "accuracy": 0.2789473684210526,
338
  "count": 190
339
  }
340
  }
341
  },
342
  "add_C6": {
343
+ "full_accuracy": 0.0,
344
+ "digit_accuracy": 0.36142857142857143,
345
  "n_examples": 100,
346
  "per_subtask": {
347
  "SC": {
348
+ "accuracy": 0.15,
349
  "count": 100
350
  },
351
  "UC": {
352
+ "accuracy": 0.20270270270270271,
353
  "count": 370
354
  },
355
  "US": {
356
+ "accuracy": 0.7086956521739131,
357
  "count": 230
358
  }
359
  }
360
  },
361
  "sub_M0": {
362
+ "full_accuracy": 0.0,
363
+ "digit_accuracy": 0.29285714285714287,
364
  "n_examples": 100,
365
  "per_subtask": {
366
  "MD": {
367
+ "accuracy": 0.1951219512195122,
368
  "count": 615
369
  },
370
  "ME": {
 
374
  }
375
  },
376
  "sub_M1": {
377
+ "full_accuracy": 0.0,
378
+ "digit_accuracy": 0.22428571428571428,
379
  "n_examples": 100,
380
  "per_subtask": {
381
  "MD": {
382
+ "accuracy": 0.3698630136986301,
383
  "count": 292
384
  },
385
  "MB": {
386
+ "accuracy": 0.0,
387
  "count": 144
388
  },
389
  "ME": {
 
391
  "count": 25
392
  },
393
  "UB": {
394
+ "accuracy": 0.100418410041841,
395
  "count": 239
396
  }
397
  }
398
  },
399
  "sub_M2": {
400
+ "full_accuracy": 0.0,
401
+ "digit_accuracy": 0.35428571428571426,
402
  "n_examples": 100,
403
  "per_subtask": {
404
  "MD": {
405
+ "accuracy": 0.6208530805687204,
406
  "count": 211
407
  },
408
  "MB": {
409
+ "accuracy": 0.0,
410
  "count": 115
411
  },
412
  "ME": {
413
+ "accuracy": 1.0,
414
  "count": 85
415
  },
416
  "UB": {
417
+ "accuracy": 0.17679558011049723,
418
  "count": 181
419
  },
420
  "UD": {
421
+ "accuracy": 0.0,
422
  "count": 108
423
  }
424
  }
425
  },
426
  "sub_M3": {
427
+ "full_accuracy": 0.0,
428
+ "digit_accuracy": 0.3,
429
  "n_examples": 100,
430
  "per_subtask": {
431
  "MD": {
432
+ "accuracy": 0.7597765363128491,
433
  "count": 179
434
  },
435
  "MB": {
436
+ "accuracy": 0.0,
437
  "count": 103
438
  },
439
  "ME": {
 
441
  "count": 56
442
  },
443
  "UB": {
444
+ "accuracy": 0.12080536912751678,
445
  "count": 149
446
  },
447
  "UD": {
448
+ "accuracy": 0.0,
449
  "count": 213
450
  }
451
  }
452
  },
453
  "sub_M4": {
454
+ "full_accuracy": 0.0,
455
+ "digit_accuracy": 0.18571428571428572,
456
  "n_examples": 100,
457
  "per_subtask": {
458
  "MD": {
459
+ "accuracy": 0.5,
460
  "count": 200
461
  },
462
  "MB": {
463
+ "accuracy": 0.0,
464
  "count": 100
465
  },
466
  "UB": {
467
+ "accuracy": 0.3,
468
  "count": 100
469
  },
470
  "UD": {
471
+ "accuracy": 0.0,
472
  "count": 300
473
  }
474
  }
475
  },
476
  "sub_M5": {
477
+ "full_accuracy": 0.0,
478
+ "digit_accuracy": 0.18714285714285714,
479
  "n_examples": 100,
480
  "per_subtask": {
481
  "MD": {
 
483
  "count": 100
484
  },
485
  "MB": {
486
+ "accuracy": 0.0,
487
  "count": 100
488
  },
489
  "UB": {
490
+ "accuracy": 0.31,
491
  "count": 100
492
  },
493
  "UD": {
494
+ "accuracy": 0.0,
495
  "count": 400
496
  }
497
  }
498
  },
499
  "sub_random": {
500
+ "full_accuracy": 0.0,
501
+ "digit_accuracy": 0.2307142857142857,
502
  "n_examples": 200,
503
  "per_subtask": {
504
  "MD": {
505
+ "accuracy": 0.3616666666666667,
506
  "count": 600
507
  },
508
  "MB": {
509
+ "accuracy": 0.0,
510
  "count": 267
511
  },
512
  "ME": {
 
514
  "count": 53
515
  },
516
  "UB": {
517
+ "accuracy": 0.12072892938496584,
518
  "count": 439
519
  },
520
  "UD": {
521
+ "accuracy": 0.0,
522
  "count": 41
523
  }
524
  }
525
  },
526
  "sub_B3": {
527
+ "full_accuracy": 0.0,
528
+ "digit_accuracy": 0.19285714285714287,
529
  "n_examples": 100,
530
  "per_subtask": {
531
  "MD": {
532
+ "accuracy": 0.3333333333333333,
533
  "count": 300
534
  },
535
  "MB": {
536
+ "accuracy": 0.0,
537
  "count": 100
538
  },
539
  "UB": {
540
+ "accuracy": 0.17766497461928935,
541
  "count": 197
542
  },
543
  "UD": {
544
+ "accuracy": 0.0,
545
  "count": 103
546
  }
547
  }
548
  },
549
  "sub_B4": {
550
+ "full_accuracy": 0.0,
551
+ "digit_accuracy": 0.19428571428571428,
552
  "n_examples": 100,
553
  "per_subtask": {
554
  "MD": {
555
+ "accuracy": 0.5,
556
  "count": 200
557
  },
558
  "MB": {
559
+ "accuracy": 0.0,
560
  "count": 100
561
  },
562
  "UB": {
563
+ "accuracy": 0.145748987854251,
564
  "count": 247
565
  },
566
  "UD": {
567
+ "accuracy": 0.0,
568
  "count": 153
569
  }
570
  }
571
  },
572
  "sub_B5": {
573
+ "full_accuracy": 0.0,
574
+ "digit_accuracy": 0.19,
575
  "n_examples": 100,
576
  "per_subtask": {
577
  "MD": {
 
579
  "count": 100
580
  },
581
  "MB": {
582
+ "accuracy": 0.0,
583
  "count": 100
584
  },
585
  "UB": {
586
+ "accuracy": 0.11073825503355705,
587
  "count": 298
588
  },
589
  "UD": {
590
+ "accuracy": 0.0,
591
  "count": 202
592
  }
593
  }
594
  }
595
  },
596
  "summary": {
597
+ "overall_accuracy": 0.005,
598
+ "digit_accuracy": 0.24736263736263736,
599
  "total_examples": 2600,
600
  "n_splits": 24
601
  }
add_sub_baseline_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9024dda0f3d40530abfd72b50df46a8e6a8a4dde5fed1a429b758171421411ab
3
  size 650266922
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93cfd7c10000750b8a27620e26b0445a5a43d40542a11ffc61a0136c8f555e0b
3
  size 650266922
add_sub_baseline_10K/train_config.json CHANGED
@@ -30,13 +30,13 @@
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
- "num_epochs": 20,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
- "output_dir": "ckpt/sweep/as_baseline_10K_2L3H510d",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
@@ -66,23 +66,24 @@
66
  "mode": "baseline",
67
  "device": "cuda",
68
  "push_to_hub": true,
69
- "no_wandb": false,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_10K",
72
- "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
- "timestamp": "2026-04-16T01:10:30.945547+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
- "wandb_run_id": "dvsplayi",
81
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/dvsplayi",
82
  "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
  "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
  "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
- "final_accuracy": 0.76,
86
- "sft_accuracy": 0.76,
 
87
  "eval_method": "ArithmeticEvaluator"
88
  }
 
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
+ "num_epochs": 1,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
+ "output_dir": "ckpt/smoke_test_baseline",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
 
66
  "mode": "baseline",
67
  "device": "cuda",
68
  "push_to_hub": true,
69
+ "no_wandb": true,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_10K",
72
+ "git_commit": "2cf43f15c420a037503a723221cfde984733f98c",
73
+ "timestamp": "2026-04-19T15:25:53.286620+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": null,
81
+ "wandb_url": null,
82
  "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
  "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
  "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "config_hash": "bd3188fbcb22",
86
+ "final_accuracy": 0.005,
87
+ "sft_accuracy": 0.005,
88
  "eval_method": "ArithmeticEvaluator"
89
  }