djgagne committed on
Commit
8c9de0c
·
1 Parent(s): 342ba9a

Added single step and finetune weights

Browse files
backup_single/.ipynb_checkpoints/training_log-checkpoint.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ index,epoch,train_loss,valid_loss,train_acc,valid_acc,train_mae,valid_mae,train_forecast_len,lr,valid_forecast_len
2
+ 0,0.0,0.1009794427448193,0.1027883764770295,0.94327420322957,0.9739029765129088,0.1534553673871782,0.1048697493142551,1.0,0.0009994965332706,
3
+ 1,1.0,0.019675959971095,0.0868489891290664,0.978004109926438,0.9811553531222872,0.0957085051940063,0.089651234779093,1.0,0.0009979871469976,
4
+ 2,2.0,0.0151972808869911,0.0786430971490012,0.9831368846170018,0.984541834725274,0.0845575840662369,0.0817086655232641,1.0,0.0009954748808839,
5
+ 3,3.0,0.0130550836908808,0.0741522577073839,0.9857291311360477,0.9864424957169426,0.0784229645735762,0.0773795636163817,1.0,0.0009919647942993,
6
+ 4,4.0,0.0119756458991657,0.0715151440766122,0.9870807892820808,0.9872670186890496,0.074865428542488,0.0748724273509449,1.0,0.0009874639560909,
7
+ 5,5.0,0.0112956749882256,0.0761996052331394,0.9878721300135838,0.9880205419328476,0.0725079033147083,0.0729023743006918,1.0,0.0009819814303479,
8
+ 6,6.0,0.0108025243594782,0.0742437612679269,0.988425366228886,0.9885241680675084,0.0707315514512946,0.0707739344901508,1.0,0.0009755282581475,
9
+ 7,7.0,0.010423366222112,0.0740774704350365,0.9888492891292894,0.9887091702885098,0.0693481621238287,0.070707170996401,1.0,0.0009681174353198,
10
+ 8,8.0,0.0101209639440803,0.0727778664893574,0.9891780853606342,0.9890378952026369,0.0682236003239503,0.069233152270317,1.0,0.0009597638862757,
11
+ 9,9.0,0.0098750877382463,0.0722406513161129,0.9894475843799247,0.9893197271558972,0.0672991637810227,0.0686951773034201,1.0,0.0009504844339512,
12
+ 10,10.0,0.0096599225844308,0.0712286000450452,0.989684408903122,0.9894908361964756,0.0664751630988013,0.0675681359238094,1.0,0.0009402977659283,
13
+ 11,11.0,0.0094818470246169,0.0702389660808775,0.9898788600825192,0.989692693286472,0.0657934360666556,0.0665735589133368,1.0,0.0009292243968009,
14
+ 12,12.0,0.00931685034374,0.0702956752644644,0.9900572099042744,0.9898342741860284,0.0651478600091813,0.0665175035595893,1.0,0.0009172866268606,
15
+ 13,13.0,0.0091760145034641,0.0696684651904636,0.99021095000626,0.9899267342355516,0.0645949800488319,0.0658018956581751,1.0,0.0009045084971874,
16
+ 14,14.0,0.009054467713544,0.069178935388724,0.990342890546563,0.990099212858412,0.0641164990309416,0.0652842425637775,1.0,0.000890915741234,
17
+ 15,15.0,0.008937944289234,0.0689917723337809,0.9904679285676292,0.9901854236920674,0.0636501339274678,0.0650799024436209,1.0,0.0008765357330018,
18
+ 16,16.0,0.0088368062871728,0.0683553102943632,0.99057797605402,0.990315987004174,0.0632446752358855,0.0643851502074135,1.0,0.0008613974319136,
19
+ 17,17.0,0.0087418129835068,0.0601519813968075,0.9906821045982704,0.990372551812066,0.0628605140498682,0.0643506565027766,1.0,0.0008455313244934,
20
+ 18,18.0,0.0086546849888445,0.0597193823920355,0.9907743083961894,0.9904850191540188,0.062502304592243,0.0639373604622152,1.0,0.0008289693629698,
21
+ 19,19.0,0.0085733014553931,0.0593640875485208,0.9908616850885112,0.9905507524808248,0.062172229688489,0.0636733880473507,1.0,0.0008117449009293,
22
+ 20,20.0,0.0084998264312325,0.0590407683617538,0.9909425107615716,0.9906116353140936,0.0618693786913926,0.0633704620103041,1.0,0.0007938926261462,
23
+ 21,21.0,0.0084292321816463,0.0588373551766077,0.99101791097207,0.9906822787390814,0.0615794747293497,0.0632358468241161,1.0,0.000775448490726,
24
+ 22,22.0,0.0083631074385654,0.0583808085984653,0.9910881071278218,0.9907019972801208,0.061308835618449,0.0627665976683298,1.0,0.0007564496387029,
25
+ 23,23.0,0.0082998081518525,0.0583362696071465,0.9911576903937908,0.9907915671666464,0.0610430538654327,0.0627390105691221,1.0,0.0007369343312364,
26
+ 24,24.0,0.0082426977341241,0.0579863401750723,0.9912183459555164,0.9908138381110296,0.0608090944985827,0.0624360193808873,1.0,0.0007169418695587,
27
+ 25,25.0,0.0081857938227882,0.0603038504573234,0.9912798084904638,0.9913705920570351,0.0605694888222418,0.0602742368208893,1.0,0.0006965125158269,
28
+ 26,26.0,0.0081330842490234,0.0601081264069241,0.9913358248016808,0.9913957462551888,0.0603508613396729,0.0600969716236832,1.0,0.0006756874120406,
29
+ 27,,,,,,,,,,
30
+ 28,27.0,0.0080793036346773,0.0581171515087286,0.9913951707020234,0.990975715054406,0.0601222649823581,0.0619419970446162,1.0,0.0006545084971874,1.0
31
+ 29,28.0,0.0080303936715373,0.0577960211369726,0.9914486467838288,0.991037999259101,0.0599184757893842,0.0616544663078255,1.0,0.0006330184227833,1.0
32
+ 30,29.0,0.0079833286958275,0.0577082768082618,0.9914989711193556,0.9910609987046984,0.0597187550236167,0.0615710041589207,1.0,0.0006112604669781,1.0
33
+ 31,30.0,0.0079363946847387,0.0573835965659883,0.991549631886268,0.9911303440729776,0.0595209216657146,0.0613138885961638,1.0,0.0005892784473993,1.0
34
+ 32,31.0,0.0078930764391055,0.056579791340563,0.9915970942612444,0.9911372542381288,0.0593344994663689,0.0610890839662816,1.0,0.0005671166329088,1.0
35
+ 33,32.0,0.0078520439688137,0.0566349982387489,0.9916415573171016,0.9911991384294296,0.0591628991109266,0.0612032888664139,1.0,0.0005448196544517,1.0
36
+ 34,33.0,0.0078106326422538,0.056273600541883,0.99168776166573,0.9912242041693792,0.0589861686414714,0.0608326474825541,1.0,0.0005224324151752,1.0
37
+ 35,34.0,0.0077714874694291,0.0560553298228316,0.9917285669720576,0.9912754522429572,0.0588185834813486,0.0606301685174306,1.0,0.0005,1.0
38
+ 36,35.0,0.0077335360143438,0.0566925521526071,0.9917714921611078,0.9912715819146898,0.0586554208602965,0.0606366270118289,1.0,0.0004775675848247,1.0
39
+ 37,36.0,0.0076967598778834,0.05643899159299,0.9918113076619888,0.9913144058651394,0.0584977892593721,0.0604506749245855,1.0,0.0004551803455482,1.0
40
+ 38,37.0,0.0076611397421678,0.0564949775735537,0.9918505828702048,0.9912973059548272,0.0583438508379995,0.0604623614086045,1.0,0.0004328833670911,1.0
41
+ 39,38.0,0.0076274576790505,0.0561670700709025,0.9918875504075813,0.991361051135593,0.0581993957395466,0.0601712295578585,1.0,0.0004107215526006,1.0
42
+ 40,39.0,0.0075949914102034,0.0644460654093159,0.9919232221131914,0.9913600550757514,0.0580608733772729,0.0601557648016346,1.0,0.0003887395330218,1.0
43
+ 41,40.0,0.0075620844107372,0.064289897100793,0.9919600250680796,0.9914333489206102,0.0579155459141965,0.0599343188107013,1.0,0.0003669815772166,1.0
44
+ 42,41.0,0.0075315364097569,0.0641076669096946,0.9919931383280272,0.9914571894539728,0.057783521611369,0.0598166498045126,1.0,0.0003454915028125,1.0
45
+ 43,42.0,0.0075010784370615,0.0640382651653554,0.9920276840416232,0.9914649923642476,0.0576503183692693,0.0596944563090801,1.0,0.0003243125879593,1.0
46
+ 44,43.0,0.007472936848304,0.0639235826830069,0.9920596264721302,0.9915061102973092,0.0575274907805946,0.0595703116721577,1.0,0.000303487484173,1.0
47
+ 45,44.0,0.0074444434827905,0.0639039695262908,0.9920910296815164,0.9915179557270474,0.0574037855532899,0.0595297686755657,1.0,0.0002830581304412,1.0
48
+ 46,45.0,0.0074174716718118,0.0547702770266268,0.9921217421802242,0.9915496057934232,0.0572855648511413,0.0593745187752776,1.0,0.0002630656687635,1.0
49
+ 47,46.0,0.0073907052564403,0.0548254136410024,0.9921517261963212,0.99152289364073,0.0571677961229775,0.0594311359028021,1.0,0.000243550361297,1.0
50
+ 48,47.0,0.0073657991422151,0.0545883624090088,0.9921797737981496,0.99158462550905,0.0570587853304623,0.0592154045899709,1.0,0.0002245515092739,1.0
51
+ 49,48.0,0.0073413327940624,0.0549789651400513,0.9922071443514876,0.9915956417719524,0.0569513976636729,0.0591794966823524,1.0,0.0002061073738537,1.0
52
+ 50,49.0,0.0073178111899937,0.0549225054681301,0.992234143480826,0.9916122992833456,0.0568474041507317,0.0591052624086538,1.0,0.0001882550990706,1.0
53
+ 51,50.0,0.0072954127986214,0.0547993102007442,0.9922600163837496,0.9916072050730388,0.0567492793638552,0.0590278472337457,1.0,0.0001710306370301,1.0
54
+ 52,51.0,0.007273508728419,0.0633058957755565,0.99228490205963,0.9916460341877408,0.0566524313302354,0.0589173430369959,1.0,0.0001544686755065,1.0
55
+ 53,52.0,0.0072527236388891,0.0541604169540935,0.9923085340957964,0.9916655633184644,0.0565598978545893,0.0588861417439248,1.0,0.0001386025680863,1.0
56
+ 54,53.0,0.0072332986735142,0.0540786994000275,0.9923308940415972,0.9916760828759936,0.0564748854554268,0.0588100816640589,1.0,0.0001234642669981,1.0
57
+ 55,54.0,0.0072141582214304,0.0539976932936244,0.992352688915274,0.991687744193607,0.0563887013038725,0.0587553933262825,1.0,0.0001090842587659,1.0
58
+ 56,55.0,0.0071961989158011,0.0539236054652267,0.9923736041851258,0.991705028216044,0.0563090115932099,0.0586584944691922,1.0,9.54915028125264e-05,1.0
59
+ 57,56.0,0.0071795341116637,0.0538788974285125,0.9923931933186028,0.9917213280995688,0.056234765938075,0.0586185601022508,1.0,8.271337313934873e-05,1.0
60
+ 58,57.0,0.0071636978502835,0.0537894251445929,0.99241107735741,0.991727234257592,0.0561641372949554,0.0585509944293234,1.0,7.0775603199067e-05,1.0
61
+ 59,58.0,0.0071492903426373,0.0628995653655793,0.9924280616339672,0.9917405486106872,0.0560998941426364,0.0585076812240812,1.0,5.970223407163104e-05,1.0
62
+ 60,59.0,0.007135755741142,0.0628647624618477,0.992444023222066,0.9917464388741388,0.0560391753451542,0.0584567987256579,1.0,4.951556604879051e-05,1.0
63
+ 61,60.0,0.0071229146066215,0.062842031651073,0.9924588899599032,0.9917527622646756,0.0559809200988894,0.0584456553889645,1.0,4.023611372427474e-05,1.0
64
+ 62,,,,,,,,,,
65
+ 63,61.0,0.0071116714105219,0.054225208527512,0.9924720488572388,0.9917570485009088,0.0559307862449897,0.0584127764734956,1.0,3.188256468013142e-05,1.0
66
+ 64,62.0,0.0071012400428168,0.0541794104708565,0.9924842232398772,0.9917684568299188,0.0558836536754048,0.0583652096490065,1.0,2.447174185242325e-05,1.0
67
+ 65,63.0,0.0070923216879012,0.0541527594129244,0.992494883698024,0.9917673733499316,0.0558433167908466,0.0583537354237503,1.0,1.8018569652073392e-05,1.0
68
+ 66,64.0,0.0070842290056471,0.0541142652432123,0.9925042093134998,0.9917791989114548,0.055806908285601,0.0583142161369323,1.0,1.2536043909088198e-05,1.0
69
+ 67,65.0,0.0070775183637134,0.0540941999190383,0.9925123827176148,0.9917832758691576,0.0557763982752568,0.0583013370633125,1.0,8.03520570068517e-06,1.0
70
+ 68,66.0,0.0070716849972985,0.0540783844060368,0.9925192085544716,0.9917850136756896,0.0557499875869141,0.0582870386540889,1.0,4.525119116032653e-06,1.0
71
+ 69,67.0,0.0070673599365701,0.0540728816555606,0.9925242980879344,0.9917882919311524,0.0557304875004325,0.058270867665608725,1.0,2.012853002380467e-06,1.0
72
+ 70,68.0,0.007063989337073283,0.054064563827382194,0.9925282489382818,0.9917892840173509,0.055715665660714835,0.05826829680138164,1.0,5.034667293427055e-07,1.0
73
+ 71,69.0,0.007062000732638695,0.05405975985858175,0.9925306557604436,0.9917901198069254,0.05570720089913419,0.0582624898188644,1.0,0.0,1.0
backup_single/backup_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9be8fb23430592b596035f034d342d18cb6c3c03c0921a3c7f79604808e9588
3
+ size 1132
backup_single/backup_model_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3fc4023ef69eb06b51fcde0f5a7fe5f1b6eadbca8f9c34590e2d40e4af01e01
3
+ size 499596942
backup_single/backup_optimizer_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5289c997b54e3ca945e950a7e5b135b3e20eead14c64ae0d0f7d8cee620ad902
3
+ size 995925198
backup_single/best_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394d8dc509edd026166d2dfec1bdb8c01968dd47e41e9d5887503d6a3f6e52e5
3
+ size 1132
backup_single/best_model_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b279c4117c4c863769b2bbeb7394b202b4099df4988f9a43c4a6ef07f28f872c
3
+ size 499596942
backup_single/best_optimizer_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89edd5ebcf6ace8bc1813c577f47edc70fa05c134b832cbd439f4d72c7c511a9
3
+ size 995925198
backup_single/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a09df09ff71135cd8429fd199646fbaa0e2585c571eb6161f00b2a3bb9b9db53
3
+ size 1132
backup_single/model_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e342d121b2cb5eabf8adc15ced4bf2d499de99c1a8be5bdd4f0768cb92510dc
3
+ size 499596942
backup_single/optimizer_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8aafa5d31b09a6fa9ce6a322548188c3e5cef0e46c2c3c9fb91126763fdb720
3
+ size 995925198
backup_single/training_log.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ index,epoch,train_loss,valid_loss,train_acc,valid_acc,train_mae,valid_mae,train_forecast_len,lr,valid_forecast_len
2
+ 0,0.0,0.1009794427448193,0.1027883764770295,0.94327420322957,0.9739029765129088,0.1534553673871782,0.1048697493142551,1.0,0.0009994965332706,
3
+ 1,1.0,0.019675959971095,0.0868489891290664,0.978004109926438,0.9811553531222872,0.0957085051940063,0.089651234779093,1.0,0.0009979871469976,
4
+ 2,2.0,0.0151972808869911,0.0786430971490012,0.9831368846170018,0.984541834725274,0.0845575840662369,0.0817086655232641,1.0,0.0009954748808839,
5
+ 3,3.0,0.0130550836908808,0.0741522577073839,0.9857291311360477,0.9864424957169426,0.0784229645735762,0.0773795636163817,1.0,0.0009919647942993,
6
+ 4,4.0,0.0119756458991657,0.0715151440766122,0.9870807892820808,0.9872670186890496,0.074865428542488,0.0748724273509449,1.0,0.0009874639560909,
7
+ 5,5.0,0.0112956749882256,0.0761996052331394,0.9878721300135838,0.9880205419328476,0.0725079033147083,0.0729023743006918,1.0,0.0009819814303479,
8
+ 6,6.0,0.0108025243594782,0.0742437612679269,0.988425366228886,0.9885241680675084,0.0707315514512946,0.0707739344901508,1.0,0.0009755282581475,
9
+ 7,7.0,0.010423366222112,0.0740774704350365,0.9888492891292894,0.9887091702885098,0.0693481621238287,0.070707170996401,1.0,0.0009681174353198,
10
+ 8,8.0,0.0101209639440803,0.0727778664893574,0.9891780853606342,0.9890378952026369,0.0682236003239503,0.069233152270317,1.0,0.0009597638862757,
11
+ 9,9.0,0.0098750877382463,0.0722406513161129,0.9894475843799247,0.9893197271558972,0.0672991637810227,0.0686951773034201,1.0,0.0009504844339512,
12
+ 10,10.0,0.0096599225844308,0.0712286000450452,0.989684408903122,0.9894908361964756,0.0664751630988013,0.0675681359238094,1.0,0.0009402977659283,
13
+ 11,11.0,0.0094818470246169,0.0702389660808775,0.9898788600825192,0.989692693286472,0.0657934360666556,0.0665735589133368,1.0,0.0009292243968009,
14
+ 12,12.0,0.00931685034374,0.0702956752644644,0.9900572099042744,0.9898342741860284,0.0651478600091813,0.0665175035595893,1.0,0.0009172866268606,
15
+ 13,13.0,0.0091760145034641,0.0696684651904636,0.99021095000626,0.9899267342355516,0.0645949800488319,0.0658018956581751,1.0,0.0009045084971874,
16
+ 14,14.0,0.009054467713544,0.069178935388724,0.990342890546563,0.990099212858412,0.0641164990309416,0.0652842425637775,1.0,0.000890915741234,
17
+ 15,15.0,0.008937944289234,0.0689917723337809,0.9904679285676292,0.9901854236920674,0.0636501339274678,0.0650799024436209,1.0,0.0008765357330018,
18
+ 16,16.0,0.0088368062871728,0.0683553102943632,0.99057797605402,0.990315987004174,0.0632446752358855,0.0643851502074135,1.0,0.0008613974319136,
19
+ 17,17.0,0.0087418129835068,0.0601519813968075,0.9906821045982704,0.990372551812066,0.0628605140498682,0.0643506565027766,1.0,0.0008455313244934,
20
+ 18,18.0,0.0086546849888445,0.0597193823920355,0.9907743083961894,0.9904850191540188,0.062502304592243,0.0639373604622152,1.0,0.0008289693629698,
21
+ 19,19.0,0.0085733014553931,0.0593640875485208,0.9908616850885112,0.9905507524808248,0.062172229688489,0.0636733880473507,1.0,0.0008117449009293,
22
+ 20,20.0,0.0084998264312325,0.0590407683617538,0.9909425107615716,0.9906116353140936,0.0618693786913926,0.0633704620103041,1.0,0.0007938926261462,
23
+ 21,21.0,0.0084292321816463,0.0588373551766077,0.99101791097207,0.9906822787390814,0.0615794747293497,0.0632358468241161,1.0,0.000775448490726,
24
+ 22,22.0,0.0083631074385654,0.0583808085984653,0.9910881071278218,0.9907019972801208,0.061308835618449,0.0627665976683298,1.0,0.0007564496387029,
25
+ 23,23.0,0.0082998081518525,0.0583362696071465,0.9911576903937908,0.9907915671666464,0.0610430538654327,0.0627390105691221,1.0,0.0007369343312364,
26
+ 24,24.0,0.0082426977341241,0.0579863401750723,0.9912183459555164,0.9908138381110296,0.0608090944985827,0.0624360193808873,1.0,0.0007169418695587,
27
+ 25,25.0,0.0081857938227882,0.0603038504573234,0.9912798084904638,0.9913705920570351,0.0605694888222418,0.0602742368208893,1.0,0.0006965125158269,
28
+ 26,26.0,0.0081330842490234,0.0601081264069241,0.9913358248016808,0.9913957462551888,0.0603508613396729,0.0600969716236832,1.0,0.0006756874120406,
29
+ 27,,,,,,,,,,
30
+ 28,27.0,0.0080793036346773,0.0581171515087286,0.9913951707020234,0.990975715054406,0.0601222649823581,0.0619419970446162,1.0,0.0006545084971874,1.0
31
+ 29,28.0,0.0080303936715373,0.0577960211369726,0.9914486467838288,0.991037999259101,0.0599184757893842,0.0616544663078255,1.0,0.0006330184227833,1.0
32
+ 30,29.0,0.0079833286958275,0.0577082768082618,0.9914989711193556,0.9910609987046984,0.0597187550236167,0.0615710041589207,1.0,0.0006112604669781,1.0
33
+ 31,30.0,0.0079363946847387,0.0573835965659883,0.991549631886268,0.9911303440729776,0.0595209216657146,0.0613138885961638,1.0,0.0005892784473993,1.0
34
+ 32,31.0,0.0078930764391055,0.056579791340563,0.9915970942612444,0.9911372542381288,0.0593344994663689,0.0610890839662816,1.0,0.0005671166329088,1.0
35
+ 33,32.0,0.0078520439688137,0.0566349982387489,0.9916415573171016,0.9911991384294296,0.0591628991109266,0.0612032888664139,1.0,0.0005448196544517,1.0
36
+ 34,33.0,0.0078106326422538,0.056273600541883,0.99168776166573,0.9912242041693792,0.0589861686414714,0.0608326474825541,1.0,0.0005224324151752,1.0
37
+ 35,34.0,0.0077714874694291,0.0560553298228316,0.9917285669720576,0.9912754522429572,0.0588185834813486,0.0606301685174306,1.0,0.0005,1.0
38
+ 36,35.0,0.0077335360143438,0.0566925521526071,0.9917714921611078,0.9912715819146898,0.0586554208602965,0.0606366270118289,1.0,0.0004775675848247,1.0
39
+ 37,36.0,0.0076967598778834,0.05643899159299,0.9918113076619888,0.9913144058651394,0.0584977892593721,0.0604506749245855,1.0,0.0004551803455482,1.0
40
+ 38,37.0,0.0076611397421678,0.0564949775735537,0.9918505828702048,0.9912973059548272,0.0583438508379995,0.0604623614086045,1.0,0.0004328833670911,1.0
41
+ 39,38.0,0.0076274576790505,0.0561670700709025,0.9918875504075813,0.991361051135593,0.0581993957395466,0.0601712295578585,1.0,0.0004107215526006,1.0
42
+ 40,39.0,0.0075949914102034,0.0644460654093159,0.9919232221131914,0.9913600550757514,0.0580608733772729,0.0601557648016346,1.0,0.0003887395330218,1.0
43
+ 41,40.0,0.0075620844107372,0.064289897100793,0.9919600250680796,0.9914333489206102,0.0579155459141965,0.0599343188107013,1.0,0.0003669815772166,1.0
44
+ 42,41.0,0.0075315364097569,0.0641076669096946,0.9919931383280272,0.9914571894539728,0.057783521611369,0.0598166498045126,1.0,0.0003454915028125,1.0
45
+ 43,42.0,0.0075010784370615,0.0640382651653554,0.9920276840416232,0.9914649923642476,0.0576503183692693,0.0596944563090801,1.0,0.0003243125879593,1.0
46
+ 44,43.0,0.007472936848304,0.0639235826830069,0.9920596264721302,0.9915061102973092,0.0575274907805946,0.0595703116721577,1.0,0.000303487484173,1.0
47
+ 45,44.0,0.0074444434827905,0.0639039695262908,0.9920910296815164,0.9915179557270474,0.0574037855532899,0.0595297686755657,1.0,0.0002830581304412,1.0
48
+ 46,45.0,0.0074174716718118,0.0547702770266268,0.9921217421802242,0.9915496057934232,0.0572855648511413,0.0593745187752776,1.0,0.0002630656687635,1.0
49
+ 47,46.0,0.0073907052564403,0.0548254136410024,0.9921517261963212,0.99152289364073,0.0571677961229775,0.0594311359028021,1.0,0.000243550361297,1.0
50
+ 48,47.0,0.0073657991422151,0.0545883624090088,0.9921797737981496,0.99158462550905,0.0570587853304623,0.0592154045899709,1.0,0.0002245515092739,1.0
51
+ 49,48.0,0.0073413327940624,0.0549789651400513,0.9922071443514876,0.9915956417719524,0.0569513976636729,0.0591794966823524,1.0,0.0002061073738537,1.0
52
+ 50,49.0,0.0073178111899937,0.0549225054681301,0.992234143480826,0.9916122992833456,0.0568474041507317,0.0591052624086538,1.0,0.0001882550990706,1.0
53
+ 51,50.0,0.0072954127986214,0.0547993102007442,0.9922600163837496,0.9916072050730388,0.0567492793638552,0.0590278472337457,1.0,0.0001710306370301,1.0
54
+ 52,51.0,0.007273508728419,0.0633058957755565,0.99228490205963,0.9916460341877408,0.0566524313302354,0.0589173430369959,1.0,0.0001544686755065,1.0
55
+ 53,52.0,0.0072527236388891,0.0541604169540935,0.9923085340957964,0.9916655633184644,0.0565598978545893,0.0588861417439248,1.0,0.0001386025680863,1.0
56
+ 54,53.0,0.0072332986735142,0.0540786994000275,0.9923308940415972,0.9916760828759936,0.0564748854554268,0.0588100816640589,1.0,0.0001234642669981,1.0
57
+ 55,54.0,0.0072141582214304,0.0539976932936244,0.992352688915274,0.991687744193607,0.0563887013038725,0.0587553933262825,1.0,0.0001090842587659,1.0
58
+ 56,55.0,0.0071961989158011,0.0539236054652267,0.9923736041851258,0.991705028216044,0.0563090115932099,0.0586584944691922,1.0,9.54915028125264e-05,1.0
59
+ 57,56.0,0.0071795341116637,0.0538788974285125,0.9923931933186028,0.9917213280995688,0.056234765938075,0.0586185601022508,1.0,8.271337313934873e-05,1.0
60
+ 58,57.0,0.0071636978502835,0.0537894251445929,0.99241107735741,0.991727234257592,0.0561641372949554,0.0585509944293234,1.0,7.0775603199067e-05,1.0
61
+ 59,58.0,0.0071492903426373,0.0628995653655793,0.9924280616339672,0.9917405486106872,0.0560998941426364,0.0585076812240812,1.0,5.970223407163104e-05,1.0
62
+ 60,59.0,0.007135755741142,0.0628647624618477,0.992444023222066,0.9917464388741388,0.0560391753451542,0.0584567987256579,1.0,4.951556604879051e-05,1.0
63
+ 61,60.0,0.0071229146066215,0.062842031651073,0.9924588899599032,0.9917527622646756,0.0559809200988894,0.0584456553889645,1.0,4.023611372427474e-05,1.0
64
+ 62,,,,,,,,,,
65
+ 63,61.0,0.0071116714105219,0.054225208527512,0.9924720488572388,0.9917570485009088,0.0559307862449897,0.0584127764734956,1.0,3.188256468013142e-05,1.0
66
+ 64,62.0,0.0071012400428168,0.0541794104708565,0.9924842232398772,0.9917684568299188,0.0558836536754048,0.0583652096490065,1.0,2.447174185242325e-05,1.0
67
+ 65,63.0,0.0070923216879012,0.0541527594129244,0.992494883698024,0.9917673733499316,0.0558433167908466,0.0583537354237503,1.0,1.8018569652073392e-05,1.0
68
+ 66,64.0,0.0070842290056471,0.0541142652432123,0.9925042093134998,0.9917791989114548,0.055806908285601,0.0583142161369323,1.0,1.2536043909088198e-05,1.0
69
+ 67,65.0,0.0070775183637134,0.0540941999190383,0.9925123827176148,0.9917832758691576,0.0557763982752568,0.0583013370633125,1.0,8.03520570068517e-06,1.0
70
+ 68,66.0,0.0070716849972985,0.0540783844060368,0.9925192085544716,0.9917850136756896,0.0557499875869141,0.0582870386540889,1.0,4.525119116032653e-06,1.0
71
+ 69,67.0,0.0070673599365701,0.0540728816555606,0.9925242980879344,0.9917882919311524,0.0557304875004325,0.058270867665608725,1.0,2.012853002380467e-06,1.0
72
+ 70,68.0,0.007063989337073283,0.054064563827382194,0.9925282489382818,0.9917892840173509,0.055715665660714835,0.05826829680138164,1.0,5.034667293427055e-07,1.0
73
+ 71,69.0,0.007062000732638695,0.05405975985858175,0.9925306557604436,0.9917901198069254,0.05570720089913419,0.0582624898188644,1.0,0.0,1.0
finetune_final/backup_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fb8e5eabec9f82f7b0a0e4ec83273a8ee2359359bb73a229116dfed8591f239
3
+ size 1260
finetune_final/backup_model_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:408ed1a5b4fb39a6f275e8426ed28b861529d728696273b526b069012a6ac1a5
3
+ size 499596942
finetune_final/backup_optimizer_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0437aa22813d3039f703c77bd1a9031ec3a969aa4257bd8c9379f32fb2683d1
3
+ size 995925198
finetune_final/best_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ca1881774f11a71c79804851862002aa7bc93c4763f6ae97addfa4c4f31af07
3
+ size 1260
finetune_final/best_model_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:265a212fb0cb0474528d4a986458e99ca83444422d29d77065790632ceb7b0f3
3
+ size 499596942
finetune_final/best_optimizer_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa64034131c007b4d2f5dedd41e21667cd97005e6aede6687cf49932da996c44
3
+ size 995925198
finetune_final/casper_predict.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash -l
2
+ #PBS -N wx12-pred
3
+ #PBS -l select=1:ncpus=8:ngpus=1:mem=128GB
4
+ #PBS -l walltime=12:00:00
5
+ #PBS -l gpu_type=a100
6
+ #PBS -A NAML0001
7
+ #PBS -q casper
8
+ #PBS -o out
9
+ #PBS -e out
10
+ source ~/.bashrc
11
+ conda activate credit
12
+ torchrun /glade/work/schreck/repos/credit/miles-credit/applications/rollout_metrics.py -c model_multi.yml --backend nccl
finetune_final/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ca1881774f11a71c79804851862002aa7bc93c4763f6ae97addfa4c4f31af07
3
+ size 1260
finetune_final/derecho_predict.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #PBS -A NAML0001
3
+ ##PBS -A NCIS0010
4
+ #PBS -N wx12_pred
5
+ #PBS -l walltime=12:00:00
6
+ #PBS -l select=8:ncpus=64:ngpus=4:mem=480GB
7
+ #PBS -q main
8
+ #PBS -j oe
9
+ #PBS -k eod
10
+ # Load modules
11
+ module purge
12
+ module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
13
+ conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
14
+
15
+ # Export environment variables
16
+ export LSCRATCH=/glade/derecho/scratch/schreck/
17
+ export LOGLEVEL=INFO
18
+ export NCCL_DEBUG=INFO
19
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
20
+ export NCCL_SOCKET_IFNAME=hsn
21
+ export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
22
+ export MPICH_OFI_NIC_POLICY=GPU
23
+ export MPICH_GPU_SUPPORT_ENABLED=1
24
+ export NCCL_IB_DISABLE=1
25
+ export NCCL_CROSS_NIC=1
26
+ export NCCL_NCHANNELS_PER_NET_PEER=4
27
+ export MPICH_RDMA_ENABLED_CUDA=1
28
+ export NCCL_NET="AWS Libfabric"
29
+ export NCCL_NET_GDR_LEVEL=PBH
30
+ export FI_CXI_DISABLE_HOST_REGISTER=1
31
+ export FI_CXI_OPTIMIZED_MRS=false
32
+ export FI_MR_CACHE_MONITOR=userfaultfd
33
+ export FI_CXI_DEFAULT_CQ_SIZE=131072
34
+ # Print the results
35
+ echo "Number of nodes: 8"
36
+ echo "Number of GPUs per node: 4"
37
+ echo "Total number of GPUs: 32"
38
+ # Log in to WandB if needed
39
+ # wandb login 02d2b1af00b5df901cb2bee071872de774781520
40
+ # Launch MPIs
41
+ nodes=( $( cat $PBS_NODEFILE ) )
42
+ echo nodes: $nodes
43
+ # Find headnode's IP:
44
+ head_node=${nodes[0]}
45
+ head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
46
+ MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/work/schreck/repos/credit/miles-credit/applications/rollout_metrics.py -c model_multi.yml
finetune_final/launch_multi.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #PBS -A NCIS0010
3
+ #PBS -N wx6h_12step
4
+ #PBS -l walltime=12:00:00
5
+ #PBS -l select=8:ncpus=64:ngpus=4
6
+ #PBS -q main
7
+ #PBS -j oe
8
+ #PBS -k eod
9
+ #PBS -r n
10
+ # Load modules
11
+ module purge
12
+ module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
13
+ conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
14
+
15
+ # Export environment variables
16
+ export LSCRATCH=/glade/derecho/scratch/schreck/
17
+ export LOGLEVEL=INFO
18
+ export NCCL_DEBUG=INFO
19
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
20
+ export NCCL_SOCKET_IFNAME=hsn
21
+ export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
22
+ export MPICH_OFI_NIC_POLICY=GPU
23
+ export MPICH_GPU_SUPPORT_ENABLED=1
24
+ export NCCL_IB_DISABLE=1
25
+ export NCCL_CROSS_NIC=1
26
+ export NCCL_NCHANNELS_PER_NET_PEER=4
27
+ export MPICH_RDMA_ENABLED_CUDA=1
28
+ export NCCL_NET="AWS Libfabric"
29
+ export NCCL_NET_GDR_LEVEL=PBH
30
+ export FI_CXI_DISABLE_HOST_REGISTER=1
31
+ export FI_CXI_OPTIMIZED_MRS=false
32
+ export FI_MR_CACHE_MONITOR=userfaultfd
33
+ export FI_CXI_DEFAULT_CQ_SIZE=131072
34
+ # Print the results
35
+ echo "Number of nodes: 8"
36
+ echo "Number of GPUs per node: 4"
37
+ echo "Total number of GPUs: 32"
38
+ # Log in to WandB if needed
39
+ # wandb login 02d2b1af00b5df901cb2bee071872de774781520
40
+ # Launch MPIs
41
+ nodes=( $( cat $PBS_NODEFILE ) )
42
+ echo nodes: $nodes
43
+ # Find headnode's IP:
44
+ head_node=${nodes[0]}
45
+ head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
46
+ MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python train_multistep.py -c model_multi.yml --backend nccl
finetune_final/launch_predict.sh ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #PBS -A NCIS0010
3
+ #PBS -N wx6h_pred
4
+ #PBS -l walltime=12:00:00
5
+ #PBS -l select=8:ncpus=64:ngpus=4
6
+ #PBS -q main
7
+ #PBS -j oe
8
+ #PBS -k eod
9
+ #PBS -r n
10
+ # Load modules
11
+ module purge
12
+ module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
13
+ conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
14
+
15
+ # Export environment variables
16
+ export LSCRATCH=/glade/derecho/scratch/schreck/
17
+ export LOGLEVEL=INFO
18
+ export NCCL_DEBUG=INFO
19
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
20
+ export NCCL_SOCKET_IFNAME=hsn
21
+ export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
22
+ export MPICH_OFI_NIC_POLICY=GPU
23
+ export MPICH_GPU_SUPPORT_ENABLED=1
24
+ export NCCL_IB_DISABLE=1
25
+ export NCCL_CROSS_NIC=1
26
+ export NCCL_NCHANNELS_PER_NET_PEER=4
27
+ export MPICH_RDMA_ENABLED_CUDA=1
28
+ export NCCL_NET="AWS Libfabric"
29
+ export NCCL_NET_GDR_LEVEL=PBH
30
+ export FI_CXI_DISABLE_HOST_REGISTER=1
31
+ export FI_CXI_OPTIMIZED_MRS=false
32
+ export FI_MR_CACHE_MONITOR=userfaultfd
33
+ export FI_CXI_DEFAULT_CQ_SIZE=131072
34
+ # Print the results
35
+ echo "Number of nodes: 8"
36
+ echo "Number of GPUs per node: 4"
37
+ echo "Total number of GPUs: 32"
38
+ # Log in to WandB if needed
39
+ # wandb login 02d2b1af00b5df901cb2bee071872de774781520
40
+ # Launch MPIs
41
+ nodes=( $( cat $PBS_NODEFILE ) )
42
+ echo nodes: $nodes
43
+ # Find headnode's IP:
44
+ head_node=${nodes[0]}
45
+ head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
46
+
47
+ # MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml --backend nccl
48
+
49
+ mpiexec -n 8 --ppn 1 --cpu-bind none torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d --rdzv-endpoint=$head_node_ip /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml
50
+
51
+
52
+ # module purge
53
+ # module load nvhpc cuda cray-mpich conda
54
+ # conda activate /glade/work/ksha/miniconda3/envs/credit
55
+ # # Get a list of allocated nodes
56
+ # nodes=( $( cat $PBS_NODEFILE ) )
57
+ # head_node=${nodes[0]}
58
+ # head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
59
+ # # Export environment variables
60
+ # export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
61
+ # export LSCRATCH=/glade/derecho/scratch/schreck/
62
+ # export LOGLEVEL=INFO
63
+ # #export NCCL_DEBUG=INFO
64
+
65
+ # export NCCL_SOCKET_IFNAME=hsn
66
+ # export NCCL_HOME=/glade/u/home/dhoward/work/nccl-ofi-plugin/install
67
+ # export LD_LIBRARY_PATH=$NCCL_HOME/lib:$NCCL_HOME/plugin/lib:$LD_LIBRARY_PATH
68
+
69
+ # export NCCL_NCHANNELS_PER_NET_PEER=4
70
+ # export MPICH_GPU_SUPPORT_ENABLED=1
71
+ # export MPICH_OFI_NIC_POLICY=GPU
72
+ # export MPICH_RDMA_ENABLED_CUDA=1
73
+ # export NCCL_DISABLE_IB=1
74
+ # export NCCL_CROSS_NIC=1
75
+ # export FI_CXI_DISABLE_HOST_REGISTER=1
76
+ # export FI_CXI_OPTIMIZED_MRS=false
77
+
78
+ # # Print the results
79
+ # echo "Number of nodes: 8"
80
+ # echo "Number of GPUs per node: 4"
81
+ # echo "Total number of GPUs: 32"
82
+ # # Log in to WandB if needed
83
+ # # wandb login 02d2b1af00b5df901cb2bee071872de774781520
84
+
85
+ # # Launch MPIs
86
+ # mpiexec -n 8 --ppn 1 --cpu-bind none torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d --rdzv-endpoint=$head_node_ip /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml
finetune_final/launch_single.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #PBS -A NCIS0010
3
+ #PBS -N wx_6h
4
+ #PBS -l walltime=12:00:00
5
+ #PBS -l select=8:ncpus=64:ngpus=4
6
+ #PBS -q main
7
+ #PBS -j oe
8
+ #PBS -k eod
9
+ #PBS -r n
10
+ # Load modules
11
+ module purge
12
+ module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
13
+ conda activate /glade/work/ksha/miniconda3/envs/credit-derecho
14
+ # conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
15
+ # Export environment variables
16
+ export LSCRATCH=/glade/derecho/scratch/ksha/
17
+ export LOGLEVEL=INFO
18
+ export NCCL_DEBUG=INFO
19
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
20
+ export NCCL_SOCKET_IFNAME=hsn
21
+ export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
22
+ export MPICH_OFI_NIC_POLICY=GPU
23
+ export MPICH_GPU_SUPPORT_ENABLED=1
24
+ export NCCL_IB_DISABLE=1
25
+ export NCCL_CROSS_NIC=1
26
+ export NCCL_NCHANNELS_PER_NET_PEER=4
27
+ export MPICH_RDMA_ENABLED_CUDA=1
28
+ export NCCL_NET="AWS Libfabric"
29
+ export NCCL_NET_GDR_LEVEL=PBH
30
+ export FI_CXI_DISABLE_HOST_REGISTER=1
31
+ export FI_CXI_OPTIMIZED_MRS=false
32
+ export FI_MR_CACHE_MONITOR=userfaultfd
33
+ export FI_CXI_DEFAULT_CQ_SIZE=131072
34
+ # Print the results
35
+ echo "Number of nodes: 8"
36
+ echo "Number of GPUs per node: 4"
37
+ echo "Total number of GPUs: 32"
38
+ # Log in to WandB if needed
39
+ # wandb login <WANDB_API_KEY>   (placeholder: never commit a real API key)
40
+ # Launch MPIs
41
+ nodes=( $( cat $PBS_NODEFILE ) )
42
+ echo nodes: $nodes
43
+ # Find headnode's IP:
44
+ head_node=${nodes[0]}
45
+ head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
46
+ MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/u/home/ksha/miles-credit/applications/train.py -c /glade/work/ksha/CREDIT_runs/wxformer_6h/model_single.yml --backend nccl
finetune_final/model.yml ADDED
File without changes
finetune_final/model_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:265a212fb0cb0474528d4a986458e99ca83444422d29d77065790632ceb7b0f3
3
+ size 499596942
finetune_final/model_multi.yml ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------------------------------------------------------------------- #
2
+ # This yaml file implements 6 hourly state-in-state-out crossformer
3
+ # on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
4
+ # The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
5
+ # Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
6
+ # --------------------------------------------------------------------------------------------------------------------- #
7
+ save_loc: '/glade/u/home/schreck/scratch/finetune/wx12/'
8
+ seed: 1000
9
+
10
+ data:
11
+ # upper-air variables
12
+ variables: ['U','V','T','Q']
13
+ save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
14
+
15
+ # surface variables
16
+ surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
17
+ save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
18
+
19
+ # dynamic forcing variables
20
+ dynamic_forcing_variables: ['tsi']
21
+ save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
22
+
23
+ # static variables
24
+ static_variables: ['Z_GDS4_SFC','LSM']
25
+ save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
26
+
27
+ # mean / std path
28
+ mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
29
+ std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
30
+
31
+ # train / validation split
32
+ train_years: [1979, 2018]
33
+ valid_years: [2018, 2019]
34
+
35
+ # data workflow
36
+ scaler_type: 'std_new'
37
+
38
+ # number of input states
39
+ # FuXi has 2 input states
40
+ history_len: 1
41
+ valid_history_len: 1
42
+
43
+ # number of forecast steps to compute loss
44
+ # 0 for single step training / validation
45
+ # larger than 0 for multi-step training / validation
46
+ forecast_len: 11
47
+ valid_forecast_len: 11
48
+
49
+ # one_shot: True --> compute loss on the last forecast step only
50
+ # one_shot: False --> compute loss on all forecast steps
51
+ one_shot: False
52
+
53
+ # 1 for hourly model
54
+ lead_time_periods: 6
55
+
56
+ # do not use skip_periods
57
+ skip_periods: null
58
+
59
+ # compatible with the old 'std'
60
+ static_first: True
61
+
62
+ trainer:
63
+ type: multi-step # <---------- change to your type
64
+
65
+ mode: fsdp
66
+ cpu_offload: False
67
+ activation_checkpoint: True
68
+
69
+ load_weights: True
70
+ load_optimizer: True
71
+ load_scaler: True
72
+ load_scheduler: True
73
+ reload_epoch: True
74
+
75
+ skip_validation: False
76
+ update_learning_rate: True
77
+
78
+ save_backup_weights: True
79
+ save_best_weights: True
80
+
81
+ learning_rate: 1.0e-05 # <-- change to your lr
82
+ weight_decay: 0
83
+
84
+ train_batch_size: 1
85
+ valid_batch_size: 1
86
+
87
+ batches_per_epoch: 100
88
+ valid_batches_per_epoch: 0
89
+ stopping_patience: 50
90
+
91
+ start_epoch: 0
92
+ num_epoch: 4
93
+ # NOTE: this comment appears to describe reload_epoch (set above): use False when switching from single-step to multi-step
94
+ epochs: &epochs 20
95
+ use_scheduler: True
96
+ #scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs, 'last_epoch': -1}
97
+ scheduler:
98
+ scheduler_type: cosine-annealing-restarts
99
+ first_cycle_steps: 250
100
+ cycle_mult: 6.0 # Multiplier for steps in subsequent cycles
101
+ max_lr: 1.0e-05
102
+ min_lr: 1.0e-08
103
+ warmup_steps: 249
104
+ gamma: 0.7 # LR reduction factor at each cycle restart
105
+
106
+ # Automatic Mixed Precision: False
107
+ amp: False
108
+
109
+ # rescale loss as loss = loss / grad_accum_every
110
+ grad_accum_every: 1
111
+ # gradient clipping
112
+ grad_max_norm: 1.0
113
+
114
+ # number of workers
115
+ thread_workers: 4
116
+ valid_thread_workers: 4
117
+
118
+ model:
119
+ # crossformer example
120
+ type: "crossformer"
121
+ frames: 1 # number of input states (default: 1)
122
+ image_height: 640 # number of latitude grids (default: 640)
123
+ image_width: 1280 # number of longitude grids (default: 1280)
124
+ levels: 16 # number of upper-air variable levels (default: 15)
125
+ channels: 4 # upper-air variable channels
126
+ surface_channels: 7 # surface variable channels
127
+ input_only_channels: 3 # dynamic forcing, forcing, static channels
128
+ output_only_channels: 0 # diagnostic variable channels
129
+
130
+ patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
131
+ patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
132
+ frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
133
+
134
+ dim: [128, 256, 512, 1024] # Dimensionality of each layer
135
+ depth: [2, 2, 8, 2] # Depth of each layer
136
+ global_window_size: [10, 5, 2, 1] # Global window size for each layer
137
+ local_window_size: 10 # Local window size
138
+ cross_embed_kernel_sizes: # kernel sizes for cross-embedding
139
+ - [4, 8, 16, 32]
140
+ - [2, 4]
141
+ - [2, 4]
142
+ - [2, 4]
143
+ cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
144
+ attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
145
+ ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
146
+
147
+ use_spectral_norm: True
148
+
149
+ # =============================================================== #
150
+ # New
151
+
152
+ # use interpolation to match the output size
153
+ interp: True
154
+
155
+ # map boundary padding
156
+ padding_conf:
157
+ activate: True
158
+ mode: earth
159
+ pad_lat: 80
160
+ pad_lon: 80
161
+
162
+ post_conf:
163
+ activate: True
164
+
165
+ tracer_fixer:
166
+ activate: True
167
+ denorm: True
168
+ tracer_name: ['Q', 'Q500']
169
+ tracer_thres: [1e-8, 1e-8]
170
+
171
+
172
+ loss:
173
+ # the main training loss
174
+ training_loss: "mse"
175
+
176
+ # power loss (x), spectral_loss (x)
177
+ use_power_loss: False
178
+ use_spectral_loss: False
179
+
180
+ # use latitude weighting
181
+ use_latitude_weights: True
182
+ latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
183
+
184
+ # turn-off variable weighting
185
+ use_variable_weights: False
186
+
187
+ predict:
188
+ forecasts:
189
+ type: "custom" # keep it as "custom"
190
+ start_year: 2019 # year of the first initialization (where rollout will start)
191
+ start_month: 1 # month of the first initialization
192
+ start_day: 1 # day of the first initialization
193
+ start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
194
+ duration: 1152 # number of days to initialize, starting from the (year, mon, day) above
195
+ # duration should be divisible by the number of GPUs
196
+ # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
197
+ days: 10 # forecast lead time as days (1 means 24-hour forecast)
198
+
199
+ save_forecast: '/glade/u/home/schreck/scratch/finetune/wx12/netcdf/'
200
+ # save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
201
+ metadata: '/glade/u/home/ksha/miles-credit/credit/metadata/era5.yaml'
202
+
203
+ # turn-off low-pass filter
204
+ use_laplace_filter: False
205
+
206
+ # deprecated
207
+ # save_format: "nc"
208
+
209
+ pbs: #derecho
210
+ conda: "/glade/work/ksha/miniconda3/envs/credit"
211
+ project: "NAML0001"
212
+ job_name: "wxformer_6h"
213
+ walltime: "12:00:00"
214
+ nodes: 8
215
+ ncpus: 64
216
+ ngpus: 4
217
+ mem: '480GB'
218
+ queue: 'main'
finetune_final/model_predict.yml ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------------------------------------------------------------------- #
2
+ # This yaml file implements 6 hourly state-in-state-out crossformer
3
+ # on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
4
+ # The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
5
+ # Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
6
+ # --------------------------------------------------------------------------------------------------------------------- #
7
+ save_loc: '/glade/work/ksha/CREDIT_runs/wxformer_6h/'
8
+ seed: 1000
9
+
10
+ data:
11
+ # upper-air variables
12
+ variables: ['U','V','T','Q']
13
+ save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
14
+
15
+ # surface variables
16
+ surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
17
+ save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
18
+
19
+ # dynamic forcing variables
20
+ dynamic_forcing_variables: ['tsi']
21
+ save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
22
+
23
+ # static variables
24
+ static_variables: ['Z_GDS4_SFC','LSM']
25
+ save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
26
+
27
+ # mean / std path
28
+ mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
29
+ std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
30
+
31
+ # train / validation split
32
+ train_years: [1979, 2018]
33
+ valid_years: [2018, 2019]
34
+
35
+ # data workflow
36
+ scaler_type: 'std_new'
37
+
38
+ history_len: 1
39
+ valid_history_len: 1
40
+
41
+ forecast_len: 0
42
+ valid_forecast_len: 0
43
+
44
+ # 1 for hourly model
45
+ lead_time_periods: 6
46
+
47
+ # do not use skip_periods
48
+ skip_periods: null
49
+
50
+ # compatible with the old 'std'
51
+ static_first: True
52
+
53
+ trainer:
54
+ mode: fsdp
55
+ type: standard
56
+
57
+ model:
58
+ # crossformer example
59
+ type: "crossformer"
60
+ frames: 1 # number of input states (default: 1)
61
+ image_height: 640 # number of latitude grids (default: 640)
62
+ image_width: 1280 # number of longitude grids (default: 1280)
63
+ levels: 16 # number of upper-air variable levels (default: 15)
64
+ channels: 4 # upper-air variable channels
65
+ surface_channels: 7 # surface variable channels
66
+ input_only_channels: 3 # dynamic forcing, forcing, static channels
67
+ output_only_channels: 0 # diagnostic variable channels
68
+
69
+ patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
70
+ patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
71
+ frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
72
+
73
+ dim: [128, 256, 512, 1024] # Dimensionality of each layer
74
+ depth: [2, 2, 8, 2] # Depth of each layer
75
+ global_window_size: [10, 5, 2, 1] # Global window size for each layer
76
+ local_window_size: 10 # Local window size
77
+ cross_embed_kernel_sizes: # kernel sizes for cross-embedding
78
+ - [4, 8, 16, 32]
79
+ - [2, 4]
80
+ - [2, 4]
81
+ - [2, 4]
82
+ cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
83
+ attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
84
+ ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
85
+
86
+ interp: True
87
+
88
+ # map boundary padding
89
+ padding_conf:
90
+ activate: True
91
+ mode: mirror
92
+ pad_lon: 80 # number of grids to pad on 0 and 360 deg lon
93
+ pad_lat: 80 # number of grids to pad on -90 and 90 deg lat
94
+
95
+ loss:
96
+ use_latitude_weights: True
97
+ latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
98
+
99
+
100
+ predict:
101
+ forecasts:
102
+ type: "custom" # keep it as "custom"
103
+ start_year: 2020 # year of the first initialization (where rollout will start)
104
+ start_month: 1 # month of the first initialization
105
+ start_day: 1 # day of the first initialization
106
+ start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
107
+ duration: 32 # number of days to initialize, starting from the (year, mon, day) above
108
+ # duration should be divisible by the number of GPUs
109
+ # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
110
+ days: 10 # forecast lead time as days (1 means 24-hour forecast)
111
+
112
+ metadata: '/glade/u/home/ksha/miles-credit/credit/metadata/era5.yaml'
113
+ save_forecast: '/glade/derecho/scratch/ksha/CREDIT/RAW_OUTPUT/wxformer_6h_test/'
114
+
115
+ # turn-off low-pass filter
116
+ use_laplace_filter: False
117
+
finetune_final/model_single.yml ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------------------------------------------------------------------- #
2
+ # This yaml file implements 6 hourly state-in-state-out crossformer
3
+ # on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
4
+ # The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
5
+ # Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
6
+ # --------------------------------------------------------------------------------------------------------------------- #
7
+ save_loc: '/glade/work/ksha/CREDIT_runs/wxformer_6h/'
8
+ seed: 1000
9
+
10
+ data:
11
+ # upper-air variables
12
+ variables: ['U','V','T','Q']
13
+ save_loc: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
14
+
15
+ # surface variables
16
+ surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
17
+ save_loc_surface: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
18
+
19
+ # dynamic forcing variables
20
+ dynamic_forcing_variables: ['tsi']
21
+ save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
22
+
23
+ # static variables
24
+ static_variables: ['Z_GDS4_SFC','LSM']
25
+ save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
26
+
27
+ # mean / std path
28
+ mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
29
+ std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
30
+
31
+ # train / validation split
32
+ train_years: [1979, 2018]
33
+ valid_years: [2018, 2019]
34
+
35
+ # data workflow
36
+ scaler_type: 'std_new'
37
+
38
+ # state-in-state-out
39
+ history_len: 1
40
+ valid_history_len: 1
41
+
42
+ forecast_len: 0
43
+ valid_forecast_len: 0
44
+
45
+ one_shot: True
46
+
47
+ # 1 for hourly model
48
+ lead_time_periods: 6
49
+
50
+ # do not use skip_periods
51
+ skip_periods: null
52
+
53
+ # compatible with the old 'std'
54
+ static_first: True
55
+
56
+ trainer:
57
+ type: standard # <---------- change to your type
58
+
59
+ mode: fsdp
60
+ cpu_offload: False
61
+ activation_checkpoint: True
62
+
63
+ load_weights: True
64
+ load_optimizer: True
65
+ load_scaler: True
66
+ load_sheduler: True
67
+
68
+ skip_validation: False
69
+ update_learning_rate: False
70
+
71
+ save_backup_weights: True
72
+ save_best_weights: True
73
+
74
+ learning_rate: 1.0e-03 # <-- change to your lr
75
+ weight_decay: 0
76
+
77
+ train_batch_size: 1
78
+ valid_batch_size: 1
79
+
80
+ batches_per_epoch: 0
81
+ valid_batches_per_epoch: 0
82
+ stopping_patience: 999
83
+
84
+ start_epoch: 0
85
+ num_epoch: 6
86
+ reload_epoch: True
87
+ epochs: &epochs 70
88
+
89
+ use_scheduler: True
90
+ scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs, 'last_epoch': -1}
91
+
92
+ # Automatic Mixed Precision: False
93
+ amp: False
94
+
95
+ # rescale loss as loss = loss / grad_accum_every
96
+ grad_accum_every: 1
97
+ # gradient clipping
98
+ grad_max_norm: 1.0
99
+
100
+ # number of workers
101
+ thread_workers: 4
102
+ valid_thread_workers: 0
103
+
104
+ model:
105
+ # crossformer example
106
+ type: "crossformer"
107
+ frames: 1 # number of input states (default: 1)
108
+ image_height: 640 # number of latitude grids (default: 640)
109
+ image_width: 1280 # number of longitude grids (default: 1280)
110
+ levels: 16 # number of upper-air variable levels (default: 15)
111
+ channels: 4 # upper-air variable channels
112
+ surface_channels: 7 # surface variable channels
113
+ input_only_channels: 3 # dynamic forcing, forcing, static channels
114
+ output_only_channels: 0 # diagnostic variable channels
115
+
116
+ patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
117
+ patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
118
+ frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
119
+
120
+ dim: [128, 256, 512, 1024] # Dimensionality of each layer
121
+ depth: [2, 2, 8, 2] # Depth of each layer
122
+ global_window_size: [10, 5, 2, 1] # Global window size for each layer
123
+ local_window_size: 10 # Local window size
124
+ cross_embed_kernel_sizes: # kernel sizes for cross-embedding
125
+ - [4, 8, 16, 32]
126
+ - [2, 4]
127
+ - [2, 4]
128
+ - [2, 4]
129
+ cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
130
+ attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
131
+ ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
132
+
133
+ # map boundary padding
134
+ pad_lon: 80 # number of grids to pad on 0 and 360 deg lon
135
+ pad_lat: 80 # number of grids to pad on -90 and 90 deg lat
136
+
137
+ loss:
138
+ # the main training loss
139
+ training_loss: "mse"
140
+
141
+ # power loss (x), spectral_loss (x)
142
+ use_power_loss: False
143
+ use_spectral_loss: False
144
+
145
+ # use latitude weighting
146
+ use_latitude_weights: True
147
+ latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
148
+
149
+ # turn-off variable weighting
150
+ use_variable_weights: False
151
+
152
+ predict:
153
+ forecasts:
154
+ type: "custom" # keep it as "custom"
155
+ start_year: 2020 # year of the first initialization (where rollout will start)
156
+ start_month: 1 # month of the first initialization
157
+ start_day: 1 # day of the first initialization
158
+ start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
159
+ duration: 30 # number of days to initialize, starting from the (year, mon, day) above
160
+ # duration should be divisible by the number of GPUs
161
+ # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
162
+ days: 2 # forecast lead time as days (1 means 24-hour forecast)
163
+
164
+ save_forecast: '/glade/derecho/scratch/ksha/CREDIT/wxformer_6h/'
165
+ save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
166
+
167
+ # turn-off low-pass filter
168
+ use_laplace_filter: False
169
+
170
+ # deprecated
171
+ # save_format: "nc"
172
+
173
+ pbs: #derecho
174
+ conda: "/glade/work/ksha/miniconda3/envs/credit"
175
+ project: "NAML0001"
176
+ job_name: "wxformer_6h"
177
+ walltime: "12:00:00"
178
+ nodes: 8
179
+ ncpus: 64
180
+ ngpus: 4
181
+ mem: '480GB'
182
+ queue: 'main'
finetune_final/model_single_cached.yml ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------------------------------------------------------------------- #
2
+ # This yaml file implements 6 hourly state-in-state-out crossformer
3
+ # on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
4
+ # The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
5
+ # Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
6
+ # --------------------------------------------------------------------------------------------------------------------- #
7
+ save_loc: '/glade/work/ksha/CREDIT_runs/wxformer_6h/'
8
+ seed: 1000
9
+
10
+ data:
11
+ # upper-air variables
12
+ variables: ['U','V','T','Q']
13
+ save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
14
+
15
+ # surface variables
16
+ surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
17
+ save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
18
+
19
+ # dynamic forcing variables
20
+ dynamic_forcing_variables: ['tsi']
21
+ save_loc_dynamic_forcing: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
22
+
23
+ # static variables
24
+ static_variables: ['Z_GDS4_SFC','LSM']
25
+ save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
26
+
27
+ # mean / std path
28
+ mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
29
+ std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
30
+
31
+ # train / validation split
32
+ train_years: [1979, 2018]
33
+ valid_years: [2018, 2019]
34
+
35
+ # data workflow
36
+ scaler_type: 'std_cached'
37
+
38
+ # state-in-state-out
39
+ history_len: 1
40
+ valid_history_len: 1
41
+
42
+ forecast_len: 0
43
+ valid_forecast_len: 0
44
+
45
+ one_shot: True
46
+
47
+ # 1 for hourly model
48
+ lead_time_periods: 6
49
+
50
+ # do not use skip_periods
51
+ skip_periods: null
52
+
53
+ # compatible with the old 'std'
54
+ static_first: True
55
+
56
+ trainer:
57
+ type: standard # <---------- change to your type
58
+
59
+ mode: fsdp
60
+ cpu_offload: False
61
+ activation_checkpoint: True
62
+
63
+ load_weights: True
64
+ load_optimizer: True
65
+ load_scaler: True
66
+ load_sheduler: True
67
+
68
+ skip_validation: False
69
+ update_learning_rate: False
70
+
71
+ save_backup_weights: True
72
+ save_best_weights: True
73
+
74
+ learning_rate: 1.0e-03 # <-- change to your lr
75
+ weight_decay: 0
76
+
77
+ train_batch_size: 1
78
+ valid_batch_size: 1
79
+
80
+ batches_per_epoch: 0
81
+ valid_batches_per_epoch: 0
82
+ stopping_patience: 999
83
+
84
+ start_epoch: 0
85
+ num_epoch: 6
86
+ reload_epoch: True
87
+ epochs: &epochs 70
88
+
89
+ use_scheduler: True
90
+ scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs, 'last_epoch': -1}
91
+
92
+ # Automatic Mixed Precision: False
93
+ amp: False
94
+
95
+ # rescale loss as loss = loss / grad_accum_every
96
+ grad_accum_every: 1
97
+ # gradient clipping
98
+ grad_max_norm: 1.0
99
+
100
+ # number of workers
101
+ thread_workers: 4
102
+ valid_thread_workers: 0
103
+
104
+ model:
105
+ # crossformer example
106
+ type: "crossformer"
107
+ frames: 1 # number of input states (default: 1)
108
+ image_height: 640 # number of latitude grids (default: 640)
109
+ image_width: 1280 # number of longitude grids (default: 1280)
110
+ levels: 16 # number of upper-air variable levels (default: 15)
111
+ channels: 4 # upper-air variable channels
112
+ surface_channels: 7 # surface variable channels
113
+ input_only_channels: 3 # dynamic forcing, forcing, static channels
114
+ output_only_channels: 0 # diagnostic variable channels
115
+
116
+ patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
117
+ patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
118
+ frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
119
+
120
+ dim: [128, 256, 512, 1024] # Dimensionality of each layer
121
+ depth: [2, 2, 8, 2] # Depth of each layer
122
+ global_window_size: [10, 5, 2, 1] # Global window size for each layer
123
+ local_window_size: 10 # Local window size
124
+ cross_embed_kernel_sizes: # kernel sizes for cross-embedding
125
+ - [4, 8, 16, 32]
126
+ - [2, 4]
127
+ - [2, 4]
128
+ - [2, 4]
129
+ cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
130
+ attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
131
+ ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
132
+
133
+ # map boundary padding
134
+ pad_lon: 80 # number of grids to pad on 0 and 360 deg lon
135
+ pad_lat: 80 # number of grids to pad on -90 and 90 deg lat
136
+
137
+ loss:
138
+ # the main training loss
139
+ training_loss: "mse"
140
+
141
+ # power loss (x), spectral_loss (x)
142
+ use_power_loss: False
143
+ use_spectral_loss: False
144
+
145
+ # use latitude weighting
146
+ use_latitude_weights: True
147
+ latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
148
+
149
+ # turn-off variable weighting
150
+ use_variable_weights: False
151
+
152
+ predict:
153
+ forecasts:
154
+ type: "custom" # keep it as "custom"
155
+ start_year: 2020 # year of the first initialization (where rollout will start)
156
+ start_month: 1 # month of the first initialization
157
+ start_day: 1 # day of the first initialization
158
+ start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
159
+ duration: 30 # number of days to initialize, starting from the (year, mon, day) above
160
+ # duration should be divisible by the number of GPUs
161
+ # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
162
+ days: 2 # forecast lead time as days (1 means 24-hour forecast)
163
+
164
+ save_forecast: '/glade/derecho/scratch/ksha/CREDIT/wxformer_6h/'
165
+ save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
166
+
167
+ # turn-off low-pass filter
168
+ use_laplace_filter: False
169
+
170
+ # deprecated
171
+ # save_format: "nc"
172
+
173
+ pbs: #derecho
174
+ conda: "/glade/work/ksha/miniconda3/envs/credit"
175
+ project: "NAML0001"
176
+ job_name: "wxformer_6h"
177
+ walltime: "12:00:00"
178
+ nodes: 8
179
+ ncpus: 64
180
+ ngpus: 4
181
+ mem: '480GB'
182
+ queue: 'main'
finetune_final/optimizer_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa64034131c007b4d2f5dedd41e21667cd97005e6aede6687cf49932da996c44
3
+ size 995925198