Added single step and finetune weights
Browse files
- backup_single/.ipynb_checkpoints/training_log-checkpoint.csv +73 -0
- backup_single/backup_checkpoint.pt +3 -0
- backup_single/backup_model_checkpoint.pt +3 -0
- backup_single/backup_optimizer_checkpoint.pt +3 -0
- backup_single/best_checkpoint.pt +3 -0
- backup_single/best_model_checkpoint.pt +3 -0
- backup_single/best_optimizer_checkpoint.pt +3 -0
- backup_single/checkpoint.pt +3 -0
- backup_single/model_checkpoint.pt +3 -0
- backup_single/optimizer_checkpoint.pt +3 -0
- backup_single/training_log.csv +73 -0
- finetune_final/backup_checkpoint.pt +3 -0
- finetune_final/backup_model_checkpoint.pt +3 -0
- finetune_final/backup_optimizer_checkpoint.pt +3 -0
- finetune_final/best_checkpoint.pt +3 -0
- finetune_final/best_model_checkpoint.pt +3 -0
- finetune_final/best_optimizer_checkpoint.pt +3 -0
- finetune_final/casper_predict.sh +12 -0
- finetune_final/checkpoint.pt +3 -0
- finetune_final/derecho_predict.sh +46 -0
- finetune_final/launch_multi.sh +46 -0
- finetune_final/launch_predict.sh +86 -0
- finetune_final/launch_single.sh +46 -0
- finetune_final/model.yml +0 -0
- finetune_final/model_checkpoint.pt +3 -0
- finetune_final/model_multi.yml +218 -0
- finetune_final/model_predict.yml +117 -0
- finetune_final/model_single.yml +182 -0
- finetune_final/model_single_cached.yml +182 -0
- finetune_final/optimizer_checkpoint.pt +3 -0
backup_single/.ipynb_checkpoints/training_log-checkpoint.csv
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
index,epoch,train_loss,valid_loss,train_acc,valid_acc,train_mae,valid_mae,train_forecast_len,lr,valid_forecast_len
|
| 2 |
+
0,0.0,0.1009794427448193,0.1027883764770295,0.94327420322957,0.9739029765129088,0.1534553673871782,0.1048697493142551,1.0,0.0009994965332706,
|
| 3 |
+
1,1.0,0.019675959971095,0.0868489891290664,0.978004109926438,0.9811553531222872,0.0957085051940063,0.089651234779093,1.0,0.0009979871469976,
|
| 4 |
+
2,2.0,0.0151972808869911,0.0786430971490012,0.9831368846170018,0.984541834725274,0.0845575840662369,0.0817086655232641,1.0,0.0009954748808839,
|
| 5 |
+
3,3.0,0.0130550836908808,0.0741522577073839,0.9857291311360477,0.9864424957169426,0.0784229645735762,0.0773795636163817,1.0,0.0009919647942993,
|
| 6 |
+
4,4.0,0.0119756458991657,0.0715151440766122,0.9870807892820808,0.9872670186890496,0.074865428542488,0.0748724273509449,1.0,0.0009874639560909,
|
| 7 |
+
5,5.0,0.0112956749882256,0.0761996052331394,0.9878721300135838,0.9880205419328476,0.0725079033147083,0.0729023743006918,1.0,0.0009819814303479,
|
| 8 |
+
6,6.0,0.0108025243594782,0.0742437612679269,0.988425366228886,0.9885241680675084,0.0707315514512946,0.0707739344901508,1.0,0.0009755282581475,
|
| 9 |
+
7,7.0,0.010423366222112,0.0740774704350365,0.9888492891292894,0.9887091702885098,0.0693481621238287,0.070707170996401,1.0,0.0009681174353198,
|
| 10 |
+
8,8.0,0.0101209639440803,0.0727778664893574,0.9891780853606342,0.9890378952026369,0.0682236003239503,0.069233152270317,1.0,0.0009597638862757,
|
| 11 |
+
9,9.0,0.0098750877382463,0.0722406513161129,0.9894475843799247,0.9893197271558972,0.0672991637810227,0.0686951773034201,1.0,0.0009504844339512,
|
| 12 |
+
10,10.0,0.0096599225844308,0.0712286000450452,0.989684408903122,0.9894908361964756,0.0664751630988013,0.0675681359238094,1.0,0.0009402977659283,
|
| 13 |
+
11,11.0,0.0094818470246169,0.0702389660808775,0.9898788600825192,0.989692693286472,0.0657934360666556,0.0665735589133368,1.0,0.0009292243968009,
|
| 14 |
+
12,12.0,0.00931685034374,0.0702956752644644,0.9900572099042744,0.9898342741860284,0.0651478600091813,0.0665175035595893,1.0,0.0009172866268606,
|
| 15 |
+
13,13.0,0.0091760145034641,0.0696684651904636,0.99021095000626,0.9899267342355516,0.0645949800488319,0.0658018956581751,1.0,0.0009045084971874,
|
| 16 |
+
14,14.0,0.009054467713544,0.069178935388724,0.990342890546563,0.990099212858412,0.0641164990309416,0.0652842425637775,1.0,0.000890915741234,
|
| 17 |
+
15,15.0,0.008937944289234,0.0689917723337809,0.9904679285676292,0.9901854236920674,0.0636501339274678,0.0650799024436209,1.0,0.0008765357330018,
|
| 18 |
+
16,16.0,0.0088368062871728,0.0683553102943632,0.99057797605402,0.990315987004174,0.0632446752358855,0.0643851502074135,1.0,0.0008613974319136,
|
| 19 |
+
17,17.0,0.0087418129835068,0.0601519813968075,0.9906821045982704,0.990372551812066,0.0628605140498682,0.0643506565027766,1.0,0.0008455313244934,
|
| 20 |
+
18,18.0,0.0086546849888445,0.0597193823920355,0.9907743083961894,0.9904850191540188,0.062502304592243,0.0639373604622152,1.0,0.0008289693629698,
|
| 21 |
+
19,19.0,0.0085733014553931,0.0593640875485208,0.9908616850885112,0.9905507524808248,0.062172229688489,0.0636733880473507,1.0,0.0008117449009293,
|
| 22 |
+
20,20.0,0.0084998264312325,0.0590407683617538,0.9909425107615716,0.9906116353140936,0.0618693786913926,0.0633704620103041,1.0,0.0007938926261462,
|
| 23 |
+
21,21.0,0.0084292321816463,0.0588373551766077,0.99101791097207,0.9906822787390814,0.0615794747293497,0.0632358468241161,1.0,0.000775448490726,
|
| 24 |
+
22,22.0,0.0083631074385654,0.0583808085984653,0.9910881071278218,0.9907019972801208,0.061308835618449,0.0627665976683298,1.0,0.0007564496387029,
|
| 25 |
+
23,23.0,0.0082998081518525,0.0583362696071465,0.9911576903937908,0.9907915671666464,0.0610430538654327,0.0627390105691221,1.0,0.0007369343312364,
|
| 26 |
+
24,24.0,0.0082426977341241,0.0579863401750723,0.9912183459555164,0.9908138381110296,0.0608090944985827,0.0624360193808873,1.0,0.0007169418695587,
|
| 27 |
+
25,25.0,0.0081857938227882,0.0603038504573234,0.9912798084904638,0.9913705920570351,0.0605694888222418,0.0602742368208893,1.0,0.0006965125158269,
|
| 28 |
+
26,26.0,0.0081330842490234,0.0601081264069241,0.9913358248016808,0.9913957462551888,0.0603508613396729,0.0600969716236832,1.0,0.0006756874120406,
|
| 29 |
+
27,,,,,,,,,,
|
| 30 |
+
28,27.0,0.0080793036346773,0.0581171515087286,0.9913951707020234,0.990975715054406,0.0601222649823581,0.0619419970446162,1.0,0.0006545084971874,1.0
|
| 31 |
+
29,28.0,0.0080303936715373,0.0577960211369726,0.9914486467838288,0.991037999259101,0.0599184757893842,0.0616544663078255,1.0,0.0006330184227833,1.0
|
| 32 |
+
30,29.0,0.0079833286958275,0.0577082768082618,0.9914989711193556,0.9910609987046984,0.0597187550236167,0.0615710041589207,1.0,0.0006112604669781,1.0
|
| 33 |
+
31,30.0,0.0079363946847387,0.0573835965659883,0.991549631886268,0.9911303440729776,0.0595209216657146,0.0613138885961638,1.0,0.0005892784473993,1.0
|
| 34 |
+
32,31.0,0.0078930764391055,0.056579791340563,0.9915970942612444,0.9911372542381288,0.0593344994663689,0.0610890839662816,1.0,0.0005671166329088,1.0
|
| 35 |
+
33,32.0,0.0078520439688137,0.0566349982387489,0.9916415573171016,0.9911991384294296,0.0591628991109266,0.0612032888664139,1.0,0.0005448196544517,1.0
|
| 36 |
+
34,33.0,0.0078106326422538,0.056273600541883,0.99168776166573,0.9912242041693792,0.0589861686414714,0.0608326474825541,1.0,0.0005224324151752,1.0
|
| 37 |
+
35,34.0,0.0077714874694291,0.0560553298228316,0.9917285669720576,0.9912754522429572,0.0588185834813486,0.0606301685174306,1.0,0.0005,1.0
|
| 38 |
+
36,35.0,0.0077335360143438,0.0566925521526071,0.9917714921611078,0.9912715819146898,0.0586554208602965,0.0606366270118289,1.0,0.0004775675848247,1.0
|
| 39 |
+
37,36.0,0.0076967598778834,0.05643899159299,0.9918113076619888,0.9913144058651394,0.0584977892593721,0.0604506749245855,1.0,0.0004551803455482,1.0
|
| 40 |
+
38,37.0,0.0076611397421678,0.0564949775735537,0.9918505828702048,0.9912973059548272,0.0583438508379995,0.0604623614086045,1.0,0.0004328833670911,1.0
|
| 41 |
+
39,38.0,0.0076274576790505,0.0561670700709025,0.9918875504075813,0.991361051135593,0.0581993957395466,0.0601712295578585,1.0,0.0004107215526006,1.0
|
| 42 |
+
40,39.0,0.0075949914102034,0.0644460654093159,0.9919232221131914,0.9913600550757514,0.0580608733772729,0.0601557648016346,1.0,0.0003887395330218,1.0
|
| 43 |
+
41,40.0,0.0075620844107372,0.064289897100793,0.9919600250680796,0.9914333489206102,0.0579155459141965,0.0599343188107013,1.0,0.0003669815772166,1.0
|
| 44 |
+
42,41.0,0.0075315364097569,0.0641076669096946,0.9919931383280272,0.9914571894539728,0.057783521611369,0.0598166498045126,1.0,0.0003454915028125,1.0
|
| 45 |
+
43,42.0,0.0075010784370615,0.0640382651653554,0.9920276840416232,0.9914649923642476,0.0576503183692693,0.0596944563090801,1.0,0.0003243125879593,1.0
|
| 46 |
+
44,43.0,0.007472936848304,0.0639235826830069,0.9920596264721302,0.9915061102973092,0.0575274907805946,0.0595703116721577,1.0,0.000303487484173,1.0
|
| 47 |
+
45,44.0,0.0074444434827905,0.0639039695262908,0.9920910296815164,0.9915179557270474,0.0574037855532899,0.0595297686755657,1.0,0.0002830581304412,1.0
|
| 48 |
+
46,45.0,0.0074174716718118,0.0547702770266268,0.9921217421802242,0.9915496057934232,0.0572855648511413,0.0593745187752776,1.0,0.0002630656687635,1.0
|
| 49 |
+
47,46.0,0.0073907052564403,0.0548254136410024,0.9921517261963212,0.99152289364073,0.0571677961229775,0.0594311359028021,1.0,0.000243550361297,1.0
|
| 50 |
+
48,47.0,0.0073657991422151,0.0545883624090088,0.9921797737981496,0.99158462550905,0.0570587853304623,0.0592154045899709,1.0,0.0002245515092739,1.0
|
| 51 |
+
49,48.0,0.0073413327940624,0.0549789651400513,0.9922071443514876,0.9915956417719524,0.0569513976636729,0.0591794966823524,1.0,0.0002061073738537,1.0
|
| 52 |
+
50,49.0,0.0073178111899937,0.0549225054681301,0.992234143480826,0.9916122992833456,0.0568474041507317,0.0591052624086538,1.0,0.0001882550990706,1.0
|
| 53 |
+
51,50.0,0.0072954127986214,0.0547993102007442,0.9922600163837496,0.9916072050730388,0.0567492793638552,0.0590278472337457,1.0,0.0001710306370301,1.0
|
| 54 |
+
52,51.0,0.007273508728419,0.0633058957755565,0.99228490205963,0.9916460341877408,0.0566524313302354,0.0589173430369959,1.0,0.0001544686755065,1.0
|
| 55 |
+
53,52.0,0.0072527236388891,0.0541604169540935,0.9923085340957964,0.9916655633184644,0.0565598978545893,0.0588861417439248,1.0,0.0001386025680863,1.0
|
| 56 |
+
54,53.0,0.0072332986735142,0.0540786994000275,0.9923308940415972,0.9916760828759936,0.0564748854554268,0.0588100816640589,1.0,0.0001234642669981,1.0
|
| 57 |
+
55,54.0,0.0072141582214304,0.0539976932936244,0.992352688915274,0.991687744193607,0.0563887013038725,0.0587553933262825,1.0,0.0001090842587659,1.0
|
| 58 |
+
56,55.0,0.0071961989158011,0.0539236054652267,0.9923736041851258,0.991705028216044,0.0563090115932099,0.0586584944691922,1.0,9.54915028125264e-05,1.0
|
| 59 |
+
57,56.0,0.0071795341116637,0.0538788974285125,0.9923931933186028,0.9917213280995688,0.056234765938075,0.0586185601022508,1.0,8.271337313934873e-05,1.0
|
| 60 |
+
58,57.0,0.0071636978502835,0.0537894251445929,0.99241107735741,0.991727234257592,0.0561641372949554,0.0585509944293234,1.0,7.0775603199067e-05,1.0
|
| 61 |
+
59,58.0,0.0071492903426373,0.0628995653655793,0.9924280616339672,0.9917405486106872,0.0560998941426364,0.0585076812240812,1.0,5.970223407163104e-05,1.0
|
| 62 |
+
60,59.0,0.007135755741142,0.0628647624618477,0.992444023222066,0.9917464388741388,0.0560391753451542,0.0584567987256579,1.0,4.951556604879051e-05,1.0
|
| 63 |
+
61,60.0,0.0071229146066215,0.062842031651073,0.9924588899599032,0.9917527622646756,0.0559809200988894,0.0584456553889645,1.0,4.023611372427474e-05,1.0
|
| 64 |
+
62,,,,,,,,,,
|
| 65 |
+
63,61.0,0.0071116714105219,0.054225208527512,0.9924720488572388,0.9917570485009088,0.0559307862449897,0.0584127764734956,1.0,3.188256468013142e-05,1.0
|
| 66 |
+
64,62.0,0.0071012400428168,0.0541794104708565,0.9924842232398772,0.9917684568299188,0.0558836536754048,0.0583652096490065,1.0,2.447174185242325e-05,1.0
|
| 67 |
+
65,63.0,0.0070923216879012,0.0541527594129244,0.992494883698024,0.9917673733499316,0.0558433167908466,0.0583537354237503,1.0,1.8018569652073392e-05,1.0
|
| 68 |
+
66,64.0,0.0070842290056471,0.0541142652432123,0.9925042093134998,0.9917791989114548,0.055806908285601,0.0583142161369323,1.0,1.2536043909088198e-05,1.0
|
| 69 |
+
67,65.0,0.0070775183637134,0.0540941999190383,0.9925123827176148,0.9917832758691576,0.0557763982752568,0.0583013370633125,1.0,8.03520570068517e-06,1.0
|
| 70 |
+
68,66.0,0.0070716849972985,0.0540783844060368,0.9925192085544716,0.9917850136756896,0.0557499875869141,0.0582870386540889,1.0,4.525119116032653e-06,1.0
|
| 71 |
+
69,67.0,0.0070673599365701,0.0540728816555606,0.9925242980879344,0.9917882919311524,0.0557304875004325,0.058270867665608725,1.0,2.012853002380467e-06,1.0
|
| 72 |
+
70,68.0,0.007063989337073283,0.054064563827382194,0.9925282489382818,0.9917892840173509,0.055715665660714835,0.05826829680138164,1.0,5.034667293427055e-07,1.0
|
| 73 |
+
71,69.0,0.007062000732638695,0.05405975985858175,0.9925306557604436,0.9917901198069254,0.05570720089913419,0.0582624898188644,1.0,0.0,1.0
|
backup_single/backup_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9be8fb23430592b596035f034d342d18cb6c3c03c0921a3c7f79604808e9588
|
| 3 |
+
size 1132
|
backup_single/backup_model_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3fc4023ef69eb06b51fcde0f5a7fe5f1b6eadbca8f9c34590e2d40e4af01e01
|
| 3 |
+
size 499596942
|
backup_single/backup_optimizer_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5289c997b54e3ca945e950a7e5b135b3e20eead14c64ae0d0f7d8cee620ad902
|
| 3 |
+
size 995925198
|
backup_single/best_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:394d8dc509edd026166d2dfec1bdb8c01968dd47e41e9d5887503d6a3f6e52e5
|
| 3 |
+
size 1132
|
backup_single/best_model_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b279c4117c4c863769b2bbeb7394b202b4099df4988f9a43c4a6ef07f28f872c
|
| 3 |
+
size 499596942
|
backup_single/best_optimizer_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89edd5ebcf6ace8bc1813c577f47edc70fa05c134b832cbd439f4d72c7c511a9
|
| 3 |
+
size 995925198
|
backup_single/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a09df09ff71135cd8429fd199646fbaa0e2585c571eb6161f00b2a3bb9b9db53
|
| 3 |
+
size 1132
|
backup_single/model_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e342d121b2cb5eabf8adc15ced4bf2d499de99c1a8be5bdd4f0768cb92510dc
|
| 3 |
+
size 499596942
|
backup_single/optimizer_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8aafa5d31b09a6fa9ce6a322548188c3e5cef0e46c2c3c9fb91126763fdb720
|
| 3 |
+
size 995925198
|
backup_single/training_log.csv
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
index,epoch,train_loss,valid_loss,train_acc,valid_acc,train_mae,valid_mae,train_forecast_len,lr,valid_forecast_len
|
| 2 |
+
0,0.0,0.1009794427448193,0.1027883764770295,0.94327420322957,0.9739029765129088,0.1534553673871782,0.1048697493142551,1.0,0.0009994965332706,
|
| 3 |
+
1,1.0,0.019675959971095,0.0868489891290664,0.978004109926438,0.9811553531222872,0.0957085051940063,0.089651234779093,1.0,0.0009979871469976,
|
| 4 |
+
2,2.0,0.0151972808869911,0.0786430971490012,0.9831368846170018,0.984541834725274,0.0845575840662369,0.0817086655232641,1.0,0.0009954748808839,
|
| 5 |
+
3,3.0,0.0130550836908808,0.0741522577073839,0.9857291311360477,0.9864424957169426,0.0784229645735762,0.0773795636163817,1.0,0.0009919647942993,
|
| 6 |
+
4,4.0,0.0119756458991657,0.0715151440766122,0.9870807892820808,0.9872670186890496,0.074865428542488,0.0748724273509449,1.0,0.0009874639560909,
|
| 7 |
+
5,5.0,0.0112956749882256,0.0761996052331394,0.9878721300135838,0.9880205419328476,0.0725079033147083,0.0729023743006918,1.0,0.0009819814303479,
|
| 8 |
+
6,6.0,0.0108025243594782,0.0742437612679269,0.988425366228886,0.9885241680675084,0.0707315514512946,0.0707739344901508,1.0,0.0009755282581475,
|
| 9 |
+
7,7.0,0.010423366222112,0.0740774704350365,0.9888492891292894,0.9887091702885098,0.0693481621238287,0.070707170996401,1.0,0.0009681174353198,
|
| 10 |
+
8,8.0,0.0101209639440803,0.0727778664893574,0.9891780853606342,0.9890378952026369,0.0682236003239503,0.069233152270317,1.0,0.0009597638862757,
|
| 11 |
+
9,9.0,0.0098750877382463,0.0722406513161129,0.9894475843799247,0.9893197271558972,0.0672991637810227,0.0686951773034201,1.0,0.0009504844339512,
|
| 12 |
+
10,10.0,0.0096599225844308,0.0712286000450452,0.989684408903122,0.9894908361964756,0.0664751630988013,0.0675681359238094,1.0,0.0009402977659283,
|
| 13 |
+
11,11.0,0.0094818470246169,0.0702389660808775,0.9898788600825192,0.989692693286472,0.0657934360666556,0.0665735589133368,1.0,0.0009292243968009,
|
| 14 |
+
12,12.0,0.00931685034374,0.0702956752644644,0.9900572099042744,0.9898342741860284,0.0651478600091813,0.0665175035595893,1.0,0.0009172866268606,
|
| 15 |
+
13,13.0,0.0091760145034641,0.0696684651904636,0.99021095000626,0.9899267342355516,0.0645949800488319,0.0658018956581751,1.0,0.0009045084971874,
|
| 16 |
+
14,14.0,0.009054467713544,0.069178935388724,0.990342890546563,0.990099212858412,0.0641164990309416,0.0652842425637775,1.0,0.000890915741234,
|
| 17 |
+
15,15.0,0.008937944289234,0.0689917723337809,0.9904679285676292,0.9901854236920674,0.0636501339274678,0.0650799024436209,1.0,0.0008765357330018,
|
| 18 |
+
16,16.0,0.0088368062871728,0.0683553102943632,0.99057797605402,0.990315987004174,0.0632446752358855,0.0643851502074135,1.0,0.0008613974319136,
|
| 19 |
+
17,17.0,0.0087418129835068,0.0601519813968075,0.9906821045982704,0.990372551812066,0.0628605140498682,0.0643506565027766,1.0,0.0008455313244934,
|
| 20 |
+
18,18.0,0.0086546849888445,0.0597193823920355,0.9907743083961894,0.9904850191540188,0.062502304592243,0.0639373604622152,1.0,0.0008289693629698,
|
| 21 |
+
19,19.0,0.0085733014553931,0.0593640875485208,0.9908616850885112,0.9905507524808248,0.062172229688489,0.0636733880473507,1.0,0.0008117449009293,
|
| 22 |
+
20,20.0,0.0084998264312325,0.0590407683617538,0.9909425107615716,0.9906116353140936,0.0618693786913926,0.0633704620103041,1.0,0.0007938926261462,
|
| 23 |
+
21,21.0,0.0084292321816463,0.0588373551766077,0.99101791097207,0.9906822787390814,0.0615794747293497,0.0632358468241161,1.0,0.000775448490726,
|
| 24 |
+
22,22.0,0.0083631074385654,0.0583808085984653,0.9910881071278218,0.9907019972801208,0.061308835618449,0.0627665976683298,1.0,0.0007564496387029,
|
| 25 |
+
23,23.0,0.0082998081518525,0.0583362696071465,0.9911576903937908,0.9907915671666464,0.0610430538654327,0.0627390105691221,1.0,0.0007369343312364,
|
| 26 |
+
24,24.0,0.0082426977341241,0.0579863401750723,0.9912183459555164,0.9908138381110296,0.0608090944985827,0.0624360193808873,1.0,0.0007169418695587,
|
| 27 |
+
25,25.0,0.0081857938227882,0.0603038504573234,0.9912798084904638,0.9913705920570351,0.0605694888222418,0.0602742368208893,1.0,0.0006965125158269,
|
| 28 |
+
26,26.0,0.0081330842490234,0.0601081264069241,0.9913358248016808,0.9913957462551888,0.0603508613396729,0.0600969716236832,1.0,0.0006756874120406,
|
| 29 |
+
27,,,,,,,,,,
|
| 30 |
+
28,27.0,0.0080793036346773,0.0581171515087286,0.9913951707020234,0.990975715054406,0.0601222649823581,0.0619419970446162,1.0,0.0006545084971874,1.0
|
| 31 |
+
29,28.0,0.0080303936715373,0.0577960211369726,0.9914486467838288,0.991037999259101,0.0599184757893842,0.0616544663078255,1.0,0.0006330184227833,1.0
|
| 32 |
+
30,29.0,0.0079833286958275,0.0577082768082618,0.9914989711193556,0.9910609987046984,0.0597187550236167,0.0615710041589207,1.0,0.0006112604669781,1.0
|
| 33 |
+
31,30.0,0.0079363946847387,0.0573835965659883,0.991549631886268,0.9911303440729776,0.0595209216657146,0.0613138885961638,1.0,0.0005892784473993,1.0
|
| 34 |
+
32,31.0,0.0078930764391055,0.056579791340563,0.9915970942612444,0.9911372542381288,0.0593344994663689,0.0610890839662816,1.0,0.0005671166329088,1.0
|
| 35 |
+
33,32.0,0.0078520439688137,0.0566349982387489,0.9916415573171016,0.9911991384294296,0.0591628991109266,0.0612032888664139,1.0,0.0005448196544517,1.0
|
| 36 |
+
34,33.0,0.0078106326422538,0.056273600541883,0.99168776166573,0.9912242041693792,0.0589861686414714,0.0608326474825541,1.0,0.0005224324151752,1.0
|
| 37 |
+
35,34.0,0.0077714874694291,0.0560553298228316,0.9917285669720576,0.9912754522429572,0.0588185834813486,0.0606301685174306,1.0,0.0005,1.0
|
| 38 |
+
36,35.0,0.0077335360143438,0.0566925521526071,0.9917714921611078,0.9912715819146898,0.0586554208602965,0.0606366270118289,1.0,0.0004775675848247,1.0
|
| 39 |
+
37,36.0,0.0076967598778834,0.05643899159299,0.9918113076619888,0.9913144058651394,0.0584977892593721,0.0604506749245855,1.0,0.0004551803455482,1.0
|
| 40 |
+
38,37.0,0.0076611397421678,0.0564949775735537,0.9918505828702048,0.9912973059548272,0.0583438508379995,0.0604623614086045,1.0,0.0004328833670911,1.0
|
| 41 |
+
39,38.0,0.0076274576790505,0.0561670700709025,0.9918875504075813,0.991361051135593,0.0581993957395466,0.0601712295578585,1.0,0.0004107215526006,1.0
|
| 42 |
+
40,39.0,0.0075949914102034,0.0644460654093159,0.9919232221131914,0.9913600550757514,0.0580608733772729,0.0601557648016346,1.0,0.0003887395330218,1.0
|
| 43 |
+
41,40.0,0.0075620844107372,0.064289897100793,0.9919600250680796,0.9914333489206102,0.0579155459141965,0.0599343188107013,1.0,0.0003669815772166,1.0
|
| 44 |
+
42,41.0,0.0075315364097569,0.0641076669096946,0.9919931383280272,0.9914571894539728,0.057783521611369,0.0598166498045126,1.0,0.0003454915028125,1.0
|
| 45 |
+
43,42.0,0.0075010784370615,0.0640382651653554,0.9920276840416232,0.9914649923642476,0.0576503183692693,0.0596944563090801,1.0,0.0003243125879593,1.0
|
| 46 |
+
44,43.0,0.007472936848304,0.0639235826830069,0.9920596264721302,0.9915061102973092,0.0575274907805946,0.0595703116721577,1.0,0.000303487484173,1.0
|
| 47 |
+
45,44.0,0.0074444434827905,0.0639039695262908,0.9920910296815164,0.9915179557270474,0.0574037855532899,0.0595297686755657,1.0,0.0002830581304412,1.0
|
| 48 |
+
46,45.0,0.0074174716718118,0.0547702770266268,0.9921217421802242,0.9915496057934232,0.0572855648511413,0.0593745187752776,1.0,0.0002630656687635,1.0
|
| 49 |
+
47,46.0,0.0073907052564403,0.0548254136410024,0.9921517261963212,0.99152289364073,0.0571677961229775,0.0594311359028021,1.0,0.000243550361297,1.0
|
| 50 |
+
48,47.0,0.0073657991422151,0.0545883624090088,0.9921797737981496,0.99158462550905,0.0570587853304623,0.0592154045899709,1.0,0.0002245515092739,1.0
|
| 51 |
+
49,48.0,0.0073413327940624,0.0549789651400513,0.9922071443514876,0.9915956417719524,0.0569513976636729,0.0591794966823524,1.0,0.0002061073738537,1.0
|
| 52 |
+
50,49.0,0.0073178111899937,0.0549225054681301,0.992234143480826,0.9916122992833456,0.0568474041507317,0.0591052624086538,1.0,0.0001882550990706,1.0
|
| 53 |
+
51,50.0,0.0072954127986214,0.0547993102007442,0.9922600163837496,0.9916072050730388,0.0567492793638552,0.0590278472337457,1.0,0.0001710306370301,1.0
|
| 54 |
+
52,51.0,0.007273508728419,0.0633058957755565,0.99228490205963,0.9916460341877408,0.0566524313302354,0.0589173430369959,1.0,0.0001544686755065,1.0
|
| 55 |
+
53,52.0,0.0072527236388891,0.0541604169540935,0.9923085340957964,0.9916655633184644,0.0565598978545893,0.0588861417439248,1.0,0.0001386025680863,1.0
|
| 56 |
+
54,53.0,0.0072332986735142,0.0540786994000275,0.9923308940415972,0.9916760828759936,0.0564748854554268,0.0588100816640589,1.0,0.0001234642669981,1.0
|
| 57 |
+
55,54.0,0.0072141582214304,0.0539976932936244,0.992352688915274,0.991687744193607,0.0563887013038725,0.0587553933262825,1.0,0.0001090842587659,1.0
|
| 58 |
+
56,55.0,0.0071961989158011,0.0539236054652267,0.9923736041851258,0.991705028216044,0.0563090115932099,0.0586584944691922,1.0,9.54915028125264e-05,1.0
|
| 59 |
+
57,56.0,0.0071795341116637,0.0538788974285125,0.9923931933186028,0.9917213280995688,0.056234765938075,0.0586185601022508,1.0,8.271337313934873e-05,1.0
|
| 60 |
+
58,57.0,0.0071636978502835,0.0537894251445929,0.99241107735741,0.991727234257592,0.0561641372949554,0.0585509944293234,1.0,7.0775603199067e-05,1.0
|
| 61 |
+
59,58.0,0.0071492903426373,0.0628995653655793,0.9924280616339672,0.9917405486106872,0.0560998941426364,0.0585076812240812,1.0,5.970223407163104e-05,1.0
|
| 62 |
+
60,59.0,0.007135755741142,0.0628647624618477,0.992444023222066,0.9917464388741388,0.0560391753451542,0.0584567987256579,1.0,4.951556604879051e-05,1.0
|
| 63 |
+
61,60.0,0.0071229146066215,0.062842031651073,0.9924588899599032,0.9917527622646756,0.0559809200988894,0.0584456553889645,1.0,4.023611372427474e-05,1.0
|
| 64 |
+
62,,,,,,,,,,
|
| 65 |
+
63,61.0,0.0071116714105219,0.054225208527512,0.9924720488572388,0.9917570485009088,0.0559307862449897,0.0584127764734956,1.0,3.188256468013142e-05,1.0
|
| 66 |
+
64,62.0,0.0071012400428168,0.0541794104708565,0.9924842232398772,0.9917684568299188,0.0558836536754048,0.0583652096490065,1.0,2.447174185242325e-05,1.0
|
| 67 |
+
65,63.0,0.0070923216879012,0.0541527594129244,0.992494883698024,0.9917673733499316,0.0558433167908466,0.0583537354237503,1.0,1.8018569652073392e-05,1.0
|
| 68 |
+
66,64.0,0.0070842290056471,0.0541142652432123,0.9925042093134998,0.9917791989114548,0.055806908285601,0.0583142161369323,1.0,1.2536043909088198e-05,1.0
|
| 69 |
+
67,65.0,0.0070775183637134,0.0540941999190383,0.9925123827176148,0.9917832758691576,0.0557763982752568,0.0583013370633125,1.0,8.03520570068517e-06,1.0
|
| 70 |
+
68,66.0,0.0070716849972985,0.0540783844060368,0.9925192085544716,0.9917850136756896,0.0557499875869141,0.0582870386540889,1.0,4.525119116032653e-06,1.0
|
| 71 |
+
69,67.0,0.0070673599365701,0.0540728816555606,0.9925242980879344,0.9917882919311524,0.0557304875004325,0.058270867665608725,1.0,2.012853002380467e-06,1.0
|
| 72 |
+
70,68.0,0.007063989337073283,0.054064563827382194,0.9925282489382818,0.9917892840173509,0.055715665660714835,0.05826829680138164,1.0,5.034667293427055e-07,1.0
|
| 73 |
+
71,69.0,0.007062000732638695,0.05405975985858175,0.9925306557604436,0.9917901198069254,0.05570720089913419,0.0582624898188644,1.0,0.0,1.0
|
finetune_final/backup_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fb8e5eabec9f82f7b0a0e4ec83273a8ee2359359bb73a229116dfed8591f239
|
| 3 |
+
size 1260
|
finetune_final/backup_model_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:408ed1a5b4fb39a6f275e8426ed28b861529d728696273b526b069012a6ac1a5
|
| 3 |
+
size 499596942
|
finetune_final/backup_optimizer_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f0437aa22813d3039f703c77bd1a9031ec3a969aa4257bd8c9379f32fb2683d1
|
| 3 |
+
size 995925198
|
finetune_final/best_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ca1881774f11a71c79804851862002aa7bc93c4763f6ae97addfa4c4f31af07
|
| 3 |
+
size 1260
|
finetune_final/best_model_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:265a212fb0cb0474528d4a986458e99ca83444422d29d77065790632ceb7b0f3
|
| 3 |
+
size 499596942
|
finetune_final/best_optimizer_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa64034131c007b4d2f5dedd41e21667cd97005e6aede6687cf49932da996c44
|
| 3 |
+
size 995925198
|
finetune_final/casper_predict.sh
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash -l
|
| 2 |
+
#PBS -N wx12-pred
|
| 3 |
+
#PBS -l select=1:ncpus=8:ngpus=1:mem=128GB
|
| 4 |
+
#PBS -l walltime=12:00:00
|
| 5 |
+
#PBS -l gpu_type=a100
|
| 6 |
+
#PBS -A NAML0001
|
| 7 |
+
#PBS -q casper
|
| 8 |
+
#PBS -o out
|
| 9 |
+
#PBS -e out
|
| 10 |
+
source ~/.bashrc
|
| 11 |
+
conda activate credit
|
| 12 |
+
torchrun /glade/work/schreck/repos/credit/miles-credit/applications/rollout_metrics.py -c model_multi.yml --backend nccl
|
finetune_final/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ca1881774f11a71c79804851862002aa7bc93c4763f6ae97addfa4c4f31af07
|
| 3 |
+
size 1260
|
finetune_final/derecho_predict.sh
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#PBS -A NAML0001
|
| 3 |
+
##PBS -A NCIS0010
|
| 4 |
+
#PBS -N wx12_pred
|
| 5 |
+
#PBS -l walltime=12:00:00
|
| 6 |
+
#PBS -l select=8:ncpus=64:ngpus=4:mem=480GB
|
| 7 |
+
#PBS -q main
|
| 8 |
+
#PBS -j oe
|
| 9 |
+
#PBS -k eod
|
| 10 |
+
# Load modules
|
| 11 |
+
module purge
|
| 12 |
+
module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
|
| 13 |
+
conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
|
| 14 |
+
|
| 15 |
+
# Export environment variables
|
| 16 |
+
export LSCRATCH=/glade/derecho/scratch/schreck/
|
| 17 |
+
export LOGLEVEL=INFO
|
| 18 |
+
export NCCL_DEBUG=INFO
|
| 19 |
+
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
| 20 |
+
export NCCL_SOCKET_IFNAME=hsn
|
| 21 |
+
export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
|
| 22 |
+
export MPICH_OFI_NIC_POLICY=GPU
|
| 23 |
+
export MPICH_GPU_SUPPORT_ENABLED=1
|
| 24 |
+
export NCCL_IB_DISABLE=1
|
| 25 |
+
export NCCL_CROSS_NIC=1
|
| 26 |
+
export NCCL_NCHANNELS_PER_NET_PEER=4
|
| 27 |
+
export MPICH_RDMA_ENABLED_CUDA=1
|
| 28 |
+
export NCCL_NET="AWS Libfabric"
|
| 29 |
+
export NCCL_NET_GDR_LEVEL=PBH
|
| 30 |
+
export FI_CXI_DISABLE_HOST_REGISTER=1
|
| 31 |
+
export FI_CXI_OPTIMIZED_MRS=false
|
| 32 |
+
export FI_MR_CACHE_MONITOR=userfaultfd
|
| 33 |
+
export FI_CXI_DEFAULT_CQ_SIZE=131072
|
| 34 |
+
# Print the resource layout requested for this job
|
| 35 |
+
echo "Number of nodes: 8"
|
| 36 |
+
echo "Number of GPUs per node: 4"
|
| 37 |
+
echo "Total number of GPUs: 32"
|
| 38 |
+
# Log in to WandB if needed
|
| 39 |
+
# wandb login "$WANDB_API_KEY"  # read the key from the environment; never commit the literal key
|
| 40 |
+
# Launch MPIs
|
| 41 |
+
nodes=( $( cat $PBS_NODEFILE ) )
|
| 42 |
+
echo nodes: $nodes
|
| 43 |
+
# Find headnode's IP:
|
| 44 |
+
head_node=${nodes[0]}
|
| 45 |
+
head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
|
| 46 |
+
MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/work/schreck/repos/credit/miles-credit/applications/rollout_metrics.py -c model_multi.yml
|
finetune_final/launch_multi.sh
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#PBS -A NCIS0010
|
| 3 |
+
#PBS -N wx6h_12step
|
| 4 |
+
#PBS -l walltime=12:00:00
|
| 5 |
+
#PBS -l select=8:ncpus=64:ngpus=4
|
| 6 |
+
#PBS -q main
|
| 7 |
+
#PBS -j oe
|
| 8 |
+
#PBS -k eod
|
| 9 |
+
#PBS -r n
|
| 10 |
+
# Load modules
|
| 11 |
+
module purge
|
| 12 |
+
module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
|
| 13 |
+
conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
|
| 14 |
+
|
| 15 |
+
# Export environment variables
|
| 16 |
+
export LSCRATCH=/glade/derecho/scratch/schreck/
|
| 17 |
+
export LOGLEVEL=INFO
|
| 18 |
+
export NCCL_DEBUG=INFO
|
| 19 |
+
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
| 20 |
+
export NCCL_SOCKET_IFNAME=hsn
|
| 21 |
+
export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
|
| 22 |
+
export MPICH_OFI_NIC_POLICY=GPU
|
| 23 |
+
export MPICH_GPU_SUPPORT_ENABLED=1
|
| 24 |
+
export NCCL_IB_DISABLE=1
|
| 25 |
+
export NCCL_CROSS_NIC=1
|
| 26 |
+
export NCCL_NCHANNELS_PER_NET_PEER=4
|
| 27 |
+
export MPICH_RDMA_ENABLED_CUDA=1
|
| 28 |
+
export NCCL_NET="AWS Libfabric"
|
| 29 |
+
export NCCL_NET_GDR_LEVEL=PBH
|
| 30 |
+
export FI_CXI_DISABLE_HOST_REGISTER=1
|
| 31 |
+
export FI_CXI_OPTIMIZED_MRS=false
|
| 32 |
+
export FI_MR_CACHE_MONITOR=userfaultfd
|
| 33 |
+
export FI_CXI_DEFAULT_CQ_SIZE=131072
|
| 34 |
+
# Print the job layout (node and GPU counts)
|
| 35 |
+
echo "Number of nodes: 8"
|
| 36 |
+
echo "Number of GPUs per node: 4"
|
| 37 |
+
echo "Total number of GPUs: 32"
|
| 38 |
+
# Log in to WandB if needed
|
| 39 |
+
# wandb login "$WANDB_API_KEY"  # read the key from the environment; never commit a real key
|
| 40 |
+
# Launch MPIs
|
| 41 |
+
nodes=( $( cat $PBS_NODEFILE ) )
|
| 42 |
+
# Print every allocated node; a bare $nodes expands to only the first array element
echo "nodes: ${nodes[*]}"
|
| 43 |
+
# Find headnode's IP:
|
| 44 |
+
head_node=${nodes[0]}
|
| 45 |
+
head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
|
| 46 |
+
MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python train_multistep.py -c model_multi.yml --backend nccl
|
finetune_final/launch_predict.sh
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#PBS -A NCIS0010
|
| 3 |
+
#PBS -N wx6h_pred
|
| 4 |
+
#PBS -l walltime=12:00:00
|
| 5 |
+
#PBS -l select=8:ncpus=64:ngpus=4
|
| 6 |
+
#PBS -q main
|
| 7 |
+
#PBS -j oe
|
| 8 |
+
#PBS -k eod
|
| 9 |
+
#PBS -r n
|
| 10 |
+
# Load modules
|
| 11 |
+
module purge
|
| 12 |
+
module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
|
| 13 |
+
conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
|
| 14 |
+
|
| 15 |
+
# Export environment variables
|
| 16 |
+
export LSCRATCH=/glade/derecho/scratch/schreck/
|
| 17 |
+
export LOGLEVEL=INFO
|
| 18 |
+
export NCCL_DEBUG=INFO
|
| 19 |
+
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
| 20 |
+
export NCCL_SOCKET_IFNAME=hsn
|
| 21 |
+
export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
|
| 22 |
+
export MPICH_OFI_NIC_POLICY=GPU
|
| 23 |
+
export MPICH_GPU_SUPPORT_ENABLED=1
|
| 24 |
+
export NCCL_IB_DISABLE=1
|
| 25 |
+
export NCCL_CROSS_NIC=1
|
| 26 |
+
export NCCL_NCHANNELS_PER_NET_PEER=4
|
| 27 |
+
export MPICH_RDMA_ENABLED_CUDA=1
|
| 28 |
+
export NCCL_NET="AWS Libfabric"
|
| 29 |
+
export NCCL_NET_GDR_LEVEL=PBH
|
| 30 |
+
export FI_CXI_DISABLE_HOST_REGISTER=1
|
| 31 |
+
export FI_CXI_OPTIMIZED_MRS=false
|
| 32 |
+
export FI_MR_CACHE_MONITOR=userfaultfd
|
| 33 |
+
export FI_CXI_DEFAULT_CQ_SIZE=131072
|
| 34 |
+
# Print the job layout (node and GPU counts)
|
| 35 |
+
echo "Number of nodes: 8"
|
| 36 |
+
echo "Number of GPUs per node: 4"
|
| 37 |
+
echo "Total number of GPUs: 32"
|
| 38 |
+
# Log in to WandB if needed
|
| 39 |
+
# wandb login "$WANDB_API_KEY"  # read the key from the environment; never commit a real key
|
| 40 |
+
# Launch MPIs
|
| 41 |
+
nodes=( $( cat $PBS_NODEFILE ) )
|
| 42 |
+
# Print every allocated node; a bare $nodes expands to only the first array element
echo "nodes: ${nodes[*]}"
|
| 43 |
+
# Find headnode's IP:
|
| 44 |
+
head_node=${nodes[0]}
|
| 45 |
+
head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
|
| 46 |
+
|
| 47 |
+
# MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml --backend nccl
|
| 48 |
+
|
| 49 |
+
mpiexec -n 8 --ppn 1 --cpu-bind none torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d --rdzv-endpoint=$head_node_ip /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# module purge
|
| 53 |
+
# module load nvhpc cuda cray-mpich conda
|
| 54 |
+
# conda activate /glade/work/ksha/miniconda3/envs/credit
|
| 55 |
+
# # Get a list of allocated nodes
|
| 56 |
+
# nodes=( $( cat $PBS_NODEFILE ) )
|
| 57 |
+
# head_node=${nodes[0]}
|
| 58 |
+
# head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
|
| 59 |
+
# # Export environment variables
|
| 60 |
+
# export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
|
| 61 |
+
# export LSCRATCH=/glade/derecho/scratch/schreck/
|
| 62 |
+
# export LOGLEVEL=INFO
|
| 63 |
+
# #export NCCL_DEBUG=INFO
|
| 64 |
+
|
| 65 |
+
# export NCCL_SOCKET_IFNAME=hsn
|
| 66 |
+
# export NCCL_HOME=/glade/u/home/dhoward/work/nccl-ofi-plugin/install
|
| 67 |
+
# export LD_LIBRARY_PATH=$NCCL_HOME/lib:$NCCL_HOME/plugin/lib:$LD_LIBRARY_PATH
|
| 68 |
+
|
| 69 |
+
# export NCCL_NCHANNELS_PER_NET_PEER=4
|
| 70 |
+
# export MPICH_GPU_SUPPORT_ENABLED=1
|
| 71 |
+
# export MPICH_OFI_NIC_POLICY=GPU
|
| 72 |
+
# export MPICH_RDMA_ENABLED_CUDA=1
|
| 73 |
+
# export NCCL_IB_DISABLE=1
|
| 74 |
+
# export NCCL_CROSS_NIC=1
|
| 75 |
+
# export FI_CXI_DISABLE_HOST_REGISTER=1
|
| 76 |
+
# export FI_CXI_OPTIMIZED_MRS=false
|
| 77 |
+
|
| 78 |
+
# # Print the results
|
| 79 |
+
# echo "Number of nodes: 8"
|
| 80 |
+
# echo "Number of GPUs per node: 4"
|
| 81 |
+
# echo "Total number of GPUs: 32"
|
| 82 |
+
# # Log in to WandB if needed
|
| 83 |
+
# # wandb login "$WANDB_API_KEY"  # read the key from the environment; never commit a real key
|
| 84 |
+
|
| 85 |
+
# # Launch MPIs
|
| 86 |
+
# mpiexec -n 8 --ppn 1 --cpu-bind none torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d --rdzv-endpoint=$head_node_ip /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml
|
finetune_final/launch_single.sh
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#PBS -A NCIS0010
|
| 3 |
+
#PBS -N wx_6h
|
| 4 |
+
#PBS -l walltime=12:00:00
|
| 5 |
+
#PBS -l select=8:ncpus=64:ngpus=4
|
| 6 |
+
#PBS -q main
|
| 7 |
+
#PBS -j oe
|
| 8 |
+
#PBS -k eod
|
| 9 |
+
#PBS -r n
|
| 10 |
+
# Load modules
|
| 11 |
+
module purge
|
| 12 |
+
module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
|
| 13 |
+
conda activate /glade/work/ksha/miniconda3/envs/credit-derecho
|
| 14 |
+
# conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
|
| 15 |
+
# Export environment variables
|
| 16 |
+
export LSCRATCH=/glade/derecho/scratch/ksha/
|
| 17 |
+
export LOGLEVEL=INFO
|
| 18 |
+
export NCCL_DEBUG=INFO
|
| 19 |
+
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
| 20 |
+
export NCCL_SOCKET_IFNAME=hsn
|
| 21 |
+
export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
|
| 22 |
+
export MPICH_OFI_NIC_POLICY=GPU
|
| 23 |
+
export MPICH_GPU_SUPPORT_ENABLED=1
|
| 24 |
+
export NCCL_IB_DISABLE=1
|
| 25 |
+
export NCCL_CROSS_NIC=1
|
| 26 |
+
export NCCL_NCHANNELS_PER_NET_PEER=4
|
| 27 |
+
export MPICH_RDMA_ENABLED_CUDA=1
|
| 28 |
+
export NCCL_NET="AWS Libfabric"
|
| 29 |
+
export NCCL_NET_GDR_LEVEL=PBH
|
| 30 |
+
export FI_CXI_DISABLE_HOST_REGISTER=1
|
| 31 |
+
export FI_CXI_OPTIMIZED_MRS=false
|
| 32 |
+
export FI_MR_CACHE_MONITOR=userfaultfd
|
| 33 |
+
export FI_CXI_DEFAULT_CQ_SIZE=131072
|
| 34 |
+
# Print the job layout (node and GPU counts)
|
| 35 |
+
echo "Number of nodes: 8"
|
| 36 |
+
echo "Number of GPUs per node: 4"
|
| 37 |
+
echo "Total number of GPUs: 32"
|
| 38 |
+
# Log in to WandB if needed
|
| 39 |
+
# wandb login "$WANDB_API_KEY"  # read the key from the environment; never commit a real key
|
| 40 |
+
# Launch MPIs
|
| 41 |
+
nodes=( $( cat $PBS_NODEFILE ) )
|
| 42 |
+
# Print every allocated node; a bare $nodes expands to only the first array element
echo "nodes: ${nodes[*]}"
|
| 43 |
+
# Find headnode's IP:
|
| 44 |
+
head_node=${nodes[0]}
|
| 45 |
+
head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
|
| 46 |
+
MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/u/home/ksha/miles-credit/applications/train.py -c /glade/work/ksha/CREDIT_runs/wxformer_6h/model_single.yml --backend nccl
|
finetune_final/model.yml
ADDED
|
File without changes
|
finetune_final/model_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:265a212fb0cb0474528d4a986458e99ca83444422d29d77065790632ceb7b0f3
|
| 3 |
+
size 499596942
|
finetune_final/model_multi.yml
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 2 |
+
# This yaml file implements 6 hourly state-in-state-out crossformer
|
| 3 |
+
# on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
|
| 4 |
+
# The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
|
| 5 |
+
# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
|
| 6 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 7 |
+
save_loc: '/glade/u/home/schreck/scratch/finetune/wx12/'
|
| 8 |
+
seed: 1000
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
# upper-air variables
|
| 12 |
+
variables: ['U','V','T','Q']
|
| 13 |
+
save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
|
| 14 |
+
|
| 15 |
+
# surface variables
|
| 16 |
+
surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
|
| 17 |
+
save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
|
| 18 |
+
|
| 19 |
+
# dynamic forcing variables
|
| 20 |
+
dynamic_forcing_variables: ['tsi']
|
| 21 |
+
save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
|
| 22 |
+
|
| 23 |
+
# static variables
|
| 24 |
+
static_variables: ['Z_GDS4_SFC','LSM']
|
| 25 |
+
save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
|
| 26 |
+
|
| 27 |
+
# mean / std path
|
| 28 |
+
mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
|
| 29 |
+
std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
|
| 30 |
+
|
| 31 |
+
# train / validation split
|
| 32 |
+
train_years: [1979, 2018]
|
| 33 |
+
valid_years: [2018, 2019]
|
| 34 |
+
|
| 35 |
+
# data workflow
|
| 36 |
+
scaler_type: 'std_new'
|
| 37 |
+
|
| 38 |
+
# number of input states
|
| 39 |
+
# FuXi has 2 input states
|
| 40 |
+
history_len: 1
|
| 41 |
+
valid_history_len: 1
|
| 42 |
+
|
| 43 |
+
# number of forecast steps to compute loss
|
| 44 |
+
# 0 for single step training / validation
|
| 45 |
+
# larger than 0 for multi-step training / validation
|
| 46 |
+
forecast_len: 11
|
| 47 |
+
valid_forecast_len: 11
|
| 48 |
+
|
| 49 |
+
# one_shot: True --> compute loss on the last forecast step only
|
| 50 |
+
# one_shot: False --> compute loss on all forecast steps
|
| 51 |
+
one_shot: False
|
| 52 |
+
|
| 53 |
+
# 1 for hourly model
|
| 54 |
+
lead_time_periods: 6
|
| 55 |
+
|
| 56 |
+
# do not use skip_periods
|
| 57 |
+
skip_periods: null
|
| 58 |
+
|
| 59 |
+
# compatible with the old 'std'
|
| 60 |
+
static_first: True
|
| 61 |
+
|
| 62 |
+
trainer:
|
| 63 |
+
type: multi-step # <---------- change to your type
|
| 64 |
+
|
| 65 |
+
mode: fsdp
|
| 66 |
+
cpu_offload: False
|
| 67 |
+
activation_checkpoint: True
|
| 68 |
+
|
| 69 |
+
load_weights: True
|
| 70 |
+
load_optimizer: True
|
| 71 |
+
load_scaler: True
|
| 72 |
+
load_scheduler: True
|
| 73 |
+
reload_epoch: True
|
| 74 |
+
|
| 75 |
+
skip_validation: False
|
| 76 |
+
update_learning_rate: True
|
| 77 |
+
|
| 78 |
+
save_backup_weights: True
|
| 79 |
+
save_best_weights: True
|
| 80 |
+
|
| 81 |
+
learning_rate: 1.0e-05 # <-- change to your lr
|
| 82 |
+
weight_decay: 0
|
| 83 |
+
|
| 84 |
+
train_batch_size: 1
|
| 85 |
+
valid_batch_size: 1
|
| 86 |
+
|
| 87 |
+
batches_per_epoch: 100
|
| 88 |
+
valid_batches_per_epoch: 0
|
| 89 |
+
stopping_patience: 50
|
| 90 |
+
|
| 91 |
+
start_epoch: 0
|
| 92 |
+
num_epoch: 4
|
| 93 |
+
# NOTE: set reload_epoch to False when switching from single-step to multi-step
|
| 94 |
+
epochs: &epochs 20
|
| 95 |
+
use_scheduler: True
|
| 96 |
+
#scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs, 'last_epoch': -1}
|
| 97 |
+
scheduler:
|
| 98 |
+
scheduler_type: cosine-annealing-restarts
|
| 99 |
+
first_cycle_steps: 250
|
| 100 |
+
cycle_mult: 6.0 # Multiplier for steps in subsequent cycles
|
| 101 |
+
max_lr: 1.0e-05
|
| 102 |
+
min_lr: 1.0e-08
|
| 103 |
+
warmup_steps: 249
|
| 104 |
+
gamma: 0.7 # LR reduction factor at each cycle restart
|
| 105 |
+
|
| 106 |
+
# Automatic Mixed Precision: False
|
| 107 |
+
amp: False
|
| 108 |
+
|
| 109 |
+
# rescale loss as loss = loss / grad_accum_every
|
| 110 |
+
grad_accum_every: 1
|
| 111 |
+
# gradient clipping
|
| 112 |
+
grad_max_norm: 1.0
|
| 113 |
+
|
| 114 |
+
# number of workers
|
| 115 |
+
thread_workers: 4
|
| 116 |
+
valid_thread_workers: 4
|
| 117 |
+
|
| 118 |
+
model:
|
| 119 |
+
# crossformer example
|
| 120 |
+
type: "crossformer"
|
| 121 |
+
frames: 1 # number of input states (default: 1)
|
| 122 |
+
image_height: 640 # number of latitude grids (default: 640)
|
| 123 |
+
image_width: 1280 # number of longitude grids (default: 1280)
|
| 124 |
+
levels: 16 # number of upper-air variable levels (default: 15)
|
| 125 |
+
channels: 4 # upper-air variable channels
|
| 126 |
+
surface_channels: 7 # surface variable channels
|
| 127 |
+
input_only_channels: 3 # dynamic forcing, forcing, static channels
|
| 128 |
+
output_only_channels: 0 # diagnostic variable channels
|
| 129 |
+
|
| 130 |
+
patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
|
| 131 |
+
patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
|
| 132 |
+
frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
|
| 133 |
+
|
| 134 |
+
dim: [128, 256, 512, 1024] # Dimensionality of each layer
|
| 135 |
+
depth: [2, 2, 8, 2] # Depth of each layer
|
| 136 |
+
global_window_size: [10, 5, 2, 1] # Global window size for each layer
|
| 137 |
+
local_window_size: 10 # Local window size
|
| 138 |
+
cross_embed_kernel_sizes: # kernel sizes for cross-embedding
|
| 139 |
+
- [4, 8, 16, 32]
|
| 140 |
+
- [2, 4]
|
| 141 |
+
- [2, 4]
|
| 142 |
+
- [2, 4]
|
| 143 |
+
cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
|
| 144 |
+
attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
|
| 145 |
+
ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
|
| 146 |
+
|
| 147 |
+
use_spectral_norm: True
|
| 148 |
+
|
| 149 |
+
# =============================================================== #
|
| 150 |
+
# New
|
| 151 |
+
|
| 152 |
+
# use interpolation to match the output size
|
| 153 |
+
interp: True
|
| 154 |
+
|
| 155 |
+
# map boundary padding
|
| 156 |
+
padding_conf:
|
| 157 |
+
activate: True
|
| 158 |
+
mode: earth
|
| 159 |
+
pad_lat: 80
|
| 160 |
+
pad_lon: 80
|
| 161 |
+
|
| 162 |
+
post_conf:
|
| 163 |
+
activate: True
|
| 164 |
+
|
| 165 |
+
tracer_fixer:
|
| 166 |
+
activate: True
|
| 167 |
+
denorm: True
|
| 168 |
+
tracer_name: ['Q', 'Q500']
|
| 169 |
+
tracer_thres: [1e-8, 1e-8]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
loss:
|
| 173 |
+
# the main training loss
|
| 174 |
+
training_loss: "mse"
|
| 175 |
+
|
| 176 |
+
# power loss (x), spectral_loss (x)
|
| 177 |
+
use_power_loss: False
|
| 178 |
+
use_spectral_loss: False
|
| 179 |
+
|
| 180 |
+
# use latitude weighting
|
| 181 |
+
use_latitude_weights: True
|
| 182 |
+
latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
|
| 183 |
+
|
| 184 |
+
# turn-off variable weighting
|
| 185 |
+
use_variable_weights: False
|
| 186 |
+
|
| 187 |
+
predict:
|
| 188 |
+
forecasts:
|
| 189 |
+
type: "custom" # keep it as "custom"
|
| 190 |
+
start_year: 2019 # year of the first initialization (where rollout will start)
|
| 191 |
+
start_month: 1 # month of the first initialization
|
| 192 |
+
start_day: 1 # day of the first initialization
|
| 193 |
+
start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
|
| 194 |
+
duration: 1152 # number of days to initialize, starting from the (year, mon, day) above
|
| 195 |
+
# duration should be divisible by the number of GPUs
|
| 196 |
+
# (e.g., duration: 384 for 365-day rollout using 32 GPUs)
|
| 197 |
+
days: 10 # forecast lead time as days (1 means 24-hour forecast)
|
| 198 |
+
|
| 199 |
+
save_forecast: '/glade/u/home/schreck/scratch/finetune/wx12/netcdf/'
|
| 200 |
+
# save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
|
| 201 |
+
metadata: '/glade/u/home/ksha/miles-credit/credit/metadata/era5.yaml'
|
| 202 |
+
|
| 203 |
+
# turn-off low-pass filter
|
| 204 |
+
use_laplace_filter: False
|
| 205 |
+
|
| 206 |
+
# deprecated
|
| 207 |
+
# save_format: "nc"
|
| 208 |
+
|
| 209 |
+
pbs: #derecho
|
| 210 |
+
conda: "/glade/work/ksha/miniconda3/envs/credit"
|
| 211 |
+
project: "NAML0001"
|
| 212 |
+
job_name: "wxformer_6h"
|
| 213 |
+
walltime: "12:00:00"
|
| 214 |
+
nodes: 8
|
| 215 |
+
ncpus: 64
|
| 216 |
+
ngpus: 4
|
| 217 |
+
mem: '480GB'
|
| 218 |
+
queue: 'main'
|
finetune_final/model_predict.yml
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 2 |
+
# This yaml file implements 6 hourly state-in-state-out crossformer
|
| 3 |
+
# on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
|
| 4 |
+
# The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
|
| 5 |
+
# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
|
| 6 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 7 |
+
save_loc: '/glade/work/ksha/CREDIT_runs/wxformer_6h/'
|
| 8 |
+
seed: 1000
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
# upper-air variables
|
| 12 |
+
variables: ['U','V','T','Q']
|
| 13 |
+
save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
|
| 14 |
+
|
| 15 |
+
# surface variables
|
| 16 |
+
surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
|
| 17 |
+
save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
|
| 18 |
+
|
| 19 |
+
# dynamic forcing variables
|
| 20 |
+
dynamic_forcing_variables: ['tsi']
|
| 21 |
+
save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
|
| 22 |
+
|
| 23 |
+
# static variables
|
| 24 |
+
static_variables: ['Z_GDS4_SFC','LSM']
|
| 25 |
+
save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
|
| 26 |
+
|
| 27 |
+
# mean / std path
|
| 28 |
+
mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
|
| 29 |
+
std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
|
| 30 |
+
|
| 31 |
+
# train / validation split
|
| 32 |
+
train_years: [1979, 2018]
|
| 33 |
+
valid_years: [2018, 2019]
|
| 34 |
+
|
| 35 |
+
# data workflow
|
| 36 |
+
scaler_type: 'std_new'
|
| 37 |
+
|
| 38 |
+
history_len: 1
|
| 39 |
+
valid_history_len: 1
|
| 40 |
+
|
| 41 |
+
forecast_len: 0
|
| 42 |
+
valid_forecast_len: 0
|
| 43 |
+
|
| 44 |
+
# 1 for hourly model
|
| 45 |
+
lead_time_periods: 6
|
| 46 |
+
|
| 47 |
+
# do not use skip_periods
|
| 48 |
+
skip_periods: null
|
| 49 |
+
|
| 50 |
+
# compatible with the old 'std'
|
| 51 |
+
static_first: True
|
| 52 |
+
|
| 53 |
+
trainer:
|
| 54 |
+
mode: fsdp
|
| 55 |
+
type: standard
|
| 56 |
+
|
| 57 |
+
model:
|
| 58 |
+
# crossformer example
|
| 59 |
+
type: "crossformer"
|
| 60 |
+
frames: 1 # number of input states (default: 1)
|
| 61 |
+
image_height: 640 # number of latitude grids (default: 640)
|
| 62 |
+
image_width: 1280 # number of longitude grids (default: 1280)
|
| 63 |
+
levels: 16 # number of upper-air variable levels (default: 15)
|
| 64 |
+
channels: 4 # upper-air variable channels
|
| 65 |
+
surface_channels: 7 # surface variable channels
|
| 66 |
+
input_only_channels: 3 # dynamic forcing, forcing, static channels
|
| 67 |
+
output_only_channels: 0 # diagnostic variable channels
|
| 68 |
+
|
| 69 |
+
patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
|
| 70 |
+
patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
|
| 71 |
+
frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
|
| 72 |
+
|
| 73 |
+
dim: [128, 256, 512, 1024] # Dimensionality of each layer
|
| 74 |
+
depth: [2, 2, 8, 2] # Depth of each layer
|
| 75 |
+
global_window_size: [10, 5, 2, 1] # Global window size for each layer
|
| 76 |
+
local_window_size: 10 # Local window size
|
| 77 |
+
cross_embed_kernel_sizes: # kernel sizes for cross-embedding
|
| 78 |
+
- [4, 8, 16, 32]
|
| 79 |
+
- [2, 4]
|
| 80 |
+
- [2, 4]
|
| 81 |
+
- [2, 4]
|
| 82 |
+
cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
|
| 83 |
+
attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
|
| 84 |
+
ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
|
| 85 |
+
|
| 86 |
+
interp: True
|
| 87 |
+
|
| 88 |
+
# map boundary padding
|
| 89 |
+
padding_conf:
|
| 90 |
+
activate: True
|
| 91 |
+
mode: mirror
|
| 92 |
+
pad_lon: 80 # number of grids to pad on 0 and 360 deg lon
|
| 93 |
+
pad_lat: 80 # number of grids to pad on -90 and 90 deg lat
|
| 94 |
+
|
| 95 |
+
loss:
|
| 96 |
+
use_latitude_weights: True
|
| 97 |
+
latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
predict:
|
| 101 |
+
forecasts:
|
| 102 |
+
type: "custom" # keep it as "custom"
|
| 103 |
+
start_year: 2020 # year of the first initialization (where rollout will start)
|
| 104 |
+
start_month: 1 # month of the first initialization
|
| 105 |
+
start_day: 1 # day of the first initialization
|
| 106 |
+
start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
|
| 107 |
+
duration: 32 # number of days to initialize, starting from the (year, mon, day) above
|
| 108 |
+
# duration should be divisible by the number of GPUs
|
| 109 |
+
# (e.g., duration: 384 for 365-day rollout using 32 GPUs)
|
| 110 |
+
days: 10 # forecast lead time as days (1 means 24-hour forecast)
|
| 111 |
+
|
| 112 |
+
metadata: '/glade/u/home/ksha/miles-credit/credit/metadata/era5.yaml'
|
| 113 |
+
save_forecast: '/glade/derecho/scratch/ksha/CREDIT/RAW_OUTPUT/wxformer_6h_test/'
|
| 114 |
+
|
| 115 |
+
# turn-off low-pass filter
|
| 116 |
+
use_laplace_filter: False
|
| 117 |
+
|
finetune_final/model_single.yml
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 2 |
+
# This yaml file implements 6 hourly state-in-state-out crossformer
|
| 3 |
+
# on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
|
| 4 |
+
# The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
|
| 5 |
+
# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
|
| 6 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 7 |
+
save_loc: '/glade/work/ksha/CREDIT_runs/wxformer_6h/'
|
| 8 |
+
seed: 1000
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
# upper-air variables
|
| 12 |
+
variables: ['U','V','T','Q']
|
| 13 |
+
save_loc: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
|
| 14 |
+
|
| 15 |
+
# surface variables
|
| 16 |
+
surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
|
| 17 |
+
save_loc_surface: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
|
| 18 |
+
|
| 19 |
+
# dynamic forcing variables
|
| 20 |
+
dynamic_forcing_variables: ['tsi']
|
| 21 |
+
save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
|
| 22 |
+
|
| 23 |
+
# static variables
|
| 24 |
+
static_variables: ['Z_GDS4_SFC','LSM']
|
| 25 |
+
save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
|
| 26 |
+
|
| 27 |
+
# mean / std path
|
| 28 |
+
mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
|
| 29 |
+
std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
|
| 30 |
+
|
| 31 |
+
# train / validation split
|
| 32 |
+
train_years: [1979, 2018]
|
| 33 |
+
valid_years: [2018, 2019]
|
| 34 |
+
|
| 35 |
+
# data workflow
|
| 36 |
+
scaler_type: 'std_new'
|
| 37 |
+
|
| 38 |
+
# state-in-state-out
|
| 39 |
+
history_len: 1
|
| 40 |
+
valid_history_len: 1
|
| 41 |
+
|
| 42 |
+
forecast_len: 0
|
| 43 |
+
valid_forecast_len: 0
|
| 44 |
+
|
| 45 |
+
one_shot: True
|
| 46 |
+
|
| 47 |
+
# 1 for hourly model
|
| 48 |
+
lead_time_periods: 6
|
| 49 |
+
|
| 50 |
+
# do not use skip_periods
|
| 51 |
+
skip_periods: null
|
| 52 |
+
|
| 53 |
+
# compatible with the old 'std'
|
| 54 |
+
static_first: True
|
| 55 |
+
|
| 56 |
+
trainer:
|
| 57 |
+
type: standard # <---------- change to your type
|
| 58 |
+
|
| 59 |
+
mode: fsdp
|
| 60 |
+
cpu_offload: False
|
| 61 |
+
activation_checkpoint: True
|
| 62 |
+
|
| 63 |
+
load_weights: True
|
| 64 |
+
load_optimizer: True
|
| 65 |
+
load_scaler: True
|
| 66 |
+
load_scheduler: True # key spelled as in model_multi.yml
|
| 67 |
+
|
| 68 |
+
skip_validation: False
|
| 69 |
+
update_learning_rate: False
|
| 70 |
+
|
| 71 |
+
save_backup_weights: True
|
| 72 |
+
save_best_weights: True
|
| 73 |
+
|
| 74 |
+
learning_rate: 1.0e-03 # <-- change to your lr
|
| 75 |
+
weight_decay: 0
|
| 76 |
+
|
| 77 |
+
train_batch_size: 1
|
| 78 |
+
valid_batch_size: 1
|
| 79 |
+
|
| 80 |
+
batches_per_epoch: 0
|
| 81 |
+
valid_batches_per_epoch: 0
|
| 82 |
+
stopping_patience: 999
|
| 83 |
+
|
| 84 |
+
start_epoch: 0
|
| 85 |
+
num_epoch: 6
|
| 86 |
+
reload_epoch: True
|
| 87 |
+
epochs: &epochs 70
|
| 88 |
+
|
| 89 |
+
use_scheduler: True
|
| 90 |
+
scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs, 'last_epoch': -1}
|
| 91 |
+
|
| 92 |
+
# Automatic Mixed Precision: False
|
| 93 |
+
amp: False
|
| 94 |
+
|
| 95 |
+
# rescale loss as loss = loss / grad_accum_every
|
| 96 |
+
grad_accum_every: 1
|
| 97 |
+
# gradient clipping
|
| 98 |
+
grad_max_norm: 1.0
|
| 99 |
+
|
| 100 |
+
# number of workers
|
| 101 |
+
thread_workers: 4
|
| 102 |
+
valid_thread_workers: 0
|
| 103 |
+
|
| 104 |
+
model:
|
| 105 |
+
# crossformer example
|
| 106 |
+
type: "crossformer"
|
| 107 |
+
frames: 1 # number of input states (default: 1)
|
| 108 |
+
image_height: 640 # number of latitude grids (default: 640)
|
| 109 |
+
image_width: 1280 # number of longitude grids (default: 1280)
|
| 110 |
+
levels: 16 # number of upper-air variable levels (default: 15)
|
| 111 |
+
channels: 4 # upper-air variable channels
|
| 112 |
+
surface_channels: 7 # surface variable channels
|
| 113 |
+
input_only_channels: 3 # dynamic forcing, forcing, static channels
|
| 114 |
+
output_only_channels: 0 # diagnostic variable channels
|
| 115 |
+
|
| 116 |
+
patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
|
| 117 |
+
patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
|
| 118 |
+
frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
|
| 119 |
+
|
| 120 |
+
dim: [128, 256, 512, 1024] # Dimensionality of each layer
|
| 121 |
+
depth: [2, 2, 8, 2] # Depth of each layer
|
| 122 |
+
global_window_size: [10, 5, 2, 1] # Global window size for each layer
|
| 123 |
+
local_window_size: 10 # Local window size
|
| 124 |
+
cross_embed_kernel_sizes: # kernel sizes for cross-embedding
|
| 125 |
+
- [4, 8, 16, 32]
|
| 126 |
+
- [2, 4]
|
| 127 |
+
- [2, 4]
|
| 128 |
+
- [2, 4]
|
| 129 |
+
cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
|
| 130 |
+
attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
|
| 131 |
+
ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
|
| 132 |
+
|
| 133 |
+
# map boundary padding
|
| 134 |
+
pad_lon: 80 # number of grids to pad on 0 and 360 deg lon
|
| 135 |
+
pad_lat: 80 # number of grids to pad on -90 and 90 deg lat
|
| 136 |
+
|
| 137 |
+
loss:
|
| 138 |
+
# the main training loss
|
| 139 |
+
training_loss: "mse"
|
| 140 |
+
|
| 141 |
+
# power loss (x), spectral_loss (x)
|
| 142 |
+
use_power_loss: False
|
| 143 |
+
use_spectral_loss: False
|
| 144 |
+
|
| 145 |
+
# use latitude weighting
|
| 146 |
+
use_latitude_weights: True
|
| 147 |
+
latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
|
| 148 |
+
|
| 149 |
+
# turn-off variable weighting
|
| 150 |
+
use_variable_weights: False
|
| 151 |
+
|
| 152 |
+
predict:
|
| 153 |
+
forecasts:
|
| 154 |
+
type: "custom" # keep it as "custom"
|
| 155 |
+
start_year: 2020 # year of the first initialization (where rollout will start)
|
| 156 |
+
start_month: 1 # month of the first initialization
|
| 157 |
+
start_day: 1 # day of the first initialization
|
| 158 |
+
start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
|
| 159 |
+
duration: 30 # number of days to initialize, starting from the (year, mon, day) above
|
| 160 |
+
# duration should be divisible by the number of GPUs
|
| 161 |
+
# (e.g., duration: 384 for 365-day rollout using 32 GPUs)
|
| 162 |
+
days: 2 # forecast lead time as days (1 means 24-hour forecast)
|
| 163 |
+
|
| 164 |
+
save_forecast: '/glade/derecho/scratch/ksha/CREDIT/wxformer_6h/'
|
| 165 |
+
save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
|
| 166 |
+
|
| 167 |
+
# turn-off low-pass filter
|
| 168 |
+
use_laplace_filter: False
|
| 169 |
+
|
| 170 |
+
# deprecated
|
| 171 |
+
# save_format: "nc"
|
| 172 |
+
|
| 173 |
+
pbs: #derecho
|
| 174 |
+
conda: "/glade/work/ksha/miniconda3/envs/credit"
|
| 175 |
+
project: "NAML0001"
|
| 176 |
+
job_name: "wxformer_6h"
|
| 177 |
+
walltime: "12:00:00"
|
| 178 |
+
nodes: 8
|
| 179 |
+
ncpus: 64
|
| 180 |
+
ngpus: 4
|
| 181 |
+
mem: '480GB'
|
| 182 |
+
queue: 'main'
|
finetune_final/model_single_cached.yml
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 2 |
+
# This yaml file implements 6 hourly state-in-state-out crossformer
|
| 3 |
+
# on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
|
| 4 |
+
# The model is trained on 6 hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask
|
| 5 |
+
# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
|
| 6 |
+
# --------------------------------------------------------------------------------------------------------------------- #
|
| 7 |
+
save_loc: '/glade/work/ksha/CREDIT_runs/wxformer_6h/'
|
| 8 |
+
seed: 1000
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
# upper-air variables
|
| 12 |
+
variables: ['U','V','T','Q']
|
| 13 |
+
save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
|
| 14 |
+
|
| 15 |
+
# surface variables
|
| 16 |
+
surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
|
| 17 |
+
save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
|
| 18 |
+
|
| 19 |
+
# dynamic forcing variables
|
| 20 |
+
dynamic_forcing_variables: ['tsi']
|
| 21 |
+
save_loc_dynamic_forcing: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
|
| 22 |
+
|
| 23 |
+
# static variables
|
| 24 |
+
static_variables: ['Z_GDS4_SFC','LSM']
|
| 25 |
+
save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
|
| 26 |
+
|
| 27 |
+
# mean / std path
|
| 28 |
+
mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
|
| 29 |
+
std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
|
| 30 |
+
|
| 31 |
+
# train / validation split
|
| 32 |
+
train_years: [1979, 2018]
|
| 33 |
+
valid_years: [2018, 2019]
|
| 34 |
+
|
| 35 |
+
# data workflow
|
| 36 |
+
scaler_type: 'std_cached'
|
| 37 |
+
|
| 38 |
+
# state-in-state-out
|
| 39 |
+
history_len: 1
|
| 40 |
+
valid_history_len: 1
|
| 41 |
+
|
| 42 |
+
forecast_len: 0
|
| 43 |
+
valid_forecast_len: 0
|
| 44 |
+
|
| 45 |
+
one_shot: True
|
| 46 |
+
|
| 47 |
+
# lead time per step in hours: 6 for this 6-hourly model (use 1 for an hourly model)
|
| 48 |
+
lead_time_periods: 6
|
| 49 |
+
|
| 50 |
+
# do not use skip_periods
|
| 51 |
+
skip_periods: null
|
| 52 |
+
|
| 53 |
+
# compatible with the old 'std'
|
| 54 |
+
static_first: True
|
| 55 |
+
|
| 56 |
+
trainer:
|
| 57 |
+
type: standard # <---------- change to your type
|
| 58 |
+
|
| 59 |
+
mode: fsdp
|
| 60 |
+
cpu_offload: False
|
| 61 |
+
activation_checkpoint: True
|
| 62 |
+
|
| 63 |
+
load_weights: True
|
| 64 |
+
load_optimizer: True
|
| 65 |
+
load_scaler: True
|
| 66 |
+
load_sheduler: True
|
| 67 |
+
|
| 68 |
+
skip_validation: False
|
| 69 |
+
update_learning_rate: False
|
| 70 |
+
|
| 71 |
+
save_backup_weights: True
|
| 72 |
+
save_best_weights: True
|
| 73 |
+
|
| 74 |
+
learning_rate: 1.0e-03 # <-- change to your lr
|
| 75 |
+
weight_decay: 0
|
| 76 |
+
|
| 77 |
+
train_batch_size: 1
|
| 78 |
+
valid_batch_size: 1
|
| 79 |
+
|
| 80 |
+
batches_per_epoch: 0
|
| 81 |
+
valid_batches_per_epoch: 0
|
| 82 |
+
stopping_patience: 999
|
| 83 |
+
|
| 84 |
+
start_epoch: 0
|
| 85 |
+
num_epoch: 6
|
| 86 |
+
reload_epoch: True
|
| 87 |
+
epochs: &epochs 70
|
| 88 |
+
|
| 89 |
+
use_scheduler: True
|
| 90 |
+
scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs, 'last_epoch': -1}
|
| 91 |
+
|
| 92 |
+
# Automatic Mixed Precision: False
|
| 93 |
+
amp: False
|
| 94 |
+
|
| 95 |
+
# rescale loss as loss = loss / grad_accum_every
|
| 96 |
+
grad_accum_every: 1
|
| 97 |
+
# gradient clipping
|
| 98 |
+
grad_max_norm: 1.0
|
| 99 |
+
|
| 100 |
+
# number of workers
|
| 101 |
+
thread_workers: 4
|
| 102 |
+
valid_thread_workers: 0
|
| 103 |
+
|
| 104 |
+
model:
|
| 105 |
+
# crossformer example
|
| 106 |
+
type: "crossformer"
|
| 107 |
+
frames: 1 # number of input states (default: 1)
|
| 108 |
+
image_height: 640 # number of latitude grids (default: 640)
|
| 109 |
+
image_width: 1280 # number of longitude grids (default: 1280)
|
| 110 |
+
levels: 16 # number of upper-air variable levels (default: 15)
|
| 111 |
+
channels: 4 # upper-air variable channels
|
| 112 |
+
surface_channels: 7 # surface variable channels
|
| 113 |
+
input_only_channels: 3 # dynamic forcing, forcing, static channels
|
| 114 |
+
output_only_channels: 0 # diagnostic variable channels
|
| 115 |
+
|
| 116 |
+
patch_width: 1 # number of longitude grids in each 3D patch (default: 1)
|
| 117 |
+
patch_height: 1 # number of latitude grids in each 3D patch (default: 1)
|
| 118 |
+
frame_patch_size: 1 # number of input states in each 3D patch (default: 1)
|
| 119 |
+
|
| 120 |
+
dim: [128, 256, 512, 1024] # Dimensionality of each layer
|
| 121 |
+
depth: [2, 2, 8, 2] # Depth of each layer
|
| 122 |
+
global_window_size: [10, 5, 2, 1] # Global window size for each layer
|
| 123 |
+
local_window_size: 10 # Local window size
|
| 124 |
+
cross_embed_kernel_sizes: # kernel sizes for cross-embedding
|
| 125 |
+
- [4, 8, 16, 32]
|
| 126 |
+
- [2, 4]
|
| 127 |
+
- [2, 4]
|
| 128 |
+
- [2, 4]
|
| 129 |
+
cross_embed_strides: [2, 2, 2, 2] # Strides for cross-embedding (default: [4, 2, 2, 2])
|
| 130 |
+
attn_dropout: 0. # Dropout probability for attention layers (default: 0.0)
|
| 131 |
+
ff_dropout: 0. # Dropout probability for feed-forward layers (default: 0.0)
|
| 132 |
+
|
| 133 |
+
# map boundary padding
|
| 134 |
+
pad_lon: 80 # number of grids to pad on 0 and 360 deg lon
|
| 135 |
+
pad_lat: 80 # number of grids to pad on -90 and 90 deg lat
|
| 136 |
+
|
| 137 |
+
loss:
|
| 138 |
+
# the main training loss
|
| 139 |
+
training_loss: "mse"
|
| 140 |
+
|
| 141 |
+
# power loss and spectral loss (both disabled)
|
| 142 |
+
use_power_loss: False
|
| 143 |
+
use_spectral_loss: False
|
| 144 |
+
|
| 145 |
+
# use latitude weighting
|
| 146 |
+
use_latitude_weights: True
|
| 147 |
+
latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
|
| 148 |
+
|
| 149 |
+
# turn-off variable weighting
|
| 150 |
+
use_variable_weights: False
|
| 151 |
+
|
| 152 |
+
predict:
|
| 153 |
+
forecasts:
|
| 154 |
+
type: "custom" # keep it as "custom"
|
| 155 |
+
start_year: 2020 # year of the first initialization (where rollout will start)
|
| 156 |
+
start_month: 1 # month of the first initialization
|
| 157 |
+
start_day: 1 # day of the first initialization
|
| 158 |
+
start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
|
| 159 |
+
duration: 30 # number of days to initialize, starting from the (year, mon, day) above
|
| 160 |
+
# duration should be divisible by the number of GPUs
|
| 161 |
+
# (e.g., duration: 384 for 365-day rollout using 32 GPUs)
|
| 162 |
+
days: 2 # forecast lead time as days (1 means 24-hour forecast)
|
| 163 |
+
|
| 164 |
+
save_forecast: '/glade/derecho/scratch/ksha/CREDIT/wxformer_6h/'
|
| 165 |
+
save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
|
| 166 |
+
|
| 167 |
+
# turn-off low-pass filter
|
| 168 |
+
use_laplace_filter: False
|
| 169 |
+
|
| 170 |
+
# deprecated
|
| 171 |
+
# save_format: "nc"
|
| 172 |
+
|
| 173 |
+
pbs: #derecho
|
| 174 |
+
conda: "/glade/work/ksha/miniconda3/envs/credit"
|
| 175 |
+
project: "NAML0001"
|
| 176 |
+
job_name: "wxformer_6h"
|
| 177 |
+
walltime: "12:00:00"
|
| 178 |
+
nodes: 8
|
| 179 |
+
ncpus: 64
|
| 180 |
+
ngpus: 4
|
| 181 |
+
mem: '480GB'
|
| 182 |
+
queue: 'main'
|
finetune_final/optimizer_checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa64034131c007b4d2f5dedd41e21667cd97005e6aede6687cf49932da996c44
|
| 3 |
+
size 995925198
|