Spaces:
Running
Running
update eq-bench
Browse files
- app.py +5 -0
- benchmark_results.csv +11 -0
- metadata.json +7 -1
app.py
CHANGED
|
@@ -64,6 +64,8 @@ with demo:
|
|
| 64 |
def get_params(model_name):
|
| 65 |
if model_name in metadata:
|
| 66 |
return metadata[model_name]
|
|
|
|
|
|
|
| 67 |
return numpy.nan
|
| 68 |
|
| 69 |
|
|
@@ -77,6 +79,9 @@ with demo:
|
|
| 77 |
# change value of column to nan
|
| 78 |
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
|
| 79 |
|
|
|
|
|
|
|
|
|
|
| 80 |
# set datatype of column
|
| 81 |
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
|
| 82 |
leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
|
|
|
|
| 64 |
def get_params(model_name):
|
| 65 |
if model_name in metadata:
|
| 66 |
return metadata[model_name]
|
| 67 |
+
else:
|
| 68 |
+
print(model_name)
|
| 69 |
return numpy.nan
|
| 70 |
|
| 71 |
|
|
|
|
| 79 |
# change value of column to nan
|
| 80 |
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
|
| 81 |
|
| 82 |
+
# scale Benchmark Score by the fraction of parseable questions (Num Questions Parseable / 171)
|
| 83 |
+
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * ((leaderboard_df["Num Questions Parseable"].astype(float) / 171))
|
| 84 |
+
|
| 85 |
# set datatype of column
|
| 86 |
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
|
| 87 |
leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
|
benchmark_results.csv
CHANGED
|
@@ -34,6 +34,7 @@ openchat-gemma,2024-06-19 10:19:44,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-
|
|
| 34 |
Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 35 |
SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
|
| 36 |
Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
|
|
|
|
| 37 |
Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 38 |
Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
|
| 39 |
Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
|
|
@@ -141,3 +142,13 @@ mistralai/Mixtral-8x22B-v0.1,2024-06-21 20:20:37,,mistralai/Mixtral-8x22B-v0.1,,
|
|
| 141 |
mistralai/Mixtral-8x22B-Instruct-v0.1,2024-06-26 23:40:01,,mistralai/Mixtral-8x22B-Instruct-v0.1,,,67.63,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 142 |
mistralai/Mixtral-8x22B-v0.1,2024-06-27 01:17:13,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,50.0 questions were parseable (min is 83%)
|
| 143 |
alpindale/WizardLM-2-8x22B,2024-06-27 01:50:42,,alpindale/WizardLM-2-8x22B,,,69.56,eq-bench_v2_pl,171.0,1,transformers, ,,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 35 |
SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
|
| 36 |
Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 37 |
+
models/gwint2,2024-06-19 11:21:15,,speakleash/Bielik-11B-v2.0-Instruct,,,68.24,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 38 |
Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 39 |
Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
|
| 40 |
Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
|
|
|
|
| 142 |
mistralai/Mixtral-8x22B-Instruct-v0.1,2024-06-26 23:40:01,,mistralai/Mixtral-8x22B-Instruct-v0.1,,,67.63,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 143 |
mistralai/Mixtral-8x22B-v0.1,2024-06-27 01:17:13,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,50.0 questions were parseable (min is 83%)
|
| 144 |
alpindale/WizardLM-2-8x22B,2024-06-27 01:50:42,,alpindale/WizardLM-2-8x22B,,,69.56,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 145 |
+
Bielik_v2.2b,2024-08-24 09:54:33,,speakleash/Bielik-11B-v2.2-Instruct,,,69.05,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 146 |
+
Bielik_v2.1,2024-08-24 10:07:46,,speakleash/Bielik-11B-v2.1-Instruct,,,66.27,eq-bench_v2_pl,155.0,1,transformers, ,,
|
| 147 |
+
meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 21:24:39,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
|
| 148 |
+
mistralai/Mistral-Large-Instruct-2407,2024-08-24 21:51:53,,mistralai/Mistral-Large-Instruct-2407,,,78.07,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 149 |
+
meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 22:23:40,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,72.53,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 150 |
+
meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,2024-08-25 20:59:04,openai_api,meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,,,77.23,eq-bench_v2_pl,171.0,1,openai,,,
|
| 151 |
+
gpt-3.5-turbo,2024-08-25 21:14:25,openai_api,gpt-3.5-turbo,,,57.7,eq-bench_v2_pl,171.0,1,openai,,,
|
| 152 |
+
gpt-4o-mini-2024-07-18,2024-08-25 21:17:34,openai_api,gpt-4o-mini-2024-07-18,,,71.15,eq-bench_v2_pl,171.0,1,openai,,,
|
| 153 |
+
gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
|
| 154 |
+
gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
|
metadata.json
CHANGED
|
@@ -312,5 +312,11 @@
|
|
| 312 |
"microsoft/Phi-3-small-8k-instruct": 7.4,
|
| 313 |
"ssmits/Falcon2-5.5B-Polish": 5.5,
|
| 314 |
"alpindale/WizardLM-2-8x22B,max_length=4096": 141,
|
| 315 |
-
"dreamgen/WizardLM-2-7B": 7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
}
|
|
|
|
| 312 |
"microsoft/Phi-3-small-8k-instruct": 7.4,
|
| 313 |
"ssmits/Falcon2-5.5B-Polish": 5.5,
|
| 314 |
"alpindale/WizardLM-2-8x22B,max_length=4096": 141,
|
| 315 |
+
"dreamgen/WizardLM-2-7B": 7,
|
| 316 |
+
"mistralai/Mistral-Large-Instruct-2407": 123,
|
| 317 |
+
"meta-llama/Meta-Llama-3.1-70B-Instruct": 70,
|
| 318 |
+
"meta-llama/Meta-Llama-3.1-405B-Instruct-FP8": 405,
|
| 319 |
+
"speakleash/Bielik-11B-v2.0-Instruct": 11,
|
| 320 |
+
"speakleash/Bielik-11B-v2.2-Instruct": 11,
|
| 321 |
+
"speakleash/Bielik-11B-v2.1-Instruct": 11
|
| 322 |
}
|