clear typo

- app.py +8 -8
- gen_table.py +5 -5
- meta_data.py +7 -0
- src/detail_math_score.json +3 -3
app.py
CHANGED
@@ -16,9 +16,9 @@ from meta_data import *
 # }
 # }
 
-# /*
+# /* Add checkbox styles */
 # .gr-checkbox {
-#     accent-color: rgb(59, 130, 246) !important; /*
+#     accent-color: rgb(59, 130, 246) !important; /* blue */
 # }
 
 # .gr-checkbox-group label input[type="checkbox"] {
@@ -78,14 +78,14 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 headers = ['Rank'] + check_box['essential'] + fields
 # df = overall_table.copy()
 
-#
+# Ensure all requested columns exist
 available_headers = [h for h in headers if h in overall_table.columns]
 
 original_columns = overall_table.columns.tolist()
 available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
 
 
-#
+# If no columns are available, return an empty DataFrame with basic columns
 if not available_headers:
     available_headers = ['Rank'] + check_box['essential']
 
@@ -159,7 +159,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 headers = ['Rank'] + fields
 df = table.copy()
 
-#
+# Filter data
 df['flag'] = df.apply(lambda row: (
     row['Algorithm'] in algos and
     row['Dataset'] in datasets and
@@ -169,12 +169,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 df = df[df['flag']].copy()
 df.pop('flag')
 
-#
+# Group by dataset and calculate ranking within each group based on Score
 if 'Score' in df.columns:
-    #
+    # Create a temporary ranking column
     df['Rank'] = df.groupby('Dataset')['Score'].rank(method='first', ascending=False)
 
-    #
+    # Ensure ranking is integer
    df['Rank'] = df['Rank'].astype(int)
 
 
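The functional code touched above boils down to two pandas patterns: keeping only the requested headers that actually exist in the table, and ranking rows within each Dataset group by Score. A minimal standalone sketch of both, using invented toy data rather than the leaderboard's real tables:

# Toy illustration of the column filtering and per-dataset ranking above.
# Data, algorithm, and dataset names here are invented.
import pandas as pd

overall = pd.DataFrame({
    'Algorithm': ['A', 'B', 'A', 'B'],
    'Dataset':   ['d1', 'd1', 'd2', 'd2'],
    'Score':     [80.0, 90.0, 85.0, 75.0],
})

# Keep only requested headers that exist, preserving the table's column order
headers = ['Rank', 'Algorithm', 'Dataset', 'Score', 'Not-A-Column']
available = [h for h in headers if h in overall.columns]
available = sorted(available, key=overall.columns.tolist().index)

# Rank within each Dataset by descending Score
overall['Rank'] = (overall.groupby('Dataset')['Score']
                   .rank(method='first', ascending=False)
                   .astype(int))

print(overall[['Rank'] + available])

With method='first', ties are broken by row order, so every row gets a distinct integer rank within its dataset, which is why the result can safely be cast to int.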
gen_table.py
CHANGED
@@ -34,14 +34,14 @@ def BUILD_L1_DF(results, fields):
 check_box = {}
 check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
 
-#
+# First check which columns exist in the actual data structure
 sample_data = next(iter(results.values()))
 available_fields = []
 for field in fields:
     if field in sample_data:
         available_fields.append(field)
 
-#
+# Build column names, ensure they match exactly with those in generate_table function
 score_columns = [f"{field}-Score" for field in available_fields]
 cost_columns = [f"{field}-Cost($)" for field in available_fields]
 
@@ -134,7 +134,7 @@ def generate_table(results, fields):
 res[k].append(meta[k])
 scores, costs = [], []
 
-#
+# Ensure column names format matches with BUILD_L1_DF
 for d in fields:
     if d in item:
         score = item[d].get("Score")
@@ -149,12 +149,12 @@ def generate_table(results, fields):
 res[f"{d}-Score"].append(None)
 res[f"{d}-Cost($)"].append(None)
 
-#
+# Calculate average score
 res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)
 
 df = pd.DataFrame(res)
 
-#
+# Sorting and ranking logic remains unchanged
 valid = df[~pd.isna(df['Avg Score'])].copy()
 missing = df[pd.isna(df['Avg Score'])].copy()
 
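The new comments stress that generate_table must emit exactly the "{field}-Score" / "{field}-Cost($)" column names that BUILD_L1_DF constructs. A hedged sketch of that naming convention and of the Avg Score aggregation, on an invented miniature results dict (field and algorithm names are illustrative, not the project's data):

# Invented miniature of the results dict, only to show the shared
# "{field}-Score" / "{field}-Cost($)" naming and the Avg Score aggregation.
import numpy as np
import pandas as pd

results = {
    'algo-A': {'bench-1': {'Score': 80.0, 'Cost($)': 0.10},
               'bench-2': {'Score': 60.0, 'Cost($)': 0.20}},
    'algo-B': {'bench-1': {'Score': 90.0, 'Cost($)': 0.30}},
}
fields = ['bench-1', 'bench-2']

rows = []
for name, item in results.items():
    row, scores = {'Algorithm': name}, []
    for d in fields:
        score = item.get(d, {}).get('Score')
        row[f"{d}-Score"] = score                 # must match BUILD_L1_DF's score_columns
        row[f"{d}-Cost($)"] = item.get(d, {}).get('Cost($)')
        if score is not None:
            scores.append(score)
    row['Avg Score'] = round(np.mean(scores), 2) if scores else None
    rows.append(row)

df = pd.DataFrame(rows)
valid = df[~pd.isna(df['Avg Score'])].copy()      # rows that have an average
missing = df[pd.isna(df['Avg Score'])].copy()     # rows without any score
print(df)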
meta_data.py
CHANGED
@@ -26,12 +26,19 @@ LEADERBOARD_MD['MATH_MAIN'] = f"""
 - Cost: The cost on each math Benchmarks (the lower the better).
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
+- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
 
 LEADERBOARD_MD['MATH_DETAIL'] = f"""
 ## Math task detail Evaluation Results
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}
+- Metrics:
+  - Score: The evaluation score on each math Benchmarks (the higher the better).
+  - Pass rate: The percentage of response that are valid, where a response is valid if it is neither empty nor null.
+  - Cost: The cost on each math Benchmarks (the lower the better).
+  - Rank: The rank on each math Benchmarks (the lower the better).
+
 - default parameters: temperature=0.0
 - LLM prices:
   - gpt-3.5-turbo:
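The new Pass rate bullet defines a valid response as one that is neither empty nor null. A tiny illustration of that definition on invented responses (treating whitespace-only output as empty is an extra assumption here):

# Invented sample responses, only to illustrate the Pass rate definition above:
# valid = neither empty nor null (whitespace-only treated as empty, an assumption).
responses = ["42", "", None, "x = 7", "   "]
valid = [r for r in responses if r is not None and r.strip() != ""]
pass_rate = 100.0 * len(valid) / len(responses)
print(f"Pass rate: {pass_rate:.1f}%")  # 40.0% on this toy list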
src/detail_math_score.json
CHANGED
@@ -273,10 +273,10 @@
             }
         }
     },
-    "ReAct-Pro": {
+    "ReAct-Pro*": {
         "gpt-3.5-turbo": {
             "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
@@ -309,7 +309,7 @@
         },
         "Doubao-lite-32k": {
             "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },
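Because the top-level key and META["Algorithm"] are renamed together, a small consistency check can catch future mismatches. This is only a sketch and assumes every top-level entry in the file follows the algorithm -> LLM -> {"META": {...}} layout visible in this hunk:

# Sketch of a consistency check; layout assumption noted above.
import json

with open("src/detail_math_score.json", encoding="utf-8") as f:
    data = json.load(f)

for algo, llms in data.items():
    for llm, entry in llms.items():
        meta = entry.get("META", {})
        # The dict key and META["Algorithm"] should agree, e.g. both "ReAct-Pro*"
        assert meta.get("Algorithm") == algo, (algo, llm, meta.get("Algorithm"))
print("all META Algorithm fields match their top-level keys")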