clear typo

- app.py +8 -8
- gen_table.py +5 -5
- meta_data.py +7 -0
- src/detail_math_score.json +3 -3
app.py
CHANGED
@@ -16,9 +16,9 @@ from meta_data import *
 # }
 # }
 
-# /*
+# /* Add checkbox styles */
 # .gr-checkbox {
-#     accent-color: rgb(59, 130, 246) !important; /*
+#     accent-color: rgb(59, 130, 246) !important; /* blue */
 # }
 
 # .gr-checkbox-group label input[type="checkbox"] {
@@ -78,14 +78,14 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 headers = ['Rank'] + check_box['essential'] + fields
 # df = overall_table.copy()
 
-#
+# Ensure all requested columns exist
 available_headers = [h for h in headers if h in overall_table.columns]
 
 original_columns = overall_table.columns.tolist()
 available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
 
 
-#
+# If no columns are available, return an empty DataFrame with basic columns
 if not available_headers:
     available_headers = ['Rank'] + check_box['essential']
 
@@ -159,7 +159,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 headers = ['Rank'] + fields
 df = table.copy()
 
-#
+# Filter data
 df['flag'] = df.apply(lambda row: (
     row['Algorithm'] in algos and
     row['Dataset'] in datasets and
@@ -169,12 +169,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
 df = df[df['flag']].copy()
 df.pop('flag')
 
-#
+# Group by dataset and calculate ranking within each group based on Score
 if 'Score' in df.columns:
-    #
+    # Create a temporary ranking column
     df['Rank'] = df.groupby('Dataset')['Score'].rank(method='first', ascending=False)
 
-    #
+    # Ensure ranking is integer
    df['Rank'] = df['Rank'].astype(int)
 
 
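The functional code touched above boils down to two pandas patterns: keeping only the requested headers that actually exist in the table, and ranking rows within each Dataset group by Score. A minimal standalone sketch of both, using invented toy data rather than the leaderboard's real tables:

# Toy illustration of the column filtering and per-dataset ranking above.
# Data, algorithm, and dataset names here are invented.
import pandas as pd

overall = pd.DataFrame({
    'Algorithm': ['A', 'B', 'A', 'B'],
    'Dataset':   ['d1', 'd1', 'd2', 'd2'],
    'Score':     [80.0, 90.0, 85.0, 75.0],
})

# Keep only requested headers that exist, preserving the table's column order
headers = ['Rank', 'Algorithm', 'Dataset', 'Score', 'Not-A-Column']
available = [h for h in headers if h in overall.columns]
available = sorted(available, key=overall.columns.tolist().index)

# Rank within each Dataset by descending Score
overall['Rank'] = (overall.groupby('Dataset')['Score']
                   .rank(method='first', ascending=False)
                   .astype(int))

print(overall[['Rank'] + available])

With method='first', ties are broken by row order, so every row gets a distinct integer rank within its dataset, which is why the result can safely be cast to int.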
gen_table.py
CHANGED
@@ -34,14 +34,14 @@ def BUILD_L1_DF(results, fields):
 check_box = {}
 check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
 
-#
+# First check which columns exist in the actual data structure
 sample_data = next(iter(results.values()))
 available_fields = []
 for field in fields:
     if field in sample_data:
         available_fields.append(field)
 
-#
+# Build column names, ensure they match exactly with those in generate_table function
 score_columns = [f"{field}-Score" for field in available_fields]
 cost_columns = [f"{field}-Cost($)" for field in available_fields]
 
@@ -134,7 +134,7 @@ def generate_table(results, fields):
 res[k].append(meta[k])
 scores, costs = [], []
 
-#
+# Ensure column names format matches with BUILD_L1_DF
 for d in fields:
     if d in item:
         score = item[d].get("Score")
@@ -149,12 +149,12 @@ def generate_table(results, fields):
 res[f"{d}-Score"].append(None)
 res[f"{d}-Cost($)"].append(None)
 
-#
+# Calculate average score
 res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)
 
 df = pd.DataFrame(res)
 
-#
+# Sorting and ranking logic remains unchanged
 valid = df[~pd.isna(df['Avg Score'])].copy()
 missing = df[pd.isna(df['Avg Score'])].copy()
 
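The new comments stress that generate_table must emit exactly the "{field}-Score" / "{field}-Cost($)" column names that BUILD_L1_DF constructs. A hedged sketch of that naming convention and of the Avg Score aggregation, on an invented miniature results dict (field and algorithm names are illustrative, not the project's data):

# Invented miniature of the results dict, only to show the shared
# "{field}-Score" / "{field}-Cost($)" naming and the Avg Score aggregation.
import numpy as np
import pandas as pd

results = {
    'algo-A': {'bench-1': {'Score': 80.0, 'Cost($)': 0.10},
               'bench-2': {'Score': 60.0, 'Cost($)': 0.20}},
    'algo-B': {'bench-1': {'Score': 90.0, 'Cost($)': 0.30}},
}
fields = ['bench-1', 'bench-2']

rows = []
for name, item in results.items():
    row, scores = {'Algorithm': name}, []
    for d in fields:
        score = item.get(d, {}).get('Score')
        row[f"{d}-Score"] = score                 # must match BUILD_L1_DF's score_columns
        row[f"{d}-Cost($)"] = item.get(d, {}).get('Cost($)')
        if score is not None:
            scores.append(score)
    row['Avg Score'] = round(np.mean(scores), 2) if scores else None
    rows.append(row)

df = pd.DataFrame(rows)
valid = df[~pd.isna(df['Avg Score'])].copy()      # rows that have an average
missing = df[pd.isna(df['Avg Score'])].copy()     # rows without any score
print(df)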
meta_data.py
CHANGED
@@ -26,12 +26,19 @@ LEADERBOARD_MD['MATH_MAIN'] = f"""
 - Cost: The cost on each math Benchmarks (the lower the better).
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
+- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
 
 LEADERBOARD_MD['MATH_DETAIL'] = f"""
 ## Math task detail Evaluation Results
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}
+- Metrics:
+  - Score: The evaluation score on each math Benchmarks (the higher the better).
+  - Pass rate: The percentage of response that are valid, where a response is valid if it is neither empty nor null.
+  - Cost: The cost on each math Benchmarks (the lower the better).
+  - Rank: The rank on each math Benchmarks (the lower the better).
+
 - default parameters: temperature=0.0
 - LLM prices:
   - gpt-3.5-turbo:
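The new Pass rate bullet defines a valid response as one that is neither empty nor null. A tiny illustration of that definition on invented responses (treating whitespace-only output as empty is an extra assumption here):

# Invented sample responses, only to illustrate the Pass rate definition above:
# valid = neither empty nor null (whitespace-only treated as empty, an assumption).
responses = ["42", "", None, "x = 7", "   "]
valid = [r for r in responses if r is not None and r.strip() != ""]
pass_rate = 100.0 * len(valid) / len(responses)
print(f"Pass rate: {pass_rate:.1f}%")  # 40.0% on this toy list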
src/detail_math_score.json
CHANGED
@@ -273,10 +273,10 @@
             }
         }
     },
-    "ReAct-Pro": {
+    "ReAct-Pro*": {
         "gpt-3.5-turbo": {
             "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
@@ -309,7 +309,7 @@
         },
         "Doubao-lite-32k": {
             "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },
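Because the top-level key and META["Algorithm"] are renamed together, a small consistency check can catch future mismatches. This is only a sketch and assumes every top-level entry in the file follows the algorithm -> LLM -> {"META": {...}} layout visible in this hunk:

# Sketch of a consistency check; layout assumption noted above.
import json

with open("src/detail_math_score.json", encoding="utf-8") as f:
    data = json.load(f)

for algo, llms in data.items():
    for llm, entry in llms.items():
        meta = entry.get("META", {})
        # The dict key and META["Algorithm"] should agree, e.g. both "ReAct-Pro*"
        assert meta.get("Algorithm") == algo, (algo, llm, meta.get("Algorithm"))
print("all META Algorithm fields match their top-level keys")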