# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Accuracy metric for the Test of Time benchmark by Bahar et al. (2025)."""

import ast
import json
from typing import Literal

import datasets
import evaluate


_CITATION = """\
@InProceedings{huggingface:module,
title = {Test of Time Accuracy},
authors={Auss Abbood},
year={2025}
}
"""

_DESCRIPTION = """\
The Test of Time (ToT) benchmark expects models to format their answers as a JSON
object with an explanation field and an answer field that follows a predefined
format. The metric extracts JSON objects from the model's output, keeps only the
first one, drops the explanation field, and compares the result with the reference
answer.
"""

_KWARGS_DESCRIPTION = """
Compares the answer extracted from the model's output with the reference answer.
Args:
    predictions: list of model outputs to score. Each prediction should be a string
        containing a JSON object with an "answer" field and, optionally, an
        "explanation" field.
    references: list of reference answers, one per prediction. For the "semantic"
        subset each reference is compared as a plain string; for the "arithmetic"
        subset each reference is parsed as a JSON object or Python literal before
        comparison.
    subset: either "arithmetic" or "semantic". For "semantic", only the "answer"
        field of the extracted JSON is compared; for "arithmetic", the whole JSON
        object (minus the "explanation" field) is compared.
    return_average: if True (default), return the mean accuracy over all examples;
        otherwise return the list of per-example booleans.
Returns:
    accuracy: mean accuracy as a float, or a list of per-example booleans if
        return_average is False.
Examples:
    >>> metric = TestofTimeAccuracy()
    >>> results = metric.compute(
    ...     predictions=['{"explanation": "Event A precedes event B.", "answer": "B"}'],
    ...     references=["B"],
    ...     subset="semantic",
    ... )
    >>> print(results)
    {'accuracy': 1.0}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TestofTimeAccuracy(evaluate.Metric):
    """Accuracy metric for the Test of Time benchmark by Bahar et al.
    (2025)."""

    __test__ = False

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference;
            # `subset` and `return_average` are passed as keyword arguments
            # to `compute()` rather than as per-example features.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
        )

    @staticmethod
    def _extract_first_json_object(s: str) -> dict | None:
        """Return the first JSON object found in `s`, or None if there is none."""
        decoder = json.JSONDecoder()
        idx, end = 0, len(s)
        while idx < end:
            try:
                obj, next_idx = decoder.raw_decode(s, idx)
                idx = next_idx
                if isinstance(obj, dict):
                    return obj
            except ValueError:
                idx += 1
        return None

    @staticmethod
    def _pop_explanation(d):
        """Drop the "explanation" field from an extracted JSON object."""
        if isinstance(d, dict):
            d.pop("explanation", None)
        return d

    @staticmethod
    def _get_answer(d):
        """Return the "answer" field of an extracted JSON object."""
        if isinstance(d, dict):
            return d.get("answer", None)
        return d

    @staticmethod
    def _parse_label(s):
        """Parse a string that could be a JSON object or a Python dict."""
        try:
            return json.loads(s)
        except json.JSONDecodeError:
            try:
                # Safe: only parses literals, does not execute code
                return ast.literal_eval(s)
            except (ValueError, SyntaxError):
                return None

    def _compute(
        self,
        predictions,
        references,
        subset: Literal["arithmetic", "semantic"],
        return_average: bool = True,
    ):
        """Returns the accuracy scores."""
        predictions = [self._extract_first_json_object(p) for p in predictions]
        if subset == "semantic":
            predictions = [self._get_answer(p) for p in predictions]
        elif subset == "arithmetic":
            predictions = [self._pop_explanation(p) for p in predictions]
            references = [self._parse_label(r) for r in references]
        else:
            raise ValueError(f"Invalid subset: {subset}")
        accuracy = [i == j for i, j in zip(predictions, references)]
        if return_average:
            return {"accuracy": sum(accuracy) / len(accuracy)}
        return {"accuracy": accuracy}
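

# Minimal usage sketch (not part of the metric itself). It assumes `evaluate` and
# `datasets` are installed and that the metric class can be instantiated directly;
# the prediction and reference strings below are illustrative, not taken from the
# ToT benchmark data.
if __name__ == "__main__":
    # Hypothetical model output and gold answer for the "semantic" subset, where
    # only the "answer" field is compared against the raw reference string.
    prediction = '{"explanation": "Event A happened before event B.", "answer": "A"}'
    reference = "A"
    metric = TestofTimeAccuracy()
    result = metric.compute(
        predictions=[prediction],
        references=[reference],
        subset="semantic",
    )
    print(result)  # expected: {'accuracy': 1.0}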