sunhill committed on
Commit
5689a44
·
1 Parent(s): 03c869a

regular input

Files changed (2)
  1. spice.py +62 -61
  2. tests.py +12 -6
spice.py CHANGED
@@ -137,12 +137,20 @@ class SPICE(evaluate.Metric):
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features(
-                {
-                    "predictions": datasets.List(datasets.Value("string")),
-                    "references": datasets.List(datasets.Value("string")),
-                }
-            ),
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string"),
+                        "references": datasets.Value("string"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string"),
+                        "references": datasets.Sequence(datasets.Value("string")),
+                    }
+                ),
+            ],
             # Homepage of the module for documentation
             homepage="https://huggingface.co/spaces/sunhill/spice",
             # Additional links to the codebase or references
@@ -182,51 +190,42 @@ class SPICE(evaluate.Metric):
 
     def _compute_batch(self, scores: List[Dict]) -> Dict[str, float]:
         """Compute average scores over all images in the batch."""
-        aggregate_scores = {}
+
+        # Initialize aggregate_scores with zero values
+        aggregate_scores = {
+            "pr": 0.0,
+            "re": 0.0,
+            "f": 0.0,
+            "fn": 0.0,
+            "numImages": 0.0,
+            "fp": 0.0,
+            "tp": 0.0,
+        }
         num_images = len(scores)
         if num_images == 0:
             return aggregate_scores
 
-        # Initialize aggregate_scores with zero values
-        for category in scores[0].keys():
-            aggregate_scores[category] = {
-                "pr": 0.0,
-                "re": 0.0,
-                "f": 0.0,
-                "fn": 0.0,
-                "numImages": 0.0,
-                "fp": 0.0,
-                "tp": 0.0,
-            }
-
         # Sum up scores for each category
         for score in scores:
-            for category, score_dict in score.items():
-                for k, v in score_dict.items():
-                    if k in ["fn", "fp", "tp"]:
-                        aggregate_scores[category][k] += v
-            aggregate_scores[category]["numImages"] += 1
+            for k, v in score.items():
+                if k in ["fn", "fp", "tp"]:
+                    aggregate_scores[k] += v
+            aggregate_scores["numImages"] += 1
 
         # Compute average scores
-        for category, score_dict in aggregate_scores.items():
-            tp = score_dict["tp"]
-            fp = score_dict["fp"]
-            fn = score_dict["fn"]
-
-            precision = tp / (tp + fp) if (tp + fp) > 0 else float("nan")
-            recall = tp / (tp + fn) if (tp + fn) > 0 else float("nan")
-            f_score = (
-                2 * precision * recall / (precision + recall)
-                if precision is not None
-                and recall is not None
-                and (precision + recall) > 0
-                else float("nan")
-            )
-
-            aggregate_scores[category]["pr"] = precision
-            aggregate_scores[category]["re"] = recall
-            aggregate_scores[category]["f"] = f_score
-
+        tp = aggregate_scores["tp"]
+        fp = aggregate_scores["fp"]
+        fn = aggregate_scores["fn"]
+        precision = tp / (tp + fp) if (tp + fp) > 0 else float("nan")
+        recall = tp / (tp + fn) if (tp + fn) > 0 else float("nan")
+        f_score = (
+            2 * precision * recall / (precision + recall)
+            if precision is not None and recall is not None and (precision + recall) > 0
+            else float("nan")
+        )
+        aggregate_scores["pr"] = precision
+        aggregate_scores["re"] = recall
+        aggregate_scores["f"] = f_score
         return aggregate_scores
 
     def _compute(self, predictions, references, spice_name="All"):
@@ -237,11 +236,19 @@ class SPICE(evaluate.Metric):
         )
         input_data = []
         for i, (prediction, reference) in enumerate(zip(predictions, references)):
-            assert len(prediction) == 1 and len(reference) >= 1, (
-                "SPICE expects exactly one prediction and at least one reference per image. "
-                f"Got {len(prediction)} predictions and {len(reference)} references."
+            assert isinstance(prediction, str), (
+                "Each prediction should be a string. "
+                f"Got {type(prediction)} for image {i}."
+            )
+            if isinstance(reference, str):
+                reference = [reference]
+            assert isinstance(reference, list) and all(
+                isinstance(ref, str) for ref in reference
+            ), (
+                "Each reference should be a list of strings. "
+                f"Got {type(reference)} with elements of type {[type(ref) for ref in reference]} for index {i}."
             )
-            input_data.append({"image_id": i, "test": prediction[0], "refs": reference})
+            input_data.append({"image_id": i, "test": prediction, "refs": reference})
 
         in_file = tempfile.NamedTemporaryFile(delete=False)
         in_file.write(json.dumps(input_data, indent=2).encode("utf-8"))
@@ -281,17 +288,11 @@ class SPICE(evaluate.Metric):
         os.remove(in_file.name)
         os.remove(out_file.name)
 
-        img_id_to_scores = {item["image_id"]: item["scores"] for item in results}
-        scores = []
-        for image_id in range(len(predictions)):
-            # Convert none to NaN before saving scores over subcategories
-            score_set = {}
-            for category, score_tuple in img_id_to_scores[image_id].items():
-                score_set[category] = {
-                    k: self.float_convert(v) for k, v in score_tuple.items()
-                }
-            scores.append(score_set)
-        result_score = {}
-        for k, v in self._compute_batch(scores)[spice_name].items():
-            result_score["spice_" + spice_name.lower() + "_" + k] = v
-        return result_score
+        img_id_to_scores = {
+            item["image_id"]: item["scores"][spice_name] for item in results
+        }
+        scores = [
+            {k: self.float_convert(v) for k, v in img_id_to_scores[image_id].items()}
+            for image_id in range(len(predictions))
+        ]
+        return {f"spice_{k}": v for k, v in self._compute_batch(scores).items()}
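With this change the metric takes "regular input": one plain string per prediction, and per image either a single reference string or a list of reference strings. A minimal usage sketch under those assumptions (the evaluate.load path is illustrative; it is not shown in this diff):

import evaluate

metric = evaluate.load("sunhill/spice")  # assumed load path for this space; not part of the diff
results = metric.compute(
    predictions=["train traveling down a track in front of a road"],
    references=[["a train traveling down tracks next to lights"]],
)
# _compute now flattens the selected spice_name category into keys such as
# "spice_pr", "spice_re", "spice_f", "spice_tp", "spice_fp", "spice_fn", "spice_numImages"
print(results)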
 
 
 
 
 
 
tests.py CHANGED
@@ -3,7 +3,7 @@ import evaluate
 
 test_cases = [
     {
-        "predictions": [["train traveling down a track in front of a road"]],
+        "predictions": ["train traveling down a track in front of a road"],
         "references": [
             [
                 "a train traveling down tracks next to lights",
@@ -12,12 +12,18 @@ test_cases = [
                 "a passenger train pulls into a train station",
                 "a train coming down the tracks arriving at a station",
             ]
-        ]
+        ],
+    },
+    {
+        "predictions": ["birthday cake sitting on top of a white plate"],
+        "references": [
+            "a blue plate filled with marshmallows chocolate chips and banana"
+        ],
     },
     {
         "predictions": [
-            ["plane is flying through the sky"],
-            ["birthday cake sitting on top of a white plate"],
+            "plane is flying through the sky",
+            "birthday cake sitting on top of a white plate",
         ],
         "references": [
             [
@@ -28,7 +34,7 @@
                 "the plane is flying over top of the cars",
             ],
             ["a blue plate filled with marshmallows chocolate chips and banana"],
-        ]
+        ],
     },
 ]
 
@@ -37,7 +43,7 @@ for i, test_case in enumerate(test_cases):
     results = metric.compute(
         predictions=test_case["predictions"], references=test_case["references"]
    )
-    print(f"Test case {i+1}:")
+    print(f"Test case {i + 1}:")
     print("Predictions:", test_case["predictions"])
     print("References:", test_case["references"])
    print(results)
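For the second test case above, where the reference is a single string, the updated _compute wraps it into a list before serialization. The content written to the temporary input file would look roughly like the following sketch (derived from the diff, not actual output of this commit):

# illustrative input_data for one image with a single string reference
input_data = [
    {
        "image_id": 0,
        "test": "birthday cake sitting on top of a white plate",
        "refs": ["a blue plate filled with marshmallows chocolate chips and banana"],
    }
]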