sunhill committed
Commit 92aad95 · 1 Parent(s): ec09ccb

update SPICE metric

Files changed (7)
  1. .gitignore +2 -1
  2. .spice.py +0 -95
  3. README.md +5 -5
  4. app.py +1 -1
  5. get_stanford_models.sh +0 -23
  6. spice.py +132 -53
  7. tests.py +42 -12
.gitignore CHANGED
@@ -218,4 +218,5 @@ __marimo__/
  # Custom additions
  lib/stanford*.jar
  !lib/
- **/.DS_Store
+ **/.DS_Store
+ .vscode/
.spice.py DELETED
@@ -1,95 +0,0 @@
- import os
- import subprocess
- import json
- import numpy as np
- import tempfile
-
- # Assumes spice.jar is in the same directory as spice.py. Change as needed.
- SPICE_JAR = "spice-1.0.jar"
- TEMP_DIR = "tmp"
- CACHE_DIR = "cache"
-
-
- class Spice:
-     """
-     Main Class to compute the SPICE metric
-     """
-
-     def float_convert(self, obj):
-         try:
-             return float(obj)
-         except (ValueError, TypeError):
-             return np.nan
-
-     def compute_score(self, gts, res):
-         assert sorted(gts.keys()) == sorted(res.keys())
-         imgIds = sorted(gts.keys())
-
-         # Prepare temp input file for the SPICE scorer
-         input_data = []
-         for id in imgIds:
-             hypo = res[id]
-             ref = gts[id]
-
-             # Sanity check.
-             assert type(hypo) is list
-             assert len(hypo) == 1
-             assert type(ref) is list
-             assert len(ref) >= 1
-
-             input_data.append({"image_id": id, "test": hypo[0], "refs": ref})
-
-         cwd = os.path.dirname(os.path.abspath(__file__))
-         temp_dir = os.path.join(cwd, TEMP_DIR)
-         if not os.path.exists(temp_dir):
-             os.makedirs(temp_dir)
-         in_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
-         json.dump(input_data, in_file, indent=2)
-         in_file.close()
-
-         # Start job
-         out_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
-         out_file.close()
-         cache_dir = os.path.join(cwd, CACHE_DIR)
-         if not os.path.exists(cache_dir):
-             os.makedirs(cache_dir)
-         spice_cmd = [
-             "java",
-             "-jar",
-             "-Xmx8G",
-             SPICE_JAR,
-             in_file.name,
-             "-cache",
-             cache_dir,
-             "-out",
-             out_file.name,
-             "-subset",
-             "-silent",
-         ]
-         subprocess.check_call(spice_cmd, cwd=os.path.dirname(os.path.abspath(__file__)))
-
-         # Read and process results
-         with open(out_file.name) as data_file:
-             results = json.load(data_file)
-         os.remove(in_file.name)
-         os.remove(out_file.name)
-
-         imgId_to_scores = {}
-         spice_scores = []
-         for item in results:
-             imgId_to_scores[item["image_id"]] = item["scores"]
-             spice_scores.append(self.float_convert(item["scores"]["All"]["f"]))
-         average_score = np.mean(np.array(spice_scores))
-         scores = []
-         for image_id in imgIds:
-             # Convert none to NaN before saving scores over subcategories
-             score_set = {}
-             for category, score_tuple in imgId_to_scores[image_id].iteritems():
-                 score_set[category] = {
-                     k: self.float_convert(v) for k, v in score_tuple.items()
-                 }
-             scores.append(score_set)
-         return average_score, scores
-
-     def method(self):
-         return "SPICE"
README.md CHANGED
@@ -5,22 +5,22 @@ tags:
  - metric
  description: "TODO: add a description here"
  sdk: gradio
- sdk_version: 3.19.1
+ sdk_version: 5.45.0
  app_file: app.py
  pinned: false
  ---

  # Metric Card for SPICE

- ***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
+ ***Module Card Instructions:*** *This module calculates the SPICE metric for evaluating image captioning models.*

  ## Metric Description
- *Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
+ *SPICE (Semantic Propositional Image Caption Evaluation) is a metric for evaluating the quality of image captions. It measures the semantic similarity between the generated captions and a set of reference captions by analyzing the underlying semantic propositions.*

  ## How to Use
- *Give general statement of how to use the metric*
+ *To use the SPICE metric, you need to provide a set of generated captions and a set of reference captions. The metric will then compute the SPICE score based on the semantic similarity between the two sets of captions.*

- *Provide simplest possible example for using the metric*
+ *Here is a simple example of using the SPICE metric:*

  ### Inputs
  *List all input arguments in the format below*
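(A minimal usage sketch for the "simple example" promised in the README above. It assumes the Space ID `sunhill/spice` used in `app.py`, a working Java runtime with the SPICE and Stanford CoreNLP jars in place, and the input format the updated `spice.py` expects: each prediction is a single-element list of captions, each reference a list of caption strings.)

```python
import evaluate

# Load the SPICE metric from this Space. Computing a score requires Java and the
# spice-1.0.jar / Stanford CoreNLP jars resolved by _download_and_prepare.
spice = evaluate.load("sunhill/spice")

results = spice.compute(
    predictions=[["a train traveling down a track in front of a road"]],
    references=[[
        "a train traveling down tracks next to lights",
        "a passenger train pulls into a train station",
    ]],
)
print(results)
```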
app.py CHANGED
@@ -3,4 +3,4 @@ from evaluate.utils import launch_gradio_widget


  module = evaluate.load("sunhill/spice")
- launch_gradio_widget(module)
+ launch_gradio_widget(module)
get_stanford_models.sh DELETED
@@ -1,23 +0,0 @@
- #!/usr/bin/env sh
- # This script downloads the Stanford CoreNLP models.
-
- CORENLP=stanford-corenlp-full-2015-12-09
- SPICELIB=lib
-
- DIR="$( cd "$(dirname "$0")" ; pwd -P )"
- cd $DIR
-
- echo "Downloading..."
-
- wget http://nlp.stanford.edu/software/$CORENLP.zip
-
- echo "Unzipping..."
-
- mkdir -p .tmp
- unzip $CORENLP.zip -d .tmp/
- mv .tmp/$CORENLP/stanford-corenlp-3.6.0.jar $SPICELIB/
- mv .tmp/$CORENLP/stanford-corenlp-3.6.0-models.jar $SPICELIB/
- rm -f stanford-corenlp-full-2015-12-09.zip
- rm -rf .tmp
-
- echo "Done."
spice.py CHANGED
@@ -1,68 +1,58 @@
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """TODO: Add a description here."""
+ """This module implements the SPICE metric."""
+
+ import os
+ import shutil
+ import subprocess
+ import json
+ import tempfile

  import evaluate
  import datasets
+ import numpy as np
+ from evaluate.utils.logging import get_logger
+
+ logger = get_logger(__name__)

+ CORENLP = "stanford-corenlp-full-2015-12-09"
+ SPICELIB = "lib"
+ SPICE_JAR = "spice-1.0.jar"

- # TODO: Add BibTeX citation
  _CITATION = """\
- @InProceedings{huggingface:module,
- title = {A great new module},
- authors={huggingface, Inc.},
- year={2020}
+ @inproceedings{spice2016,
+   title = {SPICE: Semantic Propositional Image Caption Evaluation},
+   author = {Peter Anderson and Basura Fernando and Mark Johnson and Stephen Gould},
+   year = {2016},
+   booktitle = {ECCV}
  }
  """

- # TODO: Add description of the module here
  _DESCRIPTION = """\
- This new module is designed to solve this great ML task and is crafted with a lot of care.
+ This module is designed to evaluate the quality of image captions using the SPICE metric.
+ It compares generated captions with reference captions to assess their semantic similarity.
  """

-
- # TODO: Add description of the arguments of the module here
  _KWARGS_DESCRIPTION = """
- Calculates how good are predictions given some references, using certain scores
+ Compute SPICE score.
  Args:
      predictions: list of predictions to score. Each predictions
-         should be a string with tokens separated by spaces.
+         should be a list containing a single caption string.
      references: list of reference for each prediction. Each
-         reference should be a string with tokens separated by spaces.
+         reference should be a list of reference caption strings.
  Returns:
-     accuracy: description of the first score,
-     another_score: description of the second score,
+     spice: average SPICE F-score over all images
  Examples:
-     Examples should be written in doctest format, and should illustrate how
-     to use the function.
-
-     >>> my_new_module = evaluate.load("my_new_module")
-     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+     >>> metric = evaluate.load("sunhill/spice")
+     >>> results = metric.compute(predictions=[["a cat on a mat"]], references=[["a cat is on the mat"]])
      >>> print(results)
-     {'accuracy': 1.0}
+     {'spice': 0.5}
  """

- # TODO: Define external resources urls if needed
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
-

  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
  class SPICE(evaluate.Metric):
-     """TODO: Short description of my evaluation module."""
+     """This module implements the SPICE metric for evaluating image captioning models."""

      def _info(self):
-         # TODO: Specifies the evaluate.EvaluationModuleInfo object
          return evaluate.MetricInfo(
              # This is the description that will appear on the modules page.
              module_type="metric",
@@ -70,26 +60,115 @@ class SPICE(evaluate.Metric):
              citation=_CITATION,
              inputs_description=_KWARGS_DESCRIPTION,
              # This defines the format of each prediction and reference
-             features=datasets.Features({
-                 'predictions': datasets.Value('int64'),
-                 'references': datasets.Value('int64'),
-             }),
+             features=datasets.Features(
+                 {
+                     "predictions": datasets.List(datasets.Value("string")),
+                     "references": datasets.List(datasets.Value("string")),
+                 }
+             ),
              # Homepage of the module for documentation
-             homepage="http://module.homepage",
+             homepage="https://huggingface.co/spaces/sunhill/spice",
              # Additional links to the codebase or references
-             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-             reference_urls=["http://path.to.reference.url/new_module"]
+             codebase_urls=[
+                 "https://github.com/peteanderson80/SPICE",
+                 "https://github.com/EricWWWW/image-caption-metrics",
+             ],
+             reference_urls=["https://panderson.me/spice"],
          )

      def _download_and_prepare(self, dl_manager):
          """Optional: download external resources useful to compute the scores"""
-         # TODO: Download external resources if needed
-         pass
+         if os.path.exists("lib/stanford-corenlp-3.6.0-models.jar") and os.path.exists(
+             "lib/stanford-corenlp-3.6.0.jar"
+         ):
+             logger.info("`stanford-corenlp` already exists. Skip downloading.")
+             return
+         logger.info("Downloading `stanford-corenlp`...")
+         url = f"http://nlp.stanford.edu/software/{CORENLP}.zip"
+         extracted_path = dl_manager.download_and_extract(url)
+         tmp_path = os.path.join(extracted_path, CORENLP)
+         shutil.copyfile(
+             os.path.join(tmp_path, "stanford-corenlp-3.6.0-models.jar"),
+             os.path.join(SPICELIB, "stanford-corenlp-3.6.0-models.jar"),
+         )
+         shutil.copyfile(
+             os.path.join(tmp_path, "stanford-corenlp-3.6.0.jar"),
+             os.path.join(SPICELIB, "stanford-corenlp-3.6.0.jar"),
+         )
+         logger.info(f"`stanford-corenlp` has been downloaded to {SPICELIB}")
+
+     def float_convert(self, obj):
+         try:
+             return float(obj)
+         except (ValueError, TypeError):
+             return np.nan

      def _compute(self, predictions, references):
          """Returns the scores"""
-         # TODO: Compute the different scores of the module
-         accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-         return {
-             "accuracy": accuracy,
-         }
+         assert len(predictions) == len(references), (
+             "The number of predictions and references should be the same. "
+             f"Got {len(predictions)} predictions and {len(references)} references."
+         )
+         input_data = []
+         for i, (prediction, reference) in enumerate(zip(predictions, references)):
+             assert len(prediction) == 1 and len(reference) >= 1, (
+                 "SPICE expects exactly one prediction and at least one reference per image. "
+                 f"Got {len(prediction)} predictions and {len(reference)} references."
+             )
+             input_data.append({"image_id": i, "test": prediction[0], "refs": reference})
+
+         # Write the input records to a temp file for the SPICE jar (text mode so json.dump works).
+         in_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
+         json.dump(input_data, in_file, indent=2)
+         in_file.close()
+
+         out_file = tempfile.NamedTemporaryFile(delete=False)
+         out_file.close()
+         with tempfile.TemporaryDirectory() as cache_dir:
+             spice_cmd = [
+                 "java",
+                 "-jar",
+                 "-Xmx8G",
+                 SPICE_JAR,
+                 in_file.name,
+                 "-cache",
+                 cache_dir,
+                 "-out",
+                 out_file.name,
+                 "-subset",
+                 "-silent",
+             ]
+             try:
+                 subprocess.run(
+                     spice_cmd,
+                     check=True,
+                     stdout=subprocess.PIPE,
+                     stderr=subprocess.PIPE,
+                 )
+             except subprocess.CalledProcessError as e:
+                 raise RuntimeError(
+                     f"SPICE command '{' '.join(spice_cmd)}' returned non-zero exit status {e.returncode}. "
+                     f"stderr: {e.stderr.decode('utf-8')}"
+                 ) from e
+
+         with open(out_file.name, "r") as f:
+             results = json.load(f)
+         os.remove(in_file.name)
+         os.remove(out_file.name)
+
+         img_id_to_scores = {}
+         spice_scores = []
+         for item in results:
+             img_id_to_scores[item["image_id"]] = item["scores"]
+             spice_scores.append(self.float_convert(item["scores"]["All"]["f"]))
+         average_score = np.mean(np.array(spice_scores))
+         scores = []
+         for image_id in range(len(predictions)):
+             # Convert None to NaN before saving scores over subcategories
+             score_set = {}
+             for category, score_tuple in img_id_to_scores[image_id].items():
+                 score_set[category] = {
+                     k: self.float_convert(v) for k, v in score_tuple.items()
+                 }
+             scores.append(score_set)
+         return {"spice": average_score, "scores": scores}
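(For reference, a standalone sketch of the JSON interchange that `_compute` sets up around `spice-1.0.jar`. The record fields `image_id`/`test`/`refs`, the jar flags, and the `scores["All"]["f"]` lookup are taken from the diff above; everything else, including the exact per-category keys in the jar's output, is an assumption. Running it requires Java plus `spice-1.0.jar` and the Stanford CoreNLP jars in `lib/`.)

```python
import json
import subprocess
import tempfile

# One record per image, mirroring the input_data built in SPICE._compute:
# "test" is the candidate caption, "refs" the list of reference captions.
records = [
    {
        "image_id": 0,
        "test": "a train traveling down a track in front of a road",
        "refs": ["a passenger train pulls into a train station"],
    }
]

in_file = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False)
json.dump(records, in_file, indent=2)
in_file.close()
out_path = in_file.name + ".out.json"

with tempfile.TemporaryDirectory() as cache_dir:
    # Same invocation the module builds (flags copied from spice_cmd above).
    subprocess.run(
        [
            "java", "-jar", "-Xmx8G", "spice-1.0.jar", in_file.name,
            "-cache", cache_dir, "-out", out_path, "-subset", "-silent",
        ],
        check=True,
    )

with open(out_path) as f:
    results = json.load(f)

# The module's final score is the mean of the "All" F-score across images.
print([item["scores"]["All"]["f"] for item in results])
```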
tests.py CHANGED
@@ -1,17 +1,47 @@
+ import evaluate
+
+
  test_cases = [
      {
-         "predictions": [0, 0],
-         "references": [1, 1],
-         "result": {"metric_score": 0}
+         "predictions": [["train traveling down a track in front of a road"]],
+         "references": [
+             [
+                 "a train traveling down tracks next to lights",
+                 "a blue and silver train next to train station and trees",
+                 "a blue train is next to a sidewalk on the rails",
+                 "a passenger train pulls into a train station",
+                 "a train coming down the tracks arriving at a station",
+             ]
+         ],
+         "result": {"metric_score": 0},
      },
      {
-         "predictions": [1, 1],
-         "references": [1, 1],
-         "result": {"metric_score": 1}
-     },
-     {
-         "predictions": [1, 0],
-         "references": [1, 1],
-         "result": {"metric_score": 0.5}
+         "predictions": [
+             ["plane is flying through the sky"],
+             ["birthday cake sitting on top of a white plate"],
+         ],
+         "references": [
+             [
+                 "a large jetliner flying over a traffic filled street",
+                 "an airplane flies low in the sky over a city street",
+                 "an airplane flies over a street with many cars",
+                 "an airplane comes in to land over a road full of cars",
+                 "the plane is flying over top of the cars",
+             ],
+             ["a blue plate filled with marshmallows chocolate chips and banana"],
+         ],
+         "result": {"metric_score": 1},
      }
- ]
+ ]
+
+ metric = evaluate.load("./spice.py")
+ for i, test_case in enumerate(test_cases):
+     results = metric.compute(
+         predictions=test_case["predictions"], references=test_case["references"]
+     )
+     print(f"Test case {i+1}:")
+     print("Predictions:", test_case["predictions"])
+     print("References:", test_case["references"])
+     print("Results:", results)
+     print("Expected:", test_case["result"])
+     print()