add CIDEr score

Files changed:
- README.md (+41, -19)
- app.py (+44, -2)
- cider.py (+60, -54)
- cider_scorer.py (+204, -0)
- tests.py (+38, -12)
README.md
CHANGED
---
title: CIDEr
tags:
- evaluate
- metric
description: "CIDEr (Consensus-based Image Description Evaluation) is a metric used to evaluate the quality of image captions by measuring their similarity to human-generated reference captions."
sdk: gradio
sdk_version: 5.45.0
app_file: app.py
pinned: false
---

# Metric Card for CIDEr

***Module Card Instructions:*** *This module implements the CIDEr metric for image captioning evaluation.*

## Metric Description

CIDEr (Consensus-based Image Description Evaluation) is a metric used to evaluate the quality of image captions by measuring their similarity to human-generated reference captions. It does this by comparing the n-grams of the candidate caption to the n-grams of the reference captions, and measuring how many n-grams are shared between the candidate and the references.
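For a precise definition, the score can be sketched as in the original paper (Vedantam et al., 2015), where $g^n(\cdot)$ is the vector of TF-IDF weights over all n-grams of length $n$, $c_i$ is the candidate caption, and $S_i = \{s_{i1}, \dots, s_{im}\}$ is the set of $m$ reference captions for image $i$:

```latex
% Consensus for a single n-gram length: average cosine similarity between
% the candidate's TF-IDF vector and each reference's TF-IDF vector.
\mathrm{CIDEr}_n(c_i, S_i) = \frac{1}{m} \sum_{j=1}^{m}
    \frac{g^n(c_i) \cdot g^n(s_{ij})}
         {\lVert g^n(c_i) \rVert \, \lVert g^n(s_{ij}) \rVert}

% Overall score: uniform average over n-gram lengths (this module uses N = 4).
\mathrm{CIDEr}(c_i, S_i) = \sum_{n=1}^{N} \frac{1}{N} \, \mathrm{CIDEr}_n(c_i, S_i)
```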
## How to Use

*To use this metric, call the `compute` method with the following parameters:*

### Inputs

- **predictions** *(list of lists of strings): the generated captions to evaluate, one single-caption list per example.*
- **references** *(list of lists of strings): the reference captions for each generated caption.*

### Output Values

- **cider_score** *(float, returned in a dict): the CIDEr score, which ranges from 0 to 1, with higher scores indicating better-quality captions.*

### Examples

```python
import evaluate

metric = evaluate.load("sunhill/cider")
results = metric.compute(
    predictions=[["train traveling down a track in front of a road"]],
    references=[
        [
            "a train traveling down tracks next to lights",
            "a blue and silver train next to train station and trees",
            "a blue train is next to a sidewalk on the rails",
            "a passenger train pulls into a train station",
            "a train coming down the tracks arriving at a station",
        ]
    ],
)
print(results)
```

## Citation

```bibtex
@InProceedings{Vedantam_2015_CVPR,
    author = {Vedantam, Ramakrishna and Lawrence Zitnick, C. and Parikh, Devi},
    title = {CIDEr: Consensus-Based Image Description Evaluation},
    booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    month = {June},
    year = {2015}
}
```

## Further References

- [CIDEr](https://github.com/ramavedantam/cider)
- [Image Caption Metrics](https://github.com/EricWWWW/image-caption-metrics)
app.py
CHANGED
import sys
from pathlib import Path

import evaluate
import gradio as gr
from evaluate import parse_readme

module = evaluate.load("sunhill/cider")


def compute_cider(references, predictions):
    # The widget accepts one prediction string and a semicolon-separated
    # string of references; wrap both into the batch format compute() expects.
    predictions = [[predictions]]
    references = [[ref.strip() for ref in references.split(";") if ref.strip()]]
    return module.compute(predictions=predictions, references=references)["cider_score"]


iface = gr.Interface(
    fn=compute_cider,
    inputs=[
        gr.Textbox(
            label="References",
            placeholder="Enter the reference texts here, separated by semicolons (e.g. ref1; ref2; ref3)...",
        ),
        gr.Textbox(
            label="Prediction",
            placeholder="Enter the prediction text here; only one prediction is allowed...",
        ),
    ],
    outputs=gr.Number(label="CIDEr Score"),
    title="CIDEr Score Evaluator",
    description="Evaluate a generated caption against a set of reference captions using the CIDEr score.",
    examples=[
        [
            (
                "a train traveling down tracks next to lights; "
                "a blue and silver train next to train station and trees; "
                "a blue train is next to a sidewalk on the rails; "
                "a passenger train pulls into a train station; "
                "a train coming down the tracks arriving at a station"
            ),
            "train traveling down a track in front of a road",
        ]
    ],
    article=parse_readme(Path(sys.path[0]) / "README.md"),
)

iface.launch()
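A quick way to sanity-check the wrapper outside the Gradio UI is to call it directly. This is a minimal sketch, assuming `compute_cider` is imported from app.py; the caption strings are taken from the widget's own example:

```python
# Hedged local check: compute_cider takes (references, predictions) in that
# order -- a semicolon-separated reference string and one prediction string.
score = compute_cider(
    "a train traveling down tracks next to lights; "
    "a passenger train pulls into a train station",
    "train traveling down a track in front of a road",
)
print(score)  # a float; this implementation keeps scores in the 0-1 range
```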
cider.py
CHANGED
"""This module implements the CIDEr metric for image captioning evaluation."""

import evaluate
import datasets

from .cider_scorer import CiderScorer

_CITATION = """\
@InProceedings{Vedantam_2015_CVPR,
    author = {Vedantam, Ramakrishna and Lawrence Zitnick, C. and Parikh, Devi},
    title = {CIDEr: Consensus-Based Image Description Evaluation},
    booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    month = {June},
    year = {2015}
}
"""

_DESCRIPTION = """\
This is a metric to evaluate image captioning. It is based on the idea of
measuring the consensus between a candidate image caption and a set of
reference image captions written by humans. The CIDEr score is computed by
comparing the n-grams of the candidate caption to the n-grams of the reference
captions, and measuring how many n-grams are shared between the candidate and
the references. The score is then normalized by the length of the candidate
caption and the number of reference captions.
"""


_KWARGS_DESCRIPTION = """
CIDEr (Consensus-based Image Description Evaluation) is a metric for evaluating the quality of image captions.
It measures how similar a generated caption is to a set of reference captions written by humans.
Args:
    predictions: list of predictions to score.
    references: list of references for each prediction.
Returns:
    score: CIDEr score.
Examples:
    >>> metric = evaluate.load("sunhill/cider")
    >>> results = metric.compute(
    ...     predictions=[['train traveling down a track in front of a road']],
    ...     references=[
    ...         [
    ...             'a train traveling down tracks next to lights',
    ...             'a blue and silver train next to train station and trees',
    ...             'a blue train is next to a sidewalk on the rails',
    ...             'a passenger train pulls into a train station',
    ...             'a train coming down the tracks arriving at a station',
    ...         ]
    ...     ],
    ... )
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CIDEr(evaluate.Metric):
    """CIDEr metric."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            features=datasets.Features(
                {
                    "predictions": datasets.List(datasets.Value("string")),
                    "references": datasets.List(datasets.Value("string")),
                }
            ),
            # Homepage of the module for documentation.
            homepage="https://huggingface.co/spaces/sunhill/cider",
            # Additional links to the codebase or references.
            codebase_urls=[
                "https://github.com/ramavedantam/cider",
                "https://github.com/EricWWWW/image-caption-metrics",
            ],
            reference_urls=[
                (
                    "https://openaccess.thecvf.com/content_cvpr_2015/html/"
                    "Vedantam_CIDEr_Consensus-Based_Image_2015_CVPR_paper.html"
                )
            ],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores."""
        pass

    def _compute(self, predictions, references):
        """Returns the scores."""
        assert len(predictions) == len(references), (
            "The number of predictions and references should be the same. "
            f"Got {len(predictions)} predictions and {len(references)} references."
        )
        cider_scorer = CiderScorer(n=4, sigma=6.0)
        for pred, ref in zip(predictions, references):
            # Each prediction is a single-element list of strings, so score
            # its first (only) caption against that example's references.
            cider_scorer += (pred[0], ref)
        score, _ = cider_scorer.compute_score()
        return {"cider_score": score.item()}
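During development it can be handy to exercise the module without pulling it from the Hub. `evaluate.load` also accepts a local path (a directory containing a script named after the directory), so a sketch like the following should work; the path and caption strings are illustrative:

```python
import evaluate

# Hypothetical local path to this repo's checkout (cider/cider.py inside it).
metric = evaluate.load("path/to/cider")
print(metric.compute(
    predictions=[["a red bus parked on the street"]],
    references=[[
        "a red bus is parked by the curb",
        "a bus parked on the side of a road",
    ]],
))
```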
cider_scorer.py
ADDED
#!/usr/bin/env python
# Tsung-Yi Lin <[email protected]>
# Ramakrishna Vedantam <[email protected]>

import math
import copy
from collections import defaultdict

import numpy as np


def precook(s, n=4, out=False):
    """
    Takes a string as input and returns an object that can be given to
    either cook_refs or cook_test. This is optional: cook_refs and cook_test
    can take string arguments as well.
    :param s: string : sentence to be converted into ngrams
    :param n: int : number of ngrams for which representation is calculated
    :return: term frequency vector for occurring ngrams
    """
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i: i + k])
            counts[ngram] += 1
    return counts


def cook_refs(refs, n=4):
    """Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.
    :param refs: list of string : reference sentences for some image
    :param n: int : number of ngrams for which (ngram) representation is calculated
    :return: result (list of dict)
    """
    # lhuang: oracle will call with "average"
    return [precook(ref, n) for ref in refs]


def cook_test(test, n=4):
    """Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.
    :param test: string : hypothesis sentence for some image
    :param n: int : number of ngrams for which (ngram) representation is calculated
    :return: result (dict)
    """
    return precook(test, n, True)


class CiderScorer(object):
    """CIDEr scorer."""

    def copy(self):
        """copy the refs."""
        new = CiderScorer(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        return new

    def __init__(self, test=None, refs=None, n=4, sigma=6.0):
        """singular instance"""
        self.n = n
        self.sigma = sigma
        self.crefs = []
        self.ctest = []
        self.document_frequency = defaultdict(float)
        self.cook_append(test, refs)
        self.ref_len = None

    def cook_append(self, test, refs):
        """called by constructor and __iadd__ to avoid creating new instances."""
        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                # N.B.: -1
                self.ctest.append(cook_test(test))
            else:
                self.ctest.append(None)  # lens of crefs and ctest have to match

    def size(self):
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (
            len(self.crefs),
            len(self.ctest),
        )
        return len(self.crefs)

    def __iadd__(self, other):
        """add an instance (e.g., from another sentence)."""
        if type(other) is tuple:
            # avoid creating new CiderScorer instances
            self.cook_append(other[0], other[1])
        else:
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)

        return self

    def compute_doc_freq(self):
        """
        Compute document frequency for reference data.
        This will be used to compute idf (inverse document frequency) later.
        The document frequency is stored in the object.
        :return: None
        """
        for refs in self.crefs:
            # refs, k ref captions of one image
            for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
                self.document_frequency[ngram] += 1
            # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

    def compute_cider(self):
        def counts2vec(cnts):
            """
            Function maps counts of ngram to vector of tfidf weights.
            The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
            The n-th entry of array denotes length of n-grams.
            :param cnts:
            :return: vec (array of dict), norm (array of float), length (int)
            """
            vec = [defaultdict(float) for _ in range(self.n)]
            length = 0
            norm = [0.0 for _ in range(self.n)]
            for ngram, term_freq in cnts.items():
                # give word count 1 if it doesn't appear in reference corpus
                df = np.log(max(1.0, self.document_frequency[ngram]))
                # ngram index
                n = len(ngram) - 1
                # tf (term_freq) * idf (precomputed idf) for n-grams
                vec[n][ngram] = float(term_freq) * (self.ref_len - df)
                # compute norm for the vector. the norm will be used for computing similarity
                norm[n] += pow(vec[n][ngram], 2)

                if n == 1:
                    length += term_freq
            norm = [np.sqrt(n) for n in norm]
            return vec, norm, length

        def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
            """
            Compute the cosine similarity of two vectors.
            :param vec_hyp: array of dictionary for vector corresponding to hypothesis
            :param vec_ref: array of dictionary for vector corresponding to reference
            :param norm_hyp: array of float for vector corresponding to hypothesis
            :param norm_ref: array of float for vector corresponding to reference
            :param length_hyp: int containing length of hypothesis
            :param length_ref: int containing length of reference
            :return: array of score for each n-grams cosine similarity
            """
            delta = float(length_hyp - length_ref)
            # measure cosine similarity
            val = np.array([0.0 for _ in range(self.n)])
            for n in range(self.n):
                # ngram
                for ngram, count in vec_hyp[n].items():
                    # vrama91 : added clipping
                    val[n] += (
                        min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
                    )

                if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
                    val[n] /= norm_hyp[n] * norm_ref[n]

                assert not math.isnan(val[n])
                # vrama91: added a length based gaussian penalty
                val[n] *= np.e ** (-(delta**2) / (2 * self.sigma**2))
            return val

        # compute log reference length
        self.ref_len = np.log(float(len(self.crefs)))
        if len(self.crefs) == 1:
            self.ref_len = 1
        scores = []
        for test, refs in zip(self.ctest, self.crefs):
            # compute vector for test captions
            vec, norm, length = counts2vec(test)
            # compute vector for ref captions
            score = np.array([0.0 for _ in range(self.n)])
            for ref in refs:
                vec_ref, norm_ref, length_ref = counts2vec(ref)
                score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
            # change by vrama91 - mean of ngram scores, instead of sum
            score_avg = np.mean(score)
            # divide by number of references
            score_avg /= len(refs)
            # multiply score by 10
            # score_avg *= 10.0
            # append score of an image to the score list
            scores.append(score_avg)
        return scores

    def compute_score(self, option=None, verbose=0):
        # compute idf
        self.compute_doc_freq()
        # assert to check document frequency
        assert len(self.ctest) >= max(self.document_frequency.values())
        # compute cider score
        score = self.compute_cider()
        # debug
        # print score
        return np.mean(np.array(score)), np.array(score)
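Because `CiderScorer` is self-contained, it can also be driven directly, which is useful when debugging the IDF behavior. A minimal sketch, assuming cider_scorer.py is on the import path; the caption strings are illustrative:

```python
from cider_scorer import CiderScorer

# Accumulate (candidate, references) pairs, then score the whole corpus;
# the IDF statistics are computed over all accumulated references.
scorer = CiderScorer(n=4, sigma=6.0)
scorer += ("a cat sits on a mat", ["a cat is sitting on a mat", "there is a cat on the mat"])
scorer += ("a dog runs in a park", ["a dog is running through the park"])

mean_score, per_image_scores = scorer.compute_score()
print(mean_score, per_image_scores)
```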
tests.py
CHANGED
import evaluate


test_cases = [
    {
        "predictions": [["train traveling down a track in front of a road"]],
        "references": [
            [
                "a train traveling down tracks next to lights",
                "a blue and silver train next to train station and trees",
                "a blue train is next to a sidewalk on the rails",
                "a passenger train pulls into a train station",
                "a train coming down the tracks arriving at a station",
            ]
        ],
    },
    {
        "predictions": [
            ["plane is flying through the sky"],
            ["birthday cake sitting on top of a white plate"],
        ],
        "references": [
            [
                "a large jetliner flying over a traffic filled street",
                "an airplane flies low in the sky over a city street",
                "an airplane flies over a street with many cars",
                "an airplane comes in to land over a road full of cars",
                "the plane is flying over top of the cars",
            ],
            ["a blue plate filled with marshmallows chocolate chips and banana"],
        ],
    },
]

metric = evaluate.load("sunhill/cider")
for i, test_case in enumerate(test_cases):
    results = metric.compute(
        predictions=test_case["predictions"], references=test_case["references"]
    )
    print(f"Test case {i + 1}:")
    print("Predictions:", test_case["predictions"])
    print("References:", test_case["references"])
    print(results)