"""This module implements the CIDEr metric for image captioning evaluation.""" import evaluate import datasets from .cider_scorer import CiderScorer _CITATION = """\ @InProceedings{Vedantam_2015_CVPR, author = {Vedantam, Ramakrishna and Lawrence Zitnick, C. and Parikh, Devi}, title = {CIDEr: Consensus-Based Image Description Evaluation}, booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2015} } """ _DESCRIPTION = """\ This is a metric to evaluate image captioning. It is based on the idea of measuring the consensus between a candidate image caption and a set of reference image captions written by humans. The CIDEr score is computed by comparing the n-grams of the candidate caption to the n-grams of the reference captions, and measuring how many n-grams are shared between the candidate and the references. The score is then normalized by the length of the candidate caption and the number of reference captions. """ _KWARGS_DESCRIPTION = """ CIDEr (Consensus-based Image Description Evaluation) is a metric for evaluating the quality of image captions. It measures how similar a generated caption is to a set of reference captions written by humans. Args: predictions: list of predictions to score. references: list of references for each prediction. Returns: score: CIDEr score. Examples: >>> metric = evaluate.load("sunhill/cider") >>> results = metric.compute( predictions=[['train traveling down a track in front of a road']], references=[ [ 'a train traveling down tracks next to lights', 'a blue and silver train next to train station and trees', 'a blue train is next to a sidewalk on the rails', 'a passenger train pulls into a train station', 'a train coming down the tracks arriving at a station' ] ] ) """ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class CIDEr(evaluate.Metric): """CIDEr metric.""" def _info(self): return evaluate.MetricInfo( # This is the description that will appear on the modules page. module_type="metric", description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, # This defines the format of each prediction and reference features=[ datasets.Features( { "predictions": datasets.Value("string"), "references": datasets.Value("string"), } ), datasets.Features( { "predictions": datasets.Value("string"), "references": datasets.Sequence(datasets.Value("string")), } ), ], # Homepage of the module for documentation homepage="https://huggingface.co/spaces/sunhill/cider", # Additional links to the codebase or references codebase_urls=[ "https://github.com/ramavedantam/cider", "https://github.com/EricWWWW/image-caption-metrics", ], reference_urls=[ ( "https://openaccess.thecvf.com/content_cvpr_2015/html/" "Vedantam_CIDEr_Consensus-Based_Image_2015_CVPR_paper.html" ) ], ) def _compute(self, predictions, references): """Returns the scores""" assert len(predictions) == len(references), ( "The number of predictions and references should be the same. " f"Got {len(predictions)} predictions and {len(references)} references." ) cider_scorer = CiderScorer(n=4, sigma=6.0) for pred, ref in zip(predictions, references): assert isinstance(pred, str), ( f"Each prediction should be a string. Got {type(pred)}." ) if isinstance(ref, str): ref = [ref] assert isinstance(ref, list) and all(isinstance(r, str) for r in ref), ( "Each reference should be a list of strings. " f"Got {type(ref)} with elements of type {[type(r) for r in ref]}." 
) cider_scorer += (pred, ref) score, _ = cider_scorer.compute_score() return {"cider_score": score.item()}
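

if __name__ == "__main__":
    # Minimal usage sketch, not part of the metric itself. Because of the relative
    # import of CiderScorer above, this module is normally loaded through
    # evaluate.load("sunhill/cider"); the direct instantiation below only works if
    # the file is imported as part of its package (e.g. run with ``python -m``).
    metric = CIDEr()
    results = metric.compute(
        predictions=["train traveling down a track in front of a road"],
        references=[
            [
                "a train traveling down tracks next to lights",
                "a blue and silver train next to train station and trees",
                "a blue train is next to a sidewalk on the rails",
            ]
        ],
    )
    print(results)  # a dict of the form {"cider_score": <float>}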