# Copyright 2021 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FEVER (Fact Extraction and VERification) metric."""

import datasets

import evaluate


_CITATION = """\
@inproceedings{thorne2018fever,
  title={FEVER: a Large-scale Dataset for Fact Extraction and VERification},
  author={Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},
  booktitle={Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  pages={809--819},
  year={2018}
}
"""

_DESCRIPTION = """\
The FEVER (Fact Extraction and VERification) metric evaluates the performance of systems that verify factual
claims against evidence retrieved from Wikipedia. It consists of three main components:

- **Label accuracy**: measures how often the predicted claim label (SUPPORTED, REFUTED, or NOT ENOUGH INFO)
  matches the gold label.
- **FEVER score**: considers a prediction correct only if the label is correct *and* at least one complete
  gold evidence set is retrieved.
- **Evidence F1**: computes the micro-averaged precision, recall, and F1 between predicted and gold evidence
  sentences.

The FEVER score is the official leaderboard metric used in the FEVER shared tasks.
"""

_KWARGS_DESCRIPTION = """
Computes the FEVER evaluation metrics.

Args:
    predictions (list of dict): Each prediction should be a dictionary with:
        - "label" (str): the predicted claim label.
        - "evidence" (list of str): the predicted evidence sentences.
    references (list of dict): Each reference should be a dictionary with:
        - "label" (str): the gold claim label.
        - "evidence_sets" (list of list of str): all possible gold evidence sets.

Returns:
    A dictionary containing:
    - 'label_accuracy': proportion of claims with correctly predicted labels.
    - 'fever_score': proportion of claims where both the label and at least one full gold evidence set are correct.
    - 'evidence_precision': micro-averaged precision of evidence retrieval.
    - 'evidence_recall': micro-averaged recall of evidence retrieval.
    - 'evidence_f1': micro-averaged F1 of evidence retrieval.
Example:
    >>> predictions = [{"label": "SUPPORTED", "evidence": ["E1", "E2"]}]
    >>> references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"], ["E3", "E4"]]}]
    >>> fever = evaluate.load("fever")
    >>> results = fever.compute(predictions=predictions, references=references)
    >>> print(results["label_accuracy"])
    1.0
    >>> print(results["fever_score"])
    1.0
    >>> print(results["evidence_precision"])
    1.0
    >>> print(results["evidence_recall"])
    0.5
    >>> print(round(results["evidence_f1"], 3))
    0.667
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class FEVER(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": {
                        "label": datasets.Value("string"),
                        "evidence": datasets.Sequence(datasets.Value("string")),
                    },
                    "references": {
                        "label": datasets.Value("string"),
                        "evidence_sets": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
                    },
                }
            ),
            reference_urls=[
                "https://fever.ai/dataset/",
                "https://arxiv.org/abs/1803.05355",
            ],
        )

    def _compute(self, predictions, references):
        """
        Computes FEVER metrics:
        - Label accuracy
        - FEVER score (label + complete evidence set)
        - Evidence precision, recall, and F1 (micro-averaged)
        """
        total = len(predictions)
        label_correct, fever_correct = 0, 0
        total_overlap, total_pred, total_gold = 0, 0, 0

        for pred, ref in zip(predictions, references):
            pred_label = pred["label"]
            pred_evidence = set(e.strip().lower() for e in pred["evidence"])

            gold_label = ref["label"]
            gold_sets = []
            for s in ref["evidence_sets"]:
                gold_sets.append([e.strip().lower() for e in s])

            if pred_label == gold_label:
                label_correct += 1
                for g_set in gold_sets:
                    if set(g_set).issubset(pred_evidence):
                        fever_correct += 1
                        break

            gold_evidence = set().union(*gold_sets) if gold_sets else set()
            overlap = len(gold_evidence.intersection(pred_evidence))
            total_overlap += overlap
            total_pred += len(pred_evidence)
            total_gold += len(gold_evidence)

        precision = (total_overlap / total_pred) if total_pred else 0
        recall = (total_overlap / total_gold) if total_gold else 0
        evidence_f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        fever_score = fever_correct / total if total else 0
        label_accuracy = label_correct / total if total else 0

        return {
            "label_accuracy": label_accuracy,
            "fever_score": fever_score,
            "evidence_precision": precision,
            "evidence_recall": recall,
            "evidence_f1": evidence_f1,
        }
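

# The block below is an optional local smoke test, not part of the metric's required
# interface. It is a minimal sketch that assumes the class can be instantiated directly
# (rather than loaded via evaluate.load("fever")) and simply replays the worked example
# from _KWARGS_DESCRIPTION above.
if __name__ == "__main__":
    fever_metric = FEVER()
    example_predictions = [{"label": "SUPPORTED", "evidence": ["E1", "E2"]}]
    example_references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"], ["E3", "E4"]]}]
    results = fever_metric.compute(predictions=example_predictions, references=example_references)
    # Expected: label_accuracy=1.0, fever_score=1.0, evidence_precision=1.0,
    # evidence_recall=0.5, evidence_f1~0.667
    print(results)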