import json
import sys
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Ensure repo root is on the path so `shared` package is found when run directly
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from shared.metrics import compute_rouge, compute_bleu, factuality_score  # noqa: E402
from shared.utils import print_banner, load_yaml_config  # noqa: E402
from retailgpt_evaluator.dataset_loader import load_retail_dataset  # noqa: E402


def run_eval_for_model(model_name, dataset):
    """Generate answers for every row in `dataset` and score them against the references."""
    print_banner(f"Evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.eval()  # inference only; disable dropout

    preds, refs = [], []
    for row in dataset:
        inputs = tokenizer(row["question"], return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=128)
        preds.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        refs.append(row["answer"])

    # Aggregate all metrics into a single flat result record for this model.
    r = compute_rouge(preds, refs)
    b = compute_bleu(preds, refs)
    f = factuality_score(preds, refs)
    return {"model": model_name, **r, **b, **f}


def evaluate_all():
    """Run the evaluation for every model listed in config.yaml and save the results."""
    config_path = Path(__file__).resolve().parent / "config.yaml"
    cfg = load_yaml_config(config_path)

    # Create the sample dataset on first run if it does not exist yet.
    dataset_path = ROOT / "datasets" / "retail_sample.jsonl"
    if not dataset_path.exists():
        print_banner("Dataset not found. Creating sample dataset...")
        load_retail_dataset()

    # Evaluate on the first 50 examples to keep runs fast.
    dataset = load_dataset("json", data_files=str(dataset_path), split="train[:50]")

    results = [run_eval_for_model(m, dataset) for m in cfg["models"]]

    # Write results relative to the repo root so the script works from any working directory.
    results_dir = ROOT / "models"
    results_dir.mkdir(exist_ok=True)
    results_path = results_dir / "retail_eval_results.json"
    with open(results_path, "w") as fh:
        json.dump(results, fh, indent=2)
    print(f"✅ Saved results to {results_path}")
    return results


if __name__ == "__main__":
    evaluate_all()