"""
Evaluate on HellaSwag and ARC-Easy benchmarks.
Both are multiple-choice — we pick the completion with lowest loss.
"""
import torch
import torch.nn.functional as F
import json
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from model import LLM
import sentencepiece as spm
from datasets import load_dataset
@torch.no_grad()
def score_completion(
    model: LLM,
    tokenizer: spm.SentencePieceProcessor,
    context: str,
    completion: str,
    device: str = "cpu",
) -> float:
    """
    Score a completion given a context.

    The sequence fed to the model is ``BOS + context + completion``. The
    targets are -1 on the BOS and context positions, so only completion
    positions carry a real label.

    Args:
        model: Called as ``model(x, y)``; must return a pair whose second
            element is the loss. ``model.config.max_seq_len`` caps the
            sequence length.
        tokenizer: SentencePiece processor used to encode both strings and
            to supply the BOS id.
        context: Text the completion is conditioned on.
        completion: Candidate continuation to score.
        device: Device the input and target tensors are placed on.

    Returns:
        Average cross-entropy loss — lower = model prefers this completion.
        ``float("inf")`` when the completion encodes to no tokens, so an
        unscorable candidate is never picked as the minimum.
    """
    ctx_ids = tokenizer.encode(context, out_type=int)
    comp_ids = tokenizer.encode(completion, out_type=int)

    if not comp_ids:
        # Every target would be -1, leaving nothing to score. Returning inf
        # (rather than whatever the model yields for an all-masked batch,
        # presumably NaN) keeps the callers' min() selection well-defined.
        return float("inf")

    # Full sequence: context + completion
    input_ids = [tokenizer.bos_id()] + ctx_ids + comp_ids
    # -1 marks positions that must not contribute to the loss
    # (presumably the model's ignore_index — TODO confirm in model.py).
    # NOTE(review): labels are aligned with the tokens themselves
    # (target_ids[i] == input_ids[i] on completion positions). If
    # LLM.forward does not shift targets internally, they should be shifted
    # left by one for next-token scoring — confirm against model.py.
    target_ids = [-1] * (len(ctx_ids) + 1) + comp_ids  # only score completion

    # Truncate if too long; keep the tail so the completion is preserved.
    max_len = model.config.max_seq_len
    if len(input_ids) > max_len:
        input_ids = input_ids[-max_len:]
        target_ids = target_ids[-max_len:]

    x = torch.tensor([input_ids], dtype=torch.long).to(device)
    y = torch.tensor([target_ids], dtype=torch.long).to(device)
    _, loss = model(x, y)
    return loss.item()
@torch.no_grad()
def eval_hellaswag(
    model: LLM,
    tokenizer: spm.SentencePieceProcessor,
    num_samples: int = 100,
    device: str = "cpu",
) -> dict:
    """
    HellaSwag: pick the most likely sentence completion from 4 choices.
    Random baseline = 25%. GPT-2 small = ~31%. Your target = ~27-30%.

    Args:
        model: Model forwarded to ``score_completion``.
        tokenizer: Tokenizer forwarded to ``score_completion``.
        num_samples: Maximum number of validation examples to score.
        device: Device forwarded to ``score_completion``.

    Returns:
        Dict with keys ``benchmark``, ``accuracy`` (percent, rounded to two
        decimals), ``correct``, ``total`` and ``random_baseline``.
        ``accuracy`` is 0.0 when no example was scored.
    """
    print(f"\nEvaluating HellaSwag ({num_samples} samples)...")
    dataset = load_dataset(
        "Rowan/hellaswag", split="validation", streaming=True
    )
    correct = 0
    total = 0
    for item in dataset:
        if total >= num_samples:
            break
        context = item["activity_label"] + ": " + item["ctx"]
        endings = item["endings"]
        label = int(item["label"])

        # Score each ending
        scores = [
            score_completion(model, tokenizer, context, ending, device)
            for ending in endings
        ]
        predicted = scores.index(min(scores))  # lowest loss = most likely
        if predicted == label:
            correct += 1
        total += 1

        if total % 25 == 0:
            print(f"  {total}/{num_samples} | accuracy: {correct/total*100:.1f}%")

    # Guard against num_samples <= 0 or an empty stream: nothing was scored,
    # so report 0.0 instead of dividing by zero.
    accuracy = correct / total * 100 if total else 0.0
    return {
        "benchmark": "HellaSwag",
        "accuracy": round(accuracy, 2),
        "correct": correct,
        "total": total,
        "random_baseline": 25.0,
    }
@torch.no_grad()
def eval_arc_easy(
    model: LLM,
    tokenizer: spm.SentencePieceProcessor,
    num_samples: int = 100,
    device: str = "cpu",
) -> dict:
    """
    ARC-Easy: multiple choice science questions.
    Random baseline = 25%. GPT-2 small = ~43%. Your target = ~30-35%.

    Items whose ``answerKey`` does not appear among their choice labels are
    skipped and do not count toward ``total``.

    Args:
        model: Model forwarded to ``score_completion``.
        tokenizer: Tokenizer forwarded to ``score_completion``.
        num_samples: Maximum number of test examples to score.
        device: Device forwarded to ``score_completion``.

    Returns:
        Dict with keys ``benchmark``, ``accuracy`` (percent, rounded to two
        decimals), ``correct``, ``total`` and ``random_baseline``.
        ``accuracy`` is 0.0 when no example was scored.
    """
    print(f"\nEvaluating ARC-Easy ({num_samples} samples)...")
    dataset = load_dataset(
        "allenai/ai2_arc", "ARC-Easy",
        split="test", streaming=True
    )
    correct = 0
    total = 0
    for item in dataset:
        if total >= num_samples:
            break
        question = item["question"]
        choices = item["choices"]["text"]
        labels = item["choices"]["label"]
        answer = item["answerKey"]

        # Find correct index; skip items whose key matches no choice label.
        try:
            correct_idx = labels.index(answer)
        except ValueError:
            continue

        # Score each choice
        scores = [
            score_completion(model, tokenizer, question, choice, device)
            for choice in choices
        ]
        predicted = scores.index(min(scores))  # lowest loss = most likely
        if predicted == correct_idx:
            correct += 1
        total += 1

        if total % 25 == 0:
            print(f"  {total}/{num_samples} | accuracy: {correct/total*100:.1f}%")

    # Guard against num_samples <= 0, an empty stream, or every item being
    # skipped: nothing was scored, so report 0.0 instead of dividing by zero.
    accuracy = correct / total * 100 if total else 0.0
    return {
        "benchmark": "ARC-Easy",
        "accuracy": round(accuracy, 2),
        "correct": correct,
        "total": total,
        "random_baseline": 25.0,
    }