Spaces:
Sleeping
Sleeping
Expand escaping of \n \r \t
Browse files- test_of_time_accuracy.py +45 -23
- tests/test_arithmetic_scoring.py +16 -4
test_of_time_accuracy.py
CHANGED
|
@@ -89,43 +89,65 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 89 |
@staticmethod
|
| 90 |
def _extract_first_json_object(text: str) -> dict | None:
|
| 91 |
"""
|
| 92 |
-
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
other text or markdown formatting.
|
| 97 |
|
| 98 |
Args:
|
| 99 |
text: String that may contain JSON objects
|
| 100 |
|
| 101 |
Returns:
|
| 102 |
-
The first JSON dictionary found, or None if no valid JSON
|
| 103 |
"""
|
|
|
|
|
|
|
|
|
|
| 104 |
decoder = json.JSONDecoder()
|
| 105 |
-
idx
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
try:
|
| 109 |
-
obj, next_idx = decoder.raw_decode(text, idx)
|
| 110 |
-
if isinstance(obj, dict):
|
| 111 |
-
return obj
|
| 112 |
-
idx = next_idx
|
| 113 |
-
except json.JSONDecodeError:
|
| 114 |
-
# Try escaping newlines and parsing again from this position
|
| 115 |
try:
|
| 116 |
-
|
| 117 |
-
remaining = text[idx:]
|
| 118 |
-
fixed = remaining.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
|
| 119 |
-
obj, _ = decoder.raw_decode(fixed, 0)
|
| 120 |
if isinstance(obj, dict):
|
| 121 |
return obj
|
| 122 |
-
except
|
| 123 |
pass
|
| 124 |
-
|
| 125 |
-
except ValueError:
|
| 126 |
-
idx += 1
|
| 127 |
return None
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
@staticmethod
|
| 130 |
def _parse_reference_label(label_str: str) -> dict | None:
|
| 131 |
"""
|
|
|
|
| 89 |
@staticmethod
|
| 90 |
def _extract_first_json_object(text: str) -> dict | None:
|
| 91 |
"""
|
| 92 |
+
Extract the first valid JSON object from text.
|
| 93 |
|
| 94 |
+
Handles common LLM output issues like unescaped newlines in string
|
| 95 |
+
values (LLMs produce human-readable output, not strict JSON).
|
|
|
|
| 96 |
|
| 97 |
Args:
|
| 98 |
text: String that may contain JSON objects
|
| 99 |
|
| 100 |
Returns:
|
| 101 |
+
The first JSON dictionary found, or None if no valid JSON exists
|
| 102 |
"""
|
| 103 |
+
# Fix unescaped control chars in strings (common LLM issue)
|
| 104 |
+
text = TestOfTimeAccuracy._escape_control_chars_in_strings(text)
|
| 105 |
+
|
| 106 |
decoder = json.JSONDecoder()
|
| 107 |
+
idx = 0
|
| 108 |
+
while idx < len(text):
|
| 109 |
+
if text[idx] == '{':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
try:
|
| 111 |
+
obj, _ = decoder.raw_decode(text, idx)
|
|
|
|
|
|
|
|
|
|
| 112 |
if isinstance(obj, dict):
|
| 113 |
return obj
|
| 114 |
+
except json.JSONDecodeError:
|
| 115 |
pass
|
| 116 |
+
idx += 1
|
|
|
|
|
|
|
| 117 |
return None
|
| 118 |
|
| 119 |
+
@staticmethod
|
| 120 |
+
def _escape_control_chars_in_strings(text: str) -> str:
|
| 121 |
+
"""
|
| 122 |
+
Escape literal control characters inside JSON string values.
|
| 123 |
+
|
| 124 |
+
LLMs produce newlines/tabs for readability, but JSON requires them
|
| 125 |
+
to be escaped within strings.
|
| 126 |
+
"""
|
| 127 |
+
result = []
|
| 128 |
+
in_string = False
|
| 129 |
+
i = 0
|
| 130 |
+
while i < len(text):
|
| 131 |
+
char = text[i]
|
| 132 |
+
if char == '\\' and in_string and i + 1 < len(text):
|
| 133 |
+
# Preserve existing escape sequences
|
| 134 |
+
result.append(char)
|
| 135 |
+
result.append(text[i + 1])
|
| 136 |
+
i += 2
|
| 137 |
+
continue
|
| 138 |
+
if char == '"':
|
| 139 |
+
in_string = not in_string
|
| 140 |
+
if in_string and char == '\n':
|
| 141 |
+
result.append('\\n')
|
| 142 |
+
elif in_string and char == '\r':
|
| 143 |
+
result.append('\\r')
|
| 144 |
+
elif in_string and char == '\t':
|
| 145 |
+
result.append('\\t')
|
| 146 |
+
else:
|
| 147 |
+
result.append(char)
|
| 148 |
+
i += 1
|
| 149 |
+
return ''.join(result)
|
| 150 |
+
|
| 151 |
@staticmethod
|
| 152 |
def _parse_reference_label(label_str: str) -> dict | None:
|
| 153 |
"""
|
tests/test_arithmetic_scoring.py
CHANGED
|
@@ -12,8 +12,8 @@ arithmetic_test_cases = {
|
|
| 12 |
' "To find the date of the second most important game, we need to subtract 7 days from the date of the most important game. We can do this by counting back 7 days from April 14, 2005. April 14 - 7 days = April 7, 2005", "answer": "2005-04-07"}',
|
| 13 |
'\n```json\n{\n "explanation": "Step 1: Determine the time it takes the robot to carry a single box. The robot takes 4 hours, 34 minutes, and 30 seconds to carry 2 boxes. We divide this time by 2 to find the time per box.\\n- Hours: 4 / 2 = 2 hours\\n- Minutes: 34 / 2 = 17 minutes\\n- Seconds: 30 / 2 = 15 seconds\\nSo, it takes the robot 2 hours, 17 minutes, and 15 seconds to carry one box.\\n\\nStep 2: Calculate the total time to carry 25 boxes. We multiply the time per box by the total number of boxes (25).\\n- Total Hours: 2 hours/box * 25 boxes = 50 hours\\n- Total Minutes: 17 minutes/box * 25 boxes = 425 minutes\\n- Total Seconds: 15 seconds/box * 25 boxes = 375 seconds\\n\\nStep 3: Convert the calculated time into the standard H:M:S format by carrying over excess seconds and minutes.\\n- Convert seconds to minutes: 375 seconds is equal to 6 minutes and 15 seconds (since 375 / 60 = 6 with a remainder of 15). We add the 6 minutes to our minutes total.\\n- New total: 50 hours, (425 + 6) minutes, 15 seconds -> 50 hours, 431 minutes, 15 seconds.\\n- Convert minutes to hours: 431 minutes is equal to 7 hours and 11 minutes (since 431 / 60 = 7 with a remainder of 11). We add the 7 hours to our hours total.\\n- New total: (50 + 7) hours, 11 minutes, 15 seconds -> 57 hours, 11 minutes, 15 seconds.\\n\\nThe final time is 57 hours, 11 minutes, and 15 seconds.",\n "H": 57,\n "M": 11,\n "S": 15\n}\n```',
|
| 14 |
'{"explanation": "some explanation", "age": "3319"}',
|
| 15 |
-
'JSON = {"explanation": "Let\'s analyze the available time slots for a 30-minute meeting, considering the constraints and schedules of Clara, William, and Alice.\n\n**1. Define Working Hours:** 9:00 AM to 5:00 PM\n\n**2. Identify Booked Time Slots:**\n\n* Clara: 9:30 AM - 12:30 PM\n* William: 1:30 PM - 3:30 PM\n* Alice: 9:00 AM - 11:00 AM (flexible, but this constraint impacts the meeting time)\n\n**3. Determine Available Time Slots for Clara and William:**\n\n* Clara is available: 9:00 AM - 9:30 AM and 12:30 PM - 5:00 PM\n* William is available: 9:00 AM - 1:30 PM and 3:30 PM - 5:00 PM\n\n**4. Find the Intersection of Clara and William\'s Availability:**\n\n* From 9:00 AM to 9:30 AM: Both are available.\n* From 9:30 AM to 12:30 PM: Clara is booked.\n* From 12:30 PM to 1:30 PM: Both are available.\n* From 1:30 PM to 3:30 PM: William is booked.\n* From 3:30 PM to 5:00 PM: Both are available.\n\nSo, the available time slots for both Clara and William are:\n* 9:00 AM - 9:30 AM\n* 12:30 PM - 1:30 PM\n* 3:30 PM - 5:00 PM\n\n**5. Consider Alice\'s Flexibility:** Since Alice is flexible and can shift her meetings, we don\'t need to consider her booked time slot (9:00 AM - 11:00 AM) when determining the possibilities.\n\n**6. Calculate Possible Meeting Start Times:**\n\n* **9:00 AM - 9:30 AM:** Possible start time: 9:00 AM. (1 possibility)\n* **12:30 PM - 1:30 PM:** Possible start times: 12:30 PM, 1:00 PM. (2 possibilities)\n* **3:30 PM - 5:00 PM:** Possible start times: 3:30 PM, 4:00 PM, 4:30 PM. (3 possibilities)\n\n**7. Total Possible Meeting Times:** 1 + 2 + 3 = 6\n\n", "answer": 6}'
|
| 16 |
-
|
| 17 |
],
|
| 18 |
"references": [
|
| 19 |
'{"answer": "352 BC"}',
|
|
@@ -25,9 +25,21 @@ arithmetic_test_cases = {
|
|
| 25 |
'{"H": 57.0, "M": 11.0, "S": 15.0}',
|
| 26 |
'{"answer": 3319}',
|
| 27 |
'{"answer": 6}',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
],
|
| 29 |
-
"result": {"accuracy": 6 / 9},
|
| 30 |
-
"per_item_accuracy": [True, True, True, False,False, False, True, True, True],
|
| 31 |
}
|
| 32 |
|
| 33 |
|
|
|
|
| 12 |
' "To find the date of the second most important game, we need to subtract 7 days from the date of the most important game. We can do this by counting back 7 days from April 14, 2005. April 14 - 7 days = April 7, 2005", "answer": "2005-04-07"}',
|
| 13 |
'\n```json\n{\n "explanation": "Step 1: Determine the time it takes the robot to carry a single box. The robot takes 4 hours, 34 minutes, and 30 seconds to carry 2 boxes. We divide this time by 2 to find the time per box.\\n- Hours: 4 / 2 = 2 hours\\n- Minutes: 34 / 2 = 17 minutes\\n- Seconds: 30 / 2 = 15 seconds\\nSo, it takes the robot 2 hours, 17 minutes, and 15 seconds to carry one box.\\n\\nStep 2: Calculate the total time to carry 25 boxes. We multiply the time per box by the total number of boxes (25).\\n- Total Hours: 2 hours/box * 25 boxes = 50 hours\\n- Total Minutes: 17 minutes/box * 25 boxes = 425 minutes\\n- Total Seconds: 15 seconds/box * 25 boxes = 375 seconds\\n\\nStep 3: Convert the calculated time into the standard H:M:S format by carrying over excess seconds and minutes.\\n- Convert seconds to minutes: 375 seconds is equal to 6 minutes and 15 seconds (since 375 / 60 = 6 with a remainder of 15). We add the 6 minutes to our minutes total.\\n- New total: 50 hours, (425 + 6) minutes, 15 seconds -> 50 hours, 431 minutes, 15 seconds.\\n- Convert minutes to hours: 431 minutes is equal to 7 hours and 11 minutes (since 431 / 60 = 7 with a remainder of 11). We add the 7 hours to our hours total.\\n- New total: (50 + 7) hours, 11 minutes, 15 seconds -> 57 hours, 11 minutes, 15 seconds.\\n\\nThe final time is 57 hours, 11 minutes, and 15 seconds.",\n "H": 57,\n "M": 11,\n "S": 15\n}\n```',
|
| 14 |
'{"explanation": "some explanation", "age": "3319"}',
|
| 15 |
+
'JSON = {"explanation": "Let\'s analyze the available time slots for a 30-minute meeting, considering the constraints and schedules of Clara, William, and Alice.\n\n**1. Define Working Hours:** 9:00 AM to 5:00 PM\n\n**2. Identify Booked Time Slots:**\n\n* Clara: 9:30 AM - 12:30 PM\n* William: 1:30 PM - 3:30 PM\n* Alice: 9:00 AM - 11:00 AM (flexible, but this constraint impacts the meeting time)\n\n**3. Determine Available Time Slots for Clara and William:**\n\n* Clara is available: 9:00 AM - 9:30 AM and 12:30 PM - 5:00 PM\n* William is available: 9:00 AM - 1:30 PM and 3:30 PM - 5:00 PM\n\n**4. Find the Intersection of Clara and William\'s Availability:**\n\n* From 9:00 AM to 9:30 AM: Both are available.\n* From 9:30 AM to 12:30 PM: Clara is booked.\n* From 12:30 PM to 1:30 PM: Both are available.\n* From 1:30 PM to 3:30 PM: William is booked.\n* From 3:30 PM to 5:00 PM: Both are available.\n\nSo, the available time slots for both Clara and William are:\n* 9:00 AM - 9:30 AM\n* 12:30 PM - 1:30 PM\n* 3:30 PM - 5:00 PM\n\n**5. Consider Alice\'s Flexibility:** Since Alice is flexible and can shift her meetings, we don\'t need to consider her booked time slot (9:00 AM - 11:00 AM) when determining the possibilities.\n\n**6. Calculate Possible Meeting Start Times:**\n\n* **9:00 AM - 9:30 AM:** Possible start time: 9:00 AM. (1 possibility)\n* **12:30 PM - 1:30 PM:** Possible start times: 12:30 PM, 1:00 PM. (2 possibilities)\n* **3:30 PM - 5:00 PM:** Possible start times: 3:30 PM, 4:00 PM, 4:30 PM. (3 possibilities)\n\n**7. Total Possible Meeting Times:** 1 + 2 + 3 = 6\n\n", "answer": 6}',
|
| 16 |
+
'{\n "explanation": "First, we calculate the difference in hours, minutes, and seconds between the two times. The start time is 08:30:33 and the end time is 14:33:31. Since the end time is later than the start time, we do not need to consider the next day scenario. \n Hours difference: 14 - 8 = 6\n Minutes difference: 33 - 30 = 3\n Seconds difference: 31 - 33 = -2, but since we cannot have negative seconds, we borrow 1 minute from the minutes difference, making it 2 and adding 60 to the seconds difference, resulting in 59 - 2 = 59",\n "hours": 6,\n "minutes": 2,\n "seconds": 58\n}',
|
| 17 |
],
|
| 18 |
"references": [
|
| 19 |
'{"answer": "352 BC"}',
|
|
|
|
| 25 |
'{"H": 57.0, "M": 11.0, "S": 15.0}',
|
| 26 |
'{"answer": 3319}',
|
| 27 |
'{"answer": 6}',
|
| 28 |
+
"{'hours': 6, 'minutes': 2, 'seconds': 58}",
|
| 29 |
+
],
|
| 30 |
+
"result": {"accuracy": 7 / 10},
|
| 31 |
+
"per_item_accuracy": [
|
| 32 |
+
True,
|
| 33 |
+
True,
|
| 34 |
+
True,
|
| 35 |
+
False,
|
| 36 |
+
False,
|
| 37 |
+
False,
|
| 38 |
+
True,
|
| 39 |
+
True,
|
| 40 |
+
True,
|
| 41 |
+
True,
|
| 42 |
],
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
|