import os
import re
import json
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI
# OpenAI API key (read from the OPENAI_API_KEY environment variable rather than hard-coding it)
openai_api_key = os.getenv("OPENAI_API_KEY", "")
client = OpenAI(api_key=openai_api_key)
# Define input and output file paths
# input_file = "results/DriveLMMo1_internvl.json"
input_file = "results/DriveLMMo1_250422011413.json" # put your result json here
output_file = "DriveLMMo1-results_result.json"
csv_output_file = "DriveLMMo1-results_result.csv"
# dataset = "./ReasoningDriveLM_TEST.json"
dataset = "./data/DriveLMMo1/DriveLMMo1_TEST.json" # put the DriveLMMo1 test gt file here
# System prompt for evaluation
system_prompt = """
You are an autonomous driving reasoning evaluator. Your task is to assess the alignment, coherence, and quality of reasoning steps in text responses for safety-critical driving scenarios.
You will evaluate the model-generated reasoning using the following metrics:
1. Faithfulness-Step (1-10)
Measures how well the model's reasoning steps align with the ground truth.
9-10: All steps correctly match or closely reflect the reference.
7-8: Most steps align, with minor deviations.
5-6: Some steps align, but several are incorrect or missing.
3-4: Few steps align; most are inaccurate or missing.
1-2: Majority of steps are incorrect.
2. Informativeness-Step (1-10)
Measures completeness of reasoning:
9-10: Captures almost all critical information.
7-8: Covers most key points, with minor omissions.
5-6: Missing significant details.
3-4: Only partial reasoning present.
1-2: Poor extraction of relevant reasoning.
3. Risk Assessment Accuracy (1-10)
Evaluates if the model correctly prioritizes high-risk objects or scenarios.
9-10: Correctly identifies and prioritizes key dangers.
7-8: Mostly accurate, with minor misprioritizations.
5-6: Some important risks are overlooked.
3-4: Significant misjudgments in risk prioritization.
1-2: Misidentifies key risks or misses them entirely.
4. Traffic Rule Adherence (1-10)
Evaluates whether the response follows traffic laws and driving best practices.
9-10: Fully compliant with legal and safe driving practices.
7-8: Minor deviations, but mostly correct.
5-6: Some inaccuracies in legal/safe driving recommendations.
3-4: Several rule violations or unsafe suggestions.
1-2: Promotes highly unsafe driving behavior.
5. Scene Awareness & Object Understanding (1-10)
Measures how well the response interprets objects, their positions, and actions.
9-10: Clearly understands all relevant objects and their relationships.
7-8: Minor misinterpretations but mostly correct.
5-6: Some key objects misunderstood or ignored.
3-4: Many errors in object recognition and reasoning.
1-2: Misidentifies or ignores key objects.
6. Repetition-Token (1-10)
Identifies unnecessary repetition in reasoning.
9-10: No redundancy, very concise.
7-8: Minor repetition but still clear.
5-6: Noticeable redundancy.
3-4: Frequent repetition that disrupts reasoning.
1-2: Excessive redundancy, making reasoning unclear.
7. Hallucination (1-10)
Detects irrelevant or invented reasoning steps not aligned with ground truth.
9-10: No hallucinations, all reasoning is grounded.
7-8: One or two minor hallucinations.
5-6: Some fabricated details.
3-4: Frequent hallucinations.
1-2: Majority of reasoning is hallucinated.
8. Semantic Coverage-Step (1-10)
Checks if the response fully covers the critical reasoning elements.
9-10: Nearly complete semantic coverage.
7-8: Good coverage, some minor omissions.
5-6: Partial coverage with key gaps.
3-4: Major gaps in reasoning.
1-2: Very poor semantic coverage.
9. Commonsense Reasoning (1-10)
Assesses the use of intuitive driving logic in reasoning.
9-10: Displays strong commonsense understanding.
7-8: Mostly correct, with minor gaps.
5-6: Some commonsense errors.
3-4: Frequent commonsense mistakes.
1-2: Lacks basic driving commonsense.
10. Missing Step (1-10)
Evaluates if any necessary reasoning steps are missing.
9-10: No critical steps missing.
7-8: Minor missing steps, but answer is mostly intact.
5-6: Some important steps missing.
3-4: Many critical reasoning gaps.
1-2: Response is highly incomplete.
11. Relevance (1-10)
Measures how specific the response is to the given scenario and ground truth, including whether the answer is too general or vague instead of directly addressing the key aspects of the scenario.
9-10: Highly specific and directly relevant to the driving scenario. Captures critical elements precisely, with no unnecessary generalization.
7-8: Mostly relevant, but some minor parts may be overly generic or slightly off-focus.
5-6: Somewhat relevant but lacks precision; response contains vague or general reasoning without clear scenario-based details.
3-4: Mostly generic or off-topic reasoning, with significant irrelevant content.
1-2: Largely irrelevant, missing key aspects of the scenario and failing to align with the ground truth.
12. Missing Details (1-10)
Evaluates the extent to which critical information is missing from the response, impacting the reasoning quality.
9-10: No significant details are missing; response is comprehensive and complete.
7-8: Covers most important details, with minor omissions that do not severely impact reasoning.
5-6: Some essential details are missing, affecting the completeness of reasoning.
3-4: Many critical reasoning steps or contextual details are absent, making the response incomplete.
1-2: Response is highly lacking in necessary details, leaving major gaps in understanding.
Final Evaluation:
Compute the Overall Score as the average of all metric scores.
Avoid subjective interpretation and adhere to the given thresholds.
Always strictly follow these scoring guidelines.
Do not add any additional explanations beyond the structured JSON output.
Example output format:
{
"Faithfulness-Step": 6.0,
"Informativeness-Step": 6.5,
"Risk Assessment Accuracy": 7.0,
"Traffic Rule Adherence": 7.5,
"Scene Awareness & Object Understanding": 8.0,
"Repetition-Token": 7.0,
"Hallucination": 8.5,
"Semantic Coverage-Step": 7.5,
"Commonsense Reasoning": 7.0,
"Missing Step": 8.5,
"Relevance": 8.5,
"Missing Details": 7.0,
"Overall Score": 7.42
}
"""
# OpenAI Response Format
response_format = {
"type": "json_schema",
"json_schema": {
"name": "EvaluationScores",
"strict": True,
"schema": {
"type": "object",
"properties": {
"Faithfulness-Step": {"type": "number"},
"Informativeness-Step": {"type": "number"},
"Risk Assessment Accuracy": {"type": "number"},
"Traffic Rule Adherence":{"type": "number"},
"Scene Awareness & Object Understanding": {"type": "number"},
"Repetition-Token": {"type": "number"},
"Hallucination": {"type": "number"},
"Semantic Coverage-Step": {"type": "number"},
"Commonsense": {"type": "number"},
"Missing Step": {"type": "number"},
"Relevance": {"type": "number"},
"Missing Details": {"type": "number"},
"Overall Score": {"type": "number"}
},
"required": [
"Faithfulness-Step",
"Informativeness-Step",
"Risk Assessment Accuracy",
"Traffic Rule Adherence",
"Scene Awareness & Object Understanding",
"Repetition-Token",
"Hallucination",
"Semantic Coverage-Step",
"Commonsense",
"Missing Step",
"Relevance",
"Missing Details",
"Overall Score"
],
"additionalProperties": False
}
}
}
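# Optional sanity check (not called by the pipeline below): recomputes the Overall Score as
# the plain average of the twelve metric scores, mirroring the averaging rule stated in the
# system prompt. The function name is illustrative and not part of the original pipeline.
def recompute_overall_score(scores):
    metric_keys = [k for k in response_format["json_schema"]["schema"]["properties"]
                   if k != "Overall Score"]
    return round(sum(float(scores[k]) for k in metric_keys) / len(metric_keys), 2)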
def extract_final_answer(text):
options = ["The final answer is:","**Final Answer:**","Final Answer","Answer","Why take this action?:",
"**Final Answer**","**Final Decision**:","Final Step:", "<CONCLUSION>"
]
final_ans = ""
for opt in options:
if opt in text:
final_ans = text.split(opt)[-1]
break
return final_ans
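# Illustrative behaviour of extract_final_answer (the example input below is hypothetical):
# the text after the last occurrence of the first marker found in `options` is returned,
# or "" if no marker is present.
#   extract_final_answer("reasoning ... Final Answer: B) Slow down")  ->  ": B) Slow down"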
def extract_options(text):
pattern = r'([A-F])\)\s*(.+)'
matches = re.findall(pattern, text)
options = [(label, option) for label, option in matches]
if not options:
if "none of the " in text.lower():
            return [('F', 'None of the options.')]
return [('none','none')]
return options
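# Illustrative behaviour of extract_options (assuming one option per line, since '.' does not
# match newlines; the example inputs are hypothetical):
#   extract_options("A) Keep lane\nB) Slow down")  ->  [('A', 'Keep lane'), ('B', 'Slow down')]
#   extract_options("no lettered options here")    ->  [('none', 'none')]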
# Function to evaluate reasoning
def evaluate_steps(question, ground_truth, llm_response):
messages = [
{"role": "system", "content": system_prompt},
{
"role": "user",
"content": [
{"type": "text", "text": f"Question: {question}\nGround Truth: {ground_truth}\nLLM Response: {llm_response}"}
],
}
]
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=messages,
response_format=response_format,
max_tokens=500,
temperature=0.0,
)
return response.choices[0].message.content
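# Optional sketch (not wired into the pipeline below): a minimal retry wrapper for transient
# API failures such as rate limits or timeouts. The function name and retry defaults are
# illustrative assumptions, not part of the original evaluation protocol.
import time

def evaluate_steps_with_retry(question, ground_truth, llm_response, retries=3, backoff=2.0):
    for attempt in range(retries):
        try:
            return evaluate_steps(question, ground_truth, llm_response)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))  # simple linear backoff between attempts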
with open(input_file, "r", encoding="utf-8") as f:
data = json.load(f)
with open(dataset, "r", encoding="utf-8") as f:
dataset = json.load(f)
# dataset = dataset[:3]  # debug only: uncomment to smoke-test on the first few samples
valid_ids = {item["idx"] for item in dataset if "idx" in item}
data_dict = {}
for item in dataset:
key = item["idx"]
data_dict[key] = {
"image": item["image"],
"question": item["question"],
"final_answer": item["final_answer"],
"steps": item["steps"]
}
# List to store results
evaluation_results = []
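# Expected shape of each entry in `data` (key names taken from the accesses in process_file
# below; the example values are purely illustrative):
#   {"idx": "<scene>_<question_number>", "question": "...", "llm-response": "..."}
# Entries may use "id" and "answer" in place of "idx" and "llm-response".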
# Function to process each file
def process_file(item):
# breakpoint()
question = item["question"]
    try:
        idx = item["idx"]
    except KeyError:
        idx = item["id"]
if idx not in valid_ids:
return None
ground_truth = "**Step-by-Step Reasoning**: "+ data_dict[idx]['steps'] + " **Final Answer**: " + data_dict[idx]["final_answer"]
    try:
        llm_response = item["llm-response"]
    except KeyError:
        llm_response = item["answer"]
gt_final_answer = data_dict[idx]['final_answer']
llm_final_answer = extract_final_answer(llm_response)
    # mcq defaults to -1 for non-MCQ questions
mcq = -1
gt = ""
llm = ""
if int(idx.rsplit("_",1)[-1]) in [2,3,4,5,8]:
gt, _ = extract_options(gt_final_answer)[0]
llm, _ = extract_options(llm_final_answer)[0]
if llm == gt:
mcq = 1
else:
mcq = 0
# Evaluate steps
res = {}
try:
res = evaluate_steps(question, ground_truth, llm_response)
        res = json.loads(res)  # the judge returns a JSON string (structured output)
except Exception as e:
print(f"Error processing item {idx}: {e}")
return None
res["id"] = idx
res["question"] = question
res["ground_truth"] = ground_truth
res["llm_response"] = llm_response
res["mcq"] = mcq
return res
# Process files in parallel
with ThreadPoolExecutor() as executor:
results = list(tqdm(executor.map(process_file, data), total=len(data), desc="Processing"))
# Remove None values from results
results = [r for r in results if r is not None]
# Save results to JSON
with open(output_file, "w", encoding="utf-8") as f_out:
json.dump(results, f_out, indent=4, ensure_ascii=False)
# Convert results to DataFrame and save as CSV
df = pd.DataFrame(results)
df.to_csv(csv_output_file, index=False)
# Accuracy for MCQs
df_mcq = df[df["mcq"] != -1]
mcq_accuracy = df_mcq["mcq"].mean()
print(f"MCQ Accuracy: {mcq_accuracy:.2%}")
# Normalized reasoning evaluation: mean Overall Score expressed as a percentage of the maximum score (10)
max_score = 10
percentage_correct = (df["Overall Score"].sum() / (len(df) * max_score)) * 100
print(f"Normalized overall reasoning score: {percentage_correct:.2f}%")
print(f"Results saved: JSON -> {output_file}, CSV -> {csv_output_file}")