| import re |
| import pandas as pd |
| from dabstep_benchmark.evaluation.scorer import question_scorer |
|
|
|
|
| def format_error(msg): |
| return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>" |
|
|
| def format_warning(msg): |
| return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>" |
|
|
| def format_log(msg): |
| return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>" |
|
|
| def model_hyperlink(link, model_name): |
| return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>' |
|
|
| def is_valid_https_url(url): |
| pattern = re.compile( |
| r'^https://' |
| r'(?!10(?:\.\d{1,3}){3})' |
| r'(?!127(?:\.\d{1,3}){3})' |
| r'(?!169\.254(?:\.\d{1,3}){2})' |
| r'(?!192\.168(?:\.\d{1,3}){2})' |
| r'(?!172\.(?:1[6-9]|2[0-9]|3[0-1])(?:\.\d{1,3}){2})' |
| r'(?:(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,})' |
| r'(?::\d{2,5})?' |
| r'(?:/[^\s]*)?$', |
| re.IGNORECASE |
| ) |
| return re.match(pattern, url) is not None |
|
|
|
|
| def evaluate(agent_answers: pd.DataFrame, tasks_with_gt: pd.DataFrame, submission_id: str = ""): |
| task_scores = [] |
| for index, row in tasks_with_gt.iterrows(): |
| correct_answer = row["answer"] |
| level = str(row["level"]) |
| task_id = str(row["task_id"]) |
|
|
| if task_id not in agent_answers["task_id"].values: |
| raise KeyError(f"Task ID: {task_id} not found. Are you sure you submitted the correct file?") |
|
|
| agent_answer = agent_answers.loc[agent_answers.task_id == task_id, "agent_answer"].values[0] |
| |
| score = question_scorer(agent_answer, correct_answer) |
|
|
| task_scores.append( |
| { |
| "submission_id": submission_id, |
| "task_id": task_id, |
| "score": score, |
| "level": level, |
| "agent_answer": agent_answer, |
| |
| } |
| ) |
|
|
| return task_scores |