Spaces:

OpenHandsCommunity
/

evaluation

Running

App Files Files Community

evaluation / utils /mint.py

xingyaoww

Create visualization for MINT benchmark & upload results (#2)

054cb87 verified almost 2 years ago

raw

history blame

3.48 kB

	import json
	import re
	import os

	import pandas as pd
	from glob import glob
	import streamlit as st


	def parse_filepath(filepath: str):
	splited = (
	filepath.removeprefix('outputs/')
	.removesuffix('output.jsonl')
	.removesuffix('output.merged.jsonl')
	.strip('/')
	.split('/')
	)

	metadata_path = os.path.join(os.path.dirname(filepath), 'metadata.json')
	with open(metadata_path, 'r') as f:
	metadata = json.load(f)
	try:
	benchmark = splited[0]
	agent_name = splited[1]
	subset = splited[3]
	# gpt-4-turbo-2024-04-09_maxiter_50(optional)_N_XXX
	# use regex to match the model name & maxiter
	matched = re.match(r'(.+)_maxiter_(\d+)(_.+)?', splited[2])
	model_name = matched.group(1)
	maxiter = matched.group(2)
	note = ''
	if matched.group(3):
	note += matched.group(3).removeprefix('_N_')
	assert len(splited) == 4

	return {
	'benchmark': benchmark,
	'subset': subset,
	'agent_name': agent_name,
	'model_name': model_name,
	'maxiter': maxiter,
	'note': note,
	'filepath': filepath,
	**metadata,
	}
	except Exception as e:
	st.write([filepath, e, splited])


	def load_filepaths():
	# FIXME:
	# glob_pattern = 'outputs/**/output.merged.jsonl'
	glob_pattern = 'outputs/mint/**/output.jsonl'
	filepaths = list(set(glob(glob_pattern, recursive=True)))
	filepaths = pd.DataFrame(list(map(parse_filepath, filepaths)))
	filepaths = filepaths.sort_values(
	[
	'benchmark',
	'subset',
	'agent_name',
	'model_name',
	'maxiter',
	]
	)
	st.write(f'Matching glob pattern: `{glob_pattern}`. {len(filepaths)} files found.')
	return filepaths


	def load_df_from_selected_filepaths(select_filepaths):
	data = []
	if isinstance(select_filepaths, str):
	select_filepaths = [select_filepaths]
	for filepath in select_filepaths:
	with open(filepath, 'r') as f:
	for line in f.readlines():
	d = json.loads(line)
	# # clear out git patch
	# if 'git_patch' in d:
	# d['git_patch'] = clean_git_patch(d['git_patch'])
	# d['history'] = reformat_history(d['history'])
	d['task_name'] = filepath.split('/')[-2]
	data.append(d)
	df = pd.DataFrame(data)
	return df


	def agg_stats(data):
	stats = []

	for idx, entry in enumerate(data):
	# if len(entry["state"]["history"]) % 2 != 0: continue
	task = {
	k: v for k, v in entry.items() if k not in ["state", "test_result"]
	}
	# if "metadata" in task:
	# for k, v in task["metadata"].items():
	# task[k] = v
	# del task["metadata"]

	stats.append(
	{
	"idx": idx,
	"success": entry["test_result"],
	"task_name": entry["task_name"],
	# TODO: add `task_name` after merging all subtasks
	# "n_turns": len(entry["state"]["history"]) // 2,
	# "terminate_reason": entry["state"]["terminate_reason"],
	# "agent_action_count": entry["state"]["agent_action_count"],
	# **task,
	}
	)
	return pd.DataFrame(stats)