import sys
import os
import json
import re
import openai
import time
import tiktoken
# ---- command-line configuration -------------------------------------------
# Usage: script.py <input_json> <model_id> <api_key> <output_json> <prompt_json>
input_data = sys.argv[1]      # path to the conversation dataset (JSON)
openai_modelid = sys.argv[2]  # OpenAI chat model id
openai.api_key = sys.argv[3]  # NOTE(review): prefer an env var over argv for secrets
output_path = sys.argv[4]     # where the per-item ratings JSON is written
prompt_path = sys.argv[5]     # JSON file holding the judge system prompt

# Tokenizer matching the judge model; used below to budget prompt length.
encoding = tiktoken.encoding_for_model(openai_modelid)

# Load the judge prompt template. Use a context manager so the file handle
# is closed deterministically (the original leaked it via json.load(open(...))).
with open(prompt_path, "r", encoding="utf-8") as prompt_file:
    prompts = json.load(prompt_file)
judge_prompt_raw = prompts["judge"]["system"]
def gen_model_output(input_qs):
    """Send *input_qs* to the judge model as a system message and return its reply.

    The prompt is truncated from the front (keeping the most recent words) so
    that it roughly fits a 4096-token context window, using this prompt's own
    word-to-token ratio as the estimate. The API call is retried up to 5 times
    with a 5-second pause; on total failure a sentinel string is returned
    instead of raising.
    """
    token_count = len(encoding.encode(input_qs))
    words = input_qs.split(" ")
    # Guard against an empty prompt: encoding.encode("") yields 0 tokens and
    # the ratio below would raise ZeroDivisionError.
    if token_count > 0:
        qs_w_t_ratio = len(words) / token_count      # approx. words per token
        max_word_num = int(4096 * qs_w_t_ratio)      # word budget for ~4096 tokens
        input_qs = " ".join(words[-max_word_num:])   # keep the tail of the prompt
    messages = [{"role": "system", "content": input_qs}]
    chat = None
    for _ in range(5):
        try:
            chat = openai.ChatCompletion.create(
                model=openai_modelid, messages=messages
            )
            break
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # propagate; transient API errors are retried after a short pause.
            time.sleep(5)
    if chat is None:
        return "Cannot generate output."
    return chat.choices[0].message.content
# ---- run the LLM judge over every conversation -----------------------------
with open(input_data, "r", encoding="utf-8") as data_file:
    data = json.load(data_file)

output_ratings = []
for d in data:
    print("=" * 20 + "Processing: " + d["id"] + "=" * 20)
    # Rebuild only the history turns referenced by the last question's
    # "turn-info" field.
    judge_conversation = []
    d_conversations = d['conversations']
    last_q = d_conversations[-2]  # final user question (last entry is the bot reply)
    turn_infos = last_q["turn-info"].split("-")
    r_turns = [turn_infos[0] + "-" + turn_infos[1]]
    if len(turn_infos) == 5:
        # A 5-part turn-info references a second earlier turn as well
        # (presumably the "conjunction" question type — TODO confirm).
        r_turns.append(turn_infos[2] + "-" + turn_infos[3])
    for l_i in range(len(d_conversations) // 2 - 1):
        # Conversations alternate user/bot; compare each user turn's id
        # (turn-info minus its 2-char suffix) against the referenced turns.
        if d_conversations[l_i * 2]["turn-info"][:-2] in r_turns:
            judge_conversation.append("user: " + d_conversations[l_i * 2]["value"])
            judge_conversation.append("bot: " + d_conversations[l_i * 2 + 1]["value"])
    # Fill the template placeholders: RCH_0 = relevant chat history,
    # UQ_1 = the user question, BR_2 = the bot response under evaluation.
    judge_prompt = (
        judge_prompt_raw
        .replace("RCH_0", "\n".join(judge_conversation))
        .replace("UQ_1", "user: " + last_q["value"])
        .replace("BR_2", "bot: " + d_conversations[-1]["value"])
    )
    print(judge_prompt)
    print('-' * 20)
    outputs = gen_model_output(judge_prompt)
    print(outputs)
    print("=" * 20 + "Processed: " + d["id"] + "=" * 20)
    # The judge is expected to emit its score as "[[N]]"; an explicit None
    # check replaces the original bare `except:` around int(match.group(1)).
    match = re.search(r'\[\[(\d+)\]\]', outputs)
    rating = int(match.group(1)) if match is not None else None
    output_ratings.append({
        "id": d["id"],
        "type": d["type"],
        "judge_prompt": judge_prompt,
        "evaluation": outputs,
        "rating": rating
    })

# Persist per-item ratings; the context manager closes (and flushes) the
# handle instead of leaking it via json.dump(..., open(...)).
with open(output_path, "w", encoding="utf-8") as out_file:
    json.dump(output_ratings, out_file, indent=2)
# ---- aggregate scores per question type ------------------------------------
count = {
    "continuation": [],
    "retrospection": [],
    "conjunction": []
}
for d in output_ratings:
    # Skip items whose rating could not be parsed (rating is None); the
    # original would raise TypeError inside sum() on such entries.
    if d["rating"] is None:
        continue
    if d["type"] in count:
        count[d["type"]].append(d["rating"])

def _avg(ratings):
    """Mean rounded to 2 decimals; 0.0 for an empty category instead of
    crashing with ZeroDivisionError."""
    return round(sum(ratings) / len(ratings), 2) if ratings else 0.0

all_ratings = count["continuation"] + count["retrospection"] + count["conjunction"]
print("Retrospection Score: {}, Continuation Score: {}, Conjunction Score: {}, Overall Score: {} of file {}".format(
    _avg(count["retrospection"]),
    _avg(count["continuation"]),
    _avg(count["conjunction"]),
    _avg(all_ratings),
    input_data
))
|