import argparse
import csv
import datetime
import json
import os
import random
import re

import pandas as pd
from huggingface_hub import hf_hub_download
from openai import OpenAI

def gpt_4o_useful(prompt):
    """Query gpt-4o and return the full response text plus the first output
    token and its log-probability (used downstream as a confidence score)."""
    client = OpenAI(api_key=os.environ.get("OAI"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            }
        ],
        response_format={"type": "text"},
        temperature=1e-10,  # effectively greedy decoding
        max_tokens=4096,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
        logprobs=True,
    )

    text = response.choices[0].message.content

    if response.choices[0].logprobs and response.choices[0].logprobs.content:
        first_token_logprob = response.choices[0].logprobs.content[0]
        token = first_token_logprob.token
        logprob = first_token_logprob.logprob
    else:
        token = None
        logprob = None

    return text, token, logprob
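

# Usage sketch for gpt_4o_useful (assumes the OAI env var holds a valid OpenAI
# API key; the example prompt is hypothetical):
#   text, token, logprob = gpt_4o_useful("Answer Yes or No: is water wet?")
#   # `token` is the first generated token (e.g. "Yes") and `logprob` its
#   # log-probability, which the calling code treats as a confidence score.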


def get_ICL(data, top_k=None):
    """Render training examples as an in-context-learning prompt block."""
    ICL = ""
    if top_k is not None:
        data = data[:top_k]
    for line in data:
        pledge = line["pledge"]
        event = line["event_description"]
        time = line["event_date"]
        example_input = f"Pledge: {pledge}\nEvent Summary: {event} (Event Date: {time})\nIs this event summary useful to track the fulfilment of this pledge"
        example_input = example_input.strip()
        output = line["label"].strip()
        ICL = f"{ICL}Input: {example_input}\nOutput: {output}\n\n"
    return ICL
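

# Each example rendered by get_ICL has the form (placeholder values):
#   Input: Pledge: <pledge>
#   Event Summary: <event> (Event Date: <date>)
#   Is this event summary useful to track the fulfilment of this pledge
#   Output: <label>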


def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


def gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=None):
    """Label one test instance with gpt-4o, using either pledge-specific or
    shuffled in-context examples."""
    if suggestion_meta:
        # Restrict the ICL examples to the suggested pledge.
        train_data = [line for line in train_data if str(line.get("pledge_id")) == str(ICL_id)]
    else:
        random.seed(42)
        random.shuffle(train_data)

    ICL = get_ICL(train_data, top_k=50)

    prompt = f"{instruction}\nBelow are examples:\n\n{ICL}Now, please assign a label for the below instance.\nInput: {test_instance}\nOutput:"

    try:
        text, tokens, logprobs = gpt_4o_useful(prompt)
    except Exception as e:
        print(e)
        tokens = None
        logprobs = None

    return tokens, logprobs
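

# Note on gpt_eval's return value: `tokens` is the model's first output token
# (the predicted label) and `logprobs` its log-probability; both are None when
# the API call fails.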


def extract_columns_to_dict(file_path, delimiter='\t'):
    """Map the third column of a delimited file to its fourth column
    (0-indexed: row[2] -> row[3])."""
    data_dict = {}
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=delimiter)
        for row in reader:
            if len(row) >= 4:
                key = row[2]
                value = row[3]
                data_dict[key] = value
    return data_dict
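

# Usage sketch for extract_columns_to_dict (the filename is hypothetical):
#   mapping = extract_columns_to_dict("annotations.tsv")
#   # rows with fewer than four columns are skipped; duplicate keys keep the last value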


def parse_date(date_str):
    """Parse an absolute, relative, or coarse-grained date string.

    Returns (datetime, original_string), or (None, original_string) if the
    string cannot be parsed.
    """
    if not date_str:
        return None, date_str
    date_str = date_str.strip()

    # Absolute ISO date, e.g. "2024-03-15".
    try:
        return datetime.datetime.strptime(date_str, "%Y-%m-%d"), date_str
    except ValueError:
        pass

    # Relative expression, e.g. "last week (relative to 2024-03-15)".
    match = re.search(r'(.*) \(relative to (\d{4}-\d{2}-\d{2})\)', date_str)
    if match:
        reference = datetime.datetime.strptime(match.group(2), "%Y-%m-%d")
        relative_term = match.group(1).strip().lower()
        if relative_term == "last month":
            target_date = reference - datetime.timedelta(days=30)
        elif relative_term == "yesterday":
            target_date = reference - datetime.timedelta(days=1)
        elif relative_term == "last week":
            target_date = reference - datetime.timedelta(days=7)
        elif relative_term == "this week":
            target_date = reference
        else:
            return None, date_str
        return target_date, date_str

    # Bare year, e.g. "2024" -> January 1st of that year.
    match = re.fullmatch(r'(\d{4})', date_str)
    if match:
        year = int(match.group(1))
        return datetime.datetime(year, 1, 1), date_str

    # Month and year, e.g. "March 2024" -> first day of the month.
    match = re.fullmatch(r'(\w+) (\d{4})', date_str)
    if match:
        try:
            target_date = datetime.datetime.strptime(date_str, "%B %Y")
            return target_date, date_str
        except ValueError:
            return None, date_str

    # Quarter, e.g. "2024-Q2" -> first month of the quarter.
    match = re.fullmatch(r'(\d{4})-Q(\d)', date_str)
    if match:
        year, quarter = int(match.group(1)), int(match.group(2))
        month = (quarter - 1) * 3 + 1
        return datetime.datetime(year, month, 1), date_str

    # Season, e.g. "2024 Summer" -> first month of the season.
    match = re.fullmatch(r'(\d{4}) (Spring|Summer|Autumn|Fall|Winter)', date_str, re.IGNORECASE)
    if match:
        year = int(match.group(1))
        season_map = {"spring": 3, "summer": 6, "autumn": 9, "fall": 9, "winter": 12}
        month = season_map[match.group(2).lower()]
        return datetime.datetime(year, month, 1), date_str

    return None, date_str
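

# Example behaviour of parse_date, derived from the branches above:
#   parse_date("2024-03-15")                         -> (datetime(2024, 3, 15, 0, 0), "2024-03-15")
#   parse_date("last week (relative to 2024-03-15)") -> (datetime(2024, 3, 8, 0, 0), original string)
#   parse_date("2024-Q3")                            -> (datetime(2024, 7, 1, 0, 0), "2024-Q3")
#   parse_date("March 2024")                         -> (datetime(2024, 3, 1, 0, 0), "March 2024")
#   parse_date("sometime soon")                      -> (None, "sometime soon")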


def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggestion_meta):
    """Label extracted events for a pledge with gpt-4o and sort them by event
    date, newest first."""
    events = []
    pledge = claim.strip()

    file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
    gpt4_results_json = load_json(file_path)

    # In-context examples for the usefulness classifier.
    train_file_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="train_useful.json",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"],
    )
    with open(train_file_path, "r", encoding="utf-8") as f:
        train_data = json.load(f)

    instruction_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="instruction.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"],
    )
    with open(instruction_path, "r") as f:
        instruction = f.read()

    # Mapping from suggestion index to pledge_id, one integer per line.
    map_file_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="mapping.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"],
    )
    with open(map_file_path, "r") as f:
        mapping_f = f.readlines()
    mapping = {}
    for map_id, line in enumerate(mapping_f):
        mapping[map_id] = int(line.strip())

    ICL_id = None
    if suggestion_meta:
        try:
            idx = int(suggestion_meta["index"])
            ICL_id = mapping.get(idx)
            print(f"[Suggestion] index: {idx} → pledge_id: {ICL_id}")
        except Exception as e:
            print(f"[Mapping error]: {e}")

    for doc in gpt4_results_json:
        meta_date = doc["date"]  # publication date of the source document
        for event in doc.get("output", {}).get("events", []):
            parsed_date, original_date = parse_date(event["date"])

            if parsed_date:
                parsed_date_str = parsed_date.strftime("%Y-%m-%d")
                if parsed_date_str != meta_date:
                    event_date_and_pub_date = f"{parsed_date_str} ({meta_date})"
                else:
                    event_date_and_pub_date = parsed_date_str

                test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful to track the fulfilment of this pledge"

                label, score = gpt_eval(test_instance, train_data, instruction,
                                        suggestion_meta, ICL_id=ICL_id)

                events.append({
                    "date": original_date,
                    "event date (publication date if different)": event_date_and_pub_date,
                    "event": event["event"],
                    "url": doc["url"],
                    "label": label,
                    "confident": score,
                })

    # Only events with a parseable date are appended, so this sort is safe.
    events.sort(key=lambda x: parse_date(x["date"])[0], reverse=True)
    return events
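

# Usage sketch for extract_and_sort_events (argument values are hypothetical;
# requires the OAI and HF_TOKEN environment variables):
#   events = extract_and_sort_events(
#       data_dir="data/run_0",
#       pledge_date="2024-07-04",
#       pledge_author="Some Speaker",
#       claim="We will build 1.5 million new homes.",
#       suggestion_meta=None,
#   )
#   # each entry: {"date", "event date (publication date if different)",
#   #              "event", "url", "label", "confident"}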