| """ |
| Code copied from AGXNet: |
| https://github.com/batmanlab/AGXNet |
| """ |
|
|
| import argparse |
| import pandas as pd |
| import json |
| from tqdm import tqdm |
| import nltk |
|
|
|
|
# Command-line interface: where to read the raw RadGraph JSON and where to
# write the flattened CSV. Defaults are placeholder paths to be overridden.
parser = argparse.ArgumentParser(description="Itemize RadGraph Dataset.")
parser.add_argument("--data-path", default="/PATH TO RADGRAPH DATA/RadGraph/physionet.org/files/radgraph/1.0.0/MIMIC-CXR_graphs.json", help="RadGraph data path.")
parser.add_argument("--output-path", default="/PROJECT DIR/preprocessing/mimic-cxr-radgraph-itemized.csv", help="Output path for itemized RadGraph data.")
|
|
|
|
def get_ids(key):
    """Split a RadGraph record key into (partition, patient id, study id).

    Keys look like ``partition/pXXXX/sYYYY.txt``: the leading ``p``/``s``
    prefixes and the file extension are stripped from the id segments.
    """
    parts = key.split("/")
    # parts[1] is "p<pid>", parts[2] is "s<sid>.<ext>"; drop prefix letters.
    return parts[0], parts[1][1:], parts[2].split(".", 1)[0][1:]
|
|
|
|
def get_sen_from_token_ix(text, ix):
    """Return (sentence index, sentence text) for the token at index *ix*.

    Tokenizes *text* into sentences, then words, counting tokens in order
    so that each global token index maps to its enclosing sentence.
    NOTE(review): assumes nltk's tokenization reproduces the token indexing
    used by RadGraph's ``start_ix`` — confirm against the dataset.
    Raises KeyError if *ix* is beyond the last token.
    """
    sentences = nltk.sent_tokenize(text)
    token_to_sentence = {}
    token_ix = 0
    for sen_ix, sentence in enumerate(sentences):
        for _word in nltk.word_tokenize(sentence):
            token_to_sentence[token_ix] = sen_ix
            token_ix += 1
    target = token_to_sentence[ix]
    return target, sentences[target]
|
|
|
|
def get_entity_relation(value):
    """Flatten one report's entity/relation graph into a DataFrame.

    Emits one row per (entity, relation) pair; entities with no relations
    (or a placeholder ``None`` first relation) produce a single row with
    ``relation``/``target`` set to None. Each row also records the sentence
    (and its index) that contains the entity's start token.
    """
    columns = [
        "source",
        "token",
        "token_ix",
        "label",
        "relation",
        "target",
        "sentence_ix",
        "sentence",
    ]
    rows = []
    text = value["text"]

    for entity_id, entity in value["entities"].items():
        sen_ix, sen = get_sen_from_token_ix(text, entity["start_ix"])
        # Fields shared by every row emitted for this entity.
        base = {
            "source": entity_id,
            "token": entity["tokens"],
            "token_ix": entity["start_ix"],
            "label": entity["label"],
            "relation": None,
            "target": None,
            "sentence_ix": sen_ix,
            "sentence": sen,
        }
        relations = entity["relations"]
        if not relations or relations[0] is None:
            rows.append(base)
        else:
            for rel in relations:
                row = dict(base)
                row["relation"] = rel[0]
                row["target"] = rel[1]
                rows.append(row)

    # Explicit column list keeps the original column order even when
    # there are no rows.
    return pd.DataFrame(rows, columns=columns)
|
|
|
|
def radgraph_itemize(args):
    """Convert nested RadGraph data to itemized examples.

    Reads the RadGraph JSON at ``args.data_path``, flattens each report's
    entity/relation graph via ``get_entity_relation``, tags the rows with
    the report's subject and study ids, and writes the concatenated result
    to ``args.output_path`` as CSV.
    """

    print("Loading RadGraph data...")
    # Context manager replaces the original bare open() that leaked the
    # file handle.
    with open(args.data_path) as f:
        data = json.load(f)
    print("RadGraph data is loaded.")

    # The original also accumulated pid/sid/text lists that were never
    # used; they are dropped here to save memory on the large dataset.
    df_lst = []
    print("Itemizing RadGraph data...")
    for key, value in tqdm(data.items()):
        _, pid, sid = get_ids(key)
        df = get_entity_relation(value)
        df["subject_id"] = pid
        df["study_id"] = sid
        df_lst.append(df)

    df_itemized = pd.concat(df_lst)

    df_itemized.to_csv(args.output_path, index=False)
    print("Outputs have been saved!")
|
|
|
|
if __name__ == "__main__":
    # Parse CLI flags and run the itemization pipeline.
    radgraph_itemize(parser.parse_args())
|
|