import json
import os
import random
import shutil

from datasets import load_dataset
from tqdm import tqdm
| |
|
# Seed the global RNG so the validation subset sampled in
# create_val_dataset (random.shuffle) is reproducible across runs.
random.seed(1234)
# Default number of validation examples to keep per split.
VAL_NUM = 5000
| |
|
| |
|
def create_r1_train_dataset(
    valid_pair_json,
    data_dir,
    img_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/trainA/",
):
    """Build an R1-style HF ``imagefolder`` training dataset from filtered pairs.

    Each line of *valid_pair_json* is a JSON object with keys ``img_filename``,
    ``q``, ``a`` and ``r1_response``.  The ``<think>...</think>`` span of the
    response becomes the ``thinking`` field; pairs whose response lacks that
    span are skipped.  Referenced images are copied into *data_dir* under an
    index-suffixed name, a ``metadata.jsonl`` manifest is written, and the
    folder is loaded with ``datasets.load_dataset("imagefolder", ...)``.

    Args:
        valid_pair_json: Path to a JSONL file of valid (q, a, response) pairs.
        data_dir: Output directory (created if missing) for images + metadata.
        img_dir: Source directory holding the referenced images.

    Returns:
        The ``train`` split of the resulting ``imagefolder`` dataset.
    """
    os.makedirs(data_dir, exist_ok=True)
    # Context manager instead of a bare open() so the handle is closed.
    with open(valid_pair_json, "r") as f:
        pairs = [json.loads(line) for line in f]

    mapped_pairs = []
    for idx, pair in tqdm(enumerate(pairs)):
        img_filename = pair["img_filename"]
        new_pair = {}
        try:
            new_pair["thinking"] = (
                pair["r1_response"]
                .split("<think>")[1]
                .split("</think>")[0]
                .replace("scene description", "image")
            )
        except IndexError:
            # Narrowed from `except Exception`: only the [1] index on a
            # response missing "<think>" can fail here.
            print("Error processing pair response: ", pair["r1_response"])
            continue

        # Suffix with the pair index so the same image used by several
        # questions gets a unique file name inside data_dir.
        stem, ext = img_filename.rsplit(".", 1)
        dataset_filename = f"{stem}_{idx}.{ext}"
        dst = os.path.join(data_dir, dataset_filename)
        # BUG FIX: the original tested data_dir/img_filename — a path that is
        # never created — so the copy ran unconditionally; test the actual
        # destination instead.
        if not os.path.exists(dst):
            # shutil.copy instead of os.system("cp ..."): portable and safe
            # with spaces or shell metacharacters in the paths.
            shutil.copy(os.path.join(img_dir, img_filename), dst)

        new_pair["problem"] = pair["q"]
        new_pair["thinking"] = "<think>" + new_pair["thinking"] + "</think>"
        new_pair["solution"] = f"<answer> {pair['a']} </answer>"
        new_pair["file_name"] = dataset_filename
        mapped_pairs.append(new_pair)

    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for pair in mapped_pairs:
            f.write(json.dumps(pair) + "\n")

    train_dataset = load_dataset(
        "imagefolder",
        data_dir=data_dir,
        split="train",
    )
    return train_dataset
| |
|
| |
|
def create_val_dataset(
    json_file,
    data_dir,
    val_num=VAL_NUM,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB",
):
    """Sample a validation split and materialize it as an HF ``imagefolder``.

    Loads the (q, a, img_filename) records from *json_file*, shuffles them
    (reproducibly, given the module-level seed), keeps the first *val_num*,
    copies each referenced image into *data_dir* under an index-suffixed
    name, writes ``metadata.jsonl``, and loads the folder as a dataset.

    Args:
        json_file: Path to a JSON array of counting problems.
        data_dir: Output directory (created if missing).
        val_num: Number of examples to keep after shuffling.
        image_dir: Source directory holding the referenced images.

    Returns:
        The ``train`` split of the resulting ``imagefolder`` dataset
        (imagefolder exposes a single split regardless of its role here).
    """
    os.makedirs(data_dir, exist_ok=True)
    # Context manager so the file handle is not leaked.
    with open(json_file) as f:
        val = json.load(f)
    random.shuffle(val)
    val = val[:val_num]

    val_pairs = []
    for idx, pair in tqdm(enumerate(val)):
        img_filename = pair["img_filename"]
        # Index suffix keeps file names unique when an image repeats.
        stem, ext = img_filename.rsplit(".", 1)
        val_filename = f"{stem}_{idx}.{ext}"
        dst = os.path.join(data_dir, val_filename)
        # BUG FIX: the original tested data_dir/img_filename — a path that is
        # never created — so the copy ran unconditionally; test the actual
        # destination. shutil.copy replaces the shell `cp` for portability
        # and safety with unusual path characters.
        if not os.path.exists(dst):
            shutil.copy(os.path.join(image_dir, img_filename), dst)

        val_pairs.append(
            {
                "problem": pair["q"],
                "solution": f"<answer> {pair['a']} </answer>",
                "file_name": val_filename,
            }
        )

    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for pair in val_pairs:
            f.write(json.dumps(pair) + "\n")

    val_dataset = load_dataset("imagefolder", data_dir=data_dir, split="train")
    return val_dataset
| |
|
| |
|
| | |
# NOTE(review): all input paths below are hard-coded to one machine's layout —
# consider promoting them to CLI arguments.
VALA_DATA_DIR = "data/Clevr_CoGenT_ValA"
VALB_DATA_DIR = "data/Clevr_CoGenT_ValB"
TRAIN_DATADIR = "data/Clevr_CoGenT_TrainA_R1"
valA_json = (
    "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valA.json"
)
valB_json = (
    "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valB.json"
)


def main():
    """Build the CoGenT train/val datasets and push all three to the HF Hub."""
    train_dataset = create_r1_train_dataset(
        "/home/lilei/Visual-R1/filter_results_v2/valid_pairs.jsonl",
        TRAIN_DATADIR,
    )
    valA_dataset = create_val_dataset(
        valA_json,
        VALA_DATA_DIR,
        image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valA",
    )
    valB_dataset = create_val_dataset(
        valB_json,
        VALB_DATA_DIR,
        image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB",
    )
    valA_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValA")
    valB_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValB")
    train_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_TrainA_R1")


# Guard the entry point so importing this module no longer triggers dataset
# builds and three Hub uploads as a side effect.
if __name__ == "__main__":
    main()
| |
|