starriver030515
/

diffusion

Model card Files Files and versions

diffusion / utils /split_dataset.py

starriver030515's picture

starriver030515

Upload folder using huggingface_hub

a501a0c verified over 1 year ago

history blame contribute delete

937 Bytes

	import json
	import os
	import math

	# Source annotation file: expected to hold one JSON array of annotation records.
	anno_json_path = (
	    "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/llava_gen_558k.json"
	)
	# Number of output shards to produce.
	num_parts = 8
	# Directory that receives the shards; resolved relative to the current
	# working directory, not to this file.
	anno_output_dir = "../annotations/"


	def split_annotations(annotations, parts):
	    """Split *annotations* into *parts* contiguous, near-equal slices.

	    The first ``len(annotations) % parts`` slices receive one extra item, so
	    slice sizes never differ by more than one and no slice is left empty
	    unless there are fewer items than parts. Order is preserved:
	    concatenating the returned slices reproduces the input.

	    Args:
	        annotations: A sliceable sequence (e.g. the list loaded from JSON).
	        parts: How many slices to produce; must be at least 1.

	    Returns:
	        A list of exactly *parts* slices of *annotations*.

	    Raises:
	        ValueError: If *parts* is smaller than 1.
	    """
	    if parts < 1:
	        raise ValueError(f"parts must be >= 1, got {parts}")

	    base_size, remainder = divmod(len(annotations), parts)
	    slices = []
	    start = 0
	    for index in range(parts):
	        # The first `remainder` slices absorb the leftover items one by one.
	        end = start + base_size + (1 if index < remainder else 0)
	        slices.append(annotations[start:end])
	        start = end
	    return slices


	def main():
	    """Load the annotation file, shard it, and write one JSON file per shard.

	    Shards are written to ``anno_output_dir`` as ``annotations_part_<k>.json``
	    with ``k`` starting at 1. The size of each shard is printed as it is
	    written, followed by a final confirmation message.
	    """
	    # JSON is UTF-8 by specification; do not depend on the locale default.
	    with open(anno_json_path, "r", encoding="utf-8") as f:
	        annotation_data = json.load(f)

	    # exist_ok avoids the check-then-create race of a separate exists() test.
	    os.makedirs(anno_output_dir, exist_ok=True)

	    shards = split_annotations(annotation_data, num_parts)
	    for part_no, annotations_subset in enumerate(shards, start=1):
	        part_anno_json_path = os.path.join(
	            anno_output_dir, f"annotations_part_{part_no}.json"
	        )
	        with open(part_anno_json_path, "w", encoding="utf-8") as f:
	            json.dump(annotations_subset, f)
	        print(len(annotations_subset))

	    # Report the configured shard count rather than a hard-coded number.
	    print(f"标注已成功分成{num_parts}份，并保存到文件夹中。")


	if __name__ == "__main__":
	    main()