diff --git a/AbdomenAtlas/config_format.json b/AbdomenAtlas/config_format.json new file mode 100644 index 0000000000000000000000000000000000000000..01e9febcb5f7b8c2946d49b246139faf6e8272b1 --- /dev/null +++ b/AbdomenAtlas/config_format.json @@ -0,0 +1,125 @@ +{ + "Modality": { + "type": "option", + "required": true, + "options": [ + "CT", + "MRI", + "T1", + "T2", + "X-ray", + "Fluoroscopy", + "US", + "PET" + ] + }, + "OriImg_path": { + "type": "string", + "required": true + }, + "Label_path": { + "type": "dict", + "required": false, + "keys": [ + "classification", + "segmentation", + "regression", + "detection", + "localization", + "registration", + "other" + ], + "value": { + "type": "dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/AbdomenAtlas/dataclean_abdomen_atlas.py 
b/AbdomenAtlas/dataclean_abdomen_atlas.py new file mode 100644 index 0000000000000000000000000000000000000000..f396a9eede763a007edd04ab8eb645138085c046 --- /dev/null +++ b/AbdomenAtlas/dataclean_abdomen_atlas.py @@ -0,0 +1,415 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-8-18 +update AbdomenAtlas3.0 data clean + +https://arxiv.org/pdf/2407.16697 +https://zhuanlan.zhihu.com/p/19339643417 + +AbdomenAtlas 3.0 是目前公开的最大规模腹部 CT 图像-文本配对数据集,旨在解决医学影像中的肿瘤检测与报告生成难题。 +该数据集包含 9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家,并且是首个提供逐像素(per-voxel)标注、详细肿瘤报告以及肿瘤分期信息的公开数据集。 +这些 CT 扫描数据通过标准医学影像格式(NIfTI 和 DICOM)存储,具备体素间距及 HU 值等临床信息。AbdomenAtlas 3.0 整合并重新标注了 17 个公共数据集,经过 12 位放射科医生的审核,共标注了 8,562 个肿瘤实例,其中包括 3,036 个肝脏肿瘤、354 个胰腺肿瘤和 4,239 个肾脏肿瘤。此外,数据集包含 2,947 份肿瘤报告,其中 948 份为早期肿瘤报告(≤2 cm),260 份报告提供了胰腺肿瘤的 T 分期(T1-T4),并首次公开肝脏 8 个亚段和胰腺 3 个亚段的逐像素标注,以及肿瘤与关键血管(如 SMA、CA 等)的接触标注。 +通过 RadGPT 自动生成的结构化和叙述性报告,数据集详细描述了肿瘤大小、形状、位置、体积以及与周围血管和器官的相互作用。这些报告的生成准确性经过验证,在检测小肿瘤(≤2 cm)方面,RadGPT 的敏感性/特异性显著优于现有方法(例如肝脏:80%/73%,胰腺:77%/77%)。数据集还包含 240 份“人类-AI 融合报告”,结合了放射科医生的临床笔记和 AI 的精确量化结果。AbdomenAtlas 3.0 的意义在于,它首次提供了一个全面的腹部 CT 图像-文本配对数据集,填补了公开领域中腹部肿瘤检测数据的空白,并为推动医学影像中的自动化肿瘤检测、分期和报告生成奠定了基础。这一数据集不仅在规模和多样性上领先,还通过结合 AI 和放射科医生的专业知识,提供了高质量的标注和诊断支持,将有助于提升 AI 模型在医学影像分析中的实际临床应用能力。 + +数据集统计信息 +总数据量: +9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家。 +包含 8,562 个肿瘤实例: +肝脏肿瘤:3,036 个实例(929 份报告) +胰腺肿瘤:354 个实例(344 份报告) +肾脏肿瘤:4,239 个实例(1,674 份报告) +6,061 份无肿瘤报告(作为对照组) +小肿瘤(≤2 cm): +943 份小肿瘤相关报告: +肝脏:347 个实例(占肝脏肿瘤的 37.4%) +胰腺:83 个实例(占胰腺肿瘤的 24.1%) +肾脏:466 个实例(占肾脏肿瘤的 27.8%) +肿瘤分期与解剖结构: +260 份胰腺肿瘤分期报告(T1–T4) +提供肝脏 8 个亚段和胰腺 3 个亚段(头、体、尾)的逐像素分割 +标注了肿瘤与关键血管(如 SMA、CA、CHA 等)的接触角度 +图像与文本配对: +1.8M 文本 Token,包含三类报告: +结构化报告:基于模板生成,提供定量信息(如肿瘤体积、位置等) +叙述性报告:通过 LLM 转换,模仿目标医院的报告风格 +人类-AI 融合报告:240 份,结合临床笔记与 AI 生成的内容 + +AbomentAtlas数据集中每个病例里面的segmentions都是包含了25个器官组织的标注文件,同时也包含一个combined_labels.nii.gz的文件【里面加上背景值包含了0-25的数值 +1 aorta +2 gall_bladder +3 kidney_left +4 kidney_right +5 liver +6 pancreas +7 postcava +8 spleen +9 stomach +10 adrenal_gland_left +11 
adrenal_gland_right +12 bladder +13 celiac_trunk +14 colon +15 duodenum +16 esophagus +17 femur_left +18 femur_right +19 hepatic_vessel +20 intestine +21 lung_left +22 lung_right +23 portal_vein_and_splenic_vein +24 prostate +25 rectum + + +参考TotalSegment分别存储25个器官的label处理后的数据文件 +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# print(embeder, input_size, in_size) +# print(tokenizer) + + +# print(embeder_output1) +# print(embeder_output1.shape) # torch.Size([1, 8, 768]) + + +# print(embeder_output2) +# print(embeder_output2.shape) # torch.Size([1, 8, 768]) + + +# error = torch.abs(embeder_output1 - embeder_output2) +# print(error) +# print("Embedding distance between the two sentences: ") +# print(f"String1: {string1}") +# print(f"String2: {string2}") +# print(torch.mean(error)) + + +# exit() + + +# meta_id_name='Patient' +# meta_weeks_name='Weeks' +# meta_fvc_name='FVC' +# meta_percent_name='Percent' +# meta_age_name='Age' +# meta_sex_name='Sex' +# meta_status_name='SmokingStatus' + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = [-1,0] # MRI images threshold placeholder TBC... 
# Maps the integer label value (as a string) found in combined_labels.nii.gz
# to the structure name.
# NOTE(review): "backgroud" looks like a typo of "background"; it is kept
# unchanged in case already-generated metadata relies on this spelling.
LABEL_DICT = {
    "0": "backgroud",
    "1": "aorta",
    "2": "gall_bladder",
    "3": "kidney_left",
    "4": "kidney_right",
    "5": "liver",
    "6": "pancreas",
    "7": "postcava",
    "8": "spleen",
    "9": "stomach",
    "10": "adrenal_gland_left",
    "11": "adrenal_gland_right",
    "12": "bladder",
    "13": "celiac_trunk",
    "14": "colon",
    "15": "duodenum",
    "16": "esophagus",
    "17": "femur_left",
    "18": "femur_right",
    "19": "hepatic_vessel",
    "20": "intestine",
    "21": "lung_left",
    "22": "lung_right",
    "23": "portal_vein_and_splenic_vein",
    "24": "prostate",
    "25": "rectum",
}


def find_metadata_files(path):
    """Return the ``*.csv`` files that sit directly inside *path*, sorted.

    The search is not recursive.  *path* is glob-escaped so directory names
    containing ``[``, ``]``, ``*`` or ``?`` are matched literally.
    """
    search_pattern = os.path.join(glob.escape(path), '*.csv')
    return sorted(glob.glob(search_pattern))


def find_image_dirs(path):
    """Return the names of all entries (files and directories) in *path*, sorted.

    Callers are expected to filter the result down to case directories.
    """
    return sorted(os.listdir(path))


def load_dicom_images(folder_path):
    """Read the DICOM series stored in *folder_path*.

    Returns:
        ``(dicom_names, image)``: the series file names reported by GDCM and
        the image assembled from them.

    Raises:
        RuntimeError: if no DICOM series is found in *folder_path* (the same
            exception type the callers in this file catch around image
            loading).
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    if not dicom_names:
        raise RuntimeError(f"No DICOM series found in {folder_path}")
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names, image


def load_dicom_tag(imgs):
    """Read the single image file *imgs* and return the resulting image.

    The returned object is what callers query with ``GetMetaData``.  The
    pixel data is loaded as well, because the reader is executed; a separate
    ``ReadImageInformation`` call beforehand would only read the header twice.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    return reader.Execute()


def load_nrrd(fp):
    """Read the image file *fp* with ``sitk.ReadImage`` and return it."""
    return sitk.ReadImage(fp)


def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, creating the parent directory if needed.

    The source location *folder_path* is attached to the image under the
    ``FolderPath`` metadata key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against another process creating it in between.
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a Linux-style one.

    Backslashes become forward slashes and a leading drive letter (``C:``)
    is removed.  Only a drive prefix is stripped: a colon that appears later
    in the path is part of a file or directory name and is left alone.
    """
    linux_path = windows_path.replace('\\', '/')
    has_drive = (
        len(linux_path) >= 2
        and linux_path[1] == ':'
        and 'a' <= linux_path[0].lower() <= 'z'
    )
    if has_drive:
        linux_path = linux_path[2:]
    return linux_path


def _convert_case_labels(label_files, output_dir, uid, image_size, spacing_info, size, failed_files):
    """Resample each mask in *label_files* and save it under the case's task folder.

    Returns a ``{tissue_name: saved_path}`` dict.  A mask whose resampled
    size differs from *image_size* is appended to *failed_files* and skipped.
    """
    label_path_dict = {}
    for lf in label_files:
        lf_tissue = os.path.basename(lf).replace(".nii.gz", "")
        label_image = load_nrrd(lf)
        # Nearest-neighbour keeps label values intact while resampling.
        resampler = util.get_unisize_resampler(
            label_image, interpolator='nearest', spacing=spacing_info, size=size)
        proces_label = resampler.Execute(label_image) if resampler is not None else label_image

        # Explicit comparison instead of `assert`, which is removed under -O.
        if proces_label.GetSize() != image_size:
            failed_files.append(lf)
            continue

        label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{lf_tissue}.nii.gz")
        label_path_dict[lf_tissue] = label_output_path
        util.save_nifti(proces_label, label_output_path, lf)
        print(f"Saved Label Segment NIfTI file to {label_output_path}")
    return label_path_dict


def main(target_path, output_dir):
    """Convert every ``BDMAP_*`` case under *target_path* and record its metadata.

    For each case directory that contains ``ct.nii.gz`` the CT volume is
    resampled, clamped to ``CLAMP_RANGE_CT`` and saved to
    ``<output_dir>/<case>/<case>.nii.gz``; every mask in the case's
    ``segmentations`` folder is resampled and saved next to it.  After each
    case the metadata mapping is written to ``nifti_mappings.json`` so an
    interrupted run keeps its progress.  Inputs that could not be processed
    are listed in ``failed_files.json``.

    Args:
        target_path: directory holding the ``BDMAP_*`` case directories.
        output_dir: directory receiving the converted files and JSON reports.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Load earlier results once instead of re-parsing the file for every case.
    if os.path.exists(json_output_path):
        with open(json_output_path, 'r') as json_file:
            mappings = json.load(json_file)
    else:
        mappings = {}
        with open(json_output_path, 'w') as json_file:
            json.dump(mappings, json_file)

    if not pid_dirs:
        print(f"No case directories found in {target_path}.")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        case_dir = os.path.join(target_path, pid_dir)
        if not (os.path.isdir(case_dir) and pid_dir.startswith("BDMAP_")):
            continue
        full_path = os.path.join(case_dir, "ct.nii.gz")
        if not os.path.isfile(full_path):
            continue

        # Only the presence of the per-case CSV is recorded; it is not parsed.
        meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
        uid = pid_dir
        modality = "CT"
        study = 'AbdomenAtlas'  # Dataset_name
        CIA_other_info = {
            'Study_UID': uid,
            'metadata_file': meta_file if os.path.isfile(meta_file) else '',
            'split': "train",
        }

        try:
            print(full_path)
            dicom_image = util.load_nifti(full_path)
            spacing_info = dicom_image.GetSpacing()
            print('SPACING INFO:', spacing_info)

            size = list(dicom_image.GetSize())
            resampler = util.get_unisize_resampler(
                dicom_image, interpolator='linear', spacing=spacing_info, size=size)
            if resampler is not None:
                proces_image = resampler.Execute(dicom_image)
                print('SPACIE INFO AFTER', proces_image.GetSpacing())
            else:
                proces_image = dicom_image
            CIA_other_info['Resample'] = resampler is not None

            # Clamp CT intensities to the configured window.
            if 'CT' in modality:
                proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)

            output_path = os.path.join(output_dir, uid, f"{uid}.nii.gz")
            save_nifti(proces_image, output_path, full_path)
            print(f"Saved NIfTI file to {output_path}")

            label_dir = os.path.join(case_dir, 'segmentations')
            label_files = glob.glob("%s/*.nii.gz" % (label_dir))
            label_path_dict = _convert_case_labels(
                label_files, output_dir, uid, proces_image.GetSize(),
                spacing_info, size, failed_files)
        except RuntimeError:
            failed_files.append(full_path)
            print(f"Failed to process NIfTI image {full_path}")
            continue

        # A fresh record per case so optional keys (Task, Label_path) set for
        # a previous case cannot carry over.
        # NOTE(review): assumes meta_data keeps values between add_keyvalue
        # calls -- confirm against util.meta_data.
        meta = meta_data()
        meta.add_keyvalue('Spacing_mm', min(spacing_info))
        meta.add_keyvalue('OriImg_path', full_path)
        meta.add_keyvalue('Size', list(proces_image.GetSize()))  # size after processing
        meta.add_keyvalue('Modality', modality)
        meta.add_keyvalue('Dataset_name', study)
        meta.add_keyvalue('ROI', 'abdomen')
        if label_files:
            meta.add_keyvalue('Task', TASK_VALUE)
            meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
        meta.add_extra_keyvalue('Metadata', CIA_other_info)

        # Persist after every case so progress survives an interruption.
        mappings[output_path] = meta.get_meta_data()
        with open(json_output_path, 'w') as json_file:
            json.dump(mappings, json_file, indent=4)

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
    parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2")
    parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas/")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)
argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + + diff --git a/AbdomenAtlas/dataclean_abdomen_atlas_update_json.py b/AbdomenAtlas/dataclean_abdomen_atlas_update_json.py new file mode 100644 index 0000000000000000000000000000000000000000..b307799f308cb34c913792e866410ebb8958c173 --- /dev/null +++ b/AbdomenAtlas/dataclean_abdomen_atlas_update_json.py @@ -0,0 +1,501 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-8-18 +update AbdomenAtlas3.0 data clean + +https://arxiv.org/pdf/2407.16697 +https://zhuanlan.zhihu.com/p/19339643417 + +AbdomenAtlas 3.0 是目前公开的最大规模腹部 CT 图像-文本配对数据集,旨在解决医学影像中的肿瘤检测与报告生成难题。 +该数据集包含 9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家,并且是首个提供逐像素(per-voxel)标注、详细肿瘤报告以及肿瘤分期信息的公开数据集。 +这些 CT 扫描数据通过标准医学影像格式(NIfTI 和 DICOM)存储,具备体素间距及 HU 值等临床信息。AbdomenAtlas 3.0 整合并重新标注了 17 个公共数据集,经过 12 位放射科医生的审核,共标注了 8,562 个肿瘤实例,其中包括 3,036 个肝脏肿瘤、354 个胰腺肿瘤和 4,239 个肾脏肿瘤。此外,数据集包含 2,947 份肿瘤报告,其中 948 份为早期肿瘤报告(≤2 cm),260 份报告提供了胰腺肿瘤的 T 分期(T1-T4),并首次公开肝脏 8 个亚段和胰腺 3 个亚段的逐像素标注,以及肿瘤与关键血管(如 SMA、CA 等)的接触标注。 +通过 RadGPT 自动生成的结构化和叙述性报告,数据集详细描述了肿瘤大小、形状、位置、体积以及与周围血管和器官的相互作用。这些报告的生成准确性经过验证,在检测小肿瘤(≤2 cm)方面,RadGPT 的敏感性/特异性显著优于现有方法(例如肝脏:80%/73%,胰腺:77%/77%)。数据集还包含 240 份“人类-AI 融合报告”,结合了放射科医生的临床笔记和 AI 的精确量化结果。AbdomenAtlas 3.0 的意义在于,它首次提供了一个全面的腹部 CT 图像-文本配对数据集,填补了公开领域中腹部肿瘤检测数据的空白,并为推动医学影像中的自动化肿瘤检测、分期和报告生成奠定了基础。这一数据集不仅在规模和多样性上领先,还通过结合 AI 和放射科医生的专业知识,提供了高质量的标注和诊断支持,将有助于提升 AI 模型在医学影像分析中的实际临床应用能力。 + +数据集统计信息 +总数据量: +9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家。 +包含 8,562 个肿瘤实例: 
+肝脏肿瘤:3,036 个实例(929 份报告) +胰腺肿瘤:354 个实例(344 份报告) +肾脏肿瘤:4,239 个实例(1,674 份报告) +6,061 份无肿瘤报告(作为对照组) +小肿瘤(≤2 cm): +943 份小肿瘤相关报告: +肝脏:347 个实例(占肝脏肿瘤的 37.4%) +胰腺:83 个实例(占胰腺肿瘤的 24.1%) +肾脏:466 个实例(占肾脏肿瘤的 27.8%) +肿瘤分期与解剖结构: +260 份胰腺肿瘤分期报告(T1–T4) +提供肝脏 8 个亚段和胰腺 3 个亚段(头、体、尾)的逐像素分割 +标注了肿瘤与关键血管(如 SMA、CA、CHA 等)的接触角度 +图像与文本配对: +1.8M 文本 Token,包含三类报告: +结构化报告:基于模板生成,提供定量信息(如肿瘤体积、位置等) +叙述性报告:通过 LLM 转换,模仿目标医院的报告风格 +人类-AI 融合报告:240 份,结合临床笔记与 AI 生成的内容 + +AbomentAtlas数据集中每个病例里面的segmentions都是包含了25个器官组织的标注文件,同时也包含一个combined_labels.nii.gz的文件【里面加上背景值包含了0-25的数值 +1 aorta +2 gall_bladder +3 kidney_left +4 kidney_right +5 liver +6 pancreas +7 postcava +8 spleen +9 stomach +10 adrenal_gland_left +11 adrenal_gland_right +12 bladder +13 celiac_trunk +14 colon +15 duodenum +16 esophagus +17 femur_left +18 femur_right +19 hepatic_vessel +20 intestine +21 lung_left +22 lung_right +23 portal_vein_and_splenic_vein +24 prostate +25 rectum + + +参考TotalSegment分别存储25个器官的label处理后的数据文件 +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# print(embeder, input_size, in_size) +# print(tokenizer) + + +# print(embeder_output1) +# print(embeder_output1.shape) # torch.Size([1, 8, 768]) + + +# print(embeder_output2) +# print(embeder_output2.shape) # 
TASK_VALUE = "segmentation"
CLAMP_RANGE_CT = [-300, 300]
CLAMP_RANGE_MRI = [-1, 0]  # MRI images threshold placeholder TBC...

# Minimum structure volumes (mL) for a body region to count as present.
LUNG_VOL_THRESH = 1000
FEMUR_VOL_THRESH = 80
KIDNEY_VOL_THRESH = 100
ROI = "abdomen"

PROCESS_FLAG = True

# Maps the integer label value (as a string) found in combined_labels.nii.gz
# to the structure name.
# NOTE(review): "backgroud" looks like a typo of "background"; it is kept
# unchanged in case already-generated metadata relies on this spelling.
LABEL_DICT = {
    "0": "backgroud",
    "1": "aorta",
    "2": "gall_bladder",
    "3": "kidney_left",
    "4": "kidney_right",
    "5": "liver",
    "6": "pancreas",
    "7": "postcava",
    "8": "spleen",
    "9": "stomach",
    "10": "adrenal_gland_left",
    "11": "adrenal_gland_right",
    "12": "bladder",
    "13": "celiac_trunk",
    "14": "colon",
    "15": "duodenum",
    "16": "esophagus",
    "17": "femur_left",
    "18": "femur_right",
    "19": "hepatic_vessel",
    "20": "intestine",
    "21": "lung_left",
    "22": "lung_right",
    "23": "portal_vein_and_splenic_vein",
    "24": "prostate",
    "25": "rectum",
}


def find_metadata_files(path):
    """Return the ``*.csv`` files that sit directly inside *path*, sorted.

    The search is not recursive.  *path* is glob-escaped so directory names
    containing ``[``, ``]``, ``*`` or ``?`` are matched literally.
    """
    search_pattern = os.path.join(glob.escape(path), '*.csv')
    return sorted(glob.glob(search_pattern))


def find_image_dirs(path):
    """Return the names of all entries (files and directories) in *path*, sorted.

    Callers are expected to filter the result down to case directories.
    """
    return sorted(os.listdir(path))


def load_dicom_images(folder_path):
    """Read the DICOM series stored in *folder_path*.

    Returns:
        ``(dicom_names, image)``: the series file names reported by GDCM and
        the image assembled from them.

    Raises:
        RuntimeError: if no DICOM series is found in *folder_path*.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    if not dicom_names:
        raise RuntimeError(f"No DICOM series found in {folder_path}")
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names, image
def load_dicom_tag(imgs):
    """Read the single image file *imgs* and return the resulting image.

    The returned object is what callers query with ``GetMetaData``.  The
    pixel data is loaded as well, because the reader is executed; a separate
    ``ReadImageInformation`` call beforehand would only read the header twice.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    return reader.Execute()


def load_nrrd(fp):
    """Read the image file *fp* with ``sitk.ReadImage`` and return it."""
    return sitk.ReadImage(fp)


def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, creating the parent directory if needed.

    The source location *folder_path* is attached to the image under the
    ``FolderPath`` metadata key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against another process creating it in between.
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a Linux-style one.

    Backslashes become forward slashes and a leading drive letter (``C:``)
    is removed.  Only a drive prefix is stripped: a colon that appears later
    in the path is part of a file or directory name and is left alone.
    """
    linux_path = windows_path.replace('\\', '/')
    has_drive = (
        len(linux_path) >= 2
        and linux_path[1] == ':'
        and 'a' <= linux_path[0].lower() <= 'z'
    )
    if has_drive:
        linux_path = linux_path[2:]
    return linux_path


def simpleitk_volume_calculation(image_path):
    """Return the volume, in millilitres, of the voxels equal to 1 in a mask file.

    The voxel volume is the product of the first three spacing components
    (mm^3).  Masks with fewer than 10 voxels equal to 1 are treated as empty
    and yield ``0.0``.  Whether the result means a region is present is
    decided by the caller against its own thresholds.
    """
    image = util.load_nifti(image_path)
    spacing = image.GetSpacing()
    voxel_volume = spacing[0] * spacing[1] * spacing[2]  # mm^3

    # One pass over the array gives the count; a separate threshold filter and
    # label statistics run would only recount the same voxels.
    mask = sitk.GetArrayFromImage(image)
    voxel_count = int(np.count_nonzero(mask == 1))
    if voxel_count < 10:
        return 0.0

    volume_mm3 = voxel_count * voxel_volume
    return volume_mm3 / 1000.0
# A full-trunk scan longer than this (metres) is labelled 'whole-body'.
WHOLE_BODY_MIN_EXTENT_M = 1.2


def classify_roi(thorax_flag, pelvis_flag, kidney_flag, extent_m=0.0):
    """Derive the ROI label of a scan from which body regions it contains.

    Args:
        thorax_flag: a lung mask reached the lung volume threshold.
        pelvis_flag: a femur mask reached the femur volume threshold.
        kidney_flag: the right-kidney mask reached the kidney volume threshold.
        extent_m: longest scan extent in metres; only consulted when the scan
            covers thorax, abdomen and pelvis.

    Returns:
        One of ``'abdomen'``, ``'thorax'``, ``'pelvis'``, ``'thorax-abdomen'``,
        ``'abdomen-pelvis'``, ``'thorax-abdomen-pelvis'`` or ``'whole-body'``.
    """
    # NOTE(review): with thorax and pelvis present but no kidney the result
    # is 'pelvis' (the pelvis rule overrides the thorax one) -- confirm that
    # this is intended.
    # NOTE(review): config_format.json lists 'chest-abdomen' rather than
    # 'thorax-abdomen' among the ROI options -- confirm which vocabulary
    # downstream consumers expect.
    roi = 'abdomen'
    if thorax_flag:
        roi = 'thorax-abdomen' if kidney_flag else 'thorax'
    if pelvis_flag:
        roi = (roi + "-pelvis") if kidney_flag else 'pelvis'
    if roi == 'thorax-abdomen-pelvis' and extent_m > WHOLE_BODY_MIN_EXTENT_M:
        roi = 'whole-body'
    return roi


def _measure_regions(label_files):
    """Return ``(thorax_flag, pelvis_flag, kidney_flag)`` for one case's masks."""
    thorax_flag = pelvis_flag = kidney_flag = False
    for lf in label_files:
        lf_tissue = os.path.basename(lf).replace(".nii.gz", "")
        if 'femur' in lf_tissue:
            vol_femur = simpleitk_volume_calculation(lf)
            print(lf_tissue, vol_femur)
            if vol_femur >= FEMUR_VOL_THRESH:
                pelvis_flag = True
        if 'lung' in lf_tissue:
            vol_lung = simpleitk_volume_calculation(lf)
            print(lf_tissue, vol_lung)
            if vol_lung >= LUNG_VOL_THRESH:
                thorax_flag = True
        if 'kidney_right' in lf_tissue:
            vol_kidney = simpleitk_volume_calculation(lf)
            print(lf_tissue, vol_kidney)
            if vol_kidney >= KIDNEY_VOL_THRESH:
                kidney_flag = True
    return thorax_flag, pelvis_flag, kidney_flag


def main(target_path, output_dir):
    """Recompute the ``ROI`` field of an existing ``nifti_mappings.json``.

    For every ``BDMAP_*`` case under *target_path* the lung, femur and
    right-kidney masks in its ``segmentations`` folder are measured, an ROI
    label is derived from them and stored in the mapping entry whose
    ``Metadata.Study_UID`` equals the case name.  The updated mapping
    replaces ``<output_dir>/nifti_mappings.json``; cases whose masks could
    not be read are listed in ``failed_files.json``.

    Args:
        target_path: directory holding the ``BDMAP_*`` case directories.
        output_dir: directory containing ``nifti_mappings.json``.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    with open(json_output_path, 'r') as json_file:
        mappings = json.load(json_file)

    # Index entries by case id once; the first entry wins on duplicates.
    key_by_uid = {}
    for key, entry in mappings.items():
        study_uid = entry.get('Metadata', {}).get('Study_UID')
        if study_uid is not None:
            key_by_uid.setdefault(study_uid, key)

    if not pid_dirs:
        print(f"No case directories found in {target_path}.")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        case_dir = os.path.join(target_path, pid_dir)
        if not (os.path.isdir(case_dir) and pid_dir.startswith("BDMAP_")):
            continue

        full_path = os.path.join(case_dir, "ct.nii.gz")
        label_dir = os.path.join(case_dir, 'segmentations')
        label_files = glob.glob("%s/*.nii.gz" % (label_dir))
        try:
            thorax_flag, pelvis_flag, kidney_flag = _measure_regions(label_files)
        except RuntimeError:
            failed_files.append(full_path)
            print(f"Failed to load segmentation masks for {full_path}")
            continue

        print(pid_dir, classify_roi(thorax_flag, pelvis_flag, kidney_flag))

        key = key_by_uid.get(pid_dir)
        if key is None:
            continue
        entry = mappings[key]
        # The extent must come from this case's own entry, never from another
        # case's, or an unrelated long scan would promote it to 'whole-body'.
        max_length = entry['Spacing_mm'] * max(entry['Size']) * 0.001
        roi = classify_roi(thorax_flag, pelvis_flag, kidney_flag, extent_m=max_length)
        entry['ROI'] = roi
        print(pid_dir, max_length, roi)

    # Write to a temporary file first so a crash cannot truncate the mapping.
    tmp_path = json_output_path + '.tmp'
    with open(tmp_path, 'w') as json_file:
        json.dump(mappings, json_file, indent=4)
    os.replace(tmp_path, json_output_path)

    with open(failed_files_path, 'w') as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
    parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2")
    parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)
DICOM)存储,具备体素间距及 HU 值等临床信息。AbdomenAtlas 3.0 整合并重新标注了 17 个公共数据集,经过 12 位放射科医生的审核,共标注了 8,562 个肿瘤实例,其中包括 3,036 个肝脏肿瘤、354 个胰腺肿瘤和 4,239 个肾脏肿瘤。此外,数据集包含 2,947 份肿瘤报告,其中 948 份为早期肿瘤报告(≤2 cm),260 份报告提供了胰腺肿瘤的 T 分期(T1-T4),并首次公开肝脏 8 个亚段和胰腺 3 个亚段的逐像素标注,以及肿瘤与关键血管(如 SMA、CA 等)的接触标注。 +通过 RadGPT 自动生成的结构化和叙述性报告,数据集详细描述了肿瘤大小、形状、位置、体积以及与周围血管和器官的相互作用。这些报告的生成准确性经过验证,在检测小肿瘤(≤2 cm)方面,RadGPT 的敏感性/特异性显著优于现有方法(例如肝脏:80%/73%,胰腺:77%/77%)。数据集还包含 240 份“人类-AI 融合报告”,结合了放射科医生的临床笔记和 AI 的精确量化结果。AbdomenAtlas 3.0 的意义在于,它首次提供了一个全面的腹部 CT 图像-文本配对数据集,填补了公开领域中腹部肿瘤检测数据的空白,并为推动医学影像中的自动化肿瘤检测、分期和报告生成奠定了基础。这一数据集不仅在规模和多样性上领先,还通过结合 AI 和放射科医生的专业知识,提供了高质量的标注和诊断支持,将有助于提升 AI 模型在医学影像分析中的实际临床应用能力。 + +数据集统计信息 +总数据量: +9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家。 +包含 8,562 个肿瘤实例: +肝脏肿瘤:3,036 个实例(929 份报告) +胰腺肿瘤:354 个实例(344 份报告) +肾脏肿瘤:4,239 个实例(1,674 份报告) +6,061 份无肿瘤报告(作为对照组) +小肿瘤(≤2 cm): +943 份小肿瘤相关报告: +肝脏:347 个实例(占肝脏肿瘤的 37.4%) +胰腺:83 个实例(占胰腺肿瘤的 24.1%) +肾脏:466 个实例(占肾脏肿瘤的 27.8%) +肿瘤分期与解剖结构: +260 份胰腺肿瘤分期报告(T1–T4) +提供肝脏 8 个亚段和胰腺 3 个亚段(头、体、尾)的逐像素分割 +标注了肿瘤与关键血管(如 SMA、CA、CHA 等)的接触角度 +图像与文本配对: +1.8M 文本 Token,包含三类报告: +结构化报告:基于模板生成,提供定量信息(如肿瘤体积、位置等) +叙述性报告:通过 LLM 转换,模仿目标医院的报告风格 +人类-AI 融合报告:240 份,结合临床笔记与 AI 生成的内容 + +AbomentAtlas数据集中每个病例里面的segmentions都是包含了25个器官组织的标注文件,同时也包含一个combined_labels.nii.gz的文件【里面加上背景值包含了0-25的数值 +1 aorta +2 gall_bladder +3 kidney_left +4 kidney_right +5 liver +6 pancreas +7 postcava +8 spleen +9 stomach +10 adrenal_gland_left +11 adrenal_gland_right +12 bladder +13 celiac_trunk +14 colon +15 duodenum +16 esophagus +17 femur_left +18 femur_right +19 hepatic_vessel +20 intestine +21 lung_left +22 lung_right +23 portal_vein_and_splenic_vein +24 prostate +25 rectum + + +参考TotalSegment分别存储25个器官的label处理后的数据文件 +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of 
words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# print(embeder, input_size, in_size) +# print(tokenizer) + + +# print(embeder_output1) +# print(embeder_output1.shape) # torch.Size([1, 8, 768]) + + +# print(embeder_output2) +# print(embeder_output2.shape) # torch.Size([1, 8, 768]) + + +# error = torch.abs(embeder_output1 - embeder_output2) +# print(error) +# print("Embedding distance between the two sentences: ") +# print(f"String1: {string1}") +# print(f"String2: {string2}") +# print(torch.mean(error)) + + +# exit() + + +# meta_id_name='Patient' +# meta_weeks_name='Weeks' +# meta_fvc_name='FVC' +# meta_percent_name='Percent' +# meta_age_name='Age' +# meta_sex_name='Sex' +# meta_status_name='SmokingStatus' + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = [-1,0] # MRI images threshold placeholder TBC... 
def find_metadata_files(path):
    """
    Return the ``*.csv`` files located directly inside ``path``.

    Only the top level of ``path`` is searched; sub-directories are not
    descended into (the pattern contains no ``**`` component).

    Args:
        path: Directory to search. It is treated literally, so glob
            metacharacters in the directory name (``[``, ``]``, ``*``, ``?``)
            do not change which directory is searched.

    Returns:
        A sorted list of matching file paths; empty if there are none or if
        ``path`` does not exist.
    """
    # Escape the directory part so that only the '*.csv' suffix acts as a
    # wildcard; an unescaped '[...]' in the path would be read as a
    # character class and silently match nothing.
    search_pattern = os.path.join(glob.escape(path), '*.csv')
    # glob returns entries in arbitrary order; sort for reproducible runs.
    return sorted(glob.glob(search_pattern))
def simpleitk_volume_calculation(image_path):
    """
    Return the volume, in millilitres, of the voxels labelled 1 in a mask.

    The image at ``image_path`` is loaded with ``util.load_nifti`` and every
    voxel whose value equals 1 is counted as foreground. The function is not
    organ-specific; callers compare the result against their own threshold.

    Args:
        image_path: Path of the label image to measure.

    Returns:
        ``0`` when fewer than 10 foreground voxels are present (treated as
        noise), otherwise ``voxel_count * voxel_volume / 1000.0`` as a float.
        NOTE(review): the mL unit assumes the image spacing is expressed in
        millimetres, as the original comments state -- confirm for new data.
    """
    image = util.load_nifti(image_path)

    # Physical size of one voxel: product of the spacing along every axis
    # (works for any image dimension, not just 3-D).
    voxel_volume = 1.0
    for axis_spacing in image.GetSpacing():
        voxel_volume *= axis_spacing

    # Count the foreground voxels once; the previous implementation counted
    # them with NumPy and then again through a threshold + label-statistics
    # filter pair, which produced the same number.
    voxel_count = int(np.count_nonzero(sitk.GetArrayFromImage(image) == 1))
    if voxel_count < 10:
        return 0

    volume_mm3 = voxel_count * voxel_volume
    return volume_mm3 / 1000.0
tqdm(pid_dirs, desc="Processing pid dirs"): + if not os.path.isdir(os.path.join(target_path,pid_dir)): + continue + if not pid_dir.startswith("BDMAP_"): + continue + + meta_file=os.path.join(target_path,'%s.csv'%pid_dir) + if os.path.isfile(meta_file): + mf_flag=True + df_meta=pd.read_csv(meta_file,sep=',') + else: + mf_flag=False + + full_path=os.path.join(target_path,pid_dir,"ct.nii.gz") + + + if not os.path.isfile(full_path): + continue + try: + print(full_path) + + dicom_image=util.load_nifti(full_path) + spacing_info = dicom_image.GetSpacing() + print('SPACING INFO:', spacing_info) + + # metadata_keys = dicom_image.GetMetaDataKeys() + + # dtag=load_dicom_tag(dicom_fp[0]) + # uid=dtag.GetMetaData('0020|000e') ##Series Instance UID + # modality=dtag.GetMetaData('0008|0060')##Modality + uid=pid_dir + modality="CT" + study='AbdomenAtlas'##Dataset_name + CIA_other_info = { + 'Study_UID':uid, + 'metadata_file':'' + # 'Series_Description':serise_desc + } + CIA_other_info['split'] = "train" + if mf_flag: + CIA_other_info['metadata_file']=meta_file + + size = list(dicom_image.GetSize()) + resampler =util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size) + + # resize the image + if resampler is not None: + proces_image = resampler.Execute(dicom_image) + print('SPACIE INFO AFTER', proces_image.GetSpacing()) + CIA_other_info['Resample'] = True + else: + proces_image = dicom_image + CIA_other_info['Resample'] = False + + ## + # CIA_other_info['Image_id']=meta_image_id + # CIA_other_info['Weeks']=str(meta_weeks) + # CIA_other_info['FVC']=str(meta_fvc) + # CIA_other_info['Percent']=str(meta_percent) + # CIA_other_info['Age']=str(meta_age) + # CIA_other_info['Sex']=meta_sex + # CIA_other_info['Smoke_Status']=meta_status + # threshold the image + if 'CT' in modality: + proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT) + else: + pass + + output_path = os.path.join(output_dir,uid, f"{uid}.nii.gz") + # 
output_path=convert_windows_to_linux_path(output_path) + save_nifti(proces_image, output_path, full_path) + print(f"Saved NIfTI file to {output_path}") + + ##segment + label_path_dict = {} + label_flag=True + + label_paths = os.path.join(target_path,pid_dir, 'segmentations') + label_files=glob.glob("%s/*.nii.gz"%(label_paths)) + #print(label_paths,label_files) + pelvis_flag=False + thorax_flag=False + if len(label_files)>0: + for lf in label_files: + lf_name=os.path.basename(lf) + + lf_tissue=lf_name.replace(".nii.gz","") + + if 'femur' in lf_tissue: + vol_femur=simpleitk_volume_calculation(lf) + print(lf_tissue,vol_femur) + if vol_femur>=FEMUR_VOL_THRESH: + pelvis_flag=True + if 'lung' in lf_tissue: + vol_lung=simpleitk_volume_calculation(lf) + print(lf_tissue,vol_lung) + if vol_lung>=LUNG_VOL_THRESH: + thorax_flag=True + + + label_image=load_nrrd(lf) + resampler =util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size) + if resampler is not None: + proces_label = resampler.Execute(label_image) + else: + proces_label = label_image + + + # print(proces_image.GetSize(),proces_label.GetSize()) + try: + assert proces_image.GetSize() == proces_label.GetSize() + except Exception as e: + failed_files.append(lf) + continue + + label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{lf_tissue}.nii.gz") + + label_path_dict[lf_tissue] = label_output_path + util.save_nifti(proces_label, label_output_path, lf) + print(f"Saved Label Segment NIfTI file to {label_output_path}") + + else: + label_flag=False + except RuntimeError: + failed_files.append(full_path) + print(f"Failed to load DICOM images from {full_path}") + continue + + ''' + meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Weeks',meta_weeks) + meta.add_keyvalue('FVC',meta_fvc) + meta.add_keyvalue('Percent',meta_percent) + meta.add_keyvalue('Age',meta_age) + meta.add_keyvalue('Sex',meta_sex) + meta.add_keyvalue('Smoke_Status',meta_status) + ''' + + 
size_processed = list(proces_image.GetSize()) + + meta_image_id=uid + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(spacing_info)) + meta.add_keyvalue('OriImg_path',full_path) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + + roi='abdomen' + + if thorax_flag: + roi='thorax-'+roi + + if pelvis_flag: + roi=roi+"-pelvis" + + meta.add_keyvalue('ROI',roi) + + + + if label_flag: + # print(label_path_dict.keys()) + meta.add_keyvalue('Task',TASK_VALUE) + # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys())) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + + # meta.add_keyvalue('Label_Dict',LABEL_DICT) + + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_path] = meta.get_meta_data() + json_file.seek(0) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + else: + print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v3/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + 
def read_table(file_path, split_str=';'):
    '''
    Load a table from ``file_path`` into a pandas DataFrame.

    The file is first read as an Excel workbook with the ``openpyxl`` engine.
    If that fails for any ordinary reason (the file is not a workbook,
    ``openpyxl`` is not installed, ...), it is read as delimited text instead.

    Args:
        file_path: Path of the table file.
        split_str: Column separator used for the delimited-text fallback.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        Whatever ``pandas.read_csv`` raises when the fallback also fails
        (for example ``FileNotFoundError`` for a missing file).
    '''
    try:
        return pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # Deliberately broad: pandas/openpyxl raise many unrelated error types
        # for "not an .xlsx file". Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return pd.read_csv(file_path, sep=split_str)
exclude_str='segmentation', is_sorted=True): + img_path = [] + for root, dirs, files in os.walk(folder_path): + for file in files: + if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file): + img_path.append(os.path.join(root, file)) + if is_sorted: + img_path.sort() + return img_path + +def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None): + ''' + Resample the image to have isotropic spacing, following the steps: + 1. Find the minimum spacing + 2. Resample the image to have the minimum spacing + 3. Set the interpolator (linear for images, nearest for segmentation masks) + 4. Set the output spacing + 5. Return the resampler for resampling + For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1] + ''' + # 讨论为什么重新写这个函数!!! + if size is None: + size = ref_img.GetSize() + if spacing is None: + spacing = ref_img.GetSpacing() + min_spacing = min(spacing) + if all([spc == min_spacing for spc in spacing]): + return None + else: + # if 1: + if interpolator == 'nearest': + interpolator = sitk.sitkNearestNeighbor + elif interpolator == 'linear': + interpolator = sitk.sitkLinear + resampler = sitk.ResampleImageFilter() + # new_spacing = [max_spacing] * len(spacing) + # print(size) + new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)] + new_size_xy=[new_size[0],new_size[1],new_size[2]] + # 讨论为什么重新写这个函数!!! --- YHM Jachin + new_size_spacing=[min_spacing,min_spacing,min_spacing] + # 讨论为什么重新写这个函数!!! 
def get_synonyms_dict(dict_type='ROI'):
    '''
    Get the dictionary of synonyms for the specified dictionary type

    Each returned dictionary maps a canonical term to the list of spellings
    that should be normalised to it. Insertion order is significant for
    callers that return the first matching entry, so the more specific
    multi-region ROI names are listed before the single-region ones.

    Args:
        dict_type: One of 'ROI', 'Label_tissue', 'Task' or 'Modality'.

    Returns:
        A new ``dict`` of ``canonical term -> list of synonyms``.

    Raises:
        ValueError: If ``dict_type`` is not one of the supported names.
    '''
    if dict_type == 'ROI':
        dict_synonyms = {
            'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'],
            'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'],
            'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
            'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'],
            'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'],
            'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
            'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
            'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
            'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'],
            'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'],
            'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
            'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
            'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
            'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
            'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'],
            'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'],
            'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'],
            'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'],
            'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
            # 'armpit' and 'clavicle' were previously fused into the single
            # entry 'armpitclavicle' by a missing comma.
            'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
            'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'],
            'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',],
            'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
        }
    elif dict_type == 'Label_tissue':
        dict_synonyms = {
            'liver': ['liver','hepatic'],
            'spleen': ['spleen','splenic'],
            'kidney': ['kidney','renal'],
            'pancreas': ['pancreas','pancreatic'],
            'stomach': ['stomach','gastric'],
            'intestine': ['large intestine', 'small intestine','large bowel','small bowel'],
            'gallbladder': ['gallbladder'],
            'adrenal_gland': ['adrenal_gland','adrenal gland'],
            'bladder': ['bladder'],
            'prostate': ['prostate'],
            'uterus': ['uterus'],
            'ovary': ['ovary'],
            'testicle': ['testicle'],
            'lymph_node': ['lymph_node','lymph node'],
            'bone': ['bone'],
            'lung': ['lung'],
            'heart': ['heart'],
            'esophagus': ['esophagus'],
            'muscle': ['muscle'],
            'fat': ['fat'],
            'skin': ['skin'],
            'vessel': ['vessel'],
            'tumor': ['tumor'],
            'other': ['other']
        }
    elif dict_type == 'Task':
        dict_synonyms = {
            'segmentation': ['segmentation', 'seg', 'mask'],
            'classification': ['classification', 'class', 'diagnosis','identify','identification'],
            'localization': ['localization', 'locate', 'location', 'position'],
            'registration': ['registration', 'register', 'align', 'alignment'],
            'detection': ['detection', 'detect', 'find', 'locate'],
            'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
        }
    elif dict_type == 'Modality':
        dict_synonyms = {
            'CT': ['CT', 'computed tomography'],
            'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
            'PET': ['PET', 'positron emission tomography'],
            'US': ['US', 'ultrasound'],
            'X-ray': ['X-ray', 'radiography'],
            # Spelling fixed from 'tomlogy' so the full modality name matches.
            'SPECT': ['SPECT', 'single-photon emission computed tomography'],
        }
    else:
        raise ValueError(f"dict_type {dict_type} is not valid")
    return dict_synonyms
def replace_synonyms(text, dict_synonyms):
    '''
    Replace the synonyms in the text with the standard term

    Args:
        text: A string, a list, or a dict. Lists are normalised element by
            element; dicts have both their keys and their values normalised.
            Any other type is returned unchanged.
        dict_synonyms: Mapping of ``standard term -> list of synonyms``.
            Entries are tried in insertion order and the first standard term
            having a synonym that occurs (case-insensitively) as a substring
            of the string wins.

    Returns:
        The standard term for a matching string; the original string (after
        emitting a ``UserWarning``) when nothing matches; a new list / new
        dict for list / dict input.
    '''
    # Local import: this block cannot rely on a module-level import existing.
    import warnings

    if isinstance(text, str):
        lowered = text.lower()
        for standard_term, synonyms in dict_synonyms.items():
            for synonym in synonyms:
                if synonym.lower() in lowered:
                    return standard_term
        # The original built a Warning object and discarded it, so the
        # message was never shown; actually emit it.
        warnings.warn(f"Value {text} is not in the correct format")
        return text

    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]

    if isinstance(text, dict):
        # Build a fresh dict: the original mutated the dict while iterating
        # it and used the synonym *list* as a key, which raised TypeError for
        # any non-empty input.
        return {
            replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }

    return text
def type_check(self, key, value):
    '''
    Check whether ``value`` is acceptable for the metadata field ``key``.

    The rules are read from ``self.config_format``:

    * 'Modality', 'ROI': ``value`` must be one of the field's 'options'.
    * 'OriImg_path', 'Dataset_name': ``value`` must be a ``str``.
    * 'Label_path': ``value`` must be a dict whose keys are all listed in the
      field's 'keys' and whose values are dicts. The inner dicts' contents
      are not inspected (the original did not reject any inner value either).
    * 'Label_tissue', 'Task': ``value`` must be a list whose items are all in
      the field's ``['items']['options']``.
    * 'Spacing_mm': ``value`` must be a ``float``.
    * 'Size': ``value`` must be a list of at least three ``int`` items.
    * 'Sub_modality', 'Label_Dict': ``value`` must be a dict.

    Args:
        key: Name of the metadata field.
        value: Candidate value for that field.

    Returns:
        ``True`` if the value is acceptable, ``False`` otherwise. A ``bool``
        is always returned; previously several failing paths (for example a
        non-float 'Spacing_mm' or a non-list 'Task') fell through and
        returned ``None``.
    '''
    if key not in self.config_format.keys():
        print(key, "is not a valid key")
        return False

    spec = self.config_format[key]

    if key in ('Modality', 'ROI'):
        return value in spec['options']

    if key in ('OriImg_path', 'Dataset_name'):
        return isinstance(value, str)

    if key == 'Label_path':
        if not isinstance(value, dict):
            return False
        for task_name, tissue_paths in value.items():
            if task_name not in spec['keys']:
                return False
            # Guard against non-mapping entries, which used to raise while
            # being indexed.
            if not isinstance(tissue_paths, dict):
                return False
        return True

    if key in ('Label_tissue', 'Task'):
        if not isinstance(value, list):
            return False
        allowed = spec['items']['options']
        return all(item in allowed for item in value)

    if key == 'Spacing_mm':
        return isinstance(value, float)

    if key == 'Size':
        return (
            isinstance(value, list)
            and len(value) >= 3
            and all(isinstance(item, int) for item in value)
        )

    if key in ('Sub_modality', 'Label_Dict'):
        return isinstance(value, dict)

    # Known key without a dedicated rule.
    return False
meta.add_keyvalue('Label_tissue', ['1', '2', '3']) + meta.add_keyvalue('Task', ['1', '2', '3']) + print(meta.get_meta_data()) + meta.add_extra_key('extra', 'extra') + print(meta.get_meta_data()) + print(meta.get_ketytypes()) + print(meta.get_keytypes_flatten) + + org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT' + img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation') + print(img_paths) \ No newline at end of file diff --git a/AbdomenAtlas/xx_update.py b/AbdomenAtlas/xx_update.py new file mode 100644 index 0000000000000000000000000000000000000000..928d96edd9a8efe8eab2caf9d67f2e0ea7c78947 --- /dev/null +++ b/AbdomenAtlas/xx_update.py @@ -0,0 +1,518 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-8-18 +update AbdomenAtlas3.0 data clean + +https://arxiv.org/pdf/2407.16697 +https://zhuanlan.zhihu.com/p/19339643417 + +AbdomenAtlas 3.0 是目前公开的最大规模腹部 CT 图像-文本配对数据集,旨在解决医学影像中的肿瘤检测与报告生成难题。 +该数据集包含 9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家,并且是首个提供逐像素(per-voxel)标注、详细肿瘤报告以及肿瘤分期信息的公开数据集。 +这些 CT 扫描数据通过标准医学影像格式(NIfTI 和 DICOM)存储,具备体素间距及 HU 值等临床信息。AbdomenAtlas 3.0 整合并重新标注了 17 个公共数据集,经过 12 位放射科医生的审核,共标注了 8,562 个肿瘤实例,其中包括 3,036 个肝脏肿瘤、354 个胰腺肿瘤和 4,239 个肾脏肿瘤。此外,数据集包含 2,947 份肿瘤报告,其中 948 份为早期肿瘤报告(≤2 cm),260 份报告提供了胰腺肿瘤的 T 分期(T1-T4),并首次公开肝脏 8 个亚段和胰腺 3 个亚段的逐像素标注,以及肿瘤与关键血管(如 SMA、CA 等)的接触标注。 +通过 RadGPT 自动生成的结构化和叙述性报告,数据集详细描述了肿瘤大小、形状、位置、体积以及与周围血管和器官的相互作用。这些报告的生成准确性经过验证,在检测小肿瘤(≤2 cm)方面,RadGPT 的敏感性/特异性显著优于现有方法(例如肝脏:80%/73%,胰腺:77%/77%)。数据集还包含 240 份“人类-AI 融合报告”,结合了放射科医生的临床笔记和 AI 的精确量化结果。AbdomenAtlas 3.0 的意义在于,它首次提供了一个全面的腹部 CT 图像-文本配对数据集,填补了公开领域中腹部肿瘤检测数据的空白,并为推动医学影像中的自动化肿瘤检测、分期和报告生成奠定了基础。这一数据集不仅在规模和多样性上领先,还通过结合 AI 和放射科医生的专业知识,提供了高质量的标注和诊断支持,将有助于提升 AI 模型在医学影像分析中的实际临床应用能力。 + +数据集统计信息 +总数据量: +9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家。 +包含 8,562 个肿瘤实例: +肝脏肿瘤:3,036 个实例(929 份报告) +胰腺肿瘤:354 个实例(344 份报告) +肾脏肿瘤:4,239 个实例(1,674 份报告) +6,061 份无肿瘤报告(作为对照组) +小肿瘤(≤2 cm): +943 份小肿瘤相关报告: +肝脏:347 
个实例(占肝脏肿瘤的 37.4%) +胰腺:83 个实例(占胰腺肿瘤的 24.1%) +肾脏:466 个实例(占肾脏肿瘤的 27.8%) +肿瘤分期与解剖结构: +260 份胰腺肿瘤分期报告(T1–T4) +提供肝脏 8 个亚段和胰腺 3 个亚段(头、体、尾)的逐像素分割 +标注了肿瘤与关键血管(如 SMA、CA、CHA 等)的接触角度 +图像与文本配对: +1.8M 文本 Token,包含三类报告: +结构化报告:基于模板生成,提供定量信息(如肿瘤体积、位置等) +叙述性报告:通过 LLM 转换,模仿目标医院的报告风格 +人类-AI 融合报告:240 份,结合临床笔记与 AI 生成的内容 + +AbomentAtlas数据集中每个病例里面的segmentions都是包含了25个器官组织的标注文件,同时也包含一个combined_labels.nii.gz的文件【里面加上背景值包含了0-25的数值 +1 aorta +2 gall_bladder +3 kidney_left +4 kidney_right +5 liver +6 pancreas +7 postcava +8 spleen +9 stomach +10 adrenal_gland_left +11 adrenal_gland_right +12 bladder +13 celiac_trunk +14 colon +15 duodenum +16 esophagus +17 femur_left +18 femur_right +19 hepatic_vessel +20 intestine +21 lung_left +22 lung_right +23 portal_vein_and_splenic_vein +24 prostate +25 rectum + + +参考TotalSegment分别存储25个器官的label处理后的数据文件 +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# print(embeder, input_size, in_size) +# print(tokenizer) + + +# print(embeder_output1) +# print(embeder_output1.shape) # torch.Size([1, 8, 768]) + + +# print(embeder_output2) +# print(embeder_output2.shape) # torch.Size([1, 8, 768]) + + +# error = torch.abs(embeder_output1 - embeder_output2) +# print(error) +# print("Embedding distance between 
def load_dicom_images(folder_path):
    """
    Read the DICOM series found in ``folder_path`` as a single image.

    Args:
        folder_path: Directory handed to ``GetGDCMSeriesFileNames``.

    Returns:
        A ``(file_names, image)`` tuple: the file names reported by the
        series reader and the image produced from them.
    """
    series_reader = sitk.ImageSeriesReader()
    slice_files = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(slice_files)
    return slice_files, series_reader.Execute()
def load_nrrd(fp):
    """Read the image file at ``fp`` with SimpleITK and return the image."""
    return sitk.ReadImage(fp)


def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the parent directory.

    Args:
        image: SimpleITK image to write.
        output_path: Destination file path.
        folder_path: Location of the source data; stored under the
            ``"FolderPath"`` key of the image's meta-data dictionary.
    """
    output_dirpath = os.path.dirname(output_path)
    # ``dirname`` is '' for a bare file name; there is nothing to create
    # then, and ``os.makedirs('')`` would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok: tolerate the directory appearing between check and call.
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def convert_windows_to_linux_path(windows_path):
    """Convert a Windows-style path to a POSIX-style one.

    Backslashes become forward slashes and a leading drive specifier such
    as ``C:`` is removed. Some metadata files contain Windows paths while
    the data itself lives on a Linux server.

    Only a drive specifier at the very start is stripped. The previous
    implementation cut everything up to the first colon found anywhere,
    which truncated POSIX paths that merely contain a colon.

    Args:
        windows_path: Path string, Windows- or POSIX-style.

    Returns:
        The converted path string.
    """
    linux_path = windows_path.replace('\\', '/')
    if len(linux_path) >= 2 and linux_path[0].isalpha() and linux_path[1] == ':':
        linux_path = linux_path[2:]
    return linux_path


def simpleitk_volume_calculation(image_path, label=1, min_voxels=10):
    """Return the volume, in millilitres, of one label in a mask file.

    The mask is loaded with ``util.load_nifti``; voxels equal to ``label``
    are counted and multiplied by the voxel volume derived from the image
    spacing (assumed to be in millimetres, so mm^3 / 1000 gives mL).

    The voxels are counted once with NumPy. The earlier version counted
    them that way for the emptiness check and then a second time through a
    binary-threshold plus label-statistics filter, with the same result.

    Args:
        image_path: Path of the segmentation mask.
        label: Voxel value that marks the structure (default 1).
        min_voxels: Masks with fewer labelled voxels than this are treated
            as empty (default 10).

    Returns:
        Volume in mL as a float; ``0.0`` for an (almost) empty mask.
    """
    image = util.load_nifti(image_path)
    spacing = image.GetSpacing()
    voxel_volume = spacing[0] * spacing[1] * spacing[2]  # mm^3

    mask_array = sitk.GetArrayFromImage(image)
    voxel_count = int((mask_array == label).sum())
    if voxel_count < min_voxels:
        return 0.0

    volume_mm3 = voxel_count * voxel_volume
    return volume_mm3 / 1000.0
metadata_files = find_metadata_files(target_path) + pid_dirs=find_image_dirs(target_path) + failed_files = [] + label_dict={} + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + json_output_path = os.path.join(output_dir, 'xx.json') + failed_files_path = os.path.join(output_dir, 'yy.json') + #meta = meta_data() + with open(json_output_path,'r') as fi: + fj=json.load(fi) + ''' + # Initialize the JSON file + if not os.path.exists(json_output_path): + with open(json_output_path, 'w') as json_file: + json.dump({}, json_file) + ''' + if pid_dirs: + for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"): + if not os.path.isdir(os.path.join(target_path,pid_dir)): + continue + if not pid_dir.startswith("BDMAP_"): + continue + + meta_file=os.path.join(target_path,'%s.csv'%pid_dir) + if os.path.isfile(meta_file): + mf_flag=True + # df_meta=pd.read_csv(meta_file,sep=',') + else: + mf_flag=False + + full_path=os.path.join(target_path,pid_dir,"ct.nii.gz") + + + try: + ''' + dicom_image=util.load_nifti(full_path) + spacing_info = dicom_image.GetSpacing() + print('SPACING INFO:', spacing_info) + + # metadata_keys = dicom_image.GetMetaDataKeys() + + # dtag=load_dicom_tag(dicom_fp[0]) + # uid=dtag.GetMetaData('0020|000e') ##Series Instance UID + # modality=dtag.GetMetaData('0008|0060')##Modality + uid=pid_dir + modality="CT" + study='AbdomenAtlas'##Dataset_name + CIA_other_info = { + 'Study_UID':uid, + 'metadata_file':'' + # 'Series_Description':serise_desc + } + CIA_other_info['split'] = "train" + if mf_flag: + CIA_other_info['metadata_file']=meta_file + + size = list(dicom_image.GetSize()) + resampler =util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size) + + # resize the image + if resampler is not None: + proces_image = resampler.Execute(dicom_image) + print('SPACIE INFO AFTER', proces_image.GetSpacing()) + CIA_other_info['Resample'] = True + else: + proces_image = dicom_image + CIA_other_info['Resample'] = False + + ## + 
# CIA_other_info['Image_id']=meta_image_id + # CIA_other_info['Weeks']=str(meta_weeks) + # CIA_other_info['FVC']=str(meta_fvc) + # CIA_other_info['Percent']=str(meta_percent) + # CIA_other_info['Age']=str(meta_age) + # CIA_other_info['Sex']=meta_sex + # CIA_other_info['Smoke_Status']=meta_status + # threshold the image + if 'CT' in modality: + proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT) + else: + pass + + output_path = os.path.join(output_dir,uid, f"{uid}.nii.gz") + # output_path=convert_windows_to_linux_path(output_path) + save_nifti(proces_image, output_path, full_path) + print(f"Saved NIfTI file to {output_path}") + ''' + ##segment + label_path_dict = {} + label_flag=True + + label_paths = os.path.join(target_path,pid_dir, 'segmentations') + label_files=glob.glob("%s/*.nii.gz"%(label_paths)) + #print(label_paths,label_files) + pelvis_flag=False + thorax_flag=False + lung_min=0 + lung_max=0 + kidney_flag=False + gall_bladder_flag=False + if len(label_files)>0: + for lf in label_files: + lf_name=os.path.basename(lf) + + lf_tissue=lf_name.replace(".nii.gz","") + + if 'femur' in lf_tissue: + vol_femur=simpleitk_volume_calculation(lf) + print(lf_tissue,vol_femur) + if vol_femur>=FEMUR_VOL_THRESH: + pelvis_flag=True + if 'lung' in lf_tissue: + vol_lung=simpleitk_volume_calculation(lf) + print(lf_tissue,vol_lung) + lung_max=max(lung_max,vol_lung) + if lung_min==0: + lung_min=vol_lung + else: + lung_min=min(lung_min,vol_lung) + if lung_min>=LUNG_VOL_THRESH: + thorax_flag=True + if 'kidney_right' in lf_tissue: + vol_kidney=simpleitk_volume_calculation(lf) + print(lf_tissue,vol_kidney) + if vol_kidney>=KIDNEY_VOL_THRESH: + kidney_flag=True + + if 'gall_bladder' in lf_tissue: + vol_gall_bladder=simpleitk_volume_calculation(lf) + print(lf_tissue,vol_gall_bladder) + if vol_gall_bladder>=gall_bladder_VOL_THRESH: + gall_bladder_flag=True + ''' + label_image=load_nrrd(lf) + resampler =util.get_unisize_resampler(label_image, interpolator='nearest', 
spacing=spacing_info, size=size) + if resampler is not None: + proces_label = resampler.Execute(label_image) + else: + proces_label = label_image + + + # print(proces_image.GetSize(),proces_label.GetSize()) + try: + assert proces_image.GetSize() == proces_label.GetSize() + except Exception as e: + failed_files.append(lf) + continue + + label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{lf_tissue}.nii.gz") + + label_path_dict[lf_tissue] = label_output_path + util.save_nifti(proces_label, label_output_path, lf) + print(f"Saved Label Segment NIfTI file to {label_output_path}") + ''' + else: + label_flag=False + except RuntimeError: + failed_files.append(full_path) + print(f"Failed to load DICOM images from {full_path}") + continue + + ''' + meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Weeks',meta_weeks) + meta.add_keyvalue('FVC',meta_fvc) + meta.add_keyvalue('Percent',meta_percent) + meta.add_keyvalue('Age',meta_age) + meta.add_keyvalue('Sex',meta_sex) + meta.add_keyvalue('Smoke_Status',meta_status) + + + size_processed = list(proces_image.GetSize()) + + meta_image_id=uid + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(spacing_info)) + meta.add_keyvalue('OriImg_path',full_path) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + ''' + roi='abdomen' + if thorax_flag and gall_bladder_flag: + roi='thorax-'+roi + if thorax_flag and not gall_bladder_flag: + roi='thorax' + if pelvis_flag and gall_bladder_flag: + roi=roi+"-pelvis" + if pelvis_flag and not gall_bladder_flag: + roi='pelvis' + if lung_min>0 and lung_max/lung_min>3: + label_dict[pid_dir]=[lung_max,lung_min] + + print(pid_dir,roi) + #meta.add_keyvalue('ROI',roi) + for ik in fj.keys(): + fi=fj[ik] + jid=fi['Metadata']['Study_UID'] + max_length=fi['Spacing_mm']*max(fi['Size'])*0.001 + print(max_length,max_length>1.2) + if jid==pid_dir: + if 
if __name__ == "__main__":
    # Command-line entry point: read the input and output locations, echo
    # them, and hand them to main().
    cli = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
    option_specs = (
        (
            "--target_path",
            "Path to the target directory containing metadata files.",
            "/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2",
        ),
        (
            "--output_dir",
            "Directory to save the NIfTI files.",
            "/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/",
        ),
    )
    for flag, help_text, default_location in option_specs:
        cli.add_argument(flag, type=str, help=help_text, default=default_location)
    opts = cli.parse_args()
    print(opts.target_path, opts.output_dir)
    main(opts.target_path, opts.output_dir)
"CT", + "MRI", + "T1", + "T2", + "X-ray", + "Fluoroscopy", + "US", + "PET" + ] + }, + "OriImg_path": { + "type": "string", + "required": true + }, + "Label_path": { + "type": "dict", + "required": false, + "keys": [ + "classification", + "segmentation", + "regression", + "detection", + "localization", + "registration", + "other" + ], + "value": { + "type": "dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/AbdomenCT1k/dataclean_abdomen_ct_1k.py b/AbdomenCT1k/dataclean_abdomen_ct_1k.py new file mode 100644 index 0000000000000000000000000000000000000000..0aba1aaa069f9f569b990dc4502ee5c1a15fec68 --- /dev/null +++ b/AbdomenCT1k/dataclean_abdomen_ct_1k.py @@ -0,0 +1,365 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-8-18 +update AbdomenCT1K data clean + +https://github.com/JunMa11/AbdomenCT-1K + 
liver (label 1), kidney (label 2), spleen (label 3), and pancreas (label 4). + AbdomenCT-1K 是一个大规模腹部 CT 数据集,包含了 1112 例 CT 扫描,用于 4 种腹部器官的分割,包括肝脏、肾脏、脾脏和胰腺。需注意,肾脏没有区分左右。 + 这些数据主要来源于 6 个数据集,其中 5 个是公开数据集,分别是 LiTS(201例)、KiTS19(300例)、MSD Spleen(61例)、MSD Pancreas(420例)和 NIH Pancreas(80例)。 + 另外一个是来自南京大学的新数据集,包含 50 例 CT 扫描。原始的这些公开数据集大多只针对某一特定器官进行了标注,但在 AbdomenCT-1K 中,每例 CT 扫描都对这四种器官进行了全面的标注。 + + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# print(embeder, input_size, in_size) +# print(tokenizer) + + +# print(embeder_output1) +# print(embeder_output1.shape) # torch.Size([1, 8, 768]) + + +# print(embeder_output2) +# print(embeder_output2.shape) # torch.Size([1, 8, 768]) + + +# error = torch.abs(embeder_output1 - embeder_output2) +# print(error) +# print("Embedding distance between the two sentences: ") +# print(f"String1: {string1}") +# print(f"String2: {string2}") +# print(torch.mean(error)) + + +# exit() + + +# meta_id_name='Patient' +# meta_weeks_name='Weeks' +# meta_fvc_name='FVC' +# meta_percent_name='Percent' +# meta_age_name='Age' +# meta_sex_name='Sex' +# meta_status_name='SmokingStatus' + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = [-1,0] # MRI images threshold 
def find_metadata_files(path):
    """Return the CSV files located directly inside ``path``.

    Only the top level of ``path`` is searched. The pattern contains no
    ``**`` component, so the ``recursive=True`` flag that used to be passed
    to ``glob.glob`` never had any effect and has been dropped.

    Args:
        path: Directory to search.

    Returns:
        Sorted list of matching paths (empty when nothing matches).
    """
    return sorted(glob.glob(os.path.join(path, '*.csv')))


def find_image_dirs(path):
    """Return the names of every entry in ``path``, sorted.

    The listing is not filtered: regular files are returned alongside
    directories, so callers have to test ``os.path.isdir`` /
    ``os.path.isfile`` themselves. Sorting gives a reproducible order.
    """
    return sorted(os.listdir(path))


def load_dicom_images(folder_path):
    """Read the DICOM series stored in ``folder_path``.

    Returns:
        Tuple ``(dicom_names, image)``: the file names reported by GDCM for
        the series and the image assembled from them.

    Raises:
        RuntimeError: If GDCM reports no series files for ``folder_path``.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    if not dicom_names:
        raise RuntimeError(f"No DICOM series found in {folder_path}")
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names, image


def load_dicom_tag(imgs):
    """Read the single image file ``imgs`` and return the loaded image.

    Tags are then available from the returned image's meta-data
    dictionary. ``Execute()`` loads the pixel data as well, so this is not
    a header-only read; the ``ReadImageInformation()`` call that used to
    precede it only parsed the header a second time and has been removed.

    Args:
        imgs: Path of one image file.

    Returns:
        The image produced by ``sitk.ImageFileReader``.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    return reader.Execute()


def load_nrrd(fp):
    """Read the image file at ``fp`` with SimpleITK and return the image."""
    return sitk.ReadImage(fp)


def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the parent directory.

    Args:
        image: SimpleITK image to write.
        output_path: Destination file path.
        folder_path: Location of the source data; stored under the
            ``"FolderPath"`` key of the image's meta-data dictionary.
    """
    output_dirpath = os.path.dirname(output_path)
    # ``dirname`` is '' for a bare file name; ``os.makedirs('')`` would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok: tolerate the directory appearing between check and call.
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def convert_windows_to_linux_path(windows_path):
    """Convert a Windows-style path to a POSIX-style one.

    Backslashes become forward slashes and a leading drive specifier such
    as ``C:`` is removed. Some metadata files contain Windows paths while
    the data itself lives on a Linux server.

    Only a drive specifier at the very start is stripped. The previous
    implementation cut everything up to the first colon found anywhere,
    which truncated POSIX paths that merely contain a colon.
    """
    linux_path = windows_path.replace('\\', '/')
    if len(linux_path) >= 2 and linux_path[0].isalpha() and linux_path[1] == ':':
        linux_path = linux_path[2:]
    return linux_path
output_dir): + metadata_files = find_metadata_files(target_path) + pid_dirs=find_image_dirs(target_path) + failed_files = [] + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + json_output_path = os.path.join(output_dir, 'nifti_mappings.json') + failed_files_path = os.path.join(output_dir, 'failed_files.json') + meta = meta_data() + + # Initialize the JSON file + if not os.path.exists(json_output_path): + with open(json_output_path, 'w') as json_file: + json.dump({}, json_file) + + if pid_dirs: + for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"): + if not os.path.isdir(os.path.join(target_path,pid_dir)): + continue + if not "AbdomenCT-1K-ImagePart" in pid_dir: + continue + meta_file=os.path.join(target_path,'%s.csv'%pid_dir) + if os.path.isfile(meta_file): + mf_flag=True + df_meta=pd.read_csv(meta_file,sep=',') + else: + mf_flag=False + + image_dirs=find_image_dirs(os.path.join(target_path,pid_dir)) + for data_dir in tqdm(image_dirs, desc="Processing images files"): + + + + full_path=os.path.join(target_path,pid_dir,data_dir) + # data_info_row=df_meta[df_meta[meta_id_name]==data_dir] + + # if data_info_row.shape[0]>0: + # data_info_row=data_info_row.reset_index() + # #print(data_info_row[meta_id_name]) + # meta_image_id=data_info_row[meta_id_name][0] + # meta_weeks=data_info_row[meta_weeks_name][0] + # meta_fvc=data_info_row[meta_fvc_name][0] + # meta_percent=data_info_row[meta_percent_name][0] + # meta_age=data_info_row[meta_age_name][0] + # meta_sex=data_info_row[meta_sex_name][0] + # meta_status=data_info_row[meta_status_name][0] + # else: + # meta_image_id=data_dir + # meta_weeks='' + # meta_fvc='' + # meta_percent='' + # meta_age='' + # meta_sex='' + # meta_status='' + # full_path = convert_windows_to_linux_path(full_path) + if not os.path.isfile(full_path): + continue + if not data_dir.endswith(".nii.gz"): + continue + try: + print(full_path) + + dicom_image=util.load_nifti(full_path) + spacing_info = dicom_image.GetSpacing() + 
print('SPACING INFO:', spacing_info) + + # metadata_keys = dicom_image.GetMetaDataKeys() + + # dtag=load_dicom_tag(dicom_fp[0]) + # uid=dtag.GetMetaData('0020|000e') ##Series Instance UID + # modality=dtag.GetMetaData('0008|0060')##Modality + uid=data_dir[:10] + modality="CT" + study='AbdomenCT1K'##Dataset_name + CIA_other_info = { + 'Study_UID':uid, + 'metadata_file':'' + # 'Series_Description':serise_desc + } + CIA_other_info['split'] = "train" + if mf_flag: + CIA_other_info['metadata_file']=meta_file + + size = list(dicom_image.GetSize()) + resampler =util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size) + + # resize the image + if resampler is not None: + proces_image = resampler.Execute(dicom_image) + print('SPACIE INFO AFTER', proces_image.GetSpacing()) + CIA_other_info['Resample'] = True + else: + proces_image = dicom_image + CIA_other_info['Resample'] = False + + ## + # CIA_other_info['Image_id']=meta_image_id + # CIA_other_info['Weeks']=str(meta_weeks) + # CIA_other_info['FVC']=str(meta_fvc) + # CIA_other_info['Percent']=str(meta_percent) + # CIA_other_info['Age']=str(meta_age) + # CIA_other_info['Sex']=meta_sex + # CIA_other_info['Smoke_Status']=meta_status + # threshold the image + if 'CT' in modality: + proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT) + else: + pass + + output_path = os.path.join(output_dir,uid, f"{uid}.nii.gz") + # output_path=convert_windows_to_linux_path(output_path) + save_nifti(proces_image, output_path, full_path) + print(f"Saved NIfTI file to {output_path}") + + ##segment + label_path_dict = {} + label_flag=True + + label_paths = os.path.join(target_path, 'Mask') + label_files=glob.glob("%s/%s.nii.gz"%(label_paths,uid)) + #print(label_paths,label_files) + if len(label_files)>0: + lf=label_files[0] + lf_name=os.path.basename(lf) + lf_id=lf_name.split("_")[0] + lf_tissue="abdomen" + label_image=load_nrrd(lf) + resampler =util.get_unisize_resampler(label_image, 
interpolator='nearest', spacing=spacing_info, size=size) + if resampler is not None: + proces_label = resampler.Execute(label_image) + else: + proces_label = label_image + + label_output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{uid}.nii.gz") + + label_path_dict[lf_tissue] = label_output_path + util.save_nifti(proces_label, label_output_path, lf) + print(f"Saved Label Segment NIfTI file to {label_output_path}") + + else: + label_flag=False + except RuntimeError: + failed_files.append(full_path) + print(f"Failed to load DICOM images from {full_path}") + continue + + ''' + meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Weeks',meta_weeks) + meta.add_keyvalue('FVC',meta_fvc) + meta.add_keyvalue('Percent',meta_percent) + meta.add_keyvalue('Age',meta_age) + meta.add_keyvalue('Sex',meta_sex) + meta.add_keyvalue('Smoke_Status',meta_status) + ''' + if label_flag: + print(proces_image.GetSize(),proces_label.GetSize()) + try: + assert proces_image.GetSize() == proces_label.GetSize() + except Exception as e: + failed_files.append(full_path) + continue + size_processed = list(proces_image.GetSize()) + + meta_image_id=uid + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(spacing_info)) + meta.add_keyvalue('OriImg_path',full_path) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','abdomen') + + + + if label_flag: + print(label_path_dict.keys()) + meta.add_keyvalue('Task',TASK_VALUE) + # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys())) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + + meta.add_keyvalue('Label_Dict',LABEL_DICT) + + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_path] 
def load_dicom_images(folder_path):
    """Read the DICOM series stored in ``folder_path`` and return the image.

    Args:
        folder_path: Directory containing the slices of one series.

    Returns:
        The image assembled by ``sitk.ImageSeriesReader``.

    Raises:
        RuntimeError: If GDCM reports no series files for ``folder_path``.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    if not dicom_names:
        # Name the offending folder instead of failing on an empty file list.
        raise RuntimeError(f"No DICOM series found in {folder_path}")
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return image


def convert_windows_to_linux_path(windows_path):
    """Convert a Windows-style path to a POSIX-style one.

    Backslashes become forward slashes and a leading drive specifier such
    as ``C:`` is removed. Some metadata files contain Windows paths while
    the data itself lives on a Linux server.

    Only a drive specifier at the very start is stripped. The previous
    implementation cut everything up to the first colon found anywhere,
    which truncated POSIX paths that merely contain a colon.

    Args:
        windows_path: Path string, Windows- or POSIX-style.

    Returns:
        The converted path string.
    """
    linux_path = windows_path.replace('\\', '/')
    if len(linux_path) >= 2 and linux_path[0].isalpha() and linux_path[1] == ':':
        linux_path = linux_path[2:]
    return linux_path
def read_table(file_path, split_str=';'):
    """Load a table from ``file_path`` as a DataFrame.

    The file is first read as an Excel workbook (openpyxl engine). If that
    fails for any reason it is read as delimited text instead.

    Args:
        file_path: Path of the table file.
        split_str: Column separator used for the text fallback.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` rather than a bare
        # ``except`` so that KeyboardInterrupt/SystemExit still propagate.
        df = pd.read_csv(file_path, sep=split_str)
    return df


def load_nifti(image_path):
    """Read the image file at ``image_path`` with SimpleITK and return it."""
    return sitk.ReadImage(image_path)


def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the parent directory.

    Args:
        image: SimpleITK image to write.
        output_path: Destination file path.
        folder_path: Location of the source data; stored under the
            ``"FolderPath"`` key of the image's meta-data dictionary.
    """
    output_dirpath = os.path.dirname(output_path)
    # ``dirname`` is '' for a bare file name; ``os.makedirs('')`` would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok: tolerate the directory appearing between check and call.
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def find_metadata_files(path, file_name='*meta*'):
    """Recursively find entries below ``path`` whose name matches ``file_name``.

    Args:
        path: Root directory of the search.
        file_name: Glob pattern for the entry name.

    Returns:
        Sorted list of matching paths at any depth below ``path``.
    """
    search_pattern = os.path.join(path, '**', file_name)
    return sorted(glob.glob(search_pattern, recursive=True))


def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
    """Collect image file paths below ``folder_path``.

    A file is kept when its name ends with ``img_type``, contains
    ``include_str`` (if given) and does not contain ``exclude_str`` (if
    given). Both filters look at the file name only, not at its directory.

    Args:
        folder_path: Root directory, walked recursively.
        img_type: Required file-name suffix.
        include_str: Substring the file name must contain, or ``None``.
        exclude_str: Substring the file name must not contain, or ``None``.
        is_sorted: Sort the resulting paths when true.

    Returns:
        List of full paths of the selected files.
    """
    img_path = []
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(img_type):
                continue
            if include_str is not None and include_str not in file:
                continue
            if exclude_str is not None and exclude_str in file:
                continue
            img_path.append(os.path.join(root, file))
    if is_sorted:
        img_path.sort()
    return img_path


def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None, default_pixel_value=0):
    """Build a resampler that makes ``ref_img`` isotropic.

    The output grid uses the smallest input spacing along every axis, e.g.
    spacing ``[0.1, 0.1, 0.3]`` becomes ``[0.1, 0.1, 0.1]``, and the size
    is scaled so the physical extent is kept. Origin, direction and pixel
    type are copied from ``ref_img``.

    Args:
        ref_img: Reference SimpleITK image.
        interpolator: ``'linear'`` (images), ``'nearest'`` (label masks),
            or a SimpleITK interpolator constant passed through unchanged.
        spacing: Input spacing; defaults to ``ref_img.GetSpacing()``.
        size: Input size; defaults to ``ref_img.GetSize()``.
        default_pixel_value: Value given to output voxels that fall outside
            the input image. The previous code passed
            ``ref_img.GetPixelIDValue()`` here, i.e. the pixel-type
            enumeration rather than an intensity, so border voxels of a
            label mask could be filled with a non-background value.

    Returns:
        A configured ``sitk.ResampleImageFilter``, or ``None`` when the
        spacing is already isotropic and no resampling is needed.

    Raises:
        ValueError: If ``interpolator`` is an unrecognised string.
    """
    if size is None:
        size = ref_img.GetSize()
    if spacing is None:
        spacing = ref_img.GetSpacing()
    min_spacing = min(spacing)
    if all(spc == min_spacing for spc in spacing):
        return None

    if isinstance(interpolator, str):
        named_interpolators = {
            'nearest': sitk.sitkNearestNeighbor,
            'linear': sitk.sitkLinear,
        }
        if interpolator not in named_interpolators:
            raise ValueError(f"interpolator {interpolator} is not valid")
        interpolator = named_interpolators[interpolator]

    # Scale each axis so that size * spacing (the physical extent) is kept.
    new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]

    resampler = sitk.ResampleImageFilter()
    resampler.SetSize(new_size)
    resampler.SetOutputSpacing([min_spacing] * len(new_size))
    resampler.SetOutputOrigin(ref_img.GetOrigin())
    resampler.SetOutputDirection(ref_img.GetDirection())
    resampler.SetInterpolator(interpolator)
    resampler.SetDefaultPixelValue(default_pixel_value)
    resampler.SetOutputPixelType(ref_img.GetPixelID())
    return resampler


def clamp_image(in_img, clamp_range):
    """Clamp the intensities of ``in_img`` to ``clamp_range``.

    Args:
        in_img: SimpleITK image.
        clamp_range: Pair ``(lower, upper)`` of bounds.

    Returns:
        The image returned by ``sitk.ClampImageFilter``.
    """
    clamp_filter = sitk.ClampImageFilter()
    clamp_filter.SetLowerBound(clamp_range[0])
    clamp_filter.SetUpperBound(clamp_range[1])
    return clamp_filter.Execute(in_img)
abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'], + 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'], + 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'], + 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'], + 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'], + 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'], + 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'], + 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'], + 'neck-thorax': ['neck-thorax', 
'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'], + 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'], + 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'], + 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'], + 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'], + 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'], + 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'], + 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'], + 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 
'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'], + 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'], + 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit''clavicle', 'scapula', 'acromion', 'acromioclavicular'], + 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'], + 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',], + 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'], + } + elif dict_type == 'Label_tissue': + dict_synonyms = { + 'liver': ['liver','hepatic'], + 'spleen': ['spleen','splenic'], + 'kidney': ['kidney','renal'], + 'pancreas': ['pancreas','pancreatic'], + 'stomach': ['stomach','gastric'], + 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'], + 'gallbladder': ['gallbladder'], + 'adrenal_gland': ['adrenal_gland','adrenal gland'], + 'bladder': ['bladder'], + 'prostate': ['prostate'], + 'uterus': ['uterus'], + 'ovary': ['ovary'], + 'testicle': ['testicle'], + 'lymph_node': ['lymph_node','lymph node'], + 'bone': ['bone'], + 'lung': ['lung'], + 'heart': ['heart'], + 'esophagus': ['esophagus'], + 'muscle': ['muscle'], + 'fat': ['fat'], + 'skin': ['skin'], + 'vessel': ['vessel'], + 'tumor': ['tumor'], + 'other': ['other'] + } + elif dict_type == 'Task': + dict_synonyms = { + 'segmentation': ['segmentation', 'seg', 'mask'], + 'classification': 
def replace_synonyms(text, dict_synonyms):
    '''
    Replace the synonyms in the text with the standard term.

    Parameters:
        text: a string, a list, or a dict. Lists are normalized element by
            element; dicts have both their keys and their values normalized.
            Any other type is returned unchanged.
        dict_synonyms: mapping of canonical term -> list of synonym strings.

    Returns:
        The canonical term for a matching string, a new list / dict for
        containers, or the input itself when nothing matches.
    '''
    if isinstance(text, str):
        lowered = text.lower()
        # Pass 1: exact (case-insensitive) match. Without this, a term that
        # merely *contains* a synonym of an earlier key wins, e.g. 'SPECT'
        # would be reported as 'CT' because 'ct' is a substring of 'spect'.
        for key, value in dict_synonyms.items():
            if any(v.lower() == lowered for v in value):
                return key
        # Pass 2: substring match, first hit in dictionary order (the
        # historical behaviour, needed for free text such as 'chest CT').
        for key, value in dict_synonyms.items():
            if any(v.lower() in lowered for v in value):
                return key
        # Nothing matched: report it and hand the value back unchanged so the
        # caller's validation decides what to do with it.
        import warnings
        warnings.warn(f"Value {text} is not in the correct format")
        return text
    if isinstance(text, list):
        return [replace_synonyms(t, dict_synonyms) for t in text]
    if isinstance(text, dict):
        # Build a new dict instead of popping/inserting while iterating; the
        # previous in-place version also used a list as a dict key (TypeError).
        return {
            replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }
    return text
class meta_data(object):
    '''
    Collect and validate the metadata record of one processed image.

    The schema is read from ``config_format.json`` next to this module. Every
    key flagged ``required`` starts as an empty dict placeholder in
    ``self.config``; ``get_meta_data`` only returns the record once all of
    those placeholders have been replaced.
    '''

    # Python type for each schema "type" name; used for schema keys that have
    # no dedicated validation rule in type_check (e.g. 'ImgDict').
    _SCHEMA_TYPES = {
        'string': str,
        'dict': dict,
        'list': list,
        'float': float,
        'int': int,
    }

    def __init__(self):
        self.config_format_path = os.path.join(os.path.dirname(__file__),'config_format.json')
        with open(self.config_format_path, 'r', encoding='utf-8') as file:
            self.config_format = json.load(file)
        # Required keys get an empty-dict placeholder that req_check looks for.
        self.config = {}
        for key, spec in self.config_format.items():
            if spec['required']:
                self.config[key] = {}
        self.keytypes = self.find_all_keys_with_type()
        self.keytypes_flatten = self.flatten_json()
        # Keys whose values are normalized through the synonym dictionaries.
        self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
        for key in self.ambiguity_keys:
            if key not in self.config_format:
                continue
            canonical = list(get_synonyms_dict(key).keys())
            spec = self.config_format[key]
            spec['options'] = canonical
            # List-typed keys are validated against items.options, so the
            # canonical terms must be allowed there as well; otherwise a
            # value produced by replace_synonyms can be rejected. Existing
            # schema options are kept.
            items = spec.get('items')
            if isinstance(items, dict) and isinstance(items.get('options'), list):
                missing = [term for term in canonical if term not in items['options']]
                items['options'].extend(missing)

    def get_ketytypes(self):
        # Misspelled name kept because existing callers use it.
        return self.keytypes

    def get_keytypes(self):
        # Correctly spelled alias of get_ketytypes.
        return self.keytypes

    def get_keytypes_flatten(self):
        return self.keytypes_flatten

    def find_all_keys_with_type(self, data=None, parent_key=''):
        '''
        Return {dotted key path: declared type} for every nested dict in
        ``data`` (default: the whole schema) that has a 'type' entry.
        '''
        if data is None:
            data = self.config_format
        keys_with_type = {}
        if isinstance(data, dict):
            for key, value in data.items():
                full_key = f"{parent_key}.{key}" if parent_key else key
                if isinstance(value, dict) and 'type' in value:
                    keys_with_type[full_key] = value['type']
                # Only containers can hold further typed entries; recursing
                # with a None value would restart from the whole schema.
                if isinstance(value, (dict, list)):
                    keys_with_type.update(self.find_all_keys_with_type(value, full_key))
        elif isinstance(data, list):
            for index, item in enumerate(data):
                if isinstance(item, (dict, list)):
                    full_key = f"{parent_key}[{index}]"
                    keys_with_type.update(self.find_all_keys_with_type(item, full_key))
        return keys_with_type

    def flatten_json(self, data=None, parent_key='', sep='.'):
        '''
        Flatten nested dicts into {dotted key path: scalar value}.

        Dicts found inside lists are flattened under ``key[index]``; scalar
        list elements (such as option names) are not emitted.
        '''
        if data is None:
            data = self.config_format
        items = {}
        if isinstance(data, dict):
            for key, value in data.items():
                new_key = f"{parent_key}{sep}{key}" if parent_key else key
                if isinstance(value, dict):
                    items.update(self.flatten_json(value, new_key, sep=sep))
                elif isinstance(value, list):
                    for i, item in enumerate(value):
                        if isinstance(item, (dict, list)):
                            items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
                else:
                    items[new_key] = value
        elif isinstance(data, list):
            for i, item in enumerate(data):
                if isinstance(item, (dict, list)):
                    items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
        return items

    def req_check(self):
        '''
        Return True when no key still holds its empty-dict placeholder; the
        offending keys are stored in ``self.unfilled_keys``.
        '''
        self.unfilled_keys = [key for key, value in self.config.items() if value == {}]
        return len(self.unfilled_keys) == 0

    def type_check(self, key, value):
        '''
        Validate ``value`` for schema key ``key``.

        Always returns a bool: True when the value is acceptable, False when
        the key is unknown or the value has the wrong type / option.
        '''
        if key not in self.config_format:
            print(key, "is not a valid key")
            return False
        spec = self.config_format[key]

        if key in ('Modality', 'ROI'):
            return value in spec['options']

        if key in ('OriImg_path', 'Dataset_name'):
            return isinstance(value, str)

        if key == 'Label_path':
            # Expected shape: {task name: {label name: path}}.
            if not isinstance(value, dict):
                return False
            for task_name, paths in value.items():
                if task_name not in spec['keys'] or not isinstance(paths, dict):
                    return False
            # NOTE(review): leaf values are not checked to be strings, as in
            # the previous implementation - confirm whether they should be.
            return True

        if key in ('Label_tissue', 'Task'):
            if not isinstance(value, list):
                return False
            allowed = spec['items']['options']
            return all(item in allowed for item in value)

        if key == 'Spacing_mm':
            return isinstance(value, float)

        if key == 'Size':
            # At least three dimensions; a fourth (e.g. frames) is allowed.
            return (
                isinstance(value, list)
                and len(value) >= 3
                and all(isinstance(item, int) for item in value)
            )

        if key in ('Sub_modality', 'Label_Dict'):
            return isinstance(value, dict)

        # Any other schema key: fall back to its declared type so that
        # dataset-specific keys (e.g. 'ImgDict') can be validated too.
        declared = spec.get('type')
        if declared == 'option':
            return value in spec.get('options', [])
        expected = self._SCHEMA_TYPES.get(declared)
        if expected is None:
            return False
        return isinstance(value, expected)

    def add_extra_keyvalue(self, key, value):
        # Store a key that is outside the schema, without validation.
        self.config[key] = value
        return True

    def add_keyvalue(self, key, value):
        '''
        Normalize (for ambiguity keys) and validate ``value``, then store it.

        Returns True when stored, False (with a warning) when rejected.
        '''
        if key in self.ambiguity_keys:
            value = replace_synonyms(value, get_synonyms_dict(key))
        if self.type_check(key, value):
            self.config[key] = value
            return True
        import warnings
        warnings.warn(f"Value {value} is not in the correct format for key {key}")
        return False

    def get_meta_data(self):
        # Returns the record itself (not a copy), or False while required
        # keys are still unfilled.
        if self.req_check():
            return self.config
        else:
            print("Not all required keys are filled", self.unfilled_keys)
            return False



if __name__ == '__main__':
    meta = meta_data()
    print(meta.get_keytypes_flatten())
    print(meta.get_ketytypes())
    meta.add_keyvalue('Modality', 'CT')
    meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
    meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
    meta.add_keyvalue('Spacing_mm', 1.5)
    meta.add_keyvalue('Size', [512, 512, 100])
    meta.add_keyvalue('Dataset_name', 'CT')
    meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
    meta.add_keyvalue('Task', ['1', '2', '3'])
    print(meta.get_meta_data())
    # The method is named add_extra_keyvalue; add_extra_key does not exist.
    meta.add_extra_keyvalue('extra', 'extra')
    print(meta.get_meta_data())
    print(meta.get_ketytypes())
    print(meta.get_keytypes_flatten())

    org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
    img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
    print(img_paths)
Run from the dataset's subdirectory: + +```bash +python dataclean_abdomen_atlas.py --target_path /path/to/raw/data --output_dir /path/to/output +``` + +All scripts follow the same `--target_path` / `--output_dir` argument pattern. Versioned scripts (e.g., `_v2.py`, `_v3.py`) represent iterative improvements; use the highest version unless investigating regressions. + +## Dependencies + +Python 3 with: `SimpleITK`, `pandas`, `numpy`, `tqdm`, `openpyxl` (for Excel metadata). No requirements.txt exists — install manually. + +## Architecture + +### Processing Pipeline (per dataset) + +1. **Load** raw data (DICOM via `sitk.ImageSeriesReader`, NIfTI via `sitk.ReadImage`, or NRRD) +2. **Extract metadata** from headers, CSV files, or DICOM tags +3. **Resample** to isotropic spacing using minimum voxel spacing (`get_unisize_resampler`) +4. **Clamp intensities** — CT: `[-300, 300]` HU; MRI: varies per dataset +5. **Process segmentation labels** with identical resampling (nearest-neighbor interpolation) +6. **Validate** image/label dimension alignment via `assert` on `GetSize()` +7. 
**Write** standardized NIfTI (`.nii.gz`) + append to `nifti_mappings.json` + +### Key Shared Components + +**`util.py`** (copied into each dataset directory — not a shared import): +- `meta_data` class — validates metadata against `config_format.json` schema, enforces required fields (Modality, OriImg_path, Spacing_mm, Size, Dataset_name), normalizes ambiguous terminology via synonym dictionaries +- `get_unisize_resampler()` — builds a SimpleITK resampler for isotropic spacing; returns `None` if spacing is already isotropic +- `clamp_image()` — HU/intensity clamping via `sitk.ClampImageFilter` +- `get_synonyms_dict()` / `replace_synonyms()` — canonical mapping for ROI names, tissue labels, modalities, and task types +- `load_nifti()`, `load_dicom_images()`, `save_nifti()` — I/O wrappers that embed `FolderPath` metadata in NIfTI headers + +**`config_format.json`** (per dataset directory): defines the metadata schema — field types, required flags, and allowed option values. + +### Output Structure + +``` +{output_dir}/{patient_id}/{patient_id}.nii.gz # processed image +{output_dir}/{patient_id}/{task}/{tissue}.nii.gz # segmentation labels +{output_dir}/nifti_mappings.json # metadata keyed by output path +{output_dir}/failed_files.json # files that failed processing +``` + +### Dataset-Specific Notes + +- **AbdomenAtlas**: 25-organ segmentation labels stored as individual NIfTI files per organ; also has `combined_labels.nii.gz` (values 0-25) +- **BRATS (2019/2020/2021)**: Multi-modal MRI (FLAIR, T1, T1ce, T2) — each modality processed as a separate sub-modality entry +- **MnM2/MnMs**: Cardiac MRI with vendor metadata (Siemens, Philips, GE, Canon) +- **OASIS**: Both cross-sectional and longitudinal variants; includes clinical scores (MMSE, CDR) +- **OAI_ZIB**: Knee MRI with 6-structure segmentation and clinical grading (WOMAC) +- **PSMA**: Dual-tracer PET/CT (PSMA & FDG); has longitudinal variant + +## Important Conventions + +- Resampling uses the **minimum** of the 
original spacing values to create isotropic voxels +- Labels are resampled with **nearest-neighbor** interpolation; images use **linear** +- The `meta_data` class normalizes terminology automatically — e.g., "chest" maps to "thorax", "seg" maps to "segmentation" +- `util.py` is duplicated across directories (not shared via import) — changes must be propagated manually +- Code comments and docstrings are frequently in Chinese +- Log files (`*.log`) in each directory contain processing run history — these can be large (up to 23 MB) diff --git a/MnM2_clean/config_format.json b/MnM2_clean/config_format.json new file mode 100644 index 0000000000000000000000000000000000000000..54d04c24e37a45877dec012ba98558c07b29ffe4 --- /dev/null +++ b/MnM2_clean/config_format.json @@ -0,0 +1,124 @@ +{ + "Modality": { + "type": "option", + "required": true, + "options": [ + "CT", + "MRI", + "T1", + "T2", + "X-ray", + "Fluoroscopy", + "US", + "PET" + ] + }, + "OriImg_path": { + "type": "string", + "required": true + }, + "Label_path": { + "type": "dict", + "required": false, + "keys": [ + "classification", + "segmentation", + "regression", + "detection", + "localization", + "registration", + "other" + ], + "value": { + "type": "dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, 
+ "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + "ImgDict": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/MnM2_clean/dataclean_MnM2.py b/MnM2_clean/dataclean_MnM2.py new file mode 100644 index 0000000000000000000000000000000000000000..f5a307540a75e69becdf909b82e44e705807d32d --- /dev/null +++ b/MnM2_clean/dataclean_MnM2.py @@ -0,0 +1,427 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-08-26 +update MnMs2 data clean + +nM2数据集的处理逻辑(个人理解,目前是按照这个思路来编写的处理脚本): +1.LA或者SA需要分开存储处理; +2.ED/ES我理解是舒张|收缩状态的图像信息,只是对应CINE(LA或SA)的某一帧;考虑到没有找到对应的头文件信息,不知道具体对应哪一帧; +3.这个数据集应该不是最原始的MnM2数据集,像是经过某些处理后的;同时没有找到对应的头文件信息; +4.带gt的文件为label标注文件,包含0,1,2,3【0:背景 1:左心室腔(LV)2:右心室腔(RV)3:左心室心肌(Myo)】--需要帮忙确认下 + +a.需要单独保存LA-CINE以及SA-CINE的重处理后的文件; +b.另外需要单独处理LA-ED,LA-ES以及SA-ED,SA-ES的重处理后的文件【spaceing以及size同CINE】;以及label标注文件; + +##暂时将LA-ED/ES分开,可以考虑计算每个cine的时次图层的图像均值来判定ED/ES对应的所在帧【试验可行】;--20250825 +分割标签:NIFTI 格式,标签值: + +0:背景 + +1:左心室腔(LV) + +2:右心室腔(RV) + +3:左心室心肌(Myo + +当前版本没有元文件信息 + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + + + + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... 
+TARGET_VOXEL_SPACING=None + +LABEL_DICT={ + "0":"backgroud", + "1":"LV",#左心室 Blood Pools + "3":"MYO",#左心室心肌 + "2":"RV"#右心室 Blood Pools +} + +# def find_metadata_files(path): +# # for Cancer Image Archive (TCIA) dataset +# search_pattern = os.path.join(path, '**', 'metadata.csv') +# return glob.glob(search_pattern, recursive=True) + +def find_metadata_files(path): + # for Cancer Image Archive (TCIA) dataset + search_pattern = os.path.join(path, '*.csv') + return glob.glob(search_pattern, recursive=True) +##added by yanguoqing on 20250527 +def find_image_dirs(path): + return os.listdir(path) + +##modify by yanguoqing on 20250527 +def load_dicom_images(folder_path): + reader = sitk.ImageSeriesReader() + dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileNames(dicom_names) + image = reader.Execute() + return dicom_names,image + +##added by yanguoqing on 20250527 +def load_dicom_tag(imgs): + reader = sitk.ImageFileReader() + # dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileName(imgs) + reader.ReadImageInformation() # 仅读取元信息,不加载像素数据 + # metadata_keys = reader.GetMetaDataKeys() + tag=reader.Execute() + return tag + +def load_nrrd(fp): + return sitk.ReadImage(fp) + +def save_nifti(image, output_path, folder_path): + # Set metadata in the NIfTI file's header + output_dirpath = os.path.dirname(output_path) + if not os.path.exists(output_dirpath): + print(f"Creating directory {output_dirpath}") + os.makedirs(output_dirpath) + # Set metadata in the NIfTI file's header + image.SetMetaData("FolderPath", folder_path) + sitk.WriteImage(image, output_path) + +##modify by yanguoqing on 20250527 +def convert_windows_to_linux_path(windows_path): + # Replace backslashes with forward slashes and remove the drive letter + # Some meta files have windows paths, but the data is stored on a linux server + linux_path = windows_path.replace('\\', '/') + if ':' in linux_path: + linux_path = linux_path.split(':', 1)[1] + return linux_path + +def 
def main(target_path, output_dir):
    """Convert the MnM2 cases under ``target_path`` into ``output_dir``.

    For every sub-directory of ``target_path`` the CINE volumes and the
    ``*ES.nii.gz`` volumes are resampled and written below
    ``output_dir/<case>/``. When a ``*_gt.nii.gz`` file exists for an ES
    volume it is resampled onto the processed image grid with
    nearest-neighbour interpolation and written below
    ``output_dir/<case>/segmentation/``. One metadata record per written
    image is stored in ``nifti_mappings.json``; label files that could not be
    aligned and images that raised a RuntimeError are listed in
    ``failed_files.json``.

    Parameters:
        target_path: directory holding one sub-directory per case.
        output_dir: destination directory (created when missing).
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # The dataset information CSV is only referenced by path in the metadata.
    meta_file = os.path.join(target_path, '211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
    mf_flag = os.path.isfile(meta_file)

    modality = "MRI"
    study = 'MnM2'  # Dataset_name

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        full_dir = os.path.join(target_path, pid_dir)
        if not os.path.isdir(full_dir):
            continue
        meta_image_id = pid_dir

        for df in find_image_dirs(full_dir):
            # Only CINE volumes (never labelled) and ES volumes (labelled
            # when a matching *_gt.nii.gz exists) are processed.
            if "CINE" in df:
                label_flag = False
            elif "ES.nii.gz" in df:
                label_flag = os.path.isfile(
                    os.path.join(full_dir, df.replace(".nii.gz", "_gt.nii.gz")))
            else:
                continue

            # One metadata object per image: sharing a single instance lets
            # optional keys (Label_path, Label_Dict, ...) of a labelled image
            # leak into the records of the images processed after it.
            meta = meta_data()
            processed_lbl_full_path = None
            label_path_dict = {}
            full_path_image = os.path.join(full_dir, df)
            try:
                sitk_img_original = util.load_nifti(full_path_image)
                if sitk_img_original is None:
                    print(f" Failed to load image: {full_path_image}")
                    continue

                original_spacing = list(sitk_img_original.GetSpacing())
                original_size = list(sitk_img_original.GetSize())
                sitk_img_processed = sitk_img_original

                # --- Resampling ---
                if sitk_img_original.GetDimension() == 4:
                    # 4D volumes are resampled one 3D frame at a time.
                    channels = []
                    current_3d_channel_size = original_size[:3]
                    channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3]
                    for i in range(original_size[3]):
                        extractor = sitk.ExtractImageFilter()
                        # Size 0 in the 4th dimension collapses it.
                        extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
                        extractor.SetIndex([0,0,0,i])
                        channel_3d_img = extractor.Execute(sitk_img_original)

                        channel_resampler = util.get_unisize_resampler(
                            channel_3d_img, 'linear',
                            spacing=channel_target_spacing, size=current_3d_channel_size
                        )
                        # No resampler is returned when nothing needs resampling.
                        if channel_resampler:
                            channels.append(channel_resampler.Execute(channel_3d_img))
                        else:
                            channels.append(channel_3d_img)

                    if len(channels) > 1:
                        sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
                    elif len(channels) == 1:
                        sitk_img_processed = channels[0]
                else:
                    target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing
                    img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
                                                                   spacing=target_spacing, size=original_size)
                    if img_resampler_obj:
                        sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)

                CIA_other_info = {
                    'metadata_file': meta_file if mf_flag else '',
                    'split': "train",
                    'Image_id': meta_image_id,
                }

                # No intensity clamping is applied to these MRI volumes
                # (CLAMP_RANGE_MRI is still a placeholder).

                output_path = os.path.join(output_dir,pid_dir, f"{df}")
                save_nifti(sitk_img_processed, output_path, full_path_image)
                print(f"Saved NIfTI file to {output_path}")

                if label_flag:
                    processed_lbl_full_path = os.path.join(output_dir, pid_dir, TASK_VALUE, f"{df}")
                    full_path_label = os.path.join(full_dir, df.replace(".nii.gz", "_gt.nii.gz"))

                    sitk_lbl_original = util.load_nifti(full_path_label)
                    if not sitk_lbl_original:
                        print(f" Failed to load label: {full_path_label}")
                        continue

                    # The label is resampled onto the processed image grid;
                    # for a 4D image its first 3D frame is the reference.
                    reference_for_label = sitk_img_processed
                    if sitk_img_processed.GetDimension() == 4:
                        proc_size = sitk_img_processed.GetSize()
                        if proc_size[3] > 0:
                            extractor = sitk.ExtractImageFilter()
                            extractor.SetSize([proc_size[0], proc_size[1], proc_size[2], 0])
                            extractor.SetIndex([0,0,0,0])
                            try:
                                reference_for_label = extractor.Execute(sitk_img_processed)
                            except Exception:
                                print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
                                reference_for_label = None
                        else:
                            print(f" Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
                            reference_for_label = None

                    sitk_lbl_processed = None
                    if reference_for_label and reference_for_label.GetDimension() > 0:
                        label_resampler = sitk.ResampleImageFilter()
                        # Nearest neighbour keeps label values discrete.
                        label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
                        label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())
                        label_resampler.SetReferenceImage(reference_for_label)

                        if sitk_lbl_original.GetDimension() == 4:
                            lbl_channels = []
                            lbl_size = list(sitk_lbl_original.GetSize())
                            for i in range(lbl_size[3]):
                                extractor = sitk.ExtractImageFilter()
                                extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
                                extractor.SetIndex([0, 0, 0, i])
                                single_channel = extractor.Execute(sitk_lbl_original)
                                lbl_channels.append(label_resampler.Execute(single_channel))

                            if len(lbl_channels) > 1:
                                sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
                            elif len(lbl_channels) == 1:
                                sitk_lbl_processed = lbl_channels[0]
                        else:
                            sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)

                        if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
                            print(f" Mismatch between image and label size (ignoring channels):")
                            print(f" Image size: {sitk_img_processed.GetSize()}")
                            print(f" Label size: {sitk_lbl_processed.GetSize()}")
                        util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
                    else:
                        print(f" Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
                        util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label)
                        sitk_lbl_processed = sitk_lbl_original

                    label_path_dict['heart'] = processed_lbl_full_path

                    print('compare image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
                    print('compare image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize())
                    # Explicit comparison instead of assert, which is
                    # stripped when Python runs with -O.
                    if sitk_img_processed.GetSize() != sitk_lbl_processed.GetSize():
                        failed_files.append(full_path_label)
                        continue
            except RuntimeError:
                failed_files.append(full_path_image)
                print(f"Failed to load MnMs images from {full_path_image}")
                continue

            size_processed = list(sitk_img_processed.GetSize())
            print('size_processed',size_processed,original_size)

            # Smallest of the x/y/z spacings (a 4th, temporal value is ignored).
            meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))
            meta.add_keyvalue('OriImg_path',full_path_image)
            meta.add_keyvalue('Size',size_processed)  # size after processing
            meta.add_keyvalue('Modality',modality)
            meta.add_keyvalue('Dataset_name',study)
            meta.add_keyvalue('ROI','chest')

            if processed_lbl_full_path:
                print(label_path_dict.keys())
                # 'Task' is validated as a list; a bare string is rejected
                # and the key would silently be missing from the record.
                meta.add_keyvalue('Task',[TASK_VALUE])
                meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                meta.add_keyvalue('Label_Dict',LABEL_DICT)
            meta.add_extra_keyvalue('Metadata',CIA_other_info)

            # Write the mapping to the JSON file on the fly
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_path] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnM2/MnM2/dataset/")
    parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/MnM2/")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)
TASK_VALUE="segmentation"
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
TARGET_VOXEL_SPACING=None

# Label value -> structure name, written into the metadata as 'Label_Dict'.
# NOTE(review): "backgroud" is misspelled but is emitted into the output
# metadata; left as-is so existing consumers keep working.
LABEL_DICT={
    "0":"backgroud",
    "1":"LV",   # left ventricle blood pool
    "3":"MYO",  # left ventricular myocardium
    "2":"RV"    # right ventricle blood pool
}


def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside ``path``."""
    # The pattern has no '**' component, so glob's recursive mode would have
    # no effect; only the top level is searched.
    search_pattern = os.path.join(path, '*.csv')
    return glob.glob(search_pattern)


def find_image_dirs(path):
    """Return the names of all entries (files and directories) in ``path``."""
    return os.listdir(path)


def load_dicom_images(folder_path):
    """Read the DICOM series in ``folder_path``.

    Returns a ``(file_names, image)`` tuple: the series file names reported
    by GDCM and the image assembled from them.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names,image


def load_dicom_tag(imgs):
    """Read the single file ``imgs`` and return the resulting image object.

    NOTE(review): ReadImageInformation() reads only the header, but the
    following Execute() call reads the file again and its result is what is
    returned - confirm whether callers need more than the header.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    reader.ReadImageInformation()
    tag=reader.Execute()
    return tag


def load_nrrd(fp):
    """Read the image file at ``fp`` with SimpleITK."""
    return sitk.ReadImage(fp)


def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating its directory if needed.

    ``folder_path`` (the source location) is stored on the image under the
    "FolderPath" metadata key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise.
    if output_dirpath and not os.path.isdir(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def convert_windows_to_linux_path(windows_path):
    """Convert a Windows-style path to a POSIX-style one.

    Backslashes become forward slashes and a leading drive letter ("C:") is
    dropped. Some meta files contain Windows paths although the data is
    stored on a Linux server.
    """
    linux_path = windows_path.replace('\\', '/')
    # Strip only a real drive prefix. Splitting on the first ':' anywhere
    # would truncate paths that merely contain a colon in a later component.
    if len(linux_path) >= 2 and linux_path[1] == ':' and linux_path[0].isalpha():
        linux_path = linux_path[2:]
    return linux_path
is_4d_image = msd_dataset_info.get("tensorImageSize", "3D").upper() == "4D" or sitk_img_original.GetDimension() == 4 + is_4d_image = sitk_img_original.GetDimension() == 4 + + frame_flag=False + # --- Resampling Logic (Revised for 4D) --- + if is_4d_image: + + # Always process 4D images channel-wise for resampling + # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only + channels = [] + num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1 + channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing + + + for i in range(num_channels): + extractor = sitk.ExtractImageFilter() + current_3d_channel_size = original_size[:3] + + if sitk_img_original.GetDimension() == 4: + extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0]) + extractor.SetIndex([0,0,0,i]) + channel_3d_img = extractor.Execute(sitk_img_original) + else: + channel_3d_img = sitk_img_original + if i > 0: break + + channel_resampler = util.get_unisize_resampler( + channel_3d_img, 'linear', + spacing=channel_target_spacing, size=current_3d_channel_size + ) + if channel_resampler: + channels.append(channel_resampler.Execute(channel_3d_img)) + else: + channels.append(channel_3d_img) + + if channels: + if len(channels) > 1: # Only join if there are multiple channels + sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels) + ##aded by yanguoqing on 2025-08-11 + frame_flag=True + # imgDict={} + # for kf_idx in range(num_channels): + # imgDict[str(kf_idx)]='none' + # if str(meta_ed):imgDict[str(meta_ed)]='ed' + # if str(meta_es):imgDict[str(meta_es)]='es' + # meta.add_keyvalue('ImgDict',imgDict) + elif len(channels) == 1: # If only one channel resulted (e.g. 
original was 3D misidentified as 4D by tensorImageSize) + sitk_img_processed = channels[0] + elif TARGET_VOXEL_SPACING: # 3D image with target spacing + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear', + spacing=TARGET_VOXEL_SPACING, size=original_size) + if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + else: # 3D image, no TARGET_VOXEL_SPACING + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear', + spacing=original_spacing, size=original_size) + if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + + + CIA_other_info = { + 'metadata_file':'' + # 'Series_Description':serise_desc + } + CIA_other_info['split'] = "train" + CIA_other_info['Image_id']=meta_image_id + if mf_flag: + CIA_other_info['metadata_file']=meta_file + + is_processed_4d = sitk_img_processed.GetDimension() == 4 + clamp_range_to_use=None + if clamp_range_to_use and is_processed_4d: + clamped_channels_final = [] + num_channels_final = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1 + for i in range(num_channels_final): + extractor = sitk.ExtractImageFilter() + proc_size_final = sitk_img_processed.GetSize() + extractor.SetSize([proc_size_final[0], proc_size_final[1], proc_size_final[2], 0]) + extractor.SetIndex([0,0,0,i]) + channel_3d_img_to_clamp = extractor.Execute(sitk_img_processed) + clamped_channels_final.append(util.clamp_image(channel_3d_img_to_clamp, clamp_range_to_use)) + if clamped_channels_final: + if len(clamped_channels_final) > 1: + sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(clamped_channels_final) + elif len(clamped_channels_final) == 1: + sitk_img_processed = clamped_channels_final[0] + elif clamp_range_to_use: # 3D image + sitk_img_processed = util.clamp_image(sitk_img_processed, clamp_range_to_use) + + + output_path = os.path.join(output_dir,pid_dir, f"{df}") + # 
output_path=convert_windows_to_linux_path(output_path) + save_nifti(sitk_img_processed, output_path, full_path_image) + print(f"Saved NIfTI file to {output_path}") + + + + label_path_dict = {} + + if label_flag: + processed_lbl_full_path = os.path.join(output_dir, pid_dir, TASK_VALUE, f"{df}") + full_path_label=os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz")) + + sitk_lbl_original = util.load_nifti(full_path_label) + if not sitk_lbl_original: + print(f" Failed to load label: {full_path_label}") + processed_lbl_full_path = None + continue + if sitk_lbl_original: + label_resampler = sitk.ResampleImageFilter() + reference_for_label = sitk_img_processed # Default to processed image + print(sitk_img_processed.GetDimension(),sitk_img_processed.GetSize()) + if sitk_img_processed.GetDimension() == 4: + num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1 + if num_comp_proc > 0: + extractor = sitk.ExtractImageFilter() + proc_img_size_for_lbl_ref = sitk_img_processed.GetSize() + extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0]) + extractor.SetIndex([0,0,0,0]) + try: + reference_for_label = extractor.Execute(sitk_img_processed) + except Exception as ref_err: + print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.") + # print(traceback.format_exc()) + reference_for_label = None + else: # Fallback if extraction fails + print(f" Could not extract 3D reference for label from 4D image {output_path}. 
Label may not be correctly resampled.") + reference_for_label = None # This will cause an issue below if not handled + + sitk_lbl_processed = None + + if reference_for_label and reference_for_label.GetDimension() > 0: + label_resampler.SetInterpolator(sitk.sitkNearestNeighbor) + label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID()) + + if sitk_lbl_original.GetDimension() == 4: + lbl_channels = [] + lbl_size = list(sitk_lbl_original.GetSize()) + for i in range(lbl_size[3]): + extractor = sitk.ExtractImageFilter() + extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0]) + extractor.SetIndex([0, 0, 0, i]) + single_channel = extractor.Execute(sitk_lbl_original) + + label_resampler.SetReferenceImage(reference_for_label) + resampled_channel = label_resampler.Execute(single_channel) + lbl_channels.append(resampled_channel) + + if len(lbl_channels) > 1: + sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels) + elif len(lbl_channels) == 1: + sitk_lbl_processed = lbl_channels[0] + else: + label_resampler.SetReferenceImage(reference_for_label) + sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original) + if processed_lbl_full_path: + if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]: + print(f" Mismatch between image and label size (ignoring channels):") + print(f" Image size: {sitk_img_processed.GetSize()}") + print(f" Label size: {sitk_lbl_processed.GetSize()}") + util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label) + else: + print(f" Failed to set reference image for label resampling for {full_path_label}. 
Saving original label.") + img_resampler_obj = util.get_unisize_resampler(sitk_lbl_original, 'nearest', + spacing=original_spacing, size=original_size) + if img_resampler_obj: + sitk_lbl_processed = img_resampler_obj.Execute(sitk_lbl_original) + util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label) # Save original + # processed_lbl_full_path should still point to this saved original label + + + else: + processed_lbl_full_path = None + else: + processed_lbl_full_path = None + + if processed_lbl_full_path: + label_path_dict['heart'] = processed_lbl_full_path + + print('compare image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize()) + print('compare image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize()) + try: + assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize() + + except Exception as e: + failed_files.append(full_path_label) + continue + except RuntimeError: + failed_files.append(full_path_image) + print(f"Failed to load MnMs images from {full_path_image}") + continue + + size_processed = list(sitk_img_processed.GetSize()) + print('size_processed',size_processed,original_size) + + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))##保留前三个x,y,z的最小spacing + meta.add_keyvalue('OriImg_path',full_path_image) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','chest') + + + if processed_lbl_full_path: + print(label_path_dict.keys()) + meta.add_keyvalue('Task',TASK_VALUE) + # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys())) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + meta.add_keyvalue('Label_Dict',LABEL_DICT) + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + 
existing_mappings = json.load(json_file) + existing_mappings[output_path] = meta.get_meta_data() + json_file.seek(0) + print(existing_mappings) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + else: + continue + + + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnM2/MnM2/dataset/") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/MnM2/V2") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + + diff --git a/MnM2_clean/dataclean_MnM2_v3.py b/MnM2_clean/dataclean_MnM2_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae41a0986b1e9b46269b165f2cd987e581a2fee --- /dev/null +++ b/MnM2_clean/dataclean_MnM2_v3.py @@ -0,0 +1,451 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-08-26 +update MnMs2 data clean + +nM2数据集的处理逻辑(个人理解,目前是按照这个思路来编写的处理脚本): +1.LA或者SA需要分开存储处理; +2.ED/ES我理解是舒张|收缩状态的图像信息,只是对应CINE(LA或SA)的某一帧;考虑到没有找到对应的头文件信息,不知道具体对应哪一帧; +3.这个数据集应该不是最原始的MnM2数据集,像是经过某些处理后的;同时没有找到对应的头文件信息; +4.带gt的文件为label标注文件,包含0,1,2,3【0:背景 1:左心室腔(LV)2:右心室腔(RV)3:左心室心肌(Myo)】--需要帮忙确认下 + +a.需要单独保存LA-CINE以及SA-CINE的重处理后的文件; +b.另外需要单独处理LA-ED,LA-ES以及SA-ED,SA-ES的重处理后的文件【spaceing以及size同CINE】;以及label标注文件; + +##暂时将LA-ED/ES分开,可以考虑计算每个cine的时次图层的图像均值来判定ED/ES对应的所在帧【试验可行】;--20250825 +分割标签:NIFTI 格式,标签值: + +0:背景 + +1:左心室腔(LV) + +2:右心室腔(RV) + +3:左心室心肌(Myo + +当前版本没有元文件信息 + +''' +import os +import glob +import pandas as pd +import 
# Key under which label paths are stored in the metadata ("Task" value).
TASK_VALUE="segmentation"
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
# None means "keep each image's own spacing as the resampling input".
TARGET_VOXEL_SPACING=None

# Label value -> structure name for the *_gt segmentation files.
# NOTE(review): "backgroud" is misspelled; it is left unchanged because the
# string is written into the output metadata and consumers may match on it.
LABEL_DICT={
    "0":"backgroud",
    "1":"LV",  # left-ventricle blood pool
    "3":"MYO",  # left-ventricular myocardium
    "2":"RV"  # right-ventricle blood pool
}


def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside *path*, sorted.

    The search is not recursive: the pattern contains no ``**`` component, so
    the ``recursive=True`` flag the previous version passed had no effect and
    has been dropped. Sorting makes the result independent of file-system
    enumeration order.
    """
    return sorted(glob.glob(os.path.join(path, '*.csv')))


# added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries (files and directories) in *path*, sorted.

    ``os.listdir`` yields entries in arbitrary order; sorting keeps processing
    order, and therefore logs and output files, reproducible between runs.
    """
    return sorted(os.listdir(path))


# modified by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series in *folder_path*.

    Returns:
        A ``(dicom_names, image)`` tuple: the series file names reported by
        GDCM and the image assembled from them.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names,image


# added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read the single file *imgs* and return the loaded image.

    ``ReadImageInformation()`` parses the header first, but ``Execute()`` is
    still called afterwards, so pixel data IS loaded and the returned object
    is a full image, not just its tags.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    reader.ReadImageInformation()
    tag=reader.Execute()
    return tag


def load_nrrd(fp):
    """Read the image file *fp* with SimpleITK and return it."""
    return sitk.ReadImage(fp)


def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, creating parent directories as needed.

    Args:
        image: SimpleITK image to write.
        output_path: Destination file path.
        folder_path: Source path, stored in the image under the
            ``"FolderPath"`` metadata key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise, so
    # only create a directory when there is one to create.
    if output_dirpath and not os.path.isdir(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok avoids failing if the directory appears in the meantime.
        os.makedirs(output_dirpath, exist_ok=True)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


# modified by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a POSIX-style one.

    Backslashes become forward slashes and everything up to and including the
    first ``:`` (the drive letter) is removed. Some meta files contain Windows
    paths although the data is stored on a Linux server.
    """
    linux_path = windows_path.replace('\\', '/')
    if ':' in linux_path:
        linux_path = linux_path.split(':', 1)[1]
    return linux_path
def main(target_path: str, output_dir: str) -> None:
    """Clean the MnM2 dataset: resample ES/ED frames and their labels.

    Every sub-directory of ``target_path`` is treated as one case. Files whose
    name contains ``"ES.nii.gz"`` or ``"ED.nii.gz"`` are processed together
    with their ``*_gt.nii.gz`` sibling when that file exists. Processed
    volumes are written below ``output_dir`` and one metadata entry per image
    is appended to ``nifti_mappings.json``. Inputs that fail are listed in
    ``failed_files.json``.

    Args:
        target_path: Dataset root containing one directory per case.
        output_dir: Destination root; created when missing.
    """
    # NOTE(review): metadata_files is never read below.
    metadata_files = find_metadata_files(target_path)
    pid_dirs=find_image_dirs(target_path)
    failed_files = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    # NOTE(review): a single meta_data instance is reused for every image;
    # presumably add_keyvalue overwrites per key, so label keys set for one
    # image may carry over to a later unlabeled image -- confirm in util.
    meta = meta_data()

    # Initialize the JSON file (an existing mapping file is kept and extended)
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    meta_file=os.path.join(target_path,'211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
    if os.path.isfile(meta_file):
        mf_flag=True
        # NOTE(review): df_meta is loaded but never read below.
        df_meta=pd.read_csv(meta_file,sep=',')
    else:
        mf_flag=False

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            # Only directories are cases; loose files in the root are ignored.
            if not os.path.isdir(os.path.join(target_path,pid_dir)):
                continue
            meta_image_id=pid_dir

            modality="MRI"
            study='MnM2'  # Dataset_name

            full_dir=os.path.join(target_path,pid_dir)
            dfs=find_image_dirs(full_dir)  # every entry of the case directory

            print(">>>>",meta_image_id)
            if len(dfs)>0:
                for df in dfs:
                    # Pick out the SA/LA CINE series and the ES/ED frames
                    # together with their *_gt label files.
                    # NOTE(review): la_flag is assigned in each branch but never read.
                    if "CINE" in df:
                        label_flag=False
                        if "_LA_" in df:
                            la_flag=True
                        else:
                            la_flag=False
                        # NOTE(review): the nesting of this `continue` is ambiguous in
                        # the reviewed diff; it is placed at branch level here, which
                        # skips every CINE file -- confirm against the real file.
                        continue
                    elif "ES.nii.gz" in df:
                        if "_LA_" in df:
                            la_flag=True
                        else:
                            la_flag=False
                        # A label exists when the *_gt.nii.gz sibling is present.
                        if os.path.isfile(os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))):
                            label_flag=True
                        else:
                            label_flag=False
                    elif "ED.nii.gz" in df :  # ED frame, handled exactly like ES
                        if "_LA_" in df:
                            la_flag=True
                        else:
                            la_flag=False
                        if os.path.isfile(os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))):
                            label_flag=True
                        else:
                            label_flag=False
                    else:
                        continue
                    try:
                        # ---- load and resample the image ----
                        full_path_image=os.path.join(full_dir,df)
                        print("orig_file:",full_path_image)
                        sitk_img_original = util.load_nifti(full_path_image)
                        if sitk_img_original is None:
                            print(f" Failed to load image: {full_path_image}")
                            continue

                        original_spacing = list(sitk_img_original.GetSpacing())
                        original_size = list(sitk_img_original.GetSize())
                        sitk_img_processed = sitk_img_original
                        is_4d_image = sitk_img_original.GetDimension() == 4

                        # NOTE(review): frame_flag is set but never read.
                        frame_flag=False
                        # --- Resampling Logic (Revised for 4D) ---
                        if is_4d_image:

                            # 4D images are split along the 4th axis, each 3D
                            # volume is resampled on its own, then re-joined.
                            channels = []
                            num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
                            channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing


                            for i in range(num_channels):
                                extractor = sitk.ExtractImageFilter()
                                current_3d_channel_size = original_size[:3]

                                if sitk_img_original.GetDimension() == 4:
                                    # Size 0 on the last axis collapses it, yielding a 3D volume.
                                    extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
                                    extractor.SetIndex([0,0,0,i])
                                    channel_3d_img = extractor.Execute(sitk_img_original)
                                else:
                                    channel_3d_img = sitk_img_original
                                    if i > 0: break

                                channel_resampler = util.get_unisize_resampler(
                                    channel_3d_img, 'linear',
                                    spacing=channel_target_spacing, size=current_3d_channel_size
                                )
                                # A None resampler means no resampling is needed.
                                if channel_resampler:
                                    channels.append(channel_resampler.Execute(channel_3d_img))
                                else:
                                    channels.append(channel_3d_img)

                            if channels:
                                if len(channels) > 1: # Only join if there are multiple channels
                                    sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
                                    # added by yanguoqing on 2025-08-11
                                    frame_flag=True
                                elif len(channels) == 1: # If only one channel resulted
                                    sitk_img_processed = channels[0]
                        elif TARGET_VOXEL_SPACING: # 3D image with target spacing
                            img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
                                                        spacing=TARGET_VOXEL_SPACING, size=original_size)
                            if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
                        else: # 3D image, no TARGET_VOXEL_SPACING
                            img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
                                                        spacing=original_spacing, size=original_size)
                            if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)


                        # Extra, free-form metadata stored next to the standard keys.
                        CIA_other_info = {
                            'metadata_file':''
                        }
                        CIA_other_info['split'] = "train"
                        CIA_other_info['Image_id']=meta_image_id
                        if mf_flag:
                            CIA_other_info['metadata_file']=meta_file

                        is_processed_4d = sitk_img_processed.GetDimension() == 4
                        # Clamping is disabled: with clamp_range_to_use=None neither
                        # branch below executes.
                        clamp_range_to_use=None
                        if clamp_range_to_use and is_processed_4d:
                            clamped_channels_final = []
                            num_channels_final = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
                            for i in range(num_channels_final):
                                extractor = sitk.ExtractImageFilter()
                                proc_size_final = sitk_img_processed.GetSize()
                                extractor.SetSize([proc_size_final[0], proc_size_final[1], proc_size_final[2], 0])
                                extractor.SetIndex([0,0,0,i])
                                channel_3d_img_to_clamp = extractor.Execute(sitk_img_processed)
                                clamped_channels_final.append(util.clamp_image(channel_3d_img_to_clamp, clamp_range_to_use))
                            if clamped_channels_final:
                                if len(clamped_channels_final) > 1:
                                    sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(clamped_channels_final)
                                elif len(clamped_channels_final) == 1:
                                    sitk_img_processed = clamped_channels_final[0]
                        elif clamp_range_to_use: # 3D image
                            sitk_img_processed = util.clamp_image(sitk_img_processed, clamp_range_to_use)


                        # The output keeps the input file name below <output_dir>/<case>/.
                        output_path = os.path.join(output_dir,pid_dir, f"{df}")
                        save_nifti(sitk_img_processed, output_path, full_path_image)
                        print(f"Saved NIfTI file to {output_path}")



                        # ---- load and resample the label (if any) ----
                        label_path_dict = {}

                        if label_flag:
                            processed_lbl_full_path = os.path.join(output_dir, pid_dir, TASK_VALUE, f"{df}")
                            full_path_label=os.path.join(full_dir,df.replace(".nii.gz","_gt.nii.gz"))
                            print("label_file",full_path_label)
                            sitk_lbl_original = util.load_nifti(full_path_label)
                            if not sitk_lbl_original:
                                # The image was already saved above, but no metadata
                                # entry is written for it when the label is unreadable.
                                print(f" Failed to load label: {full_path_label}")
                                processed_lbl_full_path = None
                                continue
                            if sitk_lbl_original:
                                label_resampler = sitk.ResampleImageFilter()
                                reference_for_label = sitk_img_processed # Default to processed image
                                print(sitk_img_processed.GetDimension(),sitk_img_processed.GetSize())
                                if sitk_img_processed.GetDimension() == 4:
                                    # A 4D image cannot be the reference of a 3D resample;
                                    # use its first 3D volume instead.
                                    print("frame label match")
                                    num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
                                    if num_comp_proc > 0:
                                        extractor = sitk.ExtractImageFilter()
                                        proc_img_size_for_lbl_ref = sitk_img_processed.GetSize()
                                        extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0])
                                        extractor.SetIndex([0,0,0,0])
                                        try:
                                            reference_for_label = extractor.Execute(sitk_img_processed)
                                        except Exception as ref_err:
                                            print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
                                            reference_for_label = None
                                    else: # Fallback if extraction fails
                                        print(f" Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
                                        reference_for_label = None # Handled by the else-branch below

                                sitk_lbl_processed = None

                                if reference_for_label and reference_for_label.GetDimension() > 0:
                                    # Nearest neighbour keeps label values discrete.
                                    label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
                                    label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())

                                    if sitk_lbl_original.GetDimension() == 4:
                                        # Resample each 3D label volume onto the reference grid.
                                        lbl_channels = []
                                        lbl_size = list(sitk_lbl_original.GetSize())
                                        for i in range(lbl_size[3]):
                                            extractor = sitk.ExtractImageFilter()
                                            extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
                                            extractor.SetIndex([0, 0, 0, i])
                                            single_channel = extractor.Execute(sitk_lbl_original)

                                            label_resampler.SetReferenceImage(reference_for_label)
                                            resampled_channel = label_resampler.Execute(single_channel)
                                            lbl_channels.append(resampled_channel)

                                        if len(lbl_channels) > 1:
                                            sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
                                        elif len(lbl_channels) == 1:
                                            sitk_lbl_processed = lbl_channels[0]
                                    else:
                                        label_resampler.SetReferenceImage(reference_for_label)
                                        sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)
                                    util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
                                else:
                                    print("no frame label match")
                                    print(original_spacing,original_size)
                                    img_resampler_obj = util.get_unisize_resampler(sitk_lbl_original, 'nearest',
                                                                spacing=original_spacing, size=original_size)
                                    if img_resampler_obj:
                                        sitk_lbl_processed = img_resampler_obj.Execute(sitk_lbl_original)
                                        util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label) # saves the resampled label
                                    else:
                                        # NOTE(review): nothing is saved here, yet
                                        # processed_lbl_full_path stays set and
                                        # sitk_lbl_processed stays None, so the GetSize()
                                        # call below raises AttributeError, which the
                                        # `except RuntimeError` does not catch.
                                        print('failed to resample label')


                            else:
                                processed_lbl_full_path = None
                        else:
                            processed_lbl_full_path = None

                        if processed_lbl_full_path:


                            print('compare original image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
                            print('compare processed image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize())
                            # NOTE(review): the size check relies on `assert`, which is
                            # removed when Python runs with -O.
                            try:
                                assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
                                # The label path is only recorded once the sizes agree.
                                label_path_dict['heart'] = processed_lbl_full_path
                                print("process label path:", processed_lbl_full_path)
                            except Exception as e:
                                # Image/label size mismatch: record the label and skip the entry.
                                failed_files.append(full_path_label)
                                continue


                    except RuntimeError:
                        failed_files.append(full_path_image)
                        print(f"Failed to load MnMs images from {full_path_image}")
                        continue

                    # ---- record metadata for this image ----
                    size_processed = list(sitk_img_processed.GetSize())
                    print('size_processed',size_processed,original_size)

                    meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))  # smallest of the x, y, z spacings
                    meta.add_keyvalue('OriImg_path',full_path_image)
                    meta.add_keyvalue('Size',size_processed) # size after processing -- YH Jachin
                    meta.add_keyvalue('Modality',modality)
                    meta.add_keyvalue('Dataset_name',study)
                    meta.add_keyvalue('ROI','chest')


                    if processed_lbl_full_path:
                        meta.add_keyvalue('Task',TASK_VALUE)
                        meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                        meta.add_keyvalue('Label_Dict',LABEL_DICT)
                    meta.add_extra_keyvalue('Metadata',CIA_other_info)


                    print(meta.get_meta_data())

                    # Write the mapping to the JSON file on the fly
                    # NOTE(review): the whole mapping file is re-read and re-written
                    # for every image.
                    with open(json_output_path, 'r+') as json_file:
                        existing_mappings = json.load(json_file)
                        existing_mappings[output_path] = meta.get_meta_data()
                        json_file.seek(0)
                        json.dump(existing_mappings, json_file, indent=4)
                        json_file.truncate()
            else:
                continue



    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")

if __name__ == "__main__":
    # Command-line entry point; both paths default to the author's local layout.
    parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnM2/MnM2/dataset/")
    parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/ygq/Data_Engineering/MnM2_clean/test")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)
# =============================================================================
# ========================developed with TotalSegmentor========================
# =============================================================================

def read_table(file_path, split_str=';'):
    """Load a table as a DataFrame, trying Excel first and CSV second.

    Args:
        file_path: Path of the table file.
        split_str: Column separator used for the CSV fallback.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        return pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # Not an Excel workbook (or openpyxl is unavailable): parse as CSV.
        # `Exception` rather than a bare `except` so Ctrl-C still interrupts.
        return pd.read_csv(file_path, sep=split_str)


def load_nifti(image_path):
    """Read the image at *image_path* with SimpleITK and return it."""
    return sitk.ReadImage(image_path)


def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, creating parent directories as needed.

    Args:
        image: SimpleITK image to write.
        output_path: Destination file path.
        folder_path: Source path, stored in the image under the
            ``"FolderPath"`` metadata key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise, so
    # only create a directory when there is one to create.
    if output_dirpath and not os.path.isdir(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok avoids failing if the directory appears in the meantime.
        os.makedirs(output_dirpath, exist_ok=True)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def find_metadata_files(path, file_name='*meta*'):
    """Recursively glob *path* for entries whose name matches *file_name*.

    Written for the TotalSegmentor dataset layout.
    """
    search_pattern = os.path.join(path, '**', file_name)
    return glob.glob(search_pattern, recursive=True)


def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
    """Collect image file paths below *folder_path*.

    Args:
        folder_path: Root directory, walked recursively.
        img_type: Required file-name suffix.
        include_str: When given, the file name must contain this substring.
        exclude_str: When given, the file name must NOT contain this substring.
        is_sorted: Sort the resulting paths lexicographically.

    Returns:
        List of full paths of the matching files.
    """
    img_path = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(folder_path)
        for file in files
        if file.endswith(img_type)
        and (include_str is None or include_str in file)
        and (exclude_str is None or exclude_str not in file)
    ]
    if is_sorted:
        img_path.sort()
    return img_path


def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
    """Build a resampler that makes *ref_img* isotropic at its smallest spacing.

    Steps:
      1. Find the minimum spacing.
      2. Scale each axis size so the physical extent is kept at that spacing.
      3. Set the interpolator (linear for images, nearest for segmentation masks).
      4. Set the output spacing, origin, direction and pixel type.
      5. Return the resampler; the caller runs ``Execute`` on it.

    For example, an input with spacing [0.1, 0.1, 0.3] is resampled to
    [0.1, 0.1, 0.1].

    Args:
        ref_img: SimpleITK image providing origin, direction, pixel type and,
            when not passed explicitly, size and spacing.
        interpolator: ``'linear'`` or ``'nearest'``; any other value is handed
            to SimpleITK unchanged.
        spacing: Input spacing per axis; defaults to ``ref_img.GetSpacing()``.
        size: Input size per axis; defaults to ``ref_img.GetSize()``.

    Returns:
        A configured ``sitk.ResampleImageFilter``, or ``None`` when the
        spacing is already isotropic and no resampling is needed.
    """
    # TODO(review): the original authors left a note to discuss why this
    # function was rewritten to restrict the output grid to three axes.
    if size is None:
        size = ref_img.GetSize()
    if spacing is None:
        spacing = ref_img.GetSpacing()
    min_spacing = min(spacing)
    if all(spc == min_spacing for spc in spacing):
        return None

    if interpolator == 'nearest':
        interpolator = sitk.sitkNearestNeighbor
    elif interpolator == 'linear':
        interpolator = sitk.sitkLinear

    new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)]
    # The output grid uses at most the first three axes (unchanged for 3D and
    # 4D inputs); slicing instead of hard indexing no longer raises IndexError
    # when fewer than three axes are supplied.
    n_axes = min(3, len(new_size))

    resampler = sitk.ResampleImageFilter()
    resampler.SetSize(new_size[:n_axes])
    resampler.SetOutputSpacing([min_spacing] * n_axes)
    resampler.SetOutputOrigin(ref_img.GetOrigin())
    resampler.SetOutputDirection(ref_img.GetDirection())
    resampler.SetInterpolator(interpolator)
    # Voxels that fall outside the input are filled with 0. The previous code
    # passed ref_img.GetPixelIDValue() here, i.e. the numeric id of the pixel
    # TYPE, which painted an arbitrary intensity (or a bogus label value for
    # masks) into padded voxels.
    resampler.SetDefaultPixelValue(0)
    resampler.SetOutputPixelType(ref_img.GetPixelID())
    return resampler
def clamp_image(in_img,clamp_range):
    """Clamp the intensities of *in_img* to ``[clamp_range[0], clamp_range[1]]``.

    Args:
        in_img: SimpleITK image to clamp.
        clamp_range: Two-element sequence ``(lower, upper)``.

    Returns:
        The clamped image produced by ``sitk.ClampImageFilter``.
    """
    clamp_filter = sitk.ClampImageFilter()
    clamp_filter.SetLowerBound(clamp_range[0])
    clamp_filter.SetUpperBound(clamp_range[1])
    return clamp_filter.Execute(in_img)


def get_synonyms_dict(dict_type='ROI'):
    """Return the synonym table for one metadata field.

    Args:
        dict_type: One of ``'ROI'``, ``'Label_tissue'``, ``'Task'`` or
            ``'Modality'``.

    Returns:
        A freshly built dict mapping each canonical term to the list of
        strings treated as synonyms for it.

    Raises:
        ValueError: If *dict_type* is not one of the supported names.

    Notes:
        Some synonyms appear under more than one canonical term (for example
        ``'polytrauma'``, ``'radius'``, ``'trachea'``, ``'locate'``).
        NOTE(review): callers that stop at the first match presumably depend
        on the key order below -- confirm before reordering.
    """
    if dict_type == 'ROI':
        dict_synonyms = {
            'whole-body': ['whole-body', 'whole body', 'wholebody', 'polytrauma', 'head-neck-thorax-abdomen-pelvis-leg', 'head-neck-thorax-abdomen-pelvis'],
            'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg', 'neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg'],
            'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'],
            'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg', 'thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg'],
            'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen'],
            'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
            'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
            'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
            'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg'],
            'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck', 'thorax/neck'],
            'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
            'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'],
            'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
            'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
            'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum', 'gastric', 'liver', 'spleen', 'pancreas', 'kidney', 'lumbar', 'renal', 'hepatic', 'splenic', 'pancreatic', 'intervention'],
            # 'cardiothoracic' was previously misspelled 'caeiothoracic'.
            'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart', 'heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit', 'breast biopsy', 'thoracic', 'mammary', 'cardiothoracic', 'mediastinal', 'pleural', 'bronchial', 'bronchial tree', 'tracheal', 'esophageal', 'diaphragmatic', 'costal', 'sternal', 'clavicular', 'scapular', 'axillary', 'axillar', 'cardiac', 'pericardial', 'pericardiac', 'pericardium'],
            'head': ['head', 'headbasis', 'brain', 'skull', 'face', 'nose', 'ear', 'eye', 'mouth', 'jaw', 'cheek', 'chin', 'forehead', 'temporal', 'parietal', 'occipital', 'frontal', 'mandible', 'maxilla', 'mandibular', 'maxillary', 'nasal', 'orbital', 'orbita', 'ocular', 'auricular', 'otic', 'oral', 'buccal', 'labial', 'lingual', 'palatal'],
            'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus', 'pharyngeal', 'laryngeal', 'carotid', 'jugular'],
            'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'],
            # 'armpit' and 'clavicle' were fused into 'armpitclavicle' by a
            # missing comma (implicit string concatenation).
            'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion', 'acromioclavicular'],
            'leg': ['leg', 'felsenleg', 'thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap', 'achilles tendon', 'achilles'],
            'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus'],
            'skeleton': ['skeleton', 'bone', 'spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
        }
    elif dict_type == 'Label_tissue':
        dict_synonyms = {
            'liver': ['liver', 'hepatic'],
            'spleen': ['spleen', 'splenic'],
            'kidney': ['kidney', 'renal'],
            'pancreas': ['pancreas', 'pancreatic'],
            'stomach': ['stomach', 'gastric'],
            'intestine': ['large intestine', 'small intestine', 'large bowel', 'small bowel'],
            'gallbladder': ['gallbladder'],
            'adrenal_gland': ['adrenal_gland', 'adrenal gland'],
            'bladder': ['bladder'],
            'prostate': ['prostate'],
            'uterus': ['uterus'],
            'ovary': ['ovary'],
            'testicle': ['testicle'],
            'lymph_node': ['lymph_node', 'lymph node'],
            'bone': ['bone'],
            'lung': ['lung'],
            'heart': ['heart'],
            'esophagus': ['esophagus'],
            'muscle': ['muscle'],
            'fat': ['fat'],
            'skin': ['skin'],
            'vessel': ['vessel'],
            'tumor': ['tumor'],
            'other': ['other']
        }
    elif dict_type == 'Task':
        dict_synonyms = {
            'segmentation': ['segmentation', 'seg', 'mask'],
            'classification': ['classification', 'class', 'diagnosis', 'identify', 'identification'],
            'localization': ['localization', 'locate', 'location', 'position'],
            'registration': ['registration', 'register', 'align', 'alignment'],
            'detection': ['detection', 'detect', 'find', 'locate'],
            'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
        }
    elif dict_type == 'Modality':
        dict_synonyms = {
            'CT': ['CT', 'computed tomography'],
            'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
            'PET': ['PET', 'positron emission tomography'],
            'US': ['US', 'ultrasound'],
            'X-ray': ['X-ray', 'radiography'],
            # 'tomography' was previously misspelled 'tomlogy'.
            'SPECT': ['SPECT', 'single-photon emission computed tomography'],
        }
    else:
        raise ValueError(f"dict_type {dict_type} is not valid")
    return dict_synonyms
def replace_synonyms(text, dict_synonyms):
    '''
    Replace synonyms with their canonical term.

    Args:
        text: a string, a list (normalised element-wise) or a dict
            (both keys and values are normalised). Any other type is
            returned unchanged.
        dict_synonyms: mapping canonical term -> list of synonyms, as
            returned by get_synonyms_dict(). Key order decides which term
            wins when several match.

    Returns:
        For a string: the first canonical term one of whose synonyms occurs
        (case-insensitively, as a substring) in the string; if none matches,
        a UserWarning is issued and the string is returned unchanged.
        For a list or dict: a new container with normalised content.
    '''
    import warnings

    if isinstance(text, str):
        lowered = text.lower()
        for canonical, synonyms in dict_synonyms.items():
            if any(synonym.lower() in lowered for synonym in synonyms):
                return canonical
        # The original built a Warning object without emitting it.
        warnings.warn(f"Value {text} is not in the correct format")
        return text
    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]
    if isinstance(text, dict):
        # Build a new dict: the original popped keys while iterating and used
        # the synonym list itself as the new key, which raises TypeError.
        return {
            replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }
    return text
full_key)) + elif isinstance(data, list): + for index, item in enumerate(data): + full_key = f"{parent_key}[{index}]" + keys_with_type.update(self.find_all_keys_with_type(item, full_key)) + return keys_with_type + + def flatten_json(self, data=None, parent_key='', sep='.'): + if data is None: + data = self.config_format + items = {} + if isinstance(data, dict): + for key, value in data.items(): + new_key = f"{parent_key}{sep}{key}" if parent_key else key + if isinstance(value, dict): + items.update(self.flatten_json(value, new_key, sep=sep)) + elif isinstance(value, list): + for i, item in enumerate(value): + items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep)) + else: + items[new_key] = value + elif isinstance(data, list): + for i, item in enumerate(data): + items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep)) + return items + + def req_check(self): + self.unfilled_keys = [] + for key in self.config.keys(): + if self.config[key] == {}: + self.unfilled_keys.append(key) + if len(self.unfilled_keys) == 0: + return True + else: + return False + + def type_check(self, key, value): + if key not in self.config_format.keys(): + print(key, "is not a valid key") + return False + + if key == 'Modality': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'OriImg_path': + if isinstance(value, str): + return True + else: + return False + + elif key == 'Label_path' and isinstance(value, dict): + for skey in value.keys(): + if skey in self.config_format[key]['keys']: + for kk in value[skey]: + if isinstance(value[skey][kk],str): + pass + # if kk in self.config_format[key]['value']['keys']: + # if isinstance(value[skey][kk],str): + # pass + # else: + # return False + else: + return False + return True + + elif key == 'ROI': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'Label_tissue' and isinstance(value, list): + for i in value: + if i 
not in self.config_format[key]['items']['options']: + return False + return True + + elif key =='Task' and isinstance(value, list): + for i in value: + if i not in self.config_format[key]['items']['options']: + return False + return True + + elif key == 'Spacing_mm': + if isinstance(value, float): + return True + else: + False + + # elif key == 'Size' and isinstance(value, list) and len(value) == 3 : + elif key == 'Size' and isinstance(value, list) and len(value) >= 3 : + return all(isinstance(item, int) for item in value) + + elif key == 'Dataset_name': + if isinstance(value, str): + return True + else: + return False + elif key == 'ImgDict': + if isinstance(value, dict): + return True + else: + return False + elif key == 'Label_Dict': + if isinstance(value, dict): + return True + else: + return False + def add_extra_keyvalue(self, key, value): + self.config[key] = value + return True + + def add_keyvalue(self, key, value): + if key in self.ambiguity_keys: + value = replace_synonyms(value, get_synonyms_dict(key)) + # print(key, value) + if self.type_check(key, value): + self.config[key] = value + return True + else: + Warning(f"Value {value} is not in the correct format for key {key}") + pass + # print(f"Value {value} is not in the correct format for key {key}") + + def get_meta_data(self): + if self.req_check(): + return self.config + else: + print("Not all required keys are filled", self.unfilled_keys) + return False + + + +if __name__ == '__main__': + meta = meta_data() + print(meta.get_keytypes_flatten()) + print(meta.get_ketytypes()) + meta.add_keyvalue('Modality', 'CT') + meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT') + meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}}) + meta.add_keyvalue('Spacing_mm', 1.5) + meta.add_keyvalue('Size', [512, 512, 100]) + meta.add_keyvalue('Dataset_name', 'CT') + meta.add_keyvalue('Label_tissue', ['1', '2', '3']) + 
meta.add_keyvalue('Task', ['1', '2', '3']) + print(meta.get_meta_data()) + meta.add_extra_key('extra', 'extra') + print(meta.get_meta_data()) + print(meta.get_ketytypes()) + print(meta.get_keytypes_flatten) + + org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT' + img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation') + print(img_paths) \ No newline at end of file diff --git a/MnMs_clean/config_format.json b/MnMs_clean/config_format.json new file mode 100644 index 0000000000000000000000000000000000000000..54d04c24e37a45877dec012ba98558c07b29ffe4 --- /dev/null +++ b/MnMs_clean/config_format.json @@ -0,0 +1,124 @@ +{ + "Modality": { + "type": "option", + "required": true, + "options": [ + "CT", + "MRI", + "T1", + "T2", + "X-ray", + "Fluoroscopy", + "US", + "PET" + ] + }, + "OriImg_path": { + "type": "string", + "required": true + }, + "Label_path": { + "type": "dict", + "required": false, + "keys": [ + "classification", + "segmentation", + "regression", + "detection", + "localization", + "registration", + "other" + ], + "value": { + "type": "dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + 
"segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + "ImgDict": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/MnMs_clean/dataclean_MnMs.py b/MnMs_clean/dataclean_MnMs.py new file mode 100644 index 0000000000000000000000000000000000000000..3f4fa6e521c6c7a11cefbeb031c127ba5042d018 --- /dev/null +++ b/MnMs_clean/dataclean_MnMs.py @@ -0,0 +1,484 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-07-24 +update MnMs data clean +https://github.com/openmedlab/Awesome-Medical-Dataset/blob/main/resources/M&Ms.md +https://zhuanlan.zhihu.com/p/694831343 + +来自 6 个国际医疗中心 的 340 名受试者 的 CMR 数据。 +覆盖 4 个主流 MRI 设备厂商(Siemens, Philips, GE, Canon)。 +数据集文件结构如下,数据集被组织成训练集、验证集和测试集三个主目录,其中训练集进一步分为有标注和无标注的子目录。每个有标注的子目录包含病人的成像文件以及相应的标注数据。 +M&Ms +├── Training +│ ├── Labeled +│ │ ├── A0S9V9 +│ │ │ ├── A0S9V9_sa.nii.gz +│ │ │ └── A0S9V9_sa_gt.nii.gz +│ │ ├── A1D0Q7 +│ │ ├── A1D9Z7 +│ │ └── ... 
def find_metadata_files(path):
    """Return the CSV files that sit directly inside ``path``.

    The pattern contains no ``**`` component, so only the top level of
    ``path`` is searched even though ``recursive=True`` is passed to glob.
    """
    return glob.glob(os.path.join(path, '*.csv'), recursive=True)
def _lookup_case_info(df_meta, case_id):
    """Collect the per-case CSV fields; unknown cases get empty strings."""
    info = {
        'Image_id': case_id, 'Vendor': '', 'Centre': '', 'Pathology': '', 'Age': '',
        'Sex': '', 'Height': '', 'Weight': '', 'ED': '', 'ES': '',
    }
    if df_meta is None:
        return info
    rows = df_meta[df_meta[meta_id_name] == case_id]
    if rows.shape[0] == 0:
        return info
    info['Image_id'] = rows[meta_id_name].iloc[0]
    info['Vendor'] = rows[meta_vendor_name].iloc[0]
    info['Sex'] = rows[meta_sex_name].iloc[0]
    stringified = (
        ('Centre', meta_centre_name), ('Pathology', meta_pathology_name),
        ('Age', meta_age_name), ('Height', meta_height_name),
        ('Weight', meta_weight_name), ('ED', meta_ed_name), ('ES', meta_es_name),
    )
    for field, column in stringified:
        info[field] = str(rows[column].iloc[0])
    return info


def _list_cases(split_path, is_training):
    """Return (case_id, case_dir, is_labeled) for every entry of one split."""
    if not is_training:
        return [(name, os.path.join(split_path, name), False) for name in os.listdir(split_path)]
    cases = []
    for subset, is_labeled in (('Labeled', True), ('Unlabeled', False)):
        subset_path = os.path.join(split_path, subset)
        if not os.path.isdir(subset_path):
            continue
        cases.extend(
            (name, os.path.join(subset_path, name), is_labeled)
            for name in os.listdir(subset_path)
        )
    return cases


def _split_frames(image_4d):
    """Return the 3D volumes of a 4D image, one per index of the 4th axis."""
    size = list(image_4d.GetSize())
    frames = []
    for index in range(size[3]):
        extractor = sitk.ExtractImageFilter()
        extractor.SetSize([size[0], size[1], size[2], 0])
        extractor.SetIndex([0, 0, 0, index])
        frames.append(extractor.Execute(image_4d))
    return frames


def _join_frames(frames):
    """Stack 3D volumes back into a 4D image (a single volume stays 3D)."""
    if len(frames) > 1:
        return sitk.JoinSeriesImageFilter().Execute(frames)
    return frames[0]


def _resample_volume(volume):
    """Resample one 3D volume with util.get_unisize_resampler (linear)."""
    spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else list(volume.GetSpacing())[:3]
    resampler = util.get_unisize_resampler(
        volume, 'linear', spacing=spacing, size=list(volume.GetSize())[:3]
    )
    # The helper returns None when the spacing is already isotropic.
    return resampler.Execute(volume) if resampler else volume


def _clamp(image, clamp_range):
    """Clamp intensities of a 3D or 4D image; no-op when clamp_range is falsy."""
    if not clamp_range:
        return image
    if image.GetDimension() == 4:
        return _join_frames([util.clamp_image(f, clamp_range) for f in _split_frames(image)])
    return util.clamp_image(image, clamp_range)


def _resample_label(label, reference_3d):
    """Resample a 3D/4D label onto ``reference_3d`` with nearest-neighbour."""
    resampler = sitk.ResampleImageFilter()
    resampler.SetReferenceImage(reference_3d)
    resampler.SetInterpolator(sitk.sitkNearestNeighbor)
    resampler.SetOutputPixelType(label.GetPixelID())
    if label.GetDimension() == 4:
        return _join_frames([resampler.Execute(f) for f in _split_frames(label)])
    return resampler.Execute(label)


def main(target_path, output_dir):
    """Convert the M&Ms dataset under ``target_path`` into ``output_dir``.

    For every case of the Training/Testing/Validation splits the short-axis
    image is resampled and saved as ``<output_dir>/<case>/<case>.nii.gz``;
    labeled training cases also get their ground truth resampled onto the
    image grid. One metadata record per case is appended to
    ``nifti_mappings.json`` and failing inputs are listed in
    ``failed_files.json``.

    Fixes over the previous version: the CSV is optional (no NameError when it
    is missing), the labeled flag is evaluated per case, a fresh ``meta_data``
    is used per case so label keys do not leak into unlabeled records, and
    ``Task`` is passed as a list as required by the metadata format.
    """
    split_dirs = ["Training", "Testing", "Validation"]
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    meta_file = os.path.join(target_path, '211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
    df_meta = pd.read_csv(meta_file, sep=',') if os.path.isfile(meta_file) else None

    for split in tqdm(split_dirs, desc="Processing pid dirs"):
        split_path = os.path.join(target_path, split)
        if not os.path.isdir(split_path):
            continue
        cases = _list_cases(split_path, split == "Training")
        for case_id, case_path, is_labeled in tqdm(cases, desc="Processing images files"):
            if not os.path.isdir(case_path):
                continue
            print(case_path)
            image_path = os.path.join(case_path, "%s_sa.nii.gz" % case_id)
            label_path = os.path.join(case_path, "%s_sa_gt.nii.gz" % case_id)
            if not is_labeled or not os.path.isfile(label_path):
                label_path = None

            info = _lookup_case_info(df_meta, case_id)
            output_path = os.path.join(output_dir, case_id, f"{case_id}.nii.gz")
            meta = meta_data()  # fresh record per case
            label_path_dict = {}

            try:
                image = util.load_nifti(image_path)
                original_spacing = list(image.GetSpacing())

                if image.GetDimension() == 4:
                    # Resample frame by frame; the resampler works on 3D volumes.
                    frames = [_resample_volume(frame) for frame in _split_frames(image)]
                    processed = _join_frames(frames)
                    if len(frames) > 1:
                        img_dict = {str(index): 'none' for index in range(len(frames))}
                        if info['ED']:
                            img_dict[info['ED']] = 'ed'
                        if info['ES']:
                            img_dict[info['ES']] = 'es'
                        meta.add_keyvalue('ImgDict', img_dict)
                else:
                    processed = _resample_volume(image)

                processed = _clamp(processed, CLAMP_RANGE_MRI)
                save_nifti(processed, output_path, image_path)
                print(f"Saved NIfTI file to {output_path}")

                if label_path is not None:
                    label = util.load_nifti(label_path)
                    reference = processed
                    if processed.GetDimension() == 4:
                        reference = _split_frames(processed)[0]
                    label_processed = _resample_label(label, reference)
                    label_output_path = os.path.join(output_dir, case_id, TASK_VALUE, f"{case_id}.nii.gz")
                    util.save_nifti(label_processed, label_output_path, label_path)
                    if processed.GetSize() != label_processed.GetSize():
                        print(f"  Mismatch between image and label size:")
                        print(f"    Image size: {processed.GetSize()}")
                        print(f"    Label size: {label_processed.GetSize()}")
                        failed_files.append(label_path)
                        continue
                    label_path_dict['heart'] = label_output_path
            except RuntimeError:
                failed_files.append(image_path)
                print(f"Failed to load MnMs images from {image_path}")
                continue

            other_info = {'metadata_file': meta_file if df_meta is not None else '', 'split': split}
            other_info.update(info)

            meta.add_keyvalue('Spacing_mm', min(original_spacing[:3]))  # smallest of the x, y, z spacings
            meta.add_keyvalue('OriImg_path', image_path)
            meta.add_keyvalue('Size', list(processed.GetSize()))  # size after processing
            meta.add_keyvalue('Modality', "MRI")
            meta.add_keyvalue('Dataset_name', 'MnMs')
            meta.add_keyvalue('ROI', 'chest')
            if label_path_dict:
                meta.add_keyvalue('Task', [TASK_VALUE])
                meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
                meta.add_keyvalue('Label_Dict', LABEL_DICT)
            meta.add_extra_keyvalue('Metadata', other_info)

            # Write the mapping to the JSON file on the fly so a crash keeps earlier cases.
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_path] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
    """Walk ``folder_path`` and return the paths of matching image files.

    A file matches when its name ends with ``img_type``, contains
    ``include_str`` (if given) and does not contain ``exclude_str`` (if given).
    The result is sorted lexicographically when ``is_sorted`` is true.
    """
    def _wanted(name):
        if not name.endswith(img_type):
            return False
        if include_str is not None and include_str not in name:
            return False
        return exclude_str is None or exclude_str not in name

    found = [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(folder_path)
        for name in names
        if _wanted(name)
    ]
    return sorted(found) if is_sorted else found
+ if size is None: + size = ref_img.GetSize() + if spacing is None: + spacing = ref_img.GetSpacing() + min_spacing = min(spacing) + if all([spc == min_spacing for spc in spacing]): + return None + else: + # if 1: + if interpolator == 'nearest': + interpolator = sitk.sitkNearestNeighbor + elif interpolator == 'linear': + interpolator = sitk.sitkLinear + resampler = sitk.ResampleImageFilter() + # new_spacing = [max_spacing] * len(spacing) + # print(size) + new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)] + new_size_xy=[new_size[0],new_size[1],new_size[2]] + # 讨论为什么重新写这个函数!!! --- YHM Jachin + new_size_spacing=[min_spacing,min_spacing,min_spacing] + # 讨论为什么重新写这个函数!!! --- YHM Jachin + # resampler.SetSize(new_size) + # resampler.SetOutputSpacing([min_spacing] * len(spacing)) + resampler.SetSize(new_size_xy) + resampler.SetOutputSpacing(new_size_spacing) + + # print(new_size,new_size_xy) + resampler.SetOutputOrigin(ref_img.GetOrigin()) + resampler.SetOutputDirection(ref_img.GetDirection()) + resampler.SetInterpolator(interpolator) + resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue()) + resampler.SetOutputPixelType(ref_img.GetPixelID()) + return resampler + +def clamp_image(in_img,clamp_range): + ''' + Clamp the image to the specified range + ''' + clamp_filter = sitk.ClampImageFilter() + clamp_filter.SetLowerBound(clamp_range[0]) + clamp_filter.SetUpperBound(clamp_range[1]) + return clamp_filter.Execute(in_img) + +def get_synonyms_dict(dict_type='ROI'): + ''' + Get the dictionary of synonyms for the specified dictionary type + ''' + if dict_type == 'ROI': + dict_synonyms = { + 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'], + 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax 
abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'], + 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'], + 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'], + 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'], + 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'], + 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'], + 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'], + 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'], + 'neck-thorax': ['neck-thorax', 
'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'], + 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'], + 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'], + 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'], + 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'], + 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'], + 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'], + 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'], + 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 
'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'], + 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'], + 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit''clavicle', 'scapula', 'acromion', 'acromioclavicular'], + 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'], + 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',], + 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'], + } + elif dict_type == 'Label_tissue': + dict_synonyms = { + 'liver': ['liver','hepatic'], + 'spleen': ['spleen','splenic'], + 'kidney': ['kidney','renal'], + 'pancreas': ['pancreas','pancreatic'], + 'stomach': ['stomach','gastric'], + 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'], + 'gallbladder': ['gallbladder'], + 'adrenal_gland': ['adrenal_gland','adrenal gland'], + 'bladder': ['bladder'], + 'prostate': ['prostate'], + 'uterus': ['uterus'], + 'ovary': ['ovary'], + 'testicle': ['testicle'], + 'lymph_node': ['lymph_node','lymph node'], + 'bone': ['bone'], + 'lung': ['lung'], + 'heart': ['heart'], + 'esophagus': ['esophagus'], + 'muscle': ['muscle'], + 'fat': ['fat'], + 'skin': ['skin'], + 'vessel': ['vessel'], + 'tumor': ['tumor'], + 'other': ['other'] + } + elif dict_type == 'Task': + dict_synonyms = { + 'segmentation': ['segmentation', 'seg', 'mask'], + 'classification': 
def replace_synonyms(text, dict_synonyms):
    '''
    Replace synonyms in ``text`` with the canonical term of ``dict_synonyms``.

    Args:
        text: A string, a list, or a dict. Lists are processed element by
            element; for dicts both keys and values are processed. Any other
            type is returned unchanged.
        dict_synonyms: Mapping ``canonical term -> list of synonyms``.

    Returns:
        For a string: the canonical term, or the input string itself (after
        emitting a warning) when nothing matches. For a list or dict: a new
        container with every element normalised.

    Matching is case-insensitive. An exact match against a canonical term or
    one of its synonyms wins; only if there is none does the first synonym
    contained in the text decide. Without the exact pass, 'SPECT' was mapped
    to 'CT' because 'ct' is a substring of 'spect'.
    '''
    import warnings  # local import: not part of this module's import block

    if isinstance(text, str):
        lowered = text.lower()
        # Pass 1: exact, case-insensitive match.
        for canonical, synonyms in dict_synonyms.items():
            if lowered == canonical.lower() or any(lowered == s.lower() for s in synonyms):
                return canonical
        # Pass 2: first synonym found as a substring (original behaviour).
        for canonical, synonyms in dict_synonyms.items():
            if any(s.lower() in lowered for s in synonyms):
                return canonical
        # The original built a Warning object here without ever issuing it.
        warnings.warn(f"Value {text} is not in the correct format")
        return text

    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]

    if isinstance(text, dict):
        # Build a new dict: the original popped keys while iterating and used
        # the synonym *list* as the new key, which raises TypeError.
        # If two keys normalise to the same term, the later entry wins.
        return {
            replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }

    return text

# =============================================================================

class meta_data(object):
    '''
    Container for the metadata record of one dataset case.

    The accepted keys and their value types are read from
    ``config_format.json`` located next to this module. Required keys start
    out as empty dicts in ``self.config`` and count as unfilled until a value
    passes ``type_check``.
    '''
    def __init__(self):
        self.config_format_path = os.path.join(os.path.dirname(__file__), 'config_format.json')
        with open(self.config_format_path, 'r') as file:
            self.config_format = json.load(file)
        # Placeholder {} marks a required key that has not been filled yet.
        self.config = {}
        for key, spec in self.config_format.items():
            if spec['required'] == True:
                self.config[key] = {}
        # Both summaries are computed before the options are overridden below.
        self.keytypes = self.find_all_keys_with_type()
        self.keytypes_flatten = self.flatten_json()
        # Keys whose values are normalised through the synonym dictionaries.
        self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
        for key in self.ambiguity_keys:
            canonical_terms = list(get_synonyms_dict(key).keys())
            self.config_format[key]['options'] = canonical_terms
            # List-typed keys are validated against ['items']['options'], so
            # the canonical terms must be accepted there as well. Existing
            # options are kept so previously valid values stay valid.
            items_spec = self.config_format[key].get('items')
            if isinstance(items_spec, dict):
                merged = list(items_spec.get('options', []))
                merged.extend(term for term in canonical_terms if term not in merged)
                items_spec['options'] = merged

    def get_ketytypes(self):
        '''Return the mapping ``dotted key -> declared type`` (historic, misspelled name).'''
        return self.keytypes

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    get_keytypes = get_ketytypes

    def get_keytypes_flatten(self):
        '''Return the flattened view of the format specification.'''
        return self.keytypes_flatten

    def find_all_keys_with_type(self, data=None, parent_key=''):
        '''
        Collect the ``type`` entry of every nested specification.

        Returns a dict whose keys are dotted paths (list positions appear as
        ``[i]``) and whose values are the ``type`` strings found there.
        ``data`` defaults to ``self.config_format``.
        '''
        if data is None:
            data = self.config_format
        keys_with_type = {}
        if isinstance(data, dict):
            for key, value in data.items():
                full_key = f"{parent_key}.{key}" if parent_key else key
                if isinstance(value, dict) and 'type' in value:
                    keys_with_type[full_key] = value['type']
                # Recurse into every value; scalars contribute nothing.
                keys_with_type.update(self.find_all_keys_with_type(value, full_key))
        elif isinstance(data, list):
            for index, item in enumerate(data):
                full_key = f"{parent_key}[{index}]"
                keys_with_type.update(self.find_all_keys_with_type(item, full_key))
        return keys_with_type

    def flatten_json(self, data=None, parent_key='', sep='.'):
        '''
        Flatten nested dicts into ``{dotted key: scalar value}``.

        Lists are traversed with ``[i]`` suffixes, but only dicts or lists
        inside them contribute entries; scalar list items (for example the
        strings of an ``options`` list) are not emitted.
        ``data`` defaults to ``self.config_format``.
        '''
        if data is None:
            data = self.config_format
        items = {}
        if isinstance(data, dict):
            for key, value in data.items():
                new_key = f"{parent_key}{sep}{key}" if parent_key else key
                if isinstance(value, dict):
                    items.update(self.flatten_json(value, new_key, sep=sep))
                elif isinstance(value, list):
                    for i, item in enumerate(value):
                        items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
                else:
                    items[new_key] = value
        elif isinstance(data, list):
            for i, item in enumerate(data):
                items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
        return items

    def req_check(self):
        '''Return True when no key in ``self.config`` still holds the ``{}`` placeholder.'''
        self.unfilled_keys = [key for key, value in self.config.items() if value == {}]
        return len(self.unfilled_keys) == 0

    def type_check(self, key, value):
        '''
        Validate ``value`` for ``key`` against the format specification.

        Always returns a bool. The previous version returned ``None`` on
        several paths (including a bare ``False`` expression for
        ``Spacing_mm``) and had no rule for ``Sub_modality``.
        '''
        if key not in self.config_format:
            print(key, "is not a valid key")
            return False
        spec = self.config_format[key]

        if key in ('Modality', 'ROI'):
            return value in spec['options']

        if key in ('OriImg_path', 'Dataset_name'):
            return isinstance(value, str)

        if key == 'Label_path':
            # Expected shape: {task: {name: path}} with tasks restricted to
            # the configured list and every path given as a string.
            if not isinstance(value, dict):
                return False
            for task_name, paths in value.items():
                if task_name not in spec['keys'] or not isinstance(paths, dict):
                    return False
                if not all(isinstance(path, str) for path in paths.values()):
                    return False
            return True

        if key in ('Label_tissue', 'Task'):
            if not isinstance(value, list):
                return False
            allowed = spec['items']['options']
            return all(item in allowed for item in value)

        if key == 'Spacing_mm':
            return isinstance(value, float)

        if key == 'Size':
            # At least three entries; additional axes are tolerated.
            return (
                isinstance(value, list)
                and len(value) >= 3
                and all(isinstance(item, int) for item in value)
            )

        if key in ('ImgDict', 'Label_Dict', 'Sub_modality'):
            return isinstance(value, dict)

        return False

    def add_extra_keyvalue(self, key, value):
        '''Store ``value`` under ``key`` without any validation.'''
        self.config[key] = value
        return True

    def add_keyvalue(self, key, value):
        '''
        Normalise, validate and store ``value`` under ``key``.

        Values of the ambiguity keys are first mapped to canonical terms.
        Returns True when the value was stored; otherwise a warning is issued
        and False is returned (the value is not stored).
        '''
        import warnings  # local import: not part of this module's import block

        if key in self.ambiguity_keys:
            value = replace_synonyms(value, get_synonyms_dict(key))
        if self.type_check(key, value):
            self.config[key] = value
            return True
        # The original built a Warning object here without ever issuing it.
        warnings.warn(f"Value {value} is not in the correct format for key {key}")
        return False

    def get_meta_data(self):
        '''Return ``self.config`` when all required keys are filled, else print the gaps and return False.'''
        if self.req_check():
            return self.config
        print("Not all required keys are filled", self.unfilled_keys)
        return False
"liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/OAISIS_clean/dataclean_OASIS_1_CS_Sectional.py b/OAISIS_clean/dataclean_OASIS_1_CS_Sectional.py new file mode 100644 index 0000000000000000000000000000000000000000..8a6a15c93fbe20694d086365242a9d04e2452929 --- /dev/null +++ b/OAISIS_clean/dataclean_OASIS_1_CS_Sectional.py @@ -0,0 +1,358 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-09-01 + +OASIS(Open Access Series of Imaging Studies) 是一个旨在向科研界免费提供脑部MRI数据的项目。本横断面(Cross-Sectional)数据集是其第一个版本,发布于2007年。 +OASIS-1 是横断面的,意味着它无法捕捉个体随时间的动态变化。对于研究疾病进展,后续的 OASIS-2 和 OASIS-3(纵向数据集)是更好的选择。 + +1. 
目录与文件命名规则 + 根目录下按受试者会话ID建立文件夹。 + 受试者ID格式:OAS1_xxxx (例如 OAS1_0012) + 会话ID格式:OAS1_xxxx_MRy (例如 OAS1_0012_MR1,y代表第几次访问成像) + OAS1_xxxx_MRy/ + │ + ├── OAS1_xxxx_MRy.xml # 包含采集细节和解剖指标的XML元数据文件 + ├── OAS1_xxxx_MRy.txt # 与XML内容相同的文本格式文件(便于查看) + ├── RAW/ # 存储原始扫描图像(DICOM或Analyze格式) + ├── PROCESSED/ # 预处理后的图像 + │ ├── SUBJ_111/ # 原始空间下的平均配准图像(各向同性1mm³) + │ └── T88_111/ # 图谱配准空间下的图像 + │ ├── t4_files/ # 存储配准变换矩阵文件 + │ └── ... # 配准后的图像文件 + └── FSL_SEG/ # 基于图谱配准图像生成的脑组织分割结果(灰质2/白质3/脑脊液1) + + +所有图像均以 Analyze 7.5格式 存储,包含: + 一个图像文件(.img) + 一个头文件(.hdr) + 使用 16位大端序(big-endian) 存储 + + OAS1_xxxx_MRy_mpr-z_anon 单次原始扫描 256x256x128 1x1x1.25 mm 矢状位 + OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 多次扫描平均配准图像 256x256x160 1x1x1 mm 矢状位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc 增益场校正后的图谱配准图像 176x208x176 1x1x1 mm 横断位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc 去除非脑组织的掩模图像 176x208x176 1x1x1 mm 横断位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg 脑组织分割图像(灰/白/CSF) 176x208x176 1x1x1 mm 横断位 + + 1. 人口统计学信息 + 性别(M/F) + 用手习惯(Hand)(均为右利手) + 年龄(Age) + 教育程度(Educ)(1-5级) + 社会经济地位(SES) + + 2. 临床评估 + MMSE(简易精神状态检查) + CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度) + + 3. 衍生解剖指标 + eTIV:估计颅内容积 + ASF:图谱缩放因子 + nWBV:标准化全脑体积 + + + OASIS Cross-Sectional 数据集经过 FreeSurfer 处理后的版本。这通常被称为 OASIS Cross-Sectional FreeSurfer Processed 数据集 + 经过 FreeSurfer 处理后,每个受试者的数据都会存储在一个独立的目录中,其结构遵循 FreeSurfer 的标准输出格式。 + ├── sub-OASIS10001/ # 受试者1的FreeSurfer输出目录 + │ ├── mri/ # 体积数据(Volume-based data) + │ │ ├── orig.mgz # 原始图像(转换为FreeSurfer格式) + │ │ ├── nu.mgz # 强度归一化后的图像 + │ │ ├── T1.mgz # 用于分割的图像 + │ │ ├── aseg.mgz # 自动亚结构分割(皮质下分割) + │ │ ├── aparc+aseg.mgz # 皮层+皮质下融合分割 + │ │ ├── brain.mgz # 去除非脑组织后的图像 + │ │ ├── brainmask.mgz # 大脑掩模 + │ │ └── ... (其他文件) + │ ├── surf/ # 表面数据(Surface-based data) + │ │ ├── lh.pial # 左半球软脑膜表面 + │ │ ├── lh.white # 左半球白质表面 + │ │ ├── rh.pial # 右半球软脑膜表面 + │ │ ├── rh.white # 右半球白质表面 + │ │ ├── lh.thickness # 左半球皮层厚度图 + │ │ └── ... 
def find_metadata_files(path):
    """Return the paths of the ``.csv`` files located directly inside *path*.

    The search pattern has no ``**`` component, so the ``recursive`` flag
    handed to ``glob.glob`` does not make the search descend into
    subdirectories.
    """
    return glob.glob(os.path.join(path, '*.csv'), recursive=True)
def main(target_path, output_dir):
    """Convert OASIS-1 cross-sectional cases to NIfTI and record their metadata.

    ``target_path`` is scanned two levels deep (``<target>/<pid_dir>/<case>``).
    For every case the skull-stripped, atlas-registered image
    (``PROCESSED/MPRAGE/T88_111/*_anon_111_t88_masked_gfc.img``) is written to
    ``<output_dir>/<case>/<case>.nii.gz``. If the tissue segmentation
    (``FSL_SEG/*_masked_gfc_fseg.img``) exists it is written to
    ``<output_dir>/<case>/segmentation/<case>.nii.gz``.

    Side effects:
        * ``<output_dir>/nifti_mappings.json`` is updated after every case
          (output image path -> metadata record).
        * ``<output_dir>/failed_files.json`` receives the list of cases that
          raised an error.

    Args:
        target_path: Root directory of the raw OASIS-1 data.
        output_dir: Directory that receives the NIfTI files and JSON reports.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # The demographic sheet is expected as a CSV next to this script (converted
    # from the .xlsx for easier parsing); the .xlsx path is only recorded.
    meta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'oasis_cross-sectional-5708aa0a98d82080.csv')
    meta_file_ori = os.path.join(target_path, 'oasis_cross-sectional-5708aa0a98d82080.xlsx')
    if os.path.isfile(meta_file):
        df_meta = pd.read_csv(meta_file, sep=',')
    else:
        # Previously df_meta stayed undefined and the first case hit a NameError.
        df_meta = None
        print(f"Metadata table {meta_file} not found; demographic fields are left empty")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        if not os.path.isdir(os.path.join(target_path, pid_dir)):
            continue

        # Walk every case directory below this folder; data_dir is the case id.
        image_dirs = find_image_dirs(os.path.join(target_path, pid_dir))

        for data_dir in tqdm(image_dirs, desc="Processing images files"):
            full_path = os.path.join(target_path, pid_dir, data_dir)

            # Fresh record per case so values of an earlier case cannot leak
            # into this one when a later add_keyvalue call is rejected.
            meta = meta_data()

            modality = "MRI"
            study = 'OASIS_1'  # Dataset_name
            CIA_other_info = {'metadata_file': meta_file_ori, 'split': "train"}

            matching_rows = None
            if df_meta is not None:
                matching_rows = df_meta[df_meta[meta_id_name] == data_dir]
            has_row = matching_rows is not None and matching_rows.shape[0] > 0
            for keyname in META_COLUMN[1:]:
                CIA_other_info[keyname] = str(matching_rows[keyname].iloc[0]) if has_row else ''
            CIA_other_info['Image_id'] = data_dir

            try:
                # Skull-stripped brain image, e.g.
                # PROCESSED/MPRAGE/T88_111/OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc.img
                image_candidates = sorted(glob.glob("%s/PROCESSED/MPRAGE/T88_111/%s_*_anon_111_t88_masked_gfc.img" % (full_path, data_dir)))
                if not image_candidates:
                    raise FileNotFoundError(f"No masked T88 image found for {data_dir}")
                full_file = image_candidates[0]
                sitk_img_original = util.load_nifti(full_file)

                original_spacing = list(sitk_img_original.GetSpacing())
                original_size = list(sitk_img_original.GetSize())

                meta.add_keyvalue('Spacing_mm', min(original_spacing))
                meta.add_keyvalue('OriImg_path', full_file)
                meta.add_keyvalue('Size', original_size)
                meta.add_keyvalue('Modality', modality)
                meta.add_keyvalue('Dataset_name', study)
                meta.add_keyvalue('ROI', 'head')
                meta.add_keyvalue('Label_Dict', LABEL_DICT)

                output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                save_nifti(sitk_img_original, output_image_file, full_path)
                print(f"Saved NIfTI file to {output_image_file}")

                # Label processing, e.g.
                # FSL_SEG/OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc_fseg.img
                label_candidates = sorted(glob.glob("%s/FSL_SEG/%s_*_anon_111_t88_masked_gfc_fseg.img" % (full_path, data_dir)))
                if label_candidates:
                    full_label_file = label_candidates[0]
                    process_label_path = os.path.join(output_dir, data_dir, 'segmentation')
                    processed_lbl_full_path = os.path.join(process_label_path, f"{data_dir}.nii.gz")
                    os.makedirs(process_label_path, exist_ok=True)

                    sitk_lbl_original = util.load_nifti(full_label_file)
                    util.save_nifti(sitk_lbl_original, processed_lbl_full_path, os.path.dirname(full_label_file))  # Save original
                    print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")

                    # 'Task' is validated as a list; the bare string was always rejected.
                    meta.add_keyvalue('Task', [TASK_VALUE])
                    meta.add_keyvalue('Label_path', {TASK_VALUE: {'head': processed_lbl_full_path}})
                    print(sitk_img_original.GetSize(), sitk_lbl_original.GetSize())
                else:
                    # A missing label used to raise after the image had already
                    # been written; the image is now recorded without a label.
                    print(f"No segmentation found for {data_dir}; image recorded without label")

            except Exception as e:
                print(e)
                failed_files.append(data_dir)
                print(f"Failed to load OASIS images from {data_dir}")
                continue

            meta.add_extra_keyvalue('Metadata', CIA_other_info)

            # Write the mapping to the JSON file on the fly
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_image_file] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
目录与文件命名规则 + 根目录下按受试者会话ID建立文件夹。 + 受试者ID格式:OAS1_xxxx (例如 OAS1_0012) + 会话ID格式:OAS1_xxxx_MRy (例如 OAS1_0012_MR1,y代表第几次访问成像) + OAS1_xxxx_MRy/ + │ + ├── OAS1_xxxx_MRy.xml # 包含采集细节和解剖指标的XML元数据文件 + ├── OAS1_xxxx_MRy.txt # 与XML内容相同的文本格式文件(便于查看) + ├── RAW/ # 存储原始扫描图像(DICOM或Analyze格式) + ├── PROCESSED/ # 预处理后的图像 + │ ├── SUBJ_111/ # 原始空间下的平均配准图像(各向同性1mm³) + │ └── T88_111/ # 图谱配准空间下的图像 + │ ├── t4_files/ # 存储配准变换矩阵文件 + │ └── ... # 配准后的图像文件 + └── FSL_SEG/ # 基于图谱配准图像生成的脑组织分割结果(灰质2/白质3/脑脊液1) + + +所有图像均以 Analyze 7.5格式 存储,包含: + 一个图像文件(.img) + 一个头文件(.hdr) + 使用 16位大端序(big-endian) 存储 + + OAS1_xxxx_MRy_mpr-z_anon 单次原始扫描 256x256x128 1x1x1.25 mm 矢状位 + OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 多次扫描平均配准图像 256x256x160 1x1x1 mm 矢状位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc 增益场校正后的图谱配准图像 176x208x176 1x1x1 mm 横断位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc 去除非脑组织的掩模图像 176x208x176 1x1x1 mm 横断位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg 脑组织分割图像(灰/白/CSF) 176x208x176 1x1x1 mm 横断位 + + 1. 人口统计学信息 + 性别(M/F) + 用手习惯(Hand)(均为右利手) + 年龄(Age) + 教育程度(Educ)(1-5级) + 社会经济地位(SES) + + 2. 临床评估 + MMSE(简易精神状态检查) + CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度) + + 3. 衍生解剖指标 + eTIV:估计颅内容积 + ASF:图谱缩放因子 + nWBV:标准化全脑体积 + + + OASIS Cross-Sectional 数据集经过 FreeSurfer 处理后的版本。这通常被称为 OASIS Cross-Sectional FreeSurfer Processed 数据集 + 经过 FreeSurfer 处理后,每个受试者的数据都会存储在一个独立的目录中,其结构遵循 FreeSurfer 的标准输出格式。 + ├── sub-OASIS10001/ # 受试者1的FreeSurfer输出目录 + │ ├── mri/ # 体积数据(Volume-based data) + │ │ ├── orig.mgz # 原始图像(转换为FreeSurfer格式) + │ │ ├── nu.mgz # 强度归一化后的图像 + │ │ ├── T1.mgz # 用于分割的图像 + │ │ ├── aseg.mgz # 自动亚结构分割(皮质下分割) + │ │ ├── aparc+aseg.mgz # 皮层+皮质下融合分割 + │ │ ├── brain.mgz # 去除非脑组织后的图像 + │ │ ├── brainmask.mgz # 大脑掩模 + │ │ └── ... (其他文件) + │ ├── surf/ # 表面数据(Surface-based data) + │ │ ├── lh.pial # 左半球软脑膜表面 + │ │ ├── lh.white # 左半球白质表面 + │ │ ├── rh.pial # 右半球软脑膜表面 + │ │ ├── rh.white # 右半球白质表面 + │ │ ├── lh.thickness # 左半球皮层厚度图 + │ │ └── ... 
def load_dicom_images(folder_path):
    """Read the DICOM series stored in *folder_path*.

    Returns a tuple ``(file_names, image)``: the series file names reported by
    GDCM for the folder and the image assembled from exactly those files.
    """
    series_reader = sitk.ImageSeriesReader()
    series_files = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(series_files)
    return series_files, series_reader.Execute()
def main(target_path, output_dir):
    """Convert OASIS-1 atlas-registered images and FSL_SEG labels to NIfTI.

    Walks ``target_path/<first-level dir>/<case dir>``. For every case it
    loads the ``*_anon_111_t88_gfc.img`` image from ``PROCESSED/MPRAGE/T88_111``,
    writes it to ``output_dir/<case>/<case>.nii.gz``, writes the matching
    ``*_masked_gfc_fseg.img`` label from ``FSL_SEG`` to
    ``output_dir/<case>/segmentation/<case>.nii.gz`` and records the collected
    metadata in ``output_dir/nifti_mappings.json`` (keyed by the output image
    path). Cases that raise while being processed are listed in
    ``output_dir/failed_files.json``.

    Args:
        target_path: Root directory of the downloaded OASIS-1 data.
        output_dir: Directory that receives the NIfTI files and JSON reports.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    # NOTE(review): one meta_data instance is shared by all cases; this
    # presumably relies on add_keyvalue overwriting earlier values - confirm
    # that no key written for a previous case can leak into the next one.
    meta = meta_data()

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # The demographics sheet is read from a CSV export stored next to this
    # script; the original .xlsx path is only recorded in the output metadata.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    meta_file = os.path.join(script_dir, 'oasis_cross-sectional-5708aa0a98d82080.csv')
    meta_file_ori = os.path.join(target_path, 'oasis_cross-sectional-5708aa0a98d82080.xlsx')
    if os.path.isfile(meta_file):
        df_meta = pd.read_csv(meta_file, sep=',')
    else:
        # Previously df_meta stayed unbound here and the first case crashed
        # with a NameError; now the demographic fields are simply left empty.
        df_meta = None
        print(f"Metadata CSV not found: {meta_file}; demographic fields are left empty")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        pid_path = os.path.join(target_path, pid_dir)
        if not os.path.isdir(pid_path):
            continue

        # Iterate over every case found below this first-level directory.
        image_dirs = find_image_dirs(pid_path)

        for data_dir in tqdm(image_dirs, desc="Processing images files"):
            # data_dir doubles as the case ID used in the demographics table.
            full_path = os.path.join(target_path, pid_dir, data_dir)

            modality = "MRI"
            study = 'OASIS_1'  # Dataset_name
            CIA_other_info = {'metadata_file': meta_file_ori, 'split': "train"}

            if df_meta is None:
                matched_rows = None
            else:
                matched_rows = df_meta[df_meta[meta_id_name] == data_dir]

            if matched_rows is not None and matched_rows.shape[0] > 0:
                # META_COLUMN[0] is the ID column itself, so it is skipped.
                for keyname in META_COLUMN[1:]:
                    CIA_other_info[keyname] = str(matched_rows[keyname].iloc[0])
                CIA_other_info['Image_id'] = data_dir
            else:
                for keyname in META_COLUMN[1:]:
                    CIA_other_info[keyname] = ''

            try:
                # Atlas-registered, gain-field-corrected image, e.g.
                # PROCESSED/MPRAGE/T88_111/OAS1_0001_MR1_mpr_n4_anon_111_t88_gfc.img
                # An empty glob raises IndexError, which the handler below
                # records as a failed case.
                full_file = glob.glob(
                    "%s/PROCESSED/MPRAGE/T88_111/%s_*_anon_111_t88_gfc.img" % (full_path, data_dir)
                )[0]

                if os.path.isfile(full_file):
                    sitk_img_original = util.load_nifti(full_file)
                else:
                    print("病例数据%s为空"%data_dir)
                    continue

                original_spacing = list(sitk_img_original.GetSpacing())
                original_size = list(sitk_img_original.GetSize())

                meta.add_keyvalue('Spacing_mm', min(original_spacing))
                meta.add_keyvalue('OriImg_path', full_file)
                meta.add_keyvalue('Size', original_size)
                meta.add_keyvalue('Modality', modality)
                meta.add_keyvalue('Dataset_name', study)
                meta.add_keyvalue('ROI', 'head')
                meta.add_keyvalue('Label_Dict', LABEL_DICT)

                output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                save_nifti(sitk_img_original, output_image_file, full_path)
                print(f"Saved NIfTI file to {output_image_file}")

                # Label processing, e.g.
                # FSL_SEG/OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc_fseg.img
                label_path_dict = {}
                full_label_file = glob.glob(
                    "%s/FSL_SEG/%s_*_anon_111_t88_masked_gfc_fseg.img" % (full_path, data_dir)
                )[0]

                process_label_path = os.path.join(output_dir, data_dir, 'segmentation')
                processed_lbl_full_path = os.path.join(process_label_path, f"{data_dir}.nii.gz")
                os.makedirs(process_label_path, exist_ok=True)

                if not os.path.isfile(full_label_file):
                    label_flag = False
                else:
                    sitk_lbl_original = util.load_nifti(full_label_file)
                    # Save the label unchanged.
                    util.save_nifti(sitk_lbl_original, processed_lbl_full_path,
                                    os.path.dirname(full_label_file))
                    print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")

                    label_path_dict['head'] = processed_lbl_full_path
                    label_flag = True
                    # Printed here so that it never touches a label image that
                    # was not loaded for this case.
                    print(sitk_img_original.GetSize(), sitk_lbl_original.GetSize())

                if label_flag:
                    meta.add_keyvalue('Task', TASK_VALUE)
                    meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})

            except Exception as e:
                # Best effort: remember the case and carry on with the next one.
                print(e)
                failed_files.append(data_dir)
                print(f"Failed to load OASIS images from {data_dir}")
                continue

            meta.add_extra_keyvalue('Metadata', CIA_other_info)

            # Write the mapping to the JSON file on the fly so that an
            # interrupted run keeps everything converted so far.
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_image_file] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
目录与文件命名规则 + 根目录下按受试者会话ID建立文件夹。 + 受试者ID格式:OAS1_xxxx (例如 OAS1_0012) + 会话ID格式:OAS1_xxxx_MRy (例如 OAS1_0012_MR1,y代表第几次访问成像) + OAS1_xxxx_MRy/ + │ + ├── OAS1_xxxx_MRy.xml # 包含采集细节和解剖指标的XML元数据文件 + ├── OAS1_xxxx_MRy.txt # 与XML内容相同的文本格式文件(便于查看) + ├── RAW/ # 存储原始扫描图像(DICOM或Analyze格式) + ├── PROCESSED/ # 预处理后的图像 + │ ├── SUBJ_111/ # 原始空间下的平均配准图像(各向同性1mm³) + │ └── T88_111/ # 图谱配准空间下的图像 + │ ├── t4_files/ # 存储配准变换矩阵文件 + │ └── ... # 配准后的图像文件 + └── FSL_SEG/ # 基于图谱配准图像生成的脑组织分割结果(灰质2/白质3/脑脊液1) + + +所有图像均以 Analyze 7.5格式 存储,包含: + 一个图像文件(.img) + 一个头文件(.hdr) + 使用 16位大端序(big-endian) 存储 + + OAS1_xxxx_MRy_mpr-z_anon 单次原始扫描 256x256x128 1x1x1.25 mm 矢状位 + OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 多次扫描平均配准图像 256x256x160 1x1x1 mm 矢状位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc 增益场校正后的图谱配准图像 176x208x176 1x1x1 mm 横断位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc 去除非脑组织的掩模图像 176x208x176 1x1x1 mm 横断位 + OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg 脑组织分割图像(灰/白/CSF) 176x208x176 1x1x1 mm 横断位 + + 1. 人口统计学信息 + 性别(M/F) + 用手习惯(Hand)(均为右利手) + 年龄(Age) + 教育程度(Educ)(1-5级) + 社会经济地位(SES) + + 2. 临床评估 + MMSE(简易精神状态检查) + CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度) + + 3. 衍生解剖指标 + eTIV:估计颅内容积 + ASF:图谱缩放因子 + nWBV:标准化全脑体积 +''' +import os +import glob,re +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +import shutil +##dataset_meta +import warnings +warnings.filterwarnings("ignore") +meta_id_name='ID' +##性别(M/F),用手习惯(Hand)(均为右利手),年龄(Age),教育程度(Educ)(1-5级),社会经济地位(SES),MMSE(简易精神状态检查),CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度),eTIV:估计颅内容积,ASF:图谱缩放因子,nWBV:标准化全脑体积 +META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay'] + + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... 
def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, creating the parent directory if needed.

    Args:
        image: SimpleITK image to write. Its metadata dictionary is modified
            in place (a ``FolderPath`` entry is added).
        output_path: Destination file name; the format is chosen by
            ``sitk.WriteImage`` from the extension.
        folder_path: Source folder recorded under the ``FolderPath`` key.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against another process creating it meanwhile.
        os.makedirs(output_dirpath, exist_ok=True)
    # NOTE(review): confirm that the NIfTI writer actually persists this
    # custom key in the file header; it is set on the in-memory image only.
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
def main(target_path, output_dir):
    """Convert the OASIS-1 RAW single-scan images of every case to NIfTI.

    Walks ``target_path/<first-level dir>/<case dir>``. For every case it
    collects ``RAW/<case>_mpr-*.img``, reads the sorted file list with
    ``load_raw_images``, writes the result to
    ``output_dir/<case>/<case>.nii.gz`` and records the collected metadata in
    ``output_dir/nifti_mappings.json`` (keyed by the output image path). Cases
    that raise while being processed are listed in
    ``output_dir/failed_files.json``.

    Args:
        target_path: Root directory of the downloaded OASIS-1 data.
        output_dir: Directory that receives the NIfTI files and JSON reports.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    # NOTE(review): one meta_data instance is shared by all cases; this
    # presumably relies on add_keyvalue overwriting earlier values - confirm
    # that no key written for a previous case can leak into the next one.
    meta = meta_data()

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # The demographics sheet is read from a CSV export stored next to this
    # script; the original .xlsx path is only recorded in the output metadata.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    meta_file = os.path.join(script_dir, 'oasis_cross-sectional-5708aa0a98d82080.csv')
    meta_file_ori = os.path.join(target_path, 'oasis_cross-sectional-5708aa0a98d82080.xlsx')
    if os.path.isfile(meta_file):
        df_meta = pd.read_csv(meta_file, sep=',')
    else:
        # Previously df_meta stayed unbound here and the first case crashed
        # with a NameError; now the demographic fields are simply left empty.
        df_meta = None
        print(f"Metadata CSV not found: {meta_file}; demographic fields are left empty")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        pid_path = os.path.join(target_path, pid_dir)
        if not os.path.isdir(pid_path):
            continue

        # Iterate over every case found below this first-level directory.
        image_dirs = find_image_dirs(pid_path)

        for data_dir in tqdm(image_dirs, desc="Processing images files"):
            # data_dir doubles as the case ID used in the demographics table.
            full_path = os.path.join(target_path, pid_dir, data_dir)

            modality = "MRI"
            study = 'OASIS_1'  # Dataset_name
            CIA_other_info = {'metadata_file': meta_file_ori, 'split': "train"}

            if df_meta is None:
                matched_rows = None
            else:
                matched_rows = df_meta[df_meta[meta_id_name] == data_dir]

            if matched_rows is not None and matched_rows.shape[0] > 0:
                # META_COLUMN[0] is the ID column itself, so it is skipped.
                for keyname in META_COLUMN[1:]:
                    CIA_other_info[keyname] = str(matched_rows[keyname].iloc[0])
                CIA_other_info['Image_id'] = data_dir
            else:
                for keyname in META_COLUMN[1:]:
                    CIA_other_info[keyname] = ''

            try:
                # Single-scan images in the RAW directory, e.g.
                # RAW/OAS1_0001_MR1_mpr-1_anon.img
                series_files = sorted(glob.glob("%s/RAW/%s_mpr-*.img" % (full_path, data_dir)))

                if not series_files:
                    print("病例数据%s为空"%data_dir)
                    continue

                sitk_img_original = load_raw_images(series_files)
                # Map the position of each scan in the stacked image to its
                # "mpr-N" tag taken from the file name.
                submodality = [
                    re.search(r"mpr-\d{1}", os.path.basename(fp)).group(0)
                    for fp in series_files
                ]
                meta.add_keyvalue('Sub_modality', dict(enumerate(submodality)))

                original_spacing = list(sitk_img_original.GetSpacing())
                original_size = list(sitk_img_original.GetSize())

                # Only the three spatial axes describe voxel spacing; a
                # stacked series image carries an extra axis for the scans.
                meta.add_keyvalue('Spacing_mm', min(original_spacing[:3]))
                meta.add_keyvalue('OriImg_path', ",".join(series_files))
                meta.add_keyvalue('Size', original_size)
                meta.add_keyvalue('Modality', modality)
                meta.add_keyvalue('Dataset_name', study)
                meta.add_keyvalue('ROI', 'head')

                output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                save_nifti(sitk_img_original, output_image_file, full_path)
                print(f"Saved NIfTI file to {output_image_file}")

            except Exception as e:
                # Best effort: remember the case and carry on with the next one.
                print(e)
                failed_files.append(data_dir)
                print(f"Failed to load OASIS images from {data_dir}")
                continue

            meta.add_extra_keyvalue('Metadata', CIA_other_info)

            # Write the mapping to the JSON file on the fly so that an
            # interrupted run keeps everything converted so far.
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_image_file] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
"__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL_RAW") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) \ No newline at end of file diff --git a/OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw.py b/OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw.py new file mode 100644 index 0000000000000000000000000000000000000000..c6c3c3d322d334820c4869c4daf87224efb50a38 --- /dev/null +++ b/OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw.py @@ -0,0 +1,283 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-09-04 + +OASIS(Open Access Series of Imaging Studies) 是一个旨在向科研界免费提供脑部MRI数据的项目。本横断面(Cross-Sectional)数据集是其第一个版本,发布于2007年。 +OASIS-1 是横断面的,意味着它无法捕捉个体随时间的动态变化。对于研究疾病进展,后续的 OASIS-2 和 OASIS-3(纵向数据集)是更好的选择。 + +OASIS-2,全称为 Longitudinal Multimodal Neuroimaging: Principal 150 Subjects,是 OASIS 项目发布的第二个核心数据集。顾名思义,它的核心特点是 纵向(Longitudinal)。 + +核心目标: + 研究正常衰老和阿尔茨海默病(AD)中的大脑结构随时间变化的模式。 +研究设计: + 纵向研究。同一批受试者被多次扫描和评估,持续数年。 +样本量: + 150 名年龄在 60 到 96 岁之间的受试者。 +人群组成: + 所有 150 名受试者在首次扫描时都被诊断为认知正常(CDR = 0)。 + 在研究期间,部分受试者仍然保持认知正常,而另一部分则发展为痴呆(被临床诊断为可能患有阿尔茨海默病)。 +数据采集: + 每名受试者进行了 至少 2 次 的访视会话(session),最多达到了 5 次。 + 每次访视之间的平均间隔时间约为 2.2 年,整个研究跨度最长超过 7 年。 + 每次访视都包括:3-4 次 T1 加权 MRI 扫描(在单次会话中完成,用于平均以提高信噪比)和详细的临床神经心理评估。 +数据内容: + 与 OASIS-1 类似,包括原始 DICOM 图像、预处理后的 Analyze 格式图像,以及全面的临床认知评估数据。 + + +关键区别的详细解释 +横断面 vs. 纵向 (Cross-Sectional vs. 
Longitudinal): + OASIS-1 像是在给一个城市的所有人在同一天拍一张照片。你可以比较年轻人和老年人、健康人和病人的区别,但看不到任何一个人是如何变老或生病的。 + OASIS-2 像是挑选了150位健康的老年人,然后每年都给他们拍一张照片,持续好几年。这样你就能亲眼看到有些人如何慢慢地出现变化,最终生病。这对于理解疾病的过程至关重要。 +受试者群体的区别: + OASIS-1 包含了已经确诊的AD患者,非常适合训练一个模型来学习“AD大脑看起来是什么样”。 + OASIS-2 的受试者起点都是健康的,这使得它成为研究疾病前驱期(即临床症状出现之前)的宝贵资源。你可以分析那些最终患病的人,在多年前其大脑是否就已经存在细微的、可检测的差异。 +数据分析方法的差异: + 分析 OASIS-1 通常使用跨主体(cross-sectional)比较,例如比较AD组和正常对照组的平均海马体积。 + 分析 OASIS-2 则侧重于个体内部随时间的变化(within-subject change)。例如,为每个受试者计算其年化脑萎缩率,然后比较保持正常组和转化组之间的萎缩速率差异。这需要更复杂的纵向统计模型。 + + + 1. 人口统计学信息 + 性别(M/F) + 用手习惯(Hand)(均为右利手) + 年龄(Age) + 教育程度(Educ)(1-5级) + 社会经济地位(SES) + + 2. 临床评估 + MMSE(简易精神状态检查) + CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度) + + 3. 衍生解剖指标 + eTIV:估计颅内容积 + ASF:图谱缩放因子 + nWBV:标准化全脑体积 +''' +import os +import glob,re +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +import shutil +##dataset_meta +import warnings +warnings.filterwarnings("ignore") +meta_id_name='MRI ID' +##性别(M/F),用手习惯(Hand)(均为右利手),年龄(Age),教育程度(Educ)(1-5级),社会经济地位(SES),MMSE(简易精神状态检查),CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度),eTIV:估计颅内容积,ASF:图谱缩放因子,nWBV:标准化全脑体积 +#META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay'] +META_COLUMN=['Subject ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand','Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'] + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... 
def find_image_dirs(path):
    """Return the names of all entries in *path*, sorted alphabetically.

    Despite the name, files are returned as well as directories; callers
    filter as needed. The listing is sorted because ``os.listdir`` yields
    entries in arbitrary order, which would make the processing order (and
    the order of the generated mapping file) differ between runs.

    Args:
        path: Directory to list.

    Returns:
        A sorted list of entry names (not full paths).
    """
    return sorted(os.listdir(path))
def main(target_path, output_dir):
    """Convert the OASIS-2 RAW single-scan images of every session to NIfTI.

    Walks ``target_path/<first-level dir>/<session dir>``. For every session
    it collects ``RAW/mpr-*.img``, reads the sorted file list with
    ``load_raw_images``, writes the result to
    ``output_dir/<session>/<session>.nii.gz`` and records the collected
    metadata in ``output_dir/nifti_mappings.json`` (keyed by the output image
    path). Sessions that raise while being processed are listed in
    ``output_dir/failed_files.json``.

    Args:
        target_path: Root directory of the downloaded OASIS-2 data.
        output_dir: Directory that receives the NIfTI files and JSON reports.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    # NOTE(review): one meta_data instance is shared by all sessions; this
    # presumably relies on add_keyvalue overwriting earlier values - confirm
    # that no key written for a previous session can leak into the next one.
    meta = meta_data()

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # The demographics sheet is read from a CSV export stored next to this
    # script; the original .xlsx path is only recorded in the output metadata.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    meta_file = os.path.join(script_dir, 'oasis2_longitudinal_demographics.csv')
    meta_file_ori = os.path.join(target_path, 'oasis_longitudinal_demographics-8d83e569fa2e2d30 (1).xlsx')
    if os.path.isfile(meta_file):
        df_meta = pd.read_csv(meta_file, sep=',')
    else:
        # Previously df_meta stayed unbound here and the first session crashed
        # with a NameError; now the demographic fields are simply left empty.
        df_meta = None
        print(f"Metadata CSV not found: {meta_file}; demographic fields are left empty")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        pid_path = os.path.join(target_path, pid_dir)
        if not os.path.isdir(pid_path):
            continue

        # Iterate over every session found below this first-level directory.
        image_dirs = find_image_dirs(pid_path)

        for data_dir in tqdm(image_dirs, desc="Processing images files"):
            # data_dir doubles as the "MRI ID" used in the demographics table.
            full_path = os.path.join(target_path, pid_dir, data_dir)

            modality = "MRI"
            study = 'OASIS_2'  # Dataset_name
            CIA_other_info = {'metadata_file': meta_file_ori, 'split': "train"}

            if df_meta is None:
                matched_rows = None
            else:
                matched_rows = df_meta[df_meta[meta_id_name] == data_dir]

            if matched_rows is not None and matched_rows.shape[0] > 0:
                for keyname in META_COLUMN:
                    CIA_other_info[keyname] = str(matched_rows[keyname].iloc[0])
                CIA_other_info['Image_id'] = data_dir
            else:
                # Same key set as the matched branch (the original skipped
                # the first column here, so 'Subject ID' went missing).
                for keyname in META_COLUMN:
                    CIA_other_info[keyname] = ''

            try:
                # Single-scan images in the RAW directory of the session.
                series_files = sorted(glob.glob("%s/RAW/mpr-*.img" % (full_path)))

                if not series_files:
                    print("病例数据%s为空"%data_dir)
                    continue

                sitk_img_original = load_raw_images(series_files)
                # Map the position of each scan in the stacked image to its
                # "mpr-N" tag taken from the file name.
                submodality = [
                    re.search(r"mpr-\d{1}", os.path.basename(fp)).group(0)
                    for fp in series_files
                ]
                meta.add_keyvalue('Sub_modality', dict(enumerate(submodality)))

                original_spacing = list(sitk_img_original.GetSpacing())
                original_size = list(sitk_img_original.GetSize())

                # Only the three spatial axes describe voxel spacing; a
                # stacked series image carries an extra axis for the scans.
                meta.add_keyvalue('Spacing_mm', min(original_spacing[:3]))
                meta.add_keyvalue('OriImg_path', ",".join(series_files))
                meta.add_keyvalue('Size', original_size)
                meta.add_keyvalue('Modality', modality)
                meta.add_keyvalue('Dataset_name', study)
                meta.add_keyvalue('ROI', 'head')

                output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                save_nifti(sitk_img_original, output_image_file, full_path)
                print(f"Saved NIfTI file to {output_image_file}")

            except Exception as e:
                # Best effort: remember the session and carry on with the next.
                print(e)
                failed_files.append(data_dir)
                print(f"Failed to load OASIS images from {data_dir}")
                continue

            meta.add_extra_keyvalue('Metadata', CIA_other_info)

            # Write the mapping to the JSON file on the fly so that an
            # interrupted run keeps everything converted so far.
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_image_file] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_2/OAS2_RAW//") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_2/RAW") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) \ No newline at end of file diff --git a/OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw_v2.py b/OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ad51847eabdb12117d3bb29c90de5f24f5b4b9b1 --- /dev/null +++ b/OAISIS_clean/dataclean_OASIS_2_Longitudinal_raw_v2.py @@ -0,0 +1,345 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-09-04 + +OASIS(Open Access Series of Imaging Studies) 是一个旨在向科研界免费提供脑部MRI数据的项目。本横断面(Cross-Sectional)数据集是其第一个版本,发布于2007年。 +OASIS-1 是横断面的,意味着它无法捕捉个体随时间的动态变化。对于研究疾病进展,后续的 OASIS-2 和 OASIS-3(纵向数据集)是更好的选择。 + +OASIS-2,全称为 Longitudinal Multimodal Neuroimaging: Principal 150 Subjects,是 OASIS 项目发布的第二个核心数据集。顾名思义,它的核心特点是 纵向(Longitudinal)。 + +核心目标: + 研究正常衰老和阿尔茨海默病(AD)中的大脑结构随时间变化的模式。 +研究设计: + 纵向研究。同一批受试者被多次扫描和评估,持续数年。 +样本量: + 150 名年龄在 60 到 96 岁之间的受试者。 +人群组成: + 所有 150 名受试者在首次扫描时都被诊断为认知正常(CDR = 0)。 + 在研究期间,部分受试者仍然保持认知正常,而另一部分则发展为痴呆(被临床诊断为可能患有阿尔茨海默病)。 +数据采集: + 每名受试者进行了 至少 2 次 的访视会话(session),最多达到了 5 次。 + 每次访视之间的平均间隔时间约为 2.2 年,整个研究跨度最长超过 7 年。 + 每次访视都包括:3-4 次 T1 加权 MRI 扫描(在单次会话中完成,用于平均以提高信噪比)和详细的临床神经心理评估。 +数据内容: + 与 OASIS-1 类似,包括原始 DICOM 图像、预处理后的 Analyze 格式图像,以及全面的临床认知评估数据。 + + +关键区别的详细解释 +横断面 vs. 纵向 (Cross-Sectional vs. 
Longitudinal): + OASIS-1 像是在给一个城市的所有人在同一天拍一张照片。你可以比较年轻人和老年人、健康人和病人的区别,但看不到任何一个人是如何变老或生病的。 + OASIS-2 像是挑选了150位健康的老年人,然后每年都给他们拍一张照片,持续好几年。这样你就能亲眼看到有些人如何慢慢地出现变化,最终生病。这对于理解疾病的过程至关重要。 +受试者群体的区别: + OASIS-1 包含了已经确诊的AD患者,非常适合训练一个模型来学习“AD大脑看起来是什么样”。 + OASIS-2 的受试者起点都是健康的,这使得它成为研究疾病前驱期(即临床症状出现之前)的宝贵资源。你可以分析那些最终患病的人,在多年前其大脑是否就已经存在细微的、可检测的差异。 +数据分析方法的差异: + 分析 OASIS-1 通常使用跨主体(cross-sectional)比较,例如比较AD组和正常对照组的平均海马体积。 + 分析 OASIS-2 则侧重于个体内部随时间的变化(within-subject change)。例如,为每个受试者计算其年化脑萎缩率,然后比较保持正常组和转化组之间的萎缩速率差异。这需要更复杂的纵向统计模型。 + + + 1. 人口统计学信息 + 性别(M/F) + 用手习惯(Hand)(均为右利手) + 年龄(Age) + 教育程度(Educ)(1-5级) + 社会经济地位(SES) + + 2. 临床评估 + MMSE(简易精神状态检查) + CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度) + + 3. 衍生解剖指标 + eTIV:估计颅内容积 + ASF:图谱缩放因子 + nWBV:标准化全脑体积 +''' +import os +import glob,re +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +import shutil +##dataset_meta +import warnings +warnings.filterwarnings("ignore") +meta_id_name='MRI ID' +##性别(M/F),用手习惯(Hand)(均为右利手),年龄(Age),教育程度(Educ)(1-5级),社会经济地位(SES),MMSE(简易精神状态检查),CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度),eTIV:估计颅内容积,ASF:图谱缩放因子,nWBV:标准化全脑体积 +#META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay'] +META_COLUMN=['Subject ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand','Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'] + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... 
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a POSIX-style one.

    Backslashes become forward slashes and a leading drive letter (``C:``) is
    removed. Some metadata files contain Windows paths although the data is
    stored on a Linux server.

    Only a drive prefix at the very start is stripped; a colon anywhere else
    in the path is kept, so a POSIX path that contains ``:`` passes through
    unchanged.

    Args:
        windows_path: Path string, Windows or POSIX style.

    Returns:
        The converted path string.
    """
    linux_path = windows_path.replace('\\', '/')
    has_drive_prefix = (
        len(linux_path) >= 2
        and linux_path[1] == ':'
        and linux_path[0].isascii()
        and linux_path[0].isalpha()
    )
    if has_drive_prefix:
        linux_path = linux_path[2:]
    return linux_path
series_files=glob.glob("%s/RAW/mpr-*.img"%(full_path)) + series_files.sort() + + if len(series_files)>0: + ##存在有效的MRI影像数据进行后续处理 + sitk_img_original=load_raw_images(series_files) + submodality=[re.search(r"mpr-\d{1}",os.path.basename(fp)).group(0) for fp in series_files] + sub_modality_dict={} + for idx,value in enumerate(submodality): + sub_modality_dict[idx]=value + + meta.add_keyvalue('Sub_modality',sub_modality_dict) + + else: + print("病例数据%s为空"%data_dir) + continue + + original_spacing = list(sitk_img_original.GetSpacing()) + original_size = list(sitk_img_original.GetSize()) + print(original_spacing) + is_4d_image = sitk_img_original.GetDimension() == 4 + + frame_flag=False + # --- Resampling Logic (Revised for 4D) --- + if is_4d_image: + + # Always process 4D images channel-wise for resampling + # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only + channels = [] + num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1 + channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing + + + for i in range(num_channels): + extractor = sitk.ExtractImageFilter() + current_3d_channel_size = original_size[:3] + + if sitk_img_original.GetDimension() == 4: + extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0]) + extractor.SetIndex([0,0,0,i]) + channel_3d_img = extractor.Execute(sitk_img_original) + else: + channel_3d_img = sitk_img_original + if i > 0: break + + channel_resampler = util.get_unisize_resampler( + channel_3d_img, 'linear', + spacing=channel_target_spacing, size=current_3d_channel_size + ) + if channel_resampler: + channels.append(channel_resampler.Execute(channel_3d_img)) + else: + channels.append(channel_3d_img) + + if channels: + if len(channels) > 1: # Only join if there are multiple channels + sitk_img_processed = 
sitk.JoinSeriesImageFilter().Execute(channels) + ##aded by yanguoqing on 2025-08-11 + frame_flag=True + # imgDict={} + # for kf_idx in range(num_channels): + # imgDict[str(kf_idx)]='none' + # if str(meta_ed):imgDict[str(meta_ed)]='ed' + # if str(meta_es):imgDict[str(meta_es)]='es' + # meta.add_keyvalue('ImgDict',imgDict) + elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize) + sitk_img_processed = channels[0] + elif TARGET_VOXEL_SPACING: # 3D image with target spacing + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear', + spacing=TARGET_VOXEL_SPACING, size=original_size) + if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + else: # 3D image, no TARGET_VOXEL_SPACING + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear', + spacing=original_spacing, size=original_size) + if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + + + + + + original_spacing = list(sitk_img_original.GetSpacing()) + original_size = list(sitk_img_original.GetSize()) + size_processed = list(sitk_img_processed.GetSize()) + print('size_processed',size_processed,original_size) + + + meta.add_keyvalue('Spacing_mm',min(original_spacing[:3])) + meta.add_keyvalue('OriImg_path',",".join(series_files)) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','head') + + + + output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz") + # output_path=convert_windows_to_linux_path(output_path) + ## + save_nifti(sitk_img_processed, output_image_file, full_path) + print(f"Saved NIfTI file to {output_image_file}") + ##Label processing + + + + except Exception as e: + print(e) + failed_files.append(data_dir) + print(f"Failed to load OASIS images from {data_dir}") + continue + + + + 
meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_image_file] = meta.get_meta_data() + json_file.seek(0) + # print(existing_mappings) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + # else: + # print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_2/OAS2_RAW//") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_2/RAW_V2") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) \ No newline at end of file diff --git a/OAISIS_clean/oasis2_longitudinal_demographics.csv b/OAISIS_clean/oasis2_longitudinal_demographics.csv new file mode 100644 index 0000000000000000000000000000000000000000..845b3637faa6333d43400c15d33a44d4d6c12d0f --- /dev/null +++ b/OAISIS_clean/oasis2_longitudinal_demographics.csv @@ -0,0 +1,374 @@ +Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF +OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2,27,0,1987,0.696,0.883 +OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2,30,0,2004,0.681,0.876 +OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23,0.5,1678,0.736,1.046 +OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28,0.5,1738,0.713,1.010 
+OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22,0.5,1698,0.701,1.034 +OAS2_0004,OAS2_0004_MR1,Nondemented,1,0,F,R,88,18,3,28,0,1215,0.710,1.444 +OAS2_0004,OAS2_0004_MR2,Nondemented,2,538,F,R,90,18,3,27,0,1200,0.718,1.462 +OAS2_0005,OAS2_0005_MR1,Nondemented,1,0,M,R,80,12,4,28,0,1689,0.712,1.039 +OAS2_0005,OAS2_0005_MR2,Nondemented,2,1010,M,R,83,12,4,29,0.5,1701,0.711,1.032 +OAS2_0005,OAS2_0005_MR3,Nondemented,3,1603,M,R,85,12,4,30,0,1699,0.705,1.033 +OAS2_0007,OAS2_0007_MR1,Demented,1,0,M,R,71,16,,28,0.5,1357,0.748,1.293 +OAS2_0007,OAS2_0007_MR3,Demented,3,518,M,R,73,16,,27,1,1365,0.727,1.286 +OAS2_0007,OAS2_0007_MR4,Demented,4,1281,M,R,75,16,,27,1,1372,0.710,1.279 +OAS2_0008,OAS2_0008_MR1,Nondemented,1,0,F,R,93,14,2,30,0,1272,0.698,1.380 +OAS2_0008,OAS2_0008_MR2,Nondemented,2,742,F,R,95,14,2,29,0,1257,0.703,1.396 +OAS2_0009,OAS2_0009_MR1,Demented,1,0,M,R,68,12,2,27,0.5,1457,0.806,1.205 +OAS2_0009,OAS2_0009_MR2,Demented,2,576,M,R,69,12,2,24,0.5,1480,0.791,1.186 +OAS2_0010,OAS2_0010_MR1,Demented,1,0,F,R,66,12,3,30,0.5,1447,0.769,1.213 +OAS2_0010,OAS2_0010_MR2,Demented,2,854,F,R,68,12,3,29,0.5,1482,0.752,1.184 +OAS2_0012,OAS2_0012_MR1,Nondemented,1,0,F,R,78,16,2,29,0,1333,0.748,1.316 +OAS2_0012,OAS2_0012_MR2,Nondemented,2,730,F,R,80,16,2,29,0,1323,0.738,1.326 +OAS2_0012,OAS2_0012_MR3,Nondemented,3,1598,F,R,83,16,2,29,0,1323,0.718,1.327 +OAS2_0013,OAS2_0013_MR1,Nondemented,1,0,F,R,81,12,4,30,0,1230,0.715,1.427 +OAS2_0013,OAS2_0013_MR2,Nondemented,2,643,F,R,82,12,4,30,0,1212,0.720,1.448 +OAS2_0013,OAS2_0013_MR3,Nondemented,3,1456,F,R,85,12,4,29,0,1225,0.710,1.433 +OAS2_0014,OAS2_0014_MR1,Demented,1,0,M,R,76,16,3,21,0.5,1602,0.697,1.096 +OAS2_0014,OAS2_0014_MR2,Demented,2,504,M,R,77,16,3,16,1,1590,0.696,1.104 +OAS2_0016,OAS2_0016_MR1,Demented,1,0,M,R,88,8,4,25,0.5,1651,0.660,1.063 +OAS2_0016,OAS2_0016_MR2,Demented,2,707,M,R,90,8,4,23,0.5,1668,0.646,1.052 +OAS2_0017,OAS2_0017_MR1,Nondemented,1,0,M,R,80,12,3,29,0,1783,0.752,0.985 
+OAS2_0017,OAS2_0017_MR3,Nondemented,3,617,M,R,81,12,3,27,0.5,1814,0.759,0.968 +OAS2_0017,OAS2_0017_MR4,Nondemented,4,1861,M,R,85,12,3,30,0,1820,0.755,0.964 +OAS2_0017,OAS2_0017_MR5,Nondemented,5,2400,M,R,86,12,3,27,0,1813,0.761,0.968 +OAS2_0018,OAS2_0018_MR1,Converted,1,0,F,R,87,14,1,30,0,1406,0.715,1.248 +OAS2_0018,OAS2_0018_MR3,Converted,3,489,F,R,88,14,1,29,0,1398,0.713,1.255 +OAS2_0018,OAS2_0018_MR4,Converted,4,1933,F,R,92,14,1,27,0.5,1423,0.696,1.234 +OAS2_0020,OAS2_0020_MR1,Converted,1,0,M,R,80,20,1,29,0,1587,0.693,1.106 +OAS2_0020,OAS2_0020_MR2,Converted,2,756,M,R,82,20,1,28,0.5,1606,0.677,1.093 +OAS2_0020,OAS2_0020_MR3,Converted,3,1563,M,R,84,20,1,26,0.5,1597,0.666,1.099 +OAS2_0021,OAS2_0021_MR1,Demented,1,0,M,R,72,20,1,26,0.5,1911,0.719,0.919 +OAS2_0021,OAS2_0021_MR2,Demented,2,1164,M,R,76,20,1,25,0.5,1926,0.736,0.911 +OAS2_0022,OAS2_0022_MR1,Nondemented,1,0,F,R,61,16,3,30,0,1313,0.805,1.337 +OAS2_0022,OAS2_0022_MR2,Nondemented,2,828,F,R,64,16,3,29,0,1316,0.796,1.333 +OAS2_0023,OAS2_0023_MR1,Demented,1,0,F,R,86,12,4,21,0.5,1247,0.662,1.407 +OAS2_0023,OAS2_0023_MR2,Demented,2,578,F,R,87,12,4,21,0.5,1250,0.652,1.405 +OAS2_0026,OAS2_0026_MR1,Demented,1,0,M,R,82,12,3,27,0.5,1420,0.713,1.236 +OAS2_0026,OAS2_0026_MR2,Demented,2,673,M,R,84,12,3,27,0.5,1445,0.695,1.214 +OAS2_0027,OAS2_0027_MR1,Nondemented,1,0,F,R,69,12,3,29,0,1365,0.783,1.286 +OAS2_0027,OAS2_0027_MR2,Nondemented,2,609,F,R,71,12,3,30,0,1360,0.782,1.291 +OAS2_0027,OAS2_0027_MR3,Nondemented,3,1234,F,R,73,12,3,30,0,1358,0.775,1.293 +OAS2_0027,OAS2_0027_MR4,Nondemented,4,1779,F,R,74,12,3,30,0,1353,0.772,1.297 +OAS2_0028,OAS2_0028_MR1,Demented,1,0,M,R,64,18,2,22,0.5,1547,0.737,1.134 +OAS2_0028,OAS2_0028_MR2,Demented,2,610,M,R,66,18,2,21,1,1562,0.717,1.124 +OAS2_0029,OAS2_0029_MR1,Nondemented,1,0,F,R,77,12,4,29,0,1377,0.734,1.275 +OAS2_0029,OAS2_0029_MR2,Nondemented,2,1099,F,R,80,12,4,30,0,1390,0.735,1.263 +OAS2_0030,OAS2_0030_MR1,Nondemented,1,0,F,R,60,18,1,30,0,1402,0.822,1.252 
+OAS2_0030,OAS2_0030_MR2,Nondemented,2,932,F,R,62,18,1,30,0,1392,0.817,1.261 +OAS2_0031,OAS2_0031_MR1,Converted,1,0,F,R,86,12,3,30,0,1430,0.718,1.227 +OAS2_0031,OAS2_0031_MR2,Converted,2,446,F,R,88,12,3,30,0,1445,0.719,1.215 +OAS2_0031,OAS2_0031_MR3,Converted,3,1588,F,R,91,12,3,28,0.5,1463,0.696,1.199 +OAS2_0032,OAS2_0032_MR1,Demented,1,0,M,R,90,12,3,21,0.5,1307,0.679,1.342 +OAS2_0032,OAS2_0032_MR2,Demented,2,642,M,R,92,12,3,24,0.5,1311,0.676,1.339 +OAS2_0034,OAS2_0034_MR1,Nondemented,1,0,F,R,79,16,1,29,0,1466,0.703,1.197 +OAS2_0034,OAS2_0034_MR2,Nondemented,2,489,F,R,80,16,1,30,0,1450,0.698,1.210 +OAS2_0034,OAS2_0034_MR3,Nondemented,3,1287,F,R,82,16,1,30,0,1460,0.695,1.202 +OAS2_0034,OAS2_0034_MR4,Nondemented,4,1884,F,R,84,16,1,30,0,1453,0.684,1.208 +OAS2_0035,OAS2_0035_MR1,Nondemented,1,0,F,R,88,12,4,30,0,1336,0.738,1.313 +OAS2_0035,OAS2_0035_MR2,Nondemented,2,405,F,R,89,12,4,27,0,1329,0.733,1.320 +OAS2_0036,OAS2_0036_MR1,Nondemented,1,0,F,R,69,13,4,30,0,1359,0.789,1.291 +OAS2_0036,OAS2_0036_MR3,Nondemented,3,713,F,R,70,13,4,30,0,1361,0.783,1.290 +OAS2_0036,OAS2_0036_MR4,Nondemented,4,1770,F,R,73,13,4,30,0,1360,0.773,1.291 +OAS2_0036,OAS2_0036_MR5,Nondemented,5,2369,F,R,75,13,4,29,0,1349,0.778,1.301 +OAS2_0037,OAS2_0037_MR1,Demented,1,0,M,R,82,12,4,27,0.5,1477,0.729,1.188 +OAS2_0037,OAS2_0037_MR2,Demented,2,1123,M,R,85,12,4,29,0.5,1487,0.717,1.180 +OAS2_0037,OAS2_0037_MR3,Demented,3,2029,M,R,88,12,4,26,0.5,1483,0.709,1.184 +OAS2_0037,OAS2_0037_MR4,Demented,4,2508,M,R,89,12,4,26,0.5,1485,0.706,1.181 +OAS2_0039,OAS2_0039_MR1,Demented,1,0,F,R,81,18,2,26,0.5,1174,0.742,1.495 +OAS2_0039,OAS2_0039_MR2,Demented,2,486,F,R,83,18,2,25,0.5,1179,0.733,1.488 +OAS2_0040,OAS2_0040_MR1,Demented,1,0,M,R,84,6,4,25,0.5,1310,0.727,1.339 +OAS2_0040,OAS2_0040_MR2,Demented,2,567,M,R,86,6,4,27,0.5,1320,0.724,1.329 +OAS2_0040,OAS2_0040_MR3,Demented,3,1204,M,R,88,6,4,23,0.5,1348,0.713,1.302 +OAS2_0041,OAS2_0041_MR1,Converted,1,0,F,R,71,16,1,27,0,1289,0.771,1.362 
+OAS2_0041,OAS2_0041_MR2,Converted,2,756,F,R,73,16,1,28,0,1295,0.768,1.356 +OAS2_0041,OAS2_0041_MR3,Converted,3,1331,F,R,75,16,1,28,0.5,1314,0.760,1.335 +OAS2_0042,OAS2_0042_MR1,Nondemented,1,0,F,R,70,17,3,29,0,1640,0.766,1.070 +OAS2_0042,OAS2_0042_MR2,Nondemented,2,1008,F,R,73,17,3,29,0,1665,0.748,1.054 +OAS2_0043,OAS2_0043_MR1,Demented,1,0,F,R,72,12,4,26,0.5,1453,0.777,1.208 +OAS2_0043,OAS2_0043_MR2,Demented,2,491,F,R,73,12,4,26,0.5,1451,0.757,1.210 +OAS2_0044,OAS2_0044_MR1,Demented,1,0,M,R,68,14,4,21,1,1333,0.685,1.317 +OAS2_0044,OAS2_0044_MR2,Demented,2,352,M,R,69,14,4,15,1,1331,0.678,1.318 +OAS2_0044,OAS2_0044_MR3,Demented,3,866,M,R,71,14,4,22,1,1332,0.679,1.317 +OAS2_0045,OAS2_0045_MR1,Nondemented,1,0,F,R,75,18,1,30,0,1317,0.737,1.332 +OAS2_0045,OAS2_0045_MR2,Nondemented,2,689,F,R,77,18,1,29,0,1322,0.731,1.327 +OAS2_0046,OAS2_0046_MR1,Demented,1,0,F,R,83,15,2,20,0.5,1476,0.750,1.189 +OAS2_0046,OAS2_0046_MR2,Demented,2,575,F,R,85,15,2,22,1,1483,0.748,1.183 +OAS2_0047,OAS2_0047_MR1,Nondemented,1,0,F,R,77,16,2,29,0,1433,0.723,1.225 +OAS2_0047,OAS2_0047_MR2,Nondemented,2,486,F,R,78,16,2,27,0,1414,0.727,1.242 +OAS2_0048,OAS2_0048_MR1,Demented,1,0,M,R,66,16,1,19,1,1695,0.711,1.036 +OAS2_0048,OAS2_0048_MR2,Demented,2,248,M,R,66,16,1,21,1,1708,0.703,1.028 +OAS2_0048,OAS2_0048_MR3,Demented,3,647,M,R,68,16,1,19,1,1712,0.691,1.025 +OAS2_0048,OAS2_0048_MR4,Demented,4,970,M,R,68,16,1,7,1,1714,0.682,1.024 +OAS2_0048,OAS2_0048_MR5,Demented,5,1233,M,R,69,16,1,4,1,1701,0.676,1.032 +OAS2_0049,OAS2_0049_MR1,Nondemented,1,0,F,R,69,16,3,30,0,1491,0.794,1.177 +OAS2_0049,OAS2_0049_MR2,Nondemented,2,395,F,R,70,16,3,30,0,1505,0.791,1.166 +OAS2_0049,OAS2_0049_MR3,Nondemented,3,687,F,R,71,16,3,30,0,1503,0.788,1.168 +OAS2_0050,OAS2_0050_MR1,Demented,1,0,M,R,71,12,4,20,0.5,1461,0.724,1.202 +OAS2_0050,OAS2_0050_MR2,Demented,2,538,M,R,72,12,4,17,1,1483,0.695,1.184 +OAS2_0051,OAS2_0051_MR1,Nondemented,1,0,F,R,92,23,1,29,0,1454,0.701,1.207 
+OAS2_0051,OAS2_0051_MR2,Nondemented,2,457,F,R,94,23,1,29,0,1474,0.696,1.190 +OAS2_0051,OAS2_0051_MR3,Nondemented,3,1526,F,R,97,23,1,30,0,1483,0.689,1.184 +OAS2_0052,OAS2_0052_MR1,Nondemented,1,0,M,R,74,18,2,29,0,1463,0.737,1.199 +OAS2_0052,OAS2_0052_MR2,Nondemented,2,1510,M,R,78,18,2,30,0,1484,0.703,1.183 +OAS2_0053,OAS2_0053_MR1,Nondemented,1,0,F,R,82,16,3,29,0,1484,0.760,1.183 +OAS2_0053,OAS2_0053_MR2,Nondemented,2,842,F,R,84,16,3,28,0,1500,0.744,1.170 +OAS2_0054,OAS2_0054_MR1,Converted,1,0,F,R,85,18,1,29,0,1264,0.701,1.388 +OAS2_0054,OAS2_0054_MR2,Converted,2,846,F,R,87,18,1,24,0.5,1275,0.683,1.376 +OAS2_0055,OAS2_0055_MR1,Nondemented,1,0,M,R,65,13,3,29,0,1362,0.837,1.289 +OAS2_0055,OAS2_0055_MR2,Nondemented,2,726,M,R,67,13,3,27,0,1365,0.827,1.285 +OAS2_0056,OAS2_0056_MR1,Nondemented,1,0,F,R,71,14,2,28,0,1461,0.756,1.202 +OAS2_0056,OAS2_0056_MR2,Nondemented,2,622,F,R,73,14,2,30,0,1456,0.739,1.205 +OAS2_0057,OAS2_0057_MR1,Nondemented,1,0,F,R,81,12,2,30,0,1599,0.755,1.098 +OAS2_0057,OAS2_0057_MR2,Nondemented,2,640,F,R,83,12,2,29,0,1569,0.757,1.118 +OAS2_0057,OAS2_0057_MR3,Nondemented,3,1340,F,R,85,12,2,30,0,1580,0.739,1.111 +OAS2_0058,OAS2_0058_MR1,Demented,1,0,M,R,78,14,3,30,0.5,1315,0.707,1.335 +OAS2_0058,OAS2_0058_MR2,Demented,2,212,M,R,79,14,3,26,0.5,1308,0.706,1.341 +OAS2_0058,OAS2_0058_MR3,Demented,3,764,M,R,80,14,3,29,0.5,1324,0.695,1.326 +OAS2_0060,OAS2_0060_MR1,Demented,1,0,M,R,75,13,4,29,0.5,1416,0.766,1.239 +OAS2_0060,OAS2_0060_MR2,Demented,2,1290,M,R,78,13,4,28,0.5,1408,0.757,1.247 +OAS2_0061,OAS2_0061_MR1,Nondemented,1,0,M,R,68,18,1,30,0,1654,0.747,1.061 +OAS2_0061,OAS2_0061_MR2,Nondemented,2,873,M,R,70,18,1,30,0,1660,0.738,1.057 +OAS2_0061,OAS2_0061_MR3,Nondemented,3,1651,M,R,72,18,1,30,0,1681,0.729,1.044 +OAS2_0062,OAS2_0062_MR1,Nondemented,1,0,F,R,79,18,2,29,0,1641,0.695,1.069 +OAS2_0062,OAS2_0062_MR2,Nondemented,2,723,F,R,81,18,2,30,0,1664,0.677,1.055 +OAS2_0062,OAS2_0062_MR3,Nondemented,3,1351,F,R,83,18,2,29,0,1667,0.688,1.053 
+OAS2_0063,OAS2_0063_MR1,Demented,1,0,F,R,80,12,,30,0.5,1430,0.737,1.228 +OAS2_0063,OAS2_0063_MR2,Demented,2,490,F,R,81,12,,27,0.5,1453,0.721,1.208 +OAS2_0064,OAS2_0064_MR1,Demented,1,0,F,R,78,8,5,23,1,1462,0.691,1.200 +OAS2_0064,OAS2_0064_MR2,Demented,2,830,F,R,81,8,5,26,0.5,1459,0.694,1.203 +OAS2_0064,OAS2_0064_MR3,Demented,3,1282,F,R,82,8,5,18,0.5,1464,0.682,1.199 +OAS2_0066,OAS2_0066_MR1,Demented,1,0,M,R,61,18,1,30,1,1957,0.734,0.897 +OAS2_0066,OAS2_0066_MR2,Demented,2,497,M,R,62,18,1,30,0.5,1928,0.731,0.910 +OAS2_0067,OAS2_0067_MR1,Nondemented,1,0,M,R,67,12,4,30,0,1440,0.727,1.219 +OAS2_0067,OAS2_0067_MR2,Nondemented,2,451,M,R,68,12,4,29,0,1438,0.738,1.220 +OAS2_0067,OAS2_0067_MR3,Nondemented,3,1438,M,R,71,12,4,29,0,1455,0.724,1.206 +OAS2_0067,OAS2_0067_MR4,Nondemented,4,2163,M,R,73,12,4,28,0,1444,0.722,1.215 +OAS2_0068,OAS2_0068_MR1,Nondemented,1,0,F,R,88,12,3,30,0,1428,0.700,1.229 +OAS2_0068,OAS2_0068_MR2,Nondemented,2,743,F,R,90,12,3,29,0,1475,0.676,1.190 +OAS2_0069,OAS2_0069_MR1,Nondemented,1,0,F,R,81,18,2,29,0,1470,0.687,1.194 +OAS2_0069,OAS2_0069_MR2,Nondemented,2,432,F,R,82,18,2,30,0,1471,0.690,1.193 +OAS2_0070,OAS2_0070_MR1,Nondemented,1,0,M,R,80,17,1,28,0,1660,0.728,1.057 +OAS2_0070,OAS2_0070_MR2,Nondemented,2,672,M,R,82,17,1,29,0,1692,0.723,1.037 +OAS2_0070,OAS2_0070_MR3,Nondemented,3,1415,M,R,84,17,1,29,0,1707,0.717,1.028 +OAS2_0070,OAS2_0070_MR4,Nondemented,4,1870,M,R,85,17,1,30,0,1724,0.704,1.018 +OAS2_0070,OAS2_0070_MR5,Nondemented,5,2386,M,R,86,17,1,30,0,1720,0.705,1.020 +OAS2_0071,OAS2_0071_MR1,Demented,1,0,F,R,83,13,2,27,1,1391,0.705,1.262 +OAS2_0071,OAS2_0071_MR2,Demented,2,365,F,R,84,13,2,28,1,1402,0.695,1.252 +OAS2_0073,OAS2_0073_MR1,Nondemented,1,0,F,R,70,14,3,29,0,1524,0.787,1.151 +OAS2_0073,OAS2_0073_MR2,Nondemented,2,580,F,R,72,14,3,28,0,1512,0.777,1.161 +OAS2_0073,OAS2_0073_MR3,Nondemented,3,1705,F,R,75,14,3,28,0,1507,0.782,1.164 +OAS2_0073,OAS2_0073_MR4,Nondemented,4,2288,F,R,76,14,3,29,0,1490,0.774,1.178 
+OAS2_0073,OAS2_0073_MR5,Nondemented,5,2517,F,R,77,14,3,29,0,1504,0.769,1.167 +OAS2_0075,OAS2_0075_MR1,Demented,1,0,F,R,73,8,5,25,0.5,1151,0.743,1.525 +OAS2_0075,OAS2_0075_MR2,Demented,2,567,F,R,75,8,5,22,0.5,1143,0.741,1.535 +OAS2_0076,OAS2_0076_MR1,Nondemented,1,0,F,R,66,18,2,30,0,1504,0.725,1.167 +OAS2_0076,OAS2_0076_MR2,Nondemented,2,956,F,R,69,18,2,29,0,1536,0.719,1.143 +OAS2_0076,OAS2_0076_MR3,Nondemented,3,1663,F,R,71,18,2,30,0,1520,0.718,1.155 +OAS2_0077,OAS2_0077_MR1,Nondemented,1,0,M,R,69,16,2,28,0,1848,0.737,0.950 +OAS2_0077,OAS2_0077_MR2,Nondemented,2,1393,M,R,73,16,2,29,0,1931,0.722,0.909 +OAS2_0078,OAS2_0078_MR1,Nondemented,1,0,M,R,89,16,1,28,0,1631,0.674,1.076 +OAS2_0078,OAS2_0078_MR2,Nondemented,2,441,M,R,91,16,1,28,0,1640,0.670,1.070 +OAS2_0078,OAS2_0078_MR3,Nondemented,3,1019,M,R,92,16,1,30,0,1662,0.682,1.056 +OAS2_0079,OAS2_0079_MR1,Demented,1,0,F,R,69,12,4,23,0.5,1447,0.759,1.213 +OAS2_0079,OAS2_0079_MR2,Demented,2,584,F,R,71,12,4,16,1,1492,0.725,1.176 +OAS2_0079,OAS2_0079_MR3,Demented,3,1435,F,R,73,12,4,16,1,1478,0.696,1.188 +OAS2_0080,OAS2_0080_MR1,Demented,1,0,M,R,66,15,2,25,0.5,1548,0.727,1.134 +OAS2_0080,OAS2_0080_MR2,Demented,2,580,M,R,68,15,2,30,0.5,1556,0.713,1.128 +OAS2_0080,OAS2_0080_MR3,Demented,3,1209,M,R,69,15,2,28,0.5,1546,0.724,1.135 +OAS2_0081,OAS2_0081_MR1,Demented,1,0,F,R,82,12,4,26,0.5,1271,0.695,1.381 +OAS2_0081,OAS2_0081_MR2,Demented,2,659,F,R,84,12,4,26,0.5,1273,0.686,1.378 +OAS2_0085,OAS2_0085_MR1,Nondemented,1,0,F,R,78,8,5,29,0,1383,0.756,1.269 +OAS2_0085,OAS2_0085_MR2,Nondemented,2,670,F,R,80,8,5,27,0,1381,0.751,1.270 +OAS2_0086,OAS2_0086_MR1,Nondemented,1,0,F,R,63,15,2,28,0,1544,0.805,1.136 +OAS2_0086,OAS2_0086_MR2,Nondemented,2,802,F,R,65,15,2,28,0,1542,0.792,1.138 +OAS2_0087,OAS2_0087_MR1,Demented,1,0,F,R,96,17,1,26,1,1465,0.683,1.198 +OAS2_0087,OAS2_0087_MR2,Demented,2,754,F,R,98,17,1,21,2,1503,0.660,1.168 +OAS2_0088,OAS2_0088_MR1,Demented,1,0,M,R,78,12,4,21,1,1477,0.672,1.188 
+OAS2_0088,OAS2_0088_MR2,Demented,2,751,M,R,80,12,4,20,1,1494,0.661,1.175 +OAS2_0089,OAS2_0089_MR1,Demented,1,0,M,R,70,12,2,29,0.5,1432,0.692,1.225 +OAS2_0089,OAS2_0089_MR3,Demented,3,563,M,R,72,12,2,27,1,1432,0.684,1.226 +OAS2_0090,OAS2_0090_MR1,Nondemented,1,0,M,R,73,18,2,29,0,1548,0.773,1.134 +OAS2_0090,OAS2_0090_MR2,Nondemented,2,680,M,R,75,18,2,29,0,1534,0.772,1.144 +OAS2_0090,OAS2_0090_MR3,Nondemented,3,1345,M,R,76,18,2,30,0,1550,0.758,1.133 +OAS2_0091,OAS2_0091_MR1,Nondemented,1,0,M,R,75,12,4,28,0,1511,0.739,1.162 +OAS2_0091,OAS2_0091_MR2,Nondemented,2,1047,M,R,78,12,4,29,0,1506,0.715,1.166 +OAS2_0092,OAS2_0092_MR1,Converted,1,0,F,R,83,12,2,28,0,1383,0.748,1.269 +OAS2_0092,OAS2_0092_MR2,Converted,2,706,F,R,84,12,2,27,0.5,1390,0.728,1.263 +OAS2_0094,OAS2_0094_MR1,Nondemented,1,0,F,R,61,16,1,30,0,1513,0.771,1.160 +OAS2_0094,OAS2_0094_MR2,Nondemented,2,817,F,R,63,16,1,30,0,1449,0.774,1.212 +OAS2_0095,OAS2_0095_MR1,Nondemented,1,0,M,R,71,18,1,30,0,1769,0.699,0.992 +OAS2_0095,OAS2_0095_MR2,Nondemented,2,673,M,R,72,18,1,29,0,1785,0.687,0.983 +OAS2_0095,OAS2_0095_MR3,Nondemented,3,1412,M,R,74,18,1,29,0,1814,0.679,0.967 +OAS2_0096,OAS2_0096_MR1,Nondemented,1,0,F,R,89,13,3,29,0,1154,0.750,1.521 +OAS2_0096,OAS2_0096_MR2,Nondemented,2,778,F,R,91,13,3,28,0,1165,0.736,1.506 +OAS2_0097,OAS2_0097_MR1,Nondemented,1,0,M,R,74,16,2,30,0,1611,0.729,1.089 +OAS2_0097,OAS2_0097_MR2,Nondemented,2,1024,M,R,77,16,2,30,0,1628,0.709,1.078 +OAS2_0098,OAS2_0098_MR1,Demented,1,0,M,R,66,12,4,30,0.5,1446,0.780,1.214 +OAS2_0098,OAS2_0098_MR2,Demented,2,661,M,R,67,12,4,28,0.5,1412,0.783,1.243 +OAS2_0099,OAS2_0099_MR1,Demented,1,0,F,R,80,12,,27,0.5,1475,0.762,1.190 +OAS2_0099,OAS2_0099_MR2,Demented,2,807,F,R,83,12,,23,0.5,1484,0.750,1.183 +OAS2_0100,OAS2_0100_MR1,Nondemented,1,0,F,R,77,11,4,29,0,1583,0.777,1.108 +OAS2_0100,OAS2_0100_MR2,Nondemented,2,1218,F,R,80,11,4,30,0,1586,0.757,1.107 +OAS2_0100,OAS2_0100_MR3,Nondemented,3,1752,F,R,82,11,4,30,0,1590,0.760,1.104 
+OAS2_0101,OAS2_0101_MR1,Nondemented,1,0,F,R,71,18,2,30,0,1371,0.769,1.280 +OAS2_0101,OAS2_0101_MR2,Nondemented,2,952,F,R,74,18,2,30,0,1400,0.752,1.254 +OAS2_0101,OAS2_0101_MR3,Nondemented,3,1631,F,R,76,18,2,30,0,1379,0.757,1.273 +OAS2_0102,OAS2_0102_MR1,Demented,1,0,M,R,82,15,3,29,0.5,1499,0.689,1.171 +OAS2_0102,OAS2_0102_MR2,Demented,2,610,M,R,84,15,3,29,0.5,1497,0.686,1.172 +OAS2_0102,OAS2_0102_MR3,Demented,3,1387,M,R,86,15,3,30,0.5,1498,0.681,1.171 +OAS2_0103,OAS2_0103_MR1,Converted,1,0,F,R,69,16,1,30,0,1404,0.750,1.250 +OAS2_0103,OAS2_0103_MR2,Converted,2,1554,F,R,74,16,1,30,0.5,1423,0.722,1.233 +OAS2_0103,OAS2_0103_MR3,Converted,3,2002,F,R,75,16,1,30,0.5,1419,0.731,1.236 +OAS2_0104,OAS2_0104_MR1,Demented,1,0,M,R,70,16,1,25,0.5,1568,0.696,1.119 +OAS2_0104,OAS2_0104_MR2,Demented,2,465,M,R,71,16,1,17,1,1562,0.685,1.123 +OAS2_0105,OAS2_0105_MR1,Nondemented,1,0,M,R,86,12,4,29,0,1783,0.703,0.984 +OAS2_0105,OAS2_0105_MR2,Nondemented,2,675,M,R,87,12,4,30,0,1762,0.718,0.996 +OAS2_0106,OAS2_0106_MR1,Demented,1,0,F,R,70,11,4,22,1,1445,0.722,1.214 +OAS2_0106,OAS2_0106_MR2,Demented,2,729,F,R,72,11,4,21,1,1489,0.686,1.179 +OAS2_0108,OAS2_0108_MR1,Demented,1,0,M,R,77,18,1,25,0.5,1604,0.781,1.094 +OAS2_0108,OAS2_0108_MR2,Demented,2,883,M,R,79,18,1,27,0.5,1569,0.781,1.118 +OAS2_0109,OAS2_0109_MR1,Nondemented,1,0,M,R,81,11,4,28,0,1750,0.670,1.003 +OAS2_0109,OAS2_0109_MR2,Nondemented,2,766,M,R,83,11,4,29,0,1744,0.670,1.006 +OAS2_0111,OAS2_0111_MR1,Demented,1,0,M,R,62,12,4,17,0.5,1525,0.732,1.151 +OAS2_0111,OAS2_0111_MR2,Demented,2,881,M,R,65,12,4,17,0.5,1520,0.699,1.155 +OAS2_0112,OAS2_0112_MR1,Demented,1,0,F,R,76,12,3,27,0.5,1315,0.698,1.335 +OAS2_0112,OAS2_0112_MR2,Demented,2,558,F,R,78,12,3,20,0.5,1339,0.689,1.311 +OAS2_0113,OAS2_0113_MR1,Demented,1,0,F,R,73,13,2,23,0.5,1536,0.725,1.142 +OAS2_0113,OAS2_0113_MR2,Demented,2,504,F,R,75,13,2,28,0.5,1520,0.708,1.155 +OAS2_0114,OAS2_0114_MR1,Demented,1,0,F,R,76,12,,27,0.5,1316,0.727,1.333 
+OAS2_0114,OAS2_0114_MR2,Demented,2,570,F,R,78,12,,27,1,1309,0.709,1.341 +OAS2_0116,OAS2_0116_MR1,Demented,1,0,F,R,73,12,3,27,0.5,1425,0.769,1.232 +OAS2_0116,OAS2_0116_MR2,Demented,2,616,F,R,75,12,3,28,0.5,1407,0.770,1.247 +OAS2_0117,OAS2_0117_MR1,Nondemented,1,0,M,R,73,20,2,30,0,1842,0.758,0.953 +OAS2_0117,OAS2_0117_MR2,Nondemented,2,576,M,R,74,20,2,30,0,1806,0.759,0.972 +OAS2_0117,OAS2_0117_MR3,Nondemented,3,1345,M,R,76,20,2,30,0,1823,0.739,0.963 +OAS2_0117,OAS2_0117_MR4,Nondemented,4,1927,M,R,78,20,2,29,0,1826,0.734,0.961 +OAS2_0118,OAS2_0118_MR1,Converted,1,0,F,R,67,14,4,30,0,1508,0.794,1.164 +OAS2_0118,OAS2_0118_MR2,Converted,2,1422,F,R,71,14,4,26,0.5,1529,0.788,1.147 +OAS2_0119,OAS2_0119_MR1,Nondemented,1,0,F,R,81,15,2,28,0,1486,0.754,1.181 +OAS2_0119,OAS2_0119_MR2,Nondemented,2,733,F,R,83,15,2,29,0,1482,0.751,1.184 +OAS2_0119,OAS2_0119_MR3,Nondemented,3,1713,F,R,85,15,2,30,0,1488,0.741,1.180 +OAS2_0120,OAS2_0120_MR1,Demented,1,0,F,R,76,14,3,25,1,1409,0.715,1.246 +OAS2_0120,OAS2_0120_MR2,Demented,2,595,F,R,78,14,3,15,2,1401,0.700,1.253 +OAS2_0121,OAS2_0121_MR1,Nondemented,1,0,F,R,73,11,4,30,0,1475,0.726,1.190 +OAS2_0121,OAS2_0121_MR2,Nondemented,2,647,F,R,74,11,4,30,0,1517,0.705,1.157 +OAS2_0122,OAS2_0122_MR1,Nondemented,1,0,F,R,86,16,3,30,0,1293,0.747,1.357 +OAS2_0122,OAS2_0122_MR2,Nondemented,2,597,F,R,88,16,3,30,0,1295,0.744,1.355 +OAS2_0124,OAS2_0124_MR1,Demented,1,0,M,R,70,16,3,29,0.5,1463,0.749,1.200 +OAS2_0124,OAS2_0124_MR2,Demented,2,472,M,R,71,16,3,27,0.5,1479,0.750,1.187 +OAS2_0126,OAS2_0126_MR1,Nondemented,1,0,F,R,74,12,3,29,0,1344,0.739,1.306 +OAS2_0126,OAS2_0126_MR2,Nondemented,2,472,F,R,75,12,3,29,0,1338,0.747,1.312 +OAS2_0126,OAS2_0126_MR3,Nondemented,3,1192,F,R,77,12,3,29,0,1344,0.740,1.306 +OAS2_0127,OAS2_0127_MR1,Converted,1,0,M,R,79,18,1,29,0,1644,0.729,1.067 +OAS2_0127,OAS2_0127_MR2,Converted,2,851,M,R,81,18,1,29,0.5,1654,0.720,1.061 +OAS2_0127,OAS2_0127_MR3,Converted,3,1042,M,R,81,18,1,29,0.5,1647,0.717,1.066 
+OAS2_0127,OAS2_0127_MR4,Converted,4,2153,M,R,84,18,1,29,0.5,1668,0.694,1.052 +OAS2_0127,OAS2_0127_MR5,Converted,5,2639,M,R,86,18,1,30,0.5,1670,0.669,1.051 +OAS2_0128,OAS2_0128_MR1,Nondemented,1,0,F,R,76,16,1,28,0,1346,0.762,1.304 +OAS2_0128,OAS2_0128_MR2,Nondemented,2,1140,F,R,79,16,1,29,0,1354,0.739,1.297 +OAS2_0129,OAS2_0129_MR1,Nondemented,1,0,F,R,78,18,1,30,0,1440,0.666,1.219 +OAS2_0129,OAS2_0129_MR2,Nondemented,2,737,F,R,80,18,1,30,0,1436,0.663,1.222 +OAS2_0129,OAS2_0129_MR3,Nondemented,3,1591,F,R,82,18,1,29,0,1442,0.644,1.217 +OAS2_0131,OAS2_0131_MR1,Converted,1,0,F,R,65,12,2,30,0.5,1340,0.754,1.309 +OAS2_0131,OAS2_0131_MR2,Converted,2,679,F,R,67,12,2,25,0,1331,0.761,1.318 +OAS2_0133,OAS2_0133_MR1,Converted,1,0,F,R,78,12,3,29,0,1475,0.731,1.190 +OAS2_0133,OAS2_0133_MR3,Converted,3,1006,F,R,81,12,3,28,0.5,1495,0.687,1.174 +OAS2_0134,OAS2_0134_MR1,Demented,1,0,F,R,70,11,4,29,0.5,1295,0.748,1.355 +OAS2_0134,OAS2_0134_MR2,Demented,2,539,F,R,71,11,4,28,0.5,1284,0.741,1.367 +OAS2_0135,OAS2_0135_MR1,Nondemented,1,0,M,R,74,18,2,30,0,1636,0.680,1.073 +OAS2_0135,OAS2_0135_MR2,Nondemented,2,1146,M,R,78,18,2,27,0,1645,0.663,1.067 +OAS2_0137,OAS2_0137_MR1,Demented,1,0,M,R,74,18,2,28,0.5,1659,0.739,1.058 +OAS2_0137,OAS2_0137_MR2,Demented,2,636,M,R,75,18,2,30,0.5,1651,0.737,1.063 +OAS2_0138,OAS2_0138_MR1,Nondemented,1,0,F,R,73,16,2,29,0,1123,0.786,1.563 +OAS2_0138,OAS2_0138_MR2,Nondemented,2,846,F,R,75,16,2,28,0,1106,0.767,1.587 +OAS2_0139,OAS2_0139_MR1,Demented,1,0,F,R,67,16,1,29,0.5,1337,0.766,1.312 +OAS2_0139,OAS2_0139_MR2,Demented,2,403,F,R,68,16,1,29,0.5,1344,0.733,1.305 +OAS2_0140,OAS2_0140_MR1,Demented,1,0,F,R,76,16,3,26,0.5,1391,0.705,1.262 +OAS2_0140,OAS2_0140_MR2,Demented,2,793,F,R,78,16,3,27,0.5,1393,0.690,1.260 +OAS2_0140,OAS2_0140_MR3,Demented,3,1655,F,R,81,16,3,25,0.5,1396,0.687,1.257 +OAS2_0141,OAS2_0141_MR1,Nondemented,1,0,F,R,65,18,2,30,0,1277,0.812,1.374 +OAS2_0141,OAS2_0141_MR2,Nondemented,2,1022,F,R,68,18,2,29,0,1290,0.795,1.361 
+OAS2_0142,OAS2_0142_MR1,Nondemented,1,0,F,R,69,16,3,29,0,1380,0.819,1.272 +OAS2_0142,OAS2_0142_MR2,Nondemented,2,665,F,R,71,16,3,28,0,1390,0.810,1.262 +OAS2_0143,OAS2_0143_MR1,Nondemented,1,0,F,R,89,18,2,30,0,1715,0.746,1.023 +OAS2_0143,OAS2_0143_MR2,Nondemented,2,561,F,R,91,18,2,30,0,1714,0.741,1.024 +OAS2_0143,OAS2_0143_MR3,Nondemented,3,1553,F,R,93,18,2,29,0,1744,0.723,1.006 +OAS2_0144,OAS2_0144_MR1,Converted,1,0,M,R,77,16,1,30,0,1704,0.716,1.030 +OAS2_0144,OAS2_0144_MR2,Converted,2,683,M,R,79,16,1,30,0.5,1722,0.708,1.019 +OAS2_0145,OAS2_0145_MR1,Converted,1,0,F,R,68,16,3,30,0,1298,0.799,1.352 +OAS2_0145,OAS2_0145_MR2,Converted,2,1707,F,R,73,16,3,29,0.5,1287,0.771,1.364 +OAS2_0146,OAS2_0146_MR1,Demented,1,0,F,R,80,15,2,20,1,1732,0.685,1.013 +OAS2_0146,OAS2_0146_MR2,Demented,2,525,F,R,82,15,2,20,1,1729,0.698,1.015 +OAS2_0147,OAS2_0147_MR1,Nondemented,1,0,F,R,77,13,2,29,0,1351,0.769,1.299 +OAS2_0147,OAS2_0147_MR2,Nondemented,2,440,F,R,78,13,2,29,0,1334,0.769,1.316 +OAS2_0147,OAS2_0147_MR3,Nondemented,3,1204,F,R,80,13,2,28,0,1337,0.762,1.313 +OAS2_0147,OAS2_0147_MR4,Nondemented,4,1806,F,R,82,13,2,30,0,1342,0.747,1.307 +OAS2_0149,OAS2_0149_MR1,Nondemented,1,0,F,R,81,13,2,29,0,1345,0.737,1.305 +OAS2_0149,OAS2_0149_MR2,Nondemented,2,674,F,R,83,13,2,30,0,1335,0.732,1.314 +OAS2_0150,OAS2_0150_MR1,Demented,1,0,F,R,73,12,3,30,0.5,1343,0.720,1.306 +OAS2_0150,OAS2_0150_MR2,Demented,2,518,F,R,75,12,3,27,1,1357,0.714,1.293 +OAS2_0152,OAS2_0152_MR1,Nondemented,1,0,F,R,66,18,2,29,0,1191,0.785,1.474 +OAS2_0152,OAS2_0152_MR2,Nondemented,2,790,F,R,68,18,2,29,0,1194,0.772,1.469 +OAS2_0152,OAS2_0152_MR3,Nondemented,3,1329,F,R,69,18,2,29,0,1202,0.770,1.461 +OAS2_0154,OAS2_0154_MR1,Nondemented,1,0,F,R,75,18,1,29,0,1436,0.750,1.222 +OAS2_0154,OAS2_0154_MR2,Nondemented,2,791,F,R,77,18,1,28,0,1559,0.713,1.125 +OAS2_0156,OAS2_0156_MR1,Nondemented,1,0,F,R,78,18,1,30,0,1243,0.748,1.412 +OAS2_0156,OAS2_0156_MR2,Nondemented,2,777,F,R,81,18,1,30,0,1256,0.739,1.398 
+OAS2_0157,OAS2_0157_MR1,Demented,1,0,F,R,73,12,2,19,1,1274,0.728,1.377 +OAS2_0157,OAS2_0157_MR2,Demented,2,764,F,R,75,12,2,18,1,1479,0.657,1.187 +OAS2_0158,OAS2_0158_MR1,Nondemented,1,0,F,R,73,15,4,29,0,1272,0.697,1.380 +OAS2_0158,OAS2_0158_MR2,Nondemented,2,1399,F,R,76,15,4,29,0,1281,0.680,1.370 +OAS2_0159,OAS2_0159_MR1,Demented,1,0,F,R,73,14,3,29,0.5,1238,0.757,1.418 +OAS2_0159,OAS2_0159_MR2,Demented,2,759,F,R,76,14,3,28,0.5,1236,0.764,1.419 +OAS2_0160,OAS2_0160_MR1,Demented,1,0,M,R,76,12,,27,0.5,1557,0.705,1.127 +OAS2_0160,OAS2_0160_MR2,Demented,2,552,M,R,78,12,,29,1,1569,0.704,1.119 +OAS2_0161,OAS2_0161_MR1,Nondemented,1,0,M,R,77,16,1,29,0,1818,0.734,0.965 +OAS2_0161,OAS2_0161_MR2,Nondemented,2,454,M,R,79,16,1,30,0,1817,0.736,0.966 +OAS2_0161,OAS2_0161_MR3,Nondemented,3,1033,M,R,80,16,1,29,0,1830,0.724,0.959 +OAS2_0162,OAS2_0162_MR1,Demented,1,0,M,R,82,14,2,23,0.5,1514,0.678,1.159 +OAS2_0162,OAS2_0162_MR2,Demented,2,621,M,R,84,14,2,22,0.5,1550,0.665,1.132 +OAS2_0164,OAS2_0164_MR1,Demented,1,0,M,R,77,20,1,23,1,1713,0.756,1.024 +OAS2_0164,OAS2_0164_MR2,Demented,2,580,M,R,79,20,1,25,2,1710,0.760,1.026 +OAS2_0165,OAS2_0165_MR1,Demented,1,0,M,R,78,12,3,23,1,1491,0.710,1.177 +OAS2_0165,OAS2_0165_MR2,Demented,2,736,M,R,80,12,3,17,1,1755,0.696,1.000 +OAS2_0169,OAS2_0169_MR1,Nondemented,1,0,F,R,71,18,1,30,0,1426,0.731,1.231 +OAS2_0169,OAS2_0169_MR2,Nondemented,2,691,F,R,73,18,1,30,0,1414,0.739,1.241 +OAS2_0171,OAS2_0171_MR1,Nondemented,1,0,M,R,76,16,3,30,0,1832,0.769,0.958 +OAS2_0171,OAS2_0171_MR2,Nondemented,2,493,M,R,77,16,3,30,0,1820,0.768,0.964 +OAS2_0171,OAS2_0171_MR3,Nondemented,3,1695,M,R,81,16,3,30,0,1836,0.744,0.956 +OAS2_0172,OAS2_0172_MR1,Demented,1,0,M,R,75,16,1,30,0.5,1891,0.709,0.928 +OAS2_0172,OAS2_0172_MR2,Demented,2,1212,M,R,79,16,1,29,0.5,1899,0.700,0.924 +OAS2_0174,OAS2_0174_MR1,Nondemented,1,0,M,R,60,12,4,30,0,1379,0.806,1.273 +OAS2_0174,OAS2_0174_MR2,Nondemented,2,695,M,R,62,12,4,30,0,1378,0.795,1.274 
+OAS2_0174,OAS2_0174_MR3,Nondemented,3,1555,M,R,64,12,4,30,0,1370,0.794,1.281 +OAS2_0175,OAS2_0175_MR1,Demented,1,0,M,R,70,16,4,26,0.5,1796,0.742,0.977 +OAS2_0175,OAS2_0175_MR2,Demented,2,700,M,R,72,16,4,28,0.5,1796,0.732,0.977 +OAS2_0175,OAS2_0175_MR3,Demented,3,1343,M,R,73,16,4,28,0.5,1803,0.731,0.973 +OAS2_0176,OAS2_0176_MR1,Converted,1,0,M,R,84,16,2,30,0,1404,0.710,1.250 +OAS2_0176,OAS2_0176_MR2,Converted,2,774,M,R,87,16,2,30,0,1398,0.696,1.255 +OAS2_0176,OAS2_0176_MR3,Converted,3,1631,M,R,89,16,2,30,0.5,1408,0.679,1.246 +OAS2_0177,OAS2_0177_MR1,Nondemented,1,0,M,R,68,14,3,26,0,1444,0.778,1.216 +OAS2_0177,OAS2_0177_MR2,Nondemented,2,665,M,R,70,14,3,28,0,1510,0.770,1.162 +OAS2_0178,OAS2_0178_MR1,Nondemented,1,0,F,R,89,14,2,29,0,1509,0.756,1.163 +OAS2_0178,OAS2_0178_MR2,Nondemented,2,600,F,R,90,14,2,28,0,1495,0.746,1.174 +OAS2_0178,OAS2_0178_MR3,Nondemented,3,1447,F,R,93,14,2,30,0,1488,0.735,1.179 +OAS2_0179,OAS2_0179_MR1,Demented,1,0,M,R,79,20,1,26,0.5,1548,0.711,1.134 +OAS2_0179,OAS2_0179_MR2,Demented,2,652,M,R,81,20,1,26,0.5,1556,0.691,1.128 +OAS2_0181,OAS2_0181_MR1,Demented,1,0,F,R,74,12,,26,0.5,1171,0.733,1.499 +OAS2_0181,OAS2_0181_MR2,Demented,2,539,F,R,75,12,,,1,1169,0.742,1.501 +OAS2_0181,OAS2_0181_MR3,Demented,3,1107,F,R,77,12,,,1,1159,0.733,1.515 +OAS2_0182,OAS2_0182_MR1,Demented,1,0,M,R,73,12,,23,0.5,1661,0.698,1.056 +OAS2_0182,OAS2_0182_MR2,Demented,2,776,M,R,75,12,,20,0.5,1654,0.696,1.061 +OAS2_0183,OAS2_0183_MR1,Nondemented,1,0,F,R,66,13,2,30,0,1495,0.746,1.174 +OAS2_0183,OAS2_0183_MR2,Nondemented,2,182,F,R,66,13,2,30,0,1506,0.740,1.165 +OAS2_0183,OAS2_0183_MR3,Nondemented,3,732,F,R,68,13,2,30,0,1506,0.740,1.165 +OAS2_0183,OAS2_0183_MR4,Nondemented,4,2107,F,R,72,13,2,30,0,1510,0.723,1.162 +OAS2_0184,OAS2_0184_MR1,Demented,1,0,F,R,72,16,3,24,0.5,1354,0.733,1.296 +OAS2_0184,OAS2_0184_MR2,Demented,2,553,F,R,73,16,3,21,1,1351,0.708,1.299 +OAS2_0185,OAS2_0185_MR1,Demented,1,0,M,R,80,16,1,28,0.5,1704,0.711,1.030 
+OAS2_0185,OAS2_0185_MR2,Demented,2,842,M,R,82,16,1,28,0.5,1693,0.694,1.037 +OAS2_0185,OAS2_0185_MR3,Demented,3,2297,M,R,86,16,1,26,0.5,1688,0.675,1.040 +OAS2_0186,OAS2_0186_MR1,Nondemented,1,0,F,R,61,13,2,30,0,1319,0.801,1.331 +OAS2_0186,OAS2_0186_MR2,Nondemented,2,763,F,R,63,13,2,30,0,1327,0.796,1.323 +OAS2_0186,OAS2_0186_MR3,Nondemented,3,1608,F,R,65,13,2,30,0,1333,0.801,1.317 diff --git a/OAISIS_clean/oasis_cross-sectional-5708aa0a98d82080.csv b/OAISIS_clean/oasis_cross-sectional-5708aa0a98d82080.csv new file mode 100644 index 0000000000000000000000000000000000000000..19f68687bcb6a178974d40e2aa17a4d68c8fd775 --- /dev/null +++ b/OAISIS_clean/oasis_cross-sectional-5708aa0a98d82080.csv @@ -0,0 +1,437 @@ +ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay +OAS1_0001_MR1,F,R,74,2,3,29,0,1344,0.743,1.306,N/A +OAS1_0002_MR1,F,R,55,4,1,29,0,1147,0.81,1.531,N/A +OAS1_0003_MR1,F,R,73,4,3,27,0.5,1454,0.708,1.207,N/A +OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,N/A +OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,N/A +OAS1_0006_MR1,F,R,24,,,,,1131,0.862,1.551,N/A +OAS1_0007_MR1,M,R,21,,,,,1516,0.83,1.157,N/A +OAS1_0009_MR1,F,R,20,,,,,1505,0.843,1.166,N/A +OAS1_0010_MR1,M,R,74,5,2,30,0,1636,0.689,1.073,N/A +OAS1_0011_MR1,F,R,52,3,2,30,0,1321,0.827,1.329,N/A +OAS1_0012_MR1,M,R,30,,,,,1574,0.842,1.115,N/A +OAS1_0013_MR1,F,R,81,5,2,30,0,1664,0.679,1.055,N/A +OAS1_0014_MR1,F,R,19,,,,,1525,0.856,1.151,N/A +OAS1_0015_MR1,M,R,76,2,,28,0.5,1738,0.719,1.01,N/A +OAS1_0016_MR1,M,R,82,2,4,27,0.5,1477,0.739,1.188,N/A +OAS1_0017_MR1,M,R,21,,,,,1689,0.845,1.039,N/A +OAS1_0018_MR1,M,R,39,3,4,28,0,1636,0.813,1.073,N/A +OAS1_0019_MR1,F,R,89,5,1,30,0,1536,0.715,1.142,N/A +OAS1_0020_MR1,F,R,48,5,2,29,0,1326,0.785,1.323,N/A +OAS1_0021_MR1,F,R,80,3,3,23,0.5,1794,0.765,0.978,N/A +OAS1_0022_MR1,F,R,69,2,4,23,0.5,1447,0.757,1.213,N/A +OAS1_0023_MR1,M,R,82,2,3,27,0.5,1420,0.71,1.236,N/A +OAS1_0025_MR1,F,R,24,,,,,1240,0.893,1.415,N/A +OAS1_0026_MR1,F,R,58,5,1,30,0,1235,0.82,1.421,N/A 
+OAS1_0027_MR1,F,R,43,,,,,1194,0.834,1.47,N/A +OAS1_0028_MR1,F,R,86,2,4,27,1,1449,0.738,1.211,N/A +OAS1_0029_MR1,M,R,21,,,,,1653,0.858,1.062,N/A +OAS1_0030_MR1,F,R,65,2,3,29,0,1392,0.764,1.261,N/A +OAS1_0031_MR1,M,R,88,1,4,26,1,1419,0.674,1.236,N/A +OAS1_0032_MR1,M,R,89,4,1,28,0,1631,0.682,1.076,N/A +OAS1_0033_MR1,F,R,80,4,2,29,0,1323,0.735,1.326,N/A +OAS1_0034_MR1,M,R,51,5,1,29,0,1538,0.831,1.141,N/A +OAS1_0035_MR1,F,R,84,3,2,28,1,1402,0.695,1.252,N/A +OAS1_0037_MR1,M,R,27,,,,,1313,0.842,1.336,N/A +OAS1_0038_MR1,F,R,23,,,,,1443,0.839,1.216,N/A +OAS1_0039_MR1,M,R,70,4,3,29,0.5,1463,0.772,1.2,N/A +OAS1_0040_MR1,F,R,38,,,,,1244,0.824,1.411,N/A +OAS1_0041_MR1,F,R,62,2,,28,0.5,1350,0.758,1.3,N/A +OAS1_0042_MR1,M,R,80,4,2,29,0.5,1854,0.709,0.947,N/A +OAS1_0043_MR1,M,R,21,,,,,1511,0.846,1.162,N/A +OAS1_0044_MR1,F,R,47,4,2,30,0,1346,0.829,1.304,N/A +OAS1_0045_MR1,M,R,29,,,,,1590,0.829,1.104,N/A +OAS1_0046_MR1,M,R,64,2,,22,0.5,1351,0.787,1.299,N/A +OAS1_0047_MR1,F,R,57,,,,,1408,0.784,1.247,N/A +OAS1_0049_MR1,F,R,20,,,,,1329,0.887,1.321,N/A +OAS1_0050_MR1,F,R,48,,,,,1358,0.841,1.293,N/A +OAS1_0051_MR1,F,R,24,,,,,1567,0.835,1.12,N/A +OAS1_0052_MR1,F,R,78,1,5,23,1,1462,0.697,1.2,N/A +OAS1_0053_MR1,F,R,83,1,4,21,1,1384,0.699,1.268,N/A +OAS1_0054_MR1,F,R,21,,,,,1567,0.848,1.12,N/A +OAS1_0055_MR1,F,R,20,,,,,1432,0.831,1.226,N/A +OAS1_0056_MR1,F,R,72,3,3,15,1,1324,0.668,1.325,N/A +OAS1_0057_MR1,F,R,21,,,,,1333,0.862,1.317,N/A +OAS1_0058_MR1,F,R,46,5,1,30,0,1585,0.817,1.107,N/A +OAS1_0059_MR1,F,R,20,,,,,1396,0.827,1.257,N/A +OAS1_0060_MR1,M,R,79,4,,29,0.5,1564,0.734,1.122,N/A +OAS1_0061_MR1,F,R,20,,,,,1749,0.84,1.441,N/A +OAS1_0062_MR1,F,R,73,3,2,30,0,1456,0.754,1.205,N/A +OAS1_0063_MR1,M,R,48,,,,,1675,0.818,1.048,N/A +OAS1_0064_MR1,F,R,77,1,4,29,0,1583,0.767,1.108,N/A +OAS1_0065_MR1,M,R,90,2,3,25,0,1301,0.645,1.349,N/A +OAS1_0066_MR1,F,R,66,1,4,28,0.5,1309,0.765,1.341,N/A +OAS1_0067_MR1,F,R,71,4,1,27,1,1549,0.73,1.133,N/A +OAS1_0068_MR1,F,R,67,3,4,30,0,1508,0.805,1.164,N/A 
+OAS1_0069_MR1,M,R,33,4,1,30,0,1709,0.784,1.027,N/A +OAS1_0070_MR1,F,R,63,3,2,30,0,1327,0.801,1.323,N/A +OAS1_0071_MR1,F,R,49,5,1,30,0,1459,0.808,1.203,N/A +OAS1_0072_MR1,F,R,60,5,1,30,0,1402,0.823,1.252,N/A +OAS1_0073_MR1,F,R,69,2,4,21,1,1495,0.655,1.174,N/A +OAS1_0074_MR1,M,R,43,4,,30,0,1547,0.847,1.134,N/A +OAS1_0075_MR1,F,R,83,3,2,30,0,1335,0.72,1.314,N/A +OAS1_0076_MR1,F,R,18,,,,,1501,0.839,1.169,N/A +OAS1_0077_MR1,F,R,20,,,,,1537,0.852,1.142,N/A +OAS1_0078_MR1,F,R,64,3,2,30,0,1395,0.809,1.258,N/A +OAS1_0079_MR1,F,R,25,,,,,1522,0.826,1.153,N/A +OAS1_0080_MR1,F,R,25,,,,,1628,0.857,1.078,N/A +OAS1_0081_MR1,F,R,18,,,,,1309,0.857,1.341,N/A +OAS1_0082_MR1,F,R,75,2,3,28,0.5,1407,0.776,1.247,N/A +OAS1_0083_MR1,F,R,90,5,3,27,0,1200,0.727,1.462,N/A +OAS1_0084_MR1,F,R,81,2,,27,0.5,1453,0.727,1.208,N/A +OAS1_0085_MR1,F,R,70,2,3,29,0,1283,0.791,1.368,N/A +OAS1_0086_MR1,F,R,47,4,1,30,0,1311,0.835,1.339,N/A +OAS1_0087_MR1,F,R,21,,,,,1507,0.845,1.165,N/A +OAS1_0088_MR1,F,R,40,,,,,1557,0.865,1.127,N/A +OAS1_0090_MR1,M,R,20,,,,,1728,0.862,1.016,N/A +OAS1_0091_MR1,F,R,18,,,,,1701,0.834,1.032,N/A +OAS1_0092_MR1,M,R,22,,,,,1442,0.834,1.217,N/A +OAS1_0094_MR1,F,R,66,2,3,30,0.5,1447,0.772,1.213,N/A +OAS1_0095_MR1,M,R,28,,,,,1578,0.856,1.112,N/A +OAS1_0096_MR1,F,R,47,5,2,29,0,1357,0.809,1.294,N/A +OAS1_0097_MR1,M,R,23,,,,,1568,0.816,1.119,N/A +OAS1_0098_MR1,F,R,67,2,,18,0.5,1653,0.693,1.062,N/A +OAS1_0099_MR1,F,R,19,,,,,1484,0.878,1.183,N/A +OAS1_0101_MR1,M,R,29,,,,,1486,0.84,1.181,N/A +OAS1_0102_MR1,M,R,18,,,,,1542,0.85,1.138,N/A +OAS1_0103_MR1,F,R,19,,,,,1499,0.85,1.17,N/A +OAS1_0104_MR1,F,R,24,,,,,1447,0.841,1.213,N/A +OAS1_0105_MR1,M,R,20,,,,,1512,0.839,1.161,N/A +OAS1_0106_MR1,F,R,81,2,4,30,0,1230,0.717,1.427,N/A +OAS1_0107_MR1,M,R,20,,,,,1733,0.853,1.013,N/A +OAS1_0108_MR1,M,R,25,,,,,1825,0.854,0.962,N/A +OAS1_0109_MR1,F,R,61,4,3,30,0,1313,0.813,1.337,N/A +OAS1_0110_MR1,M,R,84,3,4,28,0,1483,0.697,1.183,N/A +OAS1_0111_MR1,M,R,23,,,,,1711,0.855,1.025,N/A 
+OAS1_0112_MR1,F,R,69,5,2,29,0,1536,0.733,1.143,N/A +OAS1_0113_MR1,F,R,83,2,2,29,0,1569,0.768,1.118,N/A +OAS1_0114_MR1,M,R,62,2,4,30,0,1378,0.804,1.274,N/A +OAS1_0115_MR1,M,R,72,5,1,26,0.5,1911,0.726,0.919,N/A +OAS1_0116_MR1,F,R,52,5,1,30,0,1373,0.784,1.279,N/A +OAS1_0117_MR1,M,R,25,,,,,1759,0.783,0.998,N/A +OAS1_0119_MR1,M,R,19,,,,,1502,0.838,1.169,N/A +OAS1_0120_MR1,M,R,70,4,4,26,0.5,1796,0.736,0.977,N/A +OAS1_0121_MR1,M,R,26,,,,,1684,0.82,1.042,N/A +OAS1_0122_MR1,F,R,83,5,2,22,1,1377,0.715,1.274,N/A +OAS1_0123_MR1,F,R,83,3,4,24,0.5,1282,0.797,1.369,N/A +OAS1_0124_MR1,M,R,73,2,,23,0.5,1661,0.709,1.056,N/A +OAS1_0125_MR1,F,R,22,,,,,1537,0.832,1.142,N/A +OAS1_0126_MR1,M,R,21,,,,,1582,0.885,1.11,N/A +OAS1_0127_MR1,M,R,30,,,,,1538,0.862,1.141,N/A +OAS1_0129_MR1,M,R,18,,,,,1514,0.846,1.159,N/A +OAS1_0130_MR1,M,R,68,3,3,26,0,1444,0.789,1.216,N/A +OAS1_0131_MR1,M,R,24,,,,,1637,0.824,1.072,N/A +OAS1_0132_MR1,M,R,22,,,,,1596,0.85,1.099,N/A +OAS1_0133_MR1,F,R,65,5,2,30,0,1277,0.814,1.374,N/A +OAS1_0134_MR1,M,R,80,2,4,20,1,1494,0.665,1.175,N/A +OAS1_0135_MR1,M,R,64,2,4,29,0,1561,0.801,1.124,N/A +OAS1_0136_MR1,F,R,24,,,,,1178,0.873,1.489,N/A +OAS1_0137_MR1,F,R,87,2,3,22,1,1499,0.672,1.171,N/A +OAS1_0138_MR1,M,R,80,2,4,28,0,1689,0.706,1.039,N/A +OAS1_0139_MR1,F,R,72,3,3,28,0,1512,0.779,1.161,N/A +OAS1_0140_MR1,F,R,23,,,,,1375,0.872,1.277,N/A +OAS1_0141_MR1,M,R,24,,,,,1523,0.846,1.152,N/A +OAS1_0142_MR1,M,R,70,4,1,27,0.5,1581,0.695,1.11,N/A +OAS1_0143_MR1,M,R,66,2,4,30,0.5,1446,0.784,1.214,N/A +OAS1_0144_MR1,M,R,22,,,,,1799,0.865,0.975,N/A +OAS1_0145_MR1,M,R,34,,,,,1653,0.831,1.062,N/A +OAS1_0146_MR1,F,R,82,5,1,28,0,1513,0.742,1.16,N/A +OAS1_0147_MR1,F,R,25,,,,,1663,0.845,1.055,N/A +OAS1_0148_MR1,M,R,23,,,,,1497,0.811,1.172,N/A +OAS1_0150_MR1,F,R,20,,,,,1510,0.875,1.162,N/A +OAS1_0151_MR1,F,R,25,,,,,1439,0.856,1.22,N/A +OAS1_0152_MR1,F,R,23,,,,,1471,0.83,1.193,N/A +OAS1_0153_MR1,M,R,23,,,,,1662,0.85,1.056,N/A +OAS1_0155_MR1,M,R,71,4,,28,0.5,1359,0.753,1.291,N/A 
+OAS1_0156_MR1,F,R,20,,,,,1591,0.834,1.103,N/A +OAS1_0157_MR1,F,R,86,4,3,30,0,1293,0.756,1.357,N/A +OAS1_0158_MR1,M,R,81,5,1,26,0.5,1556,0.689,1.128,N/A +OAS1_0159_MR1,F,R,40,,,,,1437,0.821,1.221,N/A +OAS1_0160_MR1,M,R,57,,,,,1745,0.813,1.006,N/A +OAS1_0161_MR1,F,R,84,2,2,27,0.5,1390,0.727,1.263,N/A +OAS1_0162_MR1,F,R,20,,,,,1219,0.872,1.44,N/A +OAS1_0163_MR1,F,R,18,,,,,1633,0.859,1.075,N/A +OAS1_0164_MR1,F,R,81,2,3,28,0.5,1495,0.687,1.174,N/A +OAS1_0165_MR1,F,R,74,2,3,29,0,1395,0.787,1.258,N/A +OAS1_0166_MR1,F,R,80,2,,27,0.5,1475,0.771,1.19,N/A +OAS1_0167_MR1,F,R,41,,,,,1361,0.849,1.289,N/A +OAS1_0168_MR1,F,R,50,,,,,1411,0.846,1.244,N/A +OAS1_0169_MR1,F,R,88,2,3,30,0,1445,0.718,1.215,N/A +OAS1_0170_MR1,M,R,71,2,4,29,0,1455,0.725,1.206,N/A +OAS1_0173_MR1,M,R,35,,,,,1475,0.829,1.19,N/A +OAS1_0174_MR1,M,R,23,,,,,1415,0.865,1.241,N/A +OAS1_0176_MR1,F,R,88,3,1,29,0,1398,0.712,1.255,N/A +OAS1_0177_MR1,F,R,54,4,1,30,0,1494,0.838,1.174,N/A +OAS1_0178_MR1,F,R,44,,,,,1272,0.853,1.38,N/A +OAS1_0179_MR1,F,R,87,2,4,21,0.5,1250,0.653,1.405,N/A +OAS1_0180_MR1,F,R,80,4,2,30,0,1496,0.745,1.173,N/A +OAS1_0181_MR1,F,R,49,4,2,30,0,1316,0.82,1.334,N/A +OAS1_0182_MR1,M,R,48,,,,,1561,0.816,1.124,N/A +OAS1_0183_MR1,M,R,44,,,,,1908,0.816,0.92,N/A +OAS1_0184_MR1,F,R,65,2,,16,1,1521,0.669,1.154,N/A +OAS1_0185_MR1,F,R,78,2,4,17,1,1314,0.739,1.336,N/A +OAS1_0186_MR1,M,R,84,5,1,29,0,1707,0.731,1.028,N/A +OAS1_0188_MR1,M,R,48,4,2,30,0,1464,0.79,1.199,N/A +OAS1_0189_MR1,M,R,22,,,,,1628,0.853,1.078,N/A +OAS1_0190_MR1,M,R,43,,,,,1561,0.813,1.124,N/A +OAS1_0191_MR1,F,R,21,,,,,1421,0.835,1.235,N/A +OAS1_0192_MR1,F,R,31,,,,,1294,0.839,1.357,N/A +OAS1_0193_MR1,F,R,23,,,,,1546,0.831,1.135,N/A +OAS1_0195_MR1,F,R,76,4,1,28,0,1346,0.766,1.304,N/A +OAS1_0197_MR1,F,R,89,3,3,29,0,1154,0.747,1.521,N/A +OAS1_0198_MR1,F,R,21,,,,,1332,0.852,1.317,N/A +OAS1_0199_MR1,M,R,69,5,1,30,0,1601,0.784,1.096,N/A +OAS1_0200_MR1,F,R,60,2,4,30,0,1366,0.807,1.285,N/A +OAS1_0201_MR1,F,R,85,4,1,26,0,1460,0.754,1.202,N/A 
+OAS1_0202_MR1,F,R,23,,,,,1574,0.865,1.115,N/A +OAS1_0203_MR1,F,R,71,2,3,30,0,1360,0.779,1.291,N/A +OAS1_0204_MR1,M,R,48,4,1,29,0,1430,0.797,1.227,N/A +OAS1_0205_MR1,M,R,75,4,1,30,0.5,1891,0.716,0.928,N/A +OAS1_0206_MR1,F,R,78,5,1,30,0,1243,0.747,1.412,N/A +OAS1_0207_MR1,M,R,51,5,2,29,0,1714,0.819,1.024,N/A +OAS1_0208_MR1,F,R,55,5,1,29,0,1368,0.823,1.283,N/A +OAS1_0209_MR1,F,R,22,,,,,1328,0.842,1.321,N/A +OAS1_0210_MR1,F,R,73,4,1,28,0.5,1676,0.722,1.047,N/A +OAS1_0211_MR1,M,R,20,,,,,1657,0.849,1.059,N/A +OAS1_0212_MR1,F,R,74,3,,28,0,1614,0.697,1.087,N/A +OAS1_0213_MR1,F,R,48,,,,,1332,0.801,1.318,N/A +OAS1_0214_MR1,M,R,18,,,,,1854,0.87,0.947,N/A +OAS1_0216_MR1,F,R,71,4,3,30,0,1503,0.792,1.168,N/A +OAS1_0217_MR1,F,R,78,4,3,27,0.5,1393,0.692,1.26,N/A +OAS1_0218_MR1,F,R,26,,,,,1291,0.843,1.36,N/A +OAS1_0220_MR1,F,R,75,5,1,30,0,1317,0.742,1.332,N/A +OAS1_0221_MR1,F,R,94,5,1,29,0,1474,0.696,1.19,N/A +OAS1_0222_MR1,F,R,49,,,,,1164,0.805,1.508,N/A +OAS1_0223_MR1,M,R,84,2,,20,1,1641,0.703,1.07,N/A +OAS1_0224_MR1,F,R,22,,,,,1378,0.852,1.274,N/A +OAS1_0226_MR1,M,R,90,1,4,23,0.5,1668,0.644,1.052,N/A +OAS1_0227_MR1,F,R,26,,,,,1288,0.777,1.362,N/A +OAS1_0228_MR1,F,R,81,3,2,28,0,1486,0.759,1.181,N/A +OAS1_0229_MR1,F,R,55,3,3,30,0,1327,0.832,1.323,N/A +OAS1_0230_MR1,F,R,19,,,,,1584,0.846,1.108,N/A +OAS1_0231_MR1,F,R,20,,,,,1429,0.852,1.228,N/A +OAS1_0232_MR1,M,R,22,,,,,1582,0.857,1.11,N/A +OAS1_0233_MR1,F,R,77,1,4,20,0.5,1376,0.701,1.275,N/A +OAS1_0234_MR1,M,R,75,5,2,29,0,1534,0.771,1.144,N/A +OAS1_0235_MR1,M,R,37,,,,,1407,0.842,1.247,N/A +OAS1_0236_MR1,F,R,20,,,,,1218,0.876,1.441,N/A +OAS1_0237_MR1,F,R,72,2,2,27,0,1322,0.764,1.328,N/A +OAS1_0238_MR1,F,R,77,2,3,28,0.5,1484,0.786,1.182,N/A +OAS1_0239_MR1,F,R,29,,,,,1439,0.823,1.22,N/A +OAS1_0240_MR1,F,R,74,2,,26,0.5,1171,0.736,1.499,N/A +OAS1_0241_MR1,F,R,74,5,2,30,0,1400,0.754,1.254,N/A +OAS1_0243_MR1,M,R,64,5,2,22,0.5,1547,0.742,1.134,N/A +OAS1_0244_MR1,F,R,80,4,2,29,0,1341,0.737,1.309,N/A 
+OAS1_0246_MR1,F,R,22,,,,,1522,0.841,1.153,N/A +OAS1_0247_MR1,M,R,90,2,3,21,0.5,1307,0.689,1.342,N/A +OAS1_0249_MR1,F,R,28,,,,,1217,0.871,1.443,N/A +OAS1_0250_MR1,M,R,21,,,,,1500,0.837,1.17,N/A +OAS1_0253_MR1,F,R,20,,,,,1751,0.852,1.002,N/A +OAS1_0254_MR1,F,R,85,5,1,29,0,1264,0.705,1.388,N/A +OAS1_0255_MR1,F,R,71,5,1,30,0,1426,0.737,1.231,N/A +OAS1_0256_MR1,M,R,70,5,1,30,0,1660,0.739,1.057,N/A +OAS1_0258_MR1,F,R,21,,,,,1516,0.87,1.158,N/A +OAS1_0259_MR1,F,R,78,3,2,29,0,1334,0.773,1.316,N/A +OAS1_0260_MR1,M,R,87,2,4,30,0,1762,0.719,0.996,N/A +OAS1_0261_MR1,M,R,28,,,,,1417,0.845,1.238,N/A +OAS1_0262_MR1,M,R,46,2,3,30,0,1604,0.784,1.094,N/A +OAS1_0263_MR1,M,R,79,4,1,30,0.5,1722,0.709,1.019,N/A +OAS1_0264_MR1,M,R,24,,,,,1591,0.849,1.103,N/A +OAS1_0265_MR1,F,R,54,,,,,1410,0.813,1.245,N/A +OAS1_0266_MR1,M,R,51,5,1,30,0,1793,0.834,0.979,N/A +OAS1_0267_MR1,M,R,80,5,2,28,0.5,1506,0.679,1.166,N/A +OAS1_0268_MR1,M,R,78,2,3,23,1,1491,0.715,1.177,N/A +OAS1_0269_MR1,F,R,72,1,4,21,1,1489,0.683,1.179,N/A +OAS1_0270_MR1,F,R,93,3,2,30,0,1272,0.703,1.38,N/A +OAS1_0271_MR1,F,R,89,2,4,27,0,1329,0.74,1.32,N/A +OAS1_0272_MR1,F,R,75,3,2,26,0.5,1355,0.745,1.296,N/A +OAS1_0273_MR1,F,R,89,1,4,18,0.5,1480,0.676,1.186,N/A +OAS1_0274_MR1,F,R,58,3,3,30,0,1373,0.815,1.278,N/A +OAS1_0275_MR1,M,R,50,,,,,1635,0.802,1.073,N/A +OAS1_0277_MR1,M,R,22,,,,,1913,0.841,0.917,N/A +OAS1_0278_MR1,F,R,96,5,1,26,1,1465,0.684,1.198,N/A +OAS1_0279_MR1,F,R,73,1,4,30,0,1475,0.721,1.19,N/A +OAS1_0280_MR1,F,R,78,5,1,30,0,1440,0.67,1.219,N/A +OAS1_0281_MR1,M,R,28,,,,,1538,0.835,1.141,N/A +OAS1_0282_MR1,F,R,45,,,,,1478,0.819,1.188,N/A +OAS1_0283_MR1,F,R,18,,,,,1578,0.836,1.112,N/A +OAS1_0284_MR1,F,R,91,5,2,30,0,1714,0.746,1.024,N/A +OAS1_0285_MR1,M,R,20,,,,,1470,0.843,1.194,N/A +OAS1_0286_MR1,F,R,83,3,2,20,0.5,1476,0.751,1.189,N/A +OAS1_0287_MR1,F,R,78,3,3,21,0.5,1194,0.694,1.47,N/A +OAS1_0288_MR1,M,R,71,2,4,20,0.5,1461,0.727,1.202,N/A +OAS1_0289_MR1,F,R,59,3,2,28,0,1334,0.767,1.316,N/A 
+OAS1_0290_MR1,M,R,83,3,2,26,0.5,1992,0.706,0.881,N/A +OAS1_0291_MR1,F,R,73,2,2,19,1,1274,0.745,1.377,N/A +OAS1_0292_MR1,F,R,64,4,2,30,0,1415,0.766,1.24,N/A +OAS1_0293_MR1,F,R,69,1,2,26,0,1384,0.783,1.268,N/A +OAS1_0294_MR1,F,R,20,,,,,1439,0.841,1.22,N/A +OAS1_0295_MR1,F,R,20,,,,,1412,0.803,1.243,N/A +OAS1_0296_MR1,F,R,28,,,,,1428,0.869,1.229,N/A +OAS1_0298_MR1,F,R,72,4,3,24,0.5,1354,0.738,1.296,N/A +OAS1_0299_MR1,F,R,90,2,3,29,0,1475,0.671,1.19,N/A +OAS1_0300_MR1,M,R,68,3,2,30,0.5,1556,0.723,1.128,N/A +OAS1_0301_MR1,F,R,90,3,2,28,0,1495,0.761,1.174,N/A +OAS1_0302_MR1,M,R,22,,,,,1570,0.831,1.118,N/A +OAS1_0303_MR1,F,R,67,2,4,30,0,1221,0.831,1.438,N/A +OAS1_0304_MR1,M,R,84,3,3,29,0.5,1497,0.693,1.172,N/A +OAS1_0305_MR1,M,R,48,,,,,1454,0.85,1.207,N/A +OAS1_0307_MR1,M,R,67,4,2,23,0.5,1399,0.735,1.255,N/A +OAS1_0308_MR1,F,R,78,3,3,15,2,1401,0.703,1.253,N/A +OAS1_0309_MR1,F,R,54,2,2,30,0,1441,0.786,1.218,N/A +OAS1_0310_MR1,F,R,20,,,,,1388,0.863,1.265,N/A +OAS1_0311_MR1,F,R,22,,,,,1366,0.83,1.285,N/A +OAS1_0312_MR1,F,R,73,3,,26,0.5,1311,0.756,1.339,N/A +OAS1_0313_MR1,F,R,20,,,,,1516,0.838,1.158,N/A +OAS1_0314_MR1,M,R,27,,,,,1720,0.84,1.02,N/A +OAS1_0315_MR1,M,R,77,5,1,25,0.5,1604,0.773,1.094,N/A +OAS1_0316_MR1,F,R,72,4,2,22,1,1493,0.69,1.176,N/A +OAS1_0317_MR1,M,R,86,4,1,26,0,1501,0.702,1.169,N/A +OAS1_0318_MR1,M,R,33,,,,,1634,0.836,1.074,N/A +OAS1_0319_MR1,M,R,31,,,,,1527,0.821,1.149,N/A +OAS1_0321_MR1,M,R,19,,,,,1478,0.843,1.187,N/A +OAS1_0322_MR1,F,R,65,3,4,29,0,1335,0.776,1.315,N/A +OAS1_0323_MR1,F,R,50,5,1,30,0,1370,0.826,1.281,N/A +OAS1_0325_MR1,F,R,27,,,,,1422,0.869,1.234,N/A +OAS1_0326_MR1,F,R,73,3,4,29,0,1272,0.7,1.38,N/A +OAS1_0327_MR1,M,R,50,,,,,1740,0.794,1.008,N/A +OAS1_0328_MR1,M,R,19,,,,,1453,0.878,1.208,N/A +OAS1_0329_MR1,F,R,80,2,3,29,0.5,1209,0.76,1.451,N/A +OAS1_0330_MR1,F,R,80,1,5,27,0,1381,0.752,1.27,N/A +OAS1_0331_MR1,F,R,54,,,,,1467,0.821,1.196,N/A +OAS1_0332_MR1,M,R,72,1,3,29,0,1734,0.762,1.012,N/A +OAS1_0333_MR1,M,R,26,,,,,1607,0.85,1.092,N/A 
+OAS1_0335_MR1,F,R,80,1,4,27,0.5,1654,0.678,1.061,N/A +OAS1_0336_MR1,F,R,41,,,,,1528,0.852,1.149,N/A +OAS1_0337_MR1,M,R,81,1,4,28,0,1750,0.676,1.003,N/A +OAS1_0338_MR1,M,R,77,4,1,29,0,1818,0.736,0.965,N/A +OAS1_0339_MR1,F,R,79,2,,24,0.5,1211,0.694,1.449,N/A +OAS1_0340_MR1,M,R,19,,,,,1650,0.853,1.063,N/A +OAS1_0341_MR1,F,R,71,2,4,30,0,1479,0.772,1.187,N/A +OAS1_0342_MR1,F,R,88,2,3,28,0,1370,0.765,1.281,N/A +OAS1_0343_MR1,M,R,68,3,3,30,0,1441,0.811,1.217,N/A +OAS1_0344_MR1,M,R,20,,,,,1510,0.851,1.163,N/A +OAS1_0345_MR1,F,R,54,4,2,30,0,1389,0.831,1.264,N/A +OAS1_0346_MR1,M,R,23,,,,,1485,0.843,1.181,N/A +OAS1_0348_MR1,F,R,22,,,,,1473,0.841,1.191,N/A +OAS1_0349_MR1,F,R,43,,,,,1227,0.858,1.43,N/A +OAS1_0350_MR1,M,R,21,,,,,1577,0.869,1.113,N/A +OAS1_0351_MR1,M,R,86,1,4,15,2,1512,0.665,1.161,N/A +OAS1_0352_MR1,F,R,81,5,2,26,0.5,1174,0.743,1.495,N/A +OAS1_0353_MR1,M,R,22,,,,,1680,0.8,1.044,N/A +OAS1_0354_MR1,M,R,74,1,3,26,0,1367,0.776,1.284,N/A +OAS1_0355_MR1,F,R,73,4,2,29,0,1123,0.79,1.563,N/A +OAS1_0356_MR1,F,R,68,3,2,30,0,1506,0.74,1.165,N/A +OAS1_0357_MR1,F,R,55,4,3,30,0,1450,0.82,1.21,N/A +OAS1_0358_MR1,M,R,65,3,3,29,0,1362,0.839,1.289,N/A +OAS1_0359_MR1,M,R,21,,,,,1714,0.864,1.024,N/A +OAS1_0361_MR1,M,R,20,,,,,1485,0.842,1.182,N/A +OAS1_0362_MR1,F,R,63,3,,14,0.5,1439,0.716,1.219,N/A +OAS1_0363_MR1,M,R,87,4,2,30,0,1398,0.702,1.255,N/A +OAS1_0365_MR1,M,R,74,5,2,30,0,1806,0.754,0.972,N/A +OAS1_0366_MR1,F,R,45,5,2,29,0,1549,0.813,1.133,N/A +OAS1_0367_MR1,F,R,46,2,2,28,0,1161,0.841,1.511,N/A +OAS1_0368_MR1,M,R,22,,,,,1572,0.856,1.116,N/A +OAS1_0369_MR1,F,R,73,4,1,28,0,1295,0.772,1.356,N/A +OAS1_0370_MR1,M,R,23,,,,,1734,0.847,1.012,N/A +OAS1_0371_MR1,F,R,70,3,4,30,0,1361,0.783,1.29,N/A +OAS1_0372_MR1,M,R,59,3,2,29,0,1596,0.817,1.1,N/A +OAS1_0373_MR1,F,R,80,3,2,20,1,1732,0.692,1.013,N/A +OAS1_0374_MR1,F,R,73,3,3,29,0.5,1238,0.76,1.418,N/A +OAS1_0375_MR1,M,R,46,,,,,1617,0.775,1.086,N/A +OAS1_0376_MR1,M,R,31,,,,,1579,0.817,1.111,N/A 
+OAS1_0377_MR1,M,R,25,,,,,1567,0.831,1.12,N/A +OAS1_0378_MR1,F,R,58,2,2,30,0,1418,0.821,1.238,N/A +OAS1_0379_MR1,F,R,20,,,,,1255,0.866,1.398,N/A +OAS1_0380_MR1,F,R,83,1,5,18,0.5,1313,0.705,1.337,N/A +OAS1_0381_MR1,M,R,59,4,2,29,0,1795,0.809,0.978,N/A +OAS1_0382_MR1,F,R,67,4,,15,1,1288,0.763,1.362,N/A +OAS1_0383_MR1,M,R,58,,,,,1590,0.746,1.104,N/A +OAS1_0384_MR1,F,R,38,,,,,1562,0.844,1.123,N/A +OAS1_0385_MR1,M,R,22,,,,,1643,0.841,1.068,N/A +OAS1_0386_MR1,F,R,26,,,,,1490,0.838,1.178,N/A +OAS1_0387_MR1,F,R,26,,,,,1149,0.851,1.527,N/A +OAS1_0388_MR1,F,R,77,2,4,22,1,1350,0.736,1.3,N/A +OAS1_0389_MR1,M,R,55,,,,,1678,0.782,1.046,N/A +OAS1_0390_MR1,M,R,69,2,2,24,0.5,1480,0.794,1.186,N/A +OAS1_0392_MR1,F,R,24,,,,,1441,0.848,1.218,N/A +OAS1_0394_MR1,F,R,22,,,,,1343,0.847,1.307,N/A +OAS1_0395_MR1,F,R,26,,,,,1295,0.834,1.356,N/A +OAS1_0396_MR1,M,R,25,,,,,1674,0.832,1.048,N/A +OAS1_0397_MR1,F,R,20,,,,,1265,0.846,1.387,N/A +OAS1_0398_MR1,M,R,71,5,1,30,0,1769,0.716,0.992,N/A +OAS1_0399_MR1,M,R,78,2,,29,1,1569,0.706,1.119,N/A +OAS1_0400_MR1,F,R,92,5,1,25,0.5,1774,0.644,0.989,N/A +OAS1_0401_MR1,F,R,54,4,3,29,0,1287,0.827,1.364,N/A +OAS1_0402_MR1,F,R,76,3,2,30,0.5,1350,0.763,1.3,N/A +OAS1_0403_MR1,M,R,19,,,,,1592,0.833,1.102,N/A +OAS1_0404_MR1,F,R,73,2,2,29,0,1465,0.776,1.198,N/A +OAS1_0405_MR1,M,R,77,5,1,23,1,1713,0.761,1.024,N/A +OAS1_0406_MR1,F,R,25,,,,,1346,0.855,1.303,N/A +OAS1_0407_MR1,F,R,55,,,,,1434,0.807,1.224,N/A +OAS1_0408_MR1,F,R,22,,,,,1518,0.861,1.156,N/A +OAS1_0409_MR1,M,R,34,,,,,1569,0.798,1.118,N/A +OAS1_0410_MR1,F,R,23,,,,,1507,0.87,1.165,N/A +OAS1_0411_MR1,F,R,71,5,1,29,0.5,1346,0.742,1.304,N/A +OAS1_0413_MR1,F,R,25,,,,,1447,0.866,1.213,N/A +OAS1_0415_MR1,F,R,21,,,,,1542,0.859,1.138,N/A +OAS1_0416_MR1,F,R,23,,,,,1567,0.852,1.12,N/A +OAS1_0417_MR1,F,R,30,,,,,1551,0.855,1.132,N/A +OAS1_0418_MR1,M,R,74,5,2,28,0.5,1659,0.747,1.058,N/A +OAS1_0419_MR1,F,R,21,,,,,1473,0.862,1.191,N/A +OAS1_0420_MR1,F,R,22,,,,,1732,0.848,1.013,N/A 
+OAS1_0421_MR1,F,R,22,,,,,1655,0.847,1.061,N/A +OAS1_0422_MR1,F,R,69,4,3,29,0,1380,0.809,1.272,N/A +OAS1_0423_MR1,M,R,75,2,4,28,0,1511,0.749,1.162,N/A +OAS1_0424_MR1,M,R,75,4,1,20,1,1613,0.715,1.088,N/A +OAS1_0425_MR1,F,R,78,1,4,23,1,1461,0.715,1.201,N/A +OAS1_0426_MR1,F,R,82,5,2,29,0,1316,0.791,1.334,N/A +OAS1_0428_MR1,F,R,84,4,3,28,0,1500,0.751,1.17,N/A +OAS1_0429_MR1,F,R,45,,,,,1385,0.808,1.267,N/A +OAS1_0430_MR1,M,R,71,4,1,17,1,1562,0.687,1.123,N/A +OAS1_0431_MR1,F,R,22,,,,,1405,0.822,1.249,N/A +OAS1_0432_MR1,F,R,72,2,4,26,0.5,1453,0.773,1.208,N/A +OAS1_0433_MR1,M,R,58,4,1,27,0,1606,0.779,1.093,N/A +OAS1_0434_MR1,F,R,50,,,,,1385,0.819,1.267,N/A +OAS1_0435_MR1,M,R,23,,,,,1766,0.82,0.994,N/A +OAS1_0437_MR1,F,R,22,,,,,1444,0.853,1.216,N/A +OAS1_0438_MR1,F,R,66,5,2,29,0,1191,0.787,1.474,N/A +OAS1_0439_MR1,M,R,21,,,,,1438,0.844,1.221,N/A +OAS1_0440_MR1,M,R,86,1,4,27,0.5,1320,0.723,1.329,N/A +OAS1_0441_MR1,M,R,81,5,1,29,0.5,1647,0.721,1.066,N/A +OAS1_0442_MR1,F,R,23,,,,,1431,0.847,1.227,N/A +OAS1_0443_MR1,F,R,52,3,3,30,0,1431,0.814,1.226,N/A +OAS1_0444_MR1,F,R,30,,,,,1250,0.86,1.404,N/A +OAS1_0445_MR1,F,R,90,1,2,29,0,1362,0.673,1.289,N/A +OAS1_0446_MR1,F,R,80,2,4,30,0,1390,0.748,1.263,N/A +OAS1_0447_MR1,F,R,92,4,1,24,0.5,1388,0.739,1.264,N/A +OAS1_0448_MR1,F,R,22,,,,,1524,0.858,1.152,N/A +OAS1_0449_MR1,F,R,71,3,4,29,0,1264,0.818,1.388,N/A +OAS1_0450_MR1,M,R,19,,,,,1478,0.88,1.188,N/A +OAS1_0451_MR1,M,R,73,5,3,27,0.5,1687,0.728,1.04,N/A +OAS1_0452_MR1,M,R,75,1,4,22,1,1656,0.762,1.06,N/A +OAS1_0453_MR1,F,R,70,1,4,29,0.5,1295,0.748,1.355,N/A +OAS1_0454_MR1,F,R,73,3,2,23,0.5,1536,0.73,1.142,N/A +OAS1_0455_MR1,F,R,61,2,4,28,0,1354,0.825,1.297,N/A +OAS1_0456_MR1,M,R,61,5,2,30,0,1637,0.78,1.072,N/A +OAS1_0457_MR1,F,R,62,3,3,26,0,1372,0.766,1.279,N/A +OAS1_0061_MR2,F,R,20,,,,,1757,0.845,0.999,1 +OAS1_0080_MR2,F,R,25,,,,,1605,0.841,1.093,20 +OAS1_0092_MR2,M,R,22,,,,,1457,0.835,1.205,5 +OAS1_0101_MR2,M,R,29,,,,,1501,0.835,1.169,64 +OAS1_0111_MR2,M,R,23,,,,,1714,0.861,1.024,2 
def read_table(file_path, split_str=';'):
    '''
    Load a tabular metadata file into a pandas DataFrame.

    The file is first parsed as an Excel workbook (openpyxl engine). If that
    fails for any ordinary reason (not an Excel file, openpyxl not installed,
    corrupt archive, ...), it is parsed as delimited text instead.

    Args:
        file_path: path of the table to read.
        split_str: column separator used for the delimited-text fallback.

    Returns:
        The parsed pandas DataFrame.

    Raises:
        Whatever ``pd.read_csv`` raises when the fallback also fails
        (e.g. ``FileNotFoundError`` for a missing file).
    '''
    try:
        return pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # read_excel fails with many unrelated exception types (ValueError,
        # BadZipFile, ImportError, ...), so the catch is deliberately wide.
        # It is `Exception` rather than a bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate.
        return pd.read_csv(file_path, sep=split_str)
def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
    '''
    Build a resampler that brings an image to isotropic spacing.

    The smallest spacing over all axes becomes the spacing of every axis, and
    the size of each axis is scaled so the physical extent stays the same.
    For example, spacing [0.1, 0.1, 0.3] becomes [0.1, 0.1, 0.1] and the third
    axis gets three times as many samples.

    Args:
        ref_img: reference SimpleITK image; supplies origin, direction and
            pixel type, and size/spacing when those are not given.
        interpolator: 'linear' (images) or 'nearest' (segmentation masks).
            Any non-string value is forwarded to SimpleITK unchanged.
        spacing: per-axis spacing; defaults to ``ref_img.GetSpacing()``.
        size: per-axis size; defaults to ``ref_img.GetSize()``.

    Returns:
        A configured ``sitk.ResampleImageFilter``, or ``None`` when the
        spacing is already isotropic and no resampling is needed.

    Raises:
        ValueError: if ``interpolator`` is a string other than 'linear' or
            'nearest'.
    '''
    if size is None:
        size = ref_img.GetSize()
    if spacing is None:
        spacing = ref_img.GetSpacing()

    min_spacing = min(spacing)
    if all(spc == min_spacing for spc in spacing):
        # Already isotropic: the caller can skip resampling.
        return None

    if isinstance(interpolator, str):
        if interpolator == 'nearest':
            interpolator = sitk.sitkNearestNeighbor
        elif interpolator == 'linear':
            interpolator = sitk.sitkLinear
        else:
            # Fail early with a clear message instead of letting an unknown
            # string reach SetInterpolator.
            raise ValueError(
                f"interpolator {interpolator!r} is not valid; use 'linear' or 'nearest'"
            )

    # Scale each axis so that size * spacing (the physical extent) is kept.
    # Built for however many axes were supplied, so 2-D input works too.
    new_size = [
        int(round(old_sz * old_spc / min_spacing))
        for old_sz, old_spc in zip(size, spacing)
    ]
    new_spacing = [min_spacing] * len(new_size)

    resampler = sitk.ResampleImageFilter()
    resampler.SetSize(new_size)
    resampler.SetOutputSpacing(new_spacing)
    resampler.SetOutputOrigin(ref_img.GetOrigin())
    resampler.SetOutputDirection(ref_img.GetDirection())
    resampler.SetInterpolator(interpolator)
    # NOTE(review): this passes the pixel *type id* as the fill value for
    # samples outside the input; kept as-is for compatibility, but it looks
    # like a real background value (e.g. 0) was intended -- confirm.
    resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue())
    resampler.SetOutputPixelType(ref_img.GetPixelID())
    return resampler
abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'], + 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'], + 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'], + 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'], + 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'], + 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'], + 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'], + 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'], + 'neck-thorax': ['neck-thorax', 
'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'], + 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'], + 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'], + 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'], + 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'], + 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'], + 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'], + 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'], + 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 
'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'], + 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'], + 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit''clavicle', 'scapula', 'acromion', 'acromioclavicular'], + 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'], + 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',], + 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'], + } + elif dict_type == 'Label_tissue': + dict_synonyms = { + 'liver': ['liver','hepatic'], + 'spleen': ['spleen','splenic'], + 'kidney': ['kidney','renal'], + 'pancreas': ['pancreas','pancreatic'], + 'stomach': ['stomach','gastric'], + 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'], + 'gallbladder': ['gallbladder'], + 'adrenal_gland': ['adrenal_gland','adrenal gland'], + 'bladder': ['bladder'], + 'prostate': ['prostate'], + 'uterus': ['uterus'], + 'ovary': ['ovary'], + 'testicle': ['testicle'], + 'lymph_node': ['lymph_node','lymph node'], + 'bone': ['bone'], + 'lung': ['lung'], + 'heart': ['heart'], + 'esophagus': ['esophagus'], + 'muscle': ['muscle'], + 'fat': ['fat'], + 'skin': ['skin'], + 'vessel': ['vessel'], + 'tumor': ['tumor'], + 'other': ['other'] + } + elif dict_type == 'Task': + dict_synonyms = { + 'segmentation': ['segmentation', 'seg', 'mask'], + 'classification': 
def replace_synonyms(text, dict_synonyms):
    '''
    Replace synonyms in ``text`` with their standard (canonical) term.

    ``dict_synonyms`` maps each canonical term to a list of synonyms.
    Matching is case-insensitive and tried in three steps, each walking the
    dictionary in its own order:

    1. ``text`` equals a canonical term;
    2. ``text`` equals one of the synonyms;
    3. one of the synonyms is a substring of ``text`` (first hit wins).

    Steps 1 and 2 stop a short synonym of an earlier entry from hijacking an
    exact value (e.g. 'ct' is a substring of 'spect').

    :param text: a string, a list (converted element-wise) or a dict
        (keys and values are both converted); anything else is returned as is.
    :param dict_synonyms: mapping ``canonical term -> list of synonyms``.
    :return: the converted value. A string without any match is returned
        unchanged after a ``UserWarning``. Lists and dicts are returned as new
        objects; if two dict keys map to the same canonical term the later
        one wins.
    '''
    # Function-scope import: the enclosing module does not import ``warnings``.
    import warnings

    if isinstance(text, str):
        lowered = text.lower()
        for key in dict_synonyms:
            if str(key).lower() == lowered:
                return key
        for key, synonyms in dict_synonyms.items():
            if any(syn.lower() == lowered for syn in synonyms):
                return key
        for key, synonyms in dict_synonyms.items():
            if any(syn.lower() in lowered for syn in synonyms):
                return key
        # The original built a Warning object and discarded it, so unknown
        # values went unnoticed; actually emit the warning.
        warnings.warn(f"Value {text} is not in the correct format")
        return text
    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]
    if isinstance(text, dict):
        # Build a new dict: the original mutated the dict while iterating it
        # and used the (unhashable) synonym list as the new key.
        return {
            replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }
    return text
class meta_data(object):
    '''
    Collects and validates the metadata record of one dataset item.

    The allowed keys and their types come from ``config_format.json`` located
    next to this module. ``self.config`` holds the values accepted so far;
    required keys start out as ``{}``, which marks them as "not filled yet".
    '''
    def __init__(self):
        self.config_format_path = os.path.join(os.path.dirname(__file__), 'config_format.json')
        with open(self.config_format_path, 'r') as file:
            self.config_format = json.load(file)
        # Required keys get an empty-dict placeholder that req_check() looks for.
        self.config = {
            key: {} for key, spec in self.config_format.items() if spec['required'] is True
        }
        self.keytypes = self.find_all_keys_with_type()
        self.keytypes_flatten = self.flatten_json()
        self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
        for key in self.ambiguity_keys:
            ambiguity_dict = get_synonyms_dict(key)
            # NOTE(review): type_check() reads ['items']['options'] for
            # 'Label_tissue' and 'Task', so this top-level override only takes
            # effect for 'ROI' and 'Modality' -- confirm whether that is intended.
            self.config_format[key]['options'] = list(ambiguity_dict.keys())

    def get_ketytypes(self):
        '''Return the ``key path -> type`` mapping (historical, misspelled name).'''
        return self.keytypes

    def get_keytypes(self):
        '''Correctly spelled alias of :meth:`get_ketytypes`.'''
        return self.keytypes

    def get_keytypes_flatten(self):
        '''Return the flattened view of the format specification.'''
        return self.keytypes_flatten

    def find_all_keys_with_type(self, data=None, parent_key=''):
        '''
        Map every (nested) key whose value is a dict containing ``'type'``
        to that type, using dotted paths such as ``'Task.items'``.
        '''
        if data is None:
            data = self.config_format
        keys_with_type = {}
        if isinstance(data, dict):
            for key, value in data.items():
                full_key = f"{parent_key}.{key}" if parent_key else key
                if isinstance(value, dict) and 'type' in value:
                    keys_with_type[full_key] = value['type']
                # Only containers can hold further typed keys. Skipping
                # scalars also keeps a JSON null from restarting at the root.
                if isinstance(value, (dict, list)):
                    keys_with_type.update(self.find_all_keys_with_type(value, full_key))
        elif isinstance(data, list):
            for index, item in enumerate(data):
                if isinstance(item, (dict, list)):
                    keys_with_type.update(
                        self.find_all_keys_with_type(item, f"{parent_key}[{index}]"))
        return keys_with_type

    def flatten_json(self, data=None, parent_key='', sep='.'):
        '''
        Flatten nested dicts/lists into ``{'a.b[0]': leaf}``.

        Scalar list items (e.g. the entries of an ``options`` list) are kept;
        the previous implementation silently dropped them.
        '''
        if data is None:
            data = self.config_format
        items = {}
        if isinstance(data, dict):
            for key, value in data.items():
                new_key = f"{parent_key}{sep}{key}" if parent_key else key
                if isinstance(value, (dict, list)):
                    items.update(self.flatten_json(value, new_key, sep=sep))
                else:
                    items[new_key] = value
        elif isinstance(data, list):
            for i, item in enumerate(data):
                item_key = f"{parent_key}[{i}]"
                if isinstance(item, (dict, list)):
                    items.update(self.flatten_json(item, item_key, sep=sep))
                else:
                    items[item_key] = item
        return items

    def req_check(self):
        '''Return True when no required key still holds its ``{}`` placeholder.'''
        self.unfilled_keys = [key for key, value in self.config.items() if value == {}]
        return not self.unfilled_keys

    def type_check(self, key, value):
        '''
        Validate ``value`` for ``key`` against the format specification.

        Always returns a bool (the previous version returned ``None`` on
        several paths, e.g. a non-float ``Spacing_mm`` or a ``Task`` that is
        not a list).
        '''
        if key not in self.config_format:
            print(key, "is not a valid key")
            return False
        spec = self.config_format[key]

        if key in ('Modality', 'ROI'):
            return value in spec['options']

        if key in ('OriImg_path', 'Dataset_name'):
            return isinstance(value, str)

        if key == 'Label_path':
            if not isinstance(value, dict):
                return False
            for task_name, targets in value.items():
                # Unknown task names are rejected. A non-dict entry used to
                # raise TypeError while being indexed; reject it instead.
                if task_name not in spec['keys'] or not isinstance(targets, dict):
                    return False
                # NOTE(review): the leaf values (label paths) are not
                # validated, matching the previous behaviour -- confirm.
            return True

        if key in ('Label_tissue', 'Task'):
            if not isinstance(value, list):
                return False
            allowed = spec['items']['options']
            return all(item in allowed for item in value)

        if key == 'Spacing_mm':
            return isinstance(value, float)

        if key == 'Size':
            return (isinstance(value, list) and len(value) >= 3
                    and all(isinstance(item, int) for item in value))

        if key in ('Sub_modality', 'Label_Dict'):
            return isinstance(value, dict)

        return False

    def add_extra_keyvalue(self, key, value):
        '''Store ``value`` under ``key`` without any validation.'''
        self.config[key] = value
        return True

    def add_keyvalue(self, key, value):
        '''
        Normalise (for the ambiguity keys), validate and store ``value``.

        :return: True when the value was stored, False (with a warning) otherwise.
        '''
        # Function-scope import: the enclosing module does not import ``warnings``.
        import warnings

        if key in self.ambiguity_keys:
            value = replace_synonyms(value, get_synonyms_dict(key))
        if self.type_check(key, value):
            self.config[key] = value
            return True
        # The original built a Warning object and discarded it.
        warnings.warn(f"Value {value} is not in the correct format for key {key}")
        return False

    def get_meta_data(self):
        '''Return the collected metadata, or False when a required key is unfilled.'''
        if self.req_check():
            return self.config
        print("Not all required keys are filled", self.unfilled_keys)
        return False


if __name__ == '__main__':
    meta = meta_data()
    print(meta.get_keytypes_flatten())
    print(meta.get_ketytypes())
    meta.add_keyvalue('Modality', 'CT')
    meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
    meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
    meta.add_keyvalue('Spacing_mm', 1.5)
    meta.add_keyvalue('Size', [512, 512, 100])
    meta.add_keyvalue('Dataset_name', 'CT')
    meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
    meta.add_keyvalue('Task', ['1', '2', '3'])
    print(meta.get_meta_data())
    # Was ``meta.add_extra_key`` -- no such method exists (AttributeError).
    meta.add_extra_keyvalue('extra', 'extra')
    print(meta.get_meta_data())
    print(meta.get_ketytypes())
    # Was printed without being called.
    print(meta.get_keytypes_flatten())

    org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
    img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
    print(img_paths)
"dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/OAI_ZIB_clean/dataclean_OAI_ZIB.py b/OAI_ZIB_clean/dataclean_OAI_ZIB.py new file mode 100644 index 0000000000000000000000000000000000000000..7853d117136bda3679d42c490e579302b961c8ce --- /dev/null +++ b/OAI_ZIB_clean/dataclean_OAI_ZIB.py @@ -0,0 +1,309 @@ +#coding:utf-8 +''' +OAI-ZIB Dataset Processing Script +create on 2026-03-05 + +OAI-ZIB: Osteoarthritis Initiative dataset curated by ZIB (Zuse Institute Berlin). +Contains RIGHT knee MRI scans and corresponding segmentation labelmaps for 507 +subjects, split into train (253) and test (254) sets. + +All images are RIGHT knee (confirmed via OAIZIB-CM kneeSideInfo.csv). 
def resample_to_isotropic(sitk_img, target_spacing=TARGET_SPACING, interpolator=sitk.sitkLinear):
    """Return ``sitk_img`` resampled onto a grid with ``target_spacing``.

    The output keeps the origin and direction of the input. Each axis length
    is rescaled by ``input spacing / target spacing`` and rounded to the
    nearest integer. Voxels that fall outside the input are filled with 0.

    Args:
        sitk_img: image to resample.
        target_spacing: per-axis output spacing.
        interpolator: SimpleITK interpolator constant; pass
            ``sitk.sitkNearestNeighbor`` for label maps.
    """
    in_size = sitk_img.GetSize()
    in_spacing = sitk_img.GetSpacing()

    out_size = []
    for n_voxels, old_step, new_step in zip(in_size, in_spacing, target_spacing):
        out_size.append(int(round(n_voxels * old_step / new_step)))

    resample_filter = sitk.ResampleImageFilter()
    # Output grid geometry.
    resample_filter.SetSize(out_size)
    resample_filter.SetOutputSpacing(target_spacing)
    resample_filter.SetOutputOrigin(sitk_img.GetOrigin())
    resample_filter.SetOutputDirection(sitk_img.GetDirection())
    # Sampling behaviour; a default-constructed Transform is passed, so no
    # extra spatial transform is configured here.
    resample_filter.SetTransform(sitk.Transform())
    resample_filter.SetInterpolator(interpolator)
    resample_filter.SetDefaultPixelValue(0)

    return resample_filter.Execute(sitk_img)
def _clean_field(row, name):
    """Return ``row[name]`` stripped of double quotes; '' when missing or None."""
    value = row.get(name)
    # csv.DictReader pads short rows with None, which has no .strip().
    return '' if value is None else value.strip('"')


def load_nonimaging_table(filepath):
    """Load a tab-delimited nonimaging .txt file as a list of row dicts.

    The first line is the header; the line after it is a human-readable
    description row and is skipped.
    """
    # newline='' is what the csv module asks for so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(filepath, 'r', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t', quotechar='"')
        next(reader, None)  # discard the description row
        return list(reader)


def build_subject_lookup(rows, key='src_subject_id', visit_filter=None):
    """Index ``rows`` by subject ID, keeping the first row seen per subject.

    Args:
        rows: row dicts as returned by ``load_nonimaging_table``.
        key: column holding the subject ID.
        visit_filter: when set, only rows whose ``visit`` column equals it
            are considered.
    """
    lookup = {}
    for row in rows:
        if visit_filter and _clean_field(row, 'visit') != visit_filter:
            continue
        lookup.setdefault(_clean_field(row, key), row)
    return lookup


def load_all_nonimaging(nonimaging_dir):
    """Load and index the nonimaging tables found in ``nonimaging_dir``.

    Returns a dict that may contain (a table is omitted when its file is
    missing):
        'enrollee': subject -> enrollee01 row of visit V00
        'oscf':     subject -> oscf01 row with a BMI value (V00 preferred)
        'kl_grade': subject -> ``xrkl`` value of visit V00 with side == '1'
        'womac':    subject -> womac01 row of visit V00
    """
    tables = {}

    # enrollee01: demographics (baseline visit only)
    fp = os.path.join(nonimaging_dir, 'enrollee01.txt')
    if os.path.isfile(fp):
        tables['enrollee'] = build_subject_lookup(load_nonimaging_table(fp), visit_filter='V00')

    # oscf01: BMI/height/weight; rows without BMI are useless and skipped
    fp = os.path.join(nonimaging_dir, 'oscf01.txt')
    if os.path.isfile(fp):
        oscf_lookup = {}
        for row in load_nonimaging_table(fp):
            if not _clean_field(row, 'bmi'):
                continue
            sid = _clean_field(row, 'src_subject_id')
            # First row wins unless a V00 row shows up, which replaces it.
            if sid not in oscf_lookup or _clean_field(row, 'visit') == 'V00':
                oscf_lookup[sid] = row
        tables['oscf'] = oscf_lookup

    # kxrsq01: KL grade, baseline visit, side '1' (the right knee per the
    # module header)
    fp = os.path.join(nonimaging_dir, 'kxrsq01.txt')
    if os.path.isfile(fp):
        kl_lookup = {}
        for row in load_nonimaging_table(fp):
            if _clean_field(row, 'visit') != 'V00' or _clean_field(row, 'side') != '1':
                continue
            kl_lookup.setdefault(_clean_field(row, 'src_subject_id'), _clean_field(row, 'xrkl'))
        tables['kl_grade'] = kl_lookup

    # womac01: WOMAC scores (baseline visit only)
    fp = os.path.join(nonimaging_dir, 'womac01.txt')
    if os.path.isfile(fp):
        tables['womac'] = build_subject_lookup(load_nonimaging_table(fp), visit_filter='V00')

    return tables
def get_subject_metadata(subject_id, tables):
    """Collect the per-subject metadata from the preloaded nonimaging tables.

    Args:
        subject_id: subject ID used as key in every table.
        tables: dict with the optional entries 'enrollee', 'oscf',
            'kl_grade' and 'womac', each keyed by subject ID.

    Returns:
        dict that always contains 'Knee_Side' ('right') plus the fields of
        every table that has an entry for the subject. Missing or empty
        columns yield ''.
    """
    def clean(row, column):
        # csv.DictReader fills short rows with None, which has no .strip().
        value = row.get(column)
        return '' if value is None else value.strip('"')

    info = {'Knee_Side': 'right'}

    # Demographics from enrollee01
    enrollee = tables.get('enrollee', {}).get(subject_id, {})
    if enrollee:
        info['Age'] = clean(enrollee, 'ageyears')
        info['Gender'] = clean(enrollee, 'gender')
        info['Race'] = clean(enrollee, 'race')
        info['Ethnicity'] = clean(enrollee, 'ethnicity')
        info['Cohort'] = clean(enrollee, 'e_cohort')

    # Body measurements from oscf01
    oscf = tables.get('oscf', {}).get(subject_id, {})
    if oscf:
        info['BMI'] = clean(oscf, 'bmi')
        info['Height_mm'] = clean(oscf, 'height_av')
        info['Weight_kg'] = clean(oscf, 'weight_met')

    # KL grade: the table stores the value itself, not a row
    kl = tables.get('kl_grade', {}).get(subject_id)
    if kl is not None:
        info['KL_Grade'] = kl

    # WOMAC scores from womac01
    womac = tables.get('womac', {}).get(subject_id, {})
    if womac:
        info['WOMAC_Pain'] = clean(womac, 'womkpr')
        info['WOMAC_ADL'] = clean(womac, 'womadlr')
        info['WOMAC_Stiffness'] = clean(womac, 'womtsr')

    return info
def _process_subject(image_path, subject_id, split, label_dir, split_output_dir,
                     nonimaging_dir, tables):
    """Resample one subject, save image/label and return (output path, metadata)."""
    filename = os.path.basename(image_path)

    # Read original image
    sitk_img = sitk.ReadImage(image_path)
    original_size = list(sitk_img.GetSize())
    original_spacing = list(sitk_img.GetSpacing())

    # Resample to isotropic
    sitk_img_iso = resample_to_isotropic(sitk_img, TARGET_SPACING, sitk.sitkLinear)
    resampled_size = list(sitk_img_iso.GetSize())
    resampled_spacing = list(sitk_img_iso.GetSpacing())

    # Build metadata (use resampled size/spacing)
    meta = meta_data()
    meta.add_keyvalue('Modality', 'MRI')
    meta.add_keyvalue('OriImg_path', image_path)
    meta.add_keyvalue('Spacing_mm', min(resampled_spacing))
    meta.add_keyvalue('Size', resampled_size)
    meta.add_keyvalue('Dataset_name', 'OAI_ZIB')
    meta.add_keyvalue('ROI', 'leg')
    meta.add_keyvalue('Label_Dict', LABEL_DICT)

    output_subject_dir = os.path.join(split_output_dir, subject_id)
    output_image_file = os.path.join(output_subject_dir, f"{subject_id}.nii.gz")
    util.save_nifti(sitk_img_iso, output_image_file, image_path)

    # Labels are resampled with nearest-neighbour so label values stay discrete.
    label_path = os.path.join(label_dir, filename)
    if os.path.isfile(label_path):
        sitk_lbl = sitk.ReadImage(label_path)
        sitk_lbl_iso = resample_to_isotropic(sitk_lbl, TARGET_SPACING, sitk.sitkNearestNeighbor)
        process_label_dir = os.path.join(output_subject_dir, 'segmentation')
        processed_lbl_path = os.path.join(process_label_dir, f"{subject_id}.nii.gz")
        os.makedirs(process_label_dir, exist_ok=True)
        util.save_nifti(sitk_lbl_iso, processed_lbl_path, label_path)

        # 'Task' is validated as a *list* of task names; the bare string
        # passed before failed validation, so 'Task' was never recorded.
        meta.add_keyvalue('Task', [TASK_VALUE])
        meta.add_keyvalue('Label_path', {TASK_VALUE: {'knee': processed_lbl_path}})

    print(f" {subject_id}: {original_size} @ {[f'{s:.3f}' for s in original_spacing]} -> {resampled_size} @ {[f'{s:.3f}' for s in resampled_spacing]}")

    # Extra metadata from the nonimaging tables
    extra_info = {
        'split': split,
        'Image_id': subject_id,
        'nonimaging_dir': nonimaging_dir,
    }
    extra_info.update(get_subject_metadata(subject_id, tables))
    meta.add_extra_keyvalue('Metadata', extra_info)

    metadata = meta.get_meta_data()
    if metadata is False:
        # get_meta_data() signals unfilled required keys with False; do not
        # let that placeholder end up in the mapping file.
        raise ValueError(f"incomplete metadata, missing keys: {meta.unfilled_keys}")
    return output_image_file, metadata


def main(target_path, output_dir):
    """Process the OAI-ZIB dataset found under ``target_path``.

    For each of the 'train' and 'test' splits every ``images/<split>/*.nii.gz``
    is resampled to ``TARGET_SPACING`` and written to
    ``<output_dir>/<split>/<subject>/``. The matching label map, if present,
    is resampled and written to the ``segmentation`` sub-folder.
    ``<output_dir>/<split>/nifti_mappings.json`` maps each written image to
    its metadata and is rewritten after every subject, so an interrupted run
    keeps what was finished. Subjects that raise are collected in
    ``<output_dir>/failed_files.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    failed_files = []

    # Load nonimaging metadata
    nonimaging_dir = os.path.join(target_path, 'nonimaging', 'NonImaging')
    print("Loading nonimaging metadata...")
    tables = load_all_nonimaging(nonimaging_dir)
    print(f" enrollee: {len(tables.get('enrollee', {}))} subjects")
    print(f" oscf (BMI): {len(tables.get('oscf', {}))} subjects")
    print(f" kl_grade (right): {len(tables.get('kl_grade', {}))} subjects")
    print(f" womac: {len(tables.get('womac', {}))} subjects")

    for split in ['train', 'test']:
        image_dir = os.path.join(target_path, 'images', split)
        label_dir = os.path.join(target_path, 'labels', split)

        if not os.path.isdir(image_dir):
            print(f"Image directory not found: {image_dir}")
            continue

        split_output_dir = os.path.join(output_dir, split)
        os.makedirs(split_output_dir, exist_ok=True)

        json_output_path = os.path.join(split_output_dir, 'nifti_mappings.json')
        # Start from an empty mapping file; it is kept in memory afterwards
        # instead of being re-read for every subject.
        mappings = {}
        with open(json_output_path, 'w') as json_file:
            json.dump(mappings, json_file)

        image_files = sorted(glob.glob(os.path.join(image_dir, '*.nii.gz')))
        print(f"\nProcessing {split} split: {len(image_files)} subjects -> {split_output_dir}")

        for image_path in tqdm(image_files, desc=f"Processing {split}"):
            subject_id = os.path.basename(image_path).replace('.nii.gz', '')  # e.g. 9002817
            try:
                output_image_file, metadata = _process_subject(
                    image_path, subject_id, split, label_dir, split_output_dir,
                    nonimaging_dir, tables)
                mappings[output_image_file] = metadata
                with open(json_output_path, 'w') as json_file:
                    json.dump(mappings, json_file, indent=4)
            except Exception as e:
                # Best effort: one broken subject must not stop the batch.
                print(f" Failed {subject_id}: {e}")
                failed_files.append(subject_id)
                continue

    # Save failed files
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"\nDone. Failed files ({len(failed_files)}): {failed_files_path}")
def load_dicom_images(folder_path):
    '''Read the DICOM series found in ``folder_path`` and return it as one image.'''
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_reader.GetGDCMSeriesFileNames(folder_path))
    return series_reader.Execute()


def convert_windows_to_linux_path(windows_path):
    '''
    Turn a Windows path into a Linux-style one.

    Backslashes become forward slashes and everything up to and including
    the first ':' (the drive letter) is removed. Some meta files carry
    Windows paths although the data lives on a Linux server.
    '''
    linux_path = windows_path.replace('\\', '/')
    if ':' in linux_path:
        linux_path = linux_path.split(':', 1)[1]
    return linux_path

# =============================================================================
# ========================developed with TotalSegmentor========================
# =============================================================================

def read_table(file_path, split_str=';'):
    '''
    Read a table as a pandas DataFrame.

    The file is first read as an Excel workbook; if that fails for any
    reason it is read as delimited text using ``split_str`` as separator.

    :raises ImportError: when pandas is not installed (the module-level
        import leaves ``pd`` as None in that case).
    '''
    if pd is None:
        raise ImportError("pandas is required by read_table but is not installed")
    try:
        return pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any Excel failure still falls back to CSV.
        return pd.read_csv(file_path, sep=split_str)
def load_nifti(image_path):
    '''Read the image stored at ``image_path``.'''
    return sitk.ReadImage(image_path)


def save_nifti(image, output_path, folder_path):
    '''
    Write ``image`` to ``output_path``, creating the parent directory if needed.

    ``folder_path`` is stored on the image under the metadata key
    "FolderPath" before writing.
    '''
    output_dirpath = os.path.dirname(output_path)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against another process creating it meanwhile.
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def find_metadata_files(path, file_name='*meta*'):
    '''Recursively find files below ``path`` whose name matches ``file_name`` (glob pattern).'''
    # for TotalSegmentor dataset
    return glob.glob(os.path.join(path, '**', file_name), recursive=True)


def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
    '''
    Collect image files below ``folder_path``.

    A file is kept when its *name* ends with ``img_type``, contains
    ``include_str`` (if given) and does not contain ``exclude_str`` (if
    given). Directory names are not tested.

    :return: list of full paths, sorted when ``is_sorted`` is true.
    '''
    img_path = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(folder_path)
        for file in files
        if file.endswith(img_type)
        and (include_str is None or include_str in file)
        and (exclude_str is None or exclude_str not in file)
    ]
    if is_sorted:
        img_path.sort()
    return img_path
def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
    '''
    Build a resampler that brings an image to isotropic spacing.

    The smallest entry of ``spacing`` becomes the spacing of every output
    axis and each axis length is rescaled accordingly, e.g. an input spacing
    of [0.1, 0.1, 0.3] yields an output spacing of [0.1, 0.1, 0.1].

    :param ref_img: reference image; provides origin, direction and pixel
        type, plus size and spacing when these are not passed explicitly.
    :param interpolator: 'linear' (images) or 'nearest' (segmentation
        masks); any other value is handed to the filter unchanged.
    :param spacing: spacing to use instead of the one of ``ref_img``.
    :param size: size to use instead of the one of ``ref_img``.
    :return: a configured ``sitk.ResampleImageFilter``, or None when the
        spacing is already isotropic.
    '''
    # TODO(review, YHM Jachin): discuss why this function was rewritten.
    size = ref_img.GetSize() if size is None else size
    spacing = ref_img.GetSpacing() if spacing is None else spacing

    finest = min(spacing)
    if not any(axis_spacing != finest for axis_spacing in spacing):
        return None

    if interpolator == 'linear':
        interpolator = sitk.sitkLinear
    elif interpolator == 'nearest':
        interpolator = sitk.sitkNearestNeighbor

    scaled = []
    for axis_size, axis_spacing in zip(size, spacing):
        scaled.append(int(round(axis_size * axis_spacing / finest)))
    # NOTE(review): exactly three axes are used (the generic
    # ``[min_spacing] * len(spacing)`` variant was replaced by this one), so
    # fewer than three axes raises IndexError -- confirm that is intended.
    out_size = [scaled[axis] for axis in range(3)]
    out_spacing = [finest, finest, finest]

    resampler = sitk.ResampleImageFilter()
    resampler.SetSize(out_size)
    resampler.SetOutputSpacing(out_spacing)
    resampler.SetOutputOrigin(ref_img.GetOrigin())
    resampler.SetOutputDirection(ref_img.GetDirection())
    resampler.SetInterpolator(interpolator)
    # NOTE(review): this uses the pixel *type id* as fill value for voxels
    # outside the input -- looks unintended, confirm before changing.
    resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue())
    resampler.SetOutputPixelType(ref_img.GetPixelID())
    return resampler


def clamp_image(in_img, clamp_range):
    '''
    Clamp the intensities of ``in_img`` to ``[clamp_range[0], clamp_range[1]]``.
    '''
    clamper = sitk.ClampImageFilter()
    lower, upper = clamp_range[0], clamp_range[1]
    clamper.SetLowerBound(lower)
    clamper.SetUpperBound(upper)
    return clamper.Execute(in_img)
def get_synonyms_dict(dict_type='ROI'):
    '''
    Get the dictionary of synonyms for the specified dictionary type.

    :param dict_type: one of 'ROI', 'Label_tissue', 'Task', 'Modality'.
    :return: dict mapping each standard term to the list of its synonyms.
        The order of the entries matters to first-match lookups: composite
        regions come before single regions, and 'SPECT' comes before 'CT'
        because 'ct' is a substring of 'spect'.
    :raises ValueError: for an unknown ``dict_type``.

    Changes from the previous table: exact duplicates inside a list were
    removed; 'armpit' and 'clavicle' are separate entries of 'arm' (a missing
    comma had fused them into 'armpitclavicle'); the SPECT long form is
    spelled 'tomography'; 'cardiothoracic' was added next to the historical
    spelling 'caeiothoracic'.
    '''
    if dict_type == 'ROI':
        dict_synonyms = {
            'whole-body': ['whole-body', 'whole body', 'wholebody', 'polytrauma',
                           'head-neck-thorax-abdomen-pelvis-leg', 'head-neck-thorax-abdomen-pelvis'],
            'neck-thorax-abdomen-pelvis-leg': [
                'neck-thorax-abdomen-pelvis-leg', 'neck-thx-abd-pelvis-leg',
                'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg',
                'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg',
                'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg'],
            'neck-thorax-abdomen-pelvis': [
                'neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis',
                'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis',
                'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis',
                'neck thorax abdomen & pelvis'],
            'thorax-abdomen-pelvis-leg': [
                'thorax-abdomen-pelvis-leg', 'thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg',
                'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg',
                'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg',
                'thorax, abdomen, pelvis and leg'],
            'neck-thorax-abdomen': [
                'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen',
                'neck, thorax, abdomen', 'neck/thorax/abdomen'],
            'head-neck-thorax-abdomen': [
                'head-neck-thorax-abdomen', 'head neck thorax abdomen',
                'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen',
                'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'],
            'head-neck-thorax': [
                'head-neck-thorax', 'head neck thorax', 'head and neck and thorax',
                'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
            'thorax-abdomen-pelvis': [
                'thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis',
                'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis',
                'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
            'abdomen-pelvis-leg': [
                'abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg',
                'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg',
                'abdomen/pelvis/leg', 'abdomen, pelvis, leg'],
            'neck-thorax': [
                'neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck',
                'thorax neck', 'thorax and neck', 'thorax, neck', 'thorax/neck'],
            'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
            'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis',
                               'abdomen & pelvis', 'abdomen/pelvis'],
            'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
            'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
            'abdomen': [
                'abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera',
                'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum', 'gastric',
                'liver', 'spleen', 'pancreas', 'kidney', 'lumbar', 'renal', 'hepatic', 'splenic',
                'pancreatic', 'intervention'],
            'thorax': [
                'chest', 'thorax', 'breast', 'lung', 'heart', 'heart-thorakale aorta',
                'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea',
                'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla',
                'armpit', 'breast biopsy', 'thoracic', 'mammary', 'caeiothoracic', 'cardiothoracic',
                'mediastinal', 'pleural', 'bronchial', 'bronchial tree', 'tracheal', 'esophageal',
                'diaphragmatic', 'costal', 'sternal', 'clavicular', 'scapular', 'axillary',
                'axillar', 'cardiac', 'pericardial', 'pericardiac', 'pericardium'],
            'head': [
                'head', 'headbasis', 'brain', 'skull', 'face', 'nose', 'ear', 'eye', 'mouth', 'jaw',
                'cheek', 'chin', 'forehead', 'temporal', 'parietal', 'occipital', 'frontal',
                'mandible', 'maxilla', 'mandibular', 'maxillary', 'nasal', 'orbital', 'orbita',
                'ocular', 'auricular', 'otic', 'oral', 'buccal', 'labial', 'lingual', 'palatal'],
            'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx',
                     'esophagus', 'pharyngeal', 'laryngeal', 'carotid', 'jugular'],
            'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx',
                     'metacarpal', 'carpal', 'radius'],
            'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial',
                    'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle',
                    'scapula', 'acromion', 'acromioclavicular'],
            'leg': ['leg', 'felsenleg', 'thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe',
                    'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula',
                    'femur', 'patella', 'kneecap', 'achilles tendon', 'achilles'],
            'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis',
                       'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine',
                       'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament',
                       'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle',
                       'ovary', 'uterus'],
            'skeleton': ['skeleton', 'bone', 'spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
        }
    elif dict_type == 'Label_tissue':
        dict_synonyms = {
            'liver': ['liver', 'hepatic'],
            'spleen': ['spleen', 'splenic'],
            'kidney': ['kidney', 'renal'],
            'pancreas': ['pancreas', 'pancreatic'],
            'stomach': ['stomach', 'gastric'],
            'intestine': ['large intestine', 'small intestine', 'large bowel', 'small bowel'],
            'gallbladder': ['gallbladder'],
            'adrenal_gland': ['adrenal_gland', 'adrenal gland'],
            'bladder': ['bladder'],
            'prostate': ['prostate'],
            'uterus': ['uterus'],
            'ovary': ['ovary'],
            'testicle': ['testicle'],
            'lymph_node': ['lymph_node', 'lymph node'],
            'bone': ['bone'],
            'lung': ['lung'],
            'heart': ['heart'],
            'esophagus': ['esophagus'],
            'muscle': ['muscle'],
            'fat': ['fat'],
            'skin': ['skin'],
            'vessel': ['vessel'],
            'tumor': ['tumor'],
            'other': ['other']
        }
    elif dict_type == 'Task':
        dict_synonyms = {
            'segmentation': ['segmentation', 'seg', 'mask'],
            'classification': ['classification', 'class', 'diagnosis', 'identify', 'identification'],
            'localization': ['localization', 'locate', 'location', 'position'],
            'registration': ['registration', 'register', 'align', 'alignment'],
            'detection': ['detection', 'detect', 'find', 'locate'],
            'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
        }
    elif dict_type == 'Modality':
        dict_synonyms = {
            # SPECT must precede CT: both 'ct' and 'computed tomography' are
            # substrings of the SPECT names, so a first-match lookup would
            # otherwise report SPECT data as CT.
            'SPECT': ['SPECT', 'single-photon emission computed tomography'],
            'CT': ['CT', 'computed tomography'],
            'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
            'PET': ['PET', 'positron emission tomography'],
            'US': ['US', 'ultrasound'],
            'X-ray': ['X-ray', 'radiography'],
        }
    else:
        raise ValueError(f"dict_type {dict_type} is not valid")
    return dict_synonyms
def replace_synonyms(text, dict_synonyms):
    '''
    Replace synonyms in ``text`` with their standard (canonical) term.

    ``dict_synonyms`` maps each canonical term to a list of synonyms.
    Matching is case-insensitive and tried in three steps, each walking the
    dictionary in its own order:

    1. ``text`` equals a canonical term;
    2. ``text`` equals one of the synonyms;
    3. one of the synonyms is a substring of ``text`` (first hit wins).

    Steps 1 and 2 stop a short synonym of an earlier entry from hijacking an
    exact value (e.g. 'ct' is a substring of 'spect').

    :param text: a string, a list (converted element-wise) or a dict
        (keys and values are both converted); anything else is returned as is.
    :param dict_synonyms: mapping ``canonical term -> list of synonyms``.
    :return: the converted value. A string without any match is returned
        unchanged after a ``UserWarning``. Lists and dicts are returned as new
        objects; if two dict keys map to the same canonical term the later
        one wins.
    '''
    # Function-scope import: the enclosing module does not import ``warnings``.
    import warnings

    if isinstance(text, str):
        lowered = text.lower()
        for key in dict_synonyms:
            if str(key).lower() == lowered:
                return key
        for key, synonyms in dict_synonyms.items():
            if any(syn.lower() == lowered for syn in synonyms):
                return key
        for key, synonyms in dict_synonyms.items():
            if any(syn.lower() in lowered for syn in synonyms):
                return key
        # The original built a Warning object and discarded it, so unknown
        # values went unnoticed; actually emit the warning.
        warnings.warn(f"Value {text} is not in the correct format")
        return text
    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]
    if isinstance(text, dict):
        # Build a new dict: the original mutated the dict while iterating it
        # and used the (unhashable) synonym list as the new key.
        return {
            replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }
    return text
self.keytypes_flatten = self.flatten_json() + self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality'] + for key in self.ambiguity_keys: + ambiguity_dict = get_synonyms_dict(key) + self.config_format[key]['options'] = list(ambiguity_dict.keys()) + + def get_ketytypes(self): + return self.keytypes + + def get_keytypes_flatten(self): + return self.keytypes_flatten + + def find_all_keys_with_type(self, data=None, parent_key=''): + if data is None: + data = self.config_format + keys_with_type = {} + if isinstance(data, dict): + for key, value in data.items(): + full_key = f"{parent_key}.{key}" if parent_key else key + if isinstance(value, dict) and 'type' in value: + keys_with_type[full_key] = value['type'] + keys_with_type.update(self.find_all_keys_with_type(value, full_key)) + elif isinstance(data, list): + for index, item in enumerate(data): + full_key = f"{parent_key}[{index}]" + keys_with_type.update(self.find_all_keys_with_type(item, full_key)) + return keys_with_type + + def flatten_json(self, data=None, parent_key='', sep='.'): + if data is None: + data = self.config_format + items = {} + if isinstance(data, dict): + for key, value in data.items(): + new_key = f"{parent_key}{sep}{key}" if parent_key else key + if isinstance(value, dict): + items.update(self.flatten_json(value, new_key, sep=sep)) + elif isinstance(value, list): + for i, item in enumerate(value): + items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep)) + else: + items[new_key] = value + elif isinstance(data, list): + for i, item in enumerate(data): + items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep)) + return items + + def req_check(self): + self.unfilled_keys = [] + for key in self.config.keys(): + if self.config[key] == {}: + self.unfilled_keys.append(key) + if len(self.unfilled_keys) == 0: + return True + else: + return False + + def type_check(self, key, value): + if key not in self.config_format.keys(): + print(key, "is not a valid key") + return False + 
+ if key == 'Modality': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'OriImg_path': + if isinstance(value, str): + return True + else: + return False + + elif key == 'Label_path' and isinstance(value, dict): + for skey in value.keys(): + if skey in self.config_format[key]['keys']: + for kk in value[skey]: + if isinstance(value[skey][kk],str): + pass + # if kk in self.config_format[key]['value']['keys']: + # if isinstance(value[skey][kk],str): + # pass + # else: + # return False + else: + return False + return True + + elif key == 'ROI': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'Label_tissue' and isinstance(value, list): + for i in value: + if i not in self.config_format[key]['items']['options']: + return False + return True + + elif key =='Task' and isinstance(value, list): + for i in value: + if i not in self.config_format[key]['items']['options']: + return False + return True + + elif key == 'Spacing_mm': + if isinstance(value, float): + return True + else: + False + + # elif key == 'Size' and isinstance(value, list) and len(value) == 3 : + elif key == 'Size' and isinstance(value, list) and len(value) >= 3 : + return all(isinstance(item, int) for item in value) + + elif key == 'Dataset_name': + if isinstance(value, str): + return True + else: + return False + ##added by yanguoiqng on 2025-08-08 + elif key == 'Sub_modality': + + if isinstance(value, dict): + return True + else: + return False + elif key == 'Label_Dict': + + if isinstance(value, dict): + return True + else: + return False + def add_extra_keyvalue(self, key, value): + self.config[key] = value + return True + + def add_keyvalue(self, key, value): + if key in self.ambiguity_keys: + value = replace_synonyms(value, get_synonyms_dict(key)) + # print(key, value) + if self.type_check(key, value): + + self.config[key] = value + return True + else: + Warning(f"Value {value} is not 
in the correct format for key {key}") + pass + # print(f"Value {value} is not in the correct format for key {key}") + + def get_meta_data(self): + if self.req_check(): + return self.config + else: + print("Not all required keys are filled", self.unfilled_keys) + return False + + + +if __name__ == '__main__': + meta = meta_data() + print(meta.get_keytypes_flatten()) + print(meta.get_ketytypes()) + meta.add_keyvalue('Modality', 'CT') + meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT') + meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}}) + meta.add_keyvalue('Spacing_mm', 1.5) + meta.add_keyvalue('Size', [512, 512, 100]) + meta.add_keyvalue('Dataset_name', 'CT') + meta.add_keyvalue('Label_tissue', ['1', '2', '3']) + meta.add_keyvalue('Task', ['1', '2', '3']) + print(meta.get_meta_data()) + meta.add_extra_key('extra', 'extra') + print(meta.get_meta_data()) + print(meta.get_ketytypes()) + print(meta.get_keytypes_flatten) + + org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT' + img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation') + print(img_paths) \ No newline at end of file diff --git a/PSMA_clean/config_format.json b/PSMA_clean/config_format.json new file mode 100644 index 0000000000000000000000000000000000000000..01e9febcb5f7b8c2946d49b246139faf6e8272b1 --- /dev/null +++ b/PSMA_clean/config_format.json @@ -0,0 +1,125 @@ +{ + "Modality": { + "type": "option", + "required": true, + "options": [ + "CT", + "MRI", + "T1", + "T2", + "X-ray", + "Fluoroscopy", + "US", + "PET" + ] + }, + "OriImg_path": { + "type": "string", + "required": true + }, + "Label_path": { + "type": "dict", + "required": false, + "keys": [ + "classification", + "segmentation", + "regression", + "detection", + "localization", + "registration", + "other" + ], + "value": { + "type": "dict", + 
"required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/PSMA_clean/dataclean_PSMA_Longitudinal.py b/PSMA_clean/dataclean_PSMA_Longitudinal.py new file mode 100644 index 0000000000000000000000000000000000000000..97b2c4ab842ad702ce783a5fd63a971a2b2e74e0 --- /dev/null +++ b/PSMA_clean/dataclean_PSMA_Longitudinal.py @@ -0,0 +1,380 @@ +#coding:utf-8 +''' +writebyygq +createon2025-08-30 + + +BL = Baseline(基线) +FU = Follow-up(随访) + +1. Baseline (基线) + 含义:指的是在疾病初期、治疗前或某个特定时间点第一次拍摄的影像(如CT、MRI、X光)。 + 作用:这份影像作为评估病情严重程度和后续变化的“起跑线”或“参照物”。医生通过将未来的影像与基线影像进行比较,来判断病情的变化。 +2. 
Follow-up (随访) + 含义:指的是在基线影像之后,按计划或根据病情需要再次拍摄的影像。 + 作用:用于评估治疗效果(如肿瘤是否缩小)、监测疾病进展(如病灶是否增大或增多)、或观察术后恢复情况。 +“BL FU” 在报告中的应用场景: + 当放射科医生在报告中写下“BL FU”或“compare to BL FU”时,他们的意思是: + “本次的影像检查结果,需要与之前拍摄的基线影像进行对比,以评估变化。” + +例如: +肿瘤患者:一位肺癌患者在化疗前做了一次CT(作为基线BL),化疗2个周期后又做了一次CT(作为随访FU)。放射科医生会在新报告中将两次影像进行对比,并描述:“与20XX年X月X日的基线CT(BL FU) 相比,右肺下叶肿块明显缩小。” +慢性病患者:如肺炎、肝硬化、多发性硬化等需要长期监测的疾病,医生都会通过对比基线片和随访片来精确判断病情是好转、稳定还是恶化。 + +label: + 0:backgroud 1-N: tumor,其中具体多少数值需要读取对应json文件信息 + +编号ID:10位的16进制编号,每一个对应一个csv文件,对一个或多个BL和FU。。每个对应相应的json文件和mask标签文件-- +备注:CSV包含所有的label信息和编号,如果考虑按照tissue进行分别存储,可以考虑对mask文件结合csv/json信息进行提取相同的lesion_type分别存储label_dict +BL的以及对应的MASK都是inputsTr目录下面 +命名形式: + 93dd4de5cd_BL_img_BL_img_00.nii.gz + 93dd4de5cd_BL_mask_BL_img_00.nii.gz + 93dd4de5cd_BL_00.json + +FU在inputsTr目录下面,对应的mask在targetsTr力猛 +命名形式: + c6f057b865_FU_img_FU_img_00.nii.gz + c6f057b865_FU_mask_FU_img_00.nii.gz + c6f057b865_FU_img_FU_img_01.nii.gz + c6f057b865_FU_mask_FU_img_01.nii.gz + c6f057b865_FU_00.json + c6f057b865_FU_01.json + + +元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置 +lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type +1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung +2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node + +json格式样例 +{ + "name": "Points of interest", + "points": [ + { + "name": "1", + "point": [ + 84.9530896759608, + 273.525433308214, + 148.780708364732 + ] + }, + { + "name": "2", + "point": [ + 206.307026476578, + 258.39816700611, + 177.256619144603 + ] + } + ], + "type": "Multiple points", + "version": { + "major": 1, + "minor": 0 + } +} + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import 
def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside *path*."""
    csv_pattern = os.path.join(path, '*.csv')
    matches = glob.glob(csv_pattern, recursive=True)
    return matches


##added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries in *path*, as given by ``os.listdir``."""
    entries = os.listdir(path)
    return entries


##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series found in *folder_path*.

    Returns:
        A ``(dicom_names, image)`` tuple: the series file names reported by
        GDCM for the folder, and the image read from those files.
    """
    series_reader = sitk.ImageSeriesReader()
    dicom_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(dicom_names)
    return dicom_names, series_reader.Execute()


##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read the single file *imgs* and return the resulting image object.

    The header is read first with ``ReadImageInformation``; ``Execute`` is
    then called as well, and its result is what gets returned.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    file_reader.ReadImageInformation()
    return file_reader.Execute()


def load_nrrd(fp):
    """Read the image file *fp* with ``sitk.ReadImage``."""
    return sitk.ReadImage(fp)


##modify by yanguoqing on 20250830
def merge_images(series_files):
    """Read the files in *series_files* as one image series.

    Original author's note: each case has two series (CT / PET, suffixes
    0000 / 0001) that are meant to be stacked, in that order, along an extra
    dimension. NOTE(review): confirm the dimensionality actually produced.
    """
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_files)
    return series_reader.Execute()
def main(target_path, output_dir):
    """Convert the baseline (BL) and follow-up (FU) cases to NIfTI and index them.

    For every ``*.csv`` lesion table in ``<target_path>/inputsTr`` the
    matching ``<id>_<BL|FU>*.json`` point files are located. For each of
    them the image ``<id>_<stage>_img_<stage>_img_<nn>.nii.gz`` is loaded
    from ``inputsTr`` and the mask ``<id>_<stage>_mask_<stage>_img_<nn>.nii.gz``
    is looked up in ``inputsTr`` first and ``targetsTr`` second. Image and
    mask are passed through ``util.get_unisize_resampler`` and written below
    ``output_dir``; the case metadata is appended to
    ``<output_dir>/nifti_mappings.json``.

    Cases whose image file is missing are skipped. Cases without a mask are
    skipped after the image has been written. Masks whose size differs from
    the image, and cases whose required metadata could not be filled, are
    recorded in ``<output_dir>/failed_files.json``.

    Args:
        target_path: dataset root containing ``inputsTr`` and ``targetsTr``.
        output_dir: directory receiving the NIfTI files and the two JSON files.
    """
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    input_dir = os.path.join(target_path, 'inputsTr')
    target_dir = os.path.join(target_path, 'targetsTr')

    for meta_file in tqdm(get_filename_list(input_dir), desc="Processing all dataset"):
        df_meta = pd.read_csv(meta_file)
        fp_name = os.path.basename(meta_file)[:-4]

        # Look up the BL and then the FU point files of this patient.
        for sub_mod in ['BL', 'FU']:
            # The original always rewrote "_BL_", so FU file names were left
            # unchanged, never matched an image and were silently skipped.
            stage_tag = f"_{sub_mod}_"
            image_tag = f"_{sub_mod}_img_{sub_mod}_img_"
            mask_tag = f"_{sub_mod}_mask_{sub_mod}_img_"

            for bl_fp in glob.glob("%s/%s_%s*.json" % (input_dir, fp_name, sub_mod)):
                json_name = os.path.basename(bl_fp)
                basename = json_name[:-5]

                image_name = json_name.replace(stage_tag, image_tag).replace(".json", ".nii.gz")
                bl_fp_img = os.path.join(input_dir, image_name)
                if not os.path.isfile(bl_fp_img):
                    continue

                # The mask is searched in inputsTr first, then in targetsTr.
                mask_name = json_name.replace(stage_tag, mask_tag).replace(".json", ".nii.gz")
                label_fp = None
                for mask_dir in (input_dir, target_dir):
                    candidate = os.path.join(mask_dir, mask_name)
                    if os.path.isfile(candidate):
                        label_fp = candidate
                        break

                CIA_other_info = {
                    'Image_id': basename,
                    'metadata_file': meta_file,
                    'split': "train",
                }

                stk_image = util.load_nifti(bl_fp_img)
                spacing_info = stk_image.GetSpacing()
                size = list(stk_image.GetSize())
                resampler = util.get_unisize_resampler(
                    stk_image, interpolator='linear', spacing=spacing_info, size=size)
                if resampler is not None:
                    proces_image = resampler.Execute(stk_image)
                    print('SPACIE INFO AFTER', proces_image.GetSpacing())
                    CIA_other_info['Resample'] = True
                else:
                    proces_image = stk_image
                    CIA_other_info['Resample'] = False

                output_path = os.path.join(output_dir, fp_name, f"{basename}.nii.gz")
                save_nifti(proces_image, output_path, input_dir)
                print(f"Saved NIfTI file to {output_path}")

                # Map each point name of the JSON file to the lesion type
                # listed for that lesion id in the patient's CSV table.
                with open(bl_fp, 'r') as fi:
                    json_info = json.load(fi)
                label_dict = {"0": "backgroud"}
                for lesion_info in json_info['points']:
                    type_rows = df_meta.loc[
                        df_meta['lesion_id'] == int(lesion_info['name']), 'lesion_type']
                    label_dict[lesion_info['name']] = type_rows.iloc[0]

                if label_fp is None:
                    # No mask: as before, the image stays on disk but the
                    # case is not added to the mapping file.
                    continue

                label_stk_img = util.load_nifti(label_fp)
                resampler = util.get_unisize_resampler(
                    label_stk_img, interpolator='nearest', spacing=spacing_info, size=size)
                proces_label = (
                    resampler.Execute(label_stk_img) if resampler is not None else label_stk_img)

                # Explicit comparison instead of assert, which is removed
                # when Python runs with -O.
                if proces_image.GetSize() != proces_label.GetSize():
                    failed_files.append(label_fp)
                    continue

                label_output_path = os.path.join(
                    output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
                label_path_dict = {'tumor': label_output_path}
                util.save_nifti(proces_label, label_output_path, label_fp)
                print(f"Saved Label Segment NIfTI file to {label_output_path}")

                size_processed = list(proces_image.GetSize())
                print('size_processed', size_processed, size)

                # A fresh container per case: a value rejected for this case
                # can no longer leave the previous case's value in place.
                meta = meta_data()
                meta.add_keyvalue('Spacing_mm', min(spacing_info[:3]))  # smallest of the x/y/z spacings
                meta.add_keyvalue('OriImg_path', bl_fp_img)
                meta.add_keyvalue('Size', size_processed)  # size after processing -- YH Jachin
                meta.add_keyvalue('Modality', "CT")
                meta.add_keyvalue('Dataset_name', 'PSMA_Longitudinal_CT')
                meta.add_keyvalue('ROI', 'whole-body')
                # 'Task' is validated as a list of options; the bare string
                # passed by the original was rejected and never stored.
                meta.add_keyvalue('Task', [TASK_VALUE])
                meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
                meta.add_keyvalue('Label_Dict', label_dict)
                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                case_meta = meta.get_meta_data()
                if case_meta is False:
                    # Required keys are missing; record the case instead of
                    # writing a bare False into the mapping file.
                    failed_files.append(bl_fp_img)
                    continue

                # Write the mapping to the JSON file on the fly
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_path] = case_meta
                    json_file.seek(0)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
Follow-up (随访) + 含义:指的是在基线影像之后,按计划或根据病情需要再次拍摄的影像。 + 作用:用于评估治疗效果(如肿瘤是否缩小)、监测疾病进展(如病灶是否增大或增多)、或观察术后恢复情况。 +“BL FU” 在报告中的应用场景: + 当放射科医生在报告中写下“BL FU”或“compare to BL FU”时,他们的意思是: + “本次的影像检查结果,需要与之前拍摄的基线影像进行对比,以评估变化。” + +例如: +肿瘤患者:一位肺癌患者在化疗前做了一次CT(作为基线BL),化疗2个周期后又做了一次CT(作为随访FU)。放射科医生会在新报告中将两次影像进行对比,并描述:“与20XX年X月X日的基线CT(BL FU) 相比,右肺下叶肿块明显缩小。” +慢性病患者:如肺炎、肝硬化、多发性硬化等需要长期监测的疾病,医生都会通过对比基线片和随访片来精确判断病情是好转、稳定还是恶化。 + +label: + 0:backgroud 1-N: tumor,其中具体多少数值需要读取对应json文件信息 + +编号ID:10位的16进制编号,每一个对应一个csv文件,对一个或多个BL和FU。。每个对应相应的json文件和mask标签文件-- +备注:CSV包含所有的label信息和编号,如果考虑按照tissue进行分别存储,可以考虑对mask文件结合csv/json信息进行提取相同的lesion_type分别存储label_dict +BL的以及对应的MASK都是inputsTr目录下面 +命名形式: + 93dd4de5cd_BL_img_BL_img_00.nii.gz + 93dd4de5cd_BL_mask_BL_img_00.nii.gz + 93dd4de5cd_BL_00.json + +FU在inputsTr目录下面,对应的mask在targetsTr力猛 +命名形式: + c6f057b865_FU_img_FU_img_00.nii.gz + c6f057b865_FU_mask_FU_img_00.nii.gz + c6f057b865_FU_img_FU_img_01.nii.gz + c6f057b865_FU_mask_FU_img_01.nii.gz + c6f057b865_FU_00.json + c6f057b865_FU_01.json + + +元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置 +lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type +1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung +2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node + +json格式样例 +{ + "name": "Points of interest", + "points": [ + { + "name": "1", + "point": [ + 84.9530896759608, + 273.525433308214, + 148.780708364732 + ] + }, + { + "name": "2", + "point": [ + 206.307026476578, + 258.39816700611, + 177.256619144603 + ] + } + ], + "type": "Multiple points", + "version": { + "major": 1, + "minor": 0 + } +} + +20251101补充增加,将病灶编号进行合并同类项目, +注意处理完成后保留原影像的几何空间信息以及元数据文件信息 + + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import 
def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside *path*."""
    csv_pattern = os.path.join(path, '*.csv')
    matches = glob.glob(csv_pattern, recursive=True)
    return matches


##added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries in *path*, as given by ``os.listdir``."""
    entries = os.listdir(path)
    return entries


##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series found in *folder_path*.

    Returns:
        A ``(dicom_names, image)`` tuple: the series file names reported by
        GDCM for the folder, and the image read from those files.
    """
    series_reader = sitk.ImageSeriesReader()
    dicom_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(dicom_names)
    return dicom_names, series_reader.Execute()


##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read the single file *imgs* and return the resulting image object.

    The header is read first with ``ReadImageInformation``; ``Execute`` is
    then called as well, and its result is what gets returned.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    file_reader.ReadImageInformation()
    return file_reader.Execute()


def load_nrrd(fp):
    """Read the image file *fp* with ``sitk.ReadImage``."""
    return sitk.ReadImage(fp)
def main(target_path, output_dir):
    """Convert BL/FU cases to NIfTI, merging lesion ids into per-type labels.

    For every ``*.csv`` lesion table in ``<target_path>/inputsTr`` the
    matching ``<id>_<BL|FU>*.json`` point files are located. For each of
    them the image ``<id>_<stage>_img_<stage>_img_<nn>.nii.gz`` is loaded
    from ``inputsTr`` and the mask ``<id>_<stage>_mask_<stage>_img_<nn>.nii.gz``
    is looked up in ``inputsTr`` first and ``targetsTr`` second.

    In the mask, every voxel whose value equals a ``lesion_id`` of the CSV is
    rewritten to the ``label_id_lut`` code of that lesion's ``lesion_type``,
    so lesions of the same type share one label value. The geometry and the
    metadata dictionary of the source mask are copied onto the rewritten
    mask. Image and mask are passed through ``util.get_unisize_resampler``,
    written below ``output_dir`` and indexed in
    ``<output_dir>/nifti_mappings.json``.

    Cases whose image file is missing are skipped. Cases without a mask are
    skipped after the image has been written. Masks with a lesion type
    missing from ``label_id_lut``, masks whose size differs from the image,
    and cases whose required metadata could not be filled, are recorded in
    ``<output_dir>/failed_files.json``.

    Args:
        target_path: dataset root containing ``inputsTr`` and ``targetsTr``.
        output_dir: directory receiving the NIfTI files and the two JSON files.
    """
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    input_dir = os.path.join(target_path, 'inputsTr')
    target_dir = os.path.join(target_path, 'targetsTr')

    for meta_file in tqdm(get_filename_list(input_dir), desc="Processing all dataset"):
        df_meta = pd.read_csv(meta_file)
        fp_name = os.path.basename(meta_file)[:-4]

        # The lesion_type -> lesion ids grouping depends only on the CSV, so
        # it is computed once per patient rather than once per image.
        lesion_groups = [
            (lesion_type, lesion_ids.tolist())
            for lesion_type, lesion_ids in df_meta.groupby('lesion_type')['lesion_id']
        ]
        unknown_types = [
            lesion_type for lesion_type, _ in lesion_groups if lesion_type not in label_id_lut]

        # Look up the BL and then the FU point files of this patient.
        for sub_mod in ['BL', 'FU']:
            # The original always rewrote "_BL_", so FU file names were left
            # unchanged, never matched an image and were silently skipped.
            stage_tag = f"_{sub_mod}_"
            image_tag = f"_{sub_mod}_img_{sub_mod}_img_"
            mask_tag = f"_{sub_mod}_mask_{sub_mod}_img_"

            for bl_fp in glob.glob("%s/%s_%s*.json" % (input_dir, fp_name, sub_mod)):
                json_name = os.path.basename(bl_fp)
                basename = json_name[:-5]

                image_name = json_name.replace(stage_tag, image_tag).replace(".json", ".nii.gz")
                bl_fp_img = os.path.join(input_dir, image_name)
                if not os.path.isfile(bl_fp_img):
                    continue

                # The mask is searched in inputsTr first, then in targetsTr.
                mask_name = json_name.replace(stage_tag, mask_tag).replace(".json", ".nii.gz")
                label_fp = None
                for mask_dir in (input_dir, target_dir):
                    candidate = os.path.join(mask_dir, mask_name)
                    if os.path.isfile(candidate):
                        label_fp = candidate
                        break

                CIA_other_info = {
                    'Image_id': basename,
                    'metadata_file': meta_file,
                    'split': "train",
                }

                stk_image = util.load_nifti(bl_fp_img)
                spacing_info = stk_image.GetSpacing()
                size = list(stk_image.GetSize())
                resampler = util.get_unisize_resampler(
                    stk_image, interpolator='linear', spacing=spacing_info, size=size)
                if resampler is not None:
                    proces_image = resampler.Execute(stk_image)
                    print('SPACIE INFO AFTER', proces_image.GetSpacing())
                    CIA_other_info['Resample'] = True
                else:
                    proces_image = stk_image
                    CIA_other_info['Resample'] = False

                output_path = os.path.join(output_dir, fp_name, f"{basename}.nii.gz")
                save_nifti(proces_image, output_path, input_dir)
                print(f"Saved NIfTI file to {output_path}")

                if label_fp is None:
                    # No mask: as before, the image stays on disk but the
                    # case is not added to the mapping file.
                    continue

                if unknown_types:
                    # A lesion type without a code used to raise KeyError and
                    # abort the whole run; record the case and move on.
                    print('unknown lesion_type', unknown_types, meta_file)
                    failed_files.append(label_fp)
                    continue

                label_stk_img = util.load_nifti(label_fp)
                image_array = sitk.GetArrayFromImage(label_stk_img)

                # Rewrite lesion ids to their unified per-type code. The
                # masks are always taken from the untouched source array so
                # an already rewritten voxel is never rewritten again.
                label_dict = {"0": "backgroud"}
                update_image_array = np.copy(image_array)
                for lesion_type, lesion_ids in lesion_groups:
                    target_id = label_id_lut[lesion_type]
                    label_dict[str(target_id)] = lesion_type
                    update_image_array[np.isin(image_array, lesion_ids)] = target_id
                image_array = None

                # Restore geometry and copy the metadata dictionary by hand.
                label_stk_img_update = sitk.GetImageFromArray(update_image_array)
                label_stk_img_update.CopyInformation(label_stk_img)
                for key in label_stk_img.GetMetaDataKeys():
                    label_stk_img_update.SetMetaData(key, label_stk_img.GetMetaData(key))

                resampler = util.get_unisize_resampler(
                    label_stk_img_update, interpolator='nearest', spacing=spacing_info, size=size)
                if resampler is not None:
                    proces_label = resampler.Execute(label_stk_img_update)
                    ary_process_label = sitk.GetArrayFromImage(proces_label)

                    # Clear the last slice along the first array axis when it
                    # holds one constant non-zero value. Every voxel is
                    # compared (the original compared only the slice mean).
                    last_slice = ary_process_label[-1, :, :]
                    corner_value = last_slice[0, 0]
                    if corner_value > 0 and np.all(last_slice == corner_value):
                        print('momingqimiao', corner_value)
                        ary_process_label[-1, :, :] = 0

                    label_stk_img_process = sitk.GetImageFromArray(ary_process_label)
                    label_stk_img_process.CopyInformation(proces_label)
                    for key in proces_label.GetMetaDataKeys():
                        label_stk_img_process.SetMetaData(key, proces_label.GetMetaData(key))
                else:
                    label_stk_img_process = label_stk_img_update

                # Explicit comparison instead of assert, which is removed
                # when Python runs with -O.
                if proces_image.GetSize() != label_stk_img_process.GetSize():
                    failed_files.append(label_fp)
                    continue

                label_output_path = os.path.join(
                    output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
                label_path_dict = {'tumor': label_output_path}
                util.save_nifti(label_stk_img_process, label_output_path, label_fp)
                print(f"Saved Label Segment NIfTI file to {label_output_path}")

                size_processed = list(proces_image.GetSize())
                print('size_processed', size_processed, size)

                # A fresh container per case: a value rejected for this case
                # can no longer leave the previous case's value in place.
                meta = meta_data()
                meta.add_keyvalue('Spacing_mm', min(spacing_info[:3]))  # smallest of the x/y/z spacings
                meta.add_keyvalue('OriImg_path', bl_fp_img)
                meta.add_keyvalue('Size', size_processed)  # size after processing -- YH Jachin
                meta.add_keyvalue('Modality', "CT")
                meta.add_keyvalue('Dataset_name', 'PSMA_Longitudinal_CT')
                meta.add_keyvalue('ROI', 'whole-body')
                # 'Task' is validated as a list of options; the bare string
                # passed by the original was rejected and never stored.
                meta.add_keyvalue('Task', [TASK_VALUE])
                meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
                meta.add_keyvalue('Label_Dict', label_dict)
                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                case_meta = meta.get_meta_data()
                if case_meta is False:
                    # Required keys are missing; record the case instead of
                    # writing a bare False into the mapping file.
                    failed_files.append(bl_fp_img)
                    continue

                # Write the mapping to the JSON file on the fly
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_path] = case_meta
                    json_file.seek(0)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
局限性:体内存在部分正常或病变细胞,同样具有psma蛋白高度表达的情况,如神经节、神经组织、肉芽肿性病变、肾癌、肺癌等,可能导致假阳性表现。此外,约10%的前列腺癌细胞没有佩戴这种“徽章”,导致漏诊 + + +PSMA-FDG-PET-CT-Lesion 数据集指的是同时包含 PSMA-PET 和 FDG-PET(以及对应CT)两种扫描模态,并且带有病灶标注的医学影像数据集。 +这种数据集在前列腺癌研究中具有极高的价值,因为它允许研究者直接比较和分析同一患者体内不同病灶的分子表达特性。 +前列腺癌病灶在分子水平上具有异质性。并非所有病灶都表达相同的生物标志物。 + PSMA(前列腺特异性膜抗原):在大多数前列腺癌细胞表面过度表达,是前列腺癌相对特异的靶点。PSMA-PET用于检测前列腺癌特异性病灶。 + FDG(氟代脱氧葡萄糖):反映细胞的葡萄糖代谢活性。高度侵袭性、低分化的肿瘤通常具有很高的FDG摄取。 + + + +PSMA-FDG-PET/CT: + +https://autopet-iii.grand-challenge.org/ +"channel_names": { + "0": "CT", + "1": "CT"--PET + }, + "labels": { + "background": 0, + "tumor": 1 + }, +同一个病例同在000,001两个影像,分别表示CT,PET,合并到第四个维度作为SUB_MODALITY + +label: + 0:backgroud 1: tumor + +FDG-元数据信息 + 'Series UID', 'Collection', '3rd Party Analysis', + 'Data Description URI', 'Subject ID', 'Study UID', 'Study Description', + 'Study Date', 'Series Description', 'Manufacturer', 'Modality', + 'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size', + 'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex' + 通过Subject ID,以及Modality共同确定唯一的描述信息,获取相应的,Study Description,Study Date,Series Description, Manufacturer,diagnosis, age, sex信息;【只获取CT模态的一行描述信息即可】 +FDG文件名组成:fdg_b2f82ed4b9_04-17-2003-NA-PET-CT Ganzkoerper primaer mit KM-26753_[0000].nii.gz + Subject ID[PETCT_b2f82ed4b9] && Modality[CT] + + +PSMA-元数据信息 + 'Subject ID', 'Study Date', 'age', 'manufacturer_model_name', + 'pet_radionuclide', 'ct_contrast_agent' + 需要依靠'Subject ID', 'Study Date'共同确定唯一,存在相同的subject_id不同时间的样例--作为单独数据处理, +PSMA文件名组成:psma_d5b636ea4da7638b_2019-03-15_[0000].nii.gz + Subject ID[psma_d5b636ea4da7638b]&&Study Date[2019-03-15] + +综上:将id定义为subject_id+study_date 共同标识唯一的ID + +处理流程: + 1.查找所有的ID; + 2.根据ID查找对应的两个channel的影像以及对应的label; + 3.对两个channel的影像进行合并转4D; + 4.按照4D图像处理的惯例(第四个维度不参与计算,取前3个的spaceing最小值)重采样插值;--label + 5.保存 + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# 
def find_metadata_files(path):
    """Return the CSV files located directly inside *path*.

    Used for Cancer Imaging Archive (TCIA) style datasets whose metadata
    tables sit next to the image folders.

    Args:
        path: Directory to search. Only its top level is inspected.

    Returns:
        A list of matching ``*.csv`` paths (each prefixed with *path*),
        sorted alphabetically. The list is empty when nothing matches or
        the directory does not exist.
    """
    search_pattern = os.path.join(path, '*.csv')
    # The pattern contains no '**', so glob's ``recursive`` flag would have
    # no effect; it is omitted to avoid suggesting a recursive search.
    # Sorting makes the result independent of filesystem listing order.
    return sorted(glob.glob(search_pattern))
def check_fname(fname):
    """Split a case name into its subject id and study date parts.

    The split is purely positional:

    * names starting with ``"fdg"``: the first 14 characters are the id and
      characters 15..24 are the date (kept in the order found in the name);
    * every other name: the first 21 characters are the id and everything
      from character 22 onwards is the date.

    The single character between id and date is skipped in both cases.

    Args:
        fname: Case name without channel suffix or file extension.

    Returns:
        A ``(sid, sdate)`` tuple of strings. Short names simply yield
        shorter (possibly empty) parts; no validation is performed.
    """
    if fname.startswith("fdg"):
        id_end, date_end = 14, 25
    else:
        # No upper bound: the date runs to the end of the name.
        id_end, date_end = 21, None
    date_start = id_end + 1  # skip the separator after the id
    return fname[:id_end], fname[date_start:date_end]
open(json_output_path, 'w') as json_file: + json.dump({}, json_file) + psma_meta_file=os.path.join(target_path,psma_meta) + fdg_meta_file=os.path.join(target_path,fdg_meta) + + filename_file=os.path.join(target_path,filename_meta) + + pdf_meta=pd.read_csv(psma_meta_file) + fdf_meta=pd.read_csv(fdg_meta_file) + + fp_names=get_filename_list(filename_file) + ##从辅助文件信息中获取所有1614个病例名称,每个病例名称存在0000,0001两个三维影像数据,按照顺序合并; + if pid_dirs: + for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"): + for fp_name in tqdm(fp_names, desc="Processing all dataset"): + + ct_fp=os.path.join(target_path,pid_dir,fp_name+"_0000.nii.gz") + pet_fp=os.path.join(target_path,pid_dir,fp_name+"_0001.nii.gz") + label_fp=os.path.join(target_path,'labelsTr',fp_name+".nii.gz") + + modality="CT" + study='PSMA-FDG-PET-CT-LESION'##Dataset_name + CIA_other_info = {'metadata_file':''} + CIA_other_info['split'] = "train" + + + if fp_name.startswith("fdg"): + CIA_other_info['metadata_file']=fdg_meta_file + df_meta=fdf_meta + sid,sdate=check_fname(fp_name) + study_id=sid.replace("fdg","PETCT") + data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Modality']=='CT')] + data_info_row=data_info_row.reset_index() + for keyname in FDG_META_COLUMN: + CIA_other_info[keyname]=str(data_info_row[keyname][0]) + + CIA_other_info['Image_id']=fp_name + + else: + CIA_other_info['metadata_file']=psma_meta_file + df_meta=pdf_meta + sid,sdate=check_fname(fp_name) + study_id=sid.replace("psma","PSMA") + # print('>>',study_id,sdate) + data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Study Date']==sdate)] + data_info_row=data_info_row.reset_index() + # print(data_info_row.columns) + for keyname in PSMA_META_COLUMN: + print(keyname) + print(data_info_row[keyname][0]) + CIA_other_info[keyname]=str(data_info_row[keyname][0]) + + CIA_other_info['Image_id']=fp_name + + + + + try: + ##读取MRI四组文件,按照flair,t1,t1ce,t2的顺序叠加,对于seg先剔除不参与 + + + series_files=[ct_fp,pet_fp] + 
sub_modality=['CT','PET'] + if len(series_files)>0: + ##存在有效的MRI影像数据进行后续处理 + sitk_img_original=merge_images(series_files) + + + + + original_spacing = list(sitk_img_original.GetSpacing()) + original_size = list(sitk_img_original.GetSize()) + + is_4d_image = sitk_img_original.GetDimension() == 4 + frame_flag=False + # --- Resampling Logic (Revised for 4D) --- + if is_4d_image: + + # Always process 4D images channel-wise for resampling + # logging.info(f" Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only + channels = [] + num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1 + channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing + + + for i in range(num_channels): + extractor = sitk.ExtractImageFilter() + current_3d_channel_size = original_size[:3] + + if sitk_img_original.GetDimension() == 4: + extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0]) + extractor.SetIndex([0,0,0,i]) + channel_3d_img = extractor.Execute(sitk_img_original) + else: + channel_3d_img = sitk_img_original + if i > 0: break + + channel_resampler = util.get_unisize_resampler( + channel_3d_img, 'linear', + spacing=channel_target_spacing, size=current_3d_channel_size + ) + if channel_resampler: + channels.append(channel_resampler.Execute(channel_3d_img)) + else: + channels.append(channel_3d_img) + + if channels: + if len(channels) > 1: # Only join if there are multiple channels + sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels) + ##aded by yanguoqing on 2025-08-11 + frame_flag=True + # imgDict={} + # for kf_idx in range(num_channels): + # imgDict[str(kf_idx)]='none' + # if str(meta_ed):imgDict[str(meta_ed)]='ed' + # if str(meta_es):imgDict[str(meta_es)]='es' + # meta.add_keyvalue('ImgDict',imgDict) + elif len(channels) == 1: # If only one channel resulted (e.g. 
original was 3D misidentified as 4D by tensorImageSize) + sitk_img_processed = channels[0] + elif TARGET_VOXEL_SPACING: # 3D image with target spacing + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear', + spacing=TARGET_VOXEL_SPACING, size=original_size) + if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + else: # 3D image, no TARGET_VOXEL_SPACING + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear', + spacing=original_spacing, size=original_size) + if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + + + output_path = os.path.join(output_dir,fp_name,fp_name+".nii.gz") + # output_path=convert_windows_to_linux_path(output_path) + save_nifti(sitk_img_processed, output_path, os.path.dirname(ct_fp)) + print(f"Saved NIfTI file to {output_path}") + + + size_processed = list(sitk_img_processed.GetSize()) + print('size_processed',size_processed,original_size) + + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))##保留前三个x,y,z的最小spacing + meta.add_keyvalue('OriImg_path',",".join(series_files)) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','whole-body') + + + sub_modality_dict={} + for idx,value in enumerate(sub_modality): + if value: + sub_modality_dict[str(idx)]=SUB_MODALITY[idx] + + meta.add_keyvalue('Sub_modality',sub_modality_dict) + + meta.add_keyvalue('Label_Dict',LABEL_DICT) + + + ##Label processing + + label_path_dict={} + full_label_file=label_fp + full_path_label=os.path.dirname(full_label_file) + process_label_path=os.path.join(output_dir,fp_name,'segmentation') + + processed_lbl_full_path=os.path.join(process_label_path, f"{fp_name}.nii.gz") + + if not os.path.isdir(process_label_path): + os.makedirs(process_label_path,exist_ok=True) + + if not 
os.path.isfile(full_label_file): + pass + label_flag=False + else: + sitk_lbl_original = util.load_nifti(full_label_file) + + if sitk_lbl_original: + label_resampler = sitk.ResampleImageFilter() + reference_for_label = sitk_img_processed # Default to processed image + + if sitk_img_processed.GetDimension() == 4: + num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1 + if num_comp_proc > 0: + extractor = sitk.ExtractImageFilter() + proc_img_size_for_lbl_ref = sitk_img_processed.GetSize() + extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0]) + extractor.SetIndex([0,0,0,0]) + try: + reference_for_label = extractor.Execute(sitk_img_processed) + except Exception as ref_err: + print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.") + # print(traceback.format_exc()) + reference_for_label = None + else: # Fallback if extraction fails + print(f" Could not extract 3D reference for label from 4D image {output_path}. 
Label may not be correctly resampled.") + reference_for_label = None # This will cause an issue below if not handled + + sitk_lbl_processed = None + + if reference_for_label and reference_for_label.GetDimension() > 0: + label_resampler.SetInterpolator(sitk.sitkNearestNeighbor) + label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID()) + + if sitk_lbl_original.GetDimension() == 4: + lbl_channels = [] + lbl_size = list(sitk_lbl_original.GetSize()) + for i in range(lbl_size[3]): + extractor = sitk.ExtractImageFilter() + extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0]) + extractor.SetIndex([0, 0, 0, i]) + single_channel = extractor.Execute(sitk_lbl_original) + + label_resampler.SetReferenceImage(reference_for_label) + resampled_channel = label_resampler.Execute(single_channel) + lbl_channels.append(resampled_channel) + + if len(lbl_channels) > 1: + sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels) + elif len(lbl_channels) == 1: + sitk_lbl_processed = lbl_channels[0] + else: + label_resampler.SetReferenceImage(reference_for_label) + sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original) + if processed_lbl_full_path: + if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]: + print(f" Mismatch between image and label size (ignoring channels):") + print(f" Image size: {sitk_img_processed.GetSize()}") + print(f" Label size: {sitk_lbl_processed.GetSize()}") + util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label) + else: + print(f" Failed to set reference image for label resampling for {full_path_label}. 
Saving original label.") + util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label) # Save original + # processed_lbl_full_path should still point to this saved original label + sitk_lbl_processed=sitk_lbl_original + else: + processed_lbl_full_path = None + + + util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_label_file) # Save original + print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}") + + + + + if processed_lbl_full_path: + label_path_dict['tumor'] = processed_lbl_full_path + print(label_path_dict.keys()) + meta.add_keyvalue('Task',TASK_VALUE) + # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys())) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + meta.add_keyvalue('Label_Dict',LABEL_DICT) + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + + + # try: + # assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize() + # except Exception as e: + # failed_files.append(full_path_label) + # continue + print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize()) + + except Exception as e: + print(e) + failed_files.append(ct_fp) + print(f"Failed to load PSMA images from {ct_fp}") + continue + + + + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_path] = meta.get_meta_data() + json_file.seek(0) + # print(existing_mappings) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + # else: + # print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + 
parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/psma-fdg-pet-ct-lesion/") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + diff --git a/PSMA_clean/dataclean_PSMA_petct_v2.py b/PSMA_clean/dataclean_PSMA_petct_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..f73d52ff403b78e000f004ff928a0c623898c10a --- /dev/null +++ b/PSMA_clean/dataclean_PSMA_petct_v2.py @@ -0,0 +1,423 @@ +#coding:utf-8 +''' +writebyygq +createon2025-08-30 +PSMAPET/CT本质上也是一种PET/CT,只是它的示踪剂和传统的18F-FDG不同,目前国际上应用较多的PSMAPET/CT的示踪剂是68GA-PSMA、18F-PSMA,其中68GA及18F是一种放射性核素,具有成像功能,PSMA是前列腺特异性膜抗原,具有引导功能,引导PSMA更准确地向前列腺癌细胞聚拢,这样就大大增加了PSMAPET/CT用于发现前列腺癌的敏感性。 + +PSMA,全称前列腺特异性膜抗原(Prostate-SpecificMembraneAntigen),是一种与前列腺癌密切相关的蛋白质。存在于前列腺上皮细胞的固有膜蛋白,在前列腺癌细胞表面强表达,在前列腺正常组织和非前列腺组织中表达量相对较低,表达量是正常前列腺细胞的100-1000倍,且与前列腺癌分级和分期呈正相关。这种强表达、高度特异性使得PSMA成为前列腺癌诊断和治疗的重要靶点。 +而PSMAPET/CT实际上是一种靶向显像,用放射性核素(常用68Ga、18F)标记PSMA配体作为示踪剂,通过静脉注入体内,经过分布代谢于病灶,然后用PET/CT进行扫描,即完成显像。借助PSMA的引导功能,将放射性核素更精准地聚集在前列腺癌细胞,结合正电子发射断层扫描(PET)和计算机断层扫描(CT),实现对前列腺癌的精准检测。 + + +fdgpet/ct和psmapet/ct检查就像"肿瘤侦探"使用不同的破案工具,各有所长又互为补充。fdg和psma是pet检查使用的两种不同显像剂,二者显像原理不同,因此追踪的“目标分子”不同。 +fdgpet/ct + 追踪目标:恶性肿瘤细胞消耗的葡萄糖(类似给恶性肿瘤细胞“测饭量”) + 原理:恶性肿瘤细胞生长、代谢旺盛,会大量摄取显像剂fdg(葡萄糖类似物),通过检测“高耗能区”定位肿瘤 + 优势:广谱肿瘤示踪剂,发展成熟、应用广泛,可反应肿瘤恶性程度,同时发现其他部位恶性肿瘤 + 局限性:肿瘤细胞数量少或处于低度恶性时,常常降低对葡萄糖摄取的需求,pet影像表现为低代谢,此时容易漏诊 + +psmapet/ct + 追踪目标:前列腺特异性膜抗原(前列腺癌细胞戴着的特殊“徽章”) + 原理:90%前列腺癌细胞表面戴着这种“徽章”,psma靠着追踪并粘住这种“徽章”精准锁定前列腺癌病灶,哪里亮起来,哪里就有肿瘤 + 优势:针对性强,能早期发现微小病灶,甚至在其他检查还正常时就预警 + 局限性:体内存在部分正常或病变细胞,同样具有psma蛋白高度表达的情况,如神经节、神经组织、肉芽肿性病变、肾癌、肺癌等,可能导致假阳性表现。此外,约10%的前列腺癌细胞没有佩戴这种“徽章”,导致漏诊 + + +PSMA-FDG-PET-CT-Lesion 
数据集指的是同时包含 PSMA-PET 和 FDG-PET(以及对应CT)两种扫描模态,并且带有病灶标注的医学影像数据集。 +这种数据集在前列腺癌研究中具有极高的价值,因为它允许研究者直接比较和分析同一患者体内不同病灶的分子表达特性。 +前列腺癌病灶在分子水平上具有异质性。并非所有病灶都表达相同的生物标志物。 + PSMA(前列腺特异性膜抗原):在大多数前列腺癌细胞表面过度表达,是前列腺癌相对特异的靶点。PSMA-PET用于检测前列腺癌特异性病灶。 + FDG(氟代脱氧葡萄糖):反映细胞的葡萄糖代谢活性。高度侵袭性、低分化的肿瘤通常具有很高的FDG摄取。 + + + +PSMA-FDG-PET/CT: + +https://autopet-iii.grand-challenge.org/ +"channel_names": { + "0": "CT", + "1": "CT"--PET + }, + "labels": { + "background": 0, + "tumor": 1 + }, +同一个病例同在000,001两个影像,分别表示CT,PET,合并到第四个维度作为SUB_MODALITY + +label: + 0:backgroud 1: tumor + +FDG-元数据信息 + 'Series UID', 'Collection', '3rd Party Analysis', + 'Data Description URI', 'Subject ID', 'Study UID', 'Study Description', + 'Study Date', 'Series Description', 'Manufacturer', 'Modality', + 'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size', + 'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex' + 通过Subject ID,以及Modality共同确定唯一的描述信息,获取相应的,Study Description,Study Date,Series Description, Manufacturer,diagnosis, age, sex信息;【只获取CT模态的一行描述信息即可】 +FDG文件名组成:fdg_b2f82ed4b9_04-17-2003-NA-PET-CT Ganzkoerper primaer mit KM-26753_[0000].nii.gz + Subject ID[PETCT_b2f82ed4b9] && Modality[CT] + + +PSMA-元数据信息 + 'Subject ID', 'Study Date', 'age', 'manufacturer_model_name', + 'pet_radionuclide', 'ct_contrast_agent' + 需要依靠'Subject ID', 'Study Date'共同确定唯一,存在相同的subject_id不同时间的样例--作为单独数据处理, +PSMA文件名组成:psma_d5b636ea4da7638b_2019-03-15_[0000].nii.gz + Subject ID[psma_d5b636ea4da7638b]&&Study Date[2019-03-15] + +综上:将id定义为subject_id+study_date 共同标识唯一的ID + +处理流程: + 1.查找所有的ID; + 2.根据ID查找对应的两个channel的影像以及对应的label; + 3.对两个channel的影像进行合并转4D; + 4.按照4D图像处理的惯例(第四个维度不参与计算,取前3个的spaceing最小值)重采样插值;--label + 5.保存 + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +import shutil +##dataset_meta +meta_id_name='BraTS_2019_subject_ID' +meta_grade_name='Grade' + 
def load_dicom_tag(imgs):
    """Read a single image file with SimpleITK and return the result.

    Args:
        imgs: Path of the file handed to ``ImageFileReader.SetFileName``.

    Returns:
        The object returned by ``reader.Execute()``.

    NOTE(review): ``ReadImageInformation()`` is called first with the intent
    of reading only header information, but ``Execute()`` is still invoked
    afterwards, which presumably performs a full read including pixel data.
    Confirm whether callers need the image or only its metadata.
    """
    reader = sitk.ImageFileReader()
    # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileName(imgs)
    reader.ReadImageInformation() # read meta information only, do not load pixel data
    # metadata_keys = reader.GetMetaDataKeys()
    tag=reader.Execute()
    return tag
def get_filename_list(fp):
    """Return the case names listed in the first fold of a splits JSON file.

    The file is expected to hold a JSON list whose first element is a
    mapping with ``'train'`` and ``'val'`` lists; only that first element
    is read.

    Args:
        fp: Path to the splits JSON file.

    Returns:
        A new list: the ``'train'`` names followed by the ``'val'`` names.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError, IndexError, TypeError: If the expected structure is absent.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does
    # not depend on the platform's default locale encoding.
    with open(fp, 'r', encoding='utf-8') as fi:
        fls = json.load(fi)
    first_fold = fls[0]
    filename_list = first_fold['train'] + first_fold['val']

    return filename_list
json_file) + psma_meta_file=os.path.join(target_path,psma_meta) + fdg_meta_file=os.path.join(target_path,fdg_meta) + + filename_file=os.path.join(target_path,filename_meta) + + pdf_meta=pd.read_csv(psma_meta_file) + fdf_meta=pd.read_csv(fdg_meta_file) + + fp_names=get_filename_list(filename_file) + ##从辅助文件信息中获取所有1614个病例名称,每个病例名称存在0000,0001两个三维影像数据,按照顺序合并; + if pid_dirs: + for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"): + for fp_name in tqdm(fp_names, desc="Processing all dataset"): + + ct_fp=os.path.join(target_path,pid_dir,fp_name+"_0000.nii.gz") + pet_fp=os.path.join(target_path,pid_dir,fp_name+"_0001.nii.gz") + label_fp=os.path.join(target_path,'labelsTr',fp_name+".nii.gz") + + modality="CT" + study='PSMA-FDG-PET-CT-LESION'##Dataset_name + CIA_other_info = {'metadata_file':''} + CIA_other_info['split'] = "train" + + + if fp_name.startswith("fdg"): + CIA_other_info['metadata_file']=fdg_meta_file + df_meta=fdf_meta + sid,sdate=check_fname(fp_name) + study_id=sid.replace("fdg","PETCT") + data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Modality']=='CT')] + data_info_row=data_info_row.reset_index() + for keyname in FDG_META_COLUMN: + CIA_other_info[keyname]=str(data_info_row[keyname][0]) + + CIA_other_info['Image_id']=sid+"_"+sdate + CIA_other_info['patientid']=sid + CIA_other_info['datetime']=sdate + + else: + CIA_other_info['metadata_file']=psma_meta_file + df_meta=pdf_meta + sid,sdate=check_fname(fp_name) + study_id=sid.replace("psma","PSMA") + # print('>>',study_id,sdate) + data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Study Date']==sdate)] + data_info_row=data_info_row.reset_index() + # print(data_info_row.columns) + for keyname in PSMA_META_COLUMN: + print(keyname) + print(data_info_row[keyname][0]) + CIA_other_info[keyname]=str(data_info_row[keyname][0]) + + CIA_other_info['Image_id']=sid+"_"+sdate + CIA_other_info['patientid']=sid + CIA_other_info['datetime']=sdate + + + + + + + + 
series_files=[ct_fp,pet_fp] + sub_modality=['CT','PET'] + # if len(series_files)>0: + # ##存在有效的MRI影像数据进行后续处理 + # sitk_img_original=merge_images(series_files) + + + for fpidex,fp in enumerate(series_files): + + try: + sitk_img_original=util.load_nifti(fp) + + original_spacing = list(sitk_img_original.GetSpacing()) + original_size = list(sitk_img_original.GetSize()) + + + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',spacing=original_spacing, size=original_size) + + sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + ##CLAMP PET_CT + + sitk_img_processed = util.clamp_image(sitk_img_processed, CLAMP_RANGE_CT) + + + output_path = os.path.join(output_dir,sid+"_"+sdate,sid+"_"+sdate+"_%s.nii.gz"%sub_modality[fpidex]) + # output_path=convert_windows_to_linux_path(output_path) + save_nifti(sitk_img_processed, output_path, os.path.dirname(fp)) + print(f"Saved NIfTI file to {output_path}") + + + size_processed = list(sitk_img_processed.GetSize()) + print('size_processed',size_processed,original_size) + + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(original_spacing))##保留前三个x,y,z的最小spacing + meta.add_keyvalue('OriImg_path',fp) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',sub_modality[fpidex]) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','whole-body') + + + # sub_modality_dict={} + # for idx,value in enumerate(sub_modality): + # if value: + # sub_modality_dict[str(idx)]=SUB_MODALITY[idx] + + # meta.add_keyvalue('Sub_modality',sub_modality_dict) + + meta.add_keyvalue('Label_Dict',LABEL_DICT) + + + ##Label processing + + label_path_dict={} + full_label_file=label_fp + full_path_label=os.path.dirname(full_label_file) + process_label_path=os.path.join(output_dir,sid+"_"+sdate,'segmentation') + + processed_lbl_full_path=os.path.join(process_label_path, "%s_%s.nii.gz"%(sid+"_"+sdate,sub_modality[fpidex])) + + if not 
os.path.isdir(process_label_path): + os.makedirs(process_label_path,exist_ok=True) + + if not os.path.isfile(full_label_file): + pass + label_flag=False + else: + sitk_lbl_original = util.load_nifti(full_label_file) + + if sitk_lbl_original: + resampler =util.get_unisize_resampler(sitk_lbl_original, interpolator='nearest', spacing=original_spacing, size=original_size) + if resampler is not None: + proces_label = resampler.Execute(sitk_lbl_original) + else: + proces_label = sitk_lbl_original + + + try: + assert sitk_img_processed.GetSize() == proces_label.GetSize() + except Exception as e: + failed_files.append(full_path_label) + continue + + + + util.save_nifti(proces_label, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save original + print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}") + + label_path_dict['tumor'] = processed_lbl_full_path + print(label_path_dict.keys()) + meta.add_keyvalue('Task',TASK_VALUE) + # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys())) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + meta.add_keyvalue('Label_Dict',LABEL_DICT) + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_path] = meta.get_meta_data() + json_file.seek(0) + # print(existing_mappings) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + # else: + except Exception as e: + print(e) + failed_files.append(fp) + print(f"Failed to load PSMA images from {fp}") + continue + + + # print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and 
save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/psma-fdg-pet-ct-lesion/") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/V2/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + diff --git a/PSMA_clean/dataclean_PSMA_petct_v2_json.py b/PSMA_clean/dataclean_PSMA_petct_v2_json.py new file mode 100644 index 0000000000000000000000000000000000000000..ec9ec3c82f27b92c7013af2984f63c468e7ef1e0 --- /dev/null +++ b/PSMA_clean/dataclean_PSMA_petct_v2_json.py @@ -0,0 +1,425 @@ +#coding:utf-8 +''' +writebyygq +createon2025-08-30 +PSMAPET/CT本质上也是一种PET/CT,只是它的示踪剂和传统的18F-FDG不同,目前国际上应用较多的PSMAPET/CT的示踪剂是68GA-PSMA、18F-PSMA,其中68GA及18F是一种放射性核素,具有成像功能,PSMA是前列腺特异性膜抗原,具有引导功能,引导PSMA更准确地向前列腺癌细胞聚拢,这样就大大增加了PSMAPET/CT用于发现前列腺癌的敏感性。 + +PSMA,全称前列腺特异性膜抗原(Prostate-SpecificMembraneAntigen),是一种与前列腺癌密切相关的蛋白质。存在于前列腺上皮细胞的固有膜蛋白,在前列腺癌细胞表面强表达,在前列腺正常组织和非前列腺组织中表达量相对较低,表达量是正常前列腺细胞的100-1000倍,且与前列腺癌分级和分期呈正相关。这种强表达、高度特异性使得PSMA成为前列腺癌诊断和治疗的重要靶点。 +而PSMAPET/CT实际上是一种靶向显像,用放射性核素(常用68Ga、18F)标记PSMA配体作为示踪剂,通过静脉注入体内,经过分布代谢于病灶,然后用PET/CT进行扫描,即完成显像。借助PSMA的引导功能,将放射性核素更精准地聚集在前列腺癌细胞,结合正电子发射断层扫描(PET)和计算机断层扫描(CT),实现对前列腺癌的精准检测。 + + +fdgpet/ct和psmapet/ct检查就像"肿瘤侦探"使用不同的破案工具,各有所长又互为补充。fdg和psma是pet检查使用的两种不同显像剂,二者显像原理不同,因此追踪的“目标分子”不同。 +fdgpet/ct + 追踪目标:恶性肿瘤细胞消耗的葡萄糖(类似给恶性肿瘤细胞“测饭量”) + 原理:恶性肿瘤细胞生长、代谢旺盛,会大量摄取显像剂fdg(葡萄糖类似物),通过检测“高耗能区”定位肿瘤 + 优势:广谱肿瘤示踪剂,发展成熟、应用广泛,可反应肿瘤恶性程度,同时发现其他部位恶性肿瘤 + 局限性:肿瘤细胞数量少或处于低度恶性时,常常降低对葡萄糖摄取的需求,pet影像表现为低代谢,此时容易漏诊 + +psmapet/ct + 追踪目标:前列腺特异性膜抗原(前列腺癌细胞戴着的特殊“徽章”) + 原理:90%前列腺癌细胞表面戴着这种“徽章”,psma靠着追踪并粘住这种“徽章”精准锁定前列腺癌病灶,哪里亮起来,哪里就有肿瘤 + 优势:针对性强,能早期发现微小病灶,甚至在其他检查还正常时就预警 + 
局限性:体内存在部分正常或病变细胞,同样具有psma蛋白高度表达的情况,如神经节、神经组织、肉芽肿性病变、肾癌、肺癌等,可能导致假阳性表现。此外,约10%的前列腺癌细胞没有佩戴这种“徽章”,导致漏诊 + + +PSMA-FDG-PET-CT-Lesion 数据集指的是同时包含 PSMA-PET 和 FDG-PET(以及对应CT)两种扫描模态,并且带有病灶标注的医学影像数据集。 +这种数据集在前列腺癌研究中具有极高的价值,因为它允许研究者直接比较和分析同一患者体内不同病灶的分子表达特性。 +前列腺癌病灶在分子水平上具有异质性。并非所有病灶都表达相同的生物标志物。 + PSMA(前列腺特异性膜抗原):在大多数前列腺癌细胞表面过度表达,是前列腺癌相对特异的靶点。PSMA-PET用于检测前列腺癌特异性病灶。 + FDG(氟代脱氧葡萄糖):反映细胞的葡萄糖代谢活性。高度侵袭性、低分化的肿瘤通常具有很高的FDG摄取。 + + + +PSMA-FDG-PET/CT: + +https://autopet-iii.grand-challenge.org/ +"channel_names": { + "0": "CT", + "1": "CT"--PET + }, + "labels": { + "background": 0, + "tumor": 1 + }, +同一个病例同在000,001两个影像,分别表示CT,PET,合并到第四个维度作为SUB_MODALITY + +label: + 0:backgroud 1: tumor + +FDG-元数据信息 + 'Series UID', 'Collection', '3rd Party Analysis', + 'Data Description URI', 'Subject ID', 'Study UID', 'Study Description', + 'Study Date', 'Series Description', 'Manufacturer', 'Modality', + 'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size', + 'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex' + 通过Subject ID,以及Modality共同确定唯一的描述信息,获取相应的,Study Description,Study Date,Series Description, Manufacturer,diagnosis, age, sex信息;【只获取CT模态的一行描述信息即可】 +FDG文件名组成:fdg_b2f82ed4b9_04-17-2003-NA-PET-CT Ganzkoerper primaer mit KM-26753_[0000].nii.gz + Subject ID[PETCT_b2f82ed4b9] && Modality[CT] + + +PSMA-元数据信息 + 'Subject ID', 'Study Date', 'age', 'manufacturer_model_name', + 'pet_radionuclide', 'ct_contrast_agent' + 需要依靠'Subject ID', 'Study Date'共同确定唯一,存在相同的subject_id不同时间的样例--作为单独数据处理, +PSMA文件名组成:psma_d5b636ea4da7638b_2019-03-15_[0000].nii.gz + Subject ID[psma_d5b636ea4da7638b]&&Study Date[2019-03-15] + +综上:将id定义为subject_id+study_date 共同标识唯一的ID + +处理流程: + 1.查找所有的ID; + 2.根据ID查找对应的两个channel的影像以及对应的label; + 3.对两个channel的影像进行合并转4D; + 4.按照4D图像处理的惯例(第四个维度不参与计算,取前3个的spaceing最小值)重采样插值;--label + 5.保存 + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# 
from bert_helper import * + +import shutil +##dataset_meta +meta_id_name='BraTS_2019_subject_ID' +meta_grade_name='Grade' + +##HGG_survival_info +survival_id_name='BraTS19ID' +meta_age_name='Age' +meta_survival_name='Survival' +meta_status_name='ResectionStatus' + + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-400,400] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... +TARGET_VOXEL_SPACING=None + +##参考MSD的sub_modality描述信息 +SUB_MODALITY=["CT","PET"] +##文件名对应的排序顺序 +SERIES_ORDER=["0000","0001"] + +LABEL_DICT={ + "0":"backgroud", + "1":"tumor", +} +PSMA_META_COLUMN=['Subject ID', 'Study Date', 'age', 'manufacturer_model_name','pet_radionuclide', 'ct_contrast_agent'] +FDG_META_COLUMN=['Subject ID', 'Study Description','Study Date', 'Series Description', 'Manufacturer', 'Modality','diagnosis', 'age', 'sex'] +# def find_metadata_files(path): +# # for Cancer Image Archive (TCIA) dataset +# search_pattern = os.path.join(path, '**', 'metadata.csv') +# return glob.glob(search_pattern, recursive=True) + +def find_metadata_files(path): + # for Cancer Image Archive (TCIA) dataset + search_pattern = os.path.join(path, '*.csv') + return glob.glob(search_pattern, recursive=True) +##added by yanguoqing on 20250527 +def find_image_dirs(path): + return os.listdir(path) + +##modify by yanguoqing on 20250527 +def load_dicom_images(folder_path): + reader = sitk.ImageSeriesReader() + dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileNames(dicom_names) + image = reader.Execute() + return dicom_names,image + +##added by yanguoqing on 20250527 +def load_dicom_tag(imgs): + reader = sitk.ImageFileReader() + # dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileName(imgs) + reader.ReadImageInformation() # 仅读取元信息,不加载像素数据 + # metadata_keys = reader.GetMetaDataKeys() + tag=reader.Execute() + return tag + +def load_nrrd(fp): + return sitk.ReadImage(fp) + +##modify by yanguoqing on 20250830 +def merge_images(series_files): + ''' + 
每个病例包含两种不同序列的 CT:CT/PET--0000/0001 + 将多个分开的模态合并,构建第四个维度的数组,分别按照CT,PET顺序存放 + ''' + reader = sitk.ImageSeriesReader() + reader.SetFileNames(series_files) + image = reader.Execute() + return image + +def save_nifti(image, output_path, folder_path): + # Set metadata in the NIfTI file's header + output_dirpath = os.path.dirname(output_path) + if not os.path.exists(output_dirpath): + print(f"Creating directory {output_dirpath}") + os.makedirs(output_dirpath) + # Set metadata in the NIfTI file's header + image.SetMetaData("FolderPath", folder_path) + sitk.WriteImage(image, output_path) + +##modify by yanguoqing on 20250527 +def convert_windows_to_linux_path(windows_path): + # Replace backslashes with forward slashes and remove the drive letter + # Some meta files have windows paths, but the data is stored on a linux server + linux_path = windows_path.replace('\\', '/') + if ':' in linux_path: + linux_path = linux_path.split(':', 1)[1] + return linux_path +##added by yanguoqing on 2025-08-30 +##获取PSMA-PET-CT的1614个数据名称 +def get_filename_list(fp): + with open(fp,'r') as fi: + fls=json.load(fi) + filename_list=fls[0]['train']+fls[0]['val'] + + return filename_list +##获取study_id以及study_date +def check_fname(fname): + if fname.startswith("fdg"): + sid=fname[:14] + sdate=fname[15:25] + sdate=sdate.split("-") + sdate=sdate[-1]+"-"+sdate[0]+"-"+sdate[1] + else: + sid=fname[:21] + sdate=fname[22:] + return sid,sdate +def main(target_path, output_dir): + # metadata_files = find_metadata_files(target_path) + # pid_dirs=find_image_dirs(target_path) + fdg_meta="fdg_metadata.csv" + psma_meta="psma_metadata.csv" + filename_meta="splits_final.json" ##包含所有1614个数据的名称列表信息 + # pid_dirs=["imagesTr","labelsTr"] + pid_dirs=["imagesTr"] + failed_files = [] + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + json_output_path = os.path.join(output_dir, 'nifti_mappings.json') + failed_files_path = os.path.join(output_dir, 'failed_files.json') + meta = meta_data() + + # Initialize the 
JSON file + if not os.path.exists(json_output_path): + with open(json_output_path, 'w') as json_file: + json.dump({}, json_file) + psma_meta_file=os.path.join(target_path,psma_meta) + fdg_meta_file=os.path.join(target_path,fdg_meta) + + filename_file=os.path.join(target_path,filename_meta) + + pdf_meta=pd.read_csv(psma_meta_file) + fdf_meta=pd.read_csv(fdg_meta_file) + + fp_names=get_filename_list(filename_file) + ##从辅助文件信息中获取所有1614个病例名称,每个病例名称存在0000,0001两个三维影像数据,按照顺序合并; + if pid_dirs: + for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"): + for fp_name in tqdm(fp_names, desc="Processing all dataset"): + + ct_fp=os.path.join(target_path,pid_dir,fp_name+"_0000.nii.gz") + pet_fp=os.path.join(target_path,pid_dir,fp_name+"_0001.nii.gz") + label_fp=os.path.join(target_path,'labelsTr',fp_name+".nii.gz") + + modality="CT" + study='PSMA-FDG-PET-CT-LESION'##Dataset_name + CIA_other_info = {'metadata_file':''} + CIA_other_info['split'] = "train" + + + if fp_name.startswith("fdg"): + CIA_other_info['metadata_file']=fdg_meta_file + df_meta=fdf_meta + sid,sdate=check_fname(fp_name) + study_id=sid.replace("fdg","PETCT") + data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Modality']=='CT')] + data_info_row=data_info_row.reset_index() + for keyname in FDG_META_COLUMN: + CIA_other_info[keyname]=str(data_info_row[keyname][0]) + + CIA_other_info['Image_id']=sid+"_"+sdate + CIA_other_info['patientid']=sid + CIA_other_info['datetime']=sdate + + else: + CIA_other_info['metadata_file']=psma_meta_file + df_meta=pdf_meta + sid,sdate=check_fname(fp_name) + study_id=sid.replace("psma","PSMA") + # print('>>',study_id,sdate) + data_info_row=df_meta[np.logical_and(df_meta['Subject ID']==study_id,df_meta['Study Date']==sdate)] + data_info_row=data_info_row.reset_index() + # print(data_info_row.columns) + for keyname in PSMA_META_COLUMN: + try: + print(keyname) + print(data_info_row[keyname][0]) + CIA_other_info[keyname]=str(data_info_row[keyname][0]) + except 
Exception as e: + continue + CIA_other_info['Image_id']=sid+"_"+sdate + CIA_other_info['patientid']=sid + CIA_other_info['datetime']=sdate + + + + + + + + series_files=[ct_fp,pet_fp] + sub_modality=['CT','PET'] + # if len(series_files)>0: + # ##存在有效的MRI影像数据进行后续处理 + # sitk_img_original=merge_images(series_files) + + + for fpidex,fp in enumerate(series_files): + + try: + sitk_img_original=util.load_nifti(fp) + + original_spacing = list(sitk_img_original.GetSpacing()) + original_size = list(sitk_img_original.GetSize()) + + + img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',spacing=original_spacing, size=original_size) + + sitk_img_processed = img_resampler_obj.Execute(sitk_img_original) + ##CLAMP PET_CT + + sitk_img_processed = util.clamp_image(sitk_img_processed, CLAMP_RANGE_CT) + + + output_path = os.path.join(output_dir,sid+"_"+sdate,sid+"_"+sdate+"_%s.nii.gz"%sub_modality[fpidex]) + # output_path=convert_windows_to_linux_path(output_path) + #save_nifti(sitk_img_processed, output_path, os.path.dirname(fp)) + print(f"Saved NIfTI file to {output_path}") + + + size_processed = list(sitk_img_processed.GetSize()) + print('size_processed',size_processed,original_size) + + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(original_spacing))##保留前三个x,y,z的最小spacing + meta.add_keyvalue('OriImg_path',fp) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',sub_modality[fpidex]) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','whole-body') + + + # sub_modality_dict={} + # for idx,value in enumerate(sub_modality): + # if value: + # sub_modality_dict[str(idx)]=SUB_MODALITY[idx] + + # meta.add_keyvalue('Sub_modality',sub_modality_dict) + + meta.add_keyvalue('Label_Dict',LABEL_DICT) + + + ##Label processing + + label_path_dict={} + full_label_file=label_fp + full_path_label=os.path.dirname(full_label_file) + 
process_label_path=os.path.join(output_dir,sid+"_"+sdate,'segmentation') + + processed_lbl_full_path=os.path.join(process_label_path, "%s_%s.nii.gz"%(sid+"_"+sdate,sub_modality[fpidex])) + + if not os.path.isdir(process_label_path): + os.makedirs(process_label_path,exist_ok=True) + + if not os.path.isfile(full_label_file): + pass + label_flag=False + else: + sitk_lbl_original = util.load_nifti(full_label_file) + + if sitk_lbl_original: + resampler =util.get_unisize_resampler(sitk_lbl_original, interpolator='nearest', spacing=original_spacing, size=original_size) + if resampler is not None: + proces_label = resampler.Execute(sitk_lbl_original) + else: + proces_label = sitk_lbl_original + + + try: + assert sitk_img_processed.GetSize() == proces_label.GetSize() + except Exception as e: + failed_files.append(full_path_label) + continue + + + + #util.save_nifti(proces_label, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save original + print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}") + + label_path_dict['tumor'] = processed_lbl_full_path + print(label_path_dict.keys()) + meta.add_keyvalue('Task',TASK_VALUE) + # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys())) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + meta.add_keyvalue('Label_Dict',LABEL_DICT) + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_path] = meta.get_meta_data() + json_file.seek(0) + # print(existing_mappings) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + # else: + except Exception as e: + print(e) + failed_files.append(fp) + print(f"Failed to load PSMA images from {fp}") + continue + + + # print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list 
has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/PSMA/psma-fdg-pet-ct-lesion/") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/V2/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + diff --git a/PSMA_clean/demo.py b/PSMA_clean/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..de9d79d73732e75a7447759facab8e1298428a0f --- /dev/null +++ b/PSMA_clean/demo.py @@ -0,0 +1,451 @@ +#coding:utf-8 +''' +writebyygq +createon2025-08-30 + + +BL = Baseline(基线) +FU = Follow-up(随访) + +1. Baseline (基线) + 含义:指的是在疾病初期、治疗前或某个特定时间点第一次拍摄的影像(如CT、MRI、X光)。 + 作用:这份影像作为评估病情严重程度和后续变化的“起跑线”或“参照物”。医生通过将未来的影像与基线影像进行比较,来判断病情的变化。 +2. 
Follow-up (随访) + 含义:指的是在基线影像之后,按计划或根据病情需要再次拍摄的影像。 + 作用:用于评估治疗效果(如肿瘤是否缩小)、监测疾病进展(如病灶是否增大或增多)、或观察术后恢复情况。 +“BL FU” 在报告中的应用场景: + 当放射科医生在报告中写下“BL FU”或“compare to BL FU”时,他们的意思是: + “本次的影像检查结果,需要与之前拍摄的基线影像进行对比,以评估变化。” + +例如: +肿瘤患者:一位肺癌患者在化疗前做了一次CT(作为基线BL),化疗2个周期后又做了一次CT(作为随访FU)。放射科医生会在新报告中将两次影像进行对比,并描述:“与20XX年X月X日的基线CT(BL FU) 相比,右肺下叶肿块明显缩小。” +慢性病患者:如肺炎、肝硬化、多发性硬化等需要长期监测的疾病,医生都会通过对比基线片和随访片来精确判断病情是好转、稳定还是恶化。 + +label: + 0:backgroud 1-N: tumor,其中具体多少数值需要读取对应json文件信息 + +编号ID:10位的16进制编号,每一个对应一个csv文件,对一个或多个BL和FU。。每个对应相应的json文件和mask标签文件-- +备注:CSV包含所有的label信息和编号,如果考虑按照tissue进行分别存储,可以考虑对mask文件结合csv/json信息进行提取相同的lesion_type分别存储label_dict +BL的以及对应的MASK都是inputsTr目录下面 +命名形式: + 93dd4de5cd_BL_img_BL_img_00.nii.gz + 93dd4de5cd_BL_mask_BL_img_00.nii.gz + 93dd4de5cd_BL_00.json + +FU在inputsTr目录下面,对应的mask在targetsTr力猛 +命名形式: + c6f057b865_FU_img_FU_img_00.nii.gz + c6f057b865_FU_mask_FU_img_00.nii.gz + c6f057b865_FU_img_FU_img_01.nii.gz + c6f057b865_FU_mask_FU_img_01.nii.gz + c6f057b865_FU_00.json + c6f057b865_FU_01.json + + +元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置 +lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type +1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung +2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node + +json格式样例 +{ + "name": "Points of interest", + "points": [ + { + "name": "1", + "point": [ + 84.9530896759608, + 273.525433308214, + 148.780708364732 + ] + }, + { + "name": "2", + "point": [ + 206.307026476578, + 258.39816700611, + 177.256619144603 + ] + } + ], + "type": "Multiple points", + "version": { + "major": 1, + "minor": 0 + } +} + +20251101补充增加,将病灶编号进行合并同类项目, +注意处理完成后保留原影像的几何空间信息以及元数据文件信息 + + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import 
json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +import shutil + + +##统一编码 +label_id_lut={'backgroud': 0, + 'Lymph node': 1, + 'Lung': 2, + 'Soft tissue / Skin': 3, + 'Liver': 4, + 'Skeleton': 5, + 'Adrenals': 6, + 'Spleen': 7, + 'CNS': 8, + 'Kidney': 9, + 'Heart': 10, + 'Others': 11, + 'unclear': 12, + } + + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... +TARGET_VOXEL_SPACING=None + +# ##参考MSD的sub_modality描述信息 +# SUB_MODALITY=["CT","PET"] +# ##文件名对应的排序顺序 +# SERIES_ORDER=["0000","0001"] + +##根据对应的json信息进行补充1-N的数值 +LABEL_DICT={ + "0":"backgroud", +} +META_COLUMN=['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated', 'cog_fu','img_id_fu', 'lesion_type'] + +# def find_metadata_files(path): +# # for Cancer Image Archive (TCIA) dataset +# search_pattern = os.path.join(path, '**', 'metadata.csv') +# return glob.glob(search_pattern, recursive=True) + +def find_metadata_files(path): + # for Cancer Image Archive (TCIA) dataset + search_pattern = os.path.join(path, '*.csv') + return glob.glob(search_pattern, recursive=True) +##added by yanguoqing on 20250527 +def find_image_dirs(path): + return os.listdir(path) + +##modify by yanguoqing on 20250527 +def load_dicom_images(folder_path): + reader = sitk.ImageSeriesReader() + dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileNames(dicom_names) + image = reader.Execute() + return dicom_names,image + +##added by yanguoqing on 20250527 +def load_dicom_tag(imgs): + reader = sitk.ImageFileReader() + # dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileName(imgs) + reader.ReadImageInformation() # 仅读取元信息,不加载像素数据 + # metadata_keys = reader.GetMetaDataKeys() + tag=reader.Execute() + return tag + +def load_nrrd(fp): + return sitk.ReadImage(fp) + +##modify by yanguoqing on 20250830 +def merge_images(series_files): + ''' + 每个病例包含两种不同序列的 
CT:CT/PET--0000/0001 + 将多个分开的模态合并,构建第四个维度的数组,分别按照CT,PET顺序存放 + ''' + reader = sitk.ImageSeriesReader() + reader.SetFileNames(series_files) + image = reader.Execute() + return image + +def save_nifti(image, output_path, folder_path): + # Set metadata in the NIfTI file's header + output_dirpath = os.path.dirname(output_path) + if not os.path.exists(output_dirpath): + print(f"Creating directory {output_dirpath}") + os.makedirs(output_dirpath) + # Set metadata in the NIfTI file's header + image.SetMetaData("FolderPath", folder_path) + sitk.WriteImage(image, output_path) + +##modify by yanguoqing on 20250527 +def convert_windows_to_linux_path(windows_path): + # Replace backslashes with forward slashes and remove the drive letter + # Some meta files have windows paths, but the data is stored on a linux server + linux_path = windows_path.replace('\\', '/') + if ':' in linux_path: + linux_path = linux_path.split(':', 1)[1] + return linux_path +##added by yanguoqing on 2025-08-31 +##根据csv文件返回的所有数据文件名称,获取所有数据id的 +def get_filename_list(fp_dir): + all_file_list=glob.glob("%s/*.csv"%fp_dir) + + + return all_file_list +##获取study_id以及study_date +def check_fname(fname): + if fname.startswith("fdg"): + sid=fname[:14] + sdate=fname[15:25] + else: + sid=fname[:21] + sdate=fname[22:] + return sid,sdate +def main(target_path, output_dir): + + pid_dirs=["inputsTr"] + failed_files = [] + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + json_output_path = os.path.join(output_dir, 'nifti_mappings.json') + failed_files_path = os.path.join(output_dir, 'failed_files.json') + meta = meta_data() + + # Initialize the JSON file + if not os.path.exists(json_output_path): + with open(json_output_path, 'w') as json_file: + json.dump({}, json_file) + + + input_dir=os.path.join(target_path,'inputsTr') + target_dir=os.path.join(target_path,'targetsTr') + + fp_files=get_filename_list(input_dir) + ##从辅助文件信息中获取所有1614个病例名称,每个病例名称存在0000,0001两个三维影像数据,按照顺序合并; + if pid_dirs: + for pid_dir in 
tqdm(pid_dirs, desc="Processing all dataset"): + for fp_file in tqdm(fp_files, desc="Processing all dataset"): + meta_file=fp_file + df_meta=pd.read_csv(meta_file) + + fp_name=os.path.basename(fp_file)[:-4] + ##依次查找BL以及FU的所有影像以及对应的mask + for sub_mod in ['BL','FU']: + + bl_fps=glob.glob("%s/%s_%s*.json"%(input_dir,fp_name,sub_mod)) + if len(bl_fps)>0: + for bl_fp in bl_fps: + basename=os.path.basename(bl_fp)[:-5] + bl_fp_name=os.path.basename(bl_fp).replace("_BL_","_BL_img_BL_img_").replace(".json",".nii.gz") + bl_fp_img=os.path.join(input_dir,bl_fp_name) + + if os.path.isfile(bl_fp_img): + ##判定存在进行正常处理 + + + bl_mask_name=os.path.basename(bl_fp).replace("_BL_","_BL_mask_BL_img_").replace(".json",".nii.gz") + + bl_fp_mask=os.path.join(input_dir,bl_mask_name) + if os.path.isfile(bl_fp_mask): + label_fp=bl_fp_mask + label_flag=True + else: + bl_fp_mask=os.path.join(target_dir,bl_mask_name) + if os.path.isfile(bl_fp_mask): + label_fp=bl_fp_mask + label_flag=True + else: + label_fp=None + label_flag=False + + + modality="CT" + study='PSMA_Longitudinal_CT'##Dataset_name + CIA_other_info = { + 'Image_id':basename, + 'metadata_file':'' + # 'Series_Description':serise_desc + } + CIA_other_info['split'] = "train" + + CIA_other_info['metadata_file']=meta_file + stk_image=util.load_nifti(bl_fp_img) + spacing_info = stk_image.GetSpacing() + size = list(stk_image.GetSize()) + resampler =util.get_unisize_resampler(stk_image, interpolator='linear', spacing=spacing_info, size=size) + if resampler is not None: + proces_image = resampler.Execute(stk_image) + print('SPACIE INFO AFTER', proces_image.GetSpacing()) + CIA_other_info['Resample'] = True + else: + proces_image = stk_image + CIA_other_info['Resample'] = False + + output_path = os.path.join(output_dir,fp_name, f"{basename}.nii.gz") + # output_path=convert_windows_to_linux_path(output_path) + save_nifti(proces_image, output_path, input_dir) + print(f"Saved NIfTI file to {output_path}") + + + + + if label_flag: + label_path_dict 
= {} + label_stk_img=util.load_nifti(label_fp) + + image_array = sitk.GetArrayFromImage(label_stk_img) + ##注意处理label的赋值并还原附带原始影像的基本信息,并重新赋值合并同类项 + with open(bl_fp,'r') as fi: + json_info=json.load(fi) + + label_dict={ + "0":"backgroud" + } + + update_image_array=np.copy(image_array) + ##获取合并同类项后的基本信息 + group_meta=df_meta.groupby('lesion_type')['lesion_id'] + for name,group in group_meta: + ##分组名称以及分组后的所有leision_id + ids=group_meta.get_group(name) + target_id=label_id_lut[name] + # ##取每个分组的最小leision_id赋值 + # ids_min=ids.min() + # label_dict[str(ids_min)]=name + label_dict[str(target_id)]=name + ##并对 + for v in ids.tolist(): + print(name,v,target_id) + update_image_array[image_array==v]=target_id + print(np.where(update_image_array==10)) + image_array=None + label_stk_img_update=sitk.GetImageFromArray(update_image_array) + label_stk_img_update.CopyInformation(label_stk_img) + # 手动复制所有元数据 + # 获取元数据键 + meta_keys = label_stk_img.GetMetaDataKeys() + for key in meta_keys: + value = label_stk_img.GetMetaData(key) + label_stk_img_update.SetMetaData(key, value) + + # for lesion_info in json_info['points']: + # df_row=df_meta['lesion_type'][df_meta['lesion_id']==int(lesion_info['name'])] + # df_row=df_row.reset_index() + # lesion_type=df_row['lesion_type'][0] + # label_dict[lesion_info['name']]=lesion_type + + resampler =util.get_unisize_resampler(label_stk_img_update, interpolator='nearest', spacing=spacing_info, size=size) + if resampler is not None: + proces_label = resampler.Execute(label_stk_img_update) + + ary_process_label=sitk.GetArrayFromImage(proces_label) + + if ary_process_label[-1,:,:].mean()==ary_process_label[-1,0,0] and ary_process_label[-1,0,0]>0: + print('momingqimiao',ary_process_label[-1,0,0]) + ary_process_label[-1,:,:]=0 + + label_stk_img_process=sitk.GetImageFromArray(ary_process_label) + label_stk_img_process.CopyInformation(proces_label) + meta_keys = proces_label.GetMetaDataKeys() + for key in meta_keys: + value = proces_label.GetMetaData(key) + 
label_stk_img_process.SetMetaData(key, value) + + + + else: + label_stk_img_process = label_stk_img_update + + # print(proces_image.GetSize(),proces_label.GetSize()) + try: + assert proces_image.GetSize() == label_stk_img_process.GetSize() + except Exception as e: + failed_files.append(label_fp) + continue + + label_output_path = os.path.join(output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz") + + label_path_dict['tumor'] = label_output_path + util.save_nifti(label_stk_img_process, label_output_path, label_fp) + print(f"Saved Label Segment NIfTI file to {label_output_path}") + + + + else: + continue + + + + + size_processed = list(proces_image.GetSize()) + print('size_processed',size_processed,size) + + # meta.add_keyvalue('Image_id',meta_image_id) + meta.add_keyvalue('Spacing_mm',min(spacing_info[:3]))##保留前三个x,y,z的最小spacing + meta.add_keyvalue('OriImg_path',bl_fp_img) + meta.add_keyvalue('Size',size_processed) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','whole-body') + + + if label_flag: + # print(label_path_dict.keys()) + meta.add_keyvalue('Task',TASK_VALUE) + # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys())) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + + meta.add_keyvalue('Label_Dict',label_dict) + + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + + + + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_path] = meta.get_meta_data() + json_file.seek(0) + # print(existing_mappings) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + # else: + # print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") 
+ +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/ygq/Data_Engineering/PSMA_clean/demo") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="//home/data/ygq/Data_Engineering/PSMA_clean/sample/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45.csv b/PSMA_clean/demo/inputsTr/9c838d2e45.csv new file mode 100644 index 0000000000000000000000000000000000000000..8dc7306ec110f906c17094db356be9a9910d41bb --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45.csv @@ -0,0 +1,11 @@ +lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type +1,173.652542372881 319.652542372881 579.387005649717,00,189.53247388079484 287.2536900264025 222.17010423686224,188.202349869452 289.144908616188 223.849869451697,00,Lymph node +2,281.307079646018 277.819469026549 501.061061946903,00,302.00463138533826 240.3165855091747 192.69462680091806,341.600891156759 241.744290345559 191.096086469827,00,Others +3,114.172695951766 270.197674418605 687.132213608958,00,130.76397678397964 228.70035929316566 266.4885911692181,130.707082371055 231.155119322556 268.416089299461,00,Lymph node +4,356.415238954013 301.567628494139 451.692064923354,00,376.87428084515284 263.45722698112957 174.4720429146538,381.853823088456 257.722888555722 174.580459770115,00,Others +5,208.050684931507 195.371232876712 444.935616438356,00,209.83459805091746 149.68124304981973 169.57017028347764,206.659624413146 144.776995305164 172.457746478873,00,Liver +6,164.686585470452 328.149644051805 690.955545072476,00,152.33141797244215 282.595521811701 123.90492553416982,145.783406214666 282.353392728214 124.015291457218,01,Lung 
+7,228.565467266367 187.678910544728 492.744877561219,00,237.67185465810823 144.09238609967332 190.15033170294646,235.767605633803 148.153923541247 191.01509054326,00,Liver +8,138.372302158273 304.519184652278 553.549560351719,00,109.44264837721721 246.66039498848312 73.57523491828391,106.450819672131 251.254098360656 74.091654247392,01,Lung +9,173.506048387097 306.191532258065 599.612903225806,00,189.86575749680912 272.81057573235324 230.3317260920997,191.972929936306 273.027070063694 230.562101910828,00,Lymph node +10,250.798805601318 219.040568369028 340.824341021417,00,264.02473463342807 186.36201422295846 126.59140444149511,260.173992673993 183.694139194139 125.826007326007,00,Soft tissue / Skin diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45_BL_00.json b/PSMA_clean/demo/inputsTr/9c838d2e45_BL_00.json new file mode 100644 index 0000000000000000000000000000000000000000..b759a029c9fcd420b3c526667b4ee354dc64908f --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45_BL_00.json @@ -0,0 +1,90 @@ +{ + "name": "Points of interest", + "points": [ + { + "name": "1", + "point": [ + 173.652542372881, + 319.652542372881, + 579.387005649717 + ] + }, + { + "name": "2", + "point": [ + 281.307079646018, + 277.819469026549, + 501.061061946903 + ] + }, + { + "name": "3", + "point": [ + 114.172695951766, + 270.197674418605, + 687.132213608958 + ] + }, + { + "name": "4", + "point": [ + 356.415238954013, + 301.567628494139, + 451.692064923354 + ] + }, + { + "name": "5", + "point": [ + 208.050684931507, + 195.371232876712, + 444.935616438356 + ] + }, + { + "name": "6", + "point": [ + 164.686585470452, + 328.149644051805, + 690.955545072476 + ] + }, + { + "name": "7", + "point": [ + 228.565467266367, + 187.678910544728, + 492.744877561219 + ] + }, + { + "name": "8", + "point": [ + 138.372302158273, + 304.519184652278, + 553.549560351719 + ] + }, + { + "name": "9", + "point": [ + 173.506048387097, + 306.191532258065, + 599.612903225806 + ] + }, + { + "name": "10", + "point": [ + 
250.798805601318, + 219.040568369028, + 340.824341021417 + ] + } + ], + "type": "Multiple points", + "version": { + "major": 1, + "minor": 0 + } +} \ No newline at end of file diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45_BL_img_BL_img_00.nii.gz b/PSMA_clean/demo/inputsTr/9c838d2e45_BL_img_BL_img_00.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..e42df3f4d889261a93043e99e297a7f01dbe7e70 --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45_BL_img_BL_img_00.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6d270fea41f22b006922b6dbe8ad63451f8fa44ed9d99bc29034b7a2431a1b +size 23855104 diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45_BL_mask_BL_img_00.nii.gz b/PSMA_clean/demo/inputsTr/9c838d2e45_BL_mask_BL_img_00.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..c2d508f363a8eefda7d8b428a3893638c7a765dd --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45_BL_mask_BL_img_00.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:316c5372a73b5d1bcb221494ae24a91b8f1c4ba405f1d271a264f77a622a7973 +size 259615 diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45_FU_00.json b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_00.json new file mode 100644 index 0000000000000000000000000000000000000000..4d8f14627b4e68c248830601fdee4eca45ecd834 --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_00.json @@ -0,0 +1,74 @@ +{ + "name": "Points of interest", + "points": [ + { + "name": "1", + "point": [ + 189.53247388079484, + 287.2536900264025, + 222.17010423686224 + ] + }, + { + "name": "2", + "point": [ + 302.00463138533826, + 240.3165855091747, + 192.69462680091806 + ] + }, + { + "name": "3", + "point": [ + 130.76397678397964, + 228.70035929316566, + 266.4885911692181 + ] + }, + { + "name": "4", + "point": [ + 376.87428084515284, + 263.45722698112957, + 174.4720429146538 + ] + }, + { + "name": "5", + "point": [ + 209.83459805091746, + 149.68124304981973, + 169.57017028347764 
+ ] + }, + { + "name": "7", + "point": [ + 237.67185465810823, + 144.09238609967332, + 190.15033170294646 + ] + }, + { + "name": "9", + "point": [ + 189.86575749680912, + 272.81057573235324, + 230.3317260920997 + ] + }, + { + "name": "10", + "point": [ + 264.02473463342807, + 186.36201422295846, + 126.59140444149511 + ] + } + ], + "type": "Multiple points", + "version": { + "major": 1, + "minor": 0 + } +} \ No newline at end of file diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45_FU_01.json b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_01.json new file mode 100644 index 0000000000000000000000000000000000000000..9b666291171114102423c1c8dd70cfeb1a93cf7b --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_01.json @@ -0,0 +1,26 @@ +{ + "name": "Points of interest", + "points": [ + { + "name": "6", + "point": [ + 152.33141797244215, + 282.595521811701, + 123.90492553416982 + ] + }, + { + "name": "8", + "point": [ + 109.44264837721721, + 246.66039498848312, + 73.57523491828391 + ] + } + ], + "type": "Multiple points", + "version": { + "major": 1, + "minor": 0 + } +} \ No newline at end of file diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_00.nii.gz b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_00.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..ae457a0c6d0891b761fcedff9971ac3de38a4aba --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_00.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a80f03265f11541538ede2ad76b3d7958e3c87f99899ebd2ee336db7ddcbf3e +size 23855104 diff --git a/PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_01.nii.gz b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_01.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..5a9ade1e7d559e97e60ccfd7255228434be9959e --- /dev/null +++ b/PSMA_clean/demo/inputsTr/9c838d2e45_FU_img_FU_img_01.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1b9e3c4feaf49e3007d738026fecd44f7bd1126c9450ae2458e8ff17f43f5b77 +size 23855104 diff --git a/PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_00.nii.gz b/PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_00.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..faf5121761ae51b4741ca6cd8f61f48ad7ddfd89 --- /dev/null +++ b/PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_00.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0892f4358c271c3737c0bf92970d2d93d1b4ba4ae7338c82a45ff4f8dba8fdb2 +size 106508 diff --git a/PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_01.nii.gz b/PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_01.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..e7cfdb93af5e01935f294d4eeca97f81574d2f87 --- /dev/null +++ b/PSMA_clean/demo/targetsTr/9c838d2e45_FU_mask_FU_img_01.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0852db3ee8137210401c145e64c129107ce21ece610ef29926f061b18d38aeb +size 44672 diff --git a/PSMA_clean/sample/9c838d2e45/9c838d2e45_BL_00.nii.gz b/PSMA_clean/sample/9c838d2e45/9c838d2e45_BL_00.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..c4612afa38755d34ce086816a865fa402c8060fb --- /dev/null +++ b/PSMA_clean/sample/9c838d2e45/9c838d2e45_BL_00.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f1678de5358e3d66a3421a3a19cac69e8fd766930c21f42a509c3d16c2dfbd0 +size 23855104 diff --git a/PSMA_clean/sample/9c838d2e45/segmentation/9c838d2e45_BL_00.nii.gz b/PSMA_clean/sample/9c838d2e45/segmentation/9c838d2e45_BL_00.nii.gz new file mode 100644 index 0000000000000000000000000000000000000000..152d10718ae620122c048e77d8f1a9e0c4790ab0 --- /dev/null +++ b/PSMA_clean/sample/9c838d2e45/segmentation/9c838d2e45_BL_00.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0629e3f73e040123553a005021789f59f82a70a1ea40a4a621f3ca24d538052c +size 282175 
diff --git a/PSMA_clean/sample/failed_files.json b/PSMA_clean/sample/failed_files.json new file mode 100644 index 0000000000000000000000000000000000000000..0637a088a01e8ddab3bf3fa98dbe804cbde1a0dc --- /dev/null +++ b/PSMA_clean/sample/failed_files.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/PSMA_clean/sample/nifti_mappings.json b/PSMA_clean/sample/nifti_mappings.json new file mode 100644 index 0000000000000000000000000000000000000000..7d272bf099800ec3d7c85b12bbeb7c648feed7e4 --- /dev/null +++ b/PSMA_clean/sample/nifti_mappings.json @@ -0,0 +1,33 @@ +{ + "//home/data/ygq/Data_Engineering/PSMA_clean/sample/9c838d2e45/9c838d2e45_BL_00.nii.gz": { + "Modality": "CT", + "OriImg_path": "/home/data/ygq/Data_Engineering/PSMA_clean/demo/inputsTr/9c838d2e45_BL_img_BL_img_00.nii.gz", + "Spacing_mm": 0.91796875, + "Size": [ + 512, + 512, + 1065 + ], + "Dataset_name": "PSMA_Longitudinal_CT", + "ROI": "whole-body", + "Label_path": { + "segmentation": { + "tumor": "//home/data/ygq/Data_Engineering/PSMA_clean/sample/9c838d2e45/segmentation/9c838d2e45_BL_00.nii.gz" + } + }, + "Label_Dict": { + "0": "backgroud", + "4": "Liver", + "2": "Lung", + "1": "Lymph node", + "11": "Others", + "3": "Soft tissue / Skin" + }, + "Metadata": { + "Image_id": "9c838d2e45_BL_00", + "metadata_file": "/home/data/ygq/Data_Engineering/PSMA_clean/demo/inputsTr/9c838d2e45.csv", + "split": "train", + "Resample": true + } + } +} \ No newline at end of file diff --git a/PSMA_clean/util.py b/PSMA_clean/util.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd221c91bfdc6ff61af486b7b09d0bad9c6deee --- /dev/null +++ b/PSMA_clean/util.py @@ -0,0 +1,410 @@ +import os +import json +import SimpleITK as sitk +import glob +import pandas as pd + +def load_dicom_images(folder_path): + reader = sitk.ImageSeriesReader() + dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileNames(dicom_names) + image = reader.Execute() + return image + +def 
def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None, default_value=0):
    '''
    Build a SimpleITK resampler that brings an image to isotropic spacing.

    Steps:
    1. Find the minimum spacing among the axes.
    2. Compute the output size so the physical extent is (approximately) kept.
    3. Set the interpolator (linear for images, nearest for segmentation masks).
    4. Set output spacing/origin/direction/pixel type from the reference image.
    5. Return the configured resampler.
    For example, if the input image has spacing [0.1, 0.1, 0.3], the output
    image will have spacing [0.1, 0.1, 0.1].

    Parameters:
        ref_img: reference SimpleITK image; supplies origin, direction and
            pixel type, plus size/spacing when those are not given.
        interpolator: 'linear', 'nearest', or a SimpleITK interpolator
            constant which is passed through unchanged.
        spacing: optional spacing overriding ref_img.GetSpacing().
        size: optional size overriding ref_img.GetSize().
        default_value: intensity written to output voxels that fall outside
            the input image (default 0).

    Returns:
        A configured sitk.ResampleImageFilter, or None when every axis
        already has the same spacing (nothing to resample).
    '''
    if size is None:
        size = ref_img.GetSize()
    if spacing is None:
        spacing = ref_img.GetSpacing()
    min_spacing = min(spacing)
    # Exact comparison on purpose: only perfectly isotropic inputs are skipped.
    if all(spc == min_spacing for spc in spacing):
        return None

    named_interpolators = {
        'nearest': sitk.sitkNearestNeighbor,
        'linear': sitk.sitkLinear,
    }
    # Unknown names / SimpleITK constants are forwarded as given.
    interpolator = named_interpolators.get(interpolator, interpolator)

    # Only the first three axes are resampled, even if more were supplied.
    new_size = [
        int(round(old_sz * old_spc / min_spacing))
        for old_sz, old_spc in zip(size, spacing)
    ][:3]
    new_spacing = [min_spacing] * len(new_size)

    resampler = sitk.ResampleImageFilter()
    resampler.SetSize(new_size)
    resampler.SetOutputSpacing(new_spacing)
    resampler.SetOutputOrigin(ref_img.GetOrigin())
    resampler.SetOutputDirection(ref_img.GetDirection())
    resampler.SetInterpolator(interpolator)
    # The previous code passed ref_img.GetPixelIDValue() here. That is the
    # pixel *type* enum (e.g. 8 for 32-bit float), not an intensity, so
    # out-of-bounds voxels were filled with a meaningless constant.
    resampler.SetDefaultPixelValue(default_value)
    resampler.SetOutputPixelType(ref_img.GetPixelID())
    return resampler
& leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'], + 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'], + 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'], + 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'], + 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'], + 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'], + 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'], + 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'], + 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'], + 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'], + 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 
'head and neck', 'head, neck', 'head/neck'], + 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'], + 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'], + 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'], + 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'], + 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'], + 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit''clavicle', 'scapula', 'acromion', 'acromioclavicular'], + 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'], + 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 
'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',], + 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'], + } + elif dict_type == 'Label_tissue': + dict_synonyms = { + 'liver': ['liver','hepatic'], + 'spleen': ['spleen','splenic'], + 'kidney': ['kidney','renal'], + 'pancreas': ['pancreas','pancreatic'], + 'stomach': ['stomach','gastric'], + 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'], + 'gallbladder': ['gallbladder'], + 'adrenal_gland': ['adrenal_gland','adrenal gland'], + 'bladder': ['bladder'], + 'prostate': ['prostate'], + 'uterus': ['uterus'], + 'ovary': ['ovary'], + 'testicle': ['testicle'], + 'lymph_node': ['lymph_node','lymph node'], + 'bone': ['bone'], + 'lung': ['lung'], + 'heart': ['heart'], + 'esophagus': ['esophagus'], + 'muscle': ['muscle'], + 'fat': ['fat'], + 'skin': ['skin'], + 'vessel': ['vessel'], + 'tumor': ['tumor'], + 'other': ['other'] + } + elif dict_type == 'Task': + dict_synonyms = { + 'segmentation': ['segmentation', 'seg', 'mask'], + 'classification': ['classification', 'class', 'diagnosis','identify','identification'], + 'localization': ['localization', 'locate', 'location', 'position'], + 'registration': ['registration', 'register', 'align', 'alignment'], + 'detection': ['detection', 'detect', 'find', 'locate'], + 'quantification': ['quantification', 'quantify', 'measure', 'measurement'], + } + elif dict_type == 'Modality': + dict_synonyms = { + 'CT': ['CT', 'computed tomography'], + 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'], + 'PET': ['PET', 'positron emission tomography'], + 'US': ['US', 'ultrasound'], + 'X-ray': ['X-ray', 'radiography'], + 'SPECT': ['SPECT', 'single-photon emission computed tomlogy'], + } + else: + raise ValueError(f"dict_type {dict_type} is not valid") + return dict_synonyms + 
def replace_synonyms(text, dict_synonyms):
    '''
    Replace synonyms in `text` with their standard term.

    Parameters:
        text: a string, a list, or a dict. Lists are processed element-wise;
            dicts have both their string keys and their values processed.
            Any other type is returned unchanged.
        dict_synonyms: mapping {standard_term: [synonym, ...]}.

    Returns:
        For a string: the first standard term (in dict order) one of whose
        synonyms occurs as a case-insensitive substring of `text`; the
        original string if nothing matches (a warning is issued).
        For a list / dict: a new list / dict with the replacements applied.
    '''
    import warnings

    if isinstance(text, str):
        lowered = text.lower()
        for standard, synonyms in dict_synonyms.items():
            if any(syn.lower() in lowered for syn in synonyms):
                return standard
        # Previously `Warning(...)` only built an exception object and threw
        # it away, so unmatched values went completely unnoticed.
        warnings.warn(f"Value {text} is not in the correct format", stacklevel=2)
        return text

    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]

    if isinstance(text, dict):
        # Build a fresh dict: the old code mutated `text` while iterating it
        # and used the synonym *list* as a key, which always raised.
        # If two keys normalise to the same standard term, the later one wins.
        return {
            (replace_synonyms(key, dict_synonyms) if isinstance(key, str) else key):
                replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }

    return text
full_key)) + elif isinstance(data, list): + for index, item in enumerate(data): + full_key = f"{parent_key}[{index}]" + keys_with_type.update(self.find_all_keys_with_type(item, full_key)) + return keys_with_type + + def flatten_json(self, data=None, parent_key='', sep='.'): + if data is None: + data = self.config_format + items = {} + if isinstance(data, dict): + for key, value in data.items(): + new_key = f"{parent_key}{sep}{key}" if parent_key else key + if isinstance(value, dict): + items.update(self.flatten_json(value, new_key, sep=sep)) + elif isinstance(value, list): + for i, item in enumerate(value): + items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep)) + else: + items[new_key] = value + elif isinstance(data, list): + for i, item in enumerate(data): + items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep)) + return items + + def req_check(self): + self.unfilled_keys = [] + for key in self.config.keys(): + if self.config[key] == {}: + self.unfilled_keys.append(key) + if len(self.unfilled_keys) == 0: + return True + else: + return False + + def type_check(self, key, value): + if key not in self.config_format.keys(): + print(key, "is not a valid key") + return False + + if key == 'Modality': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'OriImg_path': + if isinstance(value, str): + return True + else: + return False + + elif key == 'Label_path' and isinstance(value, dict): + for skey in value.keys(): + if skey in self.config_format[key]['keys']: + for kk in value[skey]: + if isinstance(value[skey][kk],str): + pass + # if kk in self.config_format[key]['value']['keys']: + # if isinstance(value[skey][kk],str): + # pass + # else: + # return False + else: + return False + return True + + elif key == 'ROI': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'Label_tissue' and isinstance(value, list): + for i in value: + if i 
not in self.config_format[key]['items']['options']: + return False + return True + + elif key =='Task' and isinstance(value, list): + for i in value: + if i not in self.config_format[key]['items']['options']: + return False + return True + + elif key == 'Spacing_mm': + if isinstance(value, float): + return True + else: + False + + # elif key == 'Size' and isinstance(value, list) and len(value) == 3 : + elif key == 'Size' and isinstance(value, list) and len(value) >= 3 : + return all(isinstance(item, int) for item in value) + + elif key == 'Dataset_name': + if isinstance(value, str): + return True + else: + return False + ##added by yanguoiqng on 2025-08-08 + elif key == 'Sub_modality': + + if isinstance(value, dict): + return True + else: + return False + elif key == 'Label_Dict': + + if isinstance(value, dict): + return True + else: + return False + def add_extra_keyvalue(self, key, value): + self.config[key] = value + return True + + def add_keyvalue(self, key, value): + if key in self.ambiguity_keys: + value = replace_synonyms(value, get_synonyms_dict(key)) + # print(key, value) + if self.type_check(key, value): + + self.config[key] = value + return True + else: + Warning(f"Value {value} is not in the correct format for key {key}") + pass + # print(f"Value {value} is not in the correct format for key {key}") + + def get_meta_data(self): + if self.req_check(): + return self.config + else: + print("Not all required keys are filled", self.unfilled_keys) + return False + + + +if __name__ == '__main__': + meta = meta_data() + print(meta.get_keytypes_flatten()) + print(meta.get_ketytypes()) + meta.add_keyvalue('Modality', 'CT') + meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT') + meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}}) + meta.add_keyvalue('Spacing_mm', 1.5) + meta.add_keyvalue('Size', [512, 512, 100]) + meta.add_keyvalue('Dataset_name', 'CT') + 
meta.add_keyvalue('Label_tissue', ['1', '2', '3']) + meta.add_keyvalue('Task', ['1', '2', '3']) + print(meta.get_meta_data()) + meta.add_extra_key('extra', 'extra') + print(meta.get_meta_data()) + print(meta.get_ketytypes()) + print(meta.get_keytypes_flatten) + + org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT' + img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation') + print(img_paths) \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed5609fb7585d4826494e32c2b467b6594a60181 --- /dev/null +++ b/README.md @@ -0,0 +1,99 @@ +--- +license: mit +tags: + - medical-imaging + - data-engineering + - preprocessing + - nifti + - dicom + - simpleitk +library_name: simpleitk +--- + +# Data_Engineering — Medical Imaging Cleanup Pipeline + +Standardize diverse medical imaging datasets (CT, MRI, PET) into a unified **NIfTI** format with consistent JSON metadata. Each subdirectory targets one dataset. + +> Companion repo to [`DRDMsig/Omini3D`](https://huggingface.co/DRDMsig/Omini3D) — produces the standardized data that OmniMorph trains on. 
+ +## Supported Datasets + +| Subdirectory | Dataset | Modality | +|---|---|---| +| `AbdomenAtlas/` | AbdomenAtlas | CT | +| `AbdomenCT1k/` | AbdomenCT-1K | CT | +| `brats2019_clean/` | BraTS 2019 | MRI (multi-sequence) | +| `brats2020_clean/` | BraTS 2020 | MRI (multi-sequence) | +| `brats2021_clean/` | BraTS 2021 | MRI (multi-sequence) | +| `kaggle_osic_clean/` | Kaggle OSIC Pulmonary Fibrosis | CT | +| `MnM2_clean/` | M&Ms-2 | Cardiac MRI | +| `MnMs_clean/` | M&Ms | Cardiac MRI | +| `OAISIS_clean/` | OASIS-1 / OASIS-2 | Brain MRI | +| `OAI_ZIB_clean/` | OAI-ZIB (knee) | MRI | +| `PSMA_clean/` | PSMA-FDG PET-CT (longitudinal) | PET + CT | +| `all/` | Cross-dataset utilities (artifact plane removal) | — | + +Each cleaned dataset writes: + +- Resampled & clamped `.nii.gz` images / segmentations +- Per-dataset `nifti_mappings.json` +- `failed_files.json` listing files the cleaner could not process + +## Repository Layout + +``` +_clean/ +├── dataclean_.py # main cleanup script (use highest version: _v2.py, _v3.py, ...) +├── util.py # shared helpers (copied per dir, not imported) +├── config_format.json # metadata schema for `meta_data` validation +└── (optional) sample/, demo/ # tiny example NIfTI files for sanity checks +``` + +## Usage + +```bash +cd AbdomenAtlas/ +python dataclean_abdomen_atlas_v2.py \ + --target_path /path/to/raw/AbdomenAtlas \ + --output_dir /path/to/output/AbdomenAtlas_clean +``` + +All scripts share the `--target_path` / `--output_dir` interface. Versioned scripts (`_v2.py`, `_v3.py`) supersede older versions; use the highest version unless investigating regressions. + +### Pipeline (per dataset) + +1. **Load** raw data (DICOM via `sitk.ImageSeriesReader`, NIfTI via `sitk.ReadImage`, NRRD). +2. **Extract metadata** from headers, CSV files, or DICOM tags. +3. **Resample** to isotropic spacing (`get_unisize_resampler` in `util.py`). +4. **Clamp intensities** — CT: `[-300, 300]` HU; MRI: per-dataset windows. +5. 
**Process segmentation labels** with identical resampling (nearest-neighbor). +6. **Validate** image/label dimensions agree (`assert image.GetSize() == label.GetSize()`). +7. **Write** standardized `.nii.gz` and append to `nifti_mappings.json`. + +### Shared `util.py` API + +| Function / class | Purpose | +|---|---| +| `meta_data` | Validates metadata against `config_format.json`; required fields: `Modality`, `OriImg_path`, `Spacing_mm`, `Size`, `Dataset_name`. Normalizes ambiguous terminology via synonym dictionaries. | +| `get_unisize_resampler(image)` | Builds a SimpleITK resampler for isotropic spacing; returns `None` if already isotropic. | +| `clamp_image(image, lo, hi)` | HU/intensity clamping via `sitk.ClampImageFilter`. | + +## Dependencies + +```bash +pip install SimpleITK pandas numpy tqdm openpyxl +``` + +(No `requirements.txt` — install manually.) + +## What's Included / Excluded + +- ✅ Cleanup scripts, `util.py`, `config_format.json`, demographic CSVs. +- ✅ A handful of tiny demo / sample `.nii.gz` files in `PSMA_clean/{sample,demo}/`. +- ❌ Raw datasets (download from each dataset's official source). +- ❌ Run logs from prior cleanup runs (`*.log`). +- ❌ Intermediate test outputs (`MnM2_clean/test/`). + +## License + +MIT — see project root. diff --git a/all/clean_artifact_planes.py b/all/clean_artifact_planes.py new file mode 100644 index 0000000000000000000000000000000000000000..f499ae671de8f1c442515ebd99b7be0d00be0fce --- /dev/null +++ b/all/clean_artifact_planes.py @@ -0,0 +1,268 @@ +""" +Detect and remove constant-value artifact planes at volume boundaries. + +Interpolation during preprocessing can introduce planes filled with a single +non-zero constant value (e.g. 8.0 for CT) at the start or end of each spatial +axis. This script: + + 1. Scans all .nii.gz files under --image_dir (and optionally --label_dir). + 2. 
def find_artifact_slices(arr, axis, max_search=20):
    """Count constant-value boundary planes along `axis`.

    Returns (n_start, n_end): how many contiguous planes to trim from the
    start and from the end of the axis.

    A boundary plane counts as an artifact when it holds exactly one unique
    value AND that value matches at most 1% of the voxels of a reference
    plane (the first non-constant plane found from that boundary). Constant
    planes whose value is common in the reference plane are kept.
    """
    length = arr.shape[axis]

    def plane_at(pos):
        return _get_plane(arr, axis, pos)

    def first_varying(positions, fallback):
        # First plane with more than one distinct value, else the fallback.
        return next(
            (pos for pos in positions if len(np.unique(plane_at(pos))) > 1),
            fallback,
        )

    def foreign_constant(pos, ref_pos):
        values = np.unique(plane_at(pos))
        if len(values) != 1:
            return False
        constant = float(values[0])
        # Share of reference voxels equal (within 1e-6) to the constant.
        share = np.mean(np.abs(plane_at(ref_pos) - constant) < 1e-6)
        return not (share > 0.01)

    def run_length(positions, ref_pos):
        # Number of consecutive artifact planes, walking inward from a boundary.
        count = 0
        for pos in positions:
            if not foreign_constant(pos, ref_pos):
                break
            count += 1
        return count

    head_ref = first_varying(range(0, min(max_search + 5, length), 1), 0)
    tail_ref = first_varying(
        range(length - 1, max(length - 1 - max_search - 5, -1), -1),
        length - 1,
    )

    n_start = run_length(range(min(max_search, length // 2)), head_ref)
    n_end = run_length(
        range(length - 1, max(length - 1 - max_search, length // 2), -1),
        tail_ref,
    )
    return n_start, n_end
+ """ + if arr.ndim == 3: + spatial_axes = [0, 1, 2] + elif arr.ndim == 4: + spatial_axes = [1, 2, 3] + else: + spatial_axes = list(range(arr.ndim)) + + crops = {} + for axis in spatial_axes: + n_start, n_end = find_artifact_slices(arr, axis, max_search=max_search) + if n_start > 0 or n_end > 0: + crops[axis] = (n_start, n_end) + return crops + + +def build_crop_slices(ndim, crops): + """Build a tuple of slices to crop the array according to `crops`.""" + slices = [slice(None)] * ndim + for axis, (n_start, n_end) in crops.items(): + end = None if n_end == 0 else -n_end + slices[axis] = slice(n_start, end) + return tuple(slices) + + +def crop_sitk_image(sitk_img, crops): + """Crop a SimpleITK image according to the detected artifact planes. + + Updates the origin so the cropped image occupies the correct physical space. + """ + arr = sitk.GetArrayFromImage(sitk_img) + crop_slices = build_crop_slices(arr.ndim, crops) + cropped_arr = arr[crop_slices] + + cropped_img = sitk.GetImageFromArray(cropped_arr) + cropped_img.SetSpacing(sitk_img.GetSpacing()) + cropped_img.SetDirection(sitk_img.GetDirection()) + + # Adjust origin: SimpleITK arrays are in ZYX order, origin is in XYZ + ndim_phys = sitk_img.GetDimension() # physical dimensions (3 for 3D, 4 for 4D) + origin = list(sitk_img.GetOrigin()) + spacing = list(sitk_img.GetSpacing()) + direction = np.array(sitk_img.GetDirection()).reshape(ndim_phys, ndim_phys) + + for axis, (n_start, _) in crops.items(): + if n_start > 0: + # Map array axis to physical axis + # SimpleITK: last array axis = first physical axis + if arr.ndim == 3: + phys_axis = 2 - axis + elif arr.ndim == 4: + phys_axis = 2 - (axis - 1) + else: + continue + if phys_axis < ndim_phys: + for i in range(min(3, ndim_phys)): + origin[i] += n_start * spacing[phys_axis] * direction[i, phys_axis] + + cropped_img.SetOrigin(origin) + + for key in sitk_img.GetMetaDataKeys(): + cropped_img.SetMetaData(key, sitk_img.GetMetaData(key)) + + return cropped_img + + +def 
main(): + parser = argparse.ArgumentParser(description="Detect and remove constant-value artifact planes at volume boundaries.") + parser.add_argument("--image_dir", type=str, required=True, + help="Directory containing .nii.gz image files.") + parser.add_argument("--label_dir", type=str, default=None, + help="Directory containing matching .nii.gz label files (same filenames). " + "In recursive mode, labels are found at {subject_dir}/segmentation/{filename}.") + parser.add_argument("--recursive", action="store_true", + help="Recursively search for .nii.gz files, excluding segmentation/ subdirs.") + parser.add_argument("--max_search", type=int, default=20, + help="Max number of boundary slices to check per side (default: 20).") + parser.add_argument("--dry-run", action="store_true", + help="Report artifacts without modifying any files.") + args = parser.parse_args() + + if args.recursive: + all_files = sorted(glob.glob(os.path.join(args.image_dir, "**", "*.nii.gz"), recursive=True)) + image_files = [f for f in all_files if '/segmentation/' not in f and '/label' not in f.lower()] + else: + image_files = sorted(glob.glob(os.path.join(args.image_dir, "*.nii.gz"))) + print(f"Found {len(image_files)} images in {args.image_dir}{' (recursive)' if args.recursive else ''}") + if args.label_dir: + print(f"Label dir: {args.label_dir}") + if args.dry_run: + print("*** DRY RUN — no files will be modified ***") + + total_artifacts = 0 + total_clean = 0 + total_slices_removed = 0 + + for img_path in tqdm(image_files, desc="Scanning"): + filename = os.path.basename(img_path) + sitk_img = sitk.ReadImage(img_path) + arr = sitk.GetArrayFromImage(sitk_img) + + crops = detect_artifacts(arr, max_search=args.max_search) + + if not crops: + total_clean += 1 + continue + + total_artifacts += 1 + slices_removed = sum(s + e for s, e in crops.values()) + total_slices_removed += slices_removed + + detail = ", ".join( + f"axis{ax}: -{s} start, -{e} end" + for ax, (s, e) in sorted(crops.items()) 
+ ) + + # Report the artifact value + for ax, (s, e) in crops.items(): + slc = [slice(None)] * arr.ndim + if s > 0: + slc[ax] = 0 + else: + slc[ax] = arr.shape[ax] - 1 + val = arr[tuple(slc)].flat[0] + break + print(f" {filename}: {arr.shape} -> trim {slices_removed} planes, val={val} ({detail})") + + if args.dry_run: + continue + + # Crop and save image + cropped_img = crop_sitk_image(sitk_img, crops) + sitk.WriteImage(cropped_img, img_path) + + # Crop matching label if present + if args.label_dir and not args.recursive: + label_path = os.path.join(args.label_dir, filename) + if os.path.isfile(label_path): + sitk_lbl = sitk.ReadImage(label_path) + cropped_lbl = crop_sitk_image(sitk_lbl, crops) + sitk.WriteImage(cropped_lbl, label_path) + elif args.recursive: + # In recursive mode, look for label at {parent}/segmentation/{filename} + parent_dir = os.path.dirname(img_path) + label_path = os.path.join(parent_dir, 'segmentation', filename) + if os.path.isfile(label_path): + sitk_lbl = sitk.ReadImage(label_path) + cropped_lbl = crop_sitk_image(sitk_lbl, crops) + sitk.WriteImage(cropped_lbl, label_path) + + print(f"\nSummary:") + print(f" Total images: {len(image_files)}") + print(f" With artifacts: {total_artifacts}") + print(f" Clean: {total_clean}") + print(f" Planes removed: {total_slices_removed}") + if args.dry_run: + print(" (dry-run — nothing was modified)") + + +if __name__ == "__main__": + main() diff --git a/brats2019_clean/config_format.json b/brats2019_clean/config_format.json new file mode 100644 index 0000000000000000000000000000000000000000..01e9febcb5f7b8c2946d49b246139faf6e8272b1 --- /dev/null +++ b/brats2019_clean/config_format.json @@ -0,0 +1,125 @@ +{ + "Modality": { + "type": "option", + "required": true, + "options": [ + "CT", + "MRI", + "T1", + "T2", + "X-ray", + "Fluoroscopy", + "US", + "PET" + ] + }, + "OriImg_path": { + "type": "string", + "required": true + }, + "Label_path": { + "type": "dict", + "required": false, + "keys": [ + 
"classification", + "segmentation", + "regression", + "detection", + "localization", + "registration", + "other" + ], + "value": { + "type": "dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/brats2019_clean/dataclean_BRATS_2019.py b/brats2019_clean/dataclean_BRATS_2019.py new file mode 100644 index 0000000000000000000000000000000000000000..d04da28949c125e739d416a578191dda450b7b26 --- /dev/null +++ b/brats2019_clean/dataclean_BRATS_2019.py @@ -0,0 +1,370 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-08-03 +update BRATS_2019 + +BRATS2019 是一个大规模、多模态、标注精良的脑胶质瘤 MRI 数据集,主要用于开发和评估脑肿瘤自动分割算法以及基于 MRI 的生存预测模型。 +它包含 335 例带标注的训练数据和大量未标注的验证/测试数据。其多模态特性(T1, T1Gd, T2, T2-FLAIR)和精细的肿瘤子区域标注(整个肿瘤、肿瘤核心、增强肿瘤) +训练集: 包含 335 例 患者的完整多模态 MRI 扫描数据及其对应的专家手动分割标注(Ground Truth) + +每个病例包含四种不同序列的 3D MRI 
扫描(均已进行预处理,如配准、重采样到 1mm³ 各向同性、颅骨剥离): + Native (T1): 标准的 T1 加权成像,显示解剖结构。 + Post-contrast T1-weighted (T1Gd/T1ce): 注射钆造影剂后的 T1 加权成像。造影剂会渗漏过被破坏的血脑屏障,在肿瘤活跃区域(如坏死核心的边缘)呈现强化。 + T2-weighted (T2): T2 加权成像,对水肿和囊性/坏死区域非常敏感,呈现高信号。 + T2 Fluid Attenuated Inversion Recovery (T2-FLAIR): 抑制了脑脊液信号的 T2 加权成像。特别擅长显示肿瘤周围的水肿区域(通常也包含浸润性肿瘤细胞),呈现高信号。 + +训练集提供了由专业医师手动精细勾画的肿瘤区域分割标注。 +标注定义了三个相互嵌套或重叠的子区域(反映了肿瘤的不同生物学特性): + 坏疽性和非增强肿瘤核心: 包括坏死区域(在 T1Gd 上无强化)和活跃肿瘤的非增强部分(在 T2-FLAIR 上高信号,但在 T1Gd 上不强化)。标签值 = 1。 + 瘤周水肿: 肿瘤周围的水肿区域(在 T2 和 T2-FLAIR 上呈高信号)。标签值 = 2。 + 增强肿瘤: 在 T1Gd 上呈现强化的区域(通常代表高度血管化的活跃肿瘤组织)。标签值 = 4。 + 整个肿瘤区域 由这三个区域组合而成(标签值 1+2+4)。 + 肿瘤核心区域 由坏疽性和非增强肿瘤核心 + 增强肿瘤组成(标签值 1+4) + + 对于训练集和验证集中的 HGG 患者,提供了患者的总生存期信息(从初次扫描到死亡或最后一次随访的时间) + + +根据沟通参考MSD中的BRATS的结构: + 1.将多个分开的模态合并,构建第四个维度的数组,分别按照FLAIR,T1,T1CE,T2顺序存放; + 2.生存期信息也需要相应补充道HGG的数据集中 + + +meta_info:[保留Grade,BraTS_2019_subject_ID] + Grade,BraTS_2017_subject_ID,BraTS_2018_subject_ID,TCGA_TCIA_subject_ID,BraTS_2019_subject_ID + HGG,Brats17_CBICA_AAB_1,Brats18_CBICA_AAB_1,NA,BraTS19_CBICA_AAB_1 + LGG,Brats17_TCIA_451_1,Brats18_TCIA09_451_1,TCGA-CS-4942,BraTS19_TCIA09_451_1 + +survival_info: + BraTS19ID,Age,Survival,ResectionStatus + BraTS19_CBICA_AAB_1,60.4630137,289,GTR + BraTS19_CBICA_AAG_1,52.2630137,616,GTR +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +import shutil +##dataset_meta +meta_id_name='BraTS_2019_subject_ID' +meta_grade_name='Grade' + +##HGG_survival_info +survival_id_name='BraTS19ID' +meta_age_name='Age' +meta_survival_name='Survival' +meta_status_name='ResectionStatus' + + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... 
+TARGET_VOXEL_SPACING=None + +##参考MSD的sub_modality描述信息 +SUB_MODALITY=["FLAIR","T1w","t1gd","T2w"] +##文件名对应的排序顺序 +SERIES_ORDER=["flair","t1","t1ce","t2"] + +LABEL_DICT={ + "0":"backgroud", + "1":"non-enhancing tumor", + "2":"edema", + "4":"enhancing tumour" +} +# def find_metadata_files(path): +# # for Cancer Image Archive (TCIA) dataset +# search_pattern = os.path.join(path, '**', 'metadata.csv') +# return glob.glob(search_pattern, recursive=True) + +def find_metadata_files(path): + # for Cancer Image Archive (TCIA) dataset + search_pattern = os.path.join(path, '*.csv') + return glob.glob(search_pattern, recursive=True) +##added by yanguoqing on 20250527 +def find_image_dirs(path): + return os.listdir(path) + +##modify by yanguoqing on 20250527 +def load_dicom_images(folder_path): + reader = sitk.ImageSeriesReader() + dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileNames(dicom_names) + image = reader.Execute() + return dicom_names,image + +##added by yanguoqing on 20250527 +def load_dicom_tag(imgs): + reader = sitk.ImageFileReader() + # dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileName(imgs) + reader.ReadImageInformation() # 仅读取元信息,不加载像素数据 + # metadata_keys = reader.GetMetaDataKeys() + tag=reader.Execute() + return tag + +def load_nrrd(fp): + return sitk.ReadImage(fp) + +##modify by yanguoqing on 20250805 +def load_brtas_images(series_files): + ''' + 每个病例包含四种不同序列的 3D MRI 扫描(均已进行预处理,如配准、重采样到 1mm³ 各向同性、颅骨剥离) + 将多个分开的模态合并,构建第四个维度的数组,分别按照FLAIR,T1,T1CE,T2顺序存放 + ''' + reader = sitk.ImageSeriesReader() + reader.SetFileNames(series_files) + image = reader.Execute() + return image + +def save_nifti(image, output_path, folder_path): + # Set metadata in the NIfTI file's header + output_dirpath = os.path.dirname(output_path) + if not os.path.exists(output_dirpath): + print(f"Creating directory {output_dirpath}") + os.makedirs(output_dirpath) + # Set metadata in the NIfTI file's header + image.SetMetaData("FolderPath", 
folder_path) + sitk.WriteImage(image, output_path) + +##modify by yanguoqing on 20250527 +def convert_windows_to_linux_path(windows_path): + # Replace backslashes with forward slashes and remove the drive letter + # Some meta files have windows paths, but the data is stored on a linux server + linux_path = windows_path.replace('\\', '/') + if ':' in linux_path: + linux_path = linux_path.split(':', 1)[1] + return linux_path + +def main(target_path, output_dir): + metadata_files = find_metadata_files(target_path) + pid_dirs=find_image_dirs(target_path) + pid_dirs=["HGG","LGG"] + failed_files = [] + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + json_output_path = os.path.join(output_dir, 'nifti_mappings.json') + failed_files_path = os.path.join(output_dir, 'failed_files.json') + meta = meta_data() + + # Initialize the JSON file + if not os.path.exists(json_output_path): + with open(json_output_path, 'w') as json_file: + json.dump({}, json_file) + meta_file=os.path.join(target_path,'name_mapping.csv') + survival_file=os.path.join(target_path,'survival_data.csv') + if os.path.isfile(meta_file): + mf_flag=True + df_meta=pd.read_csv(meta_file,sep=',') + else: + mf_flag=False + + if os.path.isfile(survival_file): + sf_flag=True + df_survial=pd.read_csv(survival_file,sep=',') + else: + sf_flag=False + + if pid_dirs: + for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"): + if not os.path.isdir(os.path.join(target_path,pid_dir)): + continue + ##HGG_FLAG + if pid_dir =="HGG": + tr_flag=True + else: + tr_flag=False + # label_flag=False + + ##遍历所有目录下的HGG/LGG的病例数据(影像+标注seg) + image_dirs=find_image_dirs(os.path.join(target_path,pid_dir)) + + for data_dir in tqdm(image_dirs, desc="Processing images files"): + full_path=os.path.join(target_path,pid_dir,data_dir) + + data_info_row=df_meta[df_meta[meta_id_name]==data_dir] + + if data_info_row.shape[0]>0: + data_info_row=data_info_row.reset_index() + #print(data_info_row[meta_id_name]) + 
meta_image_id=data_info_row[meta_id_name][0] + meta_image_grade=data_info_row[meta_grade_name][0] + + else: + meta_image_id=data_dir + meta_image_grade='' + + if tr_flag: + survival_file_row=df_survial[df_survial[survival_id_name]==data_dir] + if data_info_row.shape[0]>0: + survival_file_row=survival_file_row.reset_index() + #print(data_info_row[meta_id_name]) + meta_image_age=survival_file_row[meta_age_name][0] + meta_image_survival=survival_file_row[meta_survival_name][0] + meta_image_status=survival_file_row[meta_status_name][0] + + else: + meta_image_age='' + meta_image_survival='' + meta_image_status='' + else: + meta_image_age='' + meta_image_survival='' + meta_image_status='' + + + try: + ##读取MRI四组文件,按照flair,t1,t1ce,t2的顺序叠加,对于seg先剔除不参与 + + + series_files=[os.path.join(full_path,"%s_%s.nii"%(data_dir,sm))for sm in SERIES_ORDER] + ##判断是否每个sub_modality文件存在 + series_flag=[os.path.isfile(os.path.join(full_path,"%s_%s.nii"%(data_dir,sm)))for sm in SERIES_ORDER] + series_files=[series_files[index] for index, value in enumerate(series_flag) if value] + sub_modality=[SUB_MODALITY[index] for index, value in enumerate(series_flag) if value] + if len(series_files)>0: + ##存在有效的MRI影像数据进行后续处理 + sitk_img_original=load_brtas_images(series_files) + + else: + print("病例数据%s为空"%data_dir) + continue + + + original_spacing = list(sitk_img_original.GetSpacing()) + original_size = list(sitk_img_original.GetSize()) + + modality="MRI" + study='BRATS_2019'##Dataset_name + CIA_other_info = { + 'metadata_file':'' + + } + CIA_other_info['split'] = "train" + if mf_flag: + CIA_other_info['metadata_file']=meta_file + + + ## + CIA_other_info['Image_id']=meta_image_id + CIA_other_info['Grade']=meta_image_grade + CIA_other_info['Age']=str(meta_image_age) + CIA_other_info['Survival']=str(meta_image_survival) + CIA_other_info['ResectionStatus']=meta_image_status + + + meta.add_keyvalue('Spacing_mm',1.0) + meta.add_keyvalue('OriImg_path',",".join(series_files)) + 
meta.add_keyvalue('Size',original_size) # 这里用处理后的size -- YH Jachin + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','head') + + sub_modality_dict={} + for idx,value in enumerate(series_flag): + if value: + sub_modality_dict[str(idx)]=SUB_MODALITY[idx] + + meta.add_keyvalue('Sub_modality',sub_modality_dict) + + meta.add_keyvalue('Label_Dict',LABEL_DICT) + + output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz") + # output_path=convert_windows_to_linux_path(output_path) + ## + save_nifti(sitk_img_original, output_image_file, full_path) + print(f"Saved NIfTI file to {output_image_file}") + ##Label processing + + label_path_dict={} + full_label_file=os.path.join(full_path,"%s_seg.nii"%(data_dir)) + + process_label_path=os.path.join(output_dir,data_dir,'segmentation') + + processed_lbl_full_path=os.path.join(process_label_path, f"{data_dir}.nii.gz") + + if not os.path.isdir(process_label_path): + os.makedirs(process_label_path,exist_ok=True) + + if not os.path.isfile(full_label_file): + pass + label_flag=False + else: + sitk_lbl_original = util.load_nifti(full_label_file) + util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_label_file) # Save original + print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}") + + label_path_dict['brain'] = processed_lbl_full_path + label_flag=True + + if label_flag: + meta.add_keyvalue('Task',TASK_VALUE) + meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict}) + + + + # try: + # assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize() + # except Exception as e: + # failed_files.append(full_path_label) + # continue + print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize()) + + except Exception as e: + print(e) + failed_files.append(data_dir) + print(f"Failed to load BRATS images from {data_dir}") + continue + + + + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + # Write the mapping to the JSON file on 
the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_image_file] = meta.get_meta_data() + json_file.seek(0) + # print(existing_mappings) + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + # else: + # print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/BRATS/BRATS2019/MICCAI_BraTS_2019_Data_Training/") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2019") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + + diff --git a/brats2019_clean/util.py b/brats2019_clean/util.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd221c91bfdc6ff61af486b7b09d0bad9c6deee --- /dev/null +++ b/brats2019_clean/util.py @@ -0,0 +1,410 @@ +import os +import json +import SimpleITK as sitk +import glob +import pandas as pd + +def load_dicom_images(folder_path): + reader = sitk.ImageSeriesReader() + dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileNames(dicom_names) + image = reader.Execute() + return image + +def convert_windows_to_linux_path(windows_path): + # Replace backslashes with forward slashes and remove the drive letter + # Some meta files have windows paths, but the data is stored on a linux server + linux_path = windows_path.replace('\\', '/') + if ':' in 
linux_path: + linux_path = linux_path.split(':', 1)[1] + return linux_path + +# ============================================================================= +# ========================developed with TotalSegmentor======================== +# ============================================================================= + +def read_table(file_path, split_str=';'): + try: + df = pd.read_excel(file_path, engine='openpyxl') + except: + df = pd.read_csv(file_path, sep=split_str) + return df + +def load_nifti(image_path): + return sitk.ReadImage(image_path) + +def save_nifti(image, output_path, folder_path): + output_dirpath = os.path.dirname(output_path) + if not os.path.exists(output_dirpath): + print(f"Creating directory {output_dirpath}") + os.makedirs(output_dirpath) + # Set metadata in the NIfTI file's header + image.SetMetaData("FolderPath", folder_path) + sitk.WriteImage(image, output_path) + +def find_metadata_files(path, file_name='*meta*'): + # for TotalSegmentor dataset + search_pattern = os.path.join(path, '**', file_name) + return glob.glob(search_pattern, recursive=True) + +def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True): + img_path = [] + for root, dirs, files in os.walk(folder_path): + for file in files: + if file.endswith(img_type) and (include_str is None or include_str in file) and (exclude_str is None or exclude_str not in file): + img_path.append(os.path.join(root, file)) + if is_sorted: + img_path.sort() + return img_path + +def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None): + ''' + Resample the image to have isotropic spacing, following the steps: + 1. Find the minimum spacing + 2. Resample the image to have the minimum spacing + 3. Set the interpolator (linear for images, nearest for segmentation masks) + 4. Set the output spacing + 5. 
Return the resampler for resampling + For example, if the input image has spacing [0.1, 0.1, 0.3], the output image will have spacing [0.1, 0.1, 0.1] + ''' + # 讨论为什么重新写这个函数!!! + if size is None: + size = ref_img.GetSize() + if spacing is None: + spacing = ref_img.GetSpacing() + min_spacing = min(spacing) + if all([spc == min_spacing for spc in spacing]): + return None + else: + # if 1: + if interpolator == 'nearest': + interpolator = sitk.sitkNearestNeighbor + elif interpolator == 'linear': + interpolator = sitk.sitkLinear + resampler = sitk.ResampleImageFilter() + # new_spacing = [max_spacing] * len(spacing) + # print(size) + new_size = [int(round(old_sz * old_spc / min_spacing)) for old_sz, old_spc in zip(size, spacing)] + new_size_xy=[new_size[0],new_size[1],new_size[2]] + # 讨论为什么重新写这个函数!!! --- YHM Jachin + new_size_spacing=[min_spacing,min_spacing,min_spacing] + # 讨论为什么重新写这个函数!!! --- YHM Jachin + # resampler.SetSize(new_size) + # resampler.SetOutputSpacing([min_spacing] * len(spacing)) + resampler.SetSize(new_size_xy) + resampler.SetOutputSpacing(new_size_spacing) + + # print(new_size,new_size_xy) + resampler.SetOutputOrigin(ref_img.GetOrigin()) + resampler.SetOutputDirection(ref_img.GetDirection()) + resampler.SetInterpolator(interpolator) + resampler.SetDefaultPixelValue(ref_img.GetPixelIDValue()) + resampler.SetOutputPixelType(ref_img.GetPixelID()) + return resampler + +def clamp_image(in_img,clamp_range): + ''' + Clamp the image to the specified range + ''' + clamp_filter = sitk.ClampImageFilter() + clamp_filter.SetLowerBound(clamp_range[0]) + clamp_filter.SetUpperBound(clamp_range[1]) + return clamp_filter.Execute(in_img) + +def get_synonyms_dict(dict_type='ROI'): + ''' + Get the dictionary of synonyms for the specified dictionary type + ''' + if dict_type == 'ROI': + dict_synonyms = { + 'whole-body': ['whole-body', 'whole body', 'wholebody', 'whole body', 'whole-body', 'whole body', 
'wholebody','polytrauma','head-neck-thorax-abdomen-pelvis-leg','head-neck-thorax-abdomen-pelvis'], + 'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg','neck-thx-abd-pelvis-leg', 'angiography neck-thx-abd-pelvis-leg', 'neck thorax abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'], + 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'], + 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'], + 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'], + 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'], + 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'], + 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'], + 'abdomen-pelvis-leg': 
['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'], + 'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'], + 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'], + 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'], + 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'], + 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'], + 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'], + 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'], + 'head': ['head', 'headbasis', 'brain', 'skull', 
'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'], + 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'], + 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'], + 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit''clavicle', 'scapula', 'acromion', 'acromioclavicular'], + 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'], + 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',], + 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'], + } + elif dict_type == 'Label_tissue': + dict_synonyms = { + 'liver': ['liver','hepatic'], + 'spleen': ['spleen','splenic'], + 'kidney': ['kidney','renal'], + 'pancreas': ['pancreas','pancreatic'], + 'stomach': ['stomach','gastric'], + 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'], + 'gallbladder': ['gallbladder'], + 'adrenal_gland': ['adrenal_gland','adrenal gland'], + 'bladder': ['bladder'], + 'prostate': ['prostate'], + 'uterus': ['uterus'], + 'ovary': ['ovary'], + 'testicle': ['testicle'], + 'lymph_node': ['lymph_node','lymph node'], + 'bone': 
['bone'], + 'lung': ['lung'], + 'heart': ['heart'], + 'esophagus': ['esophagus'], + 'muscle': ['muscle'], + 'fat': ['fat'], + 'skin': ['skin'], + 'vessel': ['vessel'], + 'tumor': ['tumor'], + 'other': ['other'] + } + elif dict_type == 'Task': + dict_synonyms = { + 'segmentation': ['segmentation', 'seg', 'mask'], + 'classification': ['classification', 'class', 'diagnosis','identify','identification'], + 'localization': ['localization', 'locate', 'location', 'position'], + 'registration': ['registration', 'register', 'align', 'alignment'], + 'detection': ['detection', 'detect', 'find', 'locate'], + 'quantification': ['quantification', 'quantify', 'measure', 'measurement'], + } + elif dict_type == 'Modality': + dict_synonyms = { + 'CT': ['CT', 'computed tomography'], + 'MRI': ['MRI', 'MR', 'magnetic resonance imaging'], + 'PET': ['PET', 'positron emission tomography'], + 'US': ['US', 'ultrasound'], + 'X-ray': ['X-ray', 'radiography'], + 'SPECT': ['SPECT', 'single-photon emission computed tomlogy'], + } + else: + raise ValueError(f"dict_type {dict_type} is not valid") + return dict_synonyms + +def replace_synonyms(text, dict_synonyms): + ''' + Replace the synonyms in the text with the standard term + ''' + if isinstance(text,str): + for key, value in dict_synonyms.items(): + for v in value: + if v.lower() in text.lower(): + return key + Warning(f"Value {text} is not in the correct format") + elif isinstance(text,list): + text = [replace_synonyms(t, dict_synonyms) for t in text] + elif isinstance(text,dict): + for key in text.keys(): + # replace values in dict + text[key] = replace_synonyms(text[key], dict_synonyms) + # replace keys in dict + for k in dict_synonyms.keys(): + text[dict_synonyms[k]] = text.pop(key) + return text + +# ============================================================================= + +class meta_data(object): + ''' + This class is used to store the metadata of the dataset + ''' + def __init__(self): + self.config_format_path = 
os.path.join(os.path.dirname(__file__),'config_format.json') + with open(self.config_format_path, 'r') as file: + self.config_format = json.load(file) + self.config = {} + for key in self.config_format.keys(): + if self.config_format[key]['required'] == True: + self.config[key] = {} + self.keytypes = self.find_all_keys_with_type() + self.keytypes_flatten = self.flatten_json() + self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality'] + for key in self.ambiguity_keys: + ambiguity_dict = get_synonyms_dict(key) + self.config_format[key]['options'] = list(ambiguity_dict.keys()) + + def get_ketytypes(self): + return self.keytypes + + def get_keytypes_flatten(self): + return self.keytypes_flatten + + def find_all_keys_with_type(self, data=None, parent_key=''): + if data is None: + data = self.config_format + keys_with_type = {} + if isinstance(data, dict): + for key, value in data.items(): + full_key = f"{parent_key}.{key}" if parent_key else key + if isinstance(value, dict) and 'type' in value: + keys_with_type[full_key] = value['type'] + keys_with_type.update(self.find_all_keys_with_type(value, full_key)) + elif isinstance(data, list): + for index, item in enumerate(data): + full_key = f"{parent_key}[{index}]" + keys_with_type.update(self.find_all_keys_with_type(item, full_key)) + return keys_with_type + + def flatten_json(self, data=None, parent_key='', sep='.'): + if data is None: + data = self.config_format + items = {} + if isinstance(data, dict): + for key, value in data.items(): + new_key = f"{parent_key}{sep}{key}" if parent_key else key + if isinstance(value, dict): + items.update(self.flatten_json(value, new_key, sep=sep)) + elif isinstance(value, list): + for i, item in enumerate(value): + items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep)) + else: + items[new_key] = value + elif isinstance(data, list): + for i, item in enumerate(data): + items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep)) + return items + + def 
req_check(self): + self.unfilled_keys = [] + for key in self.config.keys(): + if self.config[key] == {}: + self.unfilled_keys.append(key) + if len(self.unfilled_keys) == 0: + return True + else: + return False + + def type_check(self, key, value): + if key not in self.config_format.keys(): + print(key, "is not a valid key") + return False + + if key == 'Modality': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'OriImg_path': + if isinstance(value, str): + return True + else: + return False + + elif key == 'Label_path' and isinstance(value, dict): + for skey in value.keys(): + if skey in self.config_format[key]['keys']: + for kk in value[skey]: + if isinstance(value[skey][kk],str): + pass + # if kk in self.config_format[key]['value']['keys']: + # if isinstance(value[skey][kk],str): + # pass + # else: + # return False + else: + return False + return True + + elif key == 'ROI': + if value not in self.config_format[key]['options']: + return False + else: + return True + + elif key == 'Label_tissue' and isinstance(value, list): + for i in value: + if i not in self.config_format[key]['items']['options']: + return False + return True + + elif key =='Task' and isinstance(value, list): + for i in value: + if i not in self.config_format[key]['items']['options']: + return False + return True + + elif key == 'Spacing_mm': + if isinstance(value, float): + return True + else: + False + + # elif key == 'Size' and isinstance(value, list) and len(value) == 3 : + elif key == 'Size' and isinstance(value, list) and len(value) >= 3 : + return all(isinstance(item, int) for item in value) + + elif key == 'Dataset_name': + if isinstance(value, str): + return True + else: + return False + ##added by yanguoiqng on 2025-08-08 + elif key == 'Sub_modality': + + if isinstance(value, dict): + return True + else: + return False + elif key == 'Label_Dict': + + if isinstance(value, dict): + return True + else: + return False + def 
add_extra_keyvalue(self, key, value): + self.config[key] = value + return True + + def add_keyvalue(self, key, value): + if key in self.ambiguity_keys: + value = replace_synonyms(value, get_synonyms_dict(key)) + # print(key, value) + if self.type_check(key, value): + + self.config[key] = value + return True + else: + Warning(f"Value {value} is not in the correct format for key {key}") + pass + # print(f"Value {value} is not in the correct format for key {key}") + + def get_meta_data(self): + if self.req_check(): + return self.config + else: + print("Not all required keys are filled", self.unfilled_keys) + return False + + + +if __name__ == '__main__': + meta = meta_data() + print(meta.get_keytypes_flatten()) + print(meta.get_ketytypes()) + meta.add_keyvalue('Modality', 'CT') + meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT') + meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}}) + meta.add_keyvalue('Spacing_mm', 1.5) + meta.add_keyvalue('Size', [512, 512, 100]) + meta.add_keyvalue('Dataset_name', 'CT') + meta.add_keyvalue('Label_tissue', ['1', '2', '3']) + meta.add_keyvalue('Task', ['1', '2', '3']) + print(meta.get_meta_data()) + meta.add_extra_key('extra', 'extra') + print(meta.get_meta_data()) + print(meta.get_ketytypes()) + print(meta.get_keytypes_flatten) + + org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT' + img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation') + print(img_paths) \ No newline at end of file diff --git a/brats2020_clean/config_format.json b/brats2020_clean/config_format.json new file mode 100644 index 0000000000000000000000000000000000000000..01e9febcb5f7b8c2946d49b246139faf6e8272b1 --- /dev/null +++ b/brats2020_clean/config_format.json @@ -0,0 +1,125 @@ +{ + "Modality": { + "type": "option", + "required": true, + "options": [ + "CT", 
+ "MRI", + "T1", + "T2", + "X-ray", + "Fluoroscopy", + "US", + "PET" + ] + }, + "OriImg_path": { + "type": "string", + "required": true + }, + "Label_path": { + "type": "dict", + "required": false, + "keys": [ + "classification", + "segmentation", + "regression", + "detection", + "localization", + "registration", + "other" + ], + "value": { + "type": "dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/brats2020_clean/dataclean_BRATS_2020.py b/brats2020_clean/dataclean_BRATS_2020.py new file mode 100644 index 0000000000000000000000000000000000000000..92538e6d8c0a944579a243cea7bc12f8cf4425b9 --- /dev/null +++ b/brats2020_clean/dataclean_BRATS_2020.py @@ -0,0 +1,452 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-08-03 +update BRATS_2020 + +BRATS2020 是 BRATS 系列的一个重要里程碑。它在 BRATS2019 
的基础上,通过显著扩大数据规模、增加数据多样性(尤其是纳入中国数据)、 +完善生存预测任务的评估流程(验证集和测试集包含生存信息)以及引入额外未标注数据以促进新学习范式,为脑胶质瘤多模态 MRI 分割和生存预测研究设定了更高的标准。 + +数据内容与规模(显著扩大): +    训练集: 包含 369 例 患者的完整多模态 MRI 扫描数据及其对应的专家手动分割标注(Ground Truth)。(相比2019的335例增加) +    验证集: 包含 125 例 患者的完整多模态 MRI 扫描数据。没有提供标注。用于开发阶段在线评估算法性能。 +    测试集: 包含 166 例 患者的完整多模态 MRI 扫描数据。没有提供标注。这是最终排名使用的独立测试集。(与2019测试集规模相同,但内容不同) + +关键特性 - 多模态 MRI(与2019一致): +每个病例仍然包含四种预处理后的 3D MRI 序列: +Native (T1) +Post-contrast T1-weighted (T1Gd/T1ce) +T2-weighted (T2) +T2 Fluid Attenuated Inversion Recovery (T2-FLAIR) + +关键特性 - 肿瘤标注(与2019一致): + +训练集提供专家手动勾画的精细标注。 +标注定义相同的三个子区域: +坏疽性和非增强肿瘤核心: 标签值 = 1 +瘤周水肿: 标签值 = 2 +增强肿瘤: 标签值 = 4 +整个肿瘤区域: 标签值 1+2+4 +肿瘤核心区域: 标签值 1+4 + + +根据沟通参考MSD中的BRATS的结构: + 1.将多个分开的模态合并,构建第四个维度的数组,分别按照FLAIR,T1,T1CE,T2顺序存放; + 2.生存期信息也需要相应补充道HGG的数据集中 + +Trainning: + meta_info:[保留Grade,BraTS_2019_subject_ID]--name_mapping.csv + Grade,BraTS_2017_subject_ID,BraTS_2018_subject_ID,TCGA_TCIA_subject_ID,BraTS_2019_subject_ID,BraTS_2020_subject_ID + HGG,Brats17_CBICA_AAB_1,Brats18_CBICA_AAB_1,NA,BraTS19_CBICA_AAB_1,BraTS20_Training_001 + HGG,Brats17_CBICA_AAG_1,Brats18_CBICA_AAG_1,NA,BraTS19_CBICA_AAG_1,BraTS20_Training_002 + + survival_info:--survival_info.csv + Brats20ID,Age,Survival_days,Extent_of_Resection + BraTS20_Training_001,60.463,289,GTR + BraTS20_Training_002,52.263,616,GTR +Validation: + meta_info:[保留Grade,BraTS_2019_subject_ID]--name_mapping_validation_data.csv + BraTS_2017_subject_ID,BraTS_2018_subject_ID,TCGA_TCIA_subject_ID,BraTS_2019_subject_ID,BraTS_2020_subject_ID + Brats17_CBICA_AAM_1,Brats18_CBICA_AAM_1,NA,BraTS19_CBICA_AAM_1,BraTS20_Validation_001 + Brats17_CBICA_ABT_1,Brats18_CBICA_ABT_1,NA,BraTS19_CBICA_ABT_1,BraTS20_Validation_002 + + survival_info:--survival_evaluation.csv + BraTS20ID,Age,ResectionStatus + BraTS20_Validation_001,68.170,GTR + BraTS20_Validation_002,50.153,GTR + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import 
def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside *path*.

    The pattern contains no ``**`` component, so the search was never
    recursive; the misleading ``recursive=True`` flag is dropped. The result
    is sorted so that callers see a deterministic order on every platform.

    Args:
        path: Directory to search.

    Returns:
        Sorted list of matching file paths (empty when nothing matches or
        the directory does not exist).
    """
    search_pattern = os.path.join(path, '*.csv')
    return sorted(glob.glob(search_pattern))
def _read_csv_if_present(csv_path):
    """Return the CSV at *csv_path* as a DataFrame, or None when it is absent."""
    if os.path.isfile(csv_path):
        return pd.read_csv(csv_path, sep=',')
    return None


def _first_matching_row(table, id_column, case_id):
    """Return the first row of *table* whose *id_column* equals *case_id*, else None."""
    if table is None or id_column not in table.columns:
        return None
    rows = table[table[id_column] == case_id]
    if rows.shape[0] == 0:
        return None
    return rows.iloc[0]


def _lookup_case_info(meta_table, survival_table, columns, case_id):
    """Collect id, grade, age, survival and resection status for one case.

    Fields with no table, no matching row or no configured column stay ''.
    """
    info = {
        'Image_id': case_id,
        'Grade': '',
        'Age': '',
        'Survival': '',
        'ResectionStatus': '',
    }
    meta_row = _first_matching_row(meta_table, columns['meta_id_name'], case_id)
    if meta_row is not None:
        info['Image_id'] = meta_row[columns['meta_id_name']]
        if 'meta_grade_name' in columns:
            info['Grade'] = meta_row[columns['meta_grade_name']]

    survival_row = _first_matching_row(survival_table, columns['survival_id_name'], case_id)
    if survival_row is not None:
        info['Age'] = str(survival_row[columns['meta_age_name']])
        if 'meta_survival_name' in columns:
            info['Survival'] = str(survival_row[columns['meta_survival_name']])
        info['ResectionStatus'] = survival_row[columns['meta_status_name']]
    return info


def main(target_path, output_dir):
    """Convert every BraTS 2020 case under *target_path* and record its metadata.

    For each case directory the available modality volumes (in
    ``SERIES_ORDER``) are stacked into one image and written to
    ``<output_dir>/<case>/<case>.nii.gz``. Training cases additionally get
    their ``*_seg.nii`` label copied to ``<case>/segmentation``. One metadata
    entry per case is appended to ``nifti_mappings.json``; cases that raise
    during conversion are listed in ``failed_files.json``.

    Args:
        target_path: Directory holding the case folders and the CSV tables.
        output_dir: Directory receiving the NIfTI files and JSON summaries.
    """
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    meta_file = os.path.join(target_path, 'name_mapping.csv')
    survival_file = os.path.join(target_path, 'survival_info.csv')
    val_meta_file = os.path.join(target_path, 'name_mapping_validation_data.csv')
    val_survival_file = os.path.join(target_path, 'survival_evaluation.csv')

    # A missing CSV becomes None instead of an unbound name, so such cases
    # simply get empty clinical fields rather than crashing the whole run.
    tables = {
        'training': (_read_csv_if_present(meta_file), _read_csv_if_present(survival_file)),
        'validation': (_read_csv_if_present(val_meta_file), _read_csv_if_present(val_survival_file)),
    }

    for data_dir in tqdm(find_image_dirs(target_path), desc="Processing pid dirs"):
        full_path = os.path.join(target_path, data_dir)
        if not os.path.isdir(full_path):
            continue

        tr_flag = 'Training' in data_dir
        split_key = 'training' if tr_flag else 'validation'
        meta_table, survival_table = tables[split_key]
        case_info = _lookup_case_info(
            meta_table, survival_table, meta_info_dict[split_key], data_dir)

        meta = meta_data()
        try:
            # Keep only the modality files that exist, in flair/t1/t1ce/t2 order.
            candidates = [os.path.join(full_path, "%s_%s.nii" % (data_dir, sm))
                          for sm in SERIES_ORDER]
            series_flag = [os.path.isfile(candidate) for candidate in candidates]
            series_files = [candidate for candidate, present
                            in zip(candidates, series_flag) if present]
            if not series_files:
                print("病例数据%s为空" % data_dir)
                continue

            sitk_img_original = load_brtas_images(series_files)
            original_size = list(sitk_img_original.GetSize())

            CIA_other_info = {
                'metadata_file': meta_file if tr_flag else val_meta_file,
                'split': "train" if tr_flag else "validation",
            }
            CIA_other_info.update(case_info)

            meta.add_keyvalue('Spacing_mm', 1.0)
            meta.add_keyvalue('OriImg_path', ",".join(series_files))
            meta.add_keyvalue('Size', original_size)
            meta.add_keyvalue('Modality', "MRI")
            meta.add_keyvalue('Dataset_name', 'BRATS_2020')
            meta.add_keyvalue('ROI', 'head')

            # Keys are positions in SERIES_ORDER, not channel indices of the
            # stacked image (they differ when a modality file is missing).
            sub_modality_dict = {str(idx): SUB_MODALITY[idx]
                                 for idx, present in enumerate(series_flag) if present}
            meta.add_keyvalue('Sub_modality', sub_modality_dict)

            if tr_flag:
                meta.add_keyvalue('Label_Dict', LABEL_DICT)

            output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
            save_nifti(sitk_img_original, output_image_file, full_path)
            print(f"Saved NIfTI file to {output_image_file}")

            # Label processing (training cases only).
            if tr_flag:
                full_label_file = os.path.join(full_path, "%s_seg.nii" % (data_dir))
                process_label_path = os.path.join(output_dir, data_dir, 'segmentation')
                processed_lbl_full_path = os.path.join(process_label_path, f"{data_dir}.nii.gz")
                os.makedirs(process_label_path, exist_ok=True)

                if os.path.isfile(full_label_file):
                    sitk_lbl_original = util.load_nifti(full_label_file)
                    util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_label_file)
                    print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")

                    # 'Task' is validated as a list; a bare string was
                    # silently rejected and never reached the metadata.
                    meta.add_keyvalue('Task', [TASK_VALUE])
                    meta.add_keyvalue('Label_path', {TASK_VALUE: {'brain': processed_lbl_full_path}})
                    # Only report the label size when a label was loaded for
                    # this case; the unconditional print raised NameError (or
                    # showed a stale label) for cases without a segmentation.
                    print(sitk_img_original.GetSize(), sitk_lbl_original.GetSize())

        except Exception as e:
            print(e)
            failed_files.append(data_dir)
            print(f"Failed to load BRATS images from {data_dir}")
            continue

        meta.add_extra_keyvalue('Metadata', CIA_other_info)

        # Write the mapping to the JSON file on the fly
        with open(json_output_path, 'r+') as json_file:
            existing_mappings = json.load(json_file)
            existing_mappings[output_image_file] = meta.get_meta_data()
            json_file.seek(0)
            json.dump(existing_mappings, json_file, indent=4)
            json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
def read_table(file_path, split_str=';'):
    """Load a tabular file as a DataFrame, trying Excel first and then CSV.

    Args:
        file_path: Path of the table to read.
        split_str: Field separator used for the CSV fallback.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        return pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # Not a readable workbook (or openpyxl is unavailable): fall back to
        # CSV. A bare ``except:`` would also swallow KeyboardInterrupt and
        # SystemExit, so only ordinary exceptions are caught here.
        return pd.read_csv(file_path, sep=split_str)
def get_unisize_resampler(ref_img, interpolator='linear', spacing=None, size=None):
    '''
    Build a resampler that brings an image to isotropic spacing.

    The smallest spacing along any axis becomes the output spacing on every
    axis, and the output size is scaled so the physical extent is kept.
    For example, an input with spacing [0.1, 0.1, 0.3] is resampled to
    [0.1, 0.1, 0.1].

    Args:
        ref_img: Reference image providing origin, direction and pixel type.
        interpolator: 'linear' (images), 'nearest' (segmentation masks), or
            an interpolator constant that is passed through unchanged.
        spacing: Input spacing; defaults to ``ref_img.GetSpacing()``.
        size: Input size; defaults to ``ref_img.GetSize()``.

    Returns:
        A configured ``sitk.ResampleImageFilter``, or None when the spacing
        is already isotropic and no resampling is needed.

    Raises:
        ValueError: If *interpolator* is a string other than 'linear' or
            'nearest'.
    '''
    if size is None:
        size = ref_img.GetSize()
    if spacing is None:
        spacing = ref_img.GetSpacing()

    min_spacing = min(spacing)
    if all(spc == min_spacing for spc in spacing):
        return None

    if isinstance(interpolator, str):
        named_interpolators = {
            'nearest': sitk.sitkNearestNeighbor,
            'linear': sitk.sitkLinear,
        }
        if interpolator not in named_interpolators:
            raise ValueError(f"interpolator {interpolator} is not valid")
        interpolator = named_interpolators[interpolator]

    # One entry per axis instead of the hard-coded three.
    new_size = [int(round(old_sz * old_spc / min_spacing))
                for old_sz, old_spc in zip(size, spacing)]

    resampler = sitk.ResampleImageFilter()
    resampler.SetSize(new_size)
    resampler.SetOutputSpacing([min_spacing] * len(spacing))
    resampler.SetOutputOrigin(ref_img.GetOrigin())
    resampler.SetOutputDirection(ref_img.GetDirection())
    resampler.SetInterpolator(interpolator)
    # Fill value for samples outside the input. The original passed
    # GetPixelIDValue(), i.e. the pixel *type* enum, which is not an intensity.
    resampler.SetDefaultPixelValue(0)
    resampler.SetOutputPixelType(ref_img.GetPixelID())
    return resampler
def get_synonyms_dict(dict_type='ROI'):
    '''
    Return the synonym table for one metadata field.

    Each key is the canonical term and each value the list of spellings that
    map to it. Key order matters to callers that take the first match, so
    compound regions are listed before their parts. Exact duplicate entries
    inside a list were removed; they had no effect on matching.

    Args:
        dict_type: One of 'ROI', 'Label_tissue', 'Task' or 'Modality'.

    Returns:
        A freshly built ``dict`` mapping canonical term -> list of synonyms.

    Raises:
        ValueError: If *dict_type* is not one of the supported names.
    '''
    if dict_type == 'ROI':
        dict_synonyms = {
            'whole-body': ['whole-body', 'whole body', 'wholebody', 'polytrauma',
                           'head-neck-thorax-abdomen-pelvis-leg', 'head-neck-thorax-abdomen-pelvis'],
            'neck-thorax-abdomen-pelvis-leg': ['neck-thorax-abdomen-pelvis-leg', 'neck-thx-abd-pelvis-leg',
                                               'angiography neck-thx-abd-pelvis-leg',
                                               'neck thorax abdomen pelvis leg',
                                               'neck and thorax and abdomen and pelvis and leg',
                                               'neck, thorax, abdomen, pelvis & leg',
                                               'neck/thorax/abdomen/pelvis/leg',
                                               'neck, thorax, abdomen, pelvis and leg'],
            'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis',
                                           'neck thorax abdomen pelvis',
                                           'neck and thorax and abdomen and pelvis',
                                           'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis',
                                           'neck, thorax, abdomen and pelvis',
                                           'neck thorax abdomen & pelvis'],
            'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg', 'thx-abd-pelvis-leg',
                                          'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg',
                                          'thorax and abdomen and pelvis and leg',
                                          'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg',
                                          'thorax, abdomen, pelvis and leg'],
            'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck thorax abdomen',
                                    'neck and thorax and abdomen', 'neck, thorax, abdomen',
                                    'neck/thorax/abdomen'],
            'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head neck thorax abdomen',
                                         'head and neck and thorax and abdomen',
                                         'head, neck, thorax, abdomen', 'head/thorax/abdomen',
                                         'head, thorax, abdomen', 'head thorax abdomen'],
            'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax',
                                 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'],
            'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma',
                                      'thorax abdomen pelvis', 'thorax and abdomen and pelvis',
                                      'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis',
                                      'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'],
            'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg',
                                   'abdomen pelvis leg', 'abdomen and pelvis and leg',
                                   'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg'],
            'neck-thorax': ['neck-thorax', 'neck thorax', 'neck and thorax', 'neck, thorax',
                            'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck', 'thorax/neck'],
            'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'],
            'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis',
                               'abdomen & pelvis', 'abdomen/pelvis'],
            'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'],
            'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'],
            'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera',
                        'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum', 'gastric',
                        'liver', 'spleen', 'pancreas', 'kidney', 'lumbar', 'renal', 'hepatic', 'splenic',
                        'pancreatic', 'intervention'],
            'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart', 'heart-thorakale aorta',
                       'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea',
                       'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla',
                       'armpit', 'breast biopsy', 'thoracic', 'mammary', 'caeiothoracic', 'mediastinal',
                       'pleural', 'bronchial', 'bronchial tree', 'tracheal', 'esophageal',
                       'diaphragmatic', 'costal', 'sternal', 'clavicular', 'scapular', 'axillary',
                       'axillar', 'cardiac', 'pericardial', 'pericardiac', 'pericardium'],
            'head': ['head', 'headbasis', 'brain', 'skull', 'face', 'nose', 'ear', 'eye', 'mouth', 'jaw',
                     'cheek', 'chin', 'forehead', 'temporal', 'parietal', 'occipital', 'frontal',
                     'mandible', 'maxilla', 'mandibular', 'maxillary', 'nasal', 'orbital', 'orbita',
                     'ocular', 'auricular', 'otic', 'oral', 'buccal', 'labial', 'lingual', 'palatal'],
            'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 'esophagus',
                     'pharyngeal', 'laryngeal', 'carotid', 'jugular'],
            'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx',
                     'metacarpal', 'carpal', 'radius'],
            # 'armpit' and 'clavicle' were fused into 'armpitclavicle' by a
            # missing comma (implicit string concatenation).
            'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus',
                    'radius', 'ulna', 'elbow', 'shoulder', 'armpit', 'clavicle', 'scapula', 'acromion',
                    'acromioclavicular'],
            'leg': ['leg', 'felsenleg', 'thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel',
                    'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur',
                    'patella', 'kneecap', 'achilles tendon', 'achilles'],
            'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis',
                       'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine',
                       'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament',
                       'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle',
                       'ovary', 'uterus'],
            'skeleton': ['skeleton', 'bone', 'spine', 'back', 'vertebra', 'sacrum', 'coccyx'],
        }
    elif dict_type == 'Label_tissue':
        dict_synonyms = {
            'liver': ['liver', 'hepatic'],
            'spleen': ['spleen', 'splenic'],
            'kidney': ['kidney', 'renal'],
            'pancreas': ['pancreas', 'pancreatic'],
            'stomach': ['stomach', 'gastric'],
            'intestine': ['large intestine', 'small intestine', 'large bowel', 'small bowel'],
            'gallbladder': ['gallbladder'],
            'adrenal_gland': ['adrenal_gland', 'adrenal gland'],
            'bladder': ['bladder'],
            'prostate': ['prostate'],
            'uterus': ['uterus'],
            'ovary': ['ovary'],
            'testicle': ['testicle'],
            'lymph_node': ['lymph_node', 'lymph node'],
            'bone': ['bone'],
            'lung': ['lung'],
            'heart': ['heart'],
            'esophagus': ['esophagus'],
            'muscle': ['muscle'],
            'fat': ['fat'],
            'skin': ['skin'],
            'vessel': ['vessel'],
            'tumor': ['tumor'],
            'other': ['other'],
        }
    elif dict_type == 'Task':
        dict_synonyms = {
            'segmentation': ['segmentation', 'seg', 'mask'],
            'classification': ['classification', 'class', 'diagnosis', 'identify', 'identification'],
            'localization': ['localization', 'locate', 'location', 'position'],
            'registration': ['registration', 'register', 'align', 'alignment'],
            'detection': ['detection', 'detect', 'find', 'locate'],
            'quantification': ['quantification', 'quantify', 'measure', 'measurement'],
        }
    elif dict_type == 'Modality':
        dict_synonyms = {
            'CT': ['CT', 'computed tomography'],
            'MRI': ['MRI', 'MR', 'magnetic resonance imaging'],
            'PET': ['PET', 'positron emission tomography'],
            'US': ['US', 'ultrasound'],
            'X-ray': ['X-ray', 'radiography'],
            # Spelling fixed: the synonym read 'tomlogy' and could never match.
            'SPECT': ['SPECT', 'single-photon emission computed tomography'],
        }
    else:
        raise ValueError(f"dict_type {dict_type} is not valid")
    return dict_synonyms
def replace_synonyms(text, dict_synonyms):
    '''
    Replace synonyms in *text* with their canonical term.

    Args:
        text: A string, a list of values, or a dict whose keys and values
            should both be normalised. Any other type is returned unchanged.
        dict_synonyms: Mapping of canonical term -> list of synonyms; the
            first canonical term having a synonym contained in the string
            (case-insensitive) wins.

    Returns:
        The canonical term for a matching string, the string itself when
        nothing matches (a ``UserWarning`` is emitted), a new list for list
        input, or a new dict for dict input.
    '''
    # Local import so the function stays self-contained.
    import warnings

    if isinstance(text, str):
        lowered = text.lower()
        for canonical, synonyms in dict_synonyms.items():
            if any(synonym.lower() in lowered for synonym in synonyms):
                return canonical
        # The original only instantiated ``Warning(...)``, which reports nothing.
        warnings.warn(f"Value {text} is not in the correct format")
        return text
    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]
    if isinstance(text, dict):
        # The original used synonym *lists* as dict keys while mutating the
        # dict being iterated, which raised TypeError; build a new dict.
        return {replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
                for key, value in text.items()}
    return text

# =============================================================================

class meta_data(object):
    '''
    Store and validate the metadata of one dataset entry.

    The accepted keys and their types come from ``config_format.json`` next
    to this module. Required keys start as empty dicts in ``self.config`` and
    must be filled through ``add_keyvalue`` before ``get_meta_data`` returns
    the configuration.
    '''
    def __init__(self):
        self.config_format_path = os.path.join(os.path.dirname(__file__), 'config_format.json')
        with open(self.config_format_path, 'r') as file:
            self.config_format = json.load(file)
        # Required keys are pre-seeded with {} so req_check can spot unfilled ones.
        self.config = {}
        for key in self.config_format.keys():
            if self.config_format[key]['required'] == True:
                self.config[key] = {}
        self.keytypes = self.find_all_keys_with_type()
        self.keytypes_flatten = self.flatten_json()
        self.ambiguity_keys = ['ROI', 'Label_tissue', 'Task', 'Modality']
        # For synonym-normalised keys the allowed options are the canonical
        # terms of the synonym table, overriding the lists in the JSON file.
        for key in self.ambiguity_keys:
            ambiguity_dict = get_synonyms_dict(key)
            self.config_format[key]['options'] = list(ambiguity_dict.keys())

    def get_ketytypes(self):
        """Return the mapping of dotted key path -> declared type."""
        return self.keytypes

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    get_keytypes = get_ketytypes

    def get_keytypes_flatten(self):
        """Return the flattened view of the whole format description."""
        return self.keytypes_flatten

    def find_all_keys_with_type(self, data=None, parent_key=''):
        """Recursively collect every entry of *data* that declares a 'type'.

        Returns a dict mapping the dotted path of each such entry to its type
        string; *data* defaults to the loaded format description.
        """
        if data is None:
            data = self.config_format
        keys_with_type = {}
        if isinstance(data, dict):
            for key, value in data.items():
                full_key = f"{parent_key}.{key}" if parent_key else key
                if isinstance(value, dict) and 'type' in value:
                    keys_with_type[full_key] = value['type']
                keys_with_type.update(self.find_all_keys_with_type(value, full_key))
        elif isinstance(data, list):
            for index, item in enumerate(data):
                full_key = f"{parent_key}[{index}]"
                keys_with_type.update(self.find_all_keys_with_type(item, full_key))
        return keys_with_type

    def flatten_json(self, data=None, parent_key='', sep='.'):
        """Flatten nested dicts/lists of *data* into ``{path: leaf}`` pairs.

        Scalar items directly inside a list contribute nothing, matching the
        recursion on non-container values.
        """
        if data is None:
            data = self.config_format
        items = {}
        if isinstance(data, dict):
            for key, value in data.items():
                new_key = f"{parent_key}{sep}{key}" if parent_key else key
                if isinstance(value, dict):
                    items.update(self.flatten_json(value, new_key, sep=sep))
                elif isinstance(value, list):
                    for i, item in enumerate(value):
                        items.update(self.flatten_json(item, f"{new_key}[{i}]", sep=sep))
                else:
                    items[new_key] = value
        elif isinstance(data, list):
            for i, item in enumerate(data):
                items.update(self.flatten_json(item, f"{parent_key}[{i}]", sep=sep))
        return items

    def req_check(self):
        """Return True when no key in ``self.config`` is still the empty placeholder.

        The names of unfilled keys are stored in ``self.unfilled_keys``.
        """
        self.unfilled_keys = [key for key, value in self.config.items() if value == {}]
        return len(self.unfilled_keys) == 0

    def type_check(self, key, value):
        """Return True when *value* is acceptable for *key*, otherwise False.

        Every path returns an explicit bool; the original fell through to
        ``None`` for several mismatches and for a non-float 'Spacing_mm'.
        """
        if key not in self.config_format:
            print(key, "is not a valid key")
            return False

        if key in ('Modality', 'ROI'):
            return value in self.config_format[key]['options']

        if key in ('OriImg_path', 'Dataset_name'):
            return isinstance(value, str)

        if key == 'Label_path':
            if not isinstance(value, dict):
                return False
            allowed_tasks = self.config_format[key]['keys']
            for task_name, paths in value.items():
                if task_name not in allowed_tasks or not isinstance(paths, dict):
                    return False
                # Label paths are declared as strings in the format file.
                if not all(isinstance(path, str) for path in paths.values()):
                    return False
            return True

        if key in ('Label_tissue', 'Task'):
            if not isinstance(value, list):
                return False
            options = self.config_format[key]['items']['options']
            return all(item in options for item in value)

        if key == 'Spacing_mm':
            return isinstance(value, float)

        if key == 'Size':
            # At least three axes; 4-D stacks (e.g. multi-modal MRI) are allowed.
            return (isinstance(value, list) and len(value) >= 3
                    and all(isinstance(item, int) for item in value))

        if key in ('Sub_modality', 'Label_Dict'):
            return isinstance(value, dict)

        return False

    def add_extra_keyvalue(self, key, value):
        """Store *value* under *key* without any validation; always returns True."""
        self.config[key] = value
        return True

    # Alias for the name used by this module's own demo code.
    add_extra_key = add_extra_keyvalue

    def add_keyvalue(self, key, value):
        """Normalise, validate and store *value* for *key*.

        Returns True when stored, False (with a ``UserWarning``) otherwise.
        """
        import warnings

        if key in self.ambiguity_keys:
            value = replace_synonyms(value, get_synonyms_dict(key))
        if self.type_check(key, value):
            self.config[key] = value
            return True
        warnings.warn(f"Value {value} is not in the correct format for key {key}")
        return False

    def get_meta_data(self):
        """Return ``self.config`` when all required keys are filled, else False."""
        if self.req_check():
            return self.config
        print("Not all required keys are filled", self.unfilled_keys)
        return False
if __name__ == '__main__':
    # Manual smoke test of the metadata container and the folder scanner.
    meta = meta_data()
    print(meta.get_keytypes_flatten())
    print(meta.get_ketytypes())
    meta.add_keyvalue('Modality', 'CT')
    meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
    meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
    meta.add_keyvalue('Spacing_mm', 1.5)
    meta.add_keyvalue('Size', [512, 512, 100])
    meta.add_keyvalue('Dataset_name', 'CT')
    meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
    meta.add_keyvalue('Task', ['1', '2', '3'])
    print(meta.get_meta_data())
    # The class defines add_extra_keyvalue; 'add_extra_key' raised AttributeError.
    meta.add_extra_keyvalue('extra', 'extra')
    print(meta.get_meta_data())
    print(meta.get_ketytypes())
    # Call the accessor instead of printing the bound method object.
    print(meta.get_keytypes_flatten())

    org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
    img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
    print(img_paths)
+ "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + }, + + "Sub_modality": { + "type": "dict", + "required": false + }, + "Label_Dict": { + "type": "dict", + "required": false + } +} \ No newline at end of file diff --git a/brats2021_clean/dataclean_BRATS_2021.py b/brats2021_clean/dataclean_BRATS_2021.py new file mode 100644 index 0000000000000000000000000000000000000000..13a376ea4b048f0a75ad6ed6954082306efc0f4d --- /dev/null +++ b/brats2021_clean/dataclean_BRATS_2021.py @@ -0,0 +1,446 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-08-03 +update BRATS_2020 + +BRATS2020 是 BRATS 系列的一个重要里程碑。它在 BRATS2019 的基础上,通过显著扩大数据规模、增加数据多样性(尤其是纳入中国数据)、 +完善生存预测任务的评估流程(验证集和测试集包含生存信息)以及引入额外未标注数据以促进新学习范式,为脑胶质瘤多模态 MRI 分割和生存预测研究设定了更高的标准。 + +数据内容与规模(显著扩大): + 训练集 (Labeled): + 包含 1,251 例 患者的完整多模态 MRI 扫描数据。 + 所有病例均提供专家手动分割标注 (Ground Truth)。 + (注:这是官方公布用于训练的有标注数据总量,实际提供给参与者的具体子集可能根据任务不同有所划分,但总量远超往年) + + 验证集 (Validation): + 包含 219 例 患者的完整多模态 MRI 扫描数据。 + 没有提供分割标注或生存信息。 + 用于开发阶段通过在线评估平台测试分割算法的性能(提交预测结果获取分数)。 + + 测试集 (Test): + 包含 530 例 患者的完整多模态 MRI 扫描数据。 + 完全私有、未发布、未提供标注或生存信息。(这是与之前版本的最大区别之一!) 
+ 参与者需将最终算法模型提交给组织方,由组织方在完全保密的测试集上运行评估,确保公平性和对未知数据的真正测试。 + +关键特性 - 多模态 MRI(与2019一致): + 每个病例仍然包含四种预处理后的 3D MRI 序列: + Native (T1) + Post-contrast T1-weighted (T1Gd/T1ce) + T2-weighted (T2) + T2 Fluid Attenuated Inversion Recovery (T2-FLAIR) + +关键特性 - 肿瘤标注(与2019一致): + 训练集提供专家手动勾画的精细标注。 + 标注定义相同的三个子区域: + 坏疽性和非增强肿瘤核心: 标签值 = 1 + 瘤周水肿: 标签值 = 2 + 增强肿瘤: 标签值 = 4 + 整个肿瘤区域: 标签值 1+2+4 + 肿瘤核心区域: 标签值 1+4 + + +当前的 数据集仅包含1251个训练集数据,无csv病例元数据信息以及生存期信息 + +根据沟通参考MSD中的BRATS的结构: + 1.将多个分开的模态合并,构建第四个维度的数组,分别按照FLAIR,T1,T1CE,T2顺序存放; + 2.生存期信息也需要相应补充道HGG的数据集中 + + + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +import shutil + +##trainning_dataset +##dataset_meta +meta_info_dict={ + "training":{ + 'meta_id_name':'BraTS_2020_subject_ID', + 'meta_grade_name':'Grade', + 'survival_id_name':'Brats20ID', + 'meta_age_name':'Age', + 'meta_survival_name':'Survival_days', + 'meta_status_name':'Extent_of_Resection' + }, + 'validation':{ + 'meta_id_name':'BraTS_2020_subject_ID', + 'survival_id_name':'BraTS20ID', + 'meta_age_name':'Age', + 'meta_status_name':'ResectionStatus' + } +} + + + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC... 
def find_metadata_files(path):
    """Return the CSV files located directly inside ``path``.

    Only the top level of ``path`` is searched; sub-directories are not
    visited.

    Args:
        path: Directory to look in.

    Returns:
        A sorted list of matching file paths (empty when nothing matches or
        the directory does not exist).
    """
    # glob.escape keeps characters such as "[" or "*" in a dataset directory
    # name from being interpreted as wildcard syntax.
    search_pattern = os.path.join(glob.escape(path), '*.csv')
    # The previous recursive=True flag was dropped: it only affects patterns
    # containing "**", so it never changed the result here.  Sorting makes
    # the order reproducible across file systems.
    return sorted(glob.glob(search_pattern))
def main(target_path, output_dir):
    """Convert every BraTS case folder under ``target_path`` to NIfTI + metadata.

    For each sub-directory ``<case>`` the available
    ``<case>_<series>.nii.gz`` files (tried in ``SERIES_ORDER``) are loaded as
    one series and written to ``<output_dir>/<case>/<case>.nii.gz``.  When
    ``<case>_seg.nii.gz`` exists it is re-saved to
    ``<output_dir>/<case>/segmentation/<case>.nii.gz`` and recorded as the
    segmentation label.  The metadata of every converted case is stored in
    ``<output_dir>/nifti_mappings.json`` (keyed by the output image path) and
    the names of cases that raised an error are written to
    ``<output_dir>/failed_files.json``.

    Args:
        target_path: Directory whose sub-directories are the case folders.
        output_dir: Directory receiving the converted files and the two JSON
            reports; created when missing.
    """
    # List the cases before creating output_dir so that an output directory
    # placed inside target_path is not picked up as a case.
    pid_dirs = find_image_dirs(target_path)

    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Start from an empty mapping unless an earlier run left one to extend.
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # Per the module docstring this release only contains labelled training
    # cases and ships no name-mapping / survival CSV files, so every case is
    # handled as training data and no grade/age/survival fields are recorded.
    tr_flag = True

    for data_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        full_path = os.path.join(target_path, data_dir)
        if not os.path.isdir(full_path):
            continue

        meta = meta_data()
        try:
            # Expected series files in FLAIR, T1, T1CE, T2 order; the seg file
            # is handled separately below.
            candidate_files = [
                os.path.join(full_path, "%s_%s.nii.gz" % (data_dir, sm))
                for sm in SERIES_ORDER
            ]
            series_flag = [os.path.isfile(fp) for fp in candidate_files]
            series_files = [fp for fp, found in zip(candidate_files, series_flag) if found]
            if not series_files:
                # Nothing to convert: skipped, but not counted as a failure.
                print("病例数据%s为空"%data_dir)
                continue

            sitk_img_original = load_brtas_images(series_files)
            original_size = list(sitk_img_original.GetSize())

            CIA_other_info = {
                'metadata_file': '',
                'split': "train",
                'Image_id': data_dir,
            }

            # Spacing is recorded as 1.0 (the module docstring describes the
            # inputs as resampled to 1 mm isotropic); it is not read from the
            # image header.
            meta.add_keyvalue('Spacing_mm', 1.0)
            meta.add_keyvalue('OriImg_path', ",".join(series_files))
            meta.add_keyvalue('Size', original_size)
            meta.add_keyvalue('Modality', "MRI")
            meta.add_keyvalue('Dataset_name', 'BRATS_2021')
            meta.add_keyvalue('ROI', 'head')

            # Keys are positions in SERIES_ORDER, so a missing series leaves
            # a gap instead of shifting the following indices.
            sub_modality_dict = {
                str(idx): SUB_MODALITY[idx]
                for idx, found in enumerate(series_flag)
                if found
            }
            meta.add_keyvalue('Sub_modality', sub_modality_dict)

            if tr_flag:
                meta.add_keyvalue('Label_Dict', LABEL_DICT)

            output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
            save_nifti(sitk_img_original, output_image_file, full_path)
            print(f"Saved NIfTI file to {output_image_file}")

            # Label processing
            sitk_lbl_original = None
            if tr_flag:
                full_label_file = os.path.join(full_path, "%s_seg.nii.gz" % (data_dir))
                process_label_path = os.path.join(output_dir, data_dir, 'segmentation')
                processed_lbl_full_path = os.path.join(process_label_path, f"{data_dir}.nii.gz")
                os.makedirs(process_label_path, exist_ok=True)

                if os.path.isfile(full_label_file):
                    sitk_lbl_original = util.load_nifti(full_label_file)
                    util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_label_file)  # Save original
                    print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")

                    # 'Task' must be a list: meta_data.type_check rejects a
                    # bare string, which silently dropped the field before.
                    meta.add_keyvalue('Task', [TASK_VALUE])
                    meta.add_keyvalue('Label_path', {TASK_VALUE: {'brain': processed_lbl_full_path}})

            # Only compare sizes when a label was loaded for THIS case;
            # printing unconditionally raised on unlabelled cases and marked
            # them as failed even though the image had been written.
            if sitk_lbl_original is not None:
                print(sitk_img_original.GetSize(), sitk_lbl_original.GetSize())

        except Exception as e:
            # Best effort per case: record the failure and keep going.
            print(e)
            failed_files.append(data_dir)
            print(f"Failed to load BRATS images from {data_dir}")
            continue

        meta.add_extra_keyvalue('Metadata', CIA_other_info)

        # Write the mapping to the JSON file on the fly so that an
        # interrupted run keeps the cases finished so far.
        with open(json_output_path, 'r+') as json_file:
            existing_mappings = json.load(json_file)
            existing_mappings[output_image_file] = meta.get_meta_data()
            json_file.seek(0)
            json.dump(existing_mappings, json_file, indent=4)
            json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
def read_table(file_path, split_str=';'):
    """Load a table from ``file_path`` as a pandas DataFrame.

    The file is first read as an Excel workbook (openpyxl engine); if that
    fails for any reason it is read as delimited text instead.

    Args:
        file_path: Path of the table file.
        split_str: Column separator used for the text fallback.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        Whatever ``pandas.read_csv`` raises when the fallback fails as well
        (for example ``FileNotFoundError`` for a missing file).
    """
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # Not a readable workbook (or openpyxl is unavailable): fall back to
        # CSV.  ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # and SystemExit are not swallowed.
        df = pd.read_csv(file_path, sep=split_str)
    return df
def clamp_image(in_img,clamp_range):
    """Return ``in_img`` with its values limited to ``clamp_range``.

    ``clamp_range[0]`` is used as the lower bound and ``clamp_range[1]`` as
    the upper bound of a SimpleITK ``ClampImageFilter``.
    """
    limiter = sitk.ClampImageFilter()
    lower_bound = clamp_range[0]
    limiter.SetLowerBound(lower_bound)
    upper_bound = clamp_range[1]
    limiter.SetUpperBound(upper_bound)
    clamped = limiter.Execute(in_img)
    return clamped
abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'], + 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'], + 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'], + 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'], + 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'], + 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'], + 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'], + 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'], + 'neck-thorax': ['neck-thorax', 
'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'], + 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'], + 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'], + 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'], + 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'], + 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'], + 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'], + 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'], + 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 
'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'], + 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'], + 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit''clavicle', 'scapula', 'acromion', 'acromioclavicular'], + 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'], + 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',], + 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'], + } + elif dict_type == 'Label_tissue': + dict_synonyms = { + 'liver': ['liver','hepatic'], + 'spleen': ['spleen','splenic'], + 'kidney': ['kidney','renal'], + 'pancreas': ['pancreas','pancreatic'], + 'stomach': ['stomach','gastric'], + 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'], + 'gallbladder': ['gallbladder'], + 'adrenal_gland': ['adrenal_gland','adrenal gland'], + 'bladder': ['bladder'], + 'prostate': ['prostate'], + 'uterus': ['uterus'], + 'ovary': ['ovary'], + 'testicle': ['testicle'], + 'lymph_node': ['lymph_node','lymph node'], + 'bone': ['bone'], + 'lung': ['lung'], + 'heart': ['heart'], + 'esophagus': ['esophagus'], + 'muscle': ['muscle'], + 'fat': ['fat'], + 'skin': ['skin'], + 'vessel': ['vessel'], + 'tumor': ['tumor'], + 'other': ['other'] + } + elif dict_type == 'Task': + dict_synonyms = { + 'segmentation': ['segmentation', 'seg', 'mask'], + 'classification': 
def replace_synonyms(text, dict_synonyms):
    """Map free-form terms onto the standard keys of ``dict_synonyms``.

    Args:
        text: A string, a list (processed element by element) or a dict
            (both keys and values are processed).  Any other type is
            returned untouched.
        dict_synonyms: Mapping ``standard term -> list of synonyms``.

    Returns:
        * str: the first standard term, in ``dict_synonyms`` order, one of
          whose synonyms occurs in ``text`` as a case-insensitive
          substring; if none matches a warning is issued and ``text`` is
          returned unchanged.
        * list: a new list with every element replaced.
        * dict: the same dict object, updated in place so that keys and
          values are replaced; keys that normalize to the same term
          collapse, the later entry winning.
    """
    # Local import keeps this function self-contained; the module does not
    # import ``warnings`` at the top.
    import warnings

    if isinstance(text, str):
        lowered = text.lower()
        for key, value in dict_synonyms.items():
            if any(v.lower() in lowered for v in value):
                return key
        # The original built a Warning object without emitting it.
        warnings.warn(f"Value {text} is not in the correct format", stacklevel=2)
        return text

    if isinstance(text, list):
        return [replace_synonyms(t, dict_synonyms) for t in text]

    if isinstance(text, dict):
        # Build the normalized mapping first and swap it in afterwards, so
        # the dict is never resized while it is being iterated.  (The old
        # code used the synonym *lists* as new keys, which raised
        # TypeError: unhashable type.)
        normalized = {
            replace_synonyms(k, dict_synonyms): replace_synonyms(v, dict_synonyms)
            for k, v in text.items()
        }
        text.clear()
        text.update(normalized)

    return text
def type_check(self, key, value):
    """Tell whether ``value`` is acceptable for metadata field ``key``.

    Always returns a real ``bool``; earlier versions fell through to
    ``None`` for several rejected inputs (callers only test truthiness, so
    this is backward compatible).

    Rules, all read from ``self.config_format``:
        * unknown key: rejected (a message is printed)
        * ``Modality`` / ``ROI``: value must be one of the field's options
        * ``OriImg_path`` / ``Dataset_name``: ``str``
        * ``Label_path``: dict whose keys are all listed in the field's
          ``keys`` and whose values are dicts; the leaf path values are not
          validated (the previous per-leaf check was a no-op)
        * ``Label_tissue`` / ``Task``: list whose items are all listed in
          the field's ``items.options``
        * ``Spacing_mm``: ``float`` (an ``int`` is rejected)
        * ``Size``: list of at least three ``int`` values
        * ``Sub_modality`` / ``Label_Dict``: ``dict``
        * any other key: rejected
    """
    if key not in self.config_format.keys():
        print(key, "is not a valid key")
        return False

    spec = self.config_format[key]

    if key in ('Modality', 'ROI'):
        return value in spec['options']

    if key in ('OriImg_path', 'Dataset_name'):
        return isinstance(value, str)

    if key == 'Label_path':
        if not isinstance(value, dict):
            return False
        # Non-dict entries used to crash while being indexed; reject them.
        return all(
            task in spec['keys'] and isinstance(paths, dict)
            for task, paths in value.items()
        )

    if key in ('Label_tissue', 'Task'):
        if not isinstance(value, list):
            return False
        allowed = spec['items']['options']
        return all(item in allowed for item in value)

    if key == 'Spacing_mm':
        # The original wrote a bare ``False`` here and returned None.
        return isinstance(value, float)

    if key == 'Size':
        # More than three entries are allowed (e.g. a stacked 4-D volume).
        return (
            isinstance(value, list)
            and len(value) >= 3
            and all(isinstance(item, int) for item in value)
        )

    if key in ('Sub_modality', 'Label_Dict'):
        return isinstance(value, dict)

    return False
if __name__ == '__main__':
    # Manual smoke test of meta_data and get_img_path_from_folder.
    meta = meta_data()
    print(meta.get_keytypes_flatten())
    print(meta.get_ketytypes())
    meta.add_keyvalue('Modality', 'CT')
    meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT')
    meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}})
    meta.add_keyvalue('Spacing_mm', 1.5)
    meta.add_keyvalue('Size', [512, 512, 100])
    meta.add_keyvalue('Dataset_name', 'CT')
    meta.add_keyvalue('Label_tissue', ['1', '2', '3'])
    meta.add_keyvalue('Task', ['1', '2', '3'])
    print(meta.get_meta_data())
    # meta_data defines add_extra_keyvalue; the former add_extra_key call
    # raised AttributeError and stopped the demo here.
    meta.add_extra_keyvalue('extra', 'extra')
    print(meta.get_meta_data())
    print(meta.get_ketytypes())
    # Call the accessor instead of printing the bound method object.
    print(meta.get_keytypes_flatten())

    org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT'
    img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation')
    print(img_paths)
+ "type": "dict", + "required": false, + "keys": [ + "lung", + "liver", + "heart", + "brain", + "kidney" + ], + "value": { + "type": "string", + "required": false + } + } + }, + "ROI": { + "type": "option", + "required": false, + "options": [ + "chest-abdomen", + "abdomen-pelvis", + "head", + "neck", + "skeleton", + "chest", + "abdomen", + "shoulder", + "leg", + "arm", + "hand", + "foot", + "pelvis" + ] + }, + "Label_tissue": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "lung", + "liver", + "heart", + "brain", + "kidney", + "spleen", + "pancreas", + "stomach", + "intestine", + "muscle", + "bone" + ] + } + }, + "Task": { + "type": "list", + "required": false, + "items": { + "type": "option", + "required": true, + "options": [ + "classification", + "segmentation" + ] + } + }, + "Spacing_mm": { + "type": "float", + "required": true + }, + "Size": { + "type": "list", + "required": true, + "items": { + "type": "int", + "required": true + } + }, + "Dataset_name": { + "type": "string", + "required": true + } +} \ No newline at end of file diff --git a/kaggle_osic_clean/dataclean_kaggle.py b/kaggle_osic_clean/dataclean_kaggle.py new file mode 100644 index 0000000000000000000000000000000000000000..a37a298b177141db7d635471f99127d3f1f7f6c7 --- /dev/null +++ b/kaggle_osic_clean/dataclean_kaggle.py @@ -0,0 +1,239 @@ +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, 
def find_metadata_files(path):
    """Return the CSV files that sit directly inside ``path``.

    Args:
        path: Directory to search. Only its top level is inspected.

    Returns:
        A sorted list of matching file paths (empty when nothing matches or
        the directory does not exist).
    """
    # Escape the directory part so characters such as "[" in a dataset path
    # are matched literally instead of being read as glob syntax.
    search_pattern = os.path.join(glob.escape(path), '*.csv')
    # No "**" in the pattern, so recursive matching would be a no-op; sorting
    # makes the result independent of filesystem enumeration order.
    return sorted(glob.glob(search_pattern))
def convert_windows_to_linux_path(windows_path):
    """Convert a Windows-style path to a POSIX-style one.

    Some metadata files store Windows paths although the data lives on a
    Linux server. Backslashes become forward slashes and a leading drive
    letter ("C:") is removed.

    Args:
        windows_path: Path string, Windows- or POSIX-style.

    Returns:
        The path with forward slashes and without a drive prefix.
    """
    linux_path = windows_path.replace('\\', '/')
    drive, colon, remainder = linux_path.partition(':')
    # Strip only a genuine single-letter drive prefix; a colon elsewhere in
    # the path (e.g. "/data/a:b") is part of a name and must be kept.
    is_drive_letter = (
        len(drive) == 1
        and drive in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    )
    if colon and is_drive_letter:
        return remainder
    return linux_path
modality=dtag.GetMetaData('0008|0060')##Modality + study='OSIC_PFP'##Dataset_name + CIA_other_info = { + 'Study_UID':uid, + 'metadata_file':'' + # 'Series_Description':serise_desc + } + if mf_flag: + CIA_other_info['metadata_file']=meta_file + size = list(dicom_image.GetSize()) + + resampler =util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size) + + # resize the image + if resampler is not None: + proces_image = resampler.Execute(dicom_image) + CIA_other_info['Resample'] = True + else: + proces_image = dicom_image + CIA_other_info['Resample'] = False + + # threshold the image + if 'CT' in modality: + proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT) + else: + pass + except RuntimeError: + failed_files.append(full_path) + print(f"Failed to load DICOM images from {full_path}") + continue + + + + + meta.add_keyvalue('Spacing_mm',min(spacing_info)) + meta.add_keyvalue('OriImg_path',full_path) + meta.add_keyvalue('Size',size) + meta.add_keyvalue('Modality',modality) + meta.add_keyvalue('Dataset_name',study) + meta.add_keyvalue('ROI','lung') + + + + + + + meta.add_extra_keyvalue('Metadata',CIA_other_info) + + + + output_path = os.path.join(output_dir,pid_dir, f"{os.path.basename(full_path)}.nii.gz") + # output_path=convert_windows_to_linux_path(output_path) + save_nifti(proces_image, output_path, full_path) + print(f"Saved NIfTI file to {output_path}") + + # Write the mapping to the JSON file on the fly + with open(json_output_path, 'r+') as json_file: + existing_mappings = json.load(json_file) + existing_mappings[output_path] = meta.get_meta_data() + json_file.seek(0) + + json.dump(existing_mappings, json_file, indent=4) + json_file.truncate() + else: + print("No metadata.csv files found.") + + with open(failed_files_path, "w") as json_file: + json.dump(failed_files, json_file) + + print(f"The list has been written to {failed_files_path}") + print(f"Saved NIfTI mappings to {json_output_path}") + +if __name__ == 
"__main__": + parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/Kaggle/osic_pulmonary_fibrosis_progression_Segmentation/osic-pulmonary-fibrosis-progression") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + + diff --git a/kaggle_osic_clean/dataclean_kaggle_seg.py b/kaggle_osic_clean/dataclean_kaggle_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..166f8acae7fce52a2e7148e47da6d11ab2787256 --- /dev/null +++ b/kaggle_osic_clean/dataclean_kaggle_seg.py @@ -0,0 +1,247 @@ +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# print(embeder, input_size, in_size) +# print(tokenizer) + + +# print(embeder_output1) +# print(embeder_output1.shape) # torch.Size([1, 8, 768]) + + +# print(embeder_output2) +# print(embeder_output2.shape) # torch.Size([1, 8, 
768]) + + +# error = torch.abs(embeder_output1 - embeder_output2) +# print(error) +# print("Embedding distance between the two sentences: ") +# print(f"String1: {string1}") +# print(f"String2: {string2}") +# print(torch.mean(error)) + + +# exit() +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = [-1,0] # MRI images threshold placeholder TBC... + + +# def find_metadata_files(path): +# # for Cancer Image Archive (TCIA) dataset +# search_pattern = os.path.join(path, '**', 'metadata.csv') +# return glob.glob(search_pattern, recursive=True) + +def find_metadata_files(path): + # for Cancer Image Archive (TCIA) dataset + search_pattern = os.path.join(path, '*.csv') + return glob.glob(search_pattern, recursive=True) +##added by yanguoqing on 20250527 +def find_image_dirs(path): + return os.listdir(path) + +##modify by yanguoqing on 20250527 +def load_dicom_images(folder_path): + reader = sitk.ImageSeriesReader() + dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileNames(dicom_names) + image = reader.Execute() + return dicom_names,image + +def load_nrrd(fp): + return sitk.ReadImage(fp) +##added by yanguoqing on 20250527 +def load_dicom_tag(imgs): + reader = sitk.ImageFileReader() + # dicom_names = reader.GetGDCMSeriesFileNames(folder_path) + reader.SetFileName(imgs) + reader.ReadImageInformation() # 仅读取元信息,不加载像素数据 + # metadata_keys = reader.GetMetaDataKeys() + tag=reader.Execute() + return tag + +def save_nifti(image, output_path, folder_path): + # Set metadata in the NIfTI file's header + output_dirpath = os.path.dirname(output_path) + if not os.path.exists(output_dirpath): + print(f"Creating directory {output_dirpath}") + os.makedirs(output_dirpath) + # Set metadata in the NIfTI file's header + image.SetMetaData("FolderPath", folder_path) + sitk.WriteImage(image, output_path) + +##modify by yanguoqing on 20250527 +def convert_windows_to_linux_path(windows_path): + # Replace backslashes with forward slashes and remove the drive letter + # Some meta 
def main(target_path, output_dir):
    """Convert the segmentation masks under ``target_path`` to NIfTI and index them.

    Walks ``target_path/<pid_dir>/<pid_dir2>/<mask file>``. Each mask is read,
    optionally resampled (nearest neighbour), written to
    ``output_dir/<uid>/segmentation/<label>.nii.gz`` and recorded in
    ``output_dir/nifti_mappings.json``. Files that SimpleITK cannot read are
    collected in ``output_dir/failed_files.json``.

    Args:
        target_path: Root directory of the ground-truth masks.
        output_dir: Directory receiving the NIfTI files and the two JSON files.
    """
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    pid_dirs = find_image_dirs(target_path)
    if not pid_dirs:
        print("No metadata.csv files found.")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        pid_path = os.path.join(target_path, pid_dir)
        if not os.path.isdir(pid_path):
            continue

        # The label name is the second "_"-separated token of the directory
        # name; skip directories that do not follow that scheme instead of
        # dying on an IndexError halfway through the run.
        name_parts = pid_dir.split("_")
        if len(name_parts) < 2:
            print(f"Skipping {pid_path}: directory name has no '_<label>' part")
            continue
        label_name = name_parts[1]

        for pid_dir2 in find_image_dirs(pid_path):
            series_path = os.path.join(pid_path, pid_dir2)
            # A stray file at this level would make os.listdir raise.
            if not os.path.isdir(series_path):
                continue
            meta_file = os.path.join(series_path, '%s.csv' % pid_dir)
            mf_flag = os.path.isfile(meta_file)

            for data_dir in tqdm(find_image_dirs(series_path), desc="Processing images files"):
                full_path = os.path.join(series_path, data_dir)
                if not os.path.isfile(full_path):
                    continue
                try:
                    print(full_path)
                    label_image = load_nrrd(full_path)
                    spacing_info = label_image.GetSpacing()

                    # The case id is the file-name prefix before the first "_".
                    uid = data_dir.split("_")[0]
                    TASK_VALUE = "segmentation"
                    study = 'OSIC_PFP'  # Dataset_name
                    CIA_other_info = {
                        'Study_UID': uid,
                        'metadata_file': meta_file if mf_flag else '',
                    }
                    size = list(label_image.GetSize())

                    # Nearest-neighbour interpolation keeps label values discrete.
                    resampler = util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
                    if resampler is not None:
                        proces_image = resampler.Execute(label_image)
                        CIA_other_info['Resample'] = True
                    else:
                        proces_image = label_image
                        CIA_other_info['Resample'] = False
                except RuntimeError:
                    failed_files.append(full_path)
                    print(f"Failed to load DICOM images from {full_path}")
                    continue

                # NOTE(review): config_format.json declares Task/Label_tissue as
                # lists, Label_path as a dict, and has no 'lung' ROI option;
                # confirm that meta_data accepts the values passed below.
                meta.add_keyvalue('Spacing_mm', min(spacing_info))
                meta.add_keyvalue('OriImg_path', full_path)
                meta.add_keyvalue('Size', size)
                meta.add_keyvalue('Modality', 'CT')
                meta.add_keyvalue('Dataset_name', study)
                meta.add_keyvalue('ROI', 'lung')
                meta.add_keyvalue('Task', TASK_VALUE)
                meta.add_keyvalue('Label_tissue', label_name)
                meta.add_keyvalue('Label_path', full_path)
                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                output_path = os.path.join(output_dir, uid, TASK_VALUE, f"{label_name}.nii.gz")
                save_nifti(proces_image, output_path, full_path)
                print(f"Saved NIfTI file to {output_path}")

                # Write the mapping to the JSON file on the fly so that an
                # interrupted run keeps everything converted so far.
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_path] = meta.get_meta_data()
                    json_file.seek(0)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()

    # "w" creates the file on a first run and truncates an older, longer list;
    # "r+" would raise FileNotFoundError when the file does not exist yet.
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
+ parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.") + parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/Kaggle/osic_pulmonary_fibrosis_progression_Segmentation/GT") + parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic/") + args = parser.parse_args() + print(args.target_path, args.output_dir) + main(args.target_path, args.output_dir) + + + + + + + + + + + + + + + diff --git a/kaggle_osic_clean/dataclean_kaggle_update.py b/kaggle_osic_clean/dataclean_kaggle_update.py new file mode 100644 index 0000000000000000000000000000000000000000..919bac13e7cd3c978be503bfec9fd4e5697acf58 --- /dev/null +++ b/kaggle_osic_clean/dataclean_kaggle_update.py @@ -0,0 +1,342 @@ +#coding:utf-8 +''' +write by ygq +create on 2025-07-24 +update kaggle data clean + +依次解析train.csv以及test.csv文件,获取每个数据集基本信息; +根据解析的id查找对应的train/test目录下的影像并做规范处理,同时查找label的segment目录下的标签,提取不同部位的CT的标签位置保存到json文件中; +完成后保存json并退出 + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# 
def find_image_dirs(path):
    """Return the names of all entries directly inside ``path``, sorted.

    Despite the name, files are returned as well as directories; callers
    filter with ``os.path.isdir`` / ``os.path.isfile`` themselves.

    Args:
        path: Directory to list.

    Returns:
        Sorted list of entry names (not full paths).
    """
    # os.listdir order is arbitrary; sorting makes the processing order, and
    # therefore the generated JSON and failure list, reproducible across runs.
    return sorted(os.listdir(path))
def _lookup_clinical_fields(df_meta, data_dir):
    """Return the clinical CSV fields for ``data_dir`` as Metadata entries.

    Falls back to the folder name as Image_id and empty strings elsewhere when
    there is no table or no row whose patient id equals ``data_dir``.
    """
    fields = {
        'Image_id': data_dir,
        'Weeks': '',
        'FVC': '',
        'Percent': '',
        'Age': '',
        'Sex': '',
        'Smoke_Status': '',
    }
    if df_meta is None:
        return fields
    rows = df_meta[df_meta[meta_id_name] == data_dir]
    if rows.shape[0] == 0:
        return fields
    # Only the first matching row is used.
    rows = rows.reset_index()
    fields['Image_id'] = rows[meta_id_name][0]
    fields['Weeks'] = str(rows[meta_weeks_name][0])
    fields['FVC'] = str(rows[meta_fvc_name][0])
    fields['Percent'] = str(rows[meta_percent_name][0])
    fields['Age'] = str(rows[meta_age_name][0])
    fields['Sex'] = rows[meta_sex_name][0]
    fields['Smoke_Status'] = rows[meta_status_name][0]
    return fields


def main(target_path, output_dir):
    """Convert the DICOM series and their labels to NIfTI and index them.

    For every split directory under ``target_path`` (with an optional
    ``<split>.csv`` table next to it) each patient folder is read as a DICOM
    series, optionally resampled, clamped when the modality is CT, and written
    to ``output_dir/<patient>/<patient>.nii.gz``. Matching ``.nrrd`` labels are
    looked up in the sibling ``GT`` directory, resampled with nearest-neighbour
    interpolation and written below ``output_dir/<id>/segmentation``. One
    record per case goes to ``nifti_mappings.json``; cases that fail to load or
    whose labels do not match the image size go to ``failed_files.json``.

    Args:
        target_path: Root directory holding the split folders and CSV tables.
        output_dir: Directory receiving the NIfTI files and the two JSON files.
    """
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # Labels live in a "GT" directory next to target_path; normpath keeps that
    # true when target_path is given with a trailing slash.
    label_paths = os.path.join(os.path.dirname(os.path.normpath(target_path)), 'GT')

    pid_dirs = find_image_dirs(target_path)
    if not pid_dirs:
        print("No metadata.csv files found.")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        split_path = os.path.join(target_path, pid_dir)
        if not os.path.isdir(split_path):
            continue
        meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
        mf_flag = os.path.isfile(meta_file)
        # Reset per split: without a CSV there is no table, rather than an
        # unbound name or the previous split's table.
        df_meta = pd.read_csv(meta_file, sep=',') if mf_flag else None

        for data_dir in tqdm(find_image_dirs(split_path), desc="Processing images files"):
            full_path = os.path.join(split_path, data_dir)
            if not os.path.isdir(full_path):
                continue
            clinical = _lookup_clinical_fields(df_meta, data_dir)

            label_path_dict = {}
            size_mismatch = False
            try:
                print(full_path)
                dicom_fp, dicom_image = load_dicom_images(full_path)

                spacing_info = dicom_image.GetSpacing()
                print('SPACING INFO:', spacing_info)

                dtag = load_dicom_tag(dicom_fp[0])
                uid = dtag.GetMetaData('0020|000e')  # Series Instance UID
                modality = dtag.GetMetaData('0008|0060')  # Modality
                study = 'OSIC_PFP'  # Dataset_name
                CIA_other_info = {
                    'Study_UID': uid,
                    'metadata_file': meta_file if mf_flag else '',
                    'split': pid_dir,
                }

                size = list(dicom_image.GetSize())
                resampler = util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)

                # resize the image
                if resampler is not None:
                    proces_image = resampler.Execute(dicom_image)
                    print('SPACIE INFO AFTER', proces_image.GetSpacing())
                    CIA_other_info['Resample'] = True
                else:
                    proces_image = dicom_image
                    CIA_other_info['Resample'] = False

                CIA_other_info.update(clinical)

                # threshold the image
                if 'CT' in modality:
                    proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)

                output_path = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                save_nifti(proces_image, output_path, full_path)
                print(f"Saved NIfTI file to {output_path}")

                # Segmentation labels: GT/<dir>/<dir>/<patient>_*.nrrd
                label_files = glob.glob("%s/*/*/%s_*.nrrd" % (glob.escape(label_paths), glob.escape(data_dir)))
                for lf in label_files:
                    lf_name = os.path.basename(lf)
                    lf_id = lf_name.split("_")[0]
                    lf_tissue = os.path.basename(os.path.dirname(lf)).split("_")[1]
                    label_image = load_nrrd(lf)
                    # Nearest-neighbour interpolation keeps label values discrete.
                    resampler = util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
                    if resampler is not None:
                        proces_label = resampler.Execute(label_image)
                    else:
                        proces_label = label_image

                    # Check every label, not just the last one, and never a
                    # label left over from a previous case.
                    print(proces_image.GetSize(), proces_label.GetSize())
                    if proces_image.GetSize() != proces_label.GetSize():
                        size_mismatch = True
                        break

                    label_output_path = os.path.join(output_dir, lf_id, TASK_VALUE, f"{lf_name}.nii.gz")
                    label_path_dict[lf_tissue] = label_output_path
                    util.save_nifti(proces_label, label_output_path, lf)
                    print(f"Saved Label Segment NIfTI file to {label_output_path}")
            except RuntimeError:
                failed_files.append(full_path)
                print(f"Failed to load DICOM images from {full_path}")
                continue

            if size_mismatch:
                failed_files.append(full_path)
                continue

            # A case without labels is still a valid image-only record.
            label_flag = bool(label_path_dict)
            size_processed = list(proces_image.GetSize())

            # Fresh record per case so label keys written for an earlier case
            # cannot leak into a case that has no labels.
            meta = meta_data()
            # NOTE(review): config_format.json has no 'Image_id' key, lists no
            # 'whole-body' ROI option and declares Task as a list; confirm
            # that meta_data accepts the values passed below.
            meta.add_keyvalue('Image_id', clinical['Image_id'])
            meta.add_keyvalue('Spacing_mm', min(spacing_info))
            meta.add_keyvalue('OriImg_path', full_path)
            meta.add_keyvalue('Size', size_processed)  # size after processing
            meta.add_keyvalue('Modality', modality)
            meta.add_keyvalue('Dataset_name', study)
            meta.add_keyvalue('ROI', 'whole-body')

            if label_flag:
                print(label_path_dict.keys())
                meta.add_keyvalue('Task', TASK_VALUE)
                meta.add_keyvalue('Label_tissue', list(label_path_dict.keys()))
                meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})

            meta.add_extra_keyvalue('Metadata', CIA_other_info)

            # Write the mapping to the JSON file on the fly so that an
            # interrupted run keeps everything converted so far.
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_path] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
+write by ygq +create on 2025-07-24 +update kaggle data clean + +依次解析train.csv以及test.csv文件,获取每个数据集基本信息; +根据解析的id查找对应的train/test目录下的影像并做规范处理,同时查找label的segment目录下的标签,提取不同部位的CT的标签位置保存到json文件中; +完成后保存json并退出 + +''' +import os +import glob +import pandas as pd +import SimpleITK as sitk +import argparse +import json +from tqdm import tqdm +from util import meta_data +import util +import numpy as np +# from bert_helper import * + +# model_name = "bert-large-uncased" +# reduce_method = 'mean' +# max_words_num = 32 # max number of words in the caption > 2 + +# embeder, tokenizer = get_frozen_embeder(model_name) + +# string1 = "modality: ct, gender: female, age: 51, roi: abdomen" +# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# string2 = "modality: ct, gender: female, age: 50, roi: head" + +# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method) + +# input_size = embeder.config.vocab_size +# in_size = embeder.config.hidden_size + +# print(embeder, input_size, in_size) +# print(tokenizer) + + +# print(embeder_output1) +# print(embeder_output1.shape) # torch.Size([1, 8, 768]) + + +# print(embeder_output2) +# print(embeder_output2.shape) # torch.Size([1, 8, 768]) + + +# error = torch.abs(embeder_output1 - embeder_output2) +# print(error) +# print("Embedding distance between the two sentences: ") +# print(f"String1: {string1}") +# print(f"String2: {string2}") +# print(torch.mean(error)) + + +# exit() + + +meta_id_name='Patient' +meta_weeks_name='Weeks' +meta_fvc_name='FVC' +meta_percent_name='Percent' +meta_age_name='Age' +meta_sex_name='Sex' +meta_status_name='SmokingStatus' + +TASK_VALUE="segmentation" +CLAMP_RANGE_CT = [-300,300] +CLAMP_RANGE_MRI = [-1,0] # MRI images threshold placeholder TBC... 
def _lookup_clinical_fields(df_meta, data_dir):
    """Return the clinical CSV fields for ``data_dir`` as Metadata entries.

    Falls back to the folder name as Image_id and empty strings elsewhere when
    there is no table or no row whose patient id equals ``data_dir``.
    """
    fields = {
        'Image_id': data_dir,
        'Weeks': '',
        'FVC': '',
        'Percent': '',
        'Age': '',
        'Sex': '',
        'Smoke_Status': '',
    }
    if df_meta is None:
        return fields
    rows = df_meta[df_meta[meta_id_name] == data_dir]
    if rows.shape[0] == 0:
        return fields
    # Only the first matching row is used.
    rows = rows.reset_index()
    fields['Image_id'] = rows[meta_id_name][0]
    fields['Weeks'] = str(rows[meta_weeks_name][0])
    fields['FVC'] = str(rows[meta_fvc_name][0])
    fields['Percent'] = str(rows[meta_percent_name][0])
    fields['Age'] = str(rows[meta_age_name][0])
    fields['Sex'] = rows[meta_sex_name][0]
    fields['Smoke_Status'] = rows[meta_status_name][0]
    return fields


def main(target_path, output_dir):
    """Convert the DICOM series and their labels to NIfTI and index them.

    For every split directory under ``target_path`` (with an optional
    ``<split>.csv`` table next to it) each patient folder is read as a DICOM
    series, optionally resampled, clamped when the modality is CT, and written
    to ``output_dir/<patient>/<patient>.nii.gz``. Matching ``.nrrd`` labels are
    looked up in the sibling ``GT`` directory, resampled with nearest-neighbour
    interpolation and written below ``output_dir/<id>/segmentation``. One
    record per case goes to ``nifti_mappings.json``; cases that fail to load or
    whose labels do not match the image size go to ``failed_files.json``.

    Args:
        target_path: Root directory holding the split folders and CSV tables.
        output_dir: Directory receiving the NIfTI files and the two JSON files.
    """
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # Labels live in a "GT" directory next to target_path; normpath keeps that
    # true when target_path is given with a trailing slash.
    label_paths = os.path.join(os.path.dirname(os.path.normpath(target_path)), 'GT')

    pid_dirs = find_image_dirs(target_path)
    if not pid_dirs:
        print("No metadata.csv files found.")

    for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
        split_path = os.path.join(target_path, pid_dir)
        if not os.path.isdir(split_path):
            continue
        meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
        mf_flag = os.path.isfile(meta_file)
        # Reset per split: without a CSV there is no table, rather than an
        # unbound name or the previous split's table.
        df_meta = pd.read_csv(meta_file, sep=',') if mf_flag else None

        for data_dir in tqdm(find_image_dirs(split_path), desc="Processing images files"):
            full_path = os.path.join(split_path, data_dir)
            if not os.path.isdir(full_path):
                continue
            clinical = _lookup_clinical_fields(df_meta, data_dir)

            label_path_dict = {}
            size_mismatch = False
            try:
                print(full_path)
                dicom_fp, dicom_image = load_dicom_images(full_path)

                spacing_info = dicom_image.GetSpacing()
                print('SPACING INFO:', spacing_info)

                dtag = load_dicom_tag(dicom_fp[0])
                uid = dtag.GetMetaData('0020|000e')  # Series Instance UID
                modality = dtag.GetMetaData('0008|0060')  # Modality
                study = 'OSIC_PFP'  # Dataset_name
                CIA_other_info = {
                    'Study_UID': uid,
                    'metadata_file': meta_file if mf_flag else '',
                    'split': pid_dir,
                }

                size = list(dicom_image.GetSize())
                resampler = util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)

                # resize the image
                if resampler is not None:
                    proces_image = resampler.Execute(dicom_image)
                    print('SPACIE INFO AFTER', proces_image.GetSpacing())
                    CIA_other_info['Resample'] = True
                else:
                    proces_image = dicom_image
                    CIA_other_info['Resample'] = False

                CIA_other_info.update(clinical)

                # threshold the image
                if 'CT' in modality:
                    proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)

                output_path = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                save_nifti(proces_image, output_path, full_path)
                print(f"Saved NIfTI file to {output_path}")

                # Segmentation labels: GT/<dir>/<dir>/<patient>_*.nrrd
                label_files = glob.glob("%s/*/*/%s_*.nrrd" % (glob.escape(label_paths), glob.escape(data_dir)))
                for lf in label_files:
                    lf_name = os.path.basename(lf)
                    lf_id = lf_name.split("_")[0]
                    lf_tissue = os.path.basename(os.path.dirname(lf)).split("_")[1]
                    label_image = load_nrrd(lf)
                    # Nearest-neighbour interpolation keeps label values discrete.
                    resampler = util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
                    if resampler is not None:
                        proces_label = resampler.Execute(label_image)
                    else:
                        proces_label = label_image

                    # Explicit comparison instead of a bare assert: assert is
                    # stripped under -O and would otherwise abort the whole
                    # run on a single bad case.
                    if proces_image.GetSize() != proces_label.GetSize():
                        size_mismatch = True
                        break

                    label_output_path = os.path.join(output_dir, lf_id, TASK_VALUE, f"{lf_name}.nii.gz")
                    label_path_dict[lf_tissue] = label_output_path
                    util.save_nifti(proces_label, label_output_path, lf)
                    print(f"Saved Label Segment NIfTI file to {label_output_path}")
            except RuntimeError:
                failed_files.append(full_path)
                print(f"Failed to load DICOM images from {full_path}")
                continue

            if size_mismatch:
                failed_files.append(full_path)
                continue

            # A case without labels is still a valid image-only record.
            label_flag = bool(label_path_dict)
            size_processed = list(proces_image.GetSize())

            # Fresh record per case so label keys written for an earlier case
            # cannot leak into a case that has no labels.
            meta = meta_data()
            # NOTE(review): config_format.json has no 'Image_id' key, lists no
            # 'whole-body' ROI option and declares Task as a list; confirm
            # that meta_data accepts the values passed below.
            meta.add_keyvalue('Image_id', clinical['Image_id'])
            meta.add_keyvalue('Spacing_mm', min(spacing_info))
            meta.add_keyvalue('OriImg_path', full_path)
            meta.add_keyvalue('Size', size_processed)  # size after processing
            meta.add_keyvalue('Modality', modality)
            meta.add_keyvalue('Dataset_name', study)
            meta.add_keyvalue('ROI', 'whole-body')

            if label_flag:
                print(label_path_dict.keys())
                meta.add_keyvalue('Task', TASK_VALUE)
                meta.add_keyvalue('Label_tissue', list(label_path_dict.keys()))
                meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})

            meta.add_extra_keyvalue('Metadata', CIA_other_info)

            # Write the mapping to the JSON file on the fly so that an
            # interrupted run keeps everything converted so far.
            with open(json_output_path, 'r+') as json_file:
                existing_mappings = json.load(json_file)
                existing_mappings[output_path] = meta.get_meta_data()
                json_file.seek(0)
                json.dump(existing_mappings, json_file, indent=4)
                json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
def read_table(file_path, split_str=';'):
    """Load a tabular file, trying Excel first and delimited text second.

    Args:
        file_path: Path of the table to read.
        split_str: Column separator used when the file is read as delimited
            text (default ``';'``).

    Returns:
        pandas.DataFrame with the file's contents.

    Raises:
        Whatever ``pandas.read_csv`` raises when the text fallback also fails
        (for example ``FileNotFoundError`` for a missing file).
    """
    try:
        return pd.read_excel(file_path, engine='openpyxl')
    except Exception:
        # Deliberate best-effort: any failure of the Excel reader (not an
        # xlsx workbook, openpyxl missing, ...) falls back to delimited text.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return pd.read_csv(file_path, sep=split_str)
def get_img_path_from_folder(folder_path, img_type='.nii.gz', include_str=None, exclude_str='segmentation', is_sorted=True):
    """Walk ``folder_path`` recursively and list the image files it contains.

    A file is kept when its name ends with ``img_type``, contains
    ``include_str`` (if one is given) and does not contain ``exclude_str``
    (if one is given).  Only the file name is tested, not its directory.

    Returns:
        list of paths, each joined from the walked directory and the file
        name; sorted lexicographically when ``is_sorted`` is true, otherwise
        in ``os.walk`` order.
    """
    def _wanted(fname):
        # Suffix filter first, then the optional include / exclude filters.
        if not fname.endswith(img_type):
            return False
        if include_str is not None and include_str not in fname:
            return False
        return exclude_str is None or exclude_str not in fname

    img_path = [
        os.path.join(root, fname)
        for root, _dirs, files in os.walk(folder_path)
        for fname in files
        if _wanted(fname)
    ]
    if is_sorted:
        img_path.sort()
    return img_path
def clamp_image(in_img, clamp_range):
    """Run a ``sitk.ClampImageFilter`` over ``in_img``.

    Args:
        in_img: Image handed to the filter's ``Execute``.
        clamp_range: Indexable pair; element 0 is set as the filter's lower
            bound and element 1 as its upper bound.

    Returns:
        The image produced by the filter.
    """
    clamper = sitk.ClampImageFilter()
    lower_bound = clamp_range[0]
    clamper.SetLowerBound(lower_bound)
    upper_bound = clamp_range[1]
    clamper.SetUpperBound(upper_bound)
    clamped = clamper.Execute(in_img)
    return clamped
abdomen pelvis leg', 'neck and thorax and abdomen and pelvis and leg', 'neck, thorax, abdomen, pelvis & leg', 'neck/thorax/abdomen/pelvis/leg', 'neck, thorax, abdomen, pelvis and leg', 'neck thorax abdomen pelvis leg'], + 'neck-thorax-abdomen-pelvis': ['neck-thorax-abdomen-pelvis', 'neck-thx-abd-pelvis', 'neck thorax abdomen pelvis', 'neck and thorax and abdomen and pelvis', 'neck, thorax, abdomen & pelvis', 'neck/thorax/abdomen/pelvis', 'neck, thorax, abdomen and pelvis', 'neck thorax abdomen & pelvis'], + 'thorax-abdomen-pelvis-leg': ['thorax-abdomen-pelvis-leg','thx-abd-pelvis-leg', 'angiography thx-abd-pelvis-leg', 'thorax abdomen pelvis leg', 'thorax and abdomen and pelvis and leg', 'thorax, abdomen, pelvis & leg', 'thorax/abdomen/pelvis/leg', 'thorax, abdomen, pelvis and leg', 'thorax abdomen pelvis leg'], + 'neck-thorax-abdomen': ['neck-thorax-abdomen', 'neck-thorax-abdomen', 'neck thorax abdomen', 'neck and thorax and abdomen', 'neck, thorax, abdomen', 'neck/thorax/abdomen', 'neck, thorax, abdomen', 'neck thorax abdomen'], + 'head-neck-thorax-abdomen': ['head-neck-thorax-abdomen', 'head-neck-thorax-abdomen', 'head neck thorax abdomen', 'head and neck and thorax and abdomen', 'head, neck, thorax, abdomen', 'head/thorax/abdomen', 'head, thorax, abdomen', 'head thorax abdomen'], + 'head-neck-thorax': ['head-neck-thorax', 'head neck thorax', 'head and neck and thorax', 'head, neck, thorax', 'head/thorax', 'head, thorax', 'head thorax'], + 'thorax-abdomen-pelvis': ['thorax-abdomen-pelvis', 'thx-abd-pelvis', 'polytrauma', 'thorax abdomen pelvis', 'thorax and abdomen and pelvis', 'thorax, abdomen & pelvis', 'thorax/abdomen/pelvis', 'thorax, abdomen and pelvis', 'thorax abdomen & pelvis'], + 'abdomen-pelvis-leg': ['abdomen-pelvis-leg', 'angiography abdomen-pelvis-leg', 'abd-pelvis-leg', 'abdomen pelvis leg', 'abdomen and pelvis and leg', 'abdomen, pelvis & leg', 'abdomen/pelvis/leg', 'abdomen, pelvis, leg', 'abdomen pelvis leg'], + 'neck-thorax': ['neck-thorax', 
'neck thorax', 'neck and thorax', 'neck, thorax', 'thorax-neck', 'thorax neck', 'thorax and neck', 'thorax, neck','thorax/neck'], + 'thorax-abdomen': ['thorax-abdomen', 'thorax abdomen', 'thorax and abdomen', 'thorax, abdomen'], + 'abdomen-pelvis': ['abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis', 'abdomen-pelvis', 'abdomen pelvis', 'abdomen and pelvis', 'abdomen & pelvis', 'abdomen/pelvis'], + 'pelvis-leg': ['pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg', 'pelvis-leg', 'pelvis leg', 'pelvis and leg', 'pelvis, leg', 'pelvis/leg'], + 'head-neck': ['head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck', 'head-neck', 'head neck', 'head and neck', 'head, neck', 'head/neck'], + 'abdomen': ['abdomen', 'abdominal', 'belly', 'stomach', 'tummy', 'gut', 'guts', 'viscera', 'bowels', 'intestines', 'gastrointestinal', 'digestive', 'peritoneum','gastric', 'liver', 'spleen', 'pancreas','kidney','lumbar','renal','hepatic','splenic','pancreatic','intervention'], + 'thorax': ['chest', 'thorax', 'breast', 'lung', 'heart','heart-thorakale aorta', 'heart-thorakale', 'mediastinum', 'pleura', 'bronchus', 'bronchi', 'trachea', 'esophagus', 'diaphragm', 'rib', 'sternum', 'clavicle', 'scapula', 'axilla', 'armpit','breast biopsy','thoracic','mammary','caeiothoracic','mediastinal','pleural','bronchial','bronchial tree','tracheal','esophageal','diaphragmatic','costal','sternal','clavicular','scapular','axillary','axillar','cardiac','pericardial','pericardiac','pericardium'], + 'head': ['head', 'headbasis', 'brain', 'skull', 'face','nose','ear','eye','mouth','jaw','cheek','chin','forehead','temporal','parietal','occipital','frontal','mandible','maxilla','mandibular','maxillary','nasal','orbital','orbita','ocular','auricular','otic','oral','buccal','labial','lingual','palatal'], + 'neck': ['neck', 'throat', 'cervical', 'thyroid', 'trachea', 'larynx', 'pharynx', 
'esophagus','pharyngeal','laryngeal','cervical','thyroid','trachea','esophagus','carotid','jugular'], + 'hand': ['hand', 'finger', 'thumb', 'palm', 'wrist', 'knuckle', 'fingernail', 'phalanx', 'metacarpal', 'carpal', 'radius'], + 'arm': ['arm', 'forearm', 'upper arm', 'bicep', 'tricep', 'brachium', 'brachial', 'humerus', 'radius', 'ulna', 'elbow', 'shoulder', 'armpit''clavicle', 'scapula', 'acromion', 'acromioclavicular'], + 'leg': ['leg', 'felsenleg','thigh', 'calf', 'shin', 'knee', 'foot', 'ankle', 'toe', 'heel', 'sole', 'arch', 'instep', 'metatarsal', 'phalanx', 'tibia', 'fibula', 'femur', 'patella', 'kneecap','achilles tendon','achilles'], + 'pelvis': ['pelvis', 'hip', 'groin', 'buttock', 'gluteus', 'gluteal', 'ischium', 'pubis', 'sacrum', 'coccyx', 'acetabulum', 'iliac', 'iliac crest', 'iliac spine', 'iliac wing', 'sacroiliac', 'sacroiliac joint', 'sacroiliac ligament', 'sacroiliac spine', 'ureter', 'bladder', 'urethra', 'prostate', 'testicle', 'ovary', 'uterus',], + 'skeleton': ['skeleton','bone','spine', 'back', 'vertebra', 'sacrum', 'coccyx'], + } + elif dict_type == 'Label_tissue': + dict_synonyms = { + 'liver': ['liver','hepatic'], + 'spleen': ['spleen','splenic'], + 'kidney': ['kidney','renal'], + 'pancreas': ['pancreas','pancreatic'], + 'stomach': ['stomach','gastric'], + 'intestine': ['large intestine', 'small intestine','large bowel','small bowel'], + 'gallbladder': ['gallbladder'], + 'adrenal_gland': ['adrenal_gland','adrenal gland'], + 'bladder': ['bladder'], + 'prostate': ['prostate'], + 'uterus': ['uterus'], + 'ovary': ['ovary'], + 'testicle': ['testicle'], + 'lymph_node': ['lymph_node','lymph node'], + 'bone': ['bone'], + 'lung': ['lung'], + 'heart': ['heart'], + 'esophagus': ['esophagus'], + 'muscle': ['muscle'], + 'fat': ['fat'], + 'skin': ['skin'], + 'vessel': ['vessel'], + 'tumor': ['tumor'], + 'other': ['other'] + } + elif dict_type == 'Task': + dict_synonyms = { + 'segmentation': ['segmentation', 'seg', 'mask'], + 'classification': 
def replace_synonyms(text, dict_synonyms):
    """Map free-text terms onto the standard terms of a synonym dictionary.

    Args:
        text: A string, a list, or a dict.  Lists are normalised element by
            element; dicts have both their keys and their values normalised.
            Any other type is returned unchanged.
        dict_synonyms: Mapping ``standard term -> list of synonyms``.

    Returns:
        For a string: the first standard term (in dictionary order) that has
        a synonym occurring as a case-insensitive substring of ``text``; if
        none matches, a ``UserWarning`` is issued and ``text`` is returned
        unchanged.  For a list or dict: a new container with normalised
        content.  If two dict keys normalise to the same term, the later
        entry wins.
    """
    import warnings  # local import keeps this function self-contained

    if isinstance(text, str):
        lowered = text.lower()
        for standard, synonyms in dict_synonyms.items():
            if any(synonym.lower() in lowered for synonym in synonyms):
                return standard
        # The original built a Warning object without raising or emitting it,
        # so unmatched values passed through silently.
        warnings.warn(f"Value {text} is not in the correct format", stacklevel=2)
        return text

    if isinstance(text, list):
        return [replace_synonyms(item, dict_synonyms) for item in text]

    if isinstance(text, dict):
        # The original popped/inserted entries while iterating and used the
        # synonym *list* as the new key (TypeError: unhashable type).  Build
        # a fresh dict instead.
        return {
            replace_synonyms(key, dict_synonyms): replace_synonyms(value, dict_synonyms)
            for key, value in text.items()
        }

    return text
def type_check(self, key, value):
    """Validate ``value`` against the schema entry for ``key``.

    Reads only ``self.config_format``.  Always returns a ``bool``; the
    previous implementation returned ``None`` on several failure paths (for
    example a non-float ``Spacing_mm``, where ``False`` was written without
    ``return``) and for schema keys that had no dedicated branch.

    Rules:
        * ``Modality`` / ``ROI``: value must be one of the entry's ``options``.
        * ``OriImg_path`` / ``Dataset_name``: value must be a ``str``.
        * ``Label_path``: value must be a dict; for every task key listed in
          the entry's ``keys`` the task value must be a dict whose values are
          all ``str``.  Task keys not listed there are not inspected.
        * ``Label_tissue`` / ``Task``: value must be a list whose items are
          all in the entry's ``items.options``.
        * ``Spacing_mm``: value must be a ``float``.
        * ``Size``: value must be a list of exactly three ``int``.
        * any other schema key: value must match the entry's declared simple
          ``type`` (``string``, ``dict``, ``list``, ``int`` or ``float``).

    Returns:
        ``True`` when the value is acceptable, ``False`` otherwise (including
        when ``key`` is not in the schema, which is also reported on stdout).
    """
    if key not in self.config_format:
        print(key, "is not a valid key")
        return False
    spec = self.config_format[key]

    if key in ('Modality', 'ROI'):
        return value in spec['options']

    if key in ('OriImg_path', 'Dataset_name'):
        return isinstance(value, str)

    if key == 'Label_path':
        if not isinstance(value, dict):
            return False
        for task, paths in value.items():
            if task not in spec['keys']:
                continue
            # A non-dict here used to raise TypeError while indexing.
            if not isinstance(paths, dict):
                return False
            if not all(isinstance(path, str) for path in paths.values()):
                return False
        return True

    if key in ('Label_tissue', 'Task'):
        if not isinstance(value, list):
            return False
        allowed = spec['items']['options']
        return all(item in allowed for item in value)

    if key == 'Spacing_mm':
        return isinstance(value, float)

    if key == 'Size':
        return (
            isinstance(value, list)
            and len(value) == 3
            and all(isinstance(item, int) for item in value)
        )

    # Schema keys without a dedicated rule (e.g. dict-typed extras) are
    # checked against their declared simple type; unknown declared types are
    # rejected.
    simple_types = {'string': str, 'dict': dict, 'list': list, 'int': int, 'float': float}
    expected = simple_types.get(spec.get('type'))
    return expected is not None and isinstance(value, expected)
+ meta = meta_data() + print(meta.get_keytypes_flatten()) + print(meta.get_ketytypes()) + meta.add_keyvalue('Modality', 'CT') + meta.add_keyvalue('OriImg_path', 'C:/Users/jzheng/Desktop/CT') + meta.add_keyvalue('Label_path', {'ROI': {'1': 'C:/Users/jzheng/Desktop/CT/1'}, 'Tissue': {'1': 'C:/Users/jzheng/Desktop/CT/1'}}) + meta.add_keyvalue('Spacing_mm', 1.5) + meta.add_keyvalue('Size', [512, 512, 100]) + meta.add_keyvalue('Dataset_name', 'CT') + meta.add_keyvalue('Label_tissue', ['1', '2', '3']) + meta.add_keyvalue('Task', ['1', '2', '3']) + print(meta.get_meta_data()) + meta.add_extra_key('extra', 'extra') + print(meta.get_meta_data()) + print(meta.get_ketytypes()) + print(meta.get_keytypes_flatten) + + org_data_foler_path = '/home/jachin/data/Github/data/data_gen_def/DATASETS/TotalSegmentorCT_MRI/TS_CT' + img_paths = get_img_path_from_folder(org_data_foler_path, img_type='.nii.gz', include_str='ct', exclude_str='segmentation') + print(img_paths) \ No newline at end of file