#coding:utf-8
'''
Written by ygq, created on 2025-09-01.

OASIS (Open Access Series of Imaging Studies) is a project that makes brain MRI
data freely available to the research community. This cross-sectional dataset is
its first release, published in 2007.
OASIS-1 is cross-sectional, which means it cannot capture how an individual
changes over time. For studying disease progression, the later OASIS-2 and
OASIS-3 (longitudinal datasets) are the better choice.

1. Directory and file naming rules
One folder per subject session ID is created under the root directory.
Subject ID format: OAS1_xxxx (e.g. OAS1_0012)
Session ID format: OAS1_xxxx_MRy (e.g. OAS1_0012_MR1, y is the index of the imaging visit)

OAS1_xxxx_MRy/
│
├── OAS1_xxxx_MRy.xml      # XML metadata file with acquisition details and anatomical measures
├── OAS1_xxxx_MRy.txt      # Text file with the same content as the XML (easier to read)
├── RAW/                   # Raw scan images (DICOM or Analyze format)
├── PROCESSED/             # Pre-processed images
│   ├── SUBJ_111/          # Averaged, co-registered image in native space (isotropic 1mm³)
│   └── T88_111/           # Images in atlas-registered space
│       ├── t4_files/      # Registration transformation matrix files
│       └── ...            # Registered image files
└── FSL_SEG/               # Brain tissue segmentation generated from the atlas-registered image
                           # (gray matter 2 / white matter 3 / cerebrospinal fluid 1)

All images are stored in Analyze 7.5 format, consisting of:
  an image file (.img)
  a header file (.hdr)
  stored as 16-bit big-endian

OAS1_xxxx_MRy_mpr-z_anon                            single raw scan                                    256x256x128  1x1x1.25 mm  sagittal
OAS1_xxxx_MRy_mpr_ni_anon_sbj_111                   averaged co-registered image of multiple scans     256x256x160  1x1x1 mm     sagittal
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc               gain-field-corrected atlas-registered image        176x208x176  1x1x1 mm     transverse
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc        masked image with non-brain tissue removed         176x208x176  1x1x1 mm     transverse
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg   brain tissue segmentation image (gray/white/CSF)   176x208x176  1x1x1 mm     transverse

1. Demographic information
  Sex (M/F)
  Handedness (Hand) (all right-handed)
  Age (Age)
  Education level (Educ) (levels 1-5)
  Socioeconomic status (SES)
2. Clinical assessment
  MMSE (Mini-Mental State Examination)
  CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate)
3. Derived anatomical measures
  eTIV: estimated total intracranial volume
  ASF: atlas scaling factor
  nWBV: normalized whole-brain volume

There is also a version of the OASIS Cross-Sectional dataset that has been
processed with FreeSurfer, usually called the "OASIS Cross-Sectional FreeSurfer
Processed" dataset. After FreeSurfer processing, the data of each subject is
stored in its own directory whose layout follows FreeSurfer's standard output
format.

├── sub-OASIS10001/            # FreeSurfer output directory of subject 1
│   ├── mri/                   # Volume-based data
│   │   ├── orig.mgz           # Original image (converted to FreeSurfer format)
│   │   ├── nu.mgz             # Intensity-normalized image
│   │   ├── T1.mgz             # Image used for segmentation
│   │   ├── aseg.mgz           # Automatic sub-structure segmentation (subcortical segmentation)
│   │   ├── aparc+aseg.mgz     # Combined cortical + subcortical segmentation
│   │   ├── brain.mgz          # Image with non-brain tissue removed
│   │   ├── brainmask.mgz      # Brain mask
│   │   └── ... (other files)
│   ├── surf/                  # Surface-based data
│   │   ├── lh.pial            # Left hemisphere pial surface
│   │   ├── lh.white           # Left hemisphere white matter surface
│   │   ├── rh.pial            # Right hemisphere pial surface
│   │   ├── rh.white           # Right hemisphere white matter surface
│   │   ├── lh.thickness       # Left hemisphere cortical thickness map
│   │   └── ... (other files)
│   ├── stats/                 # Statistics (text files)
│   │   ├── aseg.stats         # Subcortical structure volume statistics
│   │   ├── lh.aparc.stats     # Left hemisphere cortical region thickness/area statistics
│   │   └── rh.aparc.stats     # Right hemisphere cortical region thickness/area statistics
│   └── label/                 # Label files
│       └── ...
'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil
import warnings
# Silences every warning for the whole process, including those raised by the
# libraries imported above.
warnings.filterwarnings("ignore")
# Name of the column in the metadata table that is compared against the case
# directory name.
meta_id_name='ID'
## Sex (M/F), handedness (Hand, all right-handed), age (Age), education level
## (Educ, levels 1-5), socioeconomic status (SES), MMSE (Mini-Mental State
## Examination), CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild,
## 2=moderate), eTIV: estimated total intracranial volume, ASF: atlas scaling
## factor, nWBV: normalized whole-brain volume.
## The first entry is the ID column; the remaining entries are the columns
## copied into each case's metadata.
META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
# Task name, also used as the key of the label-path mapping.
TASK_VALUE="segmentation"
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
TARGET_VOXEL_SPACING = None
# See the sub_modality description used by the MSD datasets.
SUB_MODALITY = ["FLAIR", "T1w", "t1gd", "T2w"]
# Ordering of the series as it appears in the file names.
SERIES_ORDER = ["flair", "t1", "t1ce", "t2"]
# Voxel value -> tissue name of the segmentation volumes.
# NOTE(review): "backgroud" is misspelled, but the string is written into the
# mapping JSON, so it is kept as-is until consumers are confirmed not to match
# on it.
LABEL_DICT = {
    "0": "backgroud",
    "1": "cerebrospinal fluid",  # CSF
    "2": "gray matter",  # GM
    "3": "white matter",  # WM
}


def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside *path*.

    ``recursive=True`` is kept from the original call; it has no effect because
    the pattern contains no ``**`` component, so sub-directories are not searched.
    """
    search_pattern = os.path.join(path, '*.csv')
    return glob.glob(search_pattern, recursive=True)


## added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries (files and directories) in *path*."""
    return os.listdir(path)


## modified by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series found in *folder_path*.

    Returns:
        A ``(dicom_names, image)`` tuple: the file names reported by GDCM for
        the folder and the image produced by reading them as one series.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names, image


## added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read the single file *imgs* and return the object produced by the reader.

    ``ReadImageInformation()`` loads the header only, but ``Execute()`` is
    called afterwards, so the returned object is the result of a full read
    rather than a header-only one.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    reader.ReadImageInformation()
    tag = reader.Execute()
    return tag


def load_nrrd(fp):
    """Read the image file *fp* with ``sitk.ReadImage`` and return it."""
    return sitk.ReadImage(fp)


## modified by yanguoqing on 20250805
def load_brtas_images(series_files):
    '''
    Read *series_files* as one image series and return the resulting image.

    Original intent (not verifiable from this function alone): each case holds
    four pre-processed 3D MRI sequences (co-registered, resampled to 1 mm³
    isotropic, skull-stripped), and the separate modalities are merged along an
    extra dimension in FLAIR, T1, T1CE, T2 order. The order is whatever order
    the caller passes in *series_files*.
    '''
    reader = sitk.ImageSeriesReader()
    reader.SetFileNames(series_files)
    image = reader.Execute()
    return image


def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, tagging it with its source folder.

    Args:
        image: image object exposing ``SetMetaData``; it is modified in place.
        output_path: destination file path; missing parent directories are created.
        folder_path: stored in the image under the ``FolderPath`` metadata key.
    """
    output_dirpath = os.path.dirname(output_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath, exist_ok=True)

    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


## modified by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a POSIX-style one.

    Backslashes become forward slashes and everything up to and including the
    first ``:`` (the drive letter) is dropped. Some meta files contain Windows
    paths although the data is stored on a Linux server.
    """
    linux_path = windows_path.replace('\\', '/')
    if ':' in linux_path:
        linux_path = linux_path.split(':', 1)[1]
    return linux_path


def _first_match(pattern):
    """Return the first existing file matching glob *pattern*, or None if none does."""
    matches = [p for p in sorted(glob.glob(pattern)) if os.path.isfile(p)]
    return matches[0] if matches else None


def _build_case_info(df_meta, case_id, metadata_file):
    """Build the per-case ``Metadata`` dict from the row of *df_meta* matching *case_id*.

    Every column in ``META_COLUMN[1:]`` is present in the result: as ``str`` of
    the first matching row's value, or as ``''`` when *df_meta* is None or has
    no row for *case_id*.
    """
    info = {'metadata_file': metadata_file, 'split': "train"}
    matched = None
    if df_meta is not None:
        rows = df_meta[df_meta[meta_id_name] == case_id]
        if rows.shape[0] > 0:
            matched = rows
    for keyname in META_COLUMN[1:]:
        info[keyname] = '' if matched is None else str(matched[keyname].iloc[0])
    info['Image_id'] = case_id
    return info


def _append_mapping(json_path, key, value):
    """Add ``key -> value`` to the JSON object stored in *json_path*, rewriting the file."""
    with open(json_path, 'r+') as json_file:
        mappings = json.load(json_file)
        mappings[key] = value
        json_file.seek(0)
        json.dump(mappings, json_file, indent=4)
        json_file.truncate()


def main(target_path, output_dir):
    """Convert every OASIS-1 case under *target_path* to NIfTI in *output_dir*.

    *target_path* is expected to contain one level of grouping directories,
    each holding one directory per case. For every case the masked,
    atlas-registered image and, when present, its FSL segmentation are written
    as ``.nii.gz``, and an entry is added to ``nifti_mappings.json``. Cases that
    have no image or raise during processing are listed in ``failed_files.json``.

    Args:
        target_path: root directory of the raw dataset.
        output_dir: directory receiving the converted files and both JSON files.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file; an existing one is extended, not reset.
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # The spreadsheet is read from a CSV copy kept next to this script; the
    # path of the original .xlsx is only recorded in the output metadata.
    meta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'oasis_cross-sectional-5708aa0a98d82080.csv')
    meta_file_ori = os.path.join(target_path, 'oasis_cross-sectional-5708aa0a98d82080.xlsx')
    if os.path.isfile(meta_file):
        df_meta = pd.read_csv(meta_file, sep=',')
    else:
        # Without the CSV the demographic fields are left empty instead of
        # failing later on an undefined table.
        df_meta = None
        print(f"Metadata file {meta_file} not found, demographic fields will be empty")

    modality = "MRI"
    study = 'OASIS_1'  # Dataset_name

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            pid_path = os.path.join(target_path, pid_dir)
            if not os.path.isdir(pid_path):
                continue
            # Walk all case directories below this one.
            image_dirs = find_image_dirs(pid_path)
            for data_dir in tqdm(image_dirs, desc="Processing images files"):
                # data_dir is the case id
                full_path = os.path.join(target_path, pid_dir, data_dir)
                CIA_other_info = _build_case_info(df_meta, data_dir, meta_file_ori)
                # A fresh record per case so that keys set for an earlier case
                # (e.g. Task / Label_path) cannot leak into this one.
                # NOTE(review): assumes meta_data() is cheap and stateless - confirm.
                meta = meta_data()

                try:
                    # Skull-stripped image that keeps brain tissue only, e.g.
                    # PROCESSED/MPRAGE/T88_111/OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc.img
                    full_file = _first_match(
                        "%s/PROCESSED/MPRAGE/T88_111/%s_*_anon_111_t88_masked_gfc.img"
                        % (glob.escape(full_path), glob.escape(data_dir)))
                    if full_file is None:
                        print("病例数据%s为空" % data_dir)
                        # Still reported as failed, as it was when the missing
                        # file surfaced as an IndexError.
                        failed_files.append(data_dir)
                        continue
                    # A valid MRI image exists; carry on with it.
                    sitk_img_original = util.load_nifti(full_file)

                    original_spacing = list(sitk_img_original.GetSpacing())
                    original_size = list(sitk_img_original.GetSize())

                    meta.add_keyvalue('Spacing_mm', min(original_spacing))
                    meta.add_keyvalue('OriImg_path', full_file)
                    # NOTE(review): an earlier comment (YH Jachin) says the
                    # processed size belongs here; the size of the image as
                    # loaded is what gets recorded.
                    meta.add_keyvalue('Size', original_size)
                    meta.add_keyvalue('Modality', modality)
                    meta.add_keyvalue('Dataset_name', study)
                    meta.add_keyvalue('ROI', 'head')
                    meta.add_keyvalue('Label_Dict', LABEL_DICT)

                    output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                    save_nifti(sitk_img_original, output_image_file, full_path)
                    print(f"Saved NIfTI file to {output_image_file}")

                    # Label processing, e.g.
                    # FSL_SEG/OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc_fseg.img
                    full_label_file = _first_match(
                        "%s/FSL_SEG/%s_*_anon_111_t88_masked_gfc_fseg.img"
                        % (glob.escape(full_path), glob.escape(data_dir)))
                    if full_label_file is None:
                        # The image is still recorded, just without Task/Label_path.
                        print(f"No segmentation found for {data_dir}")
                    else:
                        process_label_path = os.path.join(output_dir, data_dir, 'segmentation')
                        processed_lbl_full_path = os.path.join(process_label_path,
                                                               f"{data_dir}.nii.gz")
                        os.makedirs(process_label_path, exist_ok=True)
                        sitk_lbl_original = util.load_nifti(full_label_file)
                        # Save the label unchanged.
                        util.save_nifti(sitk_lbl_original, processed_lbl_full_path,
                                        os.path.dirname(full_label_file))
                        print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")
                        label_path_dict = {'head': processed_lbl_full_path}
                        meta.add_keyvalue('Task', TASK_VALUE)
                        meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
                        print(sitk_img_original.GetSize(), sitk_lbl_original.GetSize())
                except Exception as e:
                    # Per-case boundary: one broken case must not stop the run.
                    print(e)
                    failed_files.append(data_dir)
                    print(f"Failed to process OASIS case {data_dir}")
                    continue

                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                # Write the mapping to the JSON file on the fly so that an
                # interrupted run keeps what was already converted.
                _append_mapping(json_output_path, output_image_file, meta.get_meta_data())

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert OASIS-1 Analyze images and segmentations to NIfTI.")
    parser.add_argument("--target_path", type=str,
                        help="Path to the target directory containing metadata files.",
                        default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/")
    parser.add_argument("--output_dir", type=str,
                        help="Directory to save the NIfTI files.",
                        default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)