File size: 12,266 Bytes
da9fb1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#coding:utf-8
'''

write by ygq

create on 2025-09-04



OASIS(Open Access Series of Imaging Studies) 是一个旨在向科研界免费提供脑部MRI数据的项目。本横断面(Cross-Sectional)数据集是其第一个版本,发布于2007年。

OASIS-1 是横断面的,意味着它无法捕捉个体随时间的动态变化。对于研究疾病进展,后续的 OASIS-2 和 OASIS-3(纵向数据集)是更好的选择。



1. 目录与文件命名规则

    根目录下按受试者会话ID建立文件夹。

    受试者ID格式:OAS1_xxxx (例如 OAS1_0012)

    会话ID格式:OAS1_xxxx_MRy (例如 OAS1_0012_MR1,y代表第几次访问成像)

    OAS1_xxxx_MRy/


    ├── OAS1_xxxx_MRy.xml     # 包含采集细节和解剖指标的XML元数据文件

    ├── OAS1_xxxx_MRy.txt     # 与XML内容相同的文本格式文件(便于查看)

    ├── RAW/                  # 存储原始扫描图像(DICOM或Analyze格式)

    ├── PROCESSED/            # 预处理后的图像

    │   ├── SUBJ_111/         # 原始空间下的平均配准图像(各向同性1mm³)

    │   └── T88_111/          # 图谱配准空间下的图像

    │       ├── t4_files/     # 存储配准变换矩阵文件

    │       └── ...           # 配准后的图像文件

    └── FSL_SEG/              # 基于图谱配准图像生成的脑组织分割结果(灰质2/白质3/脑脊液1)





所有图像均以 Analyze 7.5格式 存储,包含:

    一个图像文件(.img)

    一个头文件(.hdr)

    使用 16位大端序(big-endian) 存储



    OAS1_xxxx_MRy_mpr-z_anon	单次原始扫描	256x256x128	1x1x1.25 mm	矢状位

    OAS1_xxxx_MRy_mpr_ni_anon_sbj_111	多次扫描平均配准图像	256x256x160	1x1x1 mm	矢状位

    OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc	增益场校正后的图谱配准图像	176x208x176	1x1x1 mm	横断位

    OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc	去除非脑组织的掩模图像	176x208x176	1x1x1 mm	横断位

    OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg	脑组织分割图像(灰/白/CSF)	176x208x176	1x1x1 mm	横断位



    1. 人口统计学信息

        性别(M/F)

        用手习惯(Hand)(均为右利手)

        年龄(Age)

        教育程度(Educ)(1-5级)

        社会经济地位(SES)



    2. 临床评估

        MMSE(简易精神状态检查)

        CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度)



    3. 衍生解剖指标

        eTIV:估计颅内容积

        ASF:图谱缩放因子

        nWBV:标准化全脑体积

'''
import os
import glob,re
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *

import shutil
##dataset_meta
# Suppress library warnings (pandas/SimpleITK emit noisy deprecation notices
# during bulk conversion runs).
import warnings
warnings.filterwarnings("ignore")
# Column in the demographics CSV that holds the session ID (e.g. OAS1_0012_MR1).
meta_id_name='ID'
## Demographics/clinical columns carried over into the per-image metadata:
## sex (M/F), handedness (Hand, all right-handed), Age, education level
## (Educ, 1-5), socioeconomic status (SES), MMSE (Mini-Mental State Exam),
## CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate),
## eTIV (estimated total intracranial volume), ASF (atlas scaling factor),
## nWBV (normalized whole-brain volume).
META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']


TASK_VALUE="segmentation"
# Intensity clamp window for CT (Hounsfield units); unused for this MRI dataset.
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
# Resampling target spacing; None means keep the original voxel spacing.
TARGET_VOXEL_SPACING=None


def find_metadata_files(path):
    """Return all CSV metadata files directly inside *path*.

    Originally written for TCIA-style ``metadata.csv`` discovery; for the
    OASIS layout the sheet sits at the top level, so only ``*.csv`` files
    in *path* itself are matched.

    Args:
        path: Directory to search (non-recursive).

    Returns:
        list[str]: Paths of the matching CSV files.
    """
    search_pattern = os.path.join(path, '*.csv')
    # NOTE: the previous recursive=True flag was a leftover — without '**'
    # in the pattern it had no effect, so it is dropped here.
    return glob.glob(search_pattern)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    """List the entry names (files and directories) directly under *path*."""
    entries = os.listdir(path)
    return entries

##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read a DICOM series from *folder_path*.

    Returns:
        tuple: (ordered DICOM file names, assembled 3-D SimpleITK image).
    """
    series_reader = sitk.ImageSeriesReader()
    file_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(file_names)
    volume = series_reader.Execute()
    return file_names, volume

##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read a single image file via SimpleITK, header first.

    ``ReadImageInformation`` pre-reads only the metadata; the following
    ``Execute`` then loads the image itself, which is returned.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    # Read header information only (no pixel data yet).
    file_reader.ReadImageInformation()
    loaded = file_reader.Execute()
    return loaded

def load_nrrd(fp):
    """Load an image file (e.g. NRRD) with SimpleITK and return it."""
    image = sitk.ReadImage(fp)
    return image

##modify by yanguoqing on 20250904
def load_raw_images(series_files):
    """Merge a case's RAW single-pass MR scans into one SimpleITK image.

    Each OASIS-1 case ships 3-4 RAW scans (mpr-1, mpr-2, ...).  Reading the
    sorted file list as one series stacks the separate scans into an extra
    dimension, in mpr-1, mpr-2, ... order.
    """
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_files)
    merged = series_reader.Execute()
    return merged

def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path* as NIfTI, tagging its source folder.

    Args:
        image: SimpleITK image to write.
        output_path: Destination file path; parent directories are created
            as needed.
        folder_path: Original data folder, stored in the image metadata
            under the key "FolderPath" for traceability.
    """
    output_dirpath = os.path.dirname(output_path)
    if not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
    # exist_ok avoids the check-then-create race of the old exists() guard
    # (and the duplicated header comment is gone).
    os.makedirs(output_dirpath, exist_ok=True)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)

##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Translate a Windows-style path into a Linux-style one.

    Backslashes become forward slashes, and any drive prefix (everything
    up to and including the first ':') is stripped — some metadata files
    carry Windows paths while the data lives on a Linux server.
    """
    normalized = windows_path.replace('\\', '/')
    _drive, sep, remainder = normalized.partition(':')
    return remainder if sep else normalized

def main(target_path, output_dir):
    """Convert OASIS-1 cross-sectional RAW scans to NIfTI with metadata.

    Walks ``target_path`` (one folder per subject batch, each containing
    session directories named ``OAS1_xxxx_MRy``), merges each session's
    ``RAW/*_mpr-*.img`` scans into one image, writes it to
    ``<output_dir>/<session>/<session>.nii.gz`` and appends an entry to
    ``nifti_mappings.json``.  Sessions that fail to convert are recorded in
    ``failed_files.json``.

    Args:
        target_path: Root directory of the extracted OASIS-1 dataset.
        output_dir: Directory receiving the NIfTI files and JSON logs.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    # exist_ok avoids the isdir/makedirs race of the original guard.
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()

    # Initialize the mapping file so it can later be opened with 'r+'.
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # The demographics sheet is expected next to this script as CSV
    # (converted from the original XLSX for easier parsing).
    meta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'oasis_cross-sectional-5708aa0a98d82080.csv')
    meta_file_ori = os.path.join(target_path, 'oasis_cross-sectional-5708aa0a98d82080.xlsx')
    if os.path.isfile(meta_file):
        mf_flag = True
        df_meta = pd.read_csv(meta_file, sep=',')
    else:
        # BUGFIX: df_meta used to be left undefined on this branch and the
        # lookup below crashed with NameError; metadata now degrades to
        # empty strings instead.
        mf_flag = False

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            if not os.path.isdir(os.path.join(target_path, pid_dir)):
                continue

            # Each sub-directory is one imaging session; its name is the ID.
            image_dirs = find_image_dirs(os.path.join(target_path, pid_dir))

            for data_dir in tqdm(image_dirs, desc="Processing images files"):
                full_path = os.path.join(target_path, pid_dir, data_dir)

                modality = "MRI"
                study = 'OASIS_1'  # Dataset_name
                CIA_other_info = {'metadata_file': ''}
                CIA_other_info['split'] = "train"
                CIA_other_info['metadata_file'] = meta_file_ori

                # Attach demographics/clinical columns when the sheet exists
                # and contains a row for this session ID; else leave blank.
                data_info_row = df_meta[df_meta[meta_id_name] == data_dir] if mf_flag else None
                if data_info_row is not None and data_info_row.shape[0] > 0:
                    data_info_row = data_info_row.reset_index()
                    for keyname in META_COLUMN[1:]:
                        CIA_other_info[keyname] = str(data_info_row[keyname][0])
                    CIA_other_info['Image_id'] = data_dir
                else:
                    for keyname in META_COLUMN[1:]:
                        CIA_other_info[keyname] = ''

                try:
                    # Collect this session's single-pass RAW scans, e.g.
                    # RAW/OAS1_0001_MR1_mpr-1_anon.img; sorting keeps them
                    # in mpr-1, mpr-2, ... order for the merge.
                    series_files = glob.glob("%s/RAW/%s_mpr-*.img" % (full_path, data_dir))
                    series_files.sort()

                    if len(series_files) > 0:
                        # Merge the 3-4 repeated scans into one image and
                        # record which index maps to which mpr-N scan.
                        sitk_img_original = load_raw_images(series_files)
                        submodality = [re.search(r"mpr-\d{1}", os.path.basename(fp)).group(0)
                                       for fp in series_files]
                        sub_modality_dict = {}
                        for idx, value in enumerate(submodality):
                            sub_modality_dict[idx] = value
                        meta.add_keyvalue('Sub_modality', sub_modality_dict)
                    else:
                        # Session has no RAW scans; skip it.
                        print("病例数据%s为空" % data_dir)
                        continue

                    original_spacing = list(sitk_img_original.GetSpacing())
                    original_size = list(sitk_img_original.GetSize())

                    meta.add_keyvalue('Spacing_mm', min(original_spacing))
                    meta.add_keyvalue('OriImg_path', ",".join(series_files))
                    meta.add_keyvalue('Size', original_size)
                    meta.add_keyvalue('Modality', modality)
                    meta.add_keyvalue('Dataset_name', study)
                    meta.add_keyvalue('ROI', 'head')

                    output_image_file = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                    save_nifti(sitk_img_original, output_image_file, full_path)
                    print(f"Saved NIfTI file to {output_image_file}")

                except Exception as e:
                    print(e)
                    failed_files.append(data_dir)
                    # BUGFIX: message previously said "BRATS" (copy-paste
                    # from another converter); this is the OASIS pipeline.
                    print(f"Failed to load OASIS images from {data_dir}")
                    continue

                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                # Update the mapping JSON incrementally so that progress
                # survives a crash part-way through the dataset.
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_image_file] = meta.get_meta_data()
                    json_file.seek(0)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/")
    parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL_RAW")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)