| import os |
| import json |
| from typing import Dict, List |
|
|
|
|
| def load_eurorad_dataset( |
| dataset_path: str, |
| section: str = "any", |
| as_dict: bool = False, |
| filter_by_caption: List[str] = [ |
| "xray", |
| "x-ray", |
| "x ray", |
| "ray", |
| "xr", |
| "radiograph", |
| "radiogram", |
| "plain film", |
| ], |
| ) -> List[Dict] | Dict[str, Dict]: |
| """ |
| Load a dataset from a JSON file. |
| |
| Args: |
| dataset_path (str): Path to the JSON dataset file. |
| section (str, optional): Section of the dataset to load. Defaults to "any". |
| as_dict (bool, optional): Whether to return data as dict. Defaults to False. |
| filter_by_caption (List[str], optional): List of strings to filter cases by caption content. Defaults to []. |
| |
| Returns: |
| List[Dict] | Dict[str, Dict]: The loaded dataset as a list of dictionaries or dict if as_dict=True. |
| |
| Raises: |
| FileNotFoundError: If dataset_path does not exist |
| json.JSONDecodeError: If file is not valid JSON |
| """ |
|
|
| with open(dataset_path, "r", encoding="utf-8") as file: |
| data = json.load(file) |
|
|
| if filter_by_caption: |
| filtered_data = {} |
| for case_id, case in data.items(): |
| if any( |
| any(x in subfig["caption"].lower() for x in filter_by_caption) |
| for figure in case["figures"] |
| for subfig in figure["subfigures"] |
| ) or any(x in case["image_finding"].lower() for x in filter_by_caption): |
| filtered_data[case_id] = case |
| data = filtered_data |
|
|
| if section != "any": |
| section = section.strip().lower() |
| if not as_dict: |
| data = [ |
| item for item in data.values() if item.get("section", "").strip().lower() == section |
| ] |
| else: |
| data = { |
| k: v for k, v in data.items() if v.get("section", "").strip().lower() == section |
| } |
|
|
| elif not as_dict: |
| data = list(data.values()) |
|
|
| return data |
|
|
|
|
| def save_dataset(dataset: Dict | List[Dict], dataset_path: str): |
| """ |
| Save a dataset to a JSON file. |
| |
| Args: |
| dataset (Dict | List[Dict]): The dataset to save as a dictionary or list of dictionaries. |
| dataset_path (str): Path where the JSON dataset file will be saved. |
| """ |
| with open(dataset_path, "w", encoding="utf-8") as file: |
| json.dump(dataset, file) |
|
|