Spaces:

VisionLanguageGroup
/

MicroscopyMatching

Running on Zero

App Files Community

VisionLanguageGroup committed on Mar 20

Commit

86072ea

1 Parent(s): 4ce5a27

clean up

Browse files

Files changed (15) hide show

_utils/attn_utils_new.py +0 -12
_utils/load_track_data.py +0 -6
_utils/track_args.py +0 -62
app.py +13 -18
counting.py +7 -11
models/seg_post_model/models.py +1 -1
models/seg_post_model/{vit_sam.py → vit.py} +0 -0
models/tra_post_model/data.py +1 -0
models/tra_post_model/model.py +2 -47
models/tra_post_model/tracking/__init__.py +0 -2
models/tra_post_model/tracking/ilp.py +2 -0
models/tra_post_model/tracking/tracking.py +2 -0
models/tra_post_model/tracking/utils.py +2 -0
models/tra_post_model/utils.py +3 -2
tracking_one.py +19 -81

_utils/attn_utils_new.py CHANGED Viewed

@@ -37,12 +37,6 @@ class CountingCrossAttnProcessor1:
         context = encoder_hidden_states if is_cross else hidden_states
         k = attn_layer.to_k(context)
         v = attn_layer.to_v(context)
-        # q = attn_layer.reshape_heads_to_batch_dim(q)
-        # k = attn_layer.reshape_heads_to_batch_dim(k)
-        # v = attn_layer.reshape_heads_to_batch_dim(v)
-        # q = attn_layer.head_to_batch_dim(q)
-        # k = attn_layer.head_to_batch_dim(k)
-        # v = attn_layer.head_to_batch_dim(v)
         q = self.head_to_batch_dim(q, h)
         k = self.head_to_batch_dim(k, h)
         v = self.head_to_batch_dim(v, h)
@@ -57,11 +51,8 @@ class CountingCrossAttnProcessor1:
         # attention, what we cannot get enough of
         attn_ = sim.softmax(dim=-1).clone()
-        # softmax = nn.Softmax(dim=-1)
-        # attn_ = softmax(sim)
         self.attnstore(attn_, is_cross, self.place_in_unet)
         out = torch.einsum("b i j, b j d -> b i d", attn_, v)
-        # out = attn_layer.batch_to_head_dim(out)
         out = self.batch_to_head_dim(out, h)
         if type(attn_layer.to_out) is torch.nn.modules.container.ModuleList:
@@ -112,9 +103,6 @@ def register_attention_control(model, controller):
             continue
         cross_att_count += 1
-        # attn_procs[name] = AttendExciteCrossAttnProcessor(
-        #     attnstore=controller, place_in_unet=place_in_unet
-        # )
         attn_procs[name] = CountingCrossAttnProcessor1(
             attnstore=controller, place_in_unet=place_in_unet
         )

         context = encoder_hidden_states if is_cross else hidden_states
         k = attn_layer.to_k(context)
         v = attn_layer.to_v(context)
         q = self.head_to_batch_dim(q, h)
         k = self.head_to_batch_dim(k, h)
         v = self.head_to_batch_dim(v, h)
         # attention, what we cannot get enough of
         attn_ = sim.softmax(dim=-1).clone()
         self.attnstore(attn_, is_cross, self.place_in_unet)
         out = torch.einsum("b i j, b j d -> b i d", attn_, v)
         out = self.batch_to_head_dim(out, h)
         if type(attn_layer.to_out) is torch.nn.modules.container.ModuleList:
             continue
         cross_att_count += 1
         attn_procs[name] = CountingCrossAttnProcessor1(
             attnstore=controller, place_in_unet=place_in_unet
         )

_utils/load_track_data.py CHANGED Viewed

@@ -49,9 +49,7 @@ def _load_tiffs(folder: Path, dtype=None):
 def load_track_images(file_dir):
-    # suffix_ = [".png", ".tif", ".tiff", ".jpg"]
     def find_tif_dir(root_dir):
-        """递归查找.tif 文件"""
         tif_files = []
         for dirpath, _, filenames in os.walk(root_dir):
             if '__MACOSX' in dirpath:
@@ -112,7 +110,3 @@ def load_track_images(file_dir):
     return imgs, imgs_raw, images_stable, imgs_, imgs_01, height, width
-if __name__ == "__main__":
-    file_dir = "data/2D+Time/DIC-C2DH-HeLa/train/DIC-C2DH-HeLa/02"
-    imgs, imgs_raw, images_stable, imgs_, imgs_01, height, width = load_track_images(file_dir)
-    print(imgs.shape, imgs_raw.shape, images_stable.shape, imgs_.shape, imgs_01.shape, height, width)

 def load_track_images(file_dir):
     def find_tif_dir(root_dir):
         tif_files = []
         for dirpath, _, filenames in os.walk(root_dir):
             if '__MACOSX' in dirpath:
     return imgs, imgs_raw, images_stable, imgs_, imgs_01, height, width

_utils/track_args.py DELETED Viewed

@@ -1,62 +0,0 @@
-import configargparse
-def parse_train_args():
-    parser = configargparse.ArgumentParser(
-        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
-        config_file_parser_class=configargparse.YAMLConfigFileParser,
-        allow_abbrev=False,
-    )
-    parser.add_argument(
-        "-c",
-        "--config",
-        default="_utils/example_config.yaml",
-        is_config_file=True,
-        help="config file path",
-    )
-    parser.add_argument("-d", "--d_model", type=int, default=256)
-    parser.add_argument("-w", "--window", type=int, default=10)
-    parser.add_argument("--spatial_pos_cutoff", type=int, default=256)
-    parser.add_argument("--num_encoder_layers", type=int, default=6)
-    parser.add_argument("--num_decoder_layers", type=int, default=6)
-    parser.add_argument("--pos_embed_per_dim", type=int, default=32)
-    parser.add_argument("--feat_embed_per_dim", type=int, default=8)
-    parser.add_argument("--dropout", type=float, default=0.00)
-    parser.add_argument(
-        "--attn_positional_bias",
-        type=str,
-        choices=["rope", "bias", "none"],
-        default="rope",
-    )
-    parser.add_argument("--attn_positional_bias_n_spatial", type=int, default=16)
-    parser.add_argument("--attn_dist_mode", default="v0")
-    parser.add_argument(
-        "--causal_norm",
-        type=str,
-        choices=["none", "linear", "softmax", "quiet_softmax"],
-        default="quiet_softmax",
-    )
-    args, unknown_args = parser.parse_known_args()
-    # # Hack to allow for --input_test
-    # allowed_unknown = ["input_test"]
-    # if not set(a.split("=")[0].strip("-") for a in unknown_args).issubset(
-    #     set(allowed_unknown)
-    # ):
-    #     raise ValueError(f"Unknown args: {unknown_args}")
-    # pprint(vars(args))
-    # for backward compatibility
-    # if args.attn_positional_bias == "True":
-    #     args.attn_positional_bias = "bias"
-    # elif args.attn_positional_bias == "False":
-    #     args.attn_positional_bias = False
-    # if args.train_samples == 0:
-    #     raise NotImplementedError(
-    #         "--train_samples must be > 0, full dataset pass not supported."
-    #     )
-    return args

app.py CHANGED Viewed

@@ -937,12 +937,21 @@ with gr.Blocks(
     ) as demo:
     gr.Markdown(
         """
-        # 🔬 Microscopy Image Analysis Suite
-        Supporting three key tasks:
         - 🎨 **Segmentation**: Instance segmentation of microscopic objects
         - 🔢 **Counting**: Counting microscopic objects based on density maps
         - 🎬 **Tracking**: Tracking microscopic objects in video sequences
         """
     )
@@ -1667,26 +1676,12 @@ with gr.Blocks(
                 outputs=[feedback_status, feedback_status]
             )
-    gr.Markdown(
-        """
-        ---
-        ### 📒 Note:
-        This project is currently available with usage limits for research trial use and feedback collection. We plan to release a free public version in the future. We are actively improving the toolkit and greatly appreciate your feedback!
-        ### 💡 Technical Details
-        **MicroscopyMatching** - A general-purpose microscopy image analysis toolkit based on Stable Diffusion
-        """
-    )
 if __name__ == "__main__":
     demo.queue().launch(
         server_name="0.0.0.0",
-        server_port=7860,
         share=False,
         ssr_mode=False,
         show_error=True,

     ) as demo:
     gr.Markdown(
         """
+        # 🔬 MicroscopyMatching: Microscopy Image Analysis Suite
+        ### Supporting three key tasks:
         - 🎨 **Segmentation**: Instance segmentation of microscopic objects
         - 🔢 **Counting**: Counting microscopic objects based on density maps
         - 🎬 **Tracking**: Tracking microscopic objects in video sequences
+        ### 💡 Technical Details:
+        **MicroscopyMatching** - A general-purpose microscopy image analysis toolkit based on Stable Diffusion
+        ### 📒 Note:
+        This project is currently available with usage limits for research trial use and feedback collection. We plan to release a free public version in the future. We are actively improving the toolkit and greatly appreciate your feedback!
         """
     )
                 outputs=[feedback_status, feedback_status]
             )
 if __name__ == "__main__":
     demo.queue().launch(
         server_name="0.0.0.0",
+        server_port=7861,
         share=False,
         ssr_mode=False,
         show_error=True,

counting.py CHANGED Viewed

@@ -153,9 +153,9 @@ class CountingModule(pl.LightningModule):
             loca_feature_bf_regression =  loca_out["feature_bf_regression"]
             adapted_emb = self.counting_adapter.adapter(loca_feature_bf_regression, boxes)      # shape [1, 768]
             if task_loc_idx.shape[0] == 0:
-                encoder_hidden_states[0,2,:] = adapted_emb.squeeze()  # 放在task prompt下一位
             else:
-                encoder_hidden_states[0,task_loc_idx[0, 1]+1,:] = adapted_emb.squeeze()  # 放在task prompt下一位
         # Predict the noise residual
         noise_pred, feature_list = self.stable.unet(noisy_latents, timesteps, encoder_hidden_states)
@@ -174,7 +174,7 @@ class CountingModule(pl.LightningModule):
         # only use 64x64 self-attention
         self_attn_aggregate = attn_utils.aggregate_attention( # [res, res, 4096]
-                prompts=[self.config.prompt],        # 这里要改么
                 attention_store=self.controller,
                 res=64,
                 from_where=("up", "down"),
@@ -182,7 +182,7 @@ class CountingModule(pl.LightningModule):
                 select=0
             )
         self_attn_aggregate32 = attn_utils.aggregate_attention( # [res, res, 4096]
-                prompts=[self.config.prompt],        # 这里要改么
                 attention_store=self.controller,
                 res=32,
                 from_where=("up", "down"),
@@ -190,7 +190,7 @@ class CountingModule(pl.LightningModule):
                 select=0
             )
         self_attn_aggregate16 = attn_utils.aggregate_attention( # [res, res, 4096]
-                prompts=[self.config.prompt],        # 这里要改么
                 attention_store=self.controller,
                 res=16,
                 from_where=("up", "down"),
@@ -201,7 +201,7 @@ class CountingModule(pl.LightningModule):
         # cross attention
         for res in [32, 16]:
             attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
-                prompts=[self.config.prompt],        # 这里要改么
                 attention_store=self.controller,
                 res=res,
                 from_where=("up", "down"),
@@ -212,7 +212,7 @@ class CountingModule(pl.LightningModule):
             task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
             attention_maps.append(task_attn_)
             if self.use_box:
-                exemplar_attns = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0) # 取exemplar的attn
                 exemplar_attention_maps.append(exemplar_attns)
             else:
                 exemplar_attns1 = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0)
@@ -266,10 +266,6 @@ class CountingModule(pl.LightningModule):
         attn_stack = torch.cat(attn_stack, dim=1)
         if not self.use_box:
-            # cross_self_exe_attn_np = cross_self_exe_attn.detach().squeeze().cpu().numpy()
-            # boxes = gen_dummy_boxes(cross_self_exe_attn_np, max_boxes=1)
-            # boxes = boxes.to(self.device)
             loca_out = self.loca_model.forward_before_reg(input_image, boxes)
             loca_feature_bf_regression =  loca_out["feature_bf_regression"]

             loca_feature_bf_regression =  loca_out["feature_bf_regression"]
             adapted_emb = self.counting_adapter.adapter(loca_feature_bf_regression, boxes)      # shape [1, 768]
             if task_loc_idx.shape[0] == 0:
+                encoder_hidden_states[0,2,:] = adapted_emb.squeeze()
             else:
+                encoder_hidden_states[0,task_loc_idx[0, 1]+1,:] = adapted_emb.squeeze()
         # Predict the noise residual
         noise_pred, feature_list = self.stable.unet(noisy_latents, timesteps, encoder_hidden_states)
         # only use 64x64 self-attention
         self_attn_aggregate = attn_utils.aggregate_attention( # [res, res, 4096]
+                prompts=[self.config.prompt],
                 attention_store=self.controller,
                 res=64,
                 from_where=("up", "down"),
                 select=0
             )
         self_attn_aggregate32 = attn_utils.aggregate_attention( # [res, res, 4096]
+                prompts=[self.config.prompt],
                 attention_store=self.controller,
                 res=32,
                 from_where=("up", "down"),
                 select=0
             )
         self_attn_aggregate16 = attn_utils.aggregate_attention( # [res, res, 4096]
+                prompts=[self.config.prompt],
                 attention_store=self.controller,
                 res=16,
                 from_where=("up", "down"),
         # cross attention
         for res in [32, 16]:
             attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
+                prompts=[self.config.prompt],
                 attention_store=self.controller,
                 res=res,
                 from_where=("up", "down"),
             task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
             attention_maps.append(task_attn_)
             if self.use_box:
+                exemplar_attns = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0)
                 exemplar_attention_maps.append(exemplar_attns)
             else:
                 exemplar_attns1 = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0)
         attn_stack = torch.cat(attn_stack, dim=1)
         if not self.use_box:
             loca_out = self.loca_model.forward_before_reg(input_image, boxes)
             loca_feature_bf_regression =  loca_out["feature_bf_regression"]

models/seg_post_model/models.py CHANGED Viewed

@@ -16,7 +16,7 @@ import logging
 models_logger = logging.getLogger(__name__)
 from . import transforms, dynamics, utils
-from .vit_sam import Transformer
 from .core import assign_device, run_net
 # _MODEL_DIR_ENV = os.environ.get("CELLPOSE_LOCAL_MODELS_PATH")

 models_logger = logging.getLogger(__name__)
 from . import transforms, dynamics, utils
+from .vit import Transformer
 from .core import assign_device, run_net
 # _MODEL_DIR_ENV = os.environ.get("CELLPOSE_LOCAL_MODELS_PATH")

models/seg_post_model/{vit_sam.py → vit.py} RENAMED Viewed

File without changes

models/tra_post_model/data.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """Regionprops features and its augmentations.
 WindowedRegionFeatures (WRFeatures) is a class that holds regionprops features for a windowed track region.
 """
 import itertools

 """Regionprops features and its augmentations.
 WindowedRegionFeatures (WRFeatures) is a class that holds regionprops features for a windowed track region.
+Modified from Trackastra (https://github.com/weigertlab/trackastra)
 """
 import itertools

models/tra_post_model/model.py CHANGED Viewed

@@ -599,47 +599,14 @@ class TrackingTransformer(torch.nn.Module):
     @classmethod
     def from_folder(
-        cls, folder, map_location=None, args=None, checkpoint_path: str = "model.pt"
     ):
         folder = Path(folder)
         config = yaml.load(open(folder / "config.yaml"), Loader=yaml.FullLoader)
-        if args:
-            args = vars(args)
-            for k, v in config.items():
-                errors = []
-                if k in args:
-                    if config[k] != args[k]:
-                        errors.append(
-                            f"Loaded model config {k}={config[k]}, but current argument"
-                            f" {k}={args[k]}."
-                        )
-            if errors:
-                raise ValueError("\n".join(errors))
         model = cls(**config)
-        # try:
-        #     # Try to load from lightning checkpoint first
-        #     v_folder = sorted((folder / "tb").glob("version_*"))[version]
-        #     checkpoint = sorted((v_folder / "checkpoints").glob("*epoch*.ckpt"))[0]
-        #     pl_state_dict = torch.load(checkpoint, map_location=map_location)[
-        #         "state_dict"
-        #     ]
-        #     state_dict = OrderedDict()
-        #     # Hack
-        #     for k, v in pl_state_dict.items():
-        #         if k.startswith("model."):
-        #             state_dict[k[6:]] = v
-        #         else:
-        #             raise ValueError(f"Unexpected key {k} in state_dict")
-        #     model.load_state_dict(state_dict)
-        #     logger.info(f"Loaded model from {checkpoint}")
-        # except:
-        #     # Default: Load manually saved model (legacy)
         fpath = folder / checkpoint_path
         logger.info(f"Loading model state from {fpath}")
@@ -656,24 +623,12 @@ class TrackingTransformer(torch.nn.Module):
     @classmethod
     def from_cfg(
-            cls, cfg_path, args=None
         ):
         cfg_path = Path(cfg_path)
         config = yaml.load(open(cfg_path), Loader=yaml.FullLoader)
-        if args:
-            args = vars(args)
-            for k, v in config.items():
-                errors = []
-                if k in args:
-                    if config[k] != args[k]:
-                        errors.append(
-                            f"Loaded model config {k}={config[k]}, but current argument"
-                            f" {k}={args[k]}."
-                        )
-            if errors:
-                raise ValueError("\n".join(errors))
         model = cls(**config)

     @classmethod
     def from_folder(
+        cls, folder, map_location=None, checkpoint_path: str = "model.pt"
     ):
         folder = Path(folder)
         config = yaml.load(open(folder / "config.yaml"), Loader=yaml.FullLoader)
         model = cls(**config)
         fpath = folder / checkpoint_path
         logger.info(f"Loading model state from {fpath}")
     @classmethod
     def from_cfg(
+            cls, cfg_path
         ):
         cfg_path = Path(cfg_path)
         config = yaml.load(open(cfg_path), Loader=yaml.FullLoader)
         model = cls(**config)

models/tra_post_model/tracking/__init__.py CHANGED Viewed

@@ -1,5 +1,3 @@
-# ruff: noqa: F401
 from .track_graph import TrackGraph
 from .tracking import (
     build_graph,

 from .track_graph import TrackGraph
 from .tracking import (
     build_graph,

models/tra_post_model/tracking/ilp.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import logging
 import time
 from types import SimpleNamespace

+# Modified from Trackastra (https://github.com/weigertlab/trackastra)
 import logging
 import time
 from types import SimpleNamespace

models/tra_post_model/tracking/tracking.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import logging
 from itertools import chain

+# Modified from Trackastra (https://github.com/weigertlab/trackastra)
 import logging
 from itertools import chain

models/tra_post_model/tracking/utils.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import logging
 from collections import deque
 from pathlib import Path

+# Modified from Trackastra (https://github.com/weigertlab/trackastra)
 import logging
 from collections import deque
 from pathlib import Path

models/tra_post_model/utils.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import logging
 import dask.array as da
@@ -41,7 +43,6 @@ def blockwise_sum(
     return B
-# TODO allow for batch dimension. Should be faster than looping
 def blockwise_causal_norm(
     A: torch.Tensor,
     timepoints: torch.Tensor,
@@ -70,7 +71,7 @@ def blockwise_causal_norm(
     if mode in ("softmax", "quiet_softmax"):
         # Subtract max for numerical stability
         # https://stats.stackexchange.com/questions/338285/how-does-the-subtraction-of-the-logit-maximum-improve-learning
-        # TODO test without this subtraction
         if mask_invalid is not None:
             assert mask_invalid.shape == A.shape

+# Modified from Trackastra (https://github.com/weigertlab/trackastra)
 import logging
 import dask.array as da
     return B
 def blockwise_causal_norm(
     A: torch.Tensor,
     timepoints: torch.Tensor,
     if mode in ("softmax", "quiet_softmax"):
         # Subtract max for numerical stability
         # https://stats.stackexchange.com/questions/338285/how-does-the-subtraction-of-the-logit-maximum-improve-learning
         if mask_invalid is not None:
             assert mask_invalid.shape == A.shape

tracking_one.py CHANGED Viewed

@@ -1,16 +1,11 @@
 import os
-import pprint
 from typing import Any, List, Optional
-import argparse
 from huggingface_hub import hf_hub_download
-import pyrallis
 from pytorch_lightning.utilities.types import STEP_OUTPUT
 import torch
-import os
 from PIL import Image
 import numpy as np
 import tifffile
-import skimage.io as io
 from config import RunConfig
 from _utils import attn_utils_new as attn_utils
 from _utils.attn_utils_new import AttentionStore
@@ -18,7 +13,6 @@ from _utils.misc_helper import *
 import torch.nn.functional as F
 from tqdm import tqdm
 import torch.nn as nn
-import matplotlib.pyplot as plt
 import cv2
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning)
@@ -33,7 +27,6 @@ from models.tra_post_model.utils import (
 )
 from models.tra_post_model.data import build_windows_sd, get_features
 from models.tra_post_model.tracking import TrackGraph, build_graph, track_greedy
-from _utils.track_args import parse_train_args as get_track_args
 import torchvision.transforms as T
 from pathlib import Path
 import dask.array as da
@@ -41,7 +34,6 @@ from typing import Dict, List, Optional, Union, Literal
 from scipy.sparse import SparseEfficiencyWarning, csr_array
 import tracemalloc
 import gc
-# from memory_profiler import profile
 from _utils.load_track_data import load_track_images
 SCALE = 1
@@ -82,15 +74,8 @@ class TrackingModule(pl.LightningModule):
         # load loca model
         self.loca_model = build_loca_model()
-        # weights = torch.load("ckpt/loca_few_shot.pt")["model"]
-        # weights = {k.replace("module","") : v for k, v in weights.items()}
-        # self.loca_model.load_state_dict(weights, strict=False)
-        # del weights
         self.counting_adapter = Counting(scale_factor=SCALE)
-        # if os.path.isfile(self.args.adapter_weight):
-        #     adapter_weight = torch.load(self.args.adapter_weight,map_location=torch.device('cpu'))
-        #     self.counting_adapter.load_state_dict(adapter_weight, strict=False)
         ### load stable diffusion and its controller
         self.stable = load_stable_diffusion_model(config=self.config)
@@ -110,7 +95,6 @@ class TrackingModule(pl.LightningModule):
                 " `placeholder_token` that is not already in the tokenizer."
             )
         try:
-            # task_embed_from_pretrain = torch.load("pretrained/task_embed.pth")
             task_embed_from_pretrain = hf_hub_download(
                 repo_id="phoebe777777/111",
                 filename="task_embed.pth",
@@ -144,30 +128,17 @@ class TrackingModule(pl.LightningModule):
         self.placeholder_token_id = placeholder_token_id
         fpath = Path("_utils/config.yaml")
-        args_ = get_track_args()
         model = TrackingTransformer.from_cfg(
             cfg_path=fpath,
-            args=args_,
         )
-        # model = TrackingTransformer.from_folder(
-        #     Path(*fpath.parts[:-1]),
-        #     args=args_,
-        #     checkpoint_path=Path(*fpath.parts[-1:]),
-        # )
         self.track_model = model
-        self.track_args = args_
     def move_to_device(self, device):
         self.stable.to(device)
-        # if self.loca_model is not None and self.counting_adapter is not None:
-        #     self.loca_model.to(device)
-        #     self.counting_adapter.to(device)
         self.counting_adapter.to(device)
-        # self.dino.to(device)
         self.loca_model.to(device)
         self.track_model.to(device)
@@ -221,9 +192,9 @@ class TrackingModule(pl.LightningModule):
         adapted_emb = self.counting_adapter.adapter(loca_feature_bf_regression, boxes)      # shape [1, 768]
         if task_loc_idx.shape[0] == 0:
-            encoder_hidden_states[0,2,:] = adapted_emb.squeeze()  # 放在task prompt下一位
         else:
-            encoder_hidden_states[:,task_loc_idx[0, 1]+1,:] = adapted_emb.squeeze()  # 放在task prompt下一位
         # Predict the noise residual
         noise_pred, feature_list = self.stable.unet(noisy_latents, timesteps, encoder_hidden_states)
@@ -242,7 +213,7 @@ class TrackingModule(pl.LightningModule):
         # only use 64x64 self-attention
         self_attn_aggregate = attn_utils.aggregate_attention( # [res, res, 4096]
-                prompts=[self.config.prompt for i in range(bsz)],        # 这里要改么
                 attention_store=self.controller,
                 res=64,
                 from_where=("up", "down"),
@@ -250,7 +221,7 @@ class TrackingModule(pl.LightningModule):
                 select=0
             )
         self_attn_aggregate32 = attn_utils.aggregate_attention( # [res, res, 4096]
-                prompts=[self.config.prompt for i in range(bsz)],        # 这里要改么
                 attention_store=self.controller,
                 res=32,
                 from_where=("up", "down"),
@@ -258,7 +229,7 @@ class TrackingModule(pl.LightningModule):
                 select=0
             )
         self_attn_aggregate16 = attn_utils.aggregate_attention( # [res, res, 4096]
-                prompts=[self.config.prompt for i in range(bsz)],        # 这里要改么
                 attention_store=self.controller,
                 res=16,
                 from_where=("up", "down"),
@@ -269,7 +240,7 @@ class TrackingModule(pl.LightningModule):
         # cross attention
         for res in [32, 16]:
             attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
-                prompts=[self.config.prompt for i in range(bsz)],        # 这里要改么
                 attention_store=self.controller,
                 res=res,
                 from_where=("up", "down"),
@@ -279,7 +250,7 @@ class TrackingModule(pl.LightningModule):
             task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
             attention_maps.append(task_attn_)
-            exemplar_attns = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0) # 取exemplar的attn
             exemplar_attention_maps.append(exemplar_attns)
@@ -306,7 +277,7 @@ class TrackingModule(pl.LightningModule):
         attn_stack = torch.cat(attn_stack, dim=1)
-        attn_after_new_regressor, loss = self.counting_adapter.regressor(input_image, attn_stack, feature_list, mask.cpu().numpy(), training=False)      # 直接用自己的
         return {
                 "attn_after_new_regressor":attn_after_new_regressor,
@@ -364,9 +335,9 @@ class TrackingModule(pl.LightningModule):
                 adapted_emb = self.adapt_emb.to(self.device)
             task_loc_idx = torch.nonzero(input_ids == self.placeholder_token_id)
             if task_loc_idx.shape[0] == 0:
-                encoder_hidden_states[0,5,:] = adapted_emb.squeeze()  # 放在task prompt下一位
             else:
-                encoder_hidden_states[:,task_loc_idx[0, 1]+4,:] = adapted_emb.squeeze()  # 放在task prompt下一位
         # Predict the noise residual
         noise_pred, feature_list = self.stable.unet(noisy_latents, timesteps, encoder_hidden_states)
@@ -386,7 +357,7 @@ class TrackingModule(pl.LightningModule):
         # only use 64x64 self-attention
         self_attn_aggregate = attn_utils.aggregate_attention( # [res, res, 4096]
-                prompts=[self.config.prompt for i in range(bsz)],        # 这里要改么
                 attention_store=self.controller,
                 res=64,
                 from_where=("up", "down"),
@@ -397,7 +368,7 @@ class TrackingModule(pl.LightningModule):
         # cross attention
         for res in [32, 16]:
             attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
-                prompts=[self.config.prompt for i in range(bsz)],        # 这里要改么
                 attention_store=self.controller,
                 res=res,
                 from_where=("up", "down"),
@@ -408,13 +379,13 @@ class TrackingModule(pl.LightningModule):
             task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
             attention_maps.append(task_attn_)
             # if self.boxes is not None and not self.training:
-            exemplar_attns1 = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0) # 取exemplar的attn
             exemplar_attention_maps1.append(exemplar_attns1)
-            exemplar_attns2 = attn_aggregate[:, :, 3].unsqueeze(0).unsqueeze(0) # 取exemplar的attn
             exemplar_attention_maps2.append(exemplar_attns2)
-            exemplar_attns3 = attn_aggregate[:, :, 4].unsqueeze(0).unsqueeze(0) # 取exemplar的attn
             exemplar_attention_maps3.append(exemplar_attns3)
-            exemplar_attns4 = attn_aggregate[:, :, 5].unsqueeze(0).unsqueeze(0) # 取exemplar的attn
             exemplar_attention_maps4.append(exemplar_attns4)
@@ -540,8 +511,7 @@ class TrackingModule(pl.LightningModule):
         for n in range(n_forward):
             len_ = min(74, n_instance - n * 74)
-            encoder_hidden_states[:,(task_loc_idx[0, 1]+1):(task_loc_idx[0, 1]+1+len_),:] = adapted_emb[n*74:n*74+len_].squeeze()  # 放在task prompt下一位
-            # encoder_hidden_states: # [bsz, 77, 768], 其中第1位是task prompt的embedding, 第二位开始可以是object prompt的embedding, 最后一位应该保留原始embedding
             # Predict the noise residual
@@ -556,7 +526,7 @@ class TrackingModule(pl.LightningModule):
             # cross attention
             for res in [32, 16]:
                 attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
-                    prompts=[self.config.prompt for i in range(bsz)],        # 这里要改么
                     attention_store=self.controller,
                     res=res,
                     from_where=("up", "down"),
@@ -567,7 +537,7 @@ class TrackingModule(pl.LightningModule):
                 task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
                 attention_maps.append(task_attn_)
                 try:
-                    exemplar_attns = attn_aggregate[:, :, (task_loc_idx[0, 1]+1):(task_loc_idx[0, 1]+1+len_)].unsqueeze(0) # 取exemplar的attn
                 except:
                     print(n_instance, len_)
                 exemplar_attns = torch.permute(exemplar_attns, (0, 3, 1, 2)) # [1, len_, res, res]
@@ -728,11 +698,6 @@ class TrackingModule(pl.LightningModule):
         A = self.track_model.normalize_output(A, timepoints, coords)
-        # # Spatially far entries should not influence the causal normalization
-        # dist = torch.cdist(coords[0, :, 1:], coords[0, :, 1:])
-        # invalid = dist > model.config["spatial_pos_cutoff"]
-        # A[invalid] = -torch.inf
         A = A.squeeze(0).detach().cpu().numpy()
         del feats, coords, timepoints, batch
@@ -1020,30 +985,3 @@ class TrackingModule(pl.LightningModule):
         track_graph = self._track_from_predictions(predictions, mode=mode, **kwargs)
         return track_graph, masks
-# def inference(data_path, box=None):
-#     if box is not None:
-#         use_box = True
-#     else:
-#         use_box = False
-#     model = TrackingModule(use_box=use_box)
-#     load_msg = model.load_state_dict(torch.load("pretrained/microscopy_matching_tra.pth"), strict=True)
-#     model.move_to_device(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
-#     track_graph, masks = model.track(file_dir=data_path, dataname="inference_sequence")
-#     if not os.path.exists(f"tracked_ours_seg_pred3/"):
-#         os.makedirs(f"tracked_ours_seg_pred3/")
-#     ctc_tracks, masks_tracked = graph_to_ctc(
-#         track_graph,
-#         masks,
-#         outdir=f"tracked_ours_seg_pred3/",
-#     )
-# if __name__ == "__main__":
-#     inference(data_path="example_imgs/2D+Time/Fluo-N2DL-HeLa/train/Fluo-N2DL-HeLa/02")

 import os
 from typing import Any, List, Optional
 from huggingface_hub import hf_hub_download
 from pytorch_lightning.utilities.types import STEP_OUTPUT
 import torch
 from PIL import Image
 import numpy as np
 import tifffile
 from config import RunConfig
 from _utils import attn_utils_new as attn_utils
 from _utils.attn_utils_new import AttentionStore
 import torch.nn.functional as F
 from tqdm import tqdm
 import torch.nn as nn
 import cv2
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning)
 )
 from models.tra_post_model.data import build_windows_sd, get_features
 from models.tra_post_model.tracking import TrackGraph, build_graph, track_greedy
 import torchvision.transforms as T
 from pathlib import Path
 import dask.array as da
 from scipy.sparse import SparseEfficiencyWarning, csr_array
 import tracemalloc
 import gc
 from _utils.load_track_data import load_track_images
 SCALE = 1
         # load loca model
         self.loca_model = build_loca_model()
         self.counting_adapter = Counting(scale_factor=SCALE)
         ### load stable diffusion and its controller
         self.stable = load_stable_diffusion_model(config=self.config)
                 " `placeholder_token` that is not already in the tokenizer."
             )
         try:
             task_embed_from_pretrain = hf_hub_download(
                 repo_id="phoebe777777/111",
                 filename="task_embed.pth",
         self.placeholder_token_id = placeholder_token_id
         fpath = Path("_utils/config.yaml")
         model = TrackingTransformer.from_cfg(
             cfg_path=fpath,
         )
         self.track_model = model
     def move_to_device(self, device):
         """Move the sub-models held by this object onto ``device``.

         Calls ``.to(device)`` on ``self.stable``, ``self.counting_adapter``,
         ``self.loca_model`` and ``self.track_model``, in that order.

         Args:
             device: Target passed unchanged to each component's ``.to`` call.
         """
         component_names = (
             "stable",
             "counting_adapter",
             "loca_model",
             "track_model",
         )
         for attr_name in component_names:
             # Resolve each attribute only when its turn comes, so components
             # are moved strictly one after another in the listed order.
             getattr(self, attr_name).to(device)
         adapted_emb = self.counting_adapter.adapter(loca_feature_bf_regression, boxes)      # shape [1, 768]
         if task_loc_idx.shape[0] == 0:
+            encoder_hidden_states[0,2,:] = adapted_emb.squeeze()
         else:
+            encoder_hidden_states[:,task_loc_idx[0, 1]+1,:] = adapted_emb.squeeze()
         # Predict the noise residual
         noise_pred, feature_list = self.stable.unet(noisy_latents, timesteps, encoder_hidden_states)
         # only use 64x64 self-attention
         self_attn_aggregate = attn_utils.aggregate_attention( # [res, res, 4096]
+                prompts=[self.config.prompt for i in range(bsz)],
                 attention_store=self.controller,
                 res=64,
                 from_where=("up", "down"),
                 select=0
             )
         self_attn_aggregate32 = attn_utils.aggregate_attention( # [res, res, 4096]
+                prompts=[self.config.prompt for i in range(bsz)],
                 attention_store=self.controller,
                 res=32,
                 from_where=("up", "down"),
                 select=0
             )
         self_attn_aggregate16 = attn_utils.aggregate_attention( # [res, res, 4096]
+                prompts=[self.config.prompt for i in range(bsz)],
                 attention_store=self.controller,
                 res=16,
                 from_where=("up", "down"),
         # cross attention
         for res in [32, 16]:
             attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
+                prompts=[self.config.prompt for i in range(bsz)],
                 attention_store=self.controller,
                 res=res,
                 from_where=("up", "down"),
             task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
             attention_maps.append(task_attn_)
+            exemplar_attns = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0)
             exemplar_attention_maps.append(exemplar_attns)
         attn_stack = torch.cat(attn_stack, dim=1)
+        attn_after_new_regressor, loss = self.counting_adapter.regressor(input_image, attn_stack, feature_list, mask.cpu().numpy(), training=False)
         return {
                 "attn_after_new_regressor":attn_after_new_regressor,
                 adapted_emb = self.adapt_emb.to(self.device)
             task_loc_idx = torch.nonzero(input_ids == self.placeholder_token_id)
             if task_loc_idx.shape[0] == 0:
+                encoder_hidden_states[0,5,:] = adapted_emb.squeeze()
             else:
+                encoder_hidden_states[:,task_loc_idx[0, 1]+4,:] = adapted_emb.squeeze()
         # Predict the noise residual
         noise_pred, feature_list = self.stable.unet(noisy_latents, timesteps, encoder_hidden_states)
         # only use 64x64 self-attention
         self_attn_aggregate = attn_utils.aggregate_attention( # [res, res, 4096]
+                prompts=[self.config.prompt for i in range(bsz)],
                 attention_store=self.controller,
                 res=64,
                 from_where=("up", "down"),
         # cross attention
         for res in [32, 16]:
             attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
+                prompts=[self.config.prompt for i in range(bsz)],
                 attention_store=self.controller,
                 res=res,
                 from_where=("up", "down"),
             task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
             attention_maps.append(task_attn_)
             # if self.boxes is not None and not self.training:
+            exemplar_attns1 = attn_aggregate[:, :, 2].unsqueeze(0).unsqueeze(0)
             exemplar_attention_maps1.append(exemplar_attns1)
+            exemplar_attns2 = attn_aggregate[:, :, 3].unsqueeze(0).unsqueeze(0)
             exemplar_attention_maps2.append(exemplar_attns2)
+            exemplar_attns3 = attn_aggregate[:, :, 4].unsqueeze(0).unsqueeze(0)
             exemplar_attention_maps3.append(exemplar_attns3)
+            exemplar_attns4 = attn_aggregate[:, :, 5].unsqueeze(0).unsqueeze(0)
             exemplar_attention_maps4.append(exemplar_attns4)
         for n in range(n_forward):
             len_ = min(74, n_instance - n * 74)
+            encoder_hidden_states[:,(task_loc_idx[0, 1]+1):(task_loc_idx[0, 1]+1+len_),:] = adapted_emb[n*74:n*74+len_].squeeze()
             # Predict the noise residual
             # cross attention
             for res in [32, 16]:
                 attn_aggregate = attn_utils.aggregate_attention( # [res, res, 77]
+                    prompts=[self.config.prompt for i in range(bsz)],
                     attention_store=self.controller,
                     res=res,
                     from_where=("up", "down"),
                 task_attn_ = attn_aggregate[:, :, 1].unsqueeze(0).unsqueeze(0) # [1, 1, res, res]
                 attention_maps.append(task_attn_)
                 try:
+                    exemplar_attns = attn_aggregate[:, :, (task_loc_idx[0, 1]+1):(task_loc_idx[0, 1]+1+len_)].unsqueeze(0)
                 except:
                     print(n_instance, len_)
                 exemplar_attns = torch.permute(exemplar_attns, (0, 3, 1, 2)) # [1, len_, res, res]
         A = self.track_model.normalize_output(A, timepoints, coords)
         A = A.squeeze(0).detach().cpu().numpy()
         del feats, coords, timepoints, batch
         track_graph = self._track_from_predictions(predictions, mode=mode, **kwargs)
         return track_graph, masks